Skip to content

Commit

Permalink
added support for multi-token orgs and persons
Browse files Browse the repository at this point in the history
  • Loading branch information
jbutcher21 committed Mar 6, 2020
1 parent be2771b commit 365dc20
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 26 deletions.
1 change: 1 addition & 0 deletions csv_functions.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"AND",
"ASSETS",
"BANK",
"CITY OF",
"CLINIC",
"CO",
"COMPANY",
Expand Down
68 changes: 43 additions & 25 deletions csv_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,15 @@ def __init__(self):
return

#--ensure these lists exist
if 'GARBAGE_VALUES' not in self.variantJson:
self.variantJson['GARBAGE_VALUES'] = []
if 'ORGANIZATION_TOKENS' not in self.variantJson:
self.variantJson['ORGANIZATION_TOKENS'] = []
if 'PERSON_TOKENS' not in self.variantJson:
self.variantJson['PERSON_TOKENS'] = []
if 'SENZING_ATTRIBUTES' not in self.variantJson:
self.variantJson['SENZING_ATTRIBUTES'] = []
keys = [
"GARBAGE_VALUES",
"ORGANIZATION_TOKENS",
"PERSON_TOKENS",
"SENZING_ATTRIBUTES"
]
for key in keys:
if key not in self.variantJson:
self.variantJson[key] = []

#--turn lists into dictionaries for speed
self.variantData = {}
Expand Down Expand Up @@ -94,10 +95,35 @@ def format_date(self, dateString, outputFormat = None):
def clean_value(self, valueString):
    """Normalize whitespace in a value and blank it out if it is garbage.

    The value is converted with str(), stripped, and every run of
    whitespace is collapsed to a single space.  If the cleaned value,
    upper-cased, is found in self.variantData['GARBAGE_VALUES'] an empty
    string is returned instead; otherwise the cleaned string is returned.
    """
    #--remove extra spaces
    returnValue = ' '.join(str(valueString).strip().split())
    #--whole field must match a garbage value (partial matches are kept)
    if returnValue.upper() in self.variantData['GARBAGE_VALUES']:
        returnValue = ''
    return returnValue

#-----------------------------------
def is_organization_name(self, nameString):
    """Return True if nameString contains a known organization token.

    Periods and commas are treated as spaces, the name is upper-cased and
    split on whitespace.  Every phrase of one, two or three consecutive
    tokens is looked up in self.variantData['ORGANIZATION_TOKENS'], so
    multi-word entries such as "CITY OF" are recognized wherever they
    occur in the name, including at the very end.

    Returns False for an empty or None nameString.
    """
    if not nameString:
        return False
    orgTokens = self.variantData['ORGANIZATION_TOKENS']
    recentTokens = []
    for token in nameString.replace('.', ' ').replace(',', ' ').upper().split():
        #--keep a sliding window of the last three tokens (longest phrase supported)
        recentTokens = recentTokens[-2:] + [token]
        #--test each phrase ending on the current token so a phrase that
        #--finishes on the last token of the name is not missed
        for size in range(1, len(recentTokens) + 1):
            if ' '.join(recentTokens[-size:]) in orgTokens:
                return True
    return False

#-----------------------------------
def is_person_name(self, nameString):
    """Return True if nameString contains a known person token.

    Periods and commas are treated as spaces, the name is upper-cased and
    split on whitespace.  Every phrase of one, two or three consecutive
    tokens is looked up in self.variantData['PERSON_TOKENS'], so
    multi-word entries are recognized wherever they occur in the name,
    including at the very end.

    Returns False for an empty or None nameString.
    """
    if not nameString:
        return False
    personTokens = self.variantData['PERSON_TOKENS']
    recentTokens = []
    for token in nameString.replace('.', ' ').replace(',', ' ').upper().split():
        #--keep a sliding window of the last three tokens (longest phrase supported)
        recentTokens = recentTokens[-2:] + [token]
        #--test each phrase ending on the current token so a phrase that
        #--finishes on the last token of the name is not missed
        for size in range(1, len(recentTokens) + 1):
            if ' '.join(recentTokens[-size:]) in personTokens:
                return True
    return False

#-----------------------------------
def is_senzing_attribute(self, attrName):
attrName = attrName.upper()
Expand All @@ -118,24 +144,16 @@ def get_senzing_attribute(self, attrName):
attrName = attrName.upper()
if attrName in self.variantData['SENZING_ATTRIBUTES']:
return self.variantData['SENZING_ATTRIBUTES'][attrName]
elif '_' in attrName:
baseName = attrName[attrName.find('_') + 1:]
if baseName in self.variantData['SENZING_ATTRIBUTES']:
return self.variantData['SENZING_ATTRIBUTES'][baseName]
else:
baseName = attrName[0:attrName.rfind('_')]
if baseName in self.variantData['SENZING_ATTRIBUTES']:
return self.variantData['SENZING_ATTRIBUTES'][baseName]
return {}

#-----------------------------------
def is_organization_name(self, nameString):
    """Return True if any single word of nameString is an organization token.

    Periods and commas are treated as spaces; each remaining word is
    upper-cased and looked up in self.variantData['ORGANIZATION_TOKENS'].
    Only individual words are tested.  Returns False for an empty or
    None nameString.
    """
    if not nameString:
        return False
    words = nameString.replace('.', ' ').replace(',', ' ').split()
    return any(
        word.upper() in self.variantData['ORGANIZATION_TOKENS']
        for word in words
    )

#-----------------------------------
def is_person_name(self, nameString):
    """Return True if any single word of nameString is a person token.

    Periods and commas are treated as spaces; each remaining word is
    upper-cased and looked up in self.variantData['PERSON_TOKENS'].
    Only individual words are tested.  Returns False for an empty or
    None nameString.
    """
    if not nameString:
        return False
    words = nameString.replace('.', ' ').replace(',', ' ').split()
    return any(
        word.upper() in self.variantData['PERSON_TOKENS']
        for word in words
    )

#----------------------------------------
if __name__ == "__main__":
appPath = os.path.dirname(os.path.abspath(sys.argv[0]))
Expand Down
3 changes: 2 additions & 1 deletion input/test_set1.csv
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ uniqueid,name,gender,dob,ssn,dlnum,proflic,taxid,addr1,city,state,zip
1004,Mary Smith,F,,,,1004044,,444 Fourth,Las Vegas,NV,89114
1005,Peter Anderson,,,,,1005055,,555 Fifth,Las Vegas,NV,89115
1006,Cleveland Clinic,,,6060016,,1006066,6060016,666 Sixth,Las Vegas,NV,89116
1007,,F,,,,,700777,777 Seventh,Las Vegas,NV,89117
1007,,F,,,,,700777,777 Seventh,Las Vegas,NV,89117
1008,City of Hope,F,,,,,,888 Eighth,Las Vegas,NV,89118
1 change: 1 addition & 0 deletions output/test_set1.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
{"DATA_SOURCE": "test", "ENTITY_TYPE": "PERSON", "RECORD_ID": "1004", "PRIMARY_NAME_FULL": "Mary Smith", "RECORD_TYPE": "PERSON", "GENDER": "F", "PROF_LICENSE": "1004044", "HOME_ADDR_LINE1": "444 Fourth", "HOME_ADDR_CITY": "Las Vegas", "HOME_ADDR_STATE": "NV", "HOME_ADDR_POSTAL_CODE": "89114"}
{"DATA_SOURCE": "test", "ENTITY_TYPE": "PERSON", "RECORD_ID": "1005", "PRIMARY_NAME_FULL": "Peter Anderson", "RECORD_TYPE": "PERSON", "PROF_LICENSE": "1005055", "HOME_ADDR_LINE1": "555 Fifth", "HOME_ADDR_CITY": "Las Vegas", "HOME_ADDR_STATE": "NV", "HOME_ADDR_POSTAL_CODE": "89115"}
{"DATA_SOURCE": "test", "ENTITY_TYPE": "ORGANIZATION", "RECORD_ID": "1006", "PRIMARY_NAME_ORG": "Cleveland Clinic", "RECORD_TYPE": "ORGANIZATION", "REF_SSN": "6060016", "PROF_LICENSE": "1006066", "TAX_ID_NUMBER": "6060016", "HOME_ADDR_LINE1": "666 Sixth", "HOME_ADDR_CITY": "Las Vegas", "HOME_ADDR_STATE": "NV", "HOME_ADDR_POSTAL_CODE": "89116"}
{"DATA_SOURCE": "test", "ENTITY_TYPE": "ORGANIZATION", "RECORD_ID": "1008", "PRIMARY_NAME_ORG": "City of Hope", "RECORD_TYPE": "ORGANIZATION", "REF_GENDER": "F", "HOME_ADDR_LINE1": "888 Eighth", "HOME_ADDR_CITY": "Las Vegas", "HOME_ADDR_STATE": "NV", "HOME_ADDR_POSTAL_CODE": "89118"}

0 comments on commit 365dc20

Please sign in to comment.