From 365dc2084f89293f7fb30be4167b2a16b03479ba Mon Sep 17 00:00:00 2001 From: Jeff Butcher Date: Fri, 6 Mar 2020 09:32:52 -0800 Subject: [PATCH] added support for multi-token orgs and persons --- csv_functions.json | 1 + csv_functions.py | 68 +++++++++++++++++++++++++++---------------- input/test_set1.csv | 3 +- output/test_set1.json | 1 + 4 files changed, 47 insertions(+), 26 deletions(-) diff --git a/csv_functions.json b/csv_functions.json index dc258df..6a1ab92 100644 --- a/csv_functions.json +++ b/csv_functions.json @@ -7,6 +7,7 @@ "AND", "ASSETS", "BANK", + "CITY OF", "CLINIC", "CO", "COMPANY", diff --git a/csv_functions.py b/csv_functions.py index 48f5a7a..b22d6bd 100644 --- a/csv_functions.py +++ b/csv_functions.py @@ -31,14 +31,15 @@ def __init__(self): return #--ensure these lists exist - if 'GARBAGE_VALUES' not in self.variantJson: - self.variantJson['GARBAGE_VALUES'] = [] - if 'ORGANIZATION_TOKENS' not in self.variantJson: - self.variantJson['ORGANIZATION_TOKENS'] = [] - if 'PERSON_TOKENS' not in self.variantJson: - self.variantJson['PERSON_TOKENS'] = [] - if 'SENZING_ATTRIBUTES' not in self.variantJson: - self.variantJson['SENZING_ATTRIBUTES'] = [] + keys = [ + "GARBAGE_VALUES", + "ORGANIZATION_TOKENS", + "PERSON_TOKENS", + "SENZING_ATTRIBUTES" + ] + for key in keys: + if key not in self.variantJson: + self.variantJson[key] = [] #--turn lists into dictionaries for speed self.variantData = {} @@ -94,10 +95,35 @@ def format_date(self, dateString, outputFormat = None): def clean_value(self, valueString): #--remove extra spaces returnValue = ' '.join(str(valueString).strip().split()) - if returnValue.upper() in self.variantData['GARBAGE_VALUES']: + #--whole field must match a garbage value + if returnValue.upper() in self.variantData['GARBAGE_VALUES']: returnValue = '' return returnValue + #----------------------------------- + def is_organization_name(self, nameString): + if nameString: + priorTokens = [] + for token in nameString.replace('.',' ').replace(',',' ').upper().split(): + if token in self.variantData['ORGANIZATION_TOKENS'] or \ + ' '.join(priorTokens[-2:]) in self.variantData['ORGANIZATION_TOKENS'] or \ + ' '.join(priorTokens[-3:]) in self.variantData['ORGANIZATION_TOKENS']: + return True + priorTokens.append(token) + return False + + #----------------------------------- + def is_person_name(self, nameString): + if nameString: + priorTokens = [] + for token in nameString.replace('.',' ').replace(',',' ').upper().split(): + if token in self.variantData['PERSON_TOKENS'] or \ + ' '.join(priorTokens[-2:]) in self.variantData['PERSON_TOKENS'] or \ + ' '.join(priorTokens[-3:]) in self.variantData['PERSON_TOKENS']: + return True + priorTokens.append(token) + return False + #----------------------------------- def is_senzing_attribute(self, attrName): attrName = attrName.upper() @@ -118,24 +144,16 @@ def get_senzing_attribute(self, attrName): attrName = attrName.upper() if attrName in self.variantData['SENZING_ATTRIBUTES']: return self.variantData['SENZING_ATTRIBUTES'][attrName] + elif '_' in attrName: + baseName = attrName[attrName.find('_') + 1:] + if baseName in self.variantData['SENZING_ATTRIBUTES']: + return self.variantData['SENZING_ATTRIBUTES'][baseName] + else: + baseName = attrName[0:attrName.rfind('_')] + if baseName in self.variantData['SENZING_ATTRIBUTES']: + return self.variantData['SENZING_ATTRIBUTES'][baseName] return {} - #----------------------------------- - def is_organization_name(self, nameString): - if nameString: - for token in nameString.replace('.',' ').replace(',',' ').split(): - if token.upper() in self.variantData['ORGANIZATION_TOKENS']: - return True - return False - - #----------------------------------- - def is_person_name(self, nameString): - if nameString: - for token in nameString.replace('.',' ').replace(',',' ').split(): - if token.upper() in self.variantData['PERSON_TOKENS']: - return True - return False - #---------------------------------------- if __name__ == "__main__": appPath = os.path.dirname(os.path.abspath(sys.argv[0])) diff --git a/input/test_set1.csv b/input/test_set1.csv index ea889c2..9d645ac 100644 --- a/input/test_set1.csv +++ b/input/test_set1.csv @@ -5,4 +5,5 @@ uniqueid,name,gender,dob,ssn,dlnum,proflic,taxid,addr1,city,state,zip 1004,Mary Smith,F,,,,1004044,,444 Fourth,Las Vegas,NV,89114 1005,Peter Anderson,,,,,1005055,,555 Fifth,Las Vegas,NV,89115 1006,Cleveland Clinic,,,6060016,,1006066,6060016,666 Sixth,Las Vegas,NV,89116 -1007,,F,,,,,700777,777 Seventh,Las Vegas,NV,89117 \ No newline at end of file +1007,,F,,,,,700777,777 Seventh,Las Vegas,NV,89117 +1008,City of Hope,F,,,,,,888 Eighth,Las Vegas,NV,89118 \ No newline at end of file diff --git a/output/test_set1.json b/output/test_set1.json index f7c651f..646ac57 100644 --- a/output/test_set1.json +++ b/output/test_set1.json @@ -4,3 +4,4 @@ {"DATA_SOURCE": "test", "ENTITY_TYPE": "PERSON", "RECORD_ID": "1004", "PRIMARY_NAME_FULL": "Mary Smith", "RECORD_TYPE": "PERSON", "GENDER": "F", "PROF_LICENSE": "1004044", "HOME_ADDR_LINE1": "444 Fourth", "HOME_ADDR_CITY": "Las Vegas", "HOME_ADDR_STATE": "NV", "HOME_ADDR_POSTAL_CODE": "89114"} {"DATA_SOURCE": "test", "ENTITY_TYPE": "PERSON", "RECORD_ID": "1005", "PRIMARY_NAME_FULL": "Peter Anderson", "RECORD_TYPE": "PERSON", "PROF_LICENSE": "1005055", "HOME_ADDR_LINE1": "555 Fifth", "HOME_ADDR_CITY": "Las Vegas", "HOME_ADDR_STATE": "NV", "HOME_ADDR_POSTAL_CODE": "89115"} {"DATA_SOURCE": "test", "ENTITY_TYPE": "ORGANIZATION", "RECORD_ID": "1006", "PRIMARY_NAME_ORG": "Cleveland Clinic", "RECORD_TYPE": "ORGANIZATION", "REF_SSN": "6060016", "PROF_LICENSE": "1006066", "TAX_ID_NUMBER": "6060016", "HOME_ADDR_LINE1": "666 Sixth", "HOME_ADDR_CITY": "Las Vegas", "HOME_ADDR_STATE": "NV", "HOME_ADDR_POSTAL_CODE": "89116"} +{"DATA_SOURCE": "test", "ENTITY_TYPE": "ORGANIZATION", "RECORD_ID": "1008", "PRIMARY_NAME_ORG": "City of Hope", "RECORD_TYPE": "ORGANIZATION", "REF_GENDER": "F", "HOME_ADDR_LINE1": "888 Eighth", "HOME_ADDR_CITY": "Las Vegas", "HOME_ADDR_STATE": "NV", "HOME_ADDR_POSTAL_CODE": "89118"}