diff --git a/nomenklatura/matching/compare/names.py b/nomenklatura/matching/compare/names.py index 761e505f..cc2680d9 100644 --- a/nomenklatura/matching/compare/names.py +++ b/nomenklatura/matching/compare/names.py @@ -2,7 +2,7 @@ from itertools import product from followthemoney.proxy import E from followthemoney.types import registry -from fingerprints import clean_name_light +from fingerprints import clean_name_light, clean_name_ascii from nomenklatura.util import names_word_list, levenshtein from nomenklatura.util import fingerprint_name, normalize_name, jaro_winkler from nomenklatura.matching.util import type_pair, props_pair, has_schema @@ -79,6 +79,14 @@ def _fp_name_parts(name: str) -> List[str]: return names_word_list([name], normalizer=fingerprint_name, min_length=2) +def _fpw_name_parts(name: str) -> List[str]: + parts = names_word_list([name], normalizer=fingerprint_name, min_length=2) + for part in names_word_list([name], normalizer=clean_name_ascii, min_length=2): + if part not in parts: + parts.append(part) + return parts + + def name_fingerprint_levenshtein(query: E, result: E) -> float: """Two non-person entities have similar fingerprinted names. This includes simplifying entity type names (e.g. "Limited" -> "Ltd") and uses the @@ -87,7 +95,7 @@ def name_fingerprint_levenshtein(query: E, result: E) -> float: return 0.0 query_names_, result_names_ = type_pair(query, result, registry.name) query_names = [_fp_name_parts(n) for n in query_names_] - result_names = [_fp_name_parts(n) for n in result_names_] + result_names = [_fpw_name_parts(n) for n in result_names_] max_score = 0.0 for (qn, rn) in product(query_names, result_names): if len(qn) == 0: @@ -112,7 +120,7 @@ def name_fingerprint_levenshtein(query: E, result: E) -> float: raligned = " ".join(p[1] for p in aligned) distance = levenshtein(qaligned, raligned) # Skip results with an overall distance of more than 5 characters: - max_edits = min(5, (min(len(qaligned), len(raligned)) // 3)) + max_edits = min(4, (min(len(qaligned), len(raligned)) // 3)) if distance > max_edits: continue score = 1.0 - (distance / float(max(len(qaligned), len(raligned)))) diff --git a/tests/matching/test_names.py b/tests/matching/test_names.py index 7ede05e7..7999199e 100644 --- a/tests/matching/test_names.py +++ b/tests/matching/test_names.py @@ -298,7 +298,7 @@ def test_org_name_partial_match(): assert name_fingerprint_levenshtein(query, result) == 0.0 query = e("Company", name="CRYSTALORD") assert name_fingerprint_levenshtein(query, result) == 1.0 - query = e("Company", name="CRISTALORD") + query = e("Company", name="CRISTALORD LIMITED") assert name_fingerprint_levenshtein(query, result) > 0.8 @@ -321,6 +321,11 @@ def test_org_name_example_2(): assert name_fingerprint_levenshtein(query, result) > 0.8 assert name_fingerprint_levenshtein(query, result) < 1.0 + query = e("Company", name="TACTICAL MISSILES CORPORATION JOINT STOCK COMPANY") + result = e("Company", name="TACTICAL MISSILES CORPORATION JOYNT STOCK COMPANY") + assert name_fingerprint_levenshtein(query, result) > 0.8 + assert name_fingerprint_levenshtein(query, result) < 1.0 + def test_org_name_example_3(): query = e("Company", name="Iskusstvo Krasoty")