Skip to content

Commit

Permalink
fine-tune
Browse files Browse the repository at this point in the history
  • Loading branch information
pudo committed Oct 7, 2023
1 parent 12497d7 commit 06a9ce2
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 4 deletions.
14 changes: 11 additions & 3 deletions nomenklatura/matching/compare/names.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from itertools import product
from followthemoney.proxy import E
from followthemoney.types import registry
from fingerprints import clean_name_light
from fingerprints import clean_name_light, clean_name_ascii
from nomenklatura.util import names_word_list, levenshtein
from nomenklatura.util import fingerprint_name, normalize_name, jaro_winkler
from nomenklatura.matching.util import type_pair, props_pair, has_schema
Expand Down Expand Up @@ -79,6 +79,14 @@ def _fp_name_parts(name: str) -> List[str]:
return names_word_list([name], normalizer=fingerprint_name, min_length=2)


def _fpw_name_parts(name: str) -> List[str]:
    """Return the word list for ``name`` under two normalizers, without repeats.

    The words produced with ``fingerprint_name`` come first. Words produced
    with ``clean_name_ascii`` are then appended in order, skipping any word
    that is already in the list (including ones appended earlier in this
    pass). Both word lists are built by ``names_word_list`` with
    ``min_length=2``.
    """
    words = names_word_list([name], normalizer=fingerprint_name, min_length=2)
    ascii_words = names_word_list(
        [name],
        normalizer=clean_name_ascii,
        min_length=2,
    )
    for candidate in ascii_words:
        if candidate in words:
            # Already contributed by the fingerprint pass or an earlier word.
            continue
        words.append(candidate)
    return words


def name_fingerprint_levenshtein(query: E, result: E) -> float:
"""Two non-person entities have similar fingerprinted names. This includes
simplifying entity type names (e.g. "Limited" -> "Ltd") and uses the
Expand All @@ -87,7 +95,7 @@ def name_fingerprint_levenshtein(query: E, result: E) -> float:
return 0.0
query_names_, result_names_ = type_pair(query, result, registry.name)
query_names = [_fp_name_parts(n) for n in query_names_]
result_names = [_fp_name_parts(n) for n in result_names_]
result_names = [_fpw_name_parts(n) for n in result_names_]
max_score = 0.0
for (qn, rn) in product(query_names, result_names):
if len(qn) == 0:
Expand All @@ -112,7 +120,7 @@ def name_fingerprint_levenshtein(query: E, result: E) -> float:
raligned = " ".join(p[1] for p in aligned)
distance = levenshtein(qaligned, raligned)
# Skip results whose edit distance exceeds a third of the shorter aligned name (capped at a few characters):
max_edits = min(5, (min(len(qaligned), len(raligned)) // 3))
max_edits = min(4, (min(len(qaligned), len(raligned)) // 3))
if distance > max_edits:
continue
score = 1.0 - (distance / float(max(len(qaligned), len(raligned))))
Expand Down
7 changes: 6 additions & 1 deletion tests/matching/test_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ def test_org_name_partial_match():
assert name_fingerprint_levenshtein(query, result) == 0.0
query = e("Company", name="CRYSTALORD")
assert name_fingerprint_levenshtein(query, result) == 1.0
query = e("Company", name="CRISTALORD")
query = e("Company", name="CRISTALORD LIMITED")
assert name_fingerprint_levenshtein(query, result) > 0.8


Expand All @@ -321,6 +321,11 @@ def test_org_name_example_2():
assert name_fingerprint_levenshtein(query, result) > 0.8
assert name_fingerprint_levenshtein(query, result) < 1.0

query = e("Company", name="TACTICAL MISSILES CORPORATION JOINT STOCK COMPANY")
result = e("Company", name="TACTICAL MISSILES CORPORATION JOYNT STOCK COMPANY")
assert name_fingerprint_levenshtein(query, result) > 0.8
assert name_fingerprint_levenshtein(query, result) < 1.0


def test_org_name_example_3():
query = e("Company", name="Iskusstvo Krasoty")
Expand Down

0 comments on commit 06a9ce2

Please sign in to comment.