Skip to content

Commit

Permalink
Make levenshtein matcher roughly equivalent to JW impl
Browse files Browse the repository at this point in the history
  • Loading branch information
pudo committed Oct 7, 2023
1 parent 6308c2b commit 12497d7
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 54 deletions.
65 changes: 33 additions & 32 deletions nomenklatura/matching/compare/names.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@
from fingerprints import clean_name_light
from nomenklatura.util import names_word_list, levenshtein
from nomenklatura.util import fingerprint_name, normalize_name, jaro_winkler
from nomenklatura.matching.util import type_pair, props_pair, compare_sets, has_schema
from nomenklatura.matching.util import type_pair, props_pair, has_schema
from nomenklatura.matching.compare.util import is_disjoint, clean_map, has_overlap
from nomenklatura.matching.compare.util import compare_levenshtein


def _name_parts(name: str) -> List[str]:
Expand Down Expand Up @@ -76,46 +75,48 @@ def name_literal_match(query: E, result: E) -> float:
return 1.0 if has_overlap(qnames, rnames) else 0.0


def _fp_name_parts(name: str) -> List[str]:
return names_word_list([name], normalizer=fingerprint_name, min_length=2)


def name_fingerprint_levenshtein(query: E, result: E) -> float:
"""Two non-person entities have similar fingerprinted names. This includes
simplifying entity type names (e.g. "Limited" -> "Ltd") and uses the
Damerau-Levensthein string distance algorithm."""
if has_schema(query, result, "Person"):
return 0.0
query_names, result_names = type_pair(query, result, registry.name)
qnames = clean_map(query_names, fingerprint_name)
qnames.update(clean_map(query_names, clean_name_light))
rnames = clean_map(result_names, fingerprint_name)
rnames.update(clean_map(result_names, clean_name_light))
return compare_sets(qnames, rnames, compare_levenshtein)


def _org_name_parts(name: str) -> List[str]:
return names_word_list([name], normalizer=fingerprint_name)


def org_name_partial_match(query: E, result: E) -> float:
"""All query name parts are included in a result organization name. The
comparison is conducted on the fingerprinted names."""
if not has_schema(query, result, "Organization"):
return 0.0
query_names_, result_names_ = type_pair(query, result, registry.name)
query_names = [_org_name_parts(n) for n in query_names_]
result_names = [_org_name_parts(n) for n in result_names_]
query_names = [_fp_name_parts(n) for n in query_names_]
result_names = [_fp_name_parts(n) for n in result_names_]
max_score = 0.0
for (qn, rn) in product(query_names, result_names):
common_length = 0
remainder = list(rn)
for elem in qn:
try:
remainder.remove(elem)
common_length += len(elem)
except ValueError:
pass
if common_length == 0:
if len(qn) == 0:
continue
scores: Dict[Tuple[str, str], float] = {}
# compute all pairwise scores for name parts:
for q, r in product(set(qn), set(rn)):
distance = levenshtein(q, r)
scores[(q, r)] = 1.0 - (float(distance) / max(len(q), len(r)))
aligned: List[Tuple[str, str, float]] = []
# find the best pairing for each name part by score:
for (q, r), score in sorted(scores.items(), key=lambda i: i[1], reverse=True):
# one name part can only be used once, but can show up multiple times:
while q in qn and r in rn:
qn.remove(q)
rn.remove(r)
aligned.append((q, r, score))
# assume there should be at least a candidate for each query name part:
if len(qn):
continue
qaligned = " ".join(p[0] for p in aligned)
raligned = " ".join(p[1] for p in aligned)
distance = levenshtein(qaligned, raligned)
# Skip results with an overall distance of more than 5 characters:
max_edits = min(5, (min(len(qaligned), len(raligned)) // 3))
if distance > max_edits:
continue
query_length = sum(len(q) for q in qn)
max_score = max(max_score, common_length / float(query_length))
score = 1.0 - (distance / float(max(len(qaligned), len(raligned))))
max_score = max(max_score, score)
return max_score


Expand Down
2 changes: 0 additions & 2 deletions nomenklatura/matching/logic.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from nomenklatura.matching.compare.names import last_name_mismatch, name_literal_match
from nomenklatura.matching.compare.names import name_fingerprint_levenshtein
from nomenklatura.matching.compare.names import weak_alias_match
from nomenklatura.matching.compare.names import org_name_partial_match
from nomenklatura.matching.compare.phonetic import person_name_phonetic_match
from nomenklatura.matching.compare.phonetic import name_soundex_match
from nomenklatura.matching.compare.phonetic import name_metaphone_match
Expand All @@ -37,7 +36,6 @@ class LogicV1(HeuristicAlgorithm):
# These are there so they can be enabled using custom weights:
Feature(func=name_metaphone_match, weight=FNUL),
Feature(func=name_soundex_match, weight=FNUL),
Feature(func=org_name_partial_match, weight=FNUL),
Feature(func=address_entity_match, weight=0.98),
Feature(func=crypto_wallet_address, weight=0.98),
Feature(func=isin_security_match, weight=0.98),
Expand Down
68 changes: 48 additions & 20 deletions tests/matching/test_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from nomenklatura.matching.compare.names import name_fingerprint_levenshtein
from nomenklatura.matching.compare.names import person_name_jaro_winkler
from nomenklatura.matching.compare.names import weak_alias_match
from nomenklatura.matching.compare.names import org_name_partial_match
from nomenklatura.matching.compare.phonetic import person_name_phonetic_match
from nomenklatura.matching.compare.phonetic import name_metaphone_match
from nomenklatura.matching.compare.phonetic import name_soundex_match
Expand Down Expand Up @@ -38,21 +37,6 @@ def test_last_name_missmatch():
assert last_name_mismatch(main, other) == 0.0


def test_name_fingerprint_levenshtein():
main = e("Company", name="Siemens AG")
other = e("Company", name="Siemens Aktiengesellschaft")

assert name_fingerprint_levenshtein(main, other) == 1.0

other = e("Company", name="Siemens Aktiongesellschaft")
assert name_fingerprint_levenshtein(main, other) > 0.0
assert name_fingerprint_levenshtein(main, other) < 0.5

other = e("Company", name="Siemens AktG")
assert name_fingerprint_levenshtein(main, other) > 0.7
assert name_fingerprint_levenshtein(main, other) < 1.0


def test_arabic_name_similarity():
query = e("Person", name="Shaikh Isa Bin Tarif Al Bin Ali")
result = e("Person", name="Shaikh Isa Bin Tarif Al Bin Ali")
Expand Down Expand Up @@ -292,12 +276,56 @@ def test_weak_name_match():
assert weak_alias_match(query, result) == 1.0


def test_name_fingerprint_levenshtein():
query = e("Company", name="Siemens AG")
result = e("Company", name="Siemens Aktiengesellschaft")
assert name_fingerprint_levenshtein(query, result) == 1.0

# result = e("Company", name="Siemens Aktiongesellschaft")
# assert name_fingerprint_levenshtein(query, result) > 0.0
# assert name_fingerprint_levenshtein(query, result) < 0.5

result = e("Company", name="Siemens AktG")
assert name_fingerprint_levenshtein(query, result) > 0.7
assert name_fingerprint_levenshtein(query, result) < 1.0


def test_org_name_partial_match():
query = e("Company", name="CRYSTALORD LIMITED")
result = e("Company", name="CRYSTALORD LTD")
assert org_name_partial_match(query, result) == 1.0
assert name_fingerprint_levenshtein(query, result) == 1.0
query = e("Company", name="CRYSTALORD SYSTEMS LIMITED")
assert org_name_partial_match(query, result) < 0.7
assert org_name_partial_match(query, result) > 0.5
assert name_fingerprint_levenshtein(query, result) == 0.0
query = e("Company", name="CRYSTALORD")
assert org_name_partial_match(query, result) == 1.0
assert name_fingerprint_levenshtein(query, result) == 1.0
query = e("Company", name="CRISTALORD")
assert name_fingerprint_levenshtein(query, result) > 0.8


def test_org_name_example_1():
query = e("Company", name="faberlic")
result = e("Company", name="FABERLIC EUROPE Sp. z o.o.")
assert name_fingerprint_levenshtein(query, result) == 1.0
query = e("Company", name="faberlick")
assert name_fingerprint_levenshtein(query, result) > 0.8
assert name_fingerprint_levenshtein(query, result) < 1.0


def test_org_name_example_2():
query = e("Company", name="TACTICAL MISSILES CORPORATION JOINT STOCK COMPANY")
result = e("Company", name="TACTICAL MISSILES CORPORATION JOINT STOCK COMPANY")
assert name_fingerprint_levenshtein(query, result) == 1.0
result = e("Company", name="TACTICAL MISSILES CORPORATION JSC")
assert name_fingerprint_levenshtein(query, result) == 1.0
query = e("Company", name="TACTICAL MISSILES CORPORATION OJSC")
assert name_fingerprint_levenshtein(query, result) > 0.8
assert name_fingerprint_levenshtein(query, result) < 1.0


def test_org_name_example_3():
query = e("Company", name="Iskusstvo Krasoty")
result = e("Company", name="LIMITED LIABILITY COMPANY ISKUSSTVO KRASOTY")
assert name_fingerprint_levenshtein(query, result) == 1.0
query = e("Company", name="Iskustvo Krasoty")
assert name_fingerprint_levenshtein(query, result) > 0.9
assert name_fingerprint_levenshtein(query, result) < 1.0

0 comments on commit 12497d7

Please sign in to comment.