Skip to content

Commit

Permalink
Use new normality functions to do phonetic matching more precisely
Browse files (browse the repository at this point in the history)
  • Loading branch information
pudo committed Oct 7, 2023
1 parent 0b8fb94 commit 4cebb37
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 13 deletions.
20 changes: 9 additions & 11 deletions nomenklatura/matching/compare/phonetic.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,28 +2,26 @@
from itertools import product
from followthemoney.proxy import E
from followthemoney.types import registry
from normality.cleaning import decompose_nfkd, category_replace
from fingerprints import clean_name_light, clean_entity_prefix, replace_types
from nomenklatura.util import names_word_list, list_intersection
from normality.scripts import is_modern_alphabet
from fingerprints import clean_name_ascii, clean_entity_prefix
from nomenklatura.util import names_word_list, list_intersection, fingerprint_name
from nomenklatura.util import phonetic_token, metaphone_token, soundex_token
from nomenklatura.matching.util import type_pair, has_schema


def _clean_phonetic_person(original: str) -> Optional[str]:
"""Normalize a person name without transliteration."""
if not is_modern_alphabet(original):
return None
text = clean_entity_prefix(original)
cleaned = clean_name_light(text)
cleaned = decompose_nfkd(cleaned)
return category_replace(cleaned)
return clean_name_ascii(text)


def _clean_phonetic_entity(original: str) -> Optional[str]:
"""Normalize a legal entity name without transliteration."""
text = clean_entity_prefix(original)
cleaned = clean_name_light(text)
cleaned = decompose_nfkd(cleaned)
cleaned = category_replace(cleaned)
return replace_types(cleaned)
if not is_modern_alphabet(original):
return None
return fingerprint_name(original)


def _phonetic_tokens(token: str) -> List[str]:
Expand Down
5 changes: 3 additions & 2 deletions tests/matching/test_names.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -132,7 +132,7 @@ def test_person_name_phonetic_match():
result = e("Person", name="George Hussein Onyango Obama")
assert person_name_phonetic_match(query, result) < 0.7
result = e("Person", name="Բարակ Օբամա")
assert person_name_phonetic_match(query, result) < 0.7
assert person_name_phonetic_match(query, result) > 0.7
result = e("Person", name="ジョージ")
assert person_name_phonetic_match(query, result) < 0.7
result = e("Person", name="Marie-Therese Abena Ondoa")
Expand Down Expand Up @@ -254,7 +254,8 @@ def test_jaro_lindemann():
def test_name_alphabets():
query = e("Person", name="Ротенберг Аркадий")
result = e("Person", name="Arkadiii Romanovich Rotenberg")
assert person_name_phonetic_match(query, result) == 0.0
assert person_name_phonetic_match(query, result) > 0.0
assert person_name_phonetic_match(query, result) < 0.7
assert person_name_jaro_winkler(query, result) > 0.7

query = e("Person", name="Osama bin Laden")
Expand Down

0 comments on commit 4cebb37

Please sign in to comment.