Skip to content

Commit

Permalink
transliteration on phonetic search is too wonky
Browse files Browse the repository at this point in the history
  • Loading branch information
pudo committed Oct 6, 2023
1 parent ca3663d commit b417b96
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 7 deletions.
25 changes: 20 additions & 5 deletions nomenklatura/matching/compare/names.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from typing import List, Dict, Tuple
from typing import List, Dict, Tuple, Optional
from itertools import product
from followthemoney.proxy import E
from followthemoney.types import registry
from fingerprints.cleanup import clean_name_light
from fingerprints import clean_name_light, clean_entity_prefix, replace_types
from nomenklatura.util import names_word_list, list_intersection
from nomenklatura.util import fingerprint_name, normalize_name, jaro_winkler
from nomenklatura.util import phonetic_token, metaphone_token, soundex_token
Expand All @@ -12,10 +12,25 @@
from nomenklatura.matching.compare.util import compare_levenshtein


def _clean_phonetic_person(original: str) -> Optional[str]:
    """Normalize a person name without transliteration.

    The name is lower-cased, passed through ``clean_entity_prefix`` and the
    result is then handed to ``clean_name_light``.

    Previously the lower-cased, prefix-stripped text was computed and then
    thrown away because ``clean_name_light`` was called on ``original``;
    the intermediate value is now the one that gets cleaned.

    Returns whatever ``clean_name_light`` yields for the prepared text; the
    ``Optional`` annotation suggests this can be ``None`` (presumably when
    nothing usable is left -- confirm against the fingerprints library).
    """
    text = original.lower()
    text = clean_entity_prefix(text)
    # Feed the prepared text (not the raw input) into the light cleaner so
    # the two steps above actually take effect.
    return clean_name_light(text)


def _clean_phonetic_entity(original: str) -> Optional[str]:
    """Normalize a legal entity name without transliteration.

    The name is lower-cased, passed through ``clean_entity_prefix``, cleaned
    with ``clean_name_light`` and finally passed through ``replace_types``.

    Previously the lower-cased, prefix-stripped text was computed and then
    thrown away because ``clean_name_light`` was called on ``original``;
    the intermediate value is now the one that gets cleaned.

    Returns ``None`` when ``clean_name_light`` produces no value, otherwise
    the result of ``replace_types`` on the cleaned name.
    """
    text = original.lower()
    text = clean_entity_prefix(text)
    cleaned = clean_name_light(text)
    if cleaned is None:
        # The return annotation is Optional, so the light cleaner may hand
        # back None; do not forward that to replace_types.
        # NOTE(review): confirm replace_types does not accept None.
        return None
    return replace_types(cleaned)


def _phonetic_tokens(token: str) -> List[str]:
# Turn a single name into a list of phonetic tokens by calling
# names_word_list on a one-element list, with phonetic_token as the
# per-token processor and min_length=2.
# NOTE(review): this is a diff rendering. The two `normalizer=` lines below
# appear to be the removed side (normalize_name) and the added side
# (_clean_phonetic_person) of the change; only one may remain in real code,
# since a repeated keyword argument is a syntax error.
# presumably min_length=2 drops tokens shorter than two characters --
# confirm against names_word_list.
return names_word_list(
[token],
normalizer=normalize_name,
normalizer=_clean_phonetic_person,
processor=phonetic_token,
min_length=2,
)
Expand Down Expand Up @@ -46,7 +61,7 @@ def person_name_phonetic_match(query: E, result: E) -> float:
def _metaphone_tokens(token: str) -> List[str]:
# Turn a single name into a list of metaphone tokens by calling
# names_word_list on a one-element list, with metaphone_token as the
# per-token processor and min_length=2.
# NOTE(review): this is a diff rendering. The two `normalizer=` lines below
# appear to be the removed side (fingerprint_name) and the added side
# (_clean_phonetic_entity) of the change; only one may remain in real code,
# since a repeated keyword argument is a syntax error.
return names_word_list(
[token],
normalizer=fingerprint_name,
normalizer=_clean_phonetic_entity,
processor=metaphone_token,
min_length=2,
)
Expand All @@ -64,7 +79,7 @@ def name_metaphone_match(query: E, result: E) -> float:
def _soundex_tokens(token: str) -> List[str]:
# Turn a single name into a list of soundex tokens by calling
# names_word_list on a one-element list, with soundex_token as the
# per-token processor and min_length=2.
# NOTE(review): this is a diff rendering. The two `normalizer=` lines below
# appear to be the removed side (fingerprint_name) and the added side
# (_clean_phonetic_entity) of the change; only one may remain in real code,
# since a repeated keyword argument is a syntax error.
return names_word_list(
[token],
normalizer=fingerprint_name,
normalizer=_clean_phonetic_entity,
processor=soundex_token,
min_length=2,
)
Expand Down
15 changes: 13 additions & 2 deletions tests/matching/test_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,18 @@ def test_person_name_phonetic_match():
assert name_soundex_match(query, result) > 0.0
assert name_soundex_match(query, result) < 0.5

query = e("Person", name="Barack Obama")
result = e("Person", name="George Hussein Onyango Obama")
assert person_name_phonetic_match(query, result) < 0.7
result = e("Person", name="Բարակ Օբամա")
assert person_name_phonetic_match(query, result) < 0.7
result = e("Person", name="ジョージ")
assert person_name_phonetic_match(query, result) < 0.7
result = e("Person", name="Marie-Therese Abena Ondoa")
assert person_name_phonetic_match(query, result) < 0.7
result = e("Person", name="ماري تيريز أدينا أوندوا")
assert person_name_phonetic_match(query, result) < 0.7

query = e("Person", name="Shaikh Isa Bin Tarif Al Bin Ali")
result = e("Person", name="Shaikh Isa Bin Tarif Al Bin Ali")
assert person_name_phonetic_match(query, result) == 1.0
Expand Down Expand Up @@ -238,8 +250,7 @@ def test_person_name_jaro_winkler():
def test_name_alphabets():
query = e("Person", name="Ротенберг Аркадий")
result = e("Person", name="Arkadiii Romanovich Rotenberg")
assert person_name_phonetic_match(query, result) > 0.4
assert person_name_phonetic_match(query, result) < 1.0
assert person_name_phonetic_match(query, result) == 0.0
assert person_name_jaro_winkler(query, result) > 0.7

query = e("Person", name="Osama bin Laden")
Expand Down

0 comments on commit b417b96

Please sign in to comment.