diff --git a/nomenklatura/matching/compare/countries.py b/nomenklatura/matching/compare/countries.py index 358341df..3aa9d9fc 100644 --- a/nomenklatura/matching/compare/countries.py +++ b/nomenklatura/matching/compare/countries.py @@ -13,7 +13,7 @@ def country_mismatch(query: E, result: E) -> float: def country_match(query: E, result: E) -> float: - """Both entities are linked to the same country.""" + """Positive when some countries match. Negative when no countries match.""" qv, rv = type_pair(query, result, registry.country) if qv and rv: if has_overlap(qv, rv): diff --git a/nomenklatura/matching/compare/dates.py b/nomenklatura/matching/compare/dates.py index 420acb48..6c108463 100644 --- a/nomenklatura/matching/compare/dates.py +++ b/nomenklatura/matching/compare/dates.py @@ -80,6 +80,9 @@ def dob_year_disjoint(query: E, result: E) -> float: def dob_similarity(query: E, result: E) -> float: """ + Positive for matching or similar dates, negative for disjoint dates. + Lower precision values result in dampened scores. + 1.0: precise dates match 0.75: years match 0.5: dates within 1 edit from each other diff --git a/nomenklatura/matching/regression_v3/model.py b/nomenklatura/matching/regression_v3/model.py index 031c7c56..1dde3c56 100644 --- a/nomenklatura/matching/regression_v3/model.py +++ b/nomenklatura/matching/regression_v3/model.py @@ -10,7 +10,11 @@ from nomenklatura.matching.regression_v3.names import family_name_match from nomenklatura.matching.regression_v3.names import name_levenshtein, name_match from nomenklatura.matching.regression_v3.names import name_token_overlap, name_numbers -from nomenklatura.matching.regression_v3.misc import phone_match, email_match, position_country_mismatch +from nomenklatura.matching.regression_v3.misc import ( + phone_match, + email_match, + position_country_mismatch, +) from nomenklatura.matching.regression_v3.misc import address_match, address_numbers from nomenklatura.matching.regression_v3.misc import identifier_match, birth_place from nomenklatura.matching.regression_v3.misc import org_identifier_match @@ -26,9 +30,18 @@ class RegressionV3(ScoringAlgorithm): - """A simple matching algorithm based on a regression model.""" + """ + This is an experimental release and not recommended for production use. - NAME = "regression-v3" + A simple matching algorithm based on a regression model considering name + part alignment and date precision. + + Similarity is rewarded and dissimilarity is penalised for country and date of birth. + + Security-type entity similarity is heavily dependent on ISIN match. + """ + + NAME = "regression-v3-alpha" MODEL_PATH = DATA_PATH.joinpath(f"{NAME}.pkl") FEATURES: List[CompareFunction] = [ name_numbers, diff --git a/nomenklatura/matching/regression_v3/names.py b/nomenklatura/matching/regression_v3/names.py index d2b42e04..41e7bff8 100644 --- a/nomenklatura/matching/regression_v3/names.py +++ b/nomenklatura/matching/regression_v3/names.py @@ -6,7 +6,11 @@ from nomenklatura.matching.regression_v3.util import tokenize_pair, compare_levenshtein from nomenklatura.matching.compare.util import is_disjoint, has_overlap, extract_numbers -from nomenklatura.matching.compare.names import aligned_levenshtein, name_fingerprint_levenshtein, symmetric_aligned_levenshtein +from nomenklatura.matching.compare.names import ( + aligned_levenshtein, + name_fingerprint_levenshtein, + symmetric_aligned_levenshtein, +) from nomenklatura.matching.util import has_schema, props_pair, type_pair from nomenklatura.matching.util import max_in_sets from nomenklatura.util import fingerprint_name @@ -97,7 +101,12 @@ def name_numbers(left: E, right: E) -> float: def name_similarity(left: E, right: E) -> float: """Compute the similarity between the names of two entities, picking the max from a full string match, token overlap-based score, and levenshtein distance-based - score.""" + score. + + Full name match rewards longer names up to 10 parts. + + The levenshtein approach first aligns name parts to find the most similar arrangement + """ return max( [ name_match(left, right),