Skip to content

Commit

Permalink
Rename regression-v3 to regression-v3-alpha and document the matcher a bit better
Browse files Browse the repository at this point in the history
  • Loading branch information
jbothma committed Dec 10, 2024
1 parent 14c9f8a commit 0aeef40
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 6 deletions.
2 changes: 1 addition & 1 deletion nomenklatura/matching/compare/countries.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def country_mismatch(query: E, result: E) -> float:


def country_match(query: E, result: E) -> float:
"""Both entities are linked to the same country."""
"""Positive when some countries match. Negative when no countries match."""
qv, rv = type_pair(query, result, registry.country)
if qv and rv:
if has_overlap(qv, rv):
Expand Down
3 changes: 3 additions & 0 deletions nomenklatura/matching/compare/dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@ def dob_year_disjoint(query: E, result: E) -> float:

def dob_similarity(query: E, result: E) -> float:
"""
Positive for matching or similar dates, negative for disjoint dates.
Lower precision values result in dampened scores.
1.0: precise dates match
0.75: years match
0.5: dates within 1 edit from each other
Expand Down
19 changes: 16 additions & 3 deletions nomenklatura/matching/regression_v3/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,11 @@
from nomenklatura.matching.regression_v3.names import family_name_match
from nomenklatura.matching.regression_v3.names import name_levenshtein, name_match
from nomenklatura.matching.regression_v3.names import name_token_overlap, name_numbers
from nomenklatura.matching.regression_v3.misc import phone_match, email_match, position_country_mismatch
from nomenklatura.matching.regression_v3.misc import (
phone_match,
email_match,
position_country_mismatch,
)
from nomenklatura.matching.regression_v3.misc import address_match, address_numbers
from nomenklatura.matching.regression_v3.misc import identifier_match, birth_place
from nomenklatura.matching.regression_v3.misc import org_identifier_match
Expand All @@ -26,9 +30,18 @@


class RegressionV3(ScoringAlgorithm):
"""A simple matching algorithm based on a regression model."""
"""
This is an experimental release and not recommended for production use.
NAME = "regression-v3"
A simple matching algorithm based on a regression model considering name
part alignment and date precision.
Similarity is rewarded and dissimilarity is penalised for country and date of birth.
Security-type entity similarity is heavily dependent on ISIN match.
"""

NAME = "regression-v3-alpha"
MODEL_PATH = DATA_PATH.joinpath(f"{NAME}.pkl")
FEATURES: List[CompareFunction] = [
name_numbers,
Expand Down
13 changes: 11 additions & 2 deletions nomenklatura/matching/regression_v3/names.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@

from nomenklatura.matching.regression_v3.util import tokenize_pair, compare_levenshtein
from nomenklatura.matching.compare.util import is_disjoint, has_overlap, extract_numbers
from nomenklatura.matching.compare.names import aligned_levenshtein, name_fingerprint_levenshtein, symmetric_aligned_levenshtein
from nomenklatura.matching.compare.names import (
aligned_levenshtein,
name_fingerprint_levenshtein,
symmetric_aligned_levenshtein,
)
from nomenklatura.matching.util import has_schema, props_pair, type_pair
from nomenklatura.matching.util import max_in_sets
from nomenklatura.util import fingerprint_name
Expand Down Expand Up @@ -97,7 +101,12 @@ def name_numbers(left: E, right: E) -> float:
def name_similarity(left: E, right: E) -> float:
"""Compute the similarity between the names of two entities, picking the max from
a full string match, token overlap-based score, and levenshtein distance-based
score."""
score.
Full name match rewards longer names up to 10 parts.
The levenshtein approach first aligns name parts to find the most similar arrangement.
"""
return max(
[
name_match(left, right),
Expand Down

0 comments on commit 0aeef40

Please sign in to comment.