Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Regression v3 matcher #176

Open
wants to merge 20 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion nomenklatura/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from nomenklatura.cache import Cache
from nomenklatura.index import Index
from nomenklatura.matching import train_v2_matcher, train_v1_matcher
from nomenklatura.matching import train_v2_matcher, train_v1_matcher, train_v3_matcher
from nomenklatura.store import load_entity_file_store
from nomenklatura.resolver import Resolver
from nomenklatura.dataset import Dataset, DefaultDataset
Expand Down Expand Up @@ -191,6 +191,13 @@ def train_v2_matcher_(pairs_file: Path) -> None:
train_v2_matcher(pairs_file)


@cli.command(
    "train-v3-matcher",
    help="Train a matching model from judgement pairs",
)
@click.argument("pairs_file", type=InPath)
@click.option("-s", "--splits", type=int, default=1)
def train_v3_matcher_(pairs_file: Path, splits: int = 1) -> None:
    """Command-line entry point for training the v3 regression matcher.

    Forwards the pairs file path and the ``--splits`` value unchanged to
    ``train_v3_matcher``. The help text shown by click comes from the
    explicit ``help=`` argument above, not from this docstring.
    """
    train_v3_matcher(pairs_file, splits)


@cli.command("match", help="Generate matches from an enrichment source")
@click.argument("config", type=InPath)
@click.argument("entities", type=InPath)
Expand Down
Binary file added nomenklatura/data/regression-v3-alpha.pkl
Binary file not shown.
5 changes: 5 additions & 0 deletions nomenklatura/matching/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from nomenklatura.matching.regression_v1.train import train_matcher as train_v1_matcher
from nomenklatura.matching.regression_v2.model import RegressionV2
from nomenklatura.matching.regression_v2.train import train_matcher as train_v2_matcher
from nomenklatura.matching.regression_v3.model import RegressionV3
from nomenklatura.matching.regression_v3.train import train_matcher as train_v3_matcher
from nomenklatura.matching.name_based import NameMatcher, NameQualifiedMatcher
from nomenklatura.matching.logic import LogicV1
from nomenklatura.matching.types import ScoringAlgorithm
Expand All @@ -13,6 +15,7 @@
NameQualifiedMatcher,
RegressionV1,
RegressionV2,
RegressionV3,
]

DefaultAlgorithm = RegressionV2
Expand All @@ -31,6 +34,8 @@ def get_algorithm(name: str) -> Optional[Type[ScoringAlgorithm]]:
"train_v1_matcher",
"RegressionV2",
"train_v2_matcher",
"RegressionV3",
"train_v3_matcher",
"DefaultAlgorithm",
"ScoringAlgorithm",
"NameMatcher",
Expand Down
14 changes: 13 additions & 1 deletion nomenklatura/matching/compare/countries.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,23 @@
from followthemoney.proxy import E
from followthemoney.types import registry
import numpy as np

from nomenklatura.matching.util import type_pair
from nomenklatura.matching.compare.util import is_disjoint
from nomenklatura.matching.compare.util import has_overlap, is_disjoint


def country_mismatch(query: E, result: E) -> float:
    """Return 1.0 when ``is_disjoint`` reports the country values of the two
    entities as disjoint, otherwise 0.0."""
    query_countries, result_countries = type_pair(query, result, registry.country)
    if is_disjoint(query_countries, result_countries):
        return 1.0
    return 0.0


def country_match(query: E, result: E) -> float:
    """Signed country agreement between two entities.

    Returns 1.0 when the country values overlap, -1.0 when they are
    disjoint, and NaN when either entity has no country value (or when
    neither helper check applies).
    """
    query_countries, result_countries = type_pair(query, result, registry.country)
    # Without values on both sides there is nothing to compare.
    if not (query_countries and result_countries):
        return np.nan
    if has_overlap(query_countries, result_countries):
        return 1.0
    if is_disjoint(query_countries, result_countries):
        return -1.0
    return np.nan
55 changes: 55 additions & 0 deletions nomenklatura/matching/compare/dates.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
from typing import Iterable, Set
from prefixdate import Precision
from followthemoney.proxy import E
from rigour.text.distance import dam_levenshtein
from itertools import product
import numpy as np

from nomenklatura.matching.compare.util import has_overlap, is_disjoint
from nomenklatura.matching.util import props_pair


MAX_YEARS = 2


def _dates_precision(values: Iterable[str], precision: Precision) -> Set[str]:
dates = set()
for value in values:
Expand Down Expand Up @@ -70,3 +76,52 @@ def dob_year_disjoint(query: E, result: E) -> float:
if is_disjoint(query_years, result_years):
return 1.0
return 0.0


def dob_similarity(query: E, result: E) -> float:
    """
    Positive for matching or similar birth dates, negative for disjoint ones.

    The checks run in the order listed below and the first one that applies
    determines the score:

    nan: either entity has no ``birthDate`` value
    1.0: the day-precision dates overlap
    0.3: some pair of day-precision dates is within one ``dam_levenshtein``
         edit of each other
    -0.3: the day-precision dates are disjoint
    0.5: the year-precision dates overlap
    0.2: some pair of years is at most ``MAX_YEARS`` apart
    -0.2: dates exist but none of the above applies
    """
    query_dates, result_dates = props_pair(query, result, ["birthDate"])

    # missing data
    if not query_dates or not result_dates:
        return np.nan

    # exact match on precise dates
    result_days = _dates_precision(result_dates, Precision.DAY)
    query_days = _dates_precision(query_dates, Precision.DAY)
    if has_overlap(query_days, result_days):
        return 1.0

    # clerical errors on precise dates
    if any(dam_levenshtein(qd, rd) <= 1 for qd, rd in product(query_days, result_days)):
        return 0.3

    # precise dates available but have no common values; note that this is
    # decided before years are compared, so it wins over a shared year
    if is_disjoint(query_days, result_days):
        return -0.3

    # years overlap
    query_years = _dates_precision(query_dates, Precision.YEAR)
    result_years = _dates_precision(result_dates, Precision.YEAR)
    if has_overlap(query_years, result_years):
        return 0.5

    # years are close
    for qy, ry in product(query_years, result_years):
        if abs(int(qy) - int(ry)) <= MAX_YEARS:
            return 0.2

    # dates exist but are disjoint other than above options
    return -0.2
90 changes: 64 additions & 26 deletions nomenklatura/matching/compare/names.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Dict, Tuple
from typing import Callable, List, Dict, Tuple
from itertools import product
from followthemoney.proxy import E
from followthemoney.types import registry
Expand Down Expand Up @@ -68,7 +68,68 @@ def person_name_jaro_winkler(query: E, result: E) -> float:
return score


def name_fingerprint_levenshtein(query: E, result: E) -> float:
def aligned_levenshtein(qfp: str, rfp: str) -> float:
    """Compare two fingerprints after greedily pairing up their name parts.

    Name parts (minimum length 2) of the query are paired with the most
    similar parts of the result. If any query part is left without a
    partner the score is 0.0; otherwise the paired parts are concatenated
    on each side and the similarity of the two strings is returned.
    """
    query_parts = name_words(qfp, min_length=2)
    result_parts = name_words(rfp, min_length=2)
    # Offer the ASCII-cleaned result parts as extra pairing candidates.
    for ascii_part in name_words(clean_name_ascii(rfp), min_length=2):
        if ascii_part not in result_parts:
            result_parts.append(ascii_part)

    # Similarity of every distinct (query part, result part) combination.
    pair_scores: Dict[Tuple[str, str], float] = {
        (qpart, rpart): levenshtein_similarity(qpart, rpart)
        for qpart, rpart in product(set(query_parts), set(result_parts))
    }
    ranked = sorted(pair_scores.items(), key=lambda item: item[1], reverse=True)

    pairings: List[Tuple[str, str, float]] = []
    for (qpart, rpart), similarity in ranked:
        # A part is consumed when paired, but repeated parts may pair again.
        while qpart in query_parts and rpart in result_parts:
            query_parts.remove(qpart)
            result_parts.remove(rpart)
            pairings.append((qpart, rpart, similarity))

    # Every query part is expected to have found a counterpart.
    if query_parts:
        return 0.0
    query_joined = "".join(pairing[0] for pairing in pairings)
    result_joined = "".join(pairing[1] for pairing in pairings)
    return levenshtein_similarity(query_joined, result_joined)


def symmetric_aligned_levenshtein(qfp: str, rfp: str) -> float:
    """Aligned Levenshtein similarity that tolerates unmatched parts on either side.

    Name parts (minimum length 2) of both fingerprints are greedily paired
    by similarity. The similarity of the concatenated paired parts is then
    scaled by the larger of the two "share of parts that were paired"
    factors, so a name that is fully contained in the other is not
    penalised for the other side's extra parts.

    Returns 0.0 when either fingerprint yields no usable name parts.
    """
    qtokens = name_words(qfp, min_length=2)
    rtokens = name_words(rfp, min_length=2)
    qlen = len(qtokens)
    rlen = len(rtokens)
    # Nothing to align; this also protects the divisions below from a zero
    # denominator.
    if qlen == 0 or rlen == 0:
        return 0.0
    # NOTE(review): rlen is captured before the ASCII variants are appended,
    # so leftover variants can push rfactor below the true share of paired
    # result parts - confirm this is intended.
    for part in name_words(clean_name_ascii(rfp), min_length=2):
        if part not in rtokens:
            rtokens.append(part)

    scores: Dict[Tuple[str, str], float] = {}
    # compute all pairwise scores for name parts:
    for q, r in product(set(qtokens), set(rtokens)):
        scores[(q, r)] = levenshtein_similarity(q, r)
    aligned: List[Tuple[str, str, float]] = []
    # find the best pairing for each name part by score:
    for (q, r), score in sorted(scores.items(), key=lambda i: i[1], reverse=True):
        # one name part can only be used once, but can show up multiple times:
        while q in qtokens and r in rtokens:
            qtokens.remove(q)
            rtokens.remove(r)
            aligned.append((q, r, score))

    # share of the original parts on each side that found a partner:
    qfactor = (qlen - len(qtokens)) / qlen
    rfactor = (rlen - len(rtokens)) / rlen
    qaligned = "".join(p[0] for p in aligned)
    raligned = "".join(p[1] for p in aligned)
    score = levenshtein_similarity(qaligned, raligned)
    return score * max(qfactor, rfactor)


def name_fingerprint_levenshtein(
query: E,
result: E,
lev: Callable[[str, str], float] = aligned_levenshtein,
) -> float:
"""Two non-person entities have similar fingerprinted names. This includes
simplifying entity type names (e.g. "Limited" -> "Ltd") and uses the
Damerau-Levensthein string distance algorithm."""
Expand All @@ -85,30 +146,7 @@ def name_fingerprint_levenshtein(query: E, result: E) -> float:
continue
score = levenshtein_similarity(qfp.replace(" ", ""), rfp.replace(" ", ""))
max_score = max(max_score, score)
qtokens = name_words(qfp, min_length=2)
rtokens = name_words(rfp, min_length=2)
for part in name_words(clean_name_ascii(rfp), min_length=2):
if part not in rtokens:
rtokens.append(part)

scores: Dict[Tuple[str, str], float] = {}
# compute all pairwise scores for name parts:
for q, r in product(set(qtokens), set(rtokens)):
scores[(q, r)] = levenshtein_similarity(q, r)
aligned: List[Tuple[str, str, float]] = []
# find the best pairing for each name part by score:
for (q, r), score in sorted(scores.items(), key=lambda i: i[1], reverse=True):
# one name part can only be used once, but can show up multiple times:
while q in qtokens and r in rtokens:
qtokens.remove(q)
rtokens.remove(r)
aligned.append((q, r, score))
# assume there should be at least a candidate for each query name part:
if len(qtokens):
continue
qaligned = "".join(p[0] for p in aligned)
raligned = "".join(p[1] for p in aligned)
score = levenshtein_similarity(qaligned, raligned)
score = lev(qfp, rfp)
max_score = max(max_score, score)
return max_score

Expand Down
15 changes: 10 additions & 5 deletions nomenklatura/matching/pairs.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,27 @@


class JudgedPair(object):
    """
    A pair of two entities which have been judged to be the same
    (or not) by a user.

    Attributes:
        left: the first entity of the pair.
        right: the second entity of the pair.
        judgement: the user's decision about the pair.
        group: value carried alongside the pair and included in `to_dict`.
    """

    __slots__ = ("left", "right", "judgement", "group")

    def __init__(
        self, left: EntityProxy, right: EntityProxy, judgement: Judgement, group: int
    ) -> None:
        self.left = left
        self.right = right
        self.judgement = judgement
        # NOTE(review): read_pairs passes data.get("group", None), so this may
        # be None at runtime even though it is annotated as int - confirm
        # whether the annotation should be optional.
        self.group = group

    def to_dict(self) -> Dict[str, Any]:
        """Serialise the pair, both entities and the judgement value, to a dict."""
        return {
            "left": self.left.to_dict(),
            "right": self.right.to_dict(),
            "judgement": self.judgement.value,
            "group": self.group,
        }


Expand All @@ -38,4 +42,5 @@ def read_pairs(pairs_file: PathLike) -> Generator[JudgedPair, None, None]:
judgement = Judgement(data["judgement"])
if judgement not in (Judgement.POSITIVE, Judgement.NEGATIVE):
continue
yield JudgedPair(left_entity, right_entity, judgement)
group = data.get("group", None)
yield JudgedPair(left_entity, right_entity, judgement, group)
Empty file.
82 changes: 82 additions & 0 deletions nomenklatura/matching/regression_v3/misc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from followthemoney.proxy import E
from followthemoney.types import registry
import numpy as np

from nomenklatura.matching.regression_v1.util import tokenize_pair, compare_levenshtein
from nomenklatura.matching.compare.util import has_overlap, extract_numbers, is_disjoint
from nomenklatura.matching.util import props_pair, type_pair
from nomenklatura.matching.util import max_in_sets, has_schema
from nomenklatura.util import normalize_name


def birth_place(query: E, result: E) -> float:
    """Share of birth place tokens that both entities have in common."""
    query_tokens, result_tokens = tokenize_pair(
        props_pair(query, result, ["birthPlace"])
    )
    shared = len(query_tokens.intersection(result_tokens))
    smaller_side = min(len(query_tokens), len(result_tokens))
    # The divisor is floored at 2, so a single shared token scores at most 0.5.
    return float(shared) / float(max(2.0, smaller_side))


def address_match(query: E, result: E) -> float:
    """Best text similarity between the normalized addresses of two entities.

    Returns NaN when either entity has no address value.
    """
    query_addresses, result_addresses = type_pair(query, result, registry.address)
    if not query_addresses or not result_addresses:
        return np.nan
    query_normalized = [normalize_name(address) for address in query_addresses]
    result_normalized = [normalize_name(address) for address in result_addresses]
    return max_in_sets(query_normalized, result_normalized, compare_levenshtein)


def address_numbers(query: E, result: E) -> float:
    """Score the numbers found in the addresses of two entities.

    The score is the count of numbers present on both sides minus the count
    of numbers that appear in the query's addresses but not in the
    result's. It is returned as a float to match the annotation.
    """
    lv, rv = type_pair(query, result, registry.address)
    lvn = extract_numbers(lv)
    rvn = extract_numbers(rv)
    common = len(lvn.intersection(rvn))
    # Only numbers missing from the result side are counted against the score.
    disjoint = len(lvn.difference(rvn))
    return float(common - disjoint)


def phone_match(query: E, result: E) -> float:
    """Return 1.0 when the two entities share a phone number, otherwise 0.0."""
    query_phones, result_phones = type_pair(query, result, registry.phone)
    if has_overlap(query_phones, result_phones):
        return 1.0
    return 0.0


def email_match(query: E, result: E) -> float:
    """Return 1.0 when the two entities share an email address, otherwise 0.0."""
    query_emails, result_emails = type_pair(query, result, registry.email)
    if has_overlap(query_emails, result_emails):
        return 1.0
    return 0.0


def identifier_match(query: E, result: E) -> float:
    """Shared identifiers (e.g. passports, national ID cards, registration or
    tax numbers) between two entities.

    Always 0.0 when ``has_schema(query, result, "Organization")`` holds;
    that case is scored by a separate feature.
    """
    if has_schema(query, result, "Organization"):
        return 0.0
    query_ids, result_ids = type_pair(query, result, registry.identifier)
    if has_overlap(query_ids, result_ids):
        return 1.0
    return 0.0


def org_identifier_match(query: E, result: E) -> float:
    """Shared identifiers (e.g. registration or tax numbers) between two
    organizations or companies.

    Always 0.0 unless ``has_schema(query, result, "Organization")`` holds.
    """
    if has_schema(query, result, "Organization"):
        query_ids, result_ids = type_pair(query, result, registry.identifier)
        if has_overlap(query_ids, result_ids):
            return 1.0
    return 0.0


def position_country_mismatch(query: E, result: E) -> float:
    """Return 1.0 when two positions have disjoint country values.

    Returns 0.0 when the countries are not disjoint, or when
    ``has_schema(query, result, "Position")`` does not hold.
    """
    if not has_schema(query, result, "Position"):
        return 0.0
    lv, rv = type_pair(query, result, registry.country)
    # Return a float in both branches, matching the annotation.
    return 1.0 if is_disjoint(lv, rv) else 0.0


def security_isin_mismatch(query: E, result: E) -> float:
    """Return 1.0 when two securities carry disjoint ISIN codes.

    Returns 0.0 when the codes are not disjoint, or when
    ``has_schema(query, result, "Security")`` does not hold.
    """
    if has_schema(query, result, "Security"):
        query_isins, result_isins = props_pair(query, result, ["isin"])
        if is_disjoint(query_isins, result_isins):
            return 1.0
    return 0.0
Loading
Loading