Skip to content

Commit

Permalink
Merge pull request #168 from opensanctions/micro-optimizations
Browse files Browse the repository at this point in the history
shortcut when finding max value
  • Loading branch information
SimonThordal authored Aug 21, 2024
2 parents 5fa94c2 + 0b57eec commit ff196d7
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 10 deletions.
4 changes: 2 additions & 2 deletions nomenklatura/matching/compare/identifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from followthemoney.types import registry
from rigour.text.distance import levenshtein

from nomenklatura.matching.util import type_pair, props_pair, has_schema, compare_sets
from nomenklatura.matching.util import type_pair, props_pair, has_schema, max_in_sets
from nomenklatura.matching.compare.util import has_overlap, clean_map, CleanFunc


Expand Down Expand Up @@ -99,7 +99,7 @@ def orgid_disjoint(query: E, result: E) -> float:
return 0.0
if len(query_ids.intersection(result_ids)) > 0:
return 0.0
return 1 - compare_sets(query_ids, result_ids, _nq_compare_identifiers)
return 1 - max_in_sets(query_ids, result_ids, _nq_compare_identifiers)


def identifier_match(query: E, result: E) -> float:
Expand Down
4 changes: 2 additions & 2 deletions nomenklatura/matching/regression_v1/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from nomenklatura.matching.regression_v1.util import tokenize_pair, compare_levenshtein
from nomenklatura.matching.compare.util import has_overlap, extract_numbers
from nomenklatura.matching.util import props_pair, type_pair
from nomenklatura.matching.util import compare_sets, has_schema
from nomenklatura.matching.util import max_in_sets, has_schema
from nomenklatura.util import normalize_name


Expand All @@ -20,7 +20,7 @@ def address_match(query: E, result: E) -> float:
lv, rv = type_pair(query, result, registry.address)
lvn = [normalize_name(v) for v in lv]
rvn = [normalize_name(v) for v in rv]
return compare_sets(lvn, rvn, compare_levenshtein)
return max_in_sets(lvn, rvn, compare_levenshtein)


def address_numbers(query: E, result: E) -> float:
Expand Down
4 changes: 2 additions & 2 deletions nomenklatura/matching/regression_v1/names.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from nomenklatura.matching.regression_v1.util import tokenize_pair, compare_levenshtein
from nomenklatura.matching.compare.util import is_disjoint, has_overlap, extract_numbers
from nomenklatura.matching.util import props_pair, type_pair
from nomenklatura.matching.util import compare_sets
from nomenklatura.matching.util import max_in_sets
from nomenklatura.util import fingerprint_name


Expand All @@ -23,7 +23,7 @@ def name_levenshtein(left: E, right: E) -> float:
similar names linked to both entities."""
lv, rv = type_pair(left, right, registry.name)
lvn, rvn = normalize_names(lv), normalize_names(rv)
return compare_sets(lvn, rvn, compare_levenshtein)
return max_in_sets(lvn, rvn, compare_levenshtein)


def first_name_match(left: E, right: E) -> float:
Expand Down
4 changes: 2 additions & 2 deletions nomenklatura/matching/regression_v2/names.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from followthemoney.types import registry
from rigour.text.distance import levenshtein

from nomenklatura.matching.util import compare_sets, props_pair, type_pair
from nomenklatura.matching.util import max_in_sets, props_pair, type_pair
from nomenklatura.matching.compare.util import is_disjoint, has_overlap
from nomenklatura.matching.compare.util import extract_numbers
from nomenklatura.util import fingerprint_name, names_word_list, soundex_token
Expand Down Expand Up @@ -40,7 +40,7 @@ def name_levenshtein(left: E, right: E) -> float:
lv, rv = type_pair(left, right, registry.name)
lvp = _name_norms(lv)
rvp = _name_norms(rv)
return compare_sets(lvp, rvp, _compare_levenshtein)
return max_in_sets(lvp, rvp, _compare_levenshtein)


def first_name_match(left: E, right: E) -> float:
Expand Down
24 changes: 22 additions & 2 deletions nomenklatura/matching/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,35 @@ def type_pair(left: E, right: E, type_: PropertyType) -> Tuple[List[str], List[s
return left_values, right_values


def max_in_sets(
    left: Iterable[Optional[V]],
    right: Iterable[Optional[V]],
    compare_func: Callable[[V, V], float],
    max_res: float = 1.0,
) -> float:
    """Return the best pair-wise score between the values of `left` and `right`.

    Each combination of one left and one right value is scored with
    `compare_func`; combinations in which either value is None are skipped.
    The running best starts at 0.0, so that is the result when nothing scores
    higher. As soon as the running best reaches `max_res` it is returned
    without scoring the remaining combinations.
    """
    best: float = 0.0
    for lhs, rhs in product(left, right):
        if lhs is not None and rhs is not None:
            score = compare_func(lhs, rhs)
            if score <= best:
                # Not an improvement over what we already have.
                continue
            best = score
            if best >= max_res:
                # Reached the caller-supplied ceiling; stop comparing.
                return best
    return best


def compare_sets(
left: Iterable[Optional[V]],
right: Iterable[Optional[V]],
compare_func: Callable[[V, V], float],
select_func: Callable[[Iterable[float]], float] = max,
) -> float:
"""Compare two sets of values pair-wise and select the highest-scored result."""
"""Compare two sets of values pair-wise and select a return value from select_func."""
results: List[float] = []
for (le, ri) in product(left, right):
for le, ri in product(left, right):
if le is None or ri is None:
continue
results.append(compare_func(le, ri))
Expand Down

0 comments on commit ff196d7

Please sign in to comment.