Skip to content

Commit

Permalink
Merge pull request #169 from opensanctions/bench-matcher
Browse files Browse the repository at this point in the history
Little command to benchmark matchers
  • Loading branch information
jbothma authored Aug 27, 2024
2 parents ff196d7 + c3e540f commit c58bf25
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 0 deletions.
9 changes: 9 additions & 0 deletions nomenklatura/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from nomenklatura.stream import StreamEntity
from nomenklatura.xref import xref as run_xref
from nomenklatura.tui import dedupe_ui
from nomenklatura.matching.bench import bench_matcher

INDEX_SEGMENT = "xref-index"

Expand Down Expand Up @@ -308,5 +309,13 @@ def statements_aggregate(
write_entity(outfh, entity)


@cli.command("bench", help="Benchmark a matching algorithm")
@click.argument("name", type=str)
@click.argument("pairs_file", type=InPath)
@click.option("-n", "--number", type=int, default=1000)
def bench(name: str, pairs_file: Path, number: int = 1000) -> None:
    """CLI entry point: benchmark the matcher called ``name``.

    Forwards the matcher name, the pairs file path and the iteration count
    (``-n/--number``, 1000 unless overridden) to ``bench_matcher``.
    """
    bench_matcher(
        name=name,
        pairs_file=pairs_file,
        number=number,
    )


# Invoke the click command group only when this module is run as a script,
# not when it is imported.
if __name__ == "__main__":
    cli()
30 changes: 30 additions & 0 deletions nomenklatura/matching/bench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import datetime
from timeit import timeit
from itertools import cycle
import logging

from nomenklatura.matching import get_algorithm
from nomenklatura.matching.pairs import read_pairs
from nomenklatura.util import PathLike


log = logging.getLogger(__name__)


def bench_matcher(name: str, pairs_file: PathLike, number: int) -> None:
    """Time repeated pair comparisons performed by a named matcher.

    All pairs are read from ``pairs_file`` into memory up front so that file
    I/O is excluded from the timed section. The pairs are then cycled through
    endlessly, which allows ``number`` to exceed the count of pairs in the
    file. The total elapsed time is reported through the module logger.

    Args:
        name: Name of the matching algorithm, resolved via ``get_algorithm``.
        pairs_file: Path of the pairs file handed to ``read_pairs``.
        number: How many ``compare`` calls to time.

    Raises:
        ValueError: If ``get_algorithm`` returns ``None`` for ``name``, or if
            the pairs file yields no pairs while ``number`` is positive.
    """
    log.info("Loading pairs from %s", pairs_file)
    pairs = list(read_pairs(pairs_file))
    log.info("Read %d pairs", len(pairs))
    matcher = get_algorithm(name)
    if matcher is None:
        # Exceptions do not apply %-formatting to their arguments (unlike the
        # logging calls), so the message has to be formatted explicitly.
        raise ValueError(f"No matcher named {name}")
    log.info("Loaded %s", matcher.NAME)
    if not pairs and number > 0:
        # cycle() over an empty list is exhausted immediately; without this
        # guard the first next() would leak a bare StopIteration out of timeit.
        raise ValueError(f"No pairs found in {pairs_file}")
    infinite_pairs = cycle(pairs)

    def compare_one_pair() -> None:
        # One timed unit of work: fetch the next pair and compare its sides.
        pair = next(infinite_pairs)
        matcher.compare(pair.left, pair.right)

    log.info("Running benchmark for %d iterations", number)
    seconds = timeit(compare_one_pair, number=number)
    log.info("Total time %s", datetime.timedelta(seconds=seconds))

0 comments on commit c58bf25

Please sign in to comment.