diff --git a/npiai/tools/web/scraper/__test__/incremental.py b/npiai/tools/web/scraper/__test__/incremental.py index 453d27e..062f8c1 100644 --- a/npiai/tools/web/scraper/__test__/incremental.py +++ b/npiai/tools/web/scraper/__test__/incremental.py @@ -43,16 +43,20 @@ async def summarize(skip_item_hashes: Set[str] | None = None): start = time.monotonic() count = 0 hashes = set() + matched_hashes = set() async for chunk in stream: count += len(chunk["items"]) print("Chunk:", json.dumps(chunk, indent=2)) + matched_hashes.update(chunk["matched_hashes"]) for item in chunk["items"]: hashes.add(item["hash"]) end = time.monotonic() print(f"Summarized {count} items in {end - start:.2f} seconds") + print("Matched hashes:", matched_hashes) + print("Unmatched hashes:", hashes - matched_hashes) return hashes