diff --git a/contracting_process/field_level/report_examples.py b/contracting_process/field_level/report_examples.py index 8008f3f..9903303 100644 --- a/contracting_process/field_level/report_examples.py +++ b/contracting_process/field_level/report_examples.py @@ -8,7 +8,7 @@ logger = logging.getLogger("pelican.contracting_process.field_level.report_examples") -examples_cap = 20 +sample_size = 20 def create(dataset_id): @@ -45,8 +45,8 @@ def create(dataset_id): "checks": {}, "passed_examples": [], "failed_examples": [], - "passed_sampler": ReservoirSampler(examples_cap), - "failed_sampler": ReservoirSampler(examples_cap), + "passed_sampler": ReservoirSampler(sample_size), + "failed_sampler": ReservoirSampler(sample_size), } for _, check_name in checks: @@ -59,8 +59,8 @@ def create(dataset_id): examples[path][key]["checks"][check_name] = { "passed_examples": [], "failed_examples": [], - "passed_sampler": ReservoirSampler(examples_cap), - "failed_sampler": ReservoirSampler(examples_cap), + "passed_sampler": ReservoirSampler(sample_size), + "failed_sampler": ReservoirSampler(sample_size), } logger.info("Starting processing pages.") @@ -114,15 +114,15 @@ def create(dataset_id): logger.info("Storing examples for field level checks for dataset_id %s", dataset_id) for path, path_checks in examples.items(): for key in ("coverage", "quality"): - path_checks[key]["passed_examples"] = path_checks[key]["passed_sampler"].retrieve_samples() - path_checks[key]["failed_examples"] = path_checks[key]["failed_sampler"].retrieve_samples() + path_checks[key]["passed_examples"] = path_checks[key]["passed_sampler"].sample + path_checks[key]["failed_examples"] = path_checks[key]["failed_sampler"].sample del path_checks[key]["passed_sampler"] del path_checks[key]["failed_sampler"] for check_name, check in path_checks[key]["checks"].items(): - check["passed_examples"] = check["passed_sampler"].retrieve_samples() - check["failed_examples"] = check["failed_sampler"].retrieve_samples() + check["passed_examples"] = check["passed_sampler"].sample + check["failed_examples"] = check["failed_sampler"].sample del check["passed_sampler"] del check["failed_sampler"] diff --git a/contracting_process/resource_level/examples.py b/contracting_process/resource_level/examples.py index 6539f40..20c8ea4 100644 --- a/contracting_process/resource_level/examples.py +++ b/contracting_process/resource_level/examples.py @@ -4,7 +4,7 @@ from pelican.util.checks import ReservoirSampler from pelican.util.services import commit, get_cursor -examples_cap = 20 +sample_size = 20 def create(dataset_id): @@ -17,9 +17,9 @@ def create(dataset_id): check_samplers = { check_name: { - "passed": ReservoirSampler(examples_cap), - "failed": ReservoirSampler(examples_cap), - "undefined": ReservoirSampler(examples_cap), + "passed": ReservoirSampler(sample_size), + "failed": ReservoirSampler(sample_size), + "undefined": ReservoirSampler(sample_size), } for check_name in definitions } diff --git a/dataset/consistent/related_process_title.py b/dataset/consistent/related_process_title.py index 457be92..9a29679 100644 --- a/dataset/consistent/related_process_title.py +++ b/dataset/consistent/related_process_title.py @@ -4,7 +4,7 @@ from pelican.util.getter import get_values version = 1.0 -examples_cap = 100 +sample_size = 100 def add_item(scope, item, item_id): @@ -125,20 +125,20 @@ def pick_examples(scope, related_process_key, result): } if result: - if scope["meta"]["total_passed"] < examples_cap: + if scope["meta"]["total_passed"] < sample_size: scope["meta"]["passed_examples"].append(example) else: r = random.randint(0, scope["meta"]["total_passed"]) - if r < examples_cap: + if r < sample_size: scope["meta"]["passed_examples"][r] = example scope["meta"]["total_passed"] += 1 else: - if scope["meta"]["total_failed"] < examples_cap: + if scope["meta"]["total_failed"] < sample_size: scope["meta"]["failed_examples"].append(example) else: r = random.randint(0, scope["meta"]["total_failed"]) - if r < examples_cap: + if r < sample_size: scope["meta"]["failed_examples"][r] = example scope["meta"]["total_failed"] += 1 diff --git a/dataset/distribution/buyer.py b/dataset/distribution/buyer.py index b9b519f..51ef430 100644 --- a/dataset/distribution/buyer.py +++ b/dataset/distribution/buyer.py @@ -3,7 +3,7 @@ version = 1.0 min_resources_num = 1000 -examples_cap = 20 +sample_size = 20 def add_item(scope, item, item_id): @@ -54,7 +54,7 @@ def get_result(scope): "100+": {"total_ocid_count": 0, "total_buyer_count": 0}, } - buyer_with_one_ocid_sampler = ReservoirSampler(examples_cap) + buyer_with_one_ocid_sampler = ReservoirSampler(sample_size) # filling in the histogram for value in scope["buyers"].values(): @@ -87,7 +87,7 @@ def get_result(scope): "counts": ocid_histogram, "total_ocid_count": scope["total_ocid_count"], "total_buyer_count": len(scope["buyers"]), - "examples": buyer_with_one_ocid_sampler.retrieve_samples(), + "examples": buyer_with_one_ocid_sampler.sample, } else: result["meta"] = {"reason": "no data items were processed"} diff --git a/dataset/distribution/buyer_repetition.py b/dataset/distribution/buyer_repetition.py index 45d3154..80d255a 100644 --- a/dataset/distribution/buyer_repetition.py +++ b/dataset/distribution/buyer_repetition.py @@ -3,7 +3,7 @@ version = 1.0 min_resources_num = 1000 -examples_cap = 20 +sample_size = 20 def add_item(scope, item, item_id): @@ -33,7 +33,7 @@ def add_item(scope, item, item_id): key, { "total_ocid_count": 0, - "sampler": ReservoirSampler(examples_cap), + "sampler": ReservoirSampler(sample_size), }, ) scope["buyers"][key]["total_ocid_count"] += 1 @@ -65,7 +65,7 @@ def get_result(scope): "total_ocid_count": scope["total_ocid_count"], "ocid_count": biggest_buyer["total_ocid_count"], "ocid_share": biggest_buyer["total_ocid_count"] / scope["total_ocid_count"], - "examples": biggest_buyer["sampler"].retrieve_samples(), + "examples": biggest_buyer["sampler"].sample, "specifics": {"buyer.identifier.id": biggest_buyer_id, "buyer.identifier.scheme": biggest_buyer_scheme}, } diff --git a/dataset/distribution/code_distribution.py b/dataset/distribution/code_distribution.py index bc0fd6a..b8703aa 100644 --- a/dataset/distribution/code_distribution.py +++ b/dataset/distribution/code_distribution.py @@ -43,7 +43,7 @@ def get_result(self, scope): passed = True for key, value in scope.items(): value["share"] = value["count"] / total_count - value["examples"] = value["sampler"].retrieve_samples() + value["examples"] = value["sampler"].sample del value["sampler"] if key in self.test_values: diff --git a/dataset/distribution/value_repetition.py b/dataset/distribution/value_repetition.py index f547ce3..9ccb7b2 100644 --- a/dataset/distribution/value_repetition.py +++ b/dataset/distribution/value_repetition.py @@ -10,7 +10,7 @@ from pelican.util.getter import get_values version = 1.0 -examples_cap = 10 +sample_size = 10 most_frequent_cap = 5 most_frequent_computation = 3 @@ -51,11 +51,11 @@ def add_item(scope, item, item_id, path): } # reservoir sampling - if scope[key]["count"] < examples_cap: + if scope[key]["count"] < sample_size: scope[key]["examples"].append({"item_id": item_id, "ocid": ocid}) else: r = random.randint(0, scope[key]["count"]) - if r < examples_cap: + if r < sample_size: scope[key]["examples"][r] = {"item_id": item_id, "ocid": ocid} scope[key]["count"] += 1 diff --git a/dataset/reference/related_process_identifier.py b/dataset/reference/related_process_identifier.py index b2f24e3..d07a205 100644 --- a/dataset/reference/related_process_identifier.py +++ b/dataset/reference/related_process_identifier.py @@ -4,7 +4,7 @@ from pelican.util.getter import get_values version = 2.0 -examples_cap = 100 +sample_size = 100 def add_item(scope, item, item_id): @@ -95,20 +95,20 @@ def pick_examples(scope, related_process_key, result): } if result: - if scope["meta"]["total_passed"] < examples_cap: + if scope["meta"]["total_passed"] < sample_size: scope["meta"]["passed_examples"].append(example) else: r = random.randint(0, scope["meta"]["total_passed"]) - if r < examples_cap: + if r < sample_size: scope["meta"]["passed_examples"][r] = example scope["meta"]["total_passed"] += 1 else: - if scope["meta"]["total_failed"] < examples_cap: + if scope["meta"]["total_failed"] < sample_size: scope["meta"]["failed_examples"].append(example) else: r = random.randint(0, scope["meta"]["total_failed"]) - if r < examples_cap: + if r < sample_size: scope["meta"]["failed_examples"][r] = example scope["meta"]["total_failed"] += 1 diff --git a/dataset/unique/tender_id.py b/dataset/unique/tender_id.py index af6e30c..cd05999 100644 --- a/dataset/unique/tender_id.py +++ b/dataset/unique/tender_id.py @@ -5,6 +5,7 @@ from pelican.util.getter import get_values version = 2.0 +sample_size = 100 def add_item(scope, item, item_id): @@ -45,8 +46,8 @@ def get_result(scope): passed_releases_count = sum(len(v) for v in scope["tender_id_mapping"].values() if len(v) == 1) result["result"] = relevant_releases_count == passed_releases_count - passed_examples_sampler = ReservoirSampler(100) - failed_examples_sampler = ReservoirSampler(100) + passed_examples_sampler = ReservoirSampler(sample_size) + failed_examples_sampler = ReservoirSampler(sample_size) for tender_id, items in scope["tender_id_mapping"].items(): main_item = random.choice(items) sample = { @@ -60,8 +61,8 @@ def get_result(scope): else: failed_examples_sampler.process(sample) - result["meta"]["passed_examples"] = passed_examples_sampler.retrieve_samples() - result["meta"]["failed_examples"] = failed_examples_sampler.retrieve_samples() + result["meta"]["passed_examples"] = passed_examples_sampler.sample + result["meta"]["failed_examples"] = failed_examples_sampler.sample result["value"] = 100 * passed_releases_count / relevant_releases_count result["meta"]["total_processed"] = relevant_releases_count diff --git a/pelican/util/checks.py b/pelican/util/checks.py index b3bc948..8f1db59 100644 --- a/pelican/util/checks.py +++ b/pelican/util/checks.py @@ -252,19 +252,16 @@ def __init__(self, limit: int): raise ValueError("limit must be a positive integer") self._limit = limit - self._samples: list[Any] = [] self._index = 0 + self.sample = [] # https://en.wikipedia.org/wiki/Reservoir_sampling def process(self, value: Any) -> None: if self._index < self._limit: - self._samples.append(value) + self.sample.append(value) else: r = random.randint(0, self._index) if r < self._limit: - self._samples[r] = value + self.sample[r] = value self._index += 1 - - def retrieve_samples(self) -> list[Any]: - return self._samples diff --git a/tests/dataset/distribution/test_buyer.py b/tests/dataset/distribution/test_buyer.py index d944905..ba4ff07 100644 --- a/tests/dataset/distribution/test_buyer.py +++ b/tests/dataset/distribution/test_buyer.py @@ -87,7 +87,7 @@ def test_failed(): assert result["meta"]["total_buyer_count"] == buyer.min_resources_num assert result["meta"]["counts"]["1"]["total_ocid_count"] == buyer.min_resources_num assert result["meta"]["counts"]["1"]["total_buyer_count"] == buyer.min_resources_num - assert len(result["meta"]["examples"]) == buyer.examples_cap + assert len(result["meta"]["examples"]) == buyer.sample_size scope = {} @@ -105,7 +105,7 @@ def test_failed(): assert result["meta"]["counts"]["1"]["total_buyer_count"] == buyer.min_resources_num assert result["meta"]["counts"]["100+"]["total_ocid_count"] == buyer.min_resources_num assert result["meta"]["counts"]["100+"]["total_buyer_count"] == 1 - assert len(result["meta"]["examples"]) == buyer.examples_cap + assert len(result["meta"]["examples"]) == buyer.sample_size items_test_passed_multiple = [] @@ -159,4 +159,4 @@ def test_passed_multiple(): assert result["meta"]["counts"]["21_50"]["total_ocid_count"] == 100 * 21 assert result["meta"]["counts"]["51_100"]["total_ocid_count"] == 100 * 51 assert result["meta"]["counts"]["100+"]["total_ocid_count"] == 100 * 101 - assert len(result["meta"]["examples"]) == buyer.examples_cap + assert len(result["meta"]["examples"]) == buyer.sample_size diff --git a/tests/tools/test_checks.py b/tests/tools/test_checks.py index 701f5d3..0a4efeb 100644 --- a/tests/tools/test_checks.py +++ b/tests/tools/test_checks.py @@ -13,20 +13,17 @@ def test_reservoir_sampler(): sampler = ReservoirSampler(5) for i in range(3): sampler.process(i) - samples = sampler.retrieve_samples() - assert len(samples) == 3 - assert all(i in samples for i in range(3)) + assert len(sampler.sample) == 3 + assert all(i in sampler.sample for i in range(3)) sampler = ReservoirSampler(5) for i in range(5): sampler.process(i) - samples = sampler.retrieve_samples() - assert len(samples) == 5 - assert all(i in samples for i in range(5)) + assert len(sampler.sample) == 5 + assert all(i in sampler.sample for i in range(5)) sampler = ReservoirSampler(5) for i in range(10): sampler.process(i) - samples = sampler.retrieve_samples() - assert len(samples) == 5 - assert all(example in range(10) for example in samples) + assert len(sampler.sample) == 5 + assert all(i in range(10) for i in sampler.sample)