Skip to content

Commit

Permalink
chore: Rename examples_cap to sample_size. Change retrieve_samples() function to sample attribute. Define sample_size for dataset unique.tender_id.
Browse files Browse the repository at this point in the history
  • Loading branch information
jpmckinney committed Aug 10, 2023
1 parent 3fb9c82 commit bd2907c
Show file tree
Hide file tree
Showing 12 changed files with 50 additions and 55 deletions.
18 changes: 9 additions & 9 deletions contracting_process/field_level/report_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

logger = logging.getLogger("pelican.contracting_process.field_level.report_examples")

examples_cap = 20
sample_size = 20


def create(dataset_id):
Expand Down Expand Up @@ -45,8 +45,8 @@ def create(dataset_id):
"checks": {},
"passed_examples": [],
"failed_examples": [],
"passed_sampler": ReservoirSampler(examples_cap),
"failed_sampler": ReservoirSampler(examples_cap),
"passed_sampler": ReservoirSampler(sample_size),
"failed_sampler": ReservoirSampler(sample_size),
}

for _, check_name in checks:
Expand All @@ -59,8 +59,8 @@ def create(dataset_id):
examples[path][key]["checks"][check_name] = {
"passed_examples": [],
"failed_examples": [],
"passed_sampler": ReservoirSampler(examples_cap),
"failed_sampler": ReservoirSampler(examples_cap),
"passed_sampler": ReservoirSampler(sample_size),
"failed_sampler": ReservoirSampler(sample_size),
}

logger.info("Starting processing pages.")
Expand Down Expand Up @@ -114,15 +114,15 @@ def create(dataset_id):
logger.info("Storing examples for field level checks for dataset_id %s", dataset_id)
for path, path_checks in examples.items():
for key in ("coverage", "quality"):
path_checks[key]["passed_examples"] = path_checks[key]["passed_sampler"].retrieve_samples()
path_checks[key]["failed_examples"] = path_checks[key]["failed_sampler"].retrieve_samples()
path_checks[key]["passed_examples"] = path_checks[key]["passed_sampler"].sample
path_checks[key]["failed_examples"] = path_checks[key]["failed_sampler"].sample

del path_checks[key]["passed_sampler"]
del path_checks[key]["failed_sampler"]

for check_name, check in path_checks[key]["checks"].items():
check["passed_examples"] = check["passed_sampler"].retrieve_samples()
check["failed_examples"] = check["failed_sampler"].retrieve_samples()
check["passed_examples"] = check["passed_sampler"].sample
check["failed_examples"] = check["failed_sampler"].sample

del check["passed_sampler"]
del check["failed_sampler"]
Expand Down
8 changes: 4 additions & 4 deletions contracting_process/resource_level/examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from pelican.util.checks import ReservoirSampler
from pelican.util.services import commit, get_cursor

examples_cap = 20
sample_size = 20


def create(dataset_id):
Expand All @@ -17,9 +17,9 @@ def create(dataset_id):

check_samplers = {
check_name: {
"passed": ReservoirSampler(examples_cap),
"failed": ReservoirSampler(examples_cap),
"undefined": ReservoirSampler(examples_cap),
"passed": ReservoirSampler(sample_size),
"failed": ReservoirSampler(sample_size),
"undefined": ReservoirSampler(sample_size),
}
for check_name in definitions
}
Expand Down
10 changes: 5 additions & 5 deletions dataset/consistent/related_process_title.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from pelican.util.getter import get_values

version = 1.0
examples_cap = 100
sample_size = 100


def add_item(scope, item, item_id):
Expand Down Expand Up @@ -125,20 +125,20 @@ def pick_examples(scope, related_process_key, result):
}

if result:
if scope["meta"]["total_passed"] < examples_cap:
if scope["meta"]["total_passed"] < sample_size:
scope["meta"]["passed_examples"].append(example)
else:
r = random.randint(0, scope["meta"]["total_passed"])
if r < examples_cap:
if r < sample_size:
scope["meta"]["passed_examples"][r] = example

scope["meta"]["total_passed"] += 1
else:
if scope["meta"]["total_failed"] < examples_cap:
if scope["meta"]["total_failed"] < sample_size:
scope["meta"]["failed_examples"].append(example)
else:
r = random.randint(0, scope["meta"]["total_failed"])
if r < examples_cap:
if r < sample_size:
scope["meta"]["failed_examples"][r] = example

scope["meta"]["total_failed"] += 1
Expand Down
6 changes: 3 additions & 3 deletions dataset/distribution/buyer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

version = 1.0
min_resources_num = 1000
examples_cap = 20
sample_size = 20


def add_item(scope, item, item_id):
Expand Down Expand Up @@ -54,7 +54,7 @@ def get_result(scope):
"100+": {"total_ocid_count": 0, "total_buyer_count": 0},
}

buyer_with_one_ocid_sampler = ReservoirSampler(examples_cap)
buyer_with_one_ocid_sampler = ReservoirSampler(sample_size)

# filling in the histogram
for value in scope["buyers"].values():
Expand Down Expand Up @@ -87,7 +87,7 @@ def get_result(scope):
"counts": ocid_histogram,
"total_ocid_count": scope["total_ocid_count"],
"total_buyer_count": len(scope["buyers"]),
"examples": buyer_with_one_ocid_sampler.retrieve_samples(),
"examples": buyer_with_one_ocid_sampler.sample,
}
else:
result["meta"] = {"reason": "no data items were processed"}
Expand Down
6 changes: 3 additions & 3 deletions dataset/distribution/buyer_repetition.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

version = 1.0
min_resources_num = 1000
examples_cap = 20
sample_size = 20


def add_item(scope, item, item_id):
Expand Down Expand Up @@ -33,7 +33,7 @@ def add_item(scope, item, item_id):
key,
{
"total_ocid_count": 0,
"sampler": ReservoirSampler(examples_cap),
"sampler": ReservoirSampler(sample_size),
},
)
scope["buyers"][key]["total_ocid_count"] += 1
Expand Down Expand Up @@ -65,7 +65,7 @@ def get_result(scope):
"total_ocid_count": scope["total_ocid_count"],
"ocid_count": biggest_buyer["total_ocid_count"],
"ocid_share": biggest_buyer["total_ocid_count"] / scope["total_ocid_count"],
"examples": biggest_buyer["sampler"].retrieve_samples(),
"examples": biggest_buyer["sampler"].sample,
"specifics": {"buyer.identifier.id": biggest_buyer_id, "buyer.identifier.scheme": biggest_buyer_scheme},
}

Expand Down
2 changes: 1 addition & 1 deletion dataset/distribution/code_distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def get_result(self, scope):
passed = True
for key, value in scope.items():
value["share"] = value["count"] / total_count
value["examples"] = value["sampler"].retrieve_samples()
value["examples"] = value["sampler"].sample
del value["sampler"]

if key in self.test_values:
Expand Down
6 changes: 3 additions & 3 deletions dataset/distribution/value_repetition.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from pelican.util.getter import get_values

version = 1.0
examples_cap = 10
sample_size = 10
most_frequent_cap = 5
most_frequent_computation = 3

Expand Down Expand Up @@ -51,11 +51,11 @@ def add_item(scope, item, item_id, path):
}

# reservoir sampling
if scope[key]["count"] < examples_cap:
if scope[key]["count"] < sample_size:
scope[key]["examples"].append({"item_id": item_id, "ocid": ocid})
else:
r = random.randint(0, scope[key]["count"])
if r < examples_cap:
if r < sample_size:
scope[key]["examples"][r] = {"item_id": item_id, "ocid": ocid}

scope[key]["count"] += 1
Expand Down
10 changes: 5 additions & 5 deletions dataset/reference/related_process_identifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from pelican.util.getter import get_values

version = 2.0
examples_cap = 100
sample_size = 100


def add_item(scope, item, item_id):
Expand Down Expand Up @@ -95,20 +95,20 @@ def pick_examples(scope, related_process_key, result):
}

if result:
if scope["meta"]["total_passed"] < examples_cap:
if scope["meta"]["total_passed"] < sample_size:
scope["meta"]["passed_examples"].append(example)
else:
r = random.randint(0, scope["meta"]["total_passed"])
if r < examples_cap:
if r < sample_size:
scope["meta"]["passed_examples"][r] = example

scope["meta"]["total_passed"] += 1
else:
if scope["meta"]["total_failed"] < examples_cap:
if scope["meta"]["total_failed"] < sample_size:
scope["meta"]["failed_examples"].append(example)
else:
r = random.randint(0, scope["meta"]["total_failed"])
if r < examples_cap:
if r < sample_size:
scope["meta"]["failed_examples"][r] = example

scope["meta"]["total_failed"] += 1
Expand Down
9 changes: 5 additions & 4 deletions dataset/unique/tender_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from pelican.util.getter import get_values

version = 2.0
sample_size = 100


def add_item(scope, item, item_id):
Expand Down Expand Up @@ -45,8 +46,8 @@ def get_result(scope):
passed_releases_count = sum(len(v) for v in scope["tender_id_mapping"].values() if len(v) == 1)
result["result"] = relevant_releases_count == passed_releases_count

passed_examples_sampler = ReservoirSampler(100)
failed_examples_sampler = ReservoirSampler(100)
passed_examples_sampler = ReservoirSampler(sample_size)
failed_examples_sampler = ReservoirSampler(sample_size)
for tender_id, items in scope["tender_id_mapping"].items():
main_item = random.choice(items)
sample = {
Expand All @@ -60,8 +61,8 @@ def get_result(scope):
else:
failed_examples_sampler.process(sample)

result["meta"]["passed_examples"] = passed_examples_sampler.retrieve_samples()
result["meta"]["failed_examples"] = failed_examples_sampler.retrieve_samples()
result["meta"]["passed_examples"] = passed_examples_sampler.sample
result["meta"]["failed_examples"] = failed_examples_sampler.sample

result["value"] = 100 * passed_releases_count / relevant_releases_count
result["meta"]["total_processed"] = relevant_releases_count
Expand Down
9 changes: 3 additions & 6 deletions pelican/util/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,19 +252,16 @@ def __init__(self, limit: int):
raise ValueError("limit must be a positive integer")

self._limit = limit
self._samples: list[Any] = []
self._index = 0
self.sample = []

# https://en.wikipedia.org/wiki/Reservoir_sampling
def process(self, value: Any) -> None:
if self._index < self._limit:
self._samples.append(value)
self.sample.append(value)
else:
r = random.randint(0, self._index)
if r < self._limit:
self._samples[r] = value
self.sample[r] = value

self._index += 1

def retrieve_samples(self) -> list[Any]:
return self._samples
6 changes: 3 additions & 3 deletions tests/dataset/distribution/test_buyer.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def test_failed():
assert result["meta"]["total_buyer_count"] == buyer.min_resources_num
assert result["meta"]["counts"]["1"]["total_ocid_count"] == buyer.min_resources_num
assert result["meta"]["counts"]["1"]["total_buyer_count"] == buyer.min_resources_num
assert len(result["meta"]["examples"]) == buyer.examples_cap
assert len(result["meta"]["examples"]) == buyer.sample_size

scope = {}

Expand All @@ -105,7 +105,7 @@ def test_failed():
assert result["meta"]["counts"]["1"]["total_buyer_count"] == buyer.min_resources_num
assert result["meta"]["counts"]["100+"]["total_ocid_count"] == buyer.min_resources_num
assert result["meta"]["counts"]["100+"]["total_buyer_count"] == 1
assert len(result["meta"]["examples"]) == buyer.examples_cap
assert len(result["meta"]["examples"]) == buyer.sample_size


items_test_passed_multiple = []
Expand Down Expand Up @@ -159,4 +159,4 @@ def test_passed_multiple():
assert result["meta"]["counts"]["21_50"]["total_ocid_count"] == 100 * 21
assert result["meta"]["counts"]["51_100"]["total_ocid_count"] == 100 * 51
assert result["meta"]["counts"]["100+"]["total_ocid_count"] == 100 * 101
assert len(result["meta"]["examples"]) == buyer.examples_cap
assert len(result["meta"]["examples"]) == buyer.sample_size
15 changes: 6 additions & 9 deletions tests/tools/test_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,17 @@ def test_reservoir_sampler():
sampler = ReservoirSampler(5)
for i in range(3):
sampler.process(i)
samples = sampler.retrieve_samples()
assert len(samples) == 3
assert all(i in samples for i in range(3))
assert len(sampler.sample) == 3
assert all(i in sampler.sample for i in range(3))

sampler = ReservoirSampler(5)
for i in range(5):
sampler.process(i)
samples = sampler.retrieve_samples()
assert len(samples) == 5
assert all(i in samples for i in range(5))
assert len(sampler.sample) == 5
assert all(i in sampler.sample for i in range(5))

sampler = ReservoirSampler(5)
for i in range(10):
sampler.process(i)
samples = sampler.retrieve_samples()
assert len(samples) == 5
assert all(example in range(10) for example in samples)
assert len(sampler.sample) == 5
assert all(i in range(10) for i in sampler.sample)

0 comments on commit bd2907c

Please sign in to comment.