Skip to content

Commit

Permalink
added option for setting the significance level
Browse files: browse the repository at this point in the history
  • Loading branch information
FabianGroeger96 committed Aug 27, 2024
1 parent fd7c60e commit 4f55e4e
Show file tree
Hide file tree
Showing 6 changed files with 141 additions and 126 deletions.
133 changes: 67 additions & 66 deletions examples/Investigate_Imagenette.ipynb

Large diffs are not rendered by default.

124 changes: 66 additions & 58 deletions examples/Investigate_OxfordIIITPet.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def parse_requirements(filename):
name=PACKAGE_NAME,
packages=proj_packages,
package_dir={PACKAGE_NAME: SOURCE_DIRECTORY},
version="0.0.25",
version="0.0.26",
author="Fabian Groeger",
author_email="fabian.groeger@unibas.ch",
description="A holistic self-supervised data cleaning strategy to detect irrelevant samples, near duplicates and label errors.",
Expand Down
5 changes: 5 additions & 0 deletions src/cleaner/auto_cleaning_mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def __init__(
irrelevant_cut_off: float = 0.01,
near_duplicate_cut_off: float = 0.01,
label_error_cut_off: float = 0.01,
significance_level: float = 0.05,
cleaner_kwargs: dict = {},
**kwargs,
):
Expand All @@ -30,6 +31,7 @@ def __init__(
self.irrelevant_cut_off = irrelevant_cut_off
self.near_duplicate_cut_off = near_duplicate_cut_off
self.label_error_cut_off = label_error_cut_off
self.significance_level = significance_level
self.cleaner_kwargs = cleaner_kwargs

def perform_auto_cleaning(
Expand All @@ -39,6 +41,9 @@ def perform_auto_cleaning(
output_path: Optional[Union[str, Path]] = None,
):
if self.auto_cleaning:
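# store the configured significance level under the "q" key of cleaner_kwargs,
# overriding any "q" the caller put there
# NOTE(review): this mutates cleaner_kwargs in place; with the `cleaner_kwargs: dict = {}`
# default the same dict object may be shared across instances - confirm and consider copying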
self.cleaner_kwargs["q"] = self.significance_level

# Near Duplicates
near_duplicate_issues = issue_manger["near_duplicates"]
if near_duplicate_issues is not None:
Expand Down
1 change: 1 addition & 0 deletions tests/unittests/cleaner/test_auto_cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ def test_predict_auto_cleaning_diff_cut_off(self):
irrelevant_cut_off=0.01,
near_duplicate_cut_off=0.01,
label_error_cut_off=0.01,
significance_level=0.01,
)
cleaner.fit(emb_space=self.emb_space, labels=self.labels)
out_dict = cleaner.predict()
Expand Down
2 changes: 1 addition & 1 deletion tests/unittests/cleaner/test_selfclean_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def setUp(self):
self.emb_space = np.random.rand(50, 198)
self.labels = np.random.randint(5, size=50)
self.class_labels = [f"test_{x}" for x in np.unique(self.labels)]
self.memory_profiling = True
self.memory_profiling = False

def test_fit(self):
cleaner = SelfCleanCleaner(memmap=False)
Expand Down

0 comments on commit 4f55e4e

Please sign in to comment.