added possibility to approximate NDs
FabianGroeger96 committed Sep 12, 2024
2 parents 15bcdf2 + 4f55e4e commit efd7476
Showing 15 changed files with 292 additions and 134 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -60,6 +60,8 @@ These examples analyze different benchmark datasets such as:
- <a href="https://github.com/fastai/imagenette">Imagenette</a> 🖼️ (Open in <a href="https://nbviewer.org/github/Digital-Dermatology/SelfClean/blob/main/examples/Investigate_Imagenette.ipynb">NBViewer</a> | <a href="https://github.com/Digital-Dermatology/SelfClean/blob/main/examples/Investigate_Imagenette.ipynb">GitHub</a> | <a href="https://colab.research.google.com/github/Digital-Dermatology/SelfClean/blob/main/examples/Investigate_Imagenette.ipynb">Colab</a>)
- <a href="https://www.robots.ox.ac.uk/~vgg/data/pets/">Oxford-IIIT Pet</a> 🐶 (Open in <a href="https://nbviewer.org/github/Digital-Dermatology/SelfClean/blob/main/examples/Investigate_OxfordIIITPet.ipynb">NBViewer</a> | <a href="https://github.com/Digital-Dermatology/SelfClean/blob/main/examples/Investigate_OxfordIIITPet.ipynb">GitHub</a> | <a href="https://colab.research.google.com/github/Digital-Dermatology/SelfClean/blob/main/examples/Investigate_OxfordIIITPet.ipynb">Colab</a>)

Also, check out our <a href="https://www.kaggle.com/code/fabiangrger/removing-the-psychic-from-the-dataset">Kaggle notebook</a> to see an illustration of how to get a gold medal for cleaning a competition dataset.

## Development Environment
Run `make` for a list of possible targets.

133 changes: 67 additions & 66 deletions examples/Investigate_Imagenette.ipynb

Large diffs are not rendered by default.

124 changes: 66 additions & 58 deletions examples/Investigate_OxfordIIITPet.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions requirements.txt
@@ -23,3 +23,4 @@ jupyter
loguru
faiss-cpu
faiss-gpu
memory-profiler
2 changes: 1 addition & 1 deletion setup.py
@@ -31,7 +31,7 @@ def parse_requirements(filename):
name=PACKAGE_NAME,
packages=proj_packages,
package_dir={PACKAGE_NAME: SOURCE_DIRECTORY},
version="0.0.24",
version="0.0.26",
author="Fabian Groeger",
author_email="fabian.groeger@unibas.ch",
description="A holistic self-supervised data cleaning strategy to detect irrelevant samples, near duplicates and label errors.",
5 changes: 5 additions & 0 deletions src/cleaner/auto_cleaning_mixin.py
@@ -22,6 +22,7 @@ def __init__(
irrelevant_cut_off: float = 0.01,
near_duplicate_cut_off: float = 0.01,
label_error_cut_off: float = 0.01,
significance_level: float = 0.05,
cleaner_kwargs: dict = {},
**kwargs,
):
@@ -30,6 +31,7 @@ def __init__(
self.irrelevant_cut_off = irrelevant_cut_off
self.near_duplicate_cut_off = near_duplicate_cut_off
self.label_error_cut_off = label_error_cut_off
self.significance_level = significance_level
self.cleaner_kwargs = cleaner_kwargs

def perform_auto_cleaning(
@@ -39,6 +41,9 @@ def perform_auto_cleaning(
output_path: Optional[Union[str, Path]] = None,
):
if self.auto_cleaning:
# make sure the significance level is correctly set
self.cleaner_kwargs["q"] = self.significance_level

# Near Duplicates
near_duplicate_issues = issue_manger["near_duplicates"]
if near_duplicate_issues is not None:
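As a rough usage sketch, the new `significance_level` knob can be set alongside the existing cut-offs when constructing the cleaner; the values and keyword names below mirror the updated unit test further down, while `auto_cleaning=True` is an assumed constructor flag that enables `perform_auto_cleaning`:

```python
import numpy as np

from src.cleaner.selfclean_cleaner import SelfCleanCleaner

# Sketch (assumptions noted in comments): a stricter significance level for the
# auto-cleaning step; 0.05 is the new default introduced in this commit.
cleaner = SelfCleanCleaner(
    memmap=False,
    auto_cleaning=True,          # assumed flag gating perform_auto_cleaning
    irrelevant_cut_off=0.01,
    near_duplicate_cut_off=0.01,
    label_error_cut_off=0.01,
    significance_level=0.01,     # forwarded to cleaner_kwargs["q"] above
)
cleaner.fit(emb_space=np.random.rand(50, 198), labels=np.random.randint(5, size=50))
out_dict = cleaner.predict()
```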
3 changes: 2 additions & 1 deletion src/cleaner/near_duplicates/embedding_distance_mixin.py
@@ -45,7 +45,7 @@ def get_near_duplicate_ranking(self) -> Tuple[np.ndarray, np.ndarray]:
vec_index_mapping = np.vectorize(condensed_to_square)
# here the chunk size is x**2 since we have quadratically more
chunk_size = self.chunk_size**2
# chunk the sorted values for memory optimization
# chunk the sorted values for memory efficiency
n_chunks = math.ceil(self.condensed_size / chunk_size)
if self.memmap:
indices_file = self.memmap_path / "near_duplicate_indices.dat"
@@ -63,6 +63,7 @@ def get_near_duplicate_ranking(self) -> Tuple[np.ndarray, np.ndarray]:
shape=(self.condensed_size, 2),
dtype=np.int32,
)
# this creates the corresponding indices of the sorted array
for i in tqdm(
range(n_chunks),
desc="Processing possible near duplicates",
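For readers unfamiliar with the condensed layout these chunks walk over, a small self-contained illustration follows; it assumes the condensed ordering matches `np.triu_indices(N, k=1)`, which is an assumption about `condensed_to_square` rather than a statement of its implementation:

```python
import numpy as np

# Toy example: a symmetric distance matrix, its condensed upper-triangular
# vector, and the (row, col) pair a condensed index maps back to.
N = 5
rows, cols = np.triu_indices(N, k=1)
dist = np.random.rand(N, N).astype(np.float32)
dist = (dist + dist.T) / 2          # make the toy matrix symmetric

condensed = dist[rows, cols]        # length N * (N - 1) / 2
c = int(np.argsort(condensed)[0])   # condensed index of the closest pair
assert condensed[c] == dist[rows[c], cols[c]]
print("closest pair:", rows[c], cols[c])
```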
22 changes: 20 additions & 2 deletions src/cleaner/selfclean.py
@@ -3,7 +3,7 @@
import platform
from enum import Enum
from pathlib import Path
from typing import Optional, Union
from typing import List, Optional, Union

import numpy as np
import torch
@@ -12,6 +12,7 @@
from torchvision.datasets import ImageFolder
from torchvision.transforms import InterpolationMode

from ..cleaner.issue_manager import IssueTypes
from ..cleaner.selfclean_cleaner import SelfCleanCleaner
from ..ssl_library.src.augmentations.multi_crop import MultiCropAugmentation
from ..ssl_library.src.pkg import Embedder, embed_dataset
@@ -134,6 +135,11 @@ def run_on_image_folder(
num_workers: Optional[int] = os.cpu_count(),
pretraining_type: PretrainingType = PretrainingType.DINO,
hyperparameters: dict = DINO_STANDARD_HYPERPARAMETERS,
issues_to_detect: List[IssueTypes] = [
IssueTypes.NEAR_DUPLICATES,
IssueTypes.IRRELEVANTS,
IssueTypes.LABEL_ERRORS,
],
# embedding
n_layers: int = 1,
apply_l2_norm: bool = True,
@@ -156,6 +162,7 @@ def run_on_image_folder(
num_workers=num_workers,
pretraining_type=pretraining_type,
hyperparameters=hyperparameters,
issues_to_detect=issues_to_detect,
n_layers=n_layers,
apply_l2_norm=apply_l2_norm,
additional_run_info=(
@@ -176,6 +183,11 @@ def run_on_dataset(
num_workers: Optional[int] = os.cpu_count(),
pretraining_type: PretrainingType = PretrainingType.DINO,
hyperparameters: dict = DINO_STANDARD_HYPERPARAMETERS,
issues_to_detect: List[IssueTypes] = [
IssueTypes.NEAR_DUPLICATES,
IssueTypes.IRRELEVANTS,
IssueTypes.LABEL_ERRORS,
],
# embedding
n_layers: int = 1,
apply_l2_norm: bool = True,
@@ -194,6 +206,7 @@ def run_on_dataset(
num_workers=num_workers,
pretraining_type=pretraining_type,
hyperparameters=hyperparameters,
issues_to_detect=issues_to_detect,
n_layers=n_layers,
apply_l2_norm=apply_l2_norm,
additional_run_info=(
@@ -214,6 +227,11 @@ def _run(
num_workers: Optional[int] = os.cpu_count(),
pretraining_type: PretrainingType = PretrainingType.DINO,
hyperparameters: dict = DINO_STANDARD_HYPERPARAMETERS,
issues_to_detect: List[IssueTypes] = [
IssueTypes.NEAR_DUPLICATES,
IssueTypes.IRRELEVANTS,
IssueTypes.LABEL_ERRORS,
],
# embedding
n_layers: int = 1,
apply_l2_norm: bool = True,
@@ -275,7 +293,7 @@ def _run(
dataset=dataset,
class_labels=dataset.classes if hasattr(dataset, "classes") else None,
)
return self.cleaner.predict()
return self.cleaner.predict(issues_to_detect=issues_to_detect)

def train_dino(
self,
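A hedged usage sketch of the new `issues_to_detect` argument follows; the import paths and keyword arguments mirror the integration test further down, and the input path is a placeholder:

```python
from src.cleaner.issue_manager import IssueTypes
from src.cleaner.selfclean import PretrainingType, SelfClean

selfclean = SelfClean()
# Restrict the run to near-duplicate detection instead of all three issue types.
issues = selfclean.run_on_image_folder(
    input_path="path/to/image_folder",             # placeholder path
    pretraining_type=PretrainingType.DINO,
    epochs=1,                                      # kept tiny for the sketch
    num_workers=4,
    issues_to_detect=[IssueTypes.NEAR_DUPLICATES],
)
near_duplicates = issues.get_issues("near_duplicates")
print(near_duplicates["indices"][:10])
```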
19 changes: 15 additions & 4 deletions src/cleaner/selfclean_cleaner.py
@@ -23,6 +23,7 @@
from ..ssl_library.src.utils.logging import set_log_level
from ..ssl_library.src.utils.utils import fix_random_seeds
from ..utils.plotting import plot_inspection_result
from ..utils.utils import triu_indices_memmap


class SelfCleanCleaner(
@@ -37,7 +38,7 @@ def __init__(
# distance calculation
distance_function_path: str = "sklearn.metrics.pairwise.",
distance_function_name: str = "cosine_similarity",
chunk_size: int = 100,
chunk_size: int = 10_000,
precision_type_distance: type = np.float32,
# memory management
memmap: bool = True,
@@ -151,15 +152,25 @@ def fit(
mode="w+",
shape=(self.condensed_size,),
)
triu_indices = triu_indices_memmap(
str(self.memmap_path / "triu_indices"),
N=self.N,
k=1,
)
else:
self.p_distances = np.zeros(
shape=(self.condensed_size,),
dtype=self.precision_type_distance,
)
self.p_distances[:] = self.distance_matrix[
~np.tril(np.ones((self.N, self.N), dtype=bool))
]
triu_indices = np.triu_indices(self.N, k=1)
# create the upper triangular matrix of the distance matrix
for start_idx in range(0, len(triu_indices[0]), self.chunk_size):
end_idx = min(start_idx + self.chunk_size, len(triu_indices[0]))
self.p_distances[start_idx:end_idx] = self.distance_matrix[
triu_indices[0][start_idx:end_idx], triu_indices[1][start_idx:end_idx]
]
self.is_fitted = True
del triu_indices
return self

def predict(
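To make the intent of the new loop concrete, here is a minimal stand-alone sketch of chunked upper-triangle extraction with toy sizes; it is not the cleaner's actual code path, only the same idea in isolation:

```python
import numpy as np

# Fill the condensed distance vector chunk by chunk instead of materialising an
# extra N x N boolean mask, as the previous ~np.tril(...) indexing did.
N, chunk_size = 500, 10_000
dist = np.random.rand(N, N).astype(np.float32)
rows, cols = np.triu_indices(N, k=1)

condensed = np.zeros(rows.shape[0], dtype=np.float32)
for start in range(0, rows.shape[0], chunk_size):
    end = min(start + chunk_size, rows.shape[0])
    condensed[start:end] = dist[rows[start:end], cols[start:end]]

# Same result as one-shot fancy indexing, just with bounded peak memory.
assert np.array_equal(condensed, dist[rows, cols])
```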
46 changes: 46 additions & 0 deletions src/utils/utils.py
@@ -1,5 +1,6 @@
import math
from functools import partial
from pathlib import Path

import numpy as np
import torch
@@ -41,6 +42,51 @@ def actual_indices(idx, n):
return ii, jj


def triu_indices_memmap(filename: str, N: int, k: int = 0):
"""
Generate the indices for the upper-triangular part of a matrix using memmap.
Parameters:
filename (str): The name of the file to use for memmap.
N (int): The size of the square matrix.
k (int): Diagonal offset. k=0 is the main diagonal, k>0 is above, and k<0 is below.
Returns:
tuple of ndarray: Indices for the upper-triangular part of the matrix.
"""
# Calculate the number of elements in the upper triangular part
num_elements = sum(max(0, N - k - i) for i in range(N))

# Create memmap arrays for row and column indices
rows_filename = Path(filename + "_rows.dat")
cols_filename = Path(filename + "_cols.dat")
if rows_filename.exists():
rows_filename.unlink()
if cols_filename.exists():
cols_filename.unlink()
rows_memmap = np.memmap(
str(rows_filename),
dtype="int64",
mode="w+",
shape=(num_elements,),
)
cols_memmap = np.memmap(
str(cols_filename),
dtype="int64",
mode="w+",
shape=(num_elements,),
)

idx = 0
for i in range(N):
for j in range(i + k, N):
rows_memmap[idx] = i
cols_memmap[idx] = j
idx += 1

return rows_memmap, cols_memmap


def has_same_label(arr) -> np.ndarray:
arr = np.array(arr)
result = arr[:, None] == arr
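A short sketch of how the new helper can stand in for `np.triu_indices` when even the index arrays are too large to hold in RAM; the output prefix is a placeholder, and the equivalence check mirrors the new unit test below:

```python
import numpy as np

from src.utils.utils import triu_indices_memmap

N = 1_000
# Two disk-backed int64 arrays of length N * (N - 1) / 2 are written next to the
# given prefix; "/tmp/triu_indices" is only a placeholder path.
rows, cols = triu_indices_memmap("/tmp/triu_indices", N=N, k=1)

ref_rows, ref_cols = np.triu_indices(N, k=1)
assert np.array_equal(rows, ref_rows) and np.array_equal(cols, ref_cols)
```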
22 changes: 20 additions & 2 deletions tests/integration_tests/test_selfclean_IT.py
@@ -7,6 +7,7 @@

from torchvision.datasets import FakeData

from src.cleaner.issue_manager import IssueTypes
from src.cleaner.selfclean import PretrainingType, SelfClean
from tests.testutils.paths import testfiles_path

@@ -45,6 +46,19 @@ def test_run_with_files_dino_with_output_path(self):
)
self._check_output(out_dict)

def test_run_with_files_dino_single_issue_type(self):
temp_work_dir = tempfile.TemporaryDirectory()
selfclean = SelfClean()
out_dict = selfclean.run_on_image_folder(
input_path=testfiles_path,
pretraining_type=PretrainingType.DINO,
work_dir=temp_work_dir.name,
epochs=1,
num_workers=4,
issues_to_detect=[IssueTypes.IRRELEVANTS],
)
self._check_output(out_dict, issue_types=["irrelevants"])

def test_run_with_files_dino_wo_pretraining(self):
selfclean = SelfClean()
out_dict = selfclean.run_on_image_folder(
@@ -108,8 +122,12 @@ def test_run_with_plotting(self):
)
self._check_output(out_dict)

def _check_output(self, out_dict):
for issue_type in ["irrelevants", "near_duplicates", "label_errors"]:
def _check_output(
self,
out_dict,
issue_types=["irrelevants", "near_duplicates", "label_errors"],
):
for issue_type in issue_types:
v = out_dict.get_issues(issue_type)
self.assertIsNotNone(v)
self.assertTrue("indices" in v)
1 change: 1 addition & 0 deletions tests/unittests/cleaner/test_auto_cleaning.py
@@ -73,6 +73,7 @@ def test_predict_auto_cleaning_diff_cut_off(self):
irrelevant_cut_off=0.01,
near_duplicate_cut_off=0.01,
label_error_cut_off=0.01,
significance_level=0.01,
)
cleaner.fit(emb_space=self.emb_space, labels=self.labels)
out_dict = cleaner.predict()
10 changes: 10 additions & 0 deletions tests/unittests/cleaner/test_selfclean_cleaner.py
@@ -1,6 +1,7 @@
import unittest

import numpy as np
from memory_profiler import profile

from src.cleaner.base_cleaner import BaseCleaner
from src.cleaner.issue_manager import IssueTypes
@@ -12,10 +13,13 @@ def setUp(self):
self.emb_space = np.random.rand(50, 198)
self.labels = np.random.randint(5, size=50)
self.class_labels = [f"test_{x}" for x in np.unique(self.labels)]
self.memory_profiling = False

def test_fit(self):
cleaner = SelfCleanCleaner(memmap=False)
self.assertEqual(cleaner.is_fitted, False)
if self.memory_profiling:
cleaner.fit = profile(cleaner.fit, precision=4)
cleaner.fit(emb_space=self.emb_space, labels=self.labels)
self.assertEqual(cleaner.is_fitted, True)
self.assertIsInstance(cleaner, BaseCleaner)
@@ -24,13 +28,19 @@ def test_fit(self):

def test_fit_with_memmaps(self):
cleaner = SelfCleanCleaner(memmap=True)
if self.memory_profiling:
cleaner.fit = profile(cleaner.fit, precision=4)
cleaner.fit(emb_space=self.emb_space, labels=self.labels)
self.assertIsNotNone(cleaner.distance_matrix)
self.assertIsNotNone(cleaner.p_distances)

def test_predict(self):
cleaner = SelfCleanCleaner(memmap=False)
if self.memory_profiling:
cleaner.fit = profile(cleaner.fit, precision=4)
cleaner.fit(emb_space=self.emb_space, labels=self.labels)
if self.memory_profiling:
cleaner.predict = profile(cleaner.predict, precision=4)
out_dict = cleaner.predict()
for issue_type in ["irrelevants", "near_duplicates", "label_errors"]:
v = out_dict.get_issues(issue_type)
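For completeness, a sketch of what flipping `self.memory_profiling` to `True` enables, using the `memory_profiler` package added to the requirements above; the fixture sizes match the test's `setUp`:

```python
import numpy as np
from memory_profiler import profile

from src.cleaner.selfclean_cleaner import SelfCleanCleaner

cleaner = SelfCleanCleaner(memmap=True)
# Wrapping the bound methods prints a line-by-line memory report when they run.
cleaner.fit = profile(cleaner.fit, precision=4)
cleaner.predict = profile(cleaner.predict, precision=4)

cleaner.fit(emb_space=np.random.rand(50, 198), labels=np.random.randint(5, size=50))
out_dict = cleaner.predict()
```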
Empty file.
36 changes: 36 additions & 0 deletions tests/unittests/utils/test_utils.py
@@ -0,0 +1,36 @@
import shutil
import tempfile
import unittest
from pathlib import Path

import numpy as np

from src.utils.utils import triu_indices_memmap


class TestUtils(unittest.TestCase):

@classmethod
def setUpClass(cls):
cls.temp_path = Path(tempfile.mkdtemp())

@classmethod
def tearDownClass(cls):
shutil.rmtree(cls.temp_path)

def test_triu_indices_memmap(self):
l_N = np.arange(1, 100, 10)
for N in l_N:
l_k = np.arange(0, N)
for k in l_k:
triu_indices = np.triu_indices(N, k=k)
triu_indices_mem = triu_indices_memmap(
str(TestUtils.temp_path / "triu_indices"),
N=N,
k=k,
)
self.assertTrue(np.array_equal(triu_indices_mem, triu_indices))


if __name__ == "__main__":
unittest.main()
