diff --git a/Makefile b/Makefile
index 18ca3a0..3b42617 100644
--- a/Makefile
+++ b/Makefile
@@ -101,6 +101,12 @@ clean: ##@Utils clean the project
 	@rm -f -R tmp/
 	@rm -f -R cov_html/
 
+_build_publish:
+	@python3 -m pip install --upgrade pip
+	@python3 -m pip install setuptools wheel twine
+	@python3 setup.py sdist bdist_wheel
+	@python3 -m twine upload --verbose dist/*
+
 ###########################
 # DOCKER
 ###########################
diff --git a/requirements.txt b/requirements.txt
index 0dc1e8f..031461e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,3 +21,5 @@ seaborn
 SciencePlots
 scikit-image
 codecov
+jupyter
+loguru
diff --git a/src/cleaner/auto_cleaning_mixin.py b/src/cleaner/auto_cleaning_mixin.py
index 68c8c2f..15ab402 100644
--- a/src/cleaner/auto_cleaning_mixin.py
+++ b/src/cleaner/auto_cleaning_mixin.py
@@ -3,6 +3,7 @@
 import numpy as np
 import scipy
 import scipy.stats
+from loguru import logger
 
 from ..utils.plotting import (
     plot_frac_cut,
@@ -86,7 +87,6 @@ def fraction_cut(
         plot_result: bool = False,
         ax=None,
         bins="sqrt",
-        debug: bool = False,
         path: Optional[str] = None,
     ):
         M = len(scores)
@@ -113,8 +113,7 @@
         # Exclude the scores below probability threshold
         exclude = logit_scores < cutoff
         n = exclude.sum()
-        if debug:
-            print(f"{n} outliers ({n/self.N:.1%})")
+        logger.debug(f"{n} outliers ({n/self.N:.1%})")
 
         if plot_result:
             if ax is not None:
@@ -155,7 +154,6 @@ def threshold_sensitivity(self, scores: np.ndarray, ax=None):
                     alpha=0.1,
                     q=q,
                     plot_result=False,
-                    debug=False,
                 ).shape[0],
             )
             for q in thresholds
@@ -187,7 +185,6 @@ def alpha_sensitivity(self, scores: np.ndarray, ax=None):
                     scores=scores,
                     alpha=a,
                     plot_result=False,
-                    debug=False,
                 ).shape[0],
             )
             for a in alphas
diff --git a/src/cleaner/near_duplicates/embedding_distance_mixin.py b/src/cleaner/near_duplicates/embedding_distance_mixin.py
index 1ea5423..56de421 100644
--- a/src/cleaner/near_duplicates/embedding_distance_mixin.py
+++ b/src/cleaner/near_duplicates/embedding_distance_mixin.py
@@ -2,7 +2,7 @@
 from typing import List, Tuple
 
 import numpy as np
-from tqdm import tqdm
+from tqdm.auto import tqdm
 
 from ...cleaner.near_duplicates.base_near_duplicate_mixin import BaseNearDuplicateMixin
 from ...ssl_library.src.utils.logging import plot_dist
diff --git a/src/cleaner/selfclean.py b/src/cleaner/selfclean.py
index 45b20ee..c081d24 100644
--- a/src/cleaner/selfclean.py
+++ b/src/cleaner/selfclean.py
@@ -14,6 +14,7 @@
 from ..ssl_library.src.augmentations.ibot import iBOTDataAugmentation
 from ..ssl_library.src.pkg import Embedder, embed_dataset
 from ..ssl_library.src.trainers.dino_trainer import DINOTrainer
+from ..ssl_library.src.utils.logging import set_log_level
 from ..ssl_library.src.utils.utils import cleanup, init_distributed_mode
 from ..utils.utils import set_dataset_transformation
 
@@ -26,7 +27,8 @@
     "warmup_epochs": 10,
     "momentum_teacher": 0.996,
     "clip_grad": 3.0,
-    "apply_l2_norm": True,
+    "apply_l2_norm": False,  # TODO: check influence of this
+    "save_every_n_epochs": 10,
     "model": {
         "out_dim": 4096,
         "emb_dim": 192,
@@ -35,11 +37,11 @@
         "use_bn_in_head": False,
         "norm_last_layer": True,
         "student": {
-            "drop_path_rate": 0.1,
+            "drop_path_rate": 0.1,  # TODO: check influence of this
             "pretrained": True,
         },
         "teacher": {
-            "drop_path_rate": 0.1,
+            "drop_path_rate": 0.1,  # TODO: check influence of this
             "pretrained": True,
         },
         "eval": {"n_last_blocks": 4, "avgpool_patchtokens": False},
@@ -84,13 +86,14 @@ def __init__(
         plot_top_N: Optional[int] = None,
         output_path: Optional[str] = None,
         figsize: tuple = (10, 8),
+        # logging
+        log_level: str = "INFO",
         **kwargs,
     ):
+        set_log_level(min_log_level=log_level)
         self.memmap = memmap
         self.memmap_path = memmap_path
 
-        self.model = None
-
         self.cleaner = SelfCleanCleaner(
             distance_function_path=distance_function_path,
             distance_function_name=distance_function_name,
@@ -102,9 +105,9 @@
             plot_top_N=plot_top_N,
             output_path=output_path,
             figsize=figsize,
+            log_level=log_level,
             **kwargs,
         )
-
         self.base_transform = transforms.Compose(
             [
                 transforms.Resize(256, interpolation=InterpolationMode.BICUBIC),
@@ -118,7 +121,7 @@ def run_on_image_folder(
         self,
         input_path: Union[str, Path],
         epochs: int = 100,
-        batch_size: int = 32,
+        batch_size: int = 64,
         ssl_pre_training: bool = True,
         work_dir: Optional[str] = None,
         num_workers: int = 24,
@@ -158,7 +161,7 @@ def run_on_dataset(
         self,
         dataset,
         epochs: int = 100,
-        batch_size: int = 32,
+        batch_size: int = 64,
         ssl_pre_training: bool = True,
         work_dir: Optional[str] = None,
         num_workers: int = 24,
@@ -194,7 +197,7 @@ def _run(
         self,
         dataset,
         epochs: int = 100,
-        batch_size: int = 32,
+        batch_size: int = 64,
         ssl_pre_training: bool = True,
         work_dir: Optional[str] = None,
         num_workers: int = 24,
@@ -244,6 +247,7 @@ def _run(
             normalize=apply_l2_norm,
             memmap=self.memmap,
             memmap_path=self.memmap_path,
+            tqdm_desc="Creating dataset representation",
         )
         # for default datasets we can set the paths manually
         if hasattr(dataset, "_image_files") and paths is None:
@@ -262,7 +266,7 @@ def train_dino(
         self,
         dataset: Dataset,
         epochs: int = 100,
-        batch_size: int = 32,
+        batch_size: int = 64,
         ssl_pre_training: bool = True,
         work_dir: Optional[str] = None,
         hyperparameters: dict = DINO_STANDARD_HYPERPARAMETERS,
diff --git a/src/cleaner/selfclean_cleaner.py b/src/cleaner/selfclean_cleaner.py
index 977dff9..3e30c30 100644
--- a/src/cleaner/selfclean_cleaner.py
+++ b/src/cleaner/selfclean_cleaner.py
@@ -6,7 +6,7 @@
 import numpy as np
 import scienceplots  # noqa: F401
 import sklearn  # noqa: F401
-from tqdm import tqdm
+from tqdm.auto import tqdm
 
 from ..cleaner.auto_cleaning_mixin import AutoCleaningMixin
 from ..cleaner.base_cleaner import BaseCleaner
@@ -17,6 +17,7 @@
 from ..cleaner.near_duplicates.embedding_distance_mixin import EmbeddingDistanceMixin
 from ..distances import *  # noqa: F401, F403
 from ..distances.projective_distance import *  # noqa: F401, F403
+from ..ssl_library.src.utils.logging import set_log_level
 from ..utils.plotting import plot_inspection_result
 
 
@@ -42,8 +43,12 @@ def __init__(
         plot_top_N: Optional[int] = None,
         output_path: Optional[str] = None,
         figsize: tuple = (10, 8),
+        # logging
+        log_level: str = "INFO",
         **kwargs,
     ):
+        set_log_level(min_log_level=log_level)
+
         self.memmap = memmap
         self.chunk_size = chunk_size
         self.precision_type_distance = precision_type_distance
diff --git a/src/scoring/lad_scoring.py b/src/scoring/lad_scoring.py
index c81afe1..1734656 100644
--- a/src/scoring/lad_scoring.py
+++ b/src/scoring/lad_scoring.py
@@ -3,9 +3,10 @@
 from typing import List, Optional, Tuple
 
 import matplotlib.pyplot as plt
+from loguru import logger
 from matplotlib.patches import Rectangle
 from scipy.cluster import hierarchy
-from tqdm import tqdm
+from tqdm.auto import tqdm
 
 
 class LAD:
@@ -29,7 +30,6 @@ def calc_scores(
         linkage_matrix: list,
         global_leaves: bool = False,
         save_fig_path: Optional[str] = None,
-        debug: bool = False,
     ) -> List[Tuple[float, int]]:
         if self.plot_scores:
             plt.figure(figsize=(5, 5))
@@ -140,14 +140,13 @@
                     linestyle="dotted",
                 )
 
-            if debug:
-                print(
-                    f"ID: {node.id}, #leaves: {n_leaves}, scores: {len(scores)}, "
-                    f"dist: {round(node.dist, 2)}, square: {round(square, 2)}, "
-                    f"start: {round(start, 2)}, end: {round(end, 2)}, "
-                    f"p_left: {round(p_left, 2)}, p_right: {round(p_right, 2)}, "
-                    f"w_left: {round(w_left, 2)}, w_right: {round(w_right, 2)}"
-                )
+            logger.debug(
+                f"ID: {node.id}, #leaves: {n_leaves}, scores: {len(scores)}, "
+                f"dist: {round(node.dist, 2)}, square: {round(square, 2)}, "
+                f"start: {round(start, 2)}, end: {round(end, 2)}, "
+                f"p_left: {round(p_left, 2)}, p_right: {round(p_right, 2)}, "
+                f"w_left: {round(w_left, 2)}, w_right: {round(w_right, 2)}"
+            )
 
             node_right = NodeElement(
                 node=node.right,
@@ -167,12 +166,11 @@
                 )
                 queue.insert(0, node_left)
             else:
-                if debug:
-                    print(
-                        f"Leaf ({node.id}), "
-                        f"score: {round(sum(scores), 2)}, square: {round(square, 2)}, "
-                        f"start: {round(start, 2)}, end: {round(end, 2)}"
-                    )
+                logger.debug(
+                    f"Leaf ({node.id}), "
+                    f"score: {round(sum(scores), 2)}, square: {round(square, 2)}, "
+                    f"start: {round(start, 2)}, end: {round(end, 2)}"
+                )
 
                 if self.plot_scores:
                     plt.gca().text(
diff --git a/src/ssl_library b/src/ssl_library
index 923a794..bf1efcf 160000
--- a/src/ssl_library
+++ b/src/ssl_library
@@ -1 +1 @@
-Subproject commit 923a794b267c47475518fcf28b5358b778ce71ad
+Subproject commit bf1efcfafecb794ff51d6e234b5df2389c1118f6
diff --git a/src/utils/utils.py b/src/utils/utils.py
index e81c9a8..fa1787a 100644
--- a/src/utils/utils.py
+++ b/src/utils/utils.py
@@ -2,6 +2,7 @@
 
 import numpy as np
 import torch
+from loguru import logger
 from torch.utils.data import ConcatDataset, Dataset
 
 
@@ -35,7 +36,7 @@ def actual_indices(idx, n):
     shifts = np.concatenate([[0], n_row_elems])
     jj = np.arange(1, n)[ii] + idx - shifts[ii]
     if np.sum(ii < 0) > 0 or np.sum(jj < 0) > 0:
-        print("Negative indices")
+        logger.error("Negative indices")
     return ii, jj
 
 
diff --git a/tests/unittests/cleaner/test_auto_cleaning.py b/tests/unittests/cleaner/test_auto_cleaning.py
index 875f571..f47640b 100644
--- a/tests/unittests/cleaner/test_auto_cleaning.py
+++ b/tests/unittests/cleaner/test_auto_cleaning.py
@@ -56,7 +56,7 @@ def test_predict_auto_cleaning_with_plotting(self):
         cleaner = SelfCleanCleaner(
             memmap=False,
             auto_cleaning=True,
-            cleaner_kwargs={"debug": True, "plot_result": True},
+            cleaner_kwargs={"plot_result": True},
         )
         cleaner.fit(emb_space=self.emb_space, labels=self.labels)
         out_dict = cleaner.predict()
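
Note on the logging change: the diff replaces per-function `debug: bool` flags and ad-hoc `print` calls with loguru, and verbosity is now set once via `set_log_level(min_log_level=...)` from `ssl_library`. The helper's body is not part of this diff, so the sketch below is an assumption about the pattern it likely follows (re-installing loguru's sink with a level filter); only the call signature comes from the diff itself.

```python
# Minimal sketch, assuming set_log_level simply re-installs loguru's
# default sink with a minimum level. The real helper lives in
# ssl_library/src/utils/logging.py and may differ.
import sys

from loguru import logger


def set_log_level(min_log_level: str = "INFO") -> None:
    logger.remove()  # drop the currently registered handler(s)
    logger.add(sys.stderr, level=min_log_level)  # re-add with a level floor


set_log_level("INFO")
logger.debug("suppressed at INFO")  # replaces the old `if debug: print(...)`
set_log_level("DEBUG")
logger.debug("visible at DEBUG")
```

With this pattern, a single `log_level="DEBUG"` argument to `SelfClean` or `SelfCleanCleaner` enables all the diagnostic output that previously required threading `debug=True` through every call. The switch from `tqdm` to `tqdm.auto` is complementary: it selects the widget-based progress bar when running under Jupyter (hence the new `jupyter` requirement) and falls back to the plain console bar otherwise.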