Skip to content

Commit

Permalink
Merge pull request #2 from Digital-Dermatology/selfclean_w_training
Browse files Browse the repository at this point in the history
Selfclean with Training
  • Loading branch information
FabianGroeger96 authored Mar 19, 2024
2 parents abb98e6 + 7610137 commit db3799a
Show file tree
Hide file tree
Showing 61 changed files with 1,159 additions and 71 deletions.
6 changes: 3 additions & 3 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ models/*/*
wandb
wandb/*
wandb/*/*
notebooks
notebooks/*
notebooks/*/*
examples
examples/*
examples/*/*
tmp
tmp/*
profiler
Expand Down
9 changes: 4 additions & 5 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,6 @@ tags
data/*
wandb/*
*/wandb/*
#*.csv
*.ftz
*.ftz.*
*.model
Expand All @@ -292,7 +291,7 @@ assets/*.pickle
*.dat
notebooks/checkpoints
notebooks/tmp_models
assets/cleaning_influence/cache/*

# don't track env file with secret
.env
examples/DINO*
examples/tmp/*
examples/OxfordIIITPet/
DINO*
45 changes: 4 additions & 41 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -85,50 +85,17 @@ NUM_THREADS := $(shell expr $(NUM_CORES) / $(NUM_GPUS))
DOCKER_ARGS := -v $$PWD:/workspace/ -v $(LOCAL_DATA_DIR):/data/ -p $(PORT):8888 --rm
DOCKER_CMD := docker run $(DOCKER_ARGS) $(GPU_ARGS) $(DOCKER_CONTAINER_NAME) -it $(PROJECTNAME):$(GIT_BRANCH)

# SSH
PORT := 22
USERNAME := fgroger

###########################
# COMMANDS
###########################
# Thanks to: https://stackoverflow.com/a/10858332
# Check that given variables are set and all have non-empty values,
# die with an error otherwise.
#
# Params:
# 1. Variable name(s) to test.
# 2. (optional) Error message to print.
check_defined = \
$(strip $(foreach 1,$1, \
$(call __check_defined,$1,$(strip $(value 2)))))
__check_defined = \
$(if $(value $1),, \
$(error Undefined $1$(if $2, ($2))))

###########################
# SSH UTILS
###########################
.PHONY: push_ssh
push_ssh: clean ##@SSH pushes all the directories along with the files to a remote SSH server
$(call check_defined, SSH_CONN)
rsync -r --exclude='data/' --exclude='.git/' --exclude='.github/' --exclude='wandb/' --exclude='assets/' --progress -e 'ssh -p $(PORT)' $(PROJECT_DIR)/ $(USERNAME)@$(SSH_CONN):$(PROJECTNAME)/

.PHONY: pull_ssh
pull_ssh: ##@SSH pulls directories from a remote SSH server
$(call check_defined, SSH_CONN)
scp -r -P $(PORT) $(USERNAME)@$(SSH_CONN):$(PROJECTNAME) .

###########################
# PROJECT UTILS
###########################
.PHONY: init
init: ##@Utils initializes the project and pulls all the necessary data
@git submodule update --init --recursive

.PHONY: update_data_ref
update_data_ref: ##@Utils updates the reference to the submodule to its latest commit
@git submodule update --remote --merge
.PHONY: install
install: ##@Utils install the dependencies for the project
@python3 -m pip install -r requirements.txt
@pre-commit install

.PHONY: clean
clean: ##@Utils clean the project
Expand All @@ -142,10 +109,6 @@ clean: ##@Utils clean the project
@rm -f -R tmp/
@rm -f -R cov_html/

.PHONY: install
install: ##@Utils install the dependencies for the project
python3 -m pip install -r requirements.txt

###########################
# DOCKER
###########################
Expand Down
33 changes: 16 additions & 17 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,30 +1,29 @@
# SelfClean
A holistic self-supervised data cleaning strategy to detect irrelevant samples, near duplicates and label errors.

<p align="center">
<img src="assets/SelfClean_Teaser.svg">
</p>

[**SelfClean Paper**](https://arxiv.org/abs/2305.17048)


## Development Environment
Run `make` for a list of possible targets.

## Installation
Run this command for installation
### Installation
Run these commands to install the project:
```bash
make init
make update_data_ref
make install
```

## Code and test conventions
- `black` for code style
- `isort` for import sorting
- docstring style: `sphinx`
- `pytest` for running tests

### Development installation and configurations
To set up your dev environment run:
```bash
pip install -r requirements.txt
# Install pre-commit hook
pre-commit install
```
To run all the linters on all files:
To run linters on all files:
```bash
pre-commit run --all-files
```

### Code and test conventions
- `black` for code style
- `isort` for import sorting
- `pytest` for running tests
1 change: 1 addition & 0 deletions assets/SelfClean_Teaser.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
343 changes: 343 additions & 0 deletions examples/Investigate_Flowers102.ipynb

Large diffs are not rendered by default.

363 changes: 363 additions & 0 deletions examples/Investigate_OxfordIIITPet.ipynb

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ isort>=5.10
pipreqs==0.4.11
pre-commit>=2.20
pytest-cov>=3
transformers
seaborn
SciencePlots
scikit-image
codecov
randomname
2 changes: 1 addition & 1 deletion src/cleaner/irrelevants/lad_mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,6 @@ def get_irrelevant_ranking(self) -> List[Tuple[float, int]]:
if self.plot_distribution and irrelevant_score is not None:
plot_dist(
scores=np.asarray([x[0] for x in irrelevant_score]),
title="Distribution of irrelevant samples samples",
title="Distribution of irrelevant samples",
)
return irrelevant_score
2 changes: 1 addition & 1 deletion src/cleaner/irrelevants/quantile_irrelevant_mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,6 @@ def get_irrelevant_ranking(self) -> List[Tuple[float, int]]:
if self.plot_distribution and irrelevant_score is not None:
plot_dist(
scores=np.asarray([x[0] for x in irrelevant_score]),
title="Distribution of irrelevant samples samples",
title="Distribution of irrelevant samples",
)
return irrelevant_score
5 changes: 3 additions & 2 deletions src/cleaner/label_errors/intra_extra_distance_mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,9 @@ def labels_calc_scores(self) -> np.ndarray:
# ensure one cannot choose its own distance
np.fill_diagonal(o_hot_same, np.inf)
# calc. the matrices for same and other lbl dists.
min_same = np.nanmin((o_hot_same * self.distance_matrix), axis=-1)
min_diff = np.nanmin((o_hot_diff * self.distance_matrix), axis=-1)
with np.errstate(all="ignore"):
min_same = np.nanmin((o_hot_same * self.distance_matrix), axis=-1)
min_diff = np.nanmin((o_hot_diff * self.distance_matrix), axis=-1)
# check if there are samples without any same labels
missing_same_indices = np.where(np.sum(o_hot_same == 1, axis=-1) == 0)[0]
if len(missing_same_indices) > 0:
Expand Down
Loading

0 comments on commit db3799a

Please sign in to comment.