From 2034c5e33fb87745c6284cd2558b114e4d10238b Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 17 Mar 2023 13:12:59 +0100 Subject: [PATCH 01/10] feat: cache preprocessed datasets for higher performance --- .../data_loader/utils.py | 26 ++++++++++++------- .../pre_split/processors/col_filter.py | 2 +- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/psycop_model_training/data_loader/utils.py b/src/psycop_model_training/data_loader/utils.py index 3ae185db..06defc5b 100644 --- a/src/psycop_model_training/data_loader/utils.py +++ b/src/psycop_model_training/data_loader/utils.py @@ -3,7 +3,12 @@ from typing import Literal, Optional import pandas as pd +from joblib import Memory +from psycop_model_training.config_schemas.data import DataSchema from psycop_model_training.config_schemas.full_config import FullConfigSchema +from psycop_model_training.config_schemas.preprocessing import ( + PreSplitPreprocessingConfigSchema, +) from psycop_model_training.data_loader.data_classes import SplitDataset from psycop_model_training.data_loader.data_loader import DataLoader from psycop_model_training.preprocessing.pre_split.full_processor import ( @@ -13,32 +18,35 @@ PreSplitValueCleaner, ) +memory = Memory(location="E:/shared_resources/t2d/feature_cache", verbose=0) def get_latest_dataset_dir(path: Path) -> Path: """Get the latest dataset directory by time of creation.""" return max(path.glob("*"), key=os.path.getctime) - +@memory.cache def load_and_filter_split_from_cfg( - cfg: FullConfigSchema, + data_cfg: DataSchema, + pre_split_cfg: PreSplitPreprocessingConfigSchema, split: Literal["train", "test", "val"], cache_dir: Optional[Path] = None, ) -> pd.DataFrame: """Load train dataset from config. Args: - cfg (FullConfig): Config + data_cfg (DataSchema): Data config + pre_split_cfg (PreSplitPreprocessingConfigSchema): Pre-split config split (Literal["train", "test", "val"]): Split to load cache_dir (Optional[Path], optional): Directory. Defaults to None, in which case no caching is used. Returns: pd.DataFrame: Train dataset """ - dataset = DataLoader(data_cfg=cfg.data).load_dataset_from_dir(split_names=split) + dataset = DataLoader(data_cfg=data_cfg).load_dataset_from_dir(split_names=split) filtered_data = pre_split_process_full_dataset( dataset=dataset, - pre_split_cfg=cfg.preprocessing.pre_split, - data_cfg=cfg.data, + pre_split_cfg=pre_split_cfg, + data_cfg=data_cfg, cache_dir=cache_dir, ) @@ -58,14 +66,14 @@ def load_and_filter_train_from_cfg( Returns: pd.DataFrame: Train dataset """ - return load_and_filter_split_from_cfg(cfg=cfg, split="train", cache_dir=cache_dir) + return load_and_filter_split_from_cfg(pre_split_cfg=cfg.preprocessing.pre_split, data_cfg=cfg.data, split="train", cache_dir=cache_dir) def load_and_filter_train_and_val_from_cfg(cfg: FullConfigSchema) -> SplitDataset: """Load train and validation data from file.""" return SplitDataset( - train=load_and_filter_split_from_cfg(cfg=cfg, split="train"), - val=load_and_filter_split_from_cfg(cfg=cfg, split="val"), + train=load_and_filter_split_from_cfg(pre_split_cfg=cfg.preprocessing.pre_split, data_cfg=cfg.data, split="train"), + val=load_and_filter_split_from_cfg(pre_split_cfg=cfg.preprocessing.pre_split, data_cfg=cfg.data, split="val"), ) diff --git a/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py b/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py index 0c20c39d..154d8053 100644 --- a/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py +++ b/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py @@ -156,7 +156,7 @@ def _keep_unique_outcome_col_with_lookahead_days_matching_conf( """Keep only one outcome column with the same lookahead days as set in the config.""" outcome_cols = infer_outcome_col_name( - df=dataset, prefix=self.data_cfg.outc_prefix, allow_multiple=True + df=dataset, prefix=self.data_cfg.outc_prefix, allow_multiple=True, ) col_to_drop = [ From f417a46935b54481c3312aecd4350b5b2492de1f Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 17 Mar 2023 13:23:36 +0100 Subject: [PATCH 02/10] ci: update --- src/psycop_model_training/training/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/psycop_model_training/training/utils.py b/src/psycop_model_training/training/utils.py index b99944fc..e3754d2c 100644 --- a/src/psycop_model_training/training/utils.py +++ b/src/psycop_model_training/training/utils.py @@ -29,7 +29,7 @@ def create_eval_dataset( if len(eval_columns) > 0: custom_columns.update(eval_columns) - eval_dataset = EvalDataset( +"" eval_dataset = EvalDataset( ids=df[col_names.id], y=df[outcome_col_name], y_hat_probs=df["y_hat_prob"], From 675e020b6ec40de3c63093c28640444ed9f93b39 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 17 Mar 2023 13:23:40 +0100 Subject: [PATCH 03/10] ci: update --- src/psycop_model_training/data_loader/utils.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/psycop_model_training/data_loader/utils.py b/src/psycop_model_training/data_loader/utils.py index 06defc5b..fff2f159 100644 --- a/src/psycop_model_training/data_loader/utils.py +++ b/src/psycop_model_training/data_loader/utils.py @@ -1,4 +1,5 @@ import os +import time from pathlib import Path from typing import Literal, Optional @@ -18,7 +19,15 @@ PreSplitValueCleaner, ) -memory = Memory(location="E:/shared_resources/t2d/feature_cache", verbose=0) +memory_path = Path("E:/shared_resources/feature_cache") + +# Delete any file older than 24 hours in memory_path +for file in memory_path.glob("*"): + one_day = time.time() - 24 * 3600 + if file.stat().st_mtime < one_day: + file.unlink() + +memory = Memory(location=memory_path, verbose=0) def get_latest_dataset_dir(path: Path) -> Path: """Get the latest dataset directory by time of creation.""" From 8e9f9de181706ff9ebabd99038a97458597aa577 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 17 Mar 2023 13:44:02 +0100 Subject: [PATCH 04/10] fix: typo --- src/psycop_model_training/training/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/psycop_model_training/training/utils.py b/src/psycop_model_training/training/utils.py index e3754d2c..b99944fc 100644 --- a/src/psycop_model_training/training/utils.py +++ b/src/psycop_model_training/training/utils.py @@ -29,7 +29,7 @@ def create_eval_dataset( if len(eval_columns) > 0: custom_columns.update(eval_columns) -"" eval_dataset = EvalDataset( + eval_dataset = EvalDataset( ids=df[col_names.id], y=df[outcome_col_name], y_hat_probs=df["y_hat_prob"], From f5650c23f8b46f9c1e52252ee66752f2212f8d5e Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Tue, 21 Mar 2023 15:34:51 +0100 Subject: [PATCH 05/10] misc: updates to descriptive stats tabel --- .../tables/descriptive_stats_table.py | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/psycop_model_training/model_eval/base_artifacts/tables/descriptive_stats_table.py b/src/psycop_model_training/model_eval/base_artifacts/tables/descriptive_stats_table.py index fbed5dce..fc3b19ae 100644 --- a/src/psycop_model_training/model_eval/base_artifacts/tables/descriptive_stats_table.py +++ b/src/psycop_model_training/model_eval/base_artifacts/tables/descriptive_stats_table.py @@ -36,13 +36,12 @@ def _generate_age_stats( df = self._get_column_header_df() - age_mean = round(self.eval_dataset.age.mean(), 2) - - age_span = f"{self.eval_dataset.age.quantile(0.05)} - {self.eval_dataset.age.quantile(0.95)}" + age_mean = round(self.eval_dataset.age.mean(), 1) + age_span = f"{self.eval_dataset.age.quantile(0.25)} - {self.eval_dataset.age.quantile(0.75)}" df = df.append( # type: ignore { - "category": "(visit_level) age (mean / interval)", + "category": "(visit_level) age (mean / 5-95 quartile interval)", "stat_1": age_mean, "stat_1_unit": "years", "stat_2": age_span, @@ -50,12 +49,13 @@ def _generate_age_stats( }, ignore_index=True, ) + age_counts = bin_continuous_data( self.eval_dataset.age, - bins=[0, 18, 35, 60, 100], + bins=[0, 17, *range(24, 75, 10)], )[0].value_counts() - age_percentages = round(age_counts / len(self.eval_dataset.age) * 100, 2) + age_percentages = round(age_counts / len(self.eval_dataset.age) * 100, 1) for i, _ in enumerate(age_counts): df = df.append( # type: ignore @@ -111,6 +111,12 @@ def _generate_eval_col_stats(self) -> pd.DataFrame: df = self._get_column_header_df() + if ( + not hasattr(self.eval_dataset, "custom_columns") + or self.eval_dataset.custom_columns is None + ): + return df + eval_cols: list[dict[str, pd.Series]] = [ {name: values} for name, values in self.eval_dataset.custom_columns.items() @@ -174,6 +180,7 @@ def _generate_visit_level_stats( # General stats visits_followed_by_positive_outcome = self.eval_dataset.y.sum() + visits_followed_by_positive_outcome_percentage = round( (visits_followed_by_positive_outcome / len(self.eval_dataset.ids) * 100), 2, @@ -290,6 +297,8 @@ def generate_descriptive_stats_table( Returns: Union[pd.DataFrame, wandb.Table]: Table 1. """ + if save_path is not None: + save_path.parent.mkdir(parents=True, exist_ok=True) if self.eval_dataset.age is not None: age_stats = self._generate_age_stats() From 049f0f4f3cbed668341a0c3636a434a48cd4ef2b Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 23 Mar 2023 14:08:23 +0100 Subject: [PATCH 06/10] build: ignore venvs --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 83a82314..b7de2598 100644 --- a/.gitignore +++ b/.gitignore @@ -109,7 +109,7 @@ wandb outputs # virtual env -.venv/ +.venv*/ venv/ # MacOS From 1864ed396090b33d17bfd51cf70202f76a419a78 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 23 Mar 2023 14:08:30 +0100 Subject: [PATCH 07/10] refactor: remove caching --- .../data_loader/utils.py | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/psycop_model_training/data_loader/utils.py b/src/psycop_model_training/data_loader/utils.py index fff2f159..057bd6dd 100644 --- a/src/psycop_model_training/data_loader/utils.py +++ b/src/psycop_model_training/data_loader/utils.py @@ -19,21 +19,12 @@ PreSplitValueCleaner, ) -memory_path = Path("E:/shared_resources/feature_cache") - -# Delete any file older than 24 hours in memory_path -for file in memory_path.glob("*"): - one_day = time.time() - 24 * 3600 - if file.stat().st_mtime < one_day: - file.unlink() - -memory = Memory(location=memory_path, verbose=0) def get_latest_dataset_dir(path: Path) -> Path: """Get the latest dataset directory by time of creation.""" return max(path.glob("*"), key=os.path.getctime) -@memory.cache + def load_and_filter_split_from_cfg( data_cfg: DataSchema, pre_split_cfg: PreSplitPreprocessingConfigSchema, @@ -75,14 +66,27 @@ def load_and_filter_train_from_cfg( Returns: pd.DataFrame: Train dataset """ - return load_and_filter_split_from_cfg(pre_split_cfg=cfg.preprocessing.pre_split, data_cfg=cfg.data, split="train", cache_dir=cache_dir) + return load_and_filter_split_from_cfg( + pre_split_cfg=cfg.preprocessing.pre_split, + data_cfg=cfg.data, + split="train", + cache_dir=cache_dir, + ) def load_and_filter_train_and_val_from_cfg(cfg: FullConfigSchema) -> SplitDataset: """Load train and validation data from file.""" return SplitDataset( - train=load_and_filter_split_from_cfg(pre_split_cfg=cfg.preprocessing.pre_split, data_cfg=cfg.data, split="train"), - val=load_and_filter_split_from_cfg(pre_split_cfg=cfg.preprocessing.pre_split, data_cfg=cfg.data, split="val"), + train=load_and_filter_split_from_cfg( + pre_split_cfg=cfg.preprocessing.pre_split, + data_cfg=cfg.data, + split="train", + ), + val=load_and_filter_split_from_cfg( + pre_split_cfg=cfg.preprocessing.pre_split, + data_cfg=cfg.data, + split="val", + ), ) From 7c9e844053113796464f188680e751970461e62c Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 23 Mar 2023 14:10:05 +0100 Subject: [PATCH 08/10] style: linting --- src/psycop_model_training/data_loader/utils.py | 2 -- .../model_eval/base_artifacts/tables/descriptive_stats_table.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/psycop_model_training/data_loader/utils.py b/src/psycop_model_training/data_loader/utils.py index 057bd6dd..516077e4 100644 --- a/src/psycop_model_training/data_loader/utils.py +++ b/src/psycop_model_training/data_loader/utils.py @@ -1,10 +1,8 @@ import os -import time from pathlib import Path from typing import Literal, Optional import pandas as pd -from joblib import Memory from psycop_model_training.config_schemas.data import DataSchema from psycop_model_training.config_schemas.full_config import FullConfigSchema from psycop_model_training.config_schemas.preprocessing import ( diff --git a/src/psycop_model_training/model_eval/base_artifacts/tables/descriptive_stats_table.py b/src/psycop_model_training/model_eval/base_artifacts/tables/descriptive_stats_table.py index fc3b19ae..65d0cc66 100644 --- a/src/psycop_model_training/model_eval/base_artifacts/tables/descriptive_stats_table.py +++ b/src/psycop_model_training/model_eval/base_artifacts/tables/descriptive_stats_table.py @@ -180,7 +180,7 @@ def _generate_visit_level_stats( # General stats visits_followed_by_positive_outcome = self.eval_dataset.y.sum() - + visits_followed_by_positive_outcome_percentage = round( (visits_followed_by_positive_outcome / len(self.eval_dataset.ids) * 100), 2, From f108fc20a9be6fde07c121f007ce1adc5ae71ebb Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 23 Mar 2023 14:11:56 +0100 Subject: [PATCH 09/10] build: update cruft --- .cookiecutter.json | 1 - .cruft.json | 4 +- Makefile | 99 ++++++++++++++++++++++++++++++++++++++++++++++ setup_for_dev.sh | 19 --------- 4 files changed, 101 insertions(+), 22 deletions(-) create mode 100644 Makefile delete mode 100755 setup_for_dev.sh diff --git a/.cookiecutter.json b/.cookiecutter.json index b5b0a973..05ec5fa1 100644 --- a/.cookiecutter.json +++ b/.cookiecutter.json @@ -7,7 +7,6 @@ "github_user": "", "version": "0.0.0", "copyright_year": "2023", - "setup_for_dev_bash_script": ["y", "n"], "license": ["MIT", "None"], "_copy_without_render": [ "*.github" diff --git a/.cruft.json b/.cruft.json index 9609d9eb..a2b5cced 100644 --- a/.cruft.json +++ b/.cruft.json @@ -1,6 +1,6 @@ { "template": "https://github.com/MartinBernstorff/swift-python-cookiecutter", - "commit": "f4ad2bfd923ac5d8a70d7406f86ca9c4e9251100", + "commit": "ae0ab898bbc61c31407f423b8634ceaa80d5eb1d", "checkout": null, "context": { "cookiecutter": { @@ -12,7 +12,7 @@ "github_user": "MartinBernstorff", "version": "0.0.0", "copyright_year": "2023", - "setup_for_dev_bash_script": "y", + "add_makefile": "y", "license": "MIT", "_copy_without_render": [ "*.github" diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..353c06d8 --- /dev/null +++ b/Makefile @@ -0,0 +1,99 @@ +# Variables +VENV = .venv +VENV_PYTHON = $(VENV)/bin/python +SYSTEM_PYTHON = $(or $(shell which python3.9), $(shell which python3)) +PYTHON = $(or $(wildcard $(VENV_PYTHON)), $(SYSTEM_PYTHON)) + +.DEFAULT_GOAL := help +.PHONY: coverage deps help lint publish push test tox + +setup: ## Setup the environment for development + @$(MAKE) git_init + @$(MAKE) setup_venv + @$(MAKE) install + +git_init: + @if [ ! -d ".git" ]; then \ + echo "๐Ÿ”จ Initializing Git repository..."; \ + git init; \ + git add.; \ + git commit -m "Initial commit"; \ + @echo "โœ… Git repository initialized"; \ + else \ + @echo "โœ… Git repository already exists, continuing"; \ + fi + +setup_venv: + @if [ ! -d ".venv" ]; then \ + @echo "๐Ÿ”จ Creating virtual environment with $(SYSTEM_PYTHON)..."; \ + $(SYSTEM_PYTHON) -m venv .venv; \ + @echo "โœ… Virtual environment created"; \ + else \ + @echo "โœ… Virtual environment already exists, continuing"; \ + fi + +install: ## Install the project as editable, ready for dev + @echo ๐Ÿ”จ Installing project... \ + $(PYTHON) -m pip install -e ".[dev,tests]"; \ + +update: ## Update dependencies + @echo ๐Ÿ”จ Updating project... \ + $(PYTHON) -m pip install --upgrade -e ".[dev,tests]" \ + +test: ## Run tests + @echo "\nโ€“โ€“โ€“ ๐Ÿงช Running tests โ€“โ€“โ€“" + @$(PYTHON) -m pytest -x -n auto -rfE --failed-first -p no:typeguard -p no:cov --disable-warnings -q | tee tests/.pytest_results || true + + @if [ `cat tests/.pytest_results | grep -c "failed"` -gt 0 ]; then \ + echo "\n\n\n"; \ + echo "โ€“โ€“โ€“ ๐Ÿšจ Test failure! ๐Ÿšจ โ€“โ€“โ€“"; \ + cat tests/.pytest_results | grep --color=always "^FAILED" | sed 's/.*test_/FAILURE #test_/' || true; \ + echo "\n"; \ + rm -rf tests/.pytest_results; \ + exit 1; \ + else \ + echo "\nโœ… All tests passed!"; \ + rm -rf tests/.pytest_results; \ + fi + +pr: ## Run linting and tests. If they pass, create a PR. + @$(MAKE) lint + @$(MAKE) test + + @if [ `gh pr list | wc -l` -gt 0 ]; then \ + echo "๐Ÿš‚ Pushing to existing PR..."; \ + git push; \ + else \ + gh pr create --web; \ + fi + + @echo "๐ŸŽ‰ PR up-to-date!" + +lint: + @$(MAKE) pre_commit + @$(MAKE) mypy + +mypy: + @echo "\nโ€“โ€“โ€“ ๐Ÿงน Running mypy โ€“โ€“โ€“" + @$(PYTHON) -m mypy . + +pre_commit: + @echo "\nโ€“โ€“โ€“ ๐Ÿงน Running pre-commit checks โ€“โ€“โ€“" + @pre-commit run --all-files + +help: + @IFS=$$'\n' ; \ + help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \ + printf "%s\n\n" "Usage: make [task]"; \ + printf "%-20s %s\n" "command" "Description" ; \ + printf "%-20s %s\n" "-------" "-----------" ; \ + for help_line in $${help_lines[@]}; do \ + IFS=$$':' ; \ + help_split=($$help_line) ; \ + help_command=`echo $${help_split[0]} | sed -e 's/^ *//' -e 's/ *$$//'` ; \ + help_info=`echo $${help_split[2]} | sed -e 's/^ *//' -e 's/ *$$//'` ; \ + printf '\033[36m'; \ + printf "%-20s %s" $$help_command ; \ + printf '\033[0m'; \ + printf "%s\n" $$help_info; \ + done \ No newline at end of file diff --git a/setup_for_dev.sh b/setup_for_dev.sh deleted file mode 100755 index 5470dda1..00000000 --- a/setup_for_dev.sh +++ /dev/null @@ -1,19 +0,0 @@ -# If no commits exist, initialise the repo -if ! git rev-parse HEAD > /dev/null 2>&1; then - git init - git add . - git commit -m "init: first commit" -fi - -# Check if Python 3.9 is installed -if command -v python3.9 &> /dev/null; then - # Python 3.9 is installed, use it to run the script - python3.9 -m venv .venv -else - # Python 3.11 is not installed, use the default Python version - python -m venv .venv -fi - -source .venv/bin/activate - -pip install -e .[dev,tests] From b9ba89b21d54faf83f4f1eb0246bbc1d383c1966 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 23 Mar 2023 14:33:36 +0100 Subject: [PATCH 10/10] ci: fix description of IQR --- .../model_eval/base_artifacts/tables/descriptive_stats_table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/psycop_model_training/model_eval/base_artifacts/tables/descriptive_stats_table.py b/src/psycop_model_training/model_eval/base_artifacts/tables/descriptive_stats_table.py index 65d0cc66..64b9655e 100644 --- a/src/psycop_model_training/model_eval/base_artifacts/tables/descriptive_stats_table.py +++ b/src/psycop_model_training/model_eval/base_artifacts/tables/descriptive_stats_table.py @@ -41,7 +41,7 @@ def _generate_age_stats( df = df.append( # type: ignore { - "category": "(visit_level) age (mean / 5-95 quartile interval)", + "category": "(visit_level) age (mean / 25-75 quartile interval)", "stat_1": age_mean, "stat_1_unit": "years", "stat_2": age_span,