From c82fa61c984fe999b35ed86a6bc4e035cd8507bb Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 20 Oct 2022 11:02:16 +0200 Subject: [PATCH 01/57] feat: intermediate refactor --- application/train_and_log_models.py | 67 +++++++++++++++++++ .../evaluate_saved_model_predictions.py | 61 +++++++++++++---- 2 files changed, 114 insertions(+), 14 deletions(-) create mode 100644 application/train_and_log_models.py diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py new file mode 100644 index 00000000..1a3e0541 --- /dev/null +++ b/application/train_and_log_models.py @@ -0,0 +1,67 @@ +"""Example script to train multiple models and subsequently log the results to +wandb. + +Usage: +- Replace the HYDRA_ARGS string with the desired arguments for `train_model.py` +- Run this script from project root with `python src/psycopt2d/train_and_log_models.py` +- +""" +import os +from pathlib import Path + +from psycopt2d.evaluate_saved_model_predictions import ( + infer_look_distance, + infer_outcome_col_name, + infer_predictor_col_names, +) +from psycopt2d.load import DataLoader, DatasetSpecification, DatasetTimeSpecification + +BASE_CONF_FILE_NAME = f"default_config.yaml" +DATA_DIR = ( + Path("E:") + / "shared_resources" + / "feature_sets" + / "t2d" + / "feature_sets" + / "psycop_t2d_adminmanber_201_features_2022_10_05_15_14" +) + +BASE_ARGS = f"--multirun +model=xgboost --config-name {BASE_CONF_FILE_NAME}" +WANDB_PROJECT = "psycopt2d-testing" + +if __name__ == "__main__": + time_spec = DatasetTimeSpecification( + drop_patient_if_outcome_before_date="1979-01-01", + min_prediction_time_date="1979-01-01", + min_lookbehind_days=0, + min_lookahead_days=0, + ) + + dataset_spec = DatasetSpecification( + file_suffix="parquet", + time_spec=time_spec, + pred_col_name_prefix="pred_", + pred_time_colname="timestamp", + split_dir_path=DATA_DIR, + time=time_spec, + ) + + loader = DataLoader(dataset_spec) + train = loader.load_dataset_from_dir(split_names="train") + + # Get potential lookaheads from outc_ columns + outcome_col_names = infer_outcome_col_name(df=train, allow_multiple=True) + possible_lookahead_days = infer_look_distance( + col_names=outcome_col_names, allow_multiple=True + ) + + # Get potential lookbehinds from pred_ columns + pred_col_names = infer_predictor_col_names(df=train, allow_multiple=True) + possible_lookbehind_days = infer_look_distance(col_names=pred_col_names) + + # Override wandb group name with these + + # Iterate over them + + # Add feature subsetting subsetting to args + os.system(f"python src/psycopt2d/train_model.py {BASE_ARGS} ") diff --git a/src/psycopt2d/evaluate_saved_model_predictions.py b/src/psycopt2d/evaluate_saved_model_predictions.py index 0505ff86..771c351e 100644 --- a/src/psycopt2d/evaluate_saved_model_predictions.py +++ b/src/psycopt2d/evaluate_saved_model_predictions.py @@ -6,7 +6,9 @@ - Evaluate all models in 'evaluation_results' folder - CLI for evaluating a model """ +import re from pathlib import Path +from typing import Iterable, Union import pandas as pd from omegaconf.dictconfig import DictConfig @@ -15,26 +17,57 @@ from psycopt2d.visualization import plot_auc_by_time_from_first_visit -def infer_outcome_col_name(df: pd.DataFrame, prefix: str = "outc_") -> str: - """Infer the outcome column name from the dataframe.""" - outcome_name = [c for c in df.columns if c.startswith(prefix)] - if len(outcome_name) == 1: - return outcome_name[0] +def infer_col_names( + df: pd.DataFrame, prefix: str, allow_multiple: bool = True +) -> Union[str, list[str]]: + """Infer col names based on prefix""" + col_name = [c for c in df.columns if c.startswith(prefix)] + + if len(col_name) == 1: + return col_name[0] + elif len(col_name) > 1: + if allow_multiple: + return col_name + raise ValueError( + f"Multipel columns found and allow_multiple is {allow_multiple}." + ) else: raise ValueError("More than one outcome inferred") -def infer_predictor_col_names(df: pd.DataFrame, cfg: DictConfig) -> list[str]: - """Get the predictors that are used in the model. +def infer_outcome_col_name( + df: pd.DataFrame, prefix: str = "outc_", allow_multiple: bool = True +) -> Union[str, list[str]]: + """Infer the outcome column name from the dataframe.""" + return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple) - Args: - df (pd.Dataframe): Dataframe with model predictions - cfg (DictConfig): Config file - Returns: - list[str]: list of predictors - """ - return [c for c in df.columns if c.startswith(cfg.data.pred_col_name_prefix)] +def infer_predictor_col_names( + df: pd.DataFrame, prefix: str = "outc_", allow_multiple: bool = True +) -> Union[str, list[str]]: + """Get the predictors that are used in the model.""" + return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple) + + +def infer_look_distance( + col_name: Union[Iterable[str], str], regex_pattern=r"within_(\d)_days" +): + """Infer look distances from col names""" + # E.g. "outc_within_1_days" = 1 + # E.g. "outc_within_2_days" = 2 + # E.g. "pred_within_3_days" = 3 + # E.g. "pred_within_3_days" = 3 + + look_distances = [] + + if isinstance(col_name, Iterable): + look_distances.append( + infer_look_distance(col_name=col_name, regex_pattern=regex_pattern) + ) + else: + look_distances = re.findall(regex_pattern, col_name) + + return look_distances def load_model_predictions_and_cfg(path: Path) -> tuple[pd.DataFrame, DictConfig]: From 1ff62ffd59f82470b33dcc8a4f2308508078a36d Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 20 Oct 2022 13:04:41 +0200 Subject: [PATCH 02/57] feat: init training script --- application/train_and_log_models.py | 54 ++++++++++--------- pyproject.toml | 1 + src/psycopt2d/config/data/synth_data.yaml | 25 +++++---- src/psycopt2d/config/default_config.yaml | 2 +- .../evaluate_saved_model_predictions.py | 51 +++++++++++------- src/psycopt2d/train_model.py | 5 +- 6 files changed, 80 insertions(+), 58 deletions(-) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index 1a3e0541..d9a3cd10 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -3,47 +3,43 @@ Usage: - Replace the HYDRA_ARGS string with the desired arguments for `train_model.py` -- Run this script from project root with `python src/psycopt2d/train_and_log_models.py` -- +- Run this script from project root with `python src/psycopt2d/train_and_log_models.py """ import os from pathlib import Path +from random_word import RandomWords +from wasabi import msg + from psycopt2d.evaluate_saved_model_predictions import ( infer_look_distance, infer_outcome_col_name, - infer_predictor_col_names, + infer_predictor_col_name, ) from psycopt2d.load import DataLoader, DatasetSpecification, DatasetTimeSpecification -BASE_CONF_FILE_NAME = f"default_config.yaml" -DATA_DIR = ( - Path("E:") - / "shared_resources" - / "feature_sets" - / "t2d" - / "feature_sets" - / "psycop_t2d_adminmanber_201_features_2022_10_05_15_14" -) +BASE_CONF_FILE_NAME = "integration_testing.yaml" -BASE_ARGS = f"--multirun +model=xgboost --config-name {BASE_CONF_FILE_NAME}" +DATA_DIR = Path("/Users/au484925/Desktop/psycop-t2d/tests/test_data/synth_splits/") + +BASE_ARGS = "--multirun +model=xgboost" WANDB_PROJECT = "psycopt2d-testing" +N_TRIALS_PER_CELL_IN_GRID = 50 if __name__ == "__main__": time_spec = DatasetTimeSpecification( - drop_patient_if_outcome_before_date="1979-01-01", + drop_patient_if_outcome_before_date=None, min_prediction_time_date="1979-01-01", min_lookbehind_days=0, min_lookahead_days=0, ) dataset_spec = DatasetSpecification( - file_suffix="parquet", - time_spec=time_spec, + file_suffix="csv", + time=time_spec, pred_col_name_prefix="pred_", pred_time_colname="timestamp", split_dir_path=DATA_DIR, - time=time_spec, ) loader = DataLoader(dataset_spec) @@ -51,17 +47,27 @@ # Get potential lookaheads from outc_ columns outcome_col_names = infer_outcome_col_name(df=train, allow_multiple=True) - possible_lookahead_days = infer_look_distance( - col_names=outcome_col_names, allow_multiple=True + possible_lookahead_days = set( + infer_look_distance( + col_name=outcome_col_names, + ) ) # Get potential lookbehinds from pred_ columns - pred_col_names = infer_predictor_col_names(df=train, allow_multiple=True) - possible_lookbehind_days = infer_look_distance(col_names=pred_col_names) + pred_col_names = infer_predictor_col_name(df=train, allow_multiple=True) + possible_lookbehind_days = set(infer_look_distance(col_name=pred_col_names)) # Override wandb group name with these + # Generate random word-word string + r = RandomWords() + + for lookbehind in possible_lookbehind_days: + for lookahead in possible_lookahead_days: + wandb_group = f"{r.get_random_word()}-{r.get_random_word()}-beh-{lookbehind}-ahead-{lookahead}" + + command = f"python src/psycopt2d/train_model.py {BASE_ARGS} data.min_lookbehind_days={lookbehind} data.min_lookahead_days={lookahead} +project.wandb_group={wandb_group} hydra.sweeper.n_trials={N_TRIALS_PER_CELL_IN_GRID} --config-name {BASE_CONF_FILE_NAME}" - # Iterate over them + msg.info("Sending command") + msg.info(command) - # Add feature subsetting subsetting to args - os.system(f"python src/psycopt2d/train_model.py {BASE_ARGS} ") + os.system(command) diff --git a/pyproject.toml b/pyproject.toml index c8927054..c2eecb63 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ selenium = ">=4.2.0,<4.6.0" # See https://github.com/Aarhus-Psychiatry-Research/psycop-t2d/pull/194 for thoughts on root cause seaborn = ">=0.12.0, <0.12.1" pyarrow = ">=9.0.0, <9.1.0" +Random-Word = "^1.0.11" [tool.poetry.dev-dependencies] diff --git a/src/psycopt2d/config/data/synth_data.yaml b/src/psycopt2d/config/data/synth_data.yaml index 4ce1f9a3..04443461 100644 --- a/src/psycopt2d/config/data/synth_data.yaml +++ b/src/psycopt2d/config/data/synth_data.yaml @@ -1,11 +1,14 @@ -n_training_samples: null -min_lookahead_days: null -min_prediction_time_date: null -lookahead_days: 30 -pred_col_name_prefix: "pred_" -pred_timestamp_col_name: timestamp -outcome_timestamp_col_name: timestamp_outcome -id_col_name: citizen_ids -source: synthetic -min_lookbehind_days: null -drop_patient_if_outcome_before_date: null \ No newline at end of file +# @package _global_ + +data: + n_training_samples: null + min_lookahead_days: null + min_prediction_time_date: null + lookahead_days: 30 + pred_col_name_prefix: "pred_" + pred_timestamp_col_name: timestamp + outcome_timestamp_col_name: timestamp_outcome + id_col_name: citizen_ids + source: synthetic + min_lookbehind_days: null + drop_patient_if_outcome_before_date: null diff --git a/src/psycopt2d/config/default_config.yaml b/src/psycopt2d/config/default_config.yaml index 5e318e27..2d081f25 100644 --- a/src/psycopt2d/config/default_config.yaml +++ b/src/psycopt2d/config/default_config.yaml @@ -5,4 +5,4 @@ defaults: - preprocessing: default_preprocessing - training: default_training - evaluation: default_evaluation - - sweeper: optuna_singlethread \ No newline at end of file + - sweeper: optuna_multithread diff --git a/src/psycopt2d/evaluate_saved_model_predictions.py b/src/psycopt2d/evaluate_saved_model_predictions.py index 771c351e..30b0e84b 100644 --- a/src/psycopt2d/evaluate_saved_model_predictions.py +++ b/src/psycopt2d/evaluate_saved_model_predictions.py @@ -7,8 +7,9 @@ - CLI for evaluating a model """ import re +from collections.abc import Iterable from pathlib import Path -from typing import Iterable, Union +from typing import Union import pandas as pd from omegaconf.dictconfig import DictConfig @@ -18,9 +19,11 @@ def infer_col_names( - df: pd.DataFrame, prefix: str, allow_multiple: bool = True + df: pd.DataFrame, + prefix: str, + allow_multiple: bool = True, ) -> Union[str, list[str]]: - """Infer col names based on prefix""" + """Infer col names based on prefix.""" col_name = [c for c in df.columns if c.startswith(prefix)] if len(col_name) == 1: @@ -29,43 +32,55 @@ def infer_col_names( if allow_multiple: return col_name raise ValueError( - f"Multipel columns found and allow_multiple is {allow_multiple}." + f"Multipel columns found and allow_multiple is {allow_multiple}.", ) else: raise ValueError("More than one outcome inferred") def infer_outcome_col_name( - df: pd.DataFrame, prefix: str = "outc_", allow_multiple: bool = True + df: pd.DataFrame, + prefix: str = "outc_", + allow_multiple: bool = True, ) -> Union[str, list[str]]: """Infer the outcome column name from the dataframe.""" return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple) -def infer_predictor_col_names( - df: pd.DataFrame, prefix: str = "outc_", allow_multiple: bool = True +def infer_predictor_col_name( + df: pd.DataFrame, + prefix: str = "pred_", + allow_multiple: bool = True, ) -> Union[str, list[str]]: """Get the predictors that are used in the model.""" return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple) def infer_look_distance( - col_name: Union[Iterable[str], str], regex_pattern=r"within_(\d)_days" -): - """Infer look distances from col names""" + col_name: Union[Iterable[str], str], + regex_pattern: str = r"within_(\d+)_days", + allow_multiple: bool = True, +) -> list[Union[int, float]]: + """Infer look distances from col names.""" # E.g. "outc_within_1_days" = 1 # E.g. "outc_within_2_days" = 2 # E.g. "pred_within_3_days" = 3 # E.g. "pred_within_3_days" = 3 - look_distances = [] + look_distances: list[Union[int, float]] = [] - if isinstance(col_name, Iterable): - look_distances.append( - infer_look_distance(col_name=col_name, regex_pattern=regex_pattern) - ) + if isinstance(col_name, Iterable) and not isinstance(col_name, str): + for c_name in col_name: + look_distances += infer_look_distance( + col_name=c_name, regex_pattern=regex_pattern + ) else: - look_distances = re.findall(regex_pattern, col_name) + look_distances = re.findall(pattern=regex_pattern, string=col_name) + + if len(look_distances) > 1 and not allow_multiple: + raise ValueError( + f"Multiple col names provided and allow_multiple is {allow_multiple}.", + ) return look_distances @@ -89,8 +104,8 @@ def load_model_predictions_and_cfg(path: Path) -> tuple[pd.DataFrame, DictConfig / "eval_model_name-xgboost_require_imputation-True_args-n_estimators-100_tree_method-auto_2022_09_22_10_52.pkl", ) - train_col_names = infer_predictor_col_names(eval_df, cfg) - y_col_name = infer_outcome_col_name(eval_df) + train_col_names = infer_predictor_col_name(df=eval_df) + y_col_name = infer_outcome_col_name(df=eval_df) Y_HAT_PROB_COL_NAME = "y_hat_prob" # change to 'y_hat_prob_oof' if using cv diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py index ad635c98..1d865797 100644 --- a/src/psycopt2d/train_model.py +++ b/src/psycopt2d/train_model.py @@ -304,15 +304,12 @@ def main(cfg): create_wandb_folders() - # Get today's date as str - today_str = datetime.now().strftime("%Y-%m-%d") - run = wandb.init( project=cfg.project.name, reinit=True, config=flatten_nested_dict(cfg, sep="."), mode=cfg.project.wandb_mode, - group=today_str, + group=cfg.project.wandb_group, ) dataset = load_train_and_val_from_cfg(cfg) From 8e0506a8af7ec7f057a34fba7bd097cb6906c754 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 20 Oct 2022 13:04:53 +0200 Subject: [PATCH 03/57] style: linting --- application/train_and_log_models.py | 2 +- src/psycopt2d/evaluate_saved_model_predictions.py | 3 ++- src/psycopt2d/train_model.py | 1 - 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index d9a3cd10..84758f89 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -50,7 +50,7 @@ possible_lookahead_days = set( infer_look_distance( col_name=outcome_col_names, - ) + ), ) # Get potential lookbehinds from pred_ columns diff --git a/src/psycopt2d/evaluate_saved_model_predictions.py b/src/psycopt2d/evaluate_saved_model_predictions.py index 30b0e84b..37526f13 100644 --- a/src/psycopt2d/evaluate_saved_model_predictions.py +++ b/src/psycopt2d/evaluate_saved_model_predictions.py @@ -72,7 +72,8 @@ def infer_look_distance( if isinstance(col_name, Iterable) and not isinstance(col_name, str): for c_name in col_name: look_distances += infer_look_distance( - col_name=c_name, regex_pattern=regex_pattern + col_name=c_name, + regex_pattern=regex_pattern, ) else: look_distances = re.findall(pattern=regex_pattern, string=col_name) diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py index 1d865797..a642e5df 100644 --- a/src/psycopt2d/train_model.py +++ b/src/psycopt2d/train_model.py @@ -1,7 +1,6 @@ """Training script for training a single model for predicting t2d.""" import os from collections.abc import Iterable -from datetime import datetime from pathlib import Path from typing import Optional From 3f8ccbb02de841bcefe36cbe861e7924c35ff492 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 20 Oct 2022 13:06:51 +0200 Subject: [PATCH 04/57] fix: add wandb_group to project struct --- src/psycopt2d/config/project/integration_test_project.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/psycopt2d/config/project/integration_test_project.yaml b/src/psycopt2d/config/project/integration_test_project.yaml index b3d29e04..39402f44 100644 --- a/src/psycopt2d/config/project/integration_test_project.yaml +++ b/src/psycopt2d/config/project/integration_test_project.yaml @@ -1,3 +1,4 @@ name: psycop-t2d-integration-testing seed: 42 -wandb_mode: "disabled" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" \ No newline at end of file +wandb_mode: "disabled" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" +wandb_group: "integration_testing" From ca99441ff5a0b14f612c7f86e23d76396dc5b6f8 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 20 Oct 2022 13:14:11 +0200 Subject: [PATCH 05/57] style: lint --- src/psycopt2d/utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/psycopt2d/utils.py b/src/psycopt2d/utils.py index 73576489..cee19ff5 100644 --- a/src/psycopt2d/utils.py +++ b/src/psycopt2d/utils.py @@ -392,6 +392,7 @@ def load_evaluation_data(model_data_dir: Path) -> ModelEvalData: feature_importance_dict=feature_importance_dict, ) + def infer_col_names( df: pd.DataFrame, prefix: str, @@ -429,6 +430,11 @@ def infer_predictor_col_name( """Get the predictors that are used in the model.""" return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple) -def infer_y_hat_prob_col_name(df: pd.DataFrame, prefix="y_hat_prob", allow_multiple: False) -> str: + +def infer_y_hat_prob_col_name( + df: pd.DataFrame, + prefix="y_hat_prob", + allow_multiple: bool = False, +) -> str: """Infer the y_hat_prob column name from the dataframe.""" - return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple) \ No newline at end of file + return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple) From d8bd92df0f0303ab5fa9afa40bb28148fc191f37 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 20 Oct 2022 13:14:37 +0200 Subject: [PATCH 06/57] docs: typo --- src/psycopt2d/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/psycopt2d/utils.py b/src/psycopt2d/utils.py index cee19ff5..eac545e0 100644 --- a/src/psycopt2d/utils.py +++ b/src/psycopt2d/utils.py @@ -407,7 +407,7 @@ def infer_col_names( if allow_multiple: return col_name raise ValueError( - f"Multipel columns found and allow_multiple is {allow_multiple}.", + f"Multiple columns found and allow_multiple is {allow_multiple}.", ) else: raise ValueError("More than one outcome inferred") From bb63f531cd1405160cc8137bebf8b42428a336f5 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 20 Oct 2022 13:28:24 +0200 Subject: [PATCH 07/57] feat: add watcher --- application/train_and_log_models.py | 146 +++++++++++++++++++++++----- 1 file changed, 122 insertions(+), 24 deletions(-) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index 84758f89..79d3282d 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -6,10 +6,14 @@ - Run this script from project root with `python src/psycopt2d/train_and_log_models.py """ import os +import subprocess +import time from pathlib import Path +from typing import Iterable, Union -from random_word import RandomWords -from wasabi import msg +from hydra import compose, initialize +from pydantic import BaseModel +from wasabi import Printer, msg from psycopt2d.evaluate_saved_model_predictions import ( infer_look_distance, @@ -26,27 +30,43 @@ WANDB_PROJECT = "psycopt2d-testing" N_TRIALS_PER_CELL_IN_GRID = 50 -if __name__ == "__main__": - time_spec = DatasetTimeSpecification( - drop_patient_if_outcome_before_date=None, - min_prediction_time_date="1979-01-01", - min_lookbehind_days=0, - min_lookahead_days=0, - ) +# RUN CONSTANTS +CONFIG_NAME = "integration_testing.yaml" - dataset_spec = DatasetSpecification( - file_suffix="csv", - time=time_spec, - pred_col_name_prefix="pred_", - pred_time_colname="timestamp", - split_dir_path=DATA_DIR, - ) +HYDRA_ARGS = f"--multirun +model=xgboost project.wandb_mode='dryrun' model.args.tree_method='auto' --config-name {CONFIG_NAME}" +OVERTACI = "false" # Change to "true" if running on overtaci +# WATCHER CONSTANTS +WANDB_ENTITY = ( + "psycop" # The wandb entity to upload to (e.g. "psycop" or your user name) +) +N_RUNS_BEFORE_FIRST_EVAL = ( + "1" # The number of runs to upload to wandb before evaluating the best runs. +) +KEEP_WATCHER_ALIVE_AFTER_TRAINING_FINISHED_MINUTES = ( + 5 # minutes to wait for the wandb watcher after training +) +# has finished. Will kill the watcher after this time. +ARCHIVE_ALL_WANDB_RUNS = "false" # whether to archive all runs in the wandb folder +# before starting model training. Change to "t" to archive all wandb runs + + +def load_data(dataset_spec): + """Load the data""" loader = DataLoader(dataset_spec) - train = loader.load_dataset_from_dir(split_names="train") + return loader.load_dataset_from_dir(split_names="train") + +class PossibleLookDistanceDays(BaseModel): + ahead: Iterable[Union[int, float]] + behind: Iterable[Union[int, float]] + + +def infer_possible_look_directions(train): + """Infer the possible values for min_lookahead_days and min_lookbehind_days""" # Get potential lookaheads from outc_ columns outcome_col_names = infer_outcome_col_name(df=train, allow_multiple=True) + possible_lookahead_days = set( infer_look_distance( col_name=outcome_col_names, @@ -57,17 +77,95 @@ pred_col_names = infer_predictor_col_name(df=train, allow_multiple=True) possible_lookbehind_days = set(infer_look_distance(col_name=pred_col_names)) - # Override wandb group name with these - # Generate random word-word string - r = RandomWords() + return PossibleLookDistanceDays( + ahead=possible_lookahead_days, behind=possible_lookbehind_days + ) + + +def get_dataset_spec(data_dir_path: Path): + time_spec = DatasetTimeSpecification( + drop_patient_if_outcome_before_date=None, + min_prediction_time_date="1979-01-01", + min_lookbehind_days=0, + min_lookahead_days=0, + ) + + return DatasetSpecification( + file_suffix="csv", + time=time_spec, + pred_col_name_prefix="pred_", + pred_time_colname="timestamp", + split_dir_path=data_dir_path, + ) - for lookbehind in possible_lookbehind_days: - for lookahead in possible_lookahead_days: - wandb_group = f"{r.get_random_word()}-{r.get_random_word()}-beh-{lookbehind}-ahead-{lookahead}" - command = f"python src/psycopt2d/train_model.py {BASE_ARGS} data.min_lookbehind_days={lookbehind} data.min_lookahead_days={lookahead} +project.wandb_group={wandb_group} hydra.sweeper.n_trials={N_TRIALS_PER_CELL_IN_GRID} --config-name {BASE_CONF_FILE_NAME}" +def train_models_for_each_grid( + base_conf_file_name: Union[str, Path], + base_args: str, + n_trials_per_cell_in_grid: int, + possible_look_distances: PossibleLookDistanceDays, +): + """Train a model for each cell in the grid of possible look distances""" + from random_word import RandomWords + + random_word = RandomWords() + + for lookbehind in possible_look_distances.behind: + for lookahead in possible_look_distances.ahead: + wandb_group = f"{random_word.get_random_word()}-{random_word.get_random_word()}-beh-{lookbehind}-ahead-{lookahead}" + + command = f"python src/psycopt2d/train_model.py {base_args} data.min_lookbehind_days={lookbehind} data.min_lookahead_days={lookahead} +project.wandb_group={wandb_group} hydra.sweeper.n_trials={n_trials_per_cell_in_grid} --config-name {base_conf_file_name}" msg.info("Sending command") msg.info(command) os.system(command) + + +if __name__ == "__main__": + msg = Printer(timestamp=True) + + with initialize(version_base=None, config_path="config/"): + cfg = compose( + config_name=CONFIG_NAME, + ) + + dataset_spec = get_dataset_spec(data_dir_path=DATA_DIR) + + train = load_data(dataset_spec=dataset_spec) + + possible_look_distance = infer_possible_look_directions(train) + + watcher = subprocess.Popen( # pylint: disable=consider-using-with + [ + "python", + "src/psycopt2d/model_training_watcher.py", + "--entity", + WANDB_ENTITY, + "--project_name", + cfg.project.name, + "--n_runs_before_eval", + N_RUNS_BEFORE_FIRST_EVAL, + "--overtaci", + OVERTACI, + "--timeout", + "None", + "--clean_wandb_dir", + ARCHIVE_ALL_WANDB_RUNS, + ], + ) + + train_models_for_each_grid( + base_conf_file_name=BASE_CONF_FILE_NAME, + base_args=BASE_ARGS, + n_trials_per_cell_in_grid=N_TRIALS_PER_CELL_IN_GRID, + possible_look_distances=possible_look_distance, + ) + + msg.good( + f"Training finished. Stopping the watcher in {KEEP_WATCHER_ALIVE_AFTER_TRAINING_FINISHED_MINUTES} minutes...", + ) + + time.sleep(60 * KEEP_WATCHER_ALIVE_AFTER_TRAINING_FINISHED_MINUTES) + watcher.kill() + msg.good("Watcher stopped.") From 012bec76c9f06c13a857e6ef0f6cb0c0b22243f6 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 20 Oct 2022 13:47:09 +0200 Subject: [PATCH 08/57] refactor: use structs --- application/train_and_log_models.py | 154 ++++++++++++++++++---------- 1 file changed, 101 insertions(+), 53 deletions(-) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index 79d3282d..64e94e62 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -8,11 +8,11 @@ import os import subprocess import time +from collections.abc import Iterable from pathlib import Path -from typing import Iterable, Union +from typing import Union -from hydra import compose, initialize -from pydantic import BaseModel +from pydantic import BaseModel, Field from wasabi import Printer, msg from psycopt2d.evaluate_saved_model_predictions import ( @@ -22,48 +22,79 @@ ) from psycopt2d.load import DataLoader, DatasetSpecification, DatasetTimeSpecification -BASE_CONF_FILE_NAME = "integration_testing.yaml" -DATA_DIR = Path("/Users/au484925/Desktop/psycop-t2d/tests/test_data/synth_splits/") +class PossibleLookDistanceDays(BaseModel): + """Possible look distances""" -BASE_ARGS = "--multirun +model=xgboost" -WANDB_PROJECT = "psycopt2d-testing" -N_TRIALS_PER_CELL_IN_GRID = 50 + ahead: Iterable[Union[int, float]] + behind: Iterable[Union[int, float]] -# RUN CONSTANTS -CONFIG_NAME = "integration_testing.yaml" -HYDRA_ARGS = f"--multirun +model=xgboost project.wandb_mode='dryrun' model.args.tree_method='auto' --config-name {CONFIG_NAME}" -OVERTACI = "false" # Change to "true" if running on overtaci +class MetaConf(BaseModel): + """Meta configuration for the script.""" -# WATCHER CONSTANTS -WANDB_ENTITY = ( - "psycop" # The wandb entity to upload to (e.g. "psycop" or your user name) -) -N_RUNS_BEFORE_FIRST_EVAL = ( - "1" # The number of runs to upload to wandb before evaluating the best runs. -) -KEEP_WATCHER_ALIVE_AFTER_TRAINING_FINISHED_MINUTES = ( - 5 # minutes to wait for the wandb watcher after training -) -# has finished. Will kill the watcher after this time. -ARCHIVE_ALL_WANDB_RUNS = "false" # whether to archive all runs in the wandb folder -# before starting model training. Change to "t" to archive all wandb runs + conf_name: str = Field("integration_testing.yaml") + data_dir: Path = Path( + "/Users/au484925/Desktop/psycop-t2d/tests/test_data/synth_splits/" + ) + overtaci: str = Field( + default="false", description="Change to 'true' if running on overtaci" + ) + + +class WatcherConf(BaseModel): + """Confiugration for the watcher.""" + + archive_all: str = Field( + default="false", + description="Whether to archive all runs in the wandb folder before starting model training. Change to 't' to archive all wandb runs", + ) + n_runs_before_first_eval: int = Field( + default="1", + description="The number of runs to upload to wandb before evaluating the best runs.", + ) + keep_alive_after_training_minutes: int = Field( + default=5, + description="minutes to wait for the wandb watcher after training has finished. Will kill the watcher after this time.", + ) + + +class WandbConf(BaseModel): + """Configuration for wandb.""" + + project_name: str = "psycopt2d-testing" + entity: str = Field( + default="psycop", + description="The wandb entity to upload to (e.g. 'psycop' or your user name)", + ) + + +class TrainConf(BaseModel): + """Configuration for model training.""" + + n_trials_per_cell_in_grid: int = Field( + default=50, + description="Number of trials per cell in the lookahead/lookbehind grid", + ) + + conf_name: str = Field(default="integration_testing.yaml") + + base_args: str = Field( + default=f"--multirun +model=xgboost project.wandb_mode='dryrun' model.args.tree_method='auto' --config-name {conf_name}" + ) + + possible_look_distance: PossibleLookDistanceDays def load_data(dataset_spec): - """Load the data""" + """Load the data.""" loader = DataLoader(dataset_spec) return loader.load_dataset_from_dir(split_names="train") -class PossibleLookDistanceDays(BaseModel): - ahead: Iterable[Union[int, float]] - behind: Iterable[Union[int, float]] - - def infer_possible_look_directions(train): - """Infer the possible values for min_lookahead_days and min_lookbehind_days""" + """Infer the possible values for min_lookahead_days and + min_lookbehind_days.""" # Get potential lookaheads from outc_ columns outcome_col_names = infer_outcome_col_name(df=train, allow_multiple=True) @@ -78,11 +109,13 @@ def infer_possible_look_directions(train): possible_lookbehind_days = set(infer_look_distance(col_name=pred_col_names)) return PossibleLookDistanceDays( - ahead=possible_lookahead_days, behind=possible_lookbehind_days + ahead=possible_lookahead_days, + behind=possible_lookbehind_days, ) def get_dataset_spec(data_dir_path: Path): + """Get dataset specification""" time_spec = DatasetTimeSpecification( drop_patient_if_outcome_before_date=None, min_prediction_time_date="1979-01-01", @@ -99,13 +132,13 @@ def get_dataset_spec(data_dir_path: Path): ) -def train_models_for_each_grid( +def train_models_for_each_cell_in_grid( base_conf_file_name: Union[str, Path], base_args: str, n_trials_per_cell_in_grid: int, possible_look_distances: PossibleLookDistanceDays, ): - """Train a model for each cell in the grid of possible look distances""" + """Train a model for each cell in the grid of possible look distances.""" from random_word import RandomWords random_word = RandomWords() @@ -125,47 +158,62 @@ def train_models_for_each_grid( if __name__ == "__main__": msg = Printer(timestamp=True) - with initialize(version_base=None, config_path="config/"): - cfg = compose( - config_name=CONFIG_NAME, - ) + meta_conf = MetaConf( + conf_name="integration_testing.yaml", + overtaci="false", + data_dir=Path( + "/Users/au484925/Desktop/psycop-t2d/tests/test_data/synth_splits/" + ), + ) - dataset_spec = get_dataset_spec(data_dir_path=DATA_DIR) + wandb_conf = WandbConf( + entity="psycop", + project_name="psycopt2d-testing", + ) - train = load_data(dataset_spec=dataset_spec) + watcher_conf = WatcherConf(archive_all="false", keep_alive_after_training_minutes=5) + dataset_spec = get_dataset_spec(data_dir_path=meta_conf.data_dir) + + train = load_data(dataset_spec=dataset_spec) possible_look_distance = infer_possible_look_directions(train) + train_conf = TrainConf( + conf_name=meta_conf.conf_name, + base_args=f"--multirun +model=xgboost project.wandb_mode='dryrun' model.args.tree_method='auto' --config-name {meta_conf.conf_name}", + n_trials_per_cell_in_grid=50, + ) + watcher = subprocess.Popen( # pylint: disable=consider-using-with [ "python", "src/psycopt2d/model_training_watcher.py", "--entity", - WANDB_ENTITY, + wandb_conf.entity, "--project_name", - cfg.project.name, + wandb_conf.project_name, "--n_runs_before_eval", - N_RUNS_BEFORE_FIRST_EVAL, + str(watcher_conf.n_runs_before_first_eval), "--overtaci", - OVERTACI, + meta_conf.overtaci, "--timeout", "None", "--clean_wandb_dir", - ARCHIVE_ALL_WANDB_RUNS, + watcher_conf.archive_all, ], ) - train_models_for_each_grid( - base_conf_file_name=BASE_CONF_FILE_NAME, - base_args=BASE_ARGS, - n_trials_per_cell_in_grid=N_TRIALS_PER_CELL_IN_GRID, - possible_look_distances=possible_look_distance, + train_models_for_each_cell_in_grid( + base_conf_file_name=train_conf.conf_name, + base_args=train_conf.base_args, + n_trials_per_cell_in_grid=train_conf.n_trials_per_cell_in_grid, + possible_look_distances=train_conf.possible_look_distance, ) msg.good( - f"Training finished. Stopping the watcher in {KEEP_WATCHER_ALIVE_AFTER_TRAINING_FINISHED_MINUTES} minutes...", + f"Training finished. Stopping the watcher in {watcher_conf.keep_alive_after_training_minutes} minutes...", ) - time.sleep(60 * KEEP_WATCHER_ALIVE_AFTER_TRAINING_FINISHED_MINUTES) + time.sleep(60 * watcher_conf.keep_alive_after_training_minutes) watcher.kill() msg.good("Watcher stopped.") From b60f66141d45d1293069457a89e834942cc1365b Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 20 Oct 2022 13:47:25 +0200 Subject: [PATCH 09/57] style: lint --- application/train_and_log_models.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index 64e94e62..01855323 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -13,7 +13,7 @@ from typing import Union from pydantic import BaseModel, Field -from wasabi import Printer, msg +from wasabi import Printer from psycopt2d.evaluate_saved_model_predictions import ( infer_look_distance, @@ -24,7 +24,7 @@ class PossibleLookDistanceDays(BaseModel): - """Possible look distances""" + """Possible look distances.""" ahead: Iterable[Union[int, float]] behind: Iterable[Union[int, float]] @@ -35,10 +35,11 @@ class MetaConf(BaseModel): conf_name: str = Field("integration_testing.yaml") data_dir: Path = Path( - "/Users/au484925/Desktop/psycop-t2d/tests/test_data/synth_splits/" + "/Users/au484925/Desktop/psycop-t2d/tests/test_data/synth_splits/", ) overtaci: str = Field( - default="false", description="Change to 'true' if running on overtaci" + default="false", + description="Change to 'true' if running on overtaci", ) @@ -80,7 +81,7 @@ class TrainConf(BaseModel): conf_name: str = Field(default="integration_testing.yaml") base_args: str = Field( - default=f"--multirun +model=xgboost project.wandb_mode='dryrun' model.args.tree_method='auto' --config-name {conf_name}" + default=f"--multirun +model=xgboost project.wandb_mode='dryrun' model.args.tree_method='auto' --config-name {conf_name}", ) possible_look_distance: PossibleLookDistanceDays @@ -115,7 +116,7 @@ def infer_possible_look_directions(train): def get_dataset_spec(data_dir_path: Path): - """Get dataset specification""" + """Get dataset specification.""" time_spec = DatasetTimeSpecification( drop_patient_if_outcome_before_date=None, min_prediction_time_date="1979-01-01", @@ -162,7 +163,7 @@ def train_models_for_each_cell_in_grid( conf_name="integration_testing.yaml", overtaci="false", data_dir=Path( - "/Users/au484925/Desktop/psycop-t2d/tests/test_data/synth_splits/" + "/Users/au484925/Desktop/psycop-t2d/tests/test_data/synth_splits/", ), ) From 0d460e84f7d3cee6e40d90c9f6b8d96542099540 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 20 Oct 2022 15:48:20 +0200 Subject: [PATCH 10/57] refactor: use objects --- application/train_and_log_models.py | 136 ++++++++++++++++------------ 1 file changed, 79 insertions(+), 57 deletions(-) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index 01855323..94a7e8ec 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -5,12 +5,9 @@ - Replace the HYDRA_ARGS string with the desired arguments for `train_model.py` - Run this script from project root with `python src/psycopt2d/train_and_log_models.py """ -import os import subprocess import time -from collections.abc import Iterable from pathlib import Path -from typing import Union from pydantic import BaseModel, Field from wasabi import Printer @@ -26,8 +23,8 @@ class PossibleLookDistanceDays(BaseModel): """Possible look distances.""" - ahead: Iterable[Union[int, float]] - behind: Iterable[Union[int, float]] + ahead: list[str] + behind: list[str] class MetaConf(BaseModel): @@ -44,7 +41,7 @@ class MetaConf(BaseModel): class WatcherConf(BaseModel): - """Confiugration for the watcher.""" + """Configuration for the watcher.""" archive_all: str = Field( default="false", @@ -68,23 +65,31 @@ class WandbConf(BaseModel): default="psycop", description="The wandb entity to upload to (e.g. 'psycop' or your user name)", ) + mode: str = Field(default="online", description="The wandb mode to use") class TrainConf(BaseModel): """Configuration for model training.""" + gpu: bool = Field(default="false", description="Whether to use GPU") + n_trials_per_cell_in_grid: int = Field( default=50, description="Number of trials per cell in the lookahead/lookbehind grid", ) + model_conf: str = Field( + default="xgboost", + description="The model conf to open. For example, 'xgboost' or 'logistic_regression'.", + ) + conf_name: str = Field(default="integration_testing.yaml") - base_args: str = Field( - default=f"--multirun +model=xgboost project.wandb_mode='dryrun' model.args.tree_method='auto' --config-name {conf_name}", + multirun: bool = Field( + default=False, description="Whether to use Hydra to run multiple models." ) - possible_look_distance: PossibleLookDistanceDays + possible_look_distances: PossibleLookDistanceDays def load_data(dataset_spec): @@ -99,15 +104,11 @@ def infer_possible_look_directions(train): # Get potential lookaheads from outc_ columns outcome_col_names = infer_outcome_col_name(df=train, allow_multiple=True) - possible_lookahead_days = set( - infer_look_distance( - col_name=outcome_col_names, - ), - ) + possible_lookahead_days = infer_look_distance(col_name=outcome_col_names) # Get potential lookbehinds from pred_ columns pred_col_names = infer_predictor_col_name(df=train, allow_multiple=True) - possible_lookbehind_days = set(infer_look_distance(col_name=pred_col_names)) + possible_lookbehind_days = list(set(infer_look_distance(col_name=pred_col_names))) return PossibleLookDistanceDays( ahead=possible_lookahead_days, @@ -134,26 +135,44 @@ def get_dataset_spec(data_dir_path: Path): def train_models_for_each_cell_in_grid( - base_conf_file_name: Union[str, Path], - base_args: str, - n_trials_per_cell_in_grid: int, - possible_look_distances: PossibleLookDistanceDays, + train_conf: TrainConf, ): """Train a model for each cell in the grid of possible look distances.""" from random_word import RandomWords random_word = RandomWords() - for lookbehind in possible_look_distances.behind: - for lookahead in possible_look_distances.ahead: + for lookbehind in train_conf.possible_look_distances.behind: + for lookahead in train_conf.possible_look_distances.ahead: wandb_group = f"{random_word.get_random_word()}-{random_word.get_random_word()}-beh-{lookbehind}-ahead-{lookahead}" - command = f"python src/psycopt2d/train_model.py {base_args} data.min_lookbehind_days={lookbehind} data.min_lookahead_days={lookahead} +project.wandb_group={wandb_group} hydra.sweeper.n_trials={n_trials_per_cell_in_grid} --config-name {base_conf_file_name}" + subprocess_args: list[str] = [ + "python", + "src/psycopt2d/train_model.py", + f"+model={train_conf.model_conf}", + f"data.min_lookbehind_days={lookbehind}", + f"data.min_lookahead_days={lookahead}", + f"project.wandb_group='{wandb_group}'", + f"hydra.sweeper.n_trials={train_conf.n_trials_per_cell_in_grid}", + "--config-name", + f"{meta_conf.conf_name}", + ] + + if train_conf.multirun: + subprocess_args.insert(2, "--multirun") - msg.info("Sending command") - msg.info(command) + if train_conf.model_conf == "xgboost" and not train_conf.gpu: + subprocess_args.insert(3, "++model.args.tree_method='auto'") - os.system(command) + msg.info("Starting trainer with command") + msg.info(f'{" ".join(subprocess_args)}') + + trainer = subprocess.Popen( # pylint: disable=consider-using-with + args=subprocess_args, + ) + + while trainer.poll() is None: + time.sleep(1) if __name__ == "__main__": @@ -170,51 +189,54 @@ def train_models_for_each_cell_in_grid( wandb_conf = WandbConf( entity="psycop", project_name="psycopt2d-testing", + mode="offline", ) - watcher_conf = WatcherConf(archive_all="false", keep_alive_after_training_minutes=5) - + watcher_conf = WatcherConf(archive_all="true", keep_alive_after_training_minutes=5) dataset_spec = get_dataset_spec(data_dir_path=meta_conf.data_dir) - train = load_data(dataset_spec=dataset_spec) - possible_look_distance = infer_possible_look_directions(train) + + possible_look_distances = infer_possible_look_directions(train) train_conf = TrainConf( conf_name=meta_conf.conf_name, - base_args=f"--multirun +model=xgboost project.wandb_mode='dryrun' model.args.tree_method='auto' --config-name {meta_conf.conf_name}", - n_trials_per_cell_in_grid=50, - ) - - watcher = subprocess.Popen( # pylint: disable=consider-using-with - [ - "python", - "src/psycopt2d/model_training_watcher.py", - "--entity", - wandb_conf.entity, - "--project_name", - wandb_conf.project_name, - "--n_runs_before_eval", - str(watcher_conf.n_runs_before_first_eval), - "--overtaci", - meta_conf.overtaci, - "--timeout", - "None", - "--clean_wandb_dir", - watcher_conf.archive_all, - ], - ) + multirun=False, + model_conf="xgboost", + n_trials_per_cell_in_grid=1, + possible_look_distances=possible_look_distances, + gpu=False, + ) + + if not train_conf.gpu: + msg.warn("Not using GPU for training") + + # watcher = subprocess.Popen( # pylint: disable=consider-using-with + # [ + # "python", + # "src/psycopt2d/model_training_watcher.py", + # "--entity", + # wandb_conf.entity, + # "--project_name", + # wandb_conf.project_name, + # "--n_runs_before_eval", + # str(watcher_conf.n_runs_before_first_eval), + # "--overtaci", + # meta_conf.overtaci, + # "--timeout", + # "None", + # "--clean_wandb_dir", + # watcher_conf.archive_all, + # ], + # ) train_models_for_each_cell_in_grid( - base_conf_file_name=train_conf.conf_name, - base_args=train_conf.base_args, - n_trials_per_cell_in_grid=train_conf.n_trials_per_cell_in_grid, - possible_look_distances=train_conf.possible_look_distance, + train_conf=train_conf, ) msg.good( f"Training finished. Stopping the watcher in {watcher_conf.keep_alive_after_training_minutes} minutes...", ) - time.sleep(60 * watcher_conf.keep_alive_after_training_minutes) - watcher.kill() + # time.sleep(60 * watcher_conf.keep_alive_after_training_minutes) + # watcher.kill() msg.good("Watcher stopped.") From 1dbd900efae2807b1d3525c6c018c29c52e543b0 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 20 Oct 2022 15:48:47 +0200 Subject: [PATCH 11/57] fix: misc. minor fixes for training --- src/psycopt2d/config/data/synth_data.yaml | 6 +- src/psycopt2d/config/data/t2d_parquet.yaml | 3 +- .../evaluate_saved_model_predictions.py | 4 +- src/psycopt2d/load.py | 90 +++++++++++-------- src/psycopt2d/train_model.py | 3 +- src/psycopt2d/utils.py | 11 ++- 6 files changed, 72 insertions(+), 45 deletions(-) diff --git a/src/psycopt2d/config/data/synth_data.yaml b/src/psycopt2d/config/data/synth_data.yaml index 316235ae..d99e3b32 100644 --- a/src/psycopt2d/config/data/synth_data.yaml +++ b/src/psycopt2d/config/data/synth_data.yaml @@ -2,6 +2,7 @@ data: n_training_samples: null min_lookahead_days: null + min_lookbehind_days: null min_prediction_time_date: null lookahead_days: 30 pred_col_name_prefix: "pred_" @@ -9,12 +10,11 @@ data: outcome_timestamp_col_name: timestamp_outcome id_col_name: citizen_ids source: synthetic - min_lookbehind_days: null drop_patient_if_outcome_before_date: null - lookbehind_combination: null + lookbehind_combination: [30, 90] # Parameters that will only take effect if running with --multirun hydra: sweeper: params: - ++data.lookbehind_combinations: choice([30, 90], [30]) + data.lookbehind_combination: choice([3000, 90], [30]) diff --git a/src/psycopt2d/config/data/t2d_parquet.yaml b/src/psycopt2d/config/data/t2d_parquet.yaml index cd59b629..3255b74d 100644 --- a/src/psycopt2d/config/data/t2d_parquet.yaml +++ b/src/psycopt2d/config/data/t2d_parquet.yaml @@ -19,9 +19,10 @@ data: # Looking behind min_prediction_time_date: 2013-01-01 min_lookbehind_days: 365 # (int): Drop all prediction times where (prediction_timestamp) - (min timestamp in the dataset) is less than min_lookbehind_days + lookbehind_combinations: choice([30, 90, 180, 365, 730], [30, 180, 730], [30, 730], [730]) # Parameters that will only take effect if running with --multirun hydra: sweeper: params: - ++data.lookbehind_combinations: choice([30, 90], [30]) + ++data.lookbehind_combinations: choice([3000], [30, 90]) diff --git a/src/psycopt2d/evaluate_saved_model_predictions.py b/src/psycopt2d/evaluate_saved_model_predictions.py index bfa96ff1..a665d34f 100644 --- a/src/psycopt2d/evaluate_saved_model_predictions.py +++ b/src/psycopt2d/evaluate_saved_model_predictions.py @@ -29,14 +29,14 @@ def infer_look_distance( col_name: Union[Iterable[str], str], regex_pattern: str = r"within_(\d+)_days", allow_multiple: bool = True, -) -> list[Union[int, float]]: +) -> list[str]: """Infer look distances from col names.""" # E.g. "outc_within_1_days" = 1 # E.g. "outc_within_2_days" = 2 # E.g. "pred_within_3_days" = 3 # E.g. "pred_within_3_days" = 3 - look_distances: list[Union[int, float]] = [] + look_distances: list[str] = [] if isinstance(col_name, Iterable) and not isinstance(col_name, str): for c_name in col_name: diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py index 65d4c34c..459685f8 100644 --- a/src/psycopt2d/load.py +++ b/src/psycopt2d/load.py @@ -2,6 +2,7 @@ import re from collections.abc import Iterable from datetime import datetime, timedelta +from multiprocessing.sharedctypes import Value from pathlib import Path from typing import Any, Optional, Union @@ -11,7 +12,13 @@ from pydantic import BaseModel, Field from wasabi import Printer -from psycopt2d.utils import PROJECT_ROOT, coerce_to_datetime +from psycopt2d.evaluate_saved_model_predictions import infer_look_distance +from psycopt2d.utils import ( + PROJECT_ROOT, + coerce_to_datetime, + get_percent_lost, + infer_predictor_col_name, +) msg = Printer(timestamp=True) @@ -168,7 +175,7 @@ def _drop_rows_if_datasets_ends_within_days( pd.DataFrame: Dataset with dropped rows. """ if not isinstance(n_days, timedelta): - n_days = timedelta(days=n_days) # type: ignore + n_days_timedelt: timedelta = timedelta(days=n_days) # type: ignore if direction not in ("ahead", "behind"): raise ValueError(f"Direction {direction} not supported.") @@ -176,23 +183,24 @@ def _drop_rows_if_datasets_ends_within_days( n_rows_before_modification = dataset.shape[0] if direction == "ahead": - max_datetime = dataset[self.spec.pred_time_colname].max() - n_days + max_datetime = dataset[self.spec.pred_time_colname].max() - n_days_timedelt before_max_dt = dataset[self.spec.pred_time_colname] < max_datetime dataset = dataset[before_max_dt] elif direction == "behind": - min_datetime = dataset[self.spec.pred_time_colname].min() + n_days + min_datetime = dataset[self.spec.pred_time_colname].min() + n_days_timedelt after_min_dt = dataset[self.spec.pred_time_colname] > min_datetime dataset = dataset[after_min_dt] n_rows_after_modification = dataset.shape[0] - percent_dropped = ( - n_rows_before_modification - n_rows_after_modification - ) / n_rows_before_modification - - msg.info( - f"Dropped {n_rows_before_modification - n_rows_after_modification} ({percent_dropped}%) rows because the end of the dataset was within {n_days} of their prediction time when looking {direction} from their prediction time", + percent_dropped = get_percent_lost( + n_before=n_rows_after_modification, n_after=n_rows_after_modification ) + if n_rows_before_modification - n_rows_after_modification != 0: + msg.info( + f"Dropped {n_rows_before_modification - n_rows_after_modification} ({percent_dropped}%) rows because the end of the dataset was within {n_days} of their prediction time when looking {direction} from their prediction time", + ) + return dataset def _drop_patients_with_event_in_washin(self, dataset) -> pd.DataFrame: @@ -216,14 +224,15 @@ def _drop_patients_with_event_in_washin(self, dataset) -> pd.DataFrame: ] n_rows_after_modification = dataset.shape[0] - percent_dropped = ( - n_rows_before_modification - n_rows_after_modification - ) / n_rows_before_modification - - msg.info( - f"Dropped {n_rows_before_modification - n_rows_after_modification} ({percent_dropped}%) rows because patients had diabetes in the washin period.", + percent_dropped = get_percent_lost( + n_before=n_rows_after_modification, n_after=n_rows_after_modification ) + if n_rows_before_modification - n_rows_after_modification != 0: + msg.info( + f"Dropped {n_rows_before_modification - n_rows_after_modification} ({percent_dropped}%) rows because patients had diabetes in the washin period.", + ) + return dataset def _drop_cols_not_in_lookbehind_combination( @@ -240,33 +249,43 @@ def _drop_cols_not_in_lookbehind_combination( pd.DataFrame: Dataset with dropped columns. """ + if not self.spec.time.lookbehind_combination: + raise ValueError("No lookbehind_combination provided.") + # Extract all unique lookbhehinds in the dataset predictors lookbehinds_in_dataset = { - int(re.findall(r"within_(\d+)_days", col)[0]) - for col in dataset.columns - if self.pred_col_name_prefix in col + int(infer_look_distance(col)[0]) + for col in infer_predictor_col_name(df=dataset) } + # Convert list to set + lookbehinds_in_spec = set(self.spec.time.lookbehind_combination) + # Check that all loobehinds in lookbehind_combination are used in the predictors - if not set(self.spec.time.lookbehind_combination).issubset( + if not lookbehinds_in_spec.issubset( lookbehinds_in_dataset, ): - raise ValueError( - f"One or more of the provided lookbehinds in lookbehind_combination is/are not used in any predictors in the dataset. Lookbehinds in dataset: {lookbehinds_in_dataset}. Lookbehinds in lookbehind_combination: {self.spec.time.lookbehind_combination}.", + msg.warn( + f"One or more of the provided lookbehinds in lookbehind_combination is/are not used in any predictors in the dataset. Dataset: {lookbehinds_in_dataset}. lookbehind_combination: {self.spec.time.lookbehind_combination}.", ) + lookbehinds_to_keep = lookbehinds_in_spec.intersection( + lookbehinds_in_dataset + ) + + if not lookbehinds_to_keep: + raise ValueError("No predictors left after dropping lookbehinds.") + + msg.warn(f"Training on {lookbehinds_to_keep}.") + # Create a list of all predictor columns who have a lookbehind window not in lookbehind_combination list cols_to_drop = [ - col - for col in dataset.columns - if any( - str(x) not in col and self.pred_col_name_prefix in col - for x in self.spec.time.lookbehind_combination - ) + c + for c in infer_predictor_col_name(df=dataset) + if any(str(l_beh) not in c for l_beh in lookbehinds_to_keep) ] dataset = dataset.drop(columns=cols_to_drop) - return dataset def _convert_timestamp_dtype_and_nat(self, dataset: pd.DataFrame) -> pd.DataFrame: @@ -330,14 +349,15 @@ def _drop_cols_if_exceeds_look_direction_threshold( cols_to_drop.append(col) n_cols_after_modification = dataset.shape[1] - percent_dropped = ( - n_cols_before_modification - n_cols_after_modification - ) / n_cols_before_modification - - msg.info( - f"Dropped {n_cols_before_modification - n_cols_after_modification} ({percent_dropped}%) columns because they were looking {direction} further out than {look_direction_threshold} days.", + percent_dropped = get_percent_lost( + n_before=n_cols_before_modification, n_after=n_cols_after_modification ) + if n_cols_before_modification - n_cols_after_modification != 0: + msg.info( + f"Dropped {n_cols_before_modification - n_cols_after_modification} ({percent_dropped}%) columns because they were looking {direction} further out than {look_direction_threshold} days.", + ) + return dataset[[c for c in dataset.columns if c not in cols_to_drop]] def _drop_cols_and_rows_if_look_direction_not_met( diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py index 2a79d729..80702815 100644 --- a/src/psycopt2d/train_model.py +++ b/src/psycopt2d/train_model.py @@ -21,13 +21,14 @@ from psycopt2d.load import load_train_and_val_from_cfg from psycopt2d.models import MODELS from psycopt2d.utils import ( + PROJECT_ROOT, create_wandb_folders, flatten_nested_dict, get_feature_importance_dict, prediction_df_with_metadata_to_disk, ) -CONFIG_PATH = Path(__file__).parent / "config" +CONFIG_PATH = PROJECT_ROOT / "src" / "psycopt2d" / "config" TRAINING_COL_NAME_PREFIX = "pred_" # Handle wandb not playing nice with joblib diff --git a/src/psycopt2d/utils.py b/src/psycopt2d/utils.py index eac545e0..976e56db 100644 --- a/src/psycopt2d/utils.py +++ b/src/psycopt2d/utils.py @@ -397,12 +397,12 @@ def infer_col_names( df: pd.DataFrame, prefix: str, allow_multiple: bool = True, -) -> Union[str, list[str]]: +) -> list[str]: """Infer col names based on prefix.""" col_name = [c for c in df.columns if c.startswith(prefix)] if len(col_name) == 1: - return col_name[0] + return [col_name[0]] elif len(col_name) > 1: if allow_multiple: return col_name @@ -417,7 +417,7 @@ def infer_outcome_col_name( df: pd.DataFrame, prefix: str = "outc_", allow_multiple: bool = True, -) -> Union[str, list[str]]: +) -> list[str]: """Infer the outcome column name from the dataframe.""" return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple) @@ -438,3 +438,8 @@ def infer_y_hat_prob_col_name( ) -> str: """Infer the y_hat_prob column name from the dataframe.""" return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple) + + +def get_percent_lost(n_before: Union[int, float], n_after: Union[int, float]) -> float: + """Get the percent lost.""" + return round((100 * (1 - n_after / n_before)), 2) From 3e9fe78aee745d1b142a8b7881c8b1bb40e29b7b Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 20 Oct 2022 15:49:07 +0200 Subject: [PATCH 12/57] style: linting --- application/train_and_log_models.py | 3 ++- src/psycopt2d/load.py | 12 +++++++----- src/psycopt2d/train_model.py | 1 - 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index 94a7e8ec..8c2abe20 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -86,7 +86,8 @@ class TrainConf(BaseModel): conf_name: str = Field(default="integration_testing.yaml") multirun: bool = Field( - default=False, description="Whether to use Hydra to run multiple models." + default=False, + description="Whether to use Hydra to run multiple models.", ) possible_look_distances: PossibleLookDistanceDays diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py index 459685f8..535220a7 100644 --- a/src/psycopt2d/load.py +++ b/src/psycopt2d/load.py @@ -2,7 +2,6 @@ import re from collections.abc import Iterable from datetime import datetime, timedelta -from multiprocessing.sharedctypes import Value from pathlib import Path from typing import Any, Optional, Union @@ -193,7 +192,8 @@ def _drop_rows_if_datasets_ends_within_days( n_rows_after_modification = dataset.shape[0] percent_dropped = get_percent_lost( - n_before=n_rows_after_modification, n_after=n_rows_after_modification + n_before=n_rows_after_modification, + n_after=n_rows_after_modification, ) if n_rows_before_modification - n_rows_after_modification != 0: @@ -225,7 +225,8 @@ def _drop_patients_with_event_in_washin(self, dataset) -> pd.DataFrame: n_rows_after_modification = dataset.shape[0] percent_dropped = get_percent_lost( - n_before=n_rows_after_modification, n_after=n_rows_after_modification + n_before=n_rows_after_modification, + n_after=n_rows_after_modification, ) if n_rows_before_modification - n_rows_after_modification != 0: @@ -270,7 +271,7 @@ def _drop_cols_not_in_lookbehind_combination( ) lookbehinds_to_keep = lookbehinds_in_spec.intersection( - lookbehinds_in_dataset + lookbehinds_in_dataset, ) if not lookbehinds_to_keep: @@ -350,7 +351,8 @@ def _drop_cols_if_exceeds_look_direction_threshold( n_cols_after_modification = dataset.shape[1] percent_dropped = get_percent_lost( - n_before=n_cols_before_modification, n_after=n_cols_after_modification + n_before=n_cols_before_modification, + n_after=n_cols_after_modification, ) if n_cols_before_modification - n_cols_after_modification != 0: diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py index 80702815..e9d78bbe 100644 --- a/src/psycopt2d/train_model.py +++ b/src/psycopt2d/train_model.py @@ -1,7 +1,6 @@ """Training script for training a single model for predicting t2d.""" import os from collections.abc import Iterable -from pathlib import Path from typing import Optional import hydra From cc2961c965b5ef6963e6d17200b8126f5ff034f4 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 21 Oct 2022 10:15:14 +0200 Subject: [PATCH 13/57] misc. --- application/train_and_log_models.py | 186 +++++++++++------- src/psycopt2d/config/data/synth_data.yaml | 4 +- src/psycopt2d/config/data/t2d_parquet.yaml | 14 +- src/psycopt2d/config/default_config.yaml | 1 + .../config/project/default_project.yaml | 3 +- .../config/project/overtaci_test_project.yaml | 3 +- .../config/training/default_training.yaml | 2 +- src/psycopt2d/evaluation.py | 11 +- src/psycopt2d/load.py | 24 ++- src/psycopt2d/model_training_watcher.py | 32 +-- src/psycopt2d/visualization/base_charts.py | 2 +- .../visualization/feature_importance.py | 2 +- tests/test_train_model.py | 10 +- 13 files changed, 176 insertions(+), 118 deletions(-) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index 8c2abe20..80ad6eb9 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -5,10 +5,12 @@ - Replace the HYDRA_ARGS string with the desired arguments for `train_model.py` - Run this script from project root with `python src/psycopt2d/train_and_log_models.py """ +import random import subprocess import time from pathlib import Path +from hydra import compose, initialize from pydantic import BaseModel, Field from wasabi import Printer @@ -18,6 +20,9 @@ infer_predictor_col_name, ) from psycopt2d.load import DataLoader, DatasetSpecification, DatasetTimeSpecification +from psycopt2d.utils import PROJECT_ROOT + +msg = Printer(timestamp=True) class PossibleLookDistanceDays(BaseModel): @@ -75,7 +80,7 @@ class TrainConf(BaseModel): n_trials_per_cell_in_grid: int = Field( default=50, - description="Number of trials per cell in the lookahead/lookbehind grid", + description="Number of trials per cell in the lookahead/lookbehind grid. If n > 1, automatically triggers multirun.", ) model_conf: str = Field( @@ -85,17 +90,13 @@ class TrainConf(BaseModel): conf_name: str = Field(default="integration_testing.yaml") - multirun: bool = Field( - default=False, - description="Whether to use Hydra to run multiple models.", - ) - possible_look_distances: PossibleLookDistanceDays -def load_data(dataset_spec): +def load_train_for_inference(dataset_spec): """Load the data.""" loader = DataLoader(dataset_spec) + msg.info("Loading datasets for look direction inference") return loader.load_dataset_from_dir(split_names="train") @@ -117,7 +118,7 @@ def infer_possible_look_directions(train): ) -def get_dataset_spec(data_dir_path: Path): +def get_dataset_spec(data_dir_path: Path, file_suffix: str): """Get dataset specification.""" time_spec = DatasetTimeSpecification( drop_patient_if_outcome_before_date=None, @@ -127,7 +128,7 @@ def get_dataset_spec(data_dir_path: Path): ) return DatasetSpecification( - file_suffix="csv", + file_suffix=file_suffix, time=time_spec, pred_col_name_prefix="pred_", pred_time_colname="timestamp", @@ -135,109 +136,162 @@ def get_dataset_spec(data_dir_path: Path): ) +class LookDirectionCombination(BaseModel): + """A combination of lookbehind and lookahead days.""" + + lookbehind: int + lookahead: int + + def train_models_for_each_cell_in_grid( train_conf: TrainConf, + wandb_conf: WandbConf, ): """Train a model for each cell in the grid of possible look distances.""" from random_word import RandomWords random_word = RandomWords() - for lookbehind in train_conf.possible_look_distances.behind: - for lookahead in train_conf.possible_look_distances.ahead: - wandb_group = f"{random_word.get_random_word()}-{random_word.get_random_word()}-beh-{lookbehind}-ahead-{lookahead}" + # Create all combinations of lookbehind and lookahead days + lookbehind_combinations = [ + LookDirectionCombination(lookbehind=lookbehind, lookahead=lookahead) + for lookbehind in train_conf.possible_look_distances.behind + for lookahead in train_conf.possible_look_distances.ahead + ] + + lookbehind_combinations = [ + comb for comb in lookbehind_combinations if comb.lookahead <= 1095 + ] + + random.shuffle(lookbehind_combinations) + + active_trainers: list[subprocess.Popen] = [] + + wandb_prefix = f"{random_word.get_random_word()}-{random_word.get_random_word()}" + + while lookbehind_combinations: + # Loop to run if enough trainers have been spawned + if len(active_trainers) >= 4: + active_trainers = [t for t in active_trainers if t.poll() is None] + time.sleep(1) + continue - subprocess_args: list[str] = [ - "python", - "src/psycopt2d/train_model.py", - f"+model={train_conf.model_conf}", - f"data.min_lookbehind_days={lookbehind}", - f"data.min_lookahead_days={lookahead}", - f"project.wandb_group='{wandb_group}'", - f"hydra.sweeper.n_trials={train_conf.n_trials_per_cell_in_grid}", - "--config-name", - f"{meta_conf.conf_name}", - ] + cell = lookbehind_combinations.pop() + msg.info( + f"Spawning a new trainer with lookbehind={cell.lookbehind} and lookahead={cell.lookahead}" + ) - if train_conf.multirun: - subprocess_args.insert(2, "--multirun") + wandb_group = f"{wandb_prefix}-beh-{cell.lookbehind}-ahead-{cell.lookahead}" - if train_conf.model_conf == "xgboost" and not train_conf.gpu: - subprocess_args.insert(3, "++model.args.tree_method='auto'") + subprocess_args: list[str] = [ + "python", + "src/psycopt2d/train_model.py", + f"model={train_conf.model_conf}", + f"data.min_lookbehind_days={cell.lookbehind}", + f"data.min_lookahead_days={cell.lookahead}", + f"project.wandb_group='{wandb_group}'", + f"hydra.sweeper.n_trials={train_conf.n_trials_per_cell_in_grid}", + f"project.wandb_mode={wandb_conf.mode}", + "--config-name", + f"{meta_conf.conf_name}", + ] - msg.info("Starting trainer with command") - msg.info(f'{" ".join(subprocess_args)}') + if train_conf.n_trials_per_cell_in_grid > 1: + subprocess_args.insert(2, "--multirun") - trainer = subprocess.Popen( # pylint: disable=consider-using-with + if train_conf.model_conf == "xgboost" and not train_conf.gpu: + subprocess_args.insert(3, "++model.args.tree_method='auto'") + + msg.info(f'{" ".join(subprocess_args)}') + + active_trainers.append( + subprocess.Popen( # pylint: disable=consider-using-with args=subprocess_args, ) - - while trainer.poll() is None: - time.sleep(1) + ) if __name__ == "__main__": msg = Printer(timestamp=True) + CONFIG_FILE_NAME = "default_config.yaml" + + with initialize(version_base=None, config_path="../src/psycopt2d/config/"): + cfg = compose( + config_name=CONFIG_FILE_NAME, + ) + meta_conf = MetaConf( - conf_name="integration_testing.yaml", + conf_name=CONFIG_FILE_NAME, overtaci="false", - data_dir=Path( - "/Users/au484925/Desktop/psycop-t2d/tests/test_data/synth_splits/", - ), + data_dir=cfg.data.dir, ) wandb_conf = WandbConf( entity="psycop", project_name="psycopt2d-testing", - mode="offline", + mode=cfg.project.wandb_mode, ) - watcher_conf = WatcherConf(archive_all="true", keep_alive_after_training_minutes=5) - dataset_spec = get_dataset_spec(data_dir_path=meta_conf.data_dir) - train = load_data(dataset_spec=dataset_spec) + watcher_conf = WatcherConf(archive_all="false", keep_alive_after_training_minutes=5) + + watcher = subprocess.Popen( # pylint: disable=consider-using-with + [ + "python", + "src/psycopt2d/model_training_watcher.py", + "--entity", + wandb_conf.entity, + "--project_name", + wandb_conf.project_name, + "--n_runs_before_eval", + str(watcher_conf.n_runs_before_first_eval), + "--overtaci", + meta_conf.overtaci, + "--timeout", + "None", + "--clean_wandb_dir", + watcher_conf.archive_all, + ], + ) + + dataset_spec = get_dataset_spec( + data_dir_path=meta_conf.data_dir, file_suffix=cfg.data.suffix + ) + train = load_train_for_inference(dataset_spec=dataset_spec) possible_look_distances = infer_possible_look_directions(train) + # Remove "9999" from possible look distances behind + possible_look_distances.behind = [ + dist for dist in possible_look_distances.behind if dist != "9999" + ] + + msg.info(f"Possible lookbehind days: {possible_look_distances.behind}") + msg.info(f"Possible lookahead days: {possible_look_distances.ahead}") + train_conf = TrainConf( conf_name=meta_conf.conf_name, - multirun=False, model_conf="xgboost", n_trials_per_cell_in_grid=1, possible_look_distances=possible_look_distances, - gpu=False, + gpu=True, ) if not train_conf.gpu: msg.warn("Not using GPU for training") - # watcher = subprocess.Popen( # pylint: disable=consider-using-with - # [ - # "python", - # "src/psycopt2d/model_training_watcher.py", - # "--entity", - # wandb_conf.entity, - # "--project_name", - # wandb_conf.project_name, - # "--n_runs_before_eval", - # str(watcher_conf.n_runs_before_first_eval), - # "--overtaci", - # meta_conf.overtaci, - # "--timeout", - # "None", - # "--clean_wandb_dir", - # watcher_conf.archive_all, - # ], - # ) - - train_models_for_each_cell_in_grid( - train_conf=train_conf, + clean_dir_seconds = 0 + msg.info( + f"Sleeping for {clean_dir_seconds} seconds to allow watcher to start and clean dir" ) + time.sleep(clean_dir_seconds) + + train_models_for_each_cell_in_grid(train_conf=train_conf, wandb_conf=wandb_conf) msg.good( f"Training finished. Stopping the watcher in {watcher_conf.keep_alive_after_training_minutes} minutes...", ) - # time.sleep(60 * watcher_conf.keep_alive_after_training_minutes) - # watcher.kill() + time.sleep(60 * watcher_conf.keep_alive_after_training_minutes) + watcher.kill() msg.good("Watcher stopped.") diff --git a/src/psycopt2d/config/data/synth_data.yaml b/src/psycopt2d/config/data/synth_data.yaml index d99e3b32..8089f440 100644 --- a/src/psycopt2d/config/data/synth_data.yaml +++ b/src/psycopt2d/config/data/synth_data.yaml @@ -5,11 +5,11 @@ data: min_lookbehind_days: null min_prediction_time_date: null lookahead_days: 30 - pred_col_name_prefix: "pred_" + pred_col_name_prefix: pred_ pred_timestamp_col_name: timestamp outcome_timestamp_col_name: timestamp_outcome id_col_name: citizen_ids - source: synthetic + suffix: synthetic drop_patient_if_outcome_before_date: null lookbehind_combination: [30, 90] diff --git a/src/psycopt2d/config/data/t2d_parquet.yaml b/src/psycopt2d/config/data/t2d_parquet.yaml index 3255b74d..611b3d02 100644 --- a/src/psycopt2d/config/data/t2d_parquet.yaml +++ b/src/psycopt2d/config/data/t2d_parquet.yaml @@ -2,8 +2,8 @@ data: # General config n_training_samples: null # (int, null): Number of training samples to use, defaults to null in which cases it uses all samples. - dir: E:\shared_resources\feature_sets\t2d\feature_sets\psycop_t2d_adminmanber_201_features_2022_10_05_15_14 - source: parquet # Where to load data from. Takes "sql" or "synthetic" + dir: E:\shared_resources\feature_sets\t2d\feature_sets\psycop_t2d_adminmanber_416_features_2022_10_20_11_12 + suffix: parquet # File suffix to load. # Feature specs pred_col_name_prefix: "pred_" # (str): prefix of predictor columns @@ -12,17 +12,17 @@ data: id_col_name: dw_ek_borger # (str): Citizen colnames # Looking ahead - lookahead_days: 1825 # (float): Number of days from prediction time to look ahead for the outcome. - min_lookahead_days: 1825 # (int): Drop all prediction times where (max timestamp in the dataset) - (current timestamp) is less than min_lookahead_days - drop_patient_if_outcome_before_date: 2013-01-01 + lookahead_days: 365 # (float): Number of days from prediction time to look ahead for the outcome. + min_lookahead_days: 365 # (int): Drop all prediction times where (max timestamp in the dataset) - (current timestamp) is less than min_lookahead_days + drop_patient_if_outcome_before_date: null # Looking behind min_prediction_time_date: 2013-01-01 min_lookbehind_days: 365 # (int): Drop all prediction times where (prediction_timestamp) - (min timestamp in the dataset) is less than min_lookbehind_days - lookbehind_combinations: choice([30, 90, 180, 365, 730], [30, 180, 730], [30, 730], [730]) + lookbehind_combination: [30, 90, 180, 365] # Parameters that will only take effect if running with --multirun hydra: sweeper: params: - ++data.lookbehind_combinations: choice([3000], [30, 90]) + ++data.lookbehind_combination: choice([3000], [30, 90]) diff --git a/src/psycopt2d/config/default_config.yaml b/src/psycopt2d/config/default_config.yaml index 2d081f25..e67a67ec 100644 --- a/src/psycopt2d/config/default_config.yaml +++ b/src/psycopt2d/config/default_config.yaml @@ -3,6 +3,7 @@ defaults: - project: overtaci_test_project - data: t2d_parquet - preprocessing: default_preprocessing + - model: xgboost - training: default_training - evaluation: default_evaluation - sweeper: optuna_multithread diff --git a/src/psycopt2d/config/project/default_project.yaml b/src/psycopt2d/config/project/default_project.yaml index f3684005..404397fa 100644 --- a/src/psycopt2d/config/project/default_project.yaml +++ b/src/psycopt2d/config/project/default_project.yaml @@ -1,3 +1,4 @@ name: psycop-t2d seed: 42 -wandb_mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" \ No newline at end of file +wandb_mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" +wandb_group: "psycop-t2d" # Which group to run WanDB in. \ No newline at end of file diff --git a/src/psycopt2d/config/project/overtaci_test_project.yaml b/src/psycopt2d/config/project/overtaci_test_project.yaml index 22dedaeb..ae8e6c6c 100644 --- a/src/psycopt2d/config/project/overtaci_test_project.yaml +++ b/src/psycopt2d/config/project/overtaci_test_project.yaml @@ -1,3 +1,4 @@ name: psycop-t2d-testing seed: 42 -wandb_mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" \ No newline at end of file +wandb_mode: "offline" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" +wandb_group: "psycop-t2d" \ No newline at end of file diff --git a/src/psycopt2d/config/training/default_training.yaml b/src/psycopt2d/config/training/default_training.yaml index 932506fc..56014ceb 100644 --- a/src/psycopt2d/config/training/default_training.yaml +++ b/src/psycopt2d/config/training/default_training.yaml @@ -1 +1 @@ -n_splits: 3 # (int, Null): Number of k-folds during CV. If Null, loads pre-defined dataset. +n_splits: 2 # (int, Null): Number of k-folds during CV. If Null, loads pre-defined dataset. diff --git a/src/psycopt2d/evaluation.py b/src/psycopt2d/evaluation.py index 12f37a35..ff4a7235 100644 --- a/src/psycopt2d/evaluation.py +++ b/src/psycopt2d/evaluation.py @@ -61,14 +61,6 @@ def evaluate_model( feature_importance_dict: Optional[dict[str, float]], ) -> None: """Runs the evaluation suite on the model and logs to WandB. - At present, this includes: - 1. AUC - 2. Table of performance by pred_proba threshold - 3. Feature importance - 4. Sensitivity by time to outcome - 5. AUC by calendar time - 6. AUC by time from first visit - 7. F1 by time until diagnosis Args: cfg (OmegaConf): The hydra config from the run @@ -84,7 +76,8 @@ def evaluate_model( msg.info("Starting model evaluation") SAVE_DIR = PROJECT_ROOT / ".tmp" # pylint: disable=invalid-name - # When parallelising tests, this causes issues since multiple processes + # When running tests in parallel with pytest-xdist, + # this causes issues since multiple processes # override the same dir at once. # Can be solved by allowing config to override this # and using tmp_dir in pytest. Not worth refactoring diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py index 535220a7..1f6ddf11 100644 --- a/src/psycopt2d/load.py +++ b/src/psycopt2d/load.py @@ -192,7 +192,7 @@ def _drop_rows_if_datasets_ends_within_days( n_rows_after_modification = dataset.shape[0] percent_dropped = get_percent_lost( - n_before=n_rows_after_modification, + n_before=n_rows_before_modification, n_after=n_rows_after_modification, ) @@ -210,7 +210,7 @@ def _drop_patients_with_event_in_washin(self, dataset) -> pd.DataFrame: # Remove dates before drop_patient_if_outcome_before_date outcome_before_date = ( - dataset["timestamp_first_diabetes_any"] + dataset["_timestamp_first_t2d"] < self.spec.time.drop_patient_if_outcome_before_date ) @@ -257,6 +257,7 @@ def _drop_cols_not_in_lookbehind_combination( lookbehinds_in_dataset = { int(infer_look_distance(col)[0]) for col in infer_predictor_col_name(df=dataset) + if len(infer_look_distance(col)) > 0 } # Convert list to set @@ -278,14 +279,19 @@ def _drop_cols_not_in_lookbehind_combination( raise ValueError("No predictors left after dropping lookbehinds.") msg.warn(f"Training on {lookbehinds_to_keep}.") + else: + lookbehinds_to_keep = lookbehinds_in_spec # Create a list of all predictor columns who have a lookbehind window not in lookbehind_combination list cols_to_drop = [ c for c in infer_predictor_col_name(df=dataset) - if any(str(l_beh) not in c for l_beh in lookbehinds_to_keep) + if all(str(l_beh) not in c for l_beh in lookbehinds_to_keep) ] + cols_to_drop = [c for c in cols_to_drop if "within" in c] + # TODO: Add some specification of within_x_days indicating how to parse columns to find lookbehinds. Or, alternatively, use the column spec. + dataset = dataset.drop(columns=cols_to_drop) return dataset @@ -331,9 +337,7 @@ def _drop_cols_if_exceeds_look_direction_threshold( n_cols_before_modification = dataset.shape[1] if direction == "behind": - cols_to_process = [ - c for c in dataset.columns if self.pred_col_name_prefix in c - ] + cols_to_process = infer_predictor_col_name(df=dataset) for col in cols_to_process: # Extract lookbehind days from column name use regex @@ -344,7 +348,8 @@ def _drop_cols_if_exceeds_look_direction_threshold( if len(lookbehind_days_strs) > 0: lookbehind_days = int(lookbehind_days_strs[0]) else: - raise ValueError(f"Could not extract lookbehind days from {col}") + msg.warn(f"Could not extract lookbehind days from {col}") + continue if lookbehind_days > look_direction_threshold: cols_to_drop.append(col) @@ -447,6 +452,7 @@ def load_dataset_from_dir( Returns: pd.DataFrame: The filtered dataset """ + msg.info(f"Loading {split_names}") # Handle input types for timedelta_arg in ( self.spec.time.min_lookbehind_days, @@ -499,12 +505,12 @@ def _init_spec_from_cfg( resolve=True, ) - if data_cfg["source"] == "synthetic": + if data_cfg["suffix"] == "synthetic": split_dir_path = PROJECT_ROOT / "tests" / "test_data" / "synth_splits" file_suffix = "csv" else: split_dir_path = data_cfg["dir"] - file_suffix = data_cfg["source"] + file_suffix = data_cfg["suffix"] time_spec = DatasetTimeSpecification( drop_patient_if_outcome_before_date=data_cfg[ diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py index 5ac58127..3305122c 100644 --- a/src/psycopt2d/model_training_watcher.py +++ b/src/psycopt2d/model_training_watcher.py @@ -12,13 +12,9 @@ from wasabi import msg from psycopt2d.evaluation import evaluate_model -from psycopt2d.utils import ( - MODEL_PREDICTIONS_PATH, - PROJECT_ROOT, - infer_outcome_col_name, - infer_y_hat_prob_col_name, - load_evaluation_data, -) +from psycopt2d.utils import (MODEL_PREDICTIONS_PATH, PROJECT_ROOT, + infer_outcome_col_name, infer_y_hat_prob_col_name, + load_evaluation_data) # Path to the wandb directory WANDB_DIR = PROJECT_ROOT / "wandb" @@ -52,11 +48,11 @@ def __init__( self.n_runs_before_eval = n_runs_before_eval # A queue for runs waiting to be uploaded to WandB - self.run_id_upload_queue = [] + self.run_id_eval_candidates_queue = [] self.max_performance = 0 self.archive_path = WANDB_DIR / "archive" - self.archive_path.mkdir(exist_ok=True) + self.archive_path.mkdir(exist_ok=True, parents=True) def watch(self, timeout_minutes: Optional[int] = None) -> None: """Watch the wandb directory for new runs. @@ -70,12 +66,12 @@ def watch(self, timeout_minutes: Optional[int] = None) -> None: timeout_minutes is None or start_time + timeout_minutes * 60 > time.time() ): self.get_new_runs_and_evaluate() - time.sleep(10) + time.sleep(1) def get_new_runs_and_evaluate(self) -> None: """Get new runs and evaluate the best runs.""" self.upload_unarchived_runs() - if len(self.run_id_upload_queue) >= self.n_runs_before_eval: + if len(self.run_id_eval_candidates_queue) >= self.n_runs_before_eval: self.evaluate_best_runs() def _upload_run_dir(self, run_dir: Path) -> None: @@ -87,7 +83,7 @@ def _upload_run_dir(self, run_dir: Path) -> None: def _archive_run_dir(self, run_dir: Path) -> None: """Move a run to the archive folder.""" - run_dir.rename(self.archive_path / run_dir.name) + run_dir.rename(target=self.archive_path / run_dir.name) def _get_run_id(self, run_dir: Path) -> str: """Get the run id from a run directory.""" @@ -96,11 +92,17 @@ def _get_run_id(self, run_dir: Path) -> str: def upload_unarchived_runs(self) -> None: """Upload unarchived runs to wandb.""" for run_folder in WANDB_DIR.glob(r"offline-run*"): + # TODO: We need some kind of test here to figure out if the run is + # still running or not. If it is still running, we should wait + # until it is finished. Otherwise, we get a "permission denied" error. run_id = self._get_run_id(run_folder) self._upload_run_dir(run_folder) + + # TODO: If upload_run_dir fails, we should not archive the run. + # use return from subprocess.run to check if it failed. See docs: https://docs.python.org/3/library/subprocess.html self._archive_run_dir(run_folder) - self.run_id_upload_queue.append(run_id) + self.run_id_eval_candidates_queue.append(run_id) def _get_run_evaluation_dir(self, run_id: str) -> Path: """Get the evaluation path for a single run.""" @@ -151,7 +153,7 @@ def evaluate_best_runs(self) -> None: """Evaluate the best runs.""" run_performances = { run_id: self._get_run_performance(run_id) - for run_id in self.run_id_upload_queue + for run_id in self.run_id_eval_candidates_queue } # sort runs by performance to not upload subpar runs run_performances = dict( @@ -168,7 +170,7 @@ def evaluate_best_runs(self) -> None: self.max_performance = performance self._do_evaluation(run_id) # reset run id queue and try to upload unfinished runs next time - self.run_id_upload_queue = unfinished_runs + self.run_id_eval_candidates_queue = unfinished_runs def archive_all_runs(self) -> None: """Archive all runs in the wandb directory.""" diff --git a/src/psycopt2d/visualization/base_charts.py b/src/psycopt2d/visualization/base_charts.py index 8f1188e4..bbad7fe6 100644 --- a/src/psycopt2d/visualization/base_charts.py +++ b/src/psycopt2d/visualization/base_charts.py @@ -12,7 +12,7 @@ def plot_basic_chart( y_values: Iterable, x_title: str, y_title: str, - plot_type: Optional[Union[list[str], str]], + plot_type: Union[list[str], str], sort_x: Optional[Iterable[int]] = None, sort_y: Optional[Iterable[int]] = None, fig_size: Optional[tuple] = (10, 10), diff --git a/src/psycopt2d/visualization/feature_importance.py b/src/psycopt2d/visualization/feature_importance.py index 59577105..1b7d9084 100644 --- a/src/psycopt2d/visualization/feature_importance.py +++ b/src/psycopt2d/visualization/feature_importance.py @@ -47,7 +47,7 @@ def plot_feature_importances( y_values=df["feature_importances"].tolist(), x_title="Feature importance (gain)", y_title="Feature name", - sort_x=np.flip(np.arange(len(feature_importances))), + sort_x=np.flip(np.arange(len(df["feature_importances"]))), plot_type="hbar", fig_size=(16, 10), save_path=save_path, diff --git a/tests/test_train_model.py b/tests/test_train_model.py index 0339f72d..4538dc14 100644 --- a/tests/test_train_model.py +++ b/tests/test_train_model.py @@ -7,7 +7,7 @@ from psycopt2d.train_model import main CONFIG_DIR_PATH = "../src/psycopt2d/config/" -CONFIG_FILE_NAME = "integration_testing.yaml" +INTEGRATION_TEST_FILE_NAME = "integration_testing.yaml" INTEGRATION_TESTING_MODEL_OVERRIDE = "+model=logistic-regression" @@ -17,7 +17,7 @@ def test_main(model_name): with initialize(version_base=None, config_path=CONFIG_DIR_PATH): cfg = compose( - config_name=CONFIG_FILE_NAME, + config_name=INTEGRATION_TEST_FILE_NAME, overrides=[f"+model={model_name}"], ) @@ -38,7 +38,7 @@ def test_integration_test(): with initialize(version_base=None, config_path=CONFIG_DIR_PATH): cfg = compose( - config_name=CONFIG_FILE_NAME, + config_name=INTEGRATION_TEST_FILE_NAME, overrides=[INTEGRATION_TESTING_MODEL_OVERRIDE], ) main(cfg) @@ -48,7 +48,7 @@ def test_crossvalidation(): """Test crossvalidation.""" with initialize(version_base=None, config_path=CONFIG_DIR_PATH): cfg = compose( - config_name=CONFIG_FILE_NAME, + config_name=INTEGRATION_TEST_FILE_NAME, overrides=[INTEGRATION_TESTING_MODEL_OVERRIDE, "+data.n_splits=2"], ) main(cfg) @@ -58,7 +58,7 @@ def test_min_prediction_time_date(): """Test crossvalidation.""" with initialize(version_base=None, config_path=CONFIG_DIR_PATH): cfg = compose( - config_name=CONFIG_FILE_NAME, + config_name=INTEGRATION_TEST_FILE_NAME, overrides=[ INTEGRATION_TESTING_MODEL_OVERRIDE, "+data.min_prediction_time_date=1972-01-01", From a64ebe8fe30ae651962a63e8b4e12b4285a1bb17 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 21 Oct 2022 10:33:18 +0200 Subject: [PATCH 14/57] Begin refactoring --- src/psycopt2d/dataclasses/configs.py | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 src/psycopt2d/dataclasses/configs.py diff --git a/src/psycopt2d/dataclasses/configs.py b/src/psycopt2d/dataclasses/configs.py deleted file mode 100644 index e809d072..00000000 --- a/src/psycopt2d/dataclasses/configs.py +++ /dev/null @@ -1,19 +0,0 @@ -"""Dataclasses used in the project.""" -from typing import Optional - -import pandas as pd -from omegaconf import DictConfig -from pydantic import BaseModel - -# pylint: disable=missing-class-docstring, too-few-public-methods - - -class ModelEvalData(BaseModel): - """Dataclass for model evaluation data.""" - - class Config: - arbitrary_types_allowed = True - - df: pd.DataFrame - cfg: DictConfig - feature_importance_dict: Optional[dict[str, float]] = None From 917b043a96eca4467276756d8f31cb7ea80989cd Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 21 Oct 2022 10:33:31 +0200 Subject: [PATCH 15/57] Begin refactoring --- src/psycopt2d/configs.py | 19 +++++++++++ .../utils/omegaconf_to_pydantic_objects.py | 32 +++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 src/psycopt2d/configs.py create mode 100644 src/psycopt2d/utils/omegaconf_to_pydantic_objects.py diff --git a/src/psycopt2d/configs.py b/src/psycopt2d/configs.py new file mode 100644 index 00000000..e809d072 --- /dev/null +++ b/src/psycopt2d/configs.py @@ -0,0 +1,19 @@ +"""Dataclasses used in the project.""" +from typing import Optional + +import pandas as pd +from omegaconf import DictConfig +from pydantic import BaseModel + +# pylint: disable=missing-class-docstring, too-few-public-methods + + +class ModelEvalData(BaseModel): + """Dataclass for model evaluation data.""" + + class Config: + arbitrary_types_allowed = True + + df: pd.DataFrame + cfg: DictConfig + feature_importance_dict: Optional[dict[str, float]] = None diff --git a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py new file mode 100644 index 00000000..bd476e95 --- /dev/null +++ b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py @@ -0,0 +1,32 @@ +"""Utilities for converting config yamls to pydantic objects. + +Helpful because it makes them: +- Addressable with intellisense, +- Refactorable with IDEs, +- Easier to document with docstrings and +- Type checkable +""" + +import pydantic +from hydra import compose, initialize +from omegaconf import DictConfig + + +def omegaconf_to_pydantic_cfg(cfg: DictConfig) -> pydantic.BaseModel: + """Convert OmegaConf to pydantic config.""" + return pydantic.parse_obj_as(pydantic.BaseModel, cfg) + + +def main(): + with initialize(version_base=None, config_path="../src/psycopt2d/config/"): + cfg = compose( + config_name="defualt_config.yaml", + ) + + pydantic_obj = omegaconf_to_pydantic_cfg(cfg) + + pass + + +if __name__ == "__main__": + main() From 58bd9fdab475ccb3d12a4d816a79bcac7197a9cb Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 21 Oct 2022 11:44:15 +0200 Subject: [PATCH 16/57] refactor: convert omegaconf to pydantic objs --- application/train_and_log_models.py | 2 +- reports/render_report.py | 2 +- src/psycopt2d/config/data/synth_data.yaml | 3 +- src/psycopt2d/config/default_config.yaml | 4 +- .../default_evaluation.yaml | 0 .../evaluation_synth.yaml | 0 src/psycopt2d/config/integration_testing.yaml | 5 +- src/psycopt2d/config/overtaci_testing.yaml | 8 -- src/psycopt2d/config/sweep_xgboost.yaml | 8 -- .../{training => train}/default_training.yaml | 0 .../evaluate_saved_model_predictions.py | 2 +- src/psycopt2d/evaluation.py | 2 +- src/psycopt2d/load.py | 80 ++++++------ src/psycopt2d/model_training_watcher.py | 10 +- src/psycopt2d/train_model.py | 17 ++- src/psycopt2d/utils/__init__.py | 0 .../utils/omegaconf_to_pydantic_objects.py | 121 ++++++++++++++++-- src/psycopt2d/{ => utils}/utils.py | 8 +- .../visualization/performance_over_time.py | 2 +- src/psycopt2d/visualization/prob_over_time.py | 2 +- src/psycopt2d/visualization/sens_over_time.py | 6 +- tests/test_auc_by_group_table.py | 2 +- tests/test_calculate_performance_metrics.py | 2 +- tests/test_load.py | 15 ++- tests/test_performance_by_threshold.py | 2 +- tests/test_train_model.py | 9 +- tests/test_utils.py | 28 +++- tests/test_visualizations.py | 2 +- 28 files changed, 237 insertions(+), 105 deletions(-) rename src/psycopt2d/config/{evaluation => eval}/default_evaluation.yaml (100%) rename src/psycopt2d/config/{evaluation => eval}/evaluation_synth.yaml (100%) delete mode 100644 src/psycopt2d/config/overtaci_testing.yaml delete mode 100644 src/psycopt2d/config/sweep_xgboost.yaml rename src/psycopt2d/config/{training => train}/default_training.yaml (100%) create mode 100644 src/psycopt2d/utils/__init__.py rename src/psycopt2d/{ => utils}/utils.py (98%) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index 80ad6eb9..adc8bcab 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -20,7 +20,7 @@ infer_predictor_col_name, ) from psycopt2d.load import DataLoader, DatasetSpecification, DatasetTimeSpecification -from psycopt2d.utils import PROJECT_ROOT +from psycopt2d.utils.utils import PROJECT_ROOT msg = Printer(timestamp=True) diff --git a/reports/render_report.py b/reports/render_report.py index e62d906f..0204127e 100644 --- a/reports/render_report.py +++ b/reports/render_report.py @@ -9,7 +9,7 @@ import pandas as pd -from psycopt2d.utils import PROJECT_ROOT +from psycopt2d.utils.utils import PROJECT_ROOT # import pandoc # See comment in pyproject.toml on Pandoc, not currently in use. Should work now, see: https://github.com/boisgera/pandoc/pull/49#issuecomment-1265983279 diff --git a/src/psycopt2d/config/data/synth_data.yaml b/src/psycopt2d/config/data/synth_data.yaml index 8089f440..f94b5da9 100644 --- a/src/psycopt2d/config/data/synth_data.yaml +++ b/src/psycopt2d/config/data/synth_data.yaml @@ -9,7 +9,8 @@ data: pred_timestamp_col_name: timestamp outcome_timestamp_col_name: timestamp_outcome id_col_name: citizen_ids - suffix: synthetic + dir: "../psycop-t2d/tests/test_data/synth_splits/" + suffix: csv drop_patient_if_outcome_before_date: null lookbehind_combination: [30, 90] diff --git a/src/psycopt2d/config/default_config.yaml b/src/psycopt2d/config/default_config.yaml index e67a67ec..5590c26f 100644 --- a/src/psycopt2d/config/default_config.yaml +++ b/src/psycopt2d/config/default_config.yaml @@ -4,6 +4,6 @@ defaults: - data: t2d_parquet - preprocessing: default_preprocessing - model: xgboost - - training: default_training - - evaluation: default_evaluation + - train: default_training + - eval: default_evaluation - sweeper: optuna_multithread diff --git a/src/psycopt2d/config/evaluation/default_evaluation.yaml b/src/psycopt2d/config/eval/default_evaluation.yaml similarity index 100% rename from src/psycopt2d/config/evaluation/default_evaluation.yaml rename to src/psycopt2d/config/eval/default_evaluation.yaml diff --git a/src/psycopt2d/config/evaluation/evaluation_synth.yaml b/src/psycopt2d/config/eval/evaluation_synth.yaml similarity index 100% rename from src/psycopt2d/config/evaluation/evaluation_synth.yaml rename to src/psycopt2d/config/eval/evaluation_synth.yaml diff --git a/src/psycopt2d/config/integration_testing.yaml b/src/psycopt2d/config/integration_testing.yaml index 5a3c6d52..6b860e1b 100644 --- a/src/psycopt2d/config/integration_testing.yaml +++ b/src/psycopt2d/config/integration_testing.yaml @@ -3,6 +3,7 @@ defaults: - project: integration_test_project - data: synth_data - preprocessing: default_preprocessing - - training: default_training - - evaluation: evaluation_synth + - train: default_training + - model: xgboost + - eval: evaluation_synth - sweeper: optuna_singlethread diff --git a/src/psycopt2d/config/overtaci_testing.yaml b/src/psycopt2d/config/overtaci_testing.yaml deleted file mode 100644 index 39aca780..00000000 --- a/src/psycopt2d/config/overtaci_testing.yaml +++ /dev/null @@ -1,8 +0,0 @@ -# @package _global_ -defaults: - - project: overtaci_test_project - - data: t2d_parquet - - preprocessing: default_preprocessing - - training: default_training - - evaluation: default_evaluation - - sweeper: optuna_singlethread diff --git a/src/psycopt2d/config/sweep_xgboost.yaml b/src/psycopt2d/config/sweep_xgboost.yaml deleted file mode 100644 index 0295eb9a..00000000 --- a/src/psycopt2d/config/sweep_xgboost.yaml +++ /dev/null @@ -1,8 +0,0 @@ -# @package _global_ -defaults: - - project: default_project - - data: all_csv - - preprocessing: default_preprocessing - - training: default_training - - evaluation: default_evaluation - - sweeper: optuna_singlethread \ No newline at end of file diff --git a/src/psycopt2d/config/training/default_training.yaml b/src/psycopt2d/config/train/default_training.yaml similarity index 100% rename from src/psycopt2d/config/training/default_training.yaml rename to src/psycopt2d/config/train/default_training.yaml diff --git a/src/psycopt2d/evaluate_saved_model_predictions.py b/src/psycopt2d/evaluate_saved_model_predictions.py index a665d34f..2b312724 100644 --- a/src/psycopt2d/evaluate_saved_model_predictions.py +++ b/src/psycopt2d/evaluate_saved_model_predictions.py @@ -14,7 +14,7 @@ import pandas as pd from omegaconf import DictConfig -from psycopt2d.utils import ( +from psycopt2d.utils.utils import ( PROJECT_ROOT, infer_outcome_col_name, infer_predictor_col_name, diff --git a/src/psycopt2d/evaluation.py b/src/psycopt2d/evaluation.py index ff4a7235..cf345975 100644 --- a/src/psycopt2d/evaluation.py +++ b/src/psycopt2d/evaluation.py @@ -14,7 +14,7 @@ from psycopt2d.tables.performance_by_threshold import ( generate_performance_by_positive_rate_table, ) -from psycopt2d.utils import PROJECT_ROOT, positive_rate_to_pred_probs +from psycopt2d.utils.utils import PROJECT_ROOT, positive_rate_to_pred_probs from psycopt2d.visualization import ( plot_auc_by_time_from_first_visit, plot_feature_importances, diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py index 1f6ddf11..48a0b7c1 100644 --- a/src/psycopt2d/load.py +++ b/src/psycopt2d/load.py @@ -3,6 +3,7 @@ from collections.abc import Iterable from datetime import datetime, timedelta from pathlib import Path +from queue import Full from typing import Any, Optional, Union import pandas as pd @@ -12,7 +13,8 @@ from wasabi import Printer from psycopt2d.evaluate_saved_model_predictions import infer_look_distance -from psycopt2d.utils import ( +from psycopt2d.utils.omegaconf_to_pydantic_objects import FullConfig +from psycopt2d.utils.utils import ( PROJECT_ROOT, coerce_to_datetime, get_percent_lost, @@ -111,16 +113,16 @@ class DataLoader: def __init__( self, - spec: DatasetSpecification, + cfg: FullConfig, ): - self.spec = spec + self.cfg = cfg # File handling - self.dir_path = Path(spec.split_dir_path) - self.file_suffix = spec.file_suffix + self.dir_path = Path(cfg.data.dir) + self.file_suffix = cfg.data.suffix # Column specifications - self.pred_col_name_prefix = spec.pred_col_name_prefix + self.pred_col_name_prefix = cfg.data.pred_col_name_prefix def _load_dataset_file( # pylint: disable=inconsistent-return-statements self, @@ -182,12 +184,18 @@ def _drop_rows_if_datasets_ends_within_days( n_rows_before_modification = dataset.shape[0] if direction == "ahead": - max_datetime = dataset[self.spec.pred_time_colname].max() - n_days_timedelt - before_max_dt = dataset[self.spec.pred_time_colname] < max_datetime + max_datetime = ( + dataset[self.cfg.data.pred_timestamp_col_name].max() - n_days_timedelt + ) + before_max_dt = ( + dataset[self.cfg.data.pred_timestamp_col_name] < max_datetime + ) dataset = dataset[before_max_dt] elif direction == "behind": - min_datetime = dataset[self.spec.pred_time_colname].min() + n_days_timedelt - after_min_dt = dataset[self.spec.pred_time_colname] > min_datetime + min_datetime = ( + dataset[self.cfg.data.pred_timestamp_col_name].min() + n_days_timedelt + ) + after_min_dt = dataset[self.cfg.data.pred_timestamp_col_name] > min_datetime dataset = dataset[after_min_dt] n_rows_after_modification = dataset.shape[0] @@ -211,7 +219,7 @@ def _drop_patients_with_event_in_washin(self, dataset) -> pd.DataFrame: # Remove dates before drop_patient_if_outcome_before_date outcome_before_date = ( dataset["_timestamp_first_t2d"] - < self.spec.time.drop_patient_if_outcome_before_date + < self.cfg.data.drop_patient_if_outcome_before_date ) patients_to_drop = set(dataset["dw_ek_borger"][outcome_before_date].unique()) @@ -219,8 +227,8 @@ def _drop_patients_with_event_in_washin(self, dataset) -> pd.DataFrame: # Removed dates before drop_patient_if_outcome_before_date dataset = dataset[ - dataset[self.spec.pred_time_colname] - > self.spec.time.drop_patient_if_outcome_before_date + dataset[self.cfg.data.pred_timestamp_col_name] + > self.cfg.data.drop_patient_if_outcome_before_date ] n_rows_after_modification = dataset.shape[0] @@ -250,7 +258,7 @@ def _drop_cols_not_in_lookbehind_combination( pd.DataFrame: Dataset with dropped columns. """ - if not self.spec.time.lookbehind_combination: + if not self.cfg.data.lookbehind_combination: raise ValueError("No lookbehind_combination provided.") # Extract all unique lookbhehinds in the dataset predictors @@ -261,14 +269,14 @@ def _drop_cols_not_in_lookbehind_combination( } # Convert list to set - lookbehinds_in_spec = set(self.spec.time.lookbehind_combination) + lookbehinds_in_spec = set(self.cfg.data.lookbehind_combination) # Check that all loobehinds in lookbehind_combination are used in the predictors if not lookbehinds_in_spec.issubset( lookbehinds_in_dataset, ): msg.warn( - f"One or more of the provided lookbehinds in lookbehind_combination is/are not used in any predictors in the dataset. Dataset: {lookbehinds_in_dataset}. lookbehind_combination: {self.spec.time.lookbehind_combination}.", + f"One or more of the provided lookbehinds in lookbehind_combination is/are not used in any predictors in the dataset. Dataset: {lookbehinds_in_dataset}. lookbehind_combination: {self.cfg.data.lookbehind_combination}.", ) lookbehinds_to_keep = lookbehinds_in_spec.intersection( @@ -384,10 +392,10 @@ def _drop_cols_and_rows_if_look_direction_not_met( for direction in ("ahead", "behind"): if direction in ("ahead", "behind"): - if self.spec.time.min_lookahead_days: - n_days = self.spec.time.min_lookahead_days - elif self.spec.time.min_lookbehind_days: - n_days = self.spec.time.min_lookbehind_days + if self.cfg.data.min_lookahead_days: + n_days = self.cfg.data.min_lookahead_days + elif self.cfg.data.min_lookbehind_days: + n_days = self.cfg.data.min_lookbehind_days else: continue @@ -416,23 +424,23 @@ def _process_dataset(self, dataset: pd.DataFrame) -> pd.DataFrame: Returns: pd.DataFrame: Processed dataset """ - if self.spec.time.drop_patient_if_outcome_before_date: + if self.cfg.data.drop_patient_if_outcome_before_date: dataset = add_washin_timestamps(dataset=dataset) dataset = self._convert_timestamp_dtype_and_nat(dataset) - if self.spec.time.drop_patient_if_outcome_before_date: + if self.cfg.data.drop_patient_if_outcome_before_date: dataset = self._drop_patients_with_event_in_washin(dataset=dataset) # Drop if later than min prediction time date - if self.spec.time.min_prediction_time_date: + if self.cfg.data.min_prediction_time_date: dataset = dataset[ - dataset[self.spec.pred_time_colname] - > self.spec.time.min_prediction_time_date + dataset[self.cfg.data.pred_timestamp_col_name] + > self.cfg.data.min_prediction_time_date ] dataset = self._drop_cols_and_rows_if_look_direction_not_met(dataset=dataset) - if self.spec.time.lookbehind_combination: + if self.cfg.data.lookbehind_combination: dataset = self._drop_cols_not_in_lookbehind_combination(dataset=dataset) return dataset @@ -455,15 +463,15 @@ def load_dataset_from_dir( msg.info(f"Loading {split_names}") # Handle input types for timedelta_arg in ( - self.spec.time.min_lookbehind_days, - self.spec.time.min_lookahead_days, + self.cfg.data.min_lookbehind_days, + self.cfg.data.min_lookahead_days, ): if timedelta_arg: timedelta_arg = timedelta(days=timedelta_arg) # type: ignore for date_arg in ( - self.spec.time.drop_patient_if_outcome_before_date, - self.spec.time.min_prediction_time_date, + self.cfg.data.drop_patient_if_outcome_before_date, + self.cfg.data.min_prediction_time_date, ): if isinstance(date_arg, str): date_arg = coerce_to_datetime( @@ -545,16 +553,12 @@ class Config: val: pd.DataFrame -def load_train_and_val_from_cfg(cfg: DictConfig): +def load_train_and_val_from_cfg(cfg: FullConfig): """Load train and validation data from file.""" - data_specification = _init_spec_from_cfg( - cfg, - ) - - split = DataLoader(spec=data_specification) + loader = DataLoader(cfg=cfg) return SplitDataset( - train=split.load_dataset_from_dir(split_names="train"), - val=split.load_dataset_from_dir(split_names="val"), + train=loader.load_dataset_from_dir(split_names="train"), + val=loader.load_dataset_from_dir(split_names="val"), ) diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py index 3305122c..d4ee469f 100644 --- a/src/psycopt2d/model_training_watcher.py +++ b/src/psycopt2d/model_training_watcher.py @@ -12,9 +12,13 @@ from wasabi import msg from psycopt2d.evaluation import evaluate_model -from psycopt2d.utils import (MODEL_PREDICTIONS_PATH, PROJECT_ROOT, - infer_outcome_col_name, infer_y_hat_prob_col_name, - load_evaluation_data) +from psycopt2d.utils.utils import ( + MODEL_PREDICTIONS_PATH, + PROJECT_ROOT, + infer_outcome_col_name, + infer_y_hat_prob_col_name, + load_evaluation_data, +) # Path to the wandb directory WANDB_DIR = PROJECT_ROOT / "wandb" diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py index e9d78bbe..44ecbfe0 100644 --- a/src/psycopt2d/train_model.py +++ b/src/psycopt2d/train_model.py @@ -1,7 +1,7 @@ """Training script for training a single model for predicting t2d.""" import os from collections.abc import Iterable -from typing import Optional +from typing import Optional, Union import hydra import numpy as np @@ -19,7 +19,11 @@ from psycopt2d.feature_transformers import ConvertToBoolean, DateTimeConverter from psycopt2d.load import load_train_and_val_from_cfg from psycopt2d.models import MODELS -from psycopt2d.utils import ( +from psycopt2d.utils.omegaconf_to_pydantic_objects import ( + FullConfig, + omegaconf_to_pydantic_objects, +) +from psycopt2d.utils.utils import ( PROJECT_ROOT, create_wandb_folders, flatten_nested_dict, @@ -302,8 +306,11 @@ def get_col_names(cfg: DictConfig, train: pd.DataFrame) -> tuple[str, list[str]] config_name="default_config", version_base="1.2", ) -def main(cfg): +def main(cfg: Union[FullConfig, DictConfig]): """Main function for training a single model.""" + if not isinstance(cfg, FullConfig): + cfg = omegaconf_to_pydantic_objects(cfg) + msg = Printer(timestamp=True) create_wandb_folders() @@ -311,7 +318,7 @@ def main(cfg): run = wandb.init( project=cfg.project.name, reinit=True, - config=flatten_nested_dict(cfg, sep="."), + config=flatten_nested_dict(cfg.__dict__, sep="."), mode=cfg.project.wandb_mode, group=cfg.project.wandb_group, ) @@ -331,7 +338,7 @@ def main(cfg): pipe=pipe, outcome_col_name=outcome_col_name, train_col_names=train_col_names, - n_splits=cfg.training.n_splits, + n_splits=cfg.train.n_splits, ) # Save model predictions, feature importance, and config to disk diff --git a/src/psycopt2d/utils/__init__.py b/src/psycopt2d/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py index bd476e95..e1d1fa95 100644 --- a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py +++ b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py @@ -7,25 +7,124 @@ - Type checkable """ +from datetime import datetime +from pathlib import Path +from typing import Optional, Union + import pydantic from hydra import compose, initialize -from omegaconf import DictConfig +from omegaconf import DictConfig, OmegaConf +from pydantic import BaseModel as PydanticBaseModel + + +class BaseModel(PydanticBaseModel): + """Allow arbitrary types in all pydantic models.""" + + class Config: + """Allow arbitrary types""" + + arbitrary_types_allowed = True + + +class ProjectConf(BaseModel): + """Project configuration.""" + + name: str = "psycopt2d" + seed: int + wandb_group: str + wandb_mode: str + + +class DataConf(BaseModel): + """Data configuration.""" + + n_training_samples: Optional[ + int + ] # (int, null): Number of training samples to use, defaults to null in which cases it uses all samples. + dir: Union[Path, str] + suffix: str # File suffix to load. + + # Feature specs + pred_col_name_prefix: str # (str): prefix of predictor columns + pred_timestamp_col_name: str # (str): Column name for prediction times + outcome_timestamp_col_name: str # (str): Column name for outcome timestamps + id_col_name: str # (str): Citizen colnames + + # Looking ahead + lookahead_days: int # (float): Number of days from prediction time to look ahead for the outcome. + min_lookahead_days: Optional[ + int + ] # (int): Drop all prediction times where (max timestamp in the dataset) - (current timestamp) is less than min_lookahead_days + min_lookbehind_days: Optional[int] + drop_patient_if_outcome_before_date: Optional[Union[str, datetime]] + + # Looking behind + # (int): Drop all prediction times where (prediction_timestamp) - (min timestamp in the dataset) is less than min_lookbehind_days + min_prediction_time_date: Optional[Union[str, datetime]] + lookbehind_combination: Optional[list[int]] + + +class PreprocessingConf(BaseModel): + """Preprocessing config""" + + convert_to_boolean: bool # (Boolean): Convert all prediction values (except gender) to boolean. Defaults to False + convert_datetimes_to: bool # (str): Options include ordinal or False + imputation_method: Optional[str] # (str): Options include "most_frequent" + transform: Optional[ + str + ] # (str|null): Transformation applied to all predictors after imputation. Options include "z-score-normalization" + + +class ModelConf(BaseModel): + """Model configuration""" + + model_name: str # (str): Model, can currently take xgboost + require_imputation: bool # (bool): Whether the model requires imputation. (shouldn't this be false?) + args: dict + + +class TrainConf(BaseModel): + """Training configuration""" + + n_splits: int # TODO: How do we handle whether to use crossvalidation or train/val splitting? + + +class EvalConf(BaseModel): + """Evaluation config""" + + threshold_percentiles: list[int] + + # top n features to plot. A table with all features is also logged + top_n_feature_importances: int + + positive_rate_thresholds: list[int] + save_model_predictions_on_overtaci: bool + date_bins_ahead: list[int] + date_bins_behind: list[int] + +class FullConfig(BaseModel): + """A full configuration object.""" -def omegaconf_to_pydantic_cfg(cfg: DictConfig) -> pydantic.BaseModel: - """Convert OmegaConf to pydantic config.""" - return pydantic.parse_obj_as(pydantic.BaseModel, cfg) + project: ProjectConf + data: DataConf + preprocessing: PreprocessingConf + model: ModelConf + train: TrainConf + eval: EvalConf -def main(): - with initialize(version_base=None, config_path="../src/psycopt2d/config/"): - cfg = compose( - config_name="defualt_config.yaml", - ) +def omegaconf_to_pydantic_objects(conf: DictConfig) -> FullConfig: + """Converts an omegaconf DictConfig to a pydantic object. - pydantic_obj = omegaconf_to_pydantic_cfg(cfg) + Args: + conf (DictConfig): Omegaconf DictConfig - pass + Returns: + FullConfig: Pydantic object + """ + conf = OmegaConf.to_container(conf, resolve=True) # type: ignore + return FullConfig(**conf) if __name__ == "__main__": diff --git a/src/psycopt2d/utils.py b/src/psycopt2d/utils/utils.py similarity index 98% rename from src/psycopt2d/utils.py rename to src/psycopt2d/utils/utils.py index 976e56db..1ad2519a 100644 --- a/src/psycopt2d/utils.py +++ b/src/psycopt2d/utils/utils.py @@ -18,8 +18,9 @@ from wandb.sdk.wandb_run import Run # pylint: disable=no-name-in-module from wasabi import msg -from psycopt2d.dataclasses.configs import ModelEvalData +from psycopt2d.configs import ModelEvalData from psycopt2d.model_performance import ModelPerformance +from psycopt2d.utils.omegaconf_to_pydantic_objects import FullConfig SHARED_RESOURCES_PATH = Path(r"E:\shared_resources") FEATURE_SETS_PATH = SHARED_RESOURCES_PATH / "feature_sets" @@ -27,6 +28,7 @@ RAW_DATA_VALIDATION_PATH = SHARED_RESOURCES_PATH / "raw_data_validation" FEATURIZERS_PATH = SHARED_RESOURCES_PATH / "featurizers" MODEL_PREDICTIONS_PATH = SHARED_RESOURCES_PATH / "model_predictions" + PROJECT_ROOT = Path(__file__).resolve().parents[2] @@ -299,7 +301,7 @@ def get_feature_importance_dict(pipe: Pipeline) -> Union[None, dict[str, float]] def prediction_df_with_metadata_to_disk( df: pd.DataFrame, - cfg: DictConfig, + cfg: FullConfig, pipe: Pipeline, run: Optional[Run] = None, ) -> None: @@ -321,7 +323,7 @@ def prediction_df_with_metadata_to_disk( else: run_descriptor = f"{timestamp}_{model_args}"[:100] - if cfg.evaluation.save_model_predictions_on_overtaci: + if cfg.eval.save_model_predictions_on_overtaci: # Save to overtaci dir_path = MODEL_PREDICTIONS_PATH / cfg.project.name / run_descriptor else: diff --git a/src/psycopt2d/visualization/performance_over_time.py b/src/psycopt2d/visualization/performance_over_time.py index e5ca1dac..b5c1d265 100644 --- a/src/psycopt2d/visualization/performance_over_time.py +++ b/src/psycopt2d/visualization/performance_over_time.py @@ -11,7 +11,7 @@ import pandas as pd from sklearn.metrics import f1_score, roc_auc_score -from psycopt2d.utils import bin_continuous_data, round_floats_to_edge +from psycopt2d.utils.utils import bin_continuous_data, round_floats_to_edge from psycopt2d.visualization.base_charts import plot_basic_chart diff --git a/src/psycopt2d/visualization/prob_over_time.py b/src/psycopt2d/visualization/prob_over_time.py index 7adba6a3..4a9fc2ac 100644 --- a/src/psycopt2d/visualization/prob_over_time.py +++ b/src/psycopt2d/visualization/prob_over_time.py @@ -141,7 +141,7 @@ def plot_prob_over_time( if __name__ == "__main__": - from psycopt2d.utils import PROJECT_ROOT + from psycopt2d.utils.utils import PROJECT_ROOT path = PROJECT_ROOT / "tests" / "test_data" / "synth_eval_data.csv" df = pd.read_csv(path) diff --git a/src/psycopt2d/visualization/sens_over_time.py b/src/psycopt2d/visualization/sens_over_time.py index c7f6be5a..306164ab 100644 --- a/src/psycopt2d/visualization/sens_over_time.py +++ b/src/psycopt2d/visualization/sens_over_time.py @@ -9,7 +9,7 @@ import numpy as np import pandas as pd -from psycopt2d.utils import PROJECT_ROOT, round_floats_to_edge +from psycopt2d.utils.utils import PROJECT_ROOT, round_floats_to_edge def create_sensitivity_by_time_to_outcome_df( @@ -303,7 +303,7 @@ def plot_sensitivity_by_time_to_outcome_heatmap( Examples: >>> from pathlib import Path - >>> from psycopt2d.utils import positive_rate_to_pred_probs + >>> from psycopt2d.utils.utils import positive_rate_to_pred_probs >>> repo_path = Path(__file__).parent.parent.parent.parent >>> path = repo_path / "tests" / "test_data" / "synth_eval_data.csv" @@ -381,7 +381,7 @@ def plot_sensitivity_by_time_to_outcome_heatmap( if __name__ == "__main__": - from psycopt2d.utils import positive_rate_to_pred_probs + from psycopt2d.utils.utils import positive_rate_to_pred_probs path = PROJECT_ROOT / "tests" / "test_data" / "synth_eval_data.csv" df = pd.read_csv(path) diff --git a/tests/test_auc_by_group_table.py b/tests/test_auc_by_group_table.py index b4103307..4f01fcef 100644 --- a/tests/test_auc_by_group_table.py +++ b/tests/test_auc_by_group_table.py @@ -2,7 +2,7 @@ # pylint: disable=missing-function-docstring from psycopt2d.tables import auc_by_group_table -from psycopt2d.utils import bin_continuous_data +from psycopt2d.utils.utils import bin_continuous_data def test_auc_by_group_table(synth_data): diff --git a/tests/test_calculate_performance_metrics.py b/tests/test_calculate_performance_metrics.py index db630a4f..0f053e71 100644 --- a/tests/test_calculate_performance_metrics.py +++ b/tests/test_calculate_performance_metrics.py @@ -1,5 +1,5 @@ # import wandb -# from psycopt2d.utils import calculate_performance_metrics +# from psycopt2d.utils.utils import calculate_performance_metrics # def test_log_performance_metrics(synth_data): diff --git a/tests/test_load.py b/tests/test_load.py index c0be3a58..58bafbdc 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -2,6 +2,7 @@ from hydra import compose, initialize from psycopt2d.load import load_train_and_val_from_cfg +from psycopt2d.utils.omegaconf_to_pydantic_objects import omegaconf_to_pydantic_objects def test_load_lookbehind_exceeds_lookbehind_threshold(): @@ -10,14 +11,14 @@ def test_load_lookbehind_exceeds_lookbehind_threshold(): with initialize(version_base=None, config_path="../src/psycopt2d/config/"): cfg = compose( config_name="integration_testing.yaml", - overrides=[ - "++data.min_lookbehind_days=90", - ], ) + cfg = omegaconf_to_pydantic_objects(cfg) + + cfg.data.min_lookahead_days = 90 split_dataset = load_train_and_val_from_cfg(cfg) - assert split_dataset.train.shape == (644, 7) + assert split_dataset.train.shape == (644, 6) def test_load_lookbehind_not_in_lookbehind_combination(): @@ -26,11 +27,11 @@ def test_load_lookbehind_not_in_lookbehind_combination(): with initialize(version_base=None, config_path="../src/psycopt2d/config/"): cfg = compose( config_name="integration_testing.yaml", - overrides=[ - "++data.lookbehind_combination=[30]", - ], ) + cfg = omegaconf_to_pydantic_objects(cfg) + + cfg.data.lookbehind_combination = [30] split_dataset = load_train_and_val_from_cfg(cfg) assert split_dataset.train.shape == (700, 6) diff --git a/tests/test_performance_by_threshold.py b/tests/test_performance_by_threshold.py index 6d3b0630..499640e0 100644 --- a/tests/test_performance_by_threshold.py +++ b/tests/test_performance_by_threshold.py @@ -14,7 +14,7 @@ days_from_first_positive_to_diagnosis, generate_performance_by_positive_rate_table, ) -from psycopt2d.utils import positive_rate_to_pred_probs +from psycopt2d.utils.utils import positive_rate_to_pred_probs @pytest.fixture(scope="function") diff --git a/tests/test_train_model.py b/tests/test_train_model.py index 4538dc14..ef246e50 100644 --- a/tests/test_train_model.py +++ b/tests/test_train_model.py @@ -5,10 +5,11 @@ from psycopt2d.models import MODELS from psycopt2d.train_model import main +from psycopt2d.utils.omegaconf_to_pydantic_objects import omegaconf_to_pydantic_objects CONFIG_DIR_PATH = "../src/psycopt2d/config/" INTEGRATION_TEST_FILE_NAME = "integration_testing.yaml" -INTEGRATION_TESTING_MODEL_OVERRIDE = "+model=logistic-regression" +INTEGRATION_TESTING_MODEL_OVERRIDE = "model=logistic-regression" @pytest.mark.parametrize("model_name", MODELS.keys()) @@ -18,13 +19,15 @@ def test_main(model_name): cfg = compose( config_name=INTEGRATION_TEST_FILE_NAME, - overrides=[f"+model={model_name}"], + overrides=[f"model={model_name}"], ) + cfg = omegaconf_to_pydantic_objects(cfg) + # XGBoost should train on GPU on Overtaci, # but CPU during integration testing if model_name == "xgboost": - cfg.model.args.tree_method = "auto" + cfg.model.args["tree_method"] = "auto" main(cfg) diff --git a/tests/test_utils.py b/tests/test_utils.py index b3ddf1fe..eac256d4 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,10 +1,16 @@ """Testing of the utils module.""" # pylint: disable=missing-function-docstring +from pathlib import Path + import numpy as np import pandas as pd +import pytest +from hydra import compose, initialize from utils_for_testing import str_to_df -from psycopt2d.utils import ( +from psycopt2d.utils.omegaconf_to_pydantic_objects import omegaconf_to_pydantic_objects +from psycopt2d.utils.utils import ( + PROJECT_ROOT, drop_records_if_datediff_days_smaller_than, flatten_nested_dict, ) @@ -50,3 +56,23 @@ def test_flatten_nested_dict(): output_dict = flatten_nested_dict(input_dict) assert expected_dict == output_dict + + +CONFIG_DIR_PATH_ABS = PROJECT_ROOT / "psycopt2d" / "config" +CONFIG_DIR_PATH_REL = "../src/psycopt2d/config" + + +def get_config_file_names() -> list[str]: + """Get all config file names""" + config_file_paths: list[Path] = list(CONFIG_DIR_PATH_ABS.glob("*.yaml")) + return [f"{path.stem}.yaml" for path in config_file_paths] + + +@pytest.mark.parametrize("config_file_name", get_config_file_names()) +def test_configs(config_file_name): + with initialize(version_base=None, config_path=CONFIG_DIR_PATH_REL): + cfg = compose( + config_name=config_file_name, + ) + + cfg = omegaconf_to_pydantic_objects(conf=cfg) diff --git a/tests/test_visualizations.py b/tests/test_visualizations.py index b108e04f..963fa5d1 100644 --- a/tests/test_visualizations.py +++ b/tests/test_visualizations.py @@ -10,7 +10,7 @@ import pytest from sklearn.metrics import f1_score, roc_auc_score -from psycopt2d.utils import positive_rate_to_pred_probs +from psycopt2d.utils.utils import positive_rate_to_pred_probs from psycopt2d.visualization import plot_prob_over_time from psycopt2d.visualization.base_charts import plot_basic_chart from psycopt2d.visualization.feature_importance import plot_feature_importances From aaa25c165b190ab4d89aeb8a5543779c28c10765 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 21 Oct 2022 11:45:18 +0200 Subject: [PATCH 17/57] style: linting --- application/train_and_log_models.py | 14 ++++++------ src/psycopt2d/load.py | 1 - .../utils/omegaconf_to_pydantic_objects.py | 22 +++++++------------ src/psycopt2d/utils/utils.py | 1 - tests/test_utils.py | 2 +- 5 files changed, 16 insertions(+), 24 deletions(-) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index adc8bcab..2e340fa5 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -20,7 +20,6 @@ infer_predictor_col_name, ) from psycopt2d.load import DataLoader, DatasetSpecification, DatasetTimeSpecification -from psycopt2d.utils.utils import PROJECT_ROOT msg = Printer(timestamp=True) @@ -178,7 +177,7 @@ def train_models_for_each_cell_in_grid( cell = lookbehind_combinations.pop() msg.info( - f"Spawning a new trainer with lookbehind={cell.lookbehind} and lookahead={cell.lookahead}" + f"Spawning a new trainer with lookbehind={cell.lookbehind} and lookahead={cell.lookahead}", ) wandb_group = f"{wandb_prefix}-beh-{cell.lookbehind}-ahead-{cell.lookahead}" @@ -207,7 +206,7 @@ def train_models_for_each_cell_in_grid( active_trainers.append( subprocess.Popen( # pylint: disable=consider-using-with args=subprocess_args, - ) + ), ) @@ -255,7 +254,8 @@ def train_models_for_each_cell_in_grid( ) dataset_spec = get_dataset_spec( - data_dir_path=meta_conf.data_dir, file_suffix=cfg.data.suffix + data_dir_path=meta_conf.data_dir, + file_suffix=cfg.data.suffix, ) train = load_train_for_inference(dataset_spec=dataset_spec) @@ -280,11 +280,11 @@ def train_models_for_each_cell_in_grid( if not train_conf.gpu: msg.warn("Not using GPU for training") - clean_dir_seconds = 0 + CLEAN_DIR_SECONDS = 0 msg.info( - f"Sleeping for {clean_dir_seconds} seconds to allow watcher to start and clean dir" + f"Sleeping for {CLEAN_DIR_SECONDS} seconds to allow watcher to start and clean dir", ) - time.sleep(clean_dir_seconds) + time.sleep(CLEAN_DIR_SECONDS) train_models_for_each_cell_in_grid(train_conf=train_conf, wandb_conf=wandb_conf) diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py index 48a0b7c1..1403df98 100644 --- a/src/psycopt2d/load.py +++ b/src/psycopt2d/load.py @@ -3,7 +3,6 @@ from collections.abc import Iterable from datetime import datetime, timedelta from pathlib import Path -from queue import Full from typing import Any, Optional, Union import pandas as pd diff --git a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py index e1d1fa95..340ca6f2 100644 --- a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py +++ b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py @@ -1,9 +1,9 @@ -"""Utilities for converting config yamls to pydantic objects. +"""Utilities for converting config yamls to pydantic objects. Helpful because it makes them: - Addressable with intellisense, -- Refactorable with IDEs, -- Easier to document with docstrings and +- Refactorable with IDEs, +- Easier to document with docstrings and - Type checkable """ @@ -11,8 +11,6 @@ from pathlib import Path from typing import Optional, Union -import pydantic -from hydra import compose, initialize from omegaconf import DictConfig, OmegaConf from pydantic import BaseModel as PydanticBaseModel @@ -21,7 +19,7 @@ class BaseModel(PydanticBaseModel): """Allow arbitrary types in all pydantic models.""" class Config: - """Allow arbitrary types""" + """Allow arbitrary types.""" arbitrary_types_allowed = True @@ -65,7 +63,7 @@ class DataConf(BaseModel): class PreprocessingConf(BaseModel): - """Preprocessing config""" + """Preprocessing config.""" convert_to_boolean: bool # (Boolean): Convert all prediction values (except gender) to boolean. Defaults to False convert_datetimes_to: bool # (str): Options include ordinal or False @@ -76,7 +74,7 @@ class PreprocessingConf(BaseModel): class ModelConf(BaseModel): - """Model configuration""" + """Model configuration.""" model_name: str # (str): Model, can currently take xgboost require_imputation: bool # (bool): Whether the model requires imputation. (shouldn't this be false?) @@ -84,13 +82,13 @@ class ModelConf(BaseModel): class TrainConf(BaseModel): - """Training configuration""" + """Training configuration.""" n_splits: int # TODO: How do we handle whether to use crossvalidation or train/val splitting? class EvalConf(BaseModel): - """Evaluation config""" + """Evaluation config.""" threshold_percentiles: list[int] @@ -125,7 +123,3 @@ def omegaconf_to_pydantic_objects(conf: DictConfig) -> FullConfig: """ conf = OmegaConf.to_container(conf, resolve=True) # type: ignore return FullConfig(**conf) - - -if __name__ == "__main__": - main() diff --git a/src/psycopt2d/utils/utils.py b/src/psycopt2d/utils/utils.py index 1ad2519a..e13ae035 100644 --- a/src/psycopt2d/utils/utils.py +++ b/src/psycopt2d/utils/utils.py @@ -13,7 +13,6 @@ import dill as pkl import numpy as np import pandas as pd -from omegaconf.dictconfig import DictConfig from sklearn.pipeline import Pipeline from wandb.sdk.wandb_run import Run # pylint: disable=no-name-in-module from wasabi import msg diff --git a/tests/test_utils.py b/tests/test_utils.py index eac256d4..b5a40dbc 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -63,7 +63,7 @@ def test_flatten_nested_dict(): def get_config_file_names() -> list[str]: - """Get all config file names""" + """Get all config file names.""" config_file_paths: list[Path] = list(CONFIG_DIR_PATH_ABS.glob("*.yaml")) return [f"{path.stem}.yaml" for path in config_file_paths] From 76ba99d42c2ec8957642ae14615d19543dfa3fe3 Mon Sep 17 00:00:00 2001 From: HLasse Date: Fri, 21 Oct 2022 12:03:54 +0200 Subject: [PATCH 18/57] fix: make watcher not archive runs that haven't finished --- src/psycopt2d/model_training_watcher.py | 53 +++++++++++++++++++------ 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py index 3305122c..8d7408d2 100644 --- a/src/psycopt2d/model_training_watcher.py +++ b/src/psycopt2d/model_training_watcher.py @@ -12,9 +12,13 @@ from wasabi import msg from psycopt2d.evaluation import evaluate_model -from psycopt2d.utils import (MODEL_PREDICTIONS_PATH, PROJECT_ROOT, - infer_outcome_col_name, infer_y_hat_prob_col_name, - load_evaluation_data) +from psycopt2d.utils import ( + MODEL_PREDICTIONS_PATH, + PROJECT_ROOT, + infer_outcome_col_name, + infer_y_hat_prob_col_name, + load_evaluation_data, +) # Path to the wandb directory WANDB_DIR = PROJECT_ROOT / "wandb" @@ -32,6 +36,7 @@ class ModelTrainingWatcher: model_data_dir: Where to look for evaluation results. overtaci: Whether the script is running on overtaci. Determines where to look for the evaluation results. + verbose: Whether to print verbose output. """ def __init__( @@ -40,6 +45,7 @@ def __init__( project_name: str, n_runs_before_eval: int, model_data_dir: Path, + verbose: bool = False, ): self.entity = entity self.project_name = project_name @@ -47,6 +53,7 @@ def __init__( self.n_runs_before_eval = n_runs_before_eval + self.verbose = verbose # A queue for runs waiting to be uploaded to WandB self.run_id_eval_candidates_queue = [] self.max_performance = 0 @@ -76,10 +83,16 @@ def get_new_runs_and_evaluate(self) -> None: def _upload_run_dir(self, run_dir: Path) -> None: """Upload a single run to wandb.""" - subprocess.run( + # get stdout from subprocess.run + proc = subprocess.run( ["wandb", "sync", str(run_dir), "--project", self.project_name], check=True, + capture_output=True, ) + stdout = proc.stdout.decode("utf-8") + if self.verbose: + msg.info(f"Watcher: {stdout}") + return stdout def _archive_run_dir(self, run_dir: Path) -> None: """Move a run to the archive folder.""" @@ -92,15 +105,16 @@ def _get_run_id(self, run_dir: Path) -> str: def upload_unarchived_runs(self) -> None: """Upload unarchived runs to wandb.""" for run_folder in WANDB_DIR.glob(r"offline-run*"): - # TODO: We need some kind of test here to figure out if the run is - # still running or not. If it is still running, we should wait - # until it is finished. Otherwise, we get a "permission denied" error. run_id = self._get_run_id(run_folder) - self._upload_run_dir(run_folder) + wandb_sync_stdout = self._upload_run_dir(run_folder) # TODO: If upload_run_dir fails, we should not archive the run. # use return from subprocess.run to check if it failed. See docs: https://docs.python.org/3/library/subprocess.html + if ".wandb file is empty" in wandb_sync_stdout: + if self.verbose: + msg.warn(f"Run {run_id} is still running. Skipping.") + continue self._archive_run_dir(run_folder) self.run_id_eval_candidates_queue.append(run_id) @@ -144,9 +158,10 @@ def _get_run_performance(self, run_id: str) -> float: run = self._get_wandb_run(run_id) if "roc_auc_unweighted" in run.summary: return run.summary.roc_auc_unweighted - msg.info( - f"Run {run_id} has no performance metric. Pinging again at next eval time.", - ) + if self.verbose: + msg.info( + f"Run {run_id} has no performance metric. Pinging again at next eval time.", + ) return None def evaluate_best_runs(self) -> None: @@ -157,7 +172,11 @@ def evaluate_best_runs(self) -> None: } # sort runs by performance to not upload subpar runs run_performances = dict( - sorted(run_performances.items(), key=lambda item: item[1], reverse=True), + sorted( + run_performances.items(), + key=lambda item: (item[1] is not None, item[1]), + reverse=True, + ), ) # get runs with auc of None (attempted upload before run finished) unfinished_runs = [ @@ -165,7 +184,7 @@ def evaluate_best_runs(self) -> None: ] for run_id, performance in run_performances.items(): - if performance > self.max_performance: + if performance is not None and performance > self.max_performance: msg.good(f"New record performance! AUC: {performance}") self.max_performance = performance self._do_evaluation(run_id) @@ -217,6 +236,13 @@ def float_or_none(arg: str) -> Optional[float]: help="Archive all runs in the wandb dir before starting", required=True, ) + parser.add_argument( + "--verbose", + type=lambda x: bool(strtobool(x)), + help="Whether to print verbose messages (default: False)", + required=False, + default=False, + ) args = parser.parse_args() model_data_dir = ( @@ -230,6 +256,7 @@ def float_or_none(arg: str) -> Optional[float]: project_name=args.project_name, n_runs_before_eval=args.n_runs_before_eval, model_data_dir=model_data_dir, + verbose=args.verbose, ) if args.clean_wandb_dir: watcher.archive_all_runs() From ab2291d4fa8d584f9f0b03a27d839b4a9f00a240 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 21 Oct 2022 12:04:03 +0200 Subject: [PATCH 19/57] refactor: refactor train_and_log_models --- application/train_and_log_models.py | 161 ++++-------------- .../config/project/default_project.yaml | 5 +- .../project/integration_test_project.yaml | 1 + .../project/watcher/default_watcher.yaml | 2 + .../config/train/default_training.yaml | 1 + .../utils/omegaconf_to_pydantic_objects.py | 12 ++ 6 files changed, 53 insertions(+), 129 deletions(-) create mode 100644 src/psycopt2d/config/project/watcher/default_watcher.yaml diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index 2e340fa5..a0a8b3a4 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -8,10 +8,9 @@ import random import subprocess import time -from pathlib import Path from hydra import compose, initialize -from pydantic import BaseModel, Field +from pydantic import BaseModel from wasabi import Printer from psycopt2d.evaluate_saved_model_predictions import ( @@ -19,7 +18,11 @@ infer_outcome_col_name, infer_predictor_col_name, ) -from psycopt2d.load import DataLoader, DatasetSpecification, DatasetTimeSpecification +from psycopt2d.load import DataLoader +from psycopt2d.utils.omegaconf_to_pydantic_objects import ( + FullConfig, + omegaconf_to_pydantic_objects, +) msg = Printer(timestamp=True) @@ -31,70 +34,9 @@ class PossibleLookDistanceDays(BaseModel): behind: list[str] -class MetaConf(BaseModel): - """Meta configuration for the script.""" - - conf_name: str = Field("integration_testing.yaml") - data_dir: Path = Path( - "/Users/au484925/Desktop/psycop-t2d/tests/test_data/synth_splits/", - ) - overtaci: str = Field( - default="false", - description="Change to 'true' if running on overtaci", - ) - - -class WatcherConf(BaseModel): - """Configuration for the watcher.""" - - archive_all: str = Field( - default="false", - description="Whether to archive all runs in the wandb folder before starting model training. Change to 't' to archive all wandb runs", - ) - n_runs_before_first_eval: int = Field( - default="1", - description="The number of runs to upload to wandb before evaluating the best runs.", - ) - keep_alive_after_training_minutes: int = Field( - default=5, - description="minutes to wait for the wandb watcher after training has finished. Will kill the watcher after this time.", - ) - - -class WandbConf(BaseModel): - """Configuration for wandb.""" - - project_name: str = "psycopt2d-testing" - entity: str = Field( - default="psycop", - description="The wandb entity to upload to (e.g. 'psycop' or your user name)", - ) - mode: str = Field(default="online", description="The wandb mode to use") - - -class TrainConf(BaseModel): - """Configuration for model training.""" - - gpu: bool = Field(default="false", description="Whether to use GPU") - - n_trials_per_cell_in_grid: int = Field( - default=50, - description="Number of trials per cell in the lookahead/lookbehind grid. If n > 1, automatically triggers multirun.", - ) - - model_conf: str = Field( - default="xgboost", - description="The model conf to open. For example, 'xgboost' or 'logistic_regression'.", - ) - - conf_name: str = Field(default="integration_testing.yaml") - - possible_look_distances: PossibleLookDistanceDays - - -def load_train_for_inference(dataset_spec): +def load_train_for_inference(cfg: FullConfig): """Load the data.""" - loader = DataLoader(dataset_spec) + loader = DataLoader(cfg=cfg) msg.info("Loading datasets for look direction inference") return loader.load_dataset_from_dir(split_names="train") @@ -117,24 +59,6 @@ def infer_possible_look_directions(train): ) -def get_dataset_spec(data_dir_path: Path, file_suffix: str): - """Get dataset specification.""" - time_spec = DatasetTimeSpecification( - drop_patient_if_outcome_before_date=None, - min_prediction_time_date="1979-01-01", - min_lookbehind_days=0, - min_lookahead_days=0, - ) - - return DatasetSpecification( - file_suffix=file_suffix, - time=time_spec, - pred_col_name_prefix="pred_", - pred_time_colname="timestamp", - split_dir_path=data_dir_path, - ) - - class LookDirectionCombination(BaseModel): """A combination of lookbehind and lookahead days.""" @@ -143,8 +67,9 @@ class LookDirectionCombination(BaseModel): def train_models_for_each_cell_in_grid( - train_conf: TrainConf, - wandb_conf: WandbConf, + cfg: FullConfig, + possible_look_distances: PossibleLookDistanceDays, + config_file_name: str, ): """Train a model for each cell in the grid of possible look distances.""" from random_word import RandomWords @@ -154,8 +79,8 @@ def train_models_for_each_cell_in_grid( # Create all combinations of lookbehind and lookahead days lookbehind_combinations = [ LookDirectionCombination(lookbehind=lookbehind, lookahead=lookahead) - for lookbehind in train_conf.possible_look_distances.behind - for lookahead in train_conf.possible_look_distances.ahead + for lookbehind in possible_look_distances.behind + for lookahead in possible_look_distances.ahead ] lookbehind_combinations = [ @@ -185,20 +110,20 @@ def train_models_for_each_cell_in_grid( subprocess_args: list[str] = [ "python", "src/psycopt2d/train_model.py", - f"model={train_conf.model_conf}", + f"model={cfg.model.model_name}", f"data.min_lookbehind_days={cell.lookbehind}", f"data.min_lookahead_days={cell.lookahead}", f"project.wandb_group='{wandb_group}'", - f"hydra.sweeper.n_trials={train_conf.n_trials_per_cell_in_grid}", - f"project.wandb_mode={wandb_conf.mode}", + f"hydra.sweeper.n_trials={cfg.train.n_trials_per_lookdirection_combination}", + f"project.wandb_mode={cfg.project.wandb_mode}", "--config-name", - f"{meta_conf.conf_name}", + f"{config_file_name}", ] - if train_conf.n_trials_per_cell_in_grid > 1: + if cfg.train.n_trials_per_lookdirection_combination > 1: subprocess_args.insert(2, "--multirun") - if train_conf.model_conf == "xgboost" and not train_conf.gpu: + if cfg.model.model_name == "xgboost" and not cfg.project.gpu: subprocess_args.insert(3, "++model.args.tree_method='auto'") msg.info(f'{" ".join(subprocess_args)}') @@ -220,44 +145,28 @@ def train_models_for_each_cell_in_grid( config_name=CONFIG_FILE_NAME, ) - meta_conf = MetaConf( - conf_name=CONFIG_FILE_NAME, - overtaci="false", - data_dir=cfg.data.dir, - ) - - wandb_conf = WandbConf( - entity="psycop", - project_name="psycopt2d-testing", - mode=cfg.project.wandb_mode, - ) - - watcher_conf = WatcherConf(archive_all="false", keep_alive_after_training_minutes=5) + cfg = omegaconf_to_pydantic_objects(cfg) watcher = subprocess.Popen( # pylint: disable=consider-using-with [ "python", "src/psycopt2d/model_training_watcher.py", "--entity", - wandb_conf.entity, + cfg.project.wandb_entity, "--project_name", - wandb_conf.project_name, + cfg.project.name, "--n_runs_before_eval", - str(watcher_conf.n_runs_before_first_eval), + str(cfg.project.watcher.n_runs_before_eval), "--overtaci", - meta_conf.overtaci, + cfg.eval.save_model_predictions_on_overtaci, "--timeout", "None", "--clean_wandb_dir", - watcher_conf.archive_all, + cfg.project.watcher.archive_all, ], ) - dataset_spec = get_dataset_spec( - data_dir_path=meta_conf.data_dir, - file_suffix=cfg.data.suffix, - ) - train = load_train_for_inference(dataset_spec=dataset_spec) + train = load_train_for_inference(cfg=cfg) possible_look_distances = infer_possible_look_directions(train) @@ -269,15 +178,7 @@ def train_models_for_each_cell_in_grid( msg.info(f"Possible lookbehind days: {possible_look_distances.behind}") msg.info(f"Possible lookahead days: {possible_look_distances.ahead}") - train_conf = TrainConf( - conf_name=meta_conf.conf_name, - model_conf="xgboost", - n_trials_per_cell_in_grid=1, - possible_look_distances=possible_look_distances, - gpu=True, - ) - - if not train_conf.gpu: + if not cfg.project.gpu: msg.warn("Not using GPU for training") CLEAN_DIR_SECONDS = 0 @@ -286,12 +187,16 @@ def train_models_for_each_cell_in_grid( ) time.sleep(CLEAN_DIR_SECONDS) - train_models_for_each_cell_in_grid(train_conf=train_conf, wandb_conf=wandb_conf) + train_models_for_each_cell_in_grid( + cfg=cfg, + possible_look_distances=possible_look_distances, + config_file_name=CONFIG_FILE_NAME, + ) msg.good( f"Training finished. Stopping the watcher in {watcher_conf.keep_alive_after_training_minutes} minutes...", ) - time.sleep(60 * watcher_conf.keep_alive_after_training_minutes) + time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes) watcher.kill() msg.good("Watcher stopped.") diff --git a/src/psycopt2d/config/project/default_project.yaml b/src/psycopt2d/config/project/default_project.yaml index 404397fa..99af8b3f 100644 --- a/src/psycopt2d/config/project/default_project.yaml +++ b/src/psycopt2d/config/project/default_project.yaml @@ -1,4 +1,7 @@ name: psycop-t2d seed: 42 wandb_mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" -wandb_group: "psycop-t2d" # Which group to run WanDB in. \ No newline at end of file +wandb_group: "psycop-t2d" # Which group to run WanDB in. +wandb_entity: "psycop-t2d-testing" # Which entity to run WanDB in. +watcher: default_watcher +gpu: false diff --git a/src/psycopt2d/config/project/integration_test_project.yaml b/src/psycopt2d/config/project/integration_test_project.yaml index 39402f44..9be0e502 100644 --- a/src/psycopt2d/config/project/integration_test_project.yaml +++ b/src/psycopt2d/config/project/integration_test_project.yaml @@ -2,3 +2,4 @@ name: psycop-t2d-integration-testing seed: 42 wandb_mode: "disabled" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" wandb_group: "integration_testing" +watcher: default_watcher diff --git a/src/psycopt2d/config/project/watcher/default_watcher.yaml b/src/psycopt2d/config/project/watcher/default_watcher.yaml new file mode 100644 index 00000000..f76bc9a4 --- /dev/null +++ b/src/psycopt2d/config/project/watcher/default_watcher.yaml @@ -0,0 +1,2 @@ +archive_all: true +keep_alive_after_training_minutes: 5 diff --git a/src/psycopt2d/config/train/default_training.yaml b/src/psycopt2d/config/train/default_training.yaml index 56014ceb..9ecc02a5 100644 --- a/src/psycopt2d/config/train/default_training.yaml +++ b/src/psycopt2d/config/train/default_training.yaml @@ -1 +1,2 @@ n_splits: 2 # (int, Null): Number of k-folds during CV. If Null, loads pre-defined dataset. +n_trials_per_lookdirection_combination: 1 diff --git a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py index 340ca6f2..4c762df8 100644 --- a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py +++ b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py @@ -24,6 +24,14 @@ class Config: arbitrary_types_allowed = True +class WatcherConf(BaseModel): + """Configuration for watchers""" + + archive_all: bool + keep_alive_after_training_minutes: Union[int, float] + n_runs_before_eval: int + + class ProjectConf(BaseModel): """Project configuration.""" @@ -31,6 +39,9 @@ class ProjectConf(BaseModel): seed: int wandb_group: str wandb_mode: str + wandb_entity: str + watcher: WatcherConf + gpu: bool class DataConf(BaseModel): @@ -85,6 +96,7 @@ class TrainConf(BaseModel): """Training configuration.""" n_splits: int # TODO: How do we handle whether to use crossvalidation or train/val splitting? + n_trials_per_lookdirection_combination: int class EvalConf(BaseModel): From 5c0503b091ab386840a9b45446e3850a52355606 Mon Sep 17 00:00:00 2001 From: HLasse Date: Fri, 21 Oct 2022 12:04:20 +0200 Subject: [PATCH 20/57] fix: infer col names return list if len 1 --- src/psycopt2d/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/psycopt2d/utils.py b/src/psycopt2d/utils.py index 976e56db..fcdbe2b0 100644 --- a/src/psycopt2d/utils.py +++ b/src/psycopt2d/utils.py @@ -18,7 +18,7 @@ from wandb.sdk.wandb_run import Run # pylint: disable=no-name-in-module from wasabi import msg -from psycopt2d.dataclasses.configs import ModelEvalData +from psycopt2d.configs import ModelEvalData from psycopt2d.model_performance import ModelPerformance SHARED_RESOURCES_PATH = Path(r"E:\shared_resources") @@ -402,7 +402,7 @@ def infer_col_names( col_name = [c for c in df.columns if c.startswith(prefix)] if len(col_name) == 1: - return [col_name[0]] + return col_name[0] elif len(col_name) > 1: if allow_multiple: return col_name From dce563e61c1b9579759aaffced60a1f2d4ec5a75 Mon Sep 17 00:00:00 2001 From: HLasse Date: Fri, 21 Oct 2022 12:04:28 +0200 Subject: [PATCH 21/57] fix: remove artefact code --- src/psycopt2d/train_and_log_models.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/psycopt2d/train_and_log_models.py b/src/psycopt2d/train_and_log_models.py index 499c539a..50b92ee8 100644 --- a/src/psycopt2d/train_and_log_models.py +++ b/src/psycopt2d/train_and_log_models.py @@ -70,15 +70,3 @@ time.sleep(60 * KEEP_WATCHER_ALIVE_AFTER_TRAINING_FINISHED_MINUTES) watcher.kill() msg.good("Watcher stopped.") - - # any_process_done = False # pylint: disable=invalid-name - # for process in (trainer, watcher): - # while process.poll() is None: - # if any_process_done: - # # kill the watcher if the trainer is done - # # but allow some time to finish evaluation - # time.sleep(KEEP_WATCHER_ALIVE_AFTER_TRAINING_FINISHED_MINUTES * 60) - # process.kill() - # time.sleep(1) - # any_process_done = True # pylint: disable=invalid-name - # process.kill() From 1999b6554df20757e6eb473ebf7f539851311657 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 21 Oct 2022 12:09:56 +0200 Subject: [PATCH 22/57] fix: failing tests --- src/psycopt2d/config/default_config.yaml | 2 +- src/psycopt2d/config/project/default_project.yaml | 4 +++- src/psycopt2d/config/project/integration_test_project.yaml | 6 +++++- src/psycopt2d/config/project/watcher/default_watcher.yaml | 2 -- src/psycopt2d/utils/omegaconf_to_pydantic_objects.py | 4 ++-- 5 files changed, 11 insertions(+), 7 deletions(-) delete mode 100644 src/psycopt2d/config/project/watcher/default_watcher.yaml diff --git a/src/psycopt2d/config/default_config.yaml b/src/psycopt2d/config/default_config.yaml index 5590c26f..46b91517 100644 --- a/src/psycopt2d/config/default_config.yaml +++ b/src/psycopt2d/config/default_config.yaml @@ -1,6 +1,6 @@ # @package _global_ defaults: - - project: overtaci_test_project + - project: default_project - data: t2d_parquet - preprocessing: default_preprocessing - model: xgboost diff --git a/src/psycopt2d/config/project/default_project.yaml b/src/psycopt2d/config/project/default_project.yaml index 99af8b3f..ee44de65 100644 --- a/src/psycopt2d/config/project/default_project.yaml +++ b/src/psycopt2d/config/project/default_project.yaml @@ -3,5 +3,7 @@ seed: 42 wandb_mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" wandb_group: "psycop-t2d" # Which group to run WanDB in. wandb_entity: "psycop-t2d-testing" # Which entity to run WanDB in. -watcher: default_watcher +watcher: + archive_all: true + keep_alive_after_training_minutes: 5 gpu: false diff --git a/src/psycopt2d/config/project/integration_test_project.yaml b/src/psycopt2d/config/project/integration_test_project.yaml index 9be0e502..23f9eff2 100644 --- a/src/psycopt2d/config/project/integration_test_project.yaml +++ b/src/psycopt2d/config/project/integration_test_project.yaml @@ -2,4 +2,8 @@ name: psycop-t2d-integration-testing seed: 42 wandb_mode: "disabled" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" wandb_group: "integration_testing" -watcher: default_watcher +wandb_entity: "psycop-t2d-testing" # Which entity to run WanDB in. +watcher: + archive_all: true + keep_alive_after_training_minutes: 5 +gpu: false diff --git a/src/psycopt2d/config/project/watcher/default_watcher.yaml b/src/psycopt2d/config/project/watcher/default_watcher.yaml deleted file mode 100644 index f76bc9a4..00000000 --- a/src/psycopt2d/config/project/watcher/default_watcher.yaml +++ /dev/null @@ -1,2 +0,0 @@ -archive_all: true -keep_alive_after_training_minutes: 5 diff --git a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py index 4c762df8..c1e90875 100644 --- a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py +++ b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py @@ -25,7 +25,7 @@ class Config: class WatcherConf(BaseModel): - """Configuration for watchers""" + """Configuration for watchers.""" archive_all: bool keep_alive_after_training_minutes: Union[int, float] @@ -40,7 +40,7 @@ class ProjectConf(BaseModel): wandb_group: str wandb_mode: str wandb_entity: str - watcher: WatcherConf + watcher: dict gpu: bool From f9db2e884f76b824c59f9f5ddbbb9362bcdc0a1d Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 21 Oct 2022 12:12:21 +0200 Subject: [PATCH 23/57] style: linting --- application/train_and_log_models.py | 2 +- src/psycopt2d/config/project/default_project.yaml | 1 + src/psycopt2d/config/project/integration_test_project.yaml | 1 + src/psycopt2d/utils/omegaconf_to_pydantic_objects.py | 2 +- 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index a0a8b3a4..b3df5c9f 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -194,7 +194,7 @@ def train_models_for_each_cell_in_grid( ) msg.good( - f"Training finished. Stopping the watcher in {watcher_conf.keep_alive_after_training_minutes} minutes...", + f"Training finished. Stopping the watcher in {cfg.project.watcher.keep_alive_after_training_minutes} minutes...", ) time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes) diff --git a/src/psycopt2d/config/project/default_project.yaml b/src/psycopt2d/config/project/default_project.yaml index ee44de65..e66a50b4 100644 --- a/src/psycopt2d/config/project/default_project.yaml +++ b/src/psycopt2d/config/project/default_project.yaml @@ -6,4 +6,5 @@ wandb_entity: "psycop-t2d-testing" # Which entity to run WanDB in. watcher: archive_all: true keep_alive_after_training_minutes: 5 + n_runs_before_eval: 1 gpu: false diff --git a/src/psycopt2d/config/project/integration_test_project.yaml b/src/psycopt2d/config/project/integration_test_project.yaml index 23f9eff2..05f31fcd 100644 --- a/src/psycopt2d/config/project/integration_test_project.yaml +++ b/src/psycopt2d/config/project/integration_test_project.yaml @@ -6,4 +6,5 @@ wandb_entity: "psycop-t2d-testing" # Which entity to run WanDB in. watcher: archive_all: true keep_alive_after_training_minutes: 5 + n_runs_before_eval: 1 gpu: false diff --git a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py index c1e90875..a7946edd 100644 --- a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py +++ b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py @@ -40,7 +40,7 @@ class ProjectConf(BaseModel): wandb_group: str wandb_mode: str wandb_entity: str - watcher: dict + watcher: WatcherConf gpu: bool From 1095109843a31a591a3518671d321903481db4c4 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 21 Oct 2022 13:04:46 +0200 Subject: [PATCH 24/57] style: linting --- src/psycopt2d/load.py | 2 +- src/psycopt2d/model_training_watcher.py | 2 +- src/psycopt2d/utils/omegaconf_to_pydantic_objects.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py index 1403df98..1aef6ea7 100644 --- a/src/psycopt2d/load.py +++ b/src/psycopt2d/load.py @@ -297,7 +297,7 @@ def _drop_cols_not_in_lookbehind_combination( ] cols_to_drop = [c for c in cols_to_drop if "within" in c] - # TODO: Add some specification of within_x_days indicating how to parse columns to find lookbehinds. Or, alternatively, use the column spec. + # ? Add some specification of within_x_days indicating how to parse columns to find lookbehinds. Or, alternatively, use the column spec. dataset = dataset.drop(columns=cols_to_drop) return dataset diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py index 9eb427f8..1c8791a4 100644 --- a/src/psycopt2d/model_training_watcher.py +++ b/src/psycopt2d/model_training_watcher.py @@ -4,7 +4,7 @@ import time from distutils.util import strtobool # pylint: disable=deprecated-module from pathlib import Path -from typing import Optional, Union +from typing import Optional import wandb from wandb.apis.public import Api # pylint: disable=no-name-in-module diff --git a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py index a7946edd..380e6f85 100644 --- a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py +++ b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py @@ -95,7 +95,7 @@ class ModelConf(BaseModel): class TrainConf(BaseModel): """Training configuration.""" - n_splits: int # TODO: How do we handle whether to use crossvalidation or train/val splitting? + n_splits: int # ? How do we handle whether to use crossvalidation or train/val splitting? n_trials_per_lookdirection_combination: int From 420da4cb4af70ab28a1504da861acd8b7d3c3f86 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 21 Oct 2022 14:48:30 +0200 Subject: [PATCH 25/57] fix: misc. fixes and refactor --- application/train_and_log_models.py | 10 +- .../config/project/default_project.yaml | 6 +- src/psycopt2d/configs.py | 5 +- src/psycopt2d/evaluation.py | 13 ++- src/psycopt2d/load.py | 109 ++++-------------- src/psycopt2d/model_training_watcher.py | 40 ++++--- src/psycopt2d/train_model.py | 1 - .../utils/omegaconf_to_pydantic_objects.py | 5 +- src/psycopt2d/utils/utils.py | 2 +- 9 files changed, 72 insertions(+), 119 deletions(-) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index b3df5c9f..9457808c 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -95,7 +95,7 @@ def train_models_for_each_cell_in_grid( while lookbehind_combinations: # Loop to run if enough trainers have been spawned - if len(active_trainers) >= 4: + if len(active_trainers) >= 1: # TODO: Add to conf. active_trainers = [t for t in active_trainers if t.poll() is None] time.sleep(1) continue @@ -147,6 +147,8 @@ def train_models_for_each_cell_in_grid( cfg = omegaconf_to_pydantic_objects(cfg) + # TODO: Watcher must be instantiated once for each cell in the grid, otherwise + # it will compare max performances across all cells. watcher = subprocess.Popen( # pylint: disable=consider-using-with [ "python", @@ -158,11 +160,13 @@ def train_models_for_each_cell_in_grid( "--n_runs_before_eval", str(cfg.project.watcher.n_runs_before_eval), "--overtaci", - cfg.eval.save_model_predictions_on_overtaci, + str(cfg.eval.save_model_predictions_on_overtaci), "--timeout", "None", "--clean_wandb_dir", - cfg.project.watcher.archive_all, + str(cfg.project.watcher.archive_all), + "--verbose", + "True", ], ) diff --git a/src/psycopt2d/config/project/default_project.yaml b/src/psycopt2d/config/project/default_project.yaml index e66a50b4..3bec2ad6 100644 --- a/src/psycopt2d/config/project/default_project.yaml +++ b/src/psycopt2d/config/project/default_project.yaml @@ -1,10 +1,10 @@ name: psycop-t2d seed: 42 -wandb_mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" +wandb_mode: "offline" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" wandb_group: "psycop-t2d" # Which group to run WanDB in. -wandb_entity: "psycop-t2d-testing" # Which entity to run WanDB in. +wandb_entity: "psycop" # Which entity to run WanDB in. watcher: - archive_all: true + archive_all: false keep_alive_after_training_minutes: 5 n_runs_before_eval: 1 gpu: false diff --git a/src/psycopt2d/configs.py b/src/psycopt2d/configs.py index e809d072..76ba6b02 100644 --- a/src/psycopt2d/configs.py +++ b/src/psycopt2d/configs.py @@ -2,9 +2,10 @@ from typing import Optional import pandas as pd -from omegaconf import DictConfig from pydantic import BaseModel +from psycopt2d.utils.omegaconf_to_pydantic_objects import FullConfig + # pylint: disable=missing-class-docstring, too-few-public-methods @@ -15,5 +16,5 @@ class Config: arbitrary_types_allowed = True df: pd.DataFrame - cfg: DictConfig + cfg: FullConfig feature_importance_dict: Optional[dict[str, float]] = None diff --git a/src/psycopt2d/evaluation.py b/src/psycopt2d/evaluation.py index cf345975..39b2f308 100644 --- a/src/psycopt2d/evaluation.py +++ b/src/psycopt2d/evaluation.py @@ -14,6 +14,7 @@ from psycopt2d.tables.performance_by_threshold import ( generate_performance_by_positive_rate_table, ) +from psycopt2d.utils.omegaconf_to_pydantic_objects import FullConfig from psycopt2d.utils.utils import PROJECT_ROOT, positive_rate_to_pred_probs from psycopt2d.visualization import ( plot_auc_by_time_from_first_visit, @@ -37,7 +38,7 @@ def log_feature_importances( feature_importance_plot_path = plot_feature_importances( feature_names=feature_importance_dict.keys(), feature_importances=feature_importance_dict.values(), - top_n_feature_importances=cfg.evaluation.top_n_feature_importances, + top_n_feature_importances=cfg.eval.top_n_feature_importances, save_path=save_path, ) @@ -53,7 +54,7 @@ def log_feature_importances( def evaluate_model( - cfg, + cfg: FullConfig, eval_df: pd.DataFrame, y_col_name: str, y_hat_prob_col_name: str, @@ -94,8 +95,8 @@ def evaluate_model( pred_timestamps = eval_df[cfg.data.pred_timestamp_col_name] y_hat_int = np.round(y_hat_probs, 0) - date_bins_ahead: Iterable[int] = cfg.evaluation.date_bins_ahead - date_bins_behind: Iterable[int] = cfg.evaluation.date_bins_behind + date_bins_ahead: Iterable[int] = cfg.eval.date_bins_ahead + date_bins_behind: Iterable[int] = cfg.eval.date_bins_behind # Drop date_bins_direction if they are further away than min_lookdirection_days if cfg.data.min_lookbehind_days: @@ -121,7 +122,7 @@ def evaluate_model( pred_proba_thresholds = positive_rate_to_pred_probs( pred_probs=y_hat_probs, - positive_rate_thresholds=cfg.evaluation.positive_rate_thresholds, + positive_rate_thresholds=cfg.eval.positive_rate_thresholds, ) msg.info(f"AUC: {auc}") @@ -132,7 +133,7 @@ def evaluate_model( performance_by_threshold_df = generate_performance_by_positive_rate_table( labels=y, pred_probs=y_hat_probs, - positive_rate_thresholds=cfg.evaluation.positive_rate_thresholds, + positive_rate_thresholds=cfg.eval.positive_rate_thresholds, pred_proba_thresholds=pred_proba_thresholds, ids=eval_df[cfg.data.id_col_name], pred_timestamps=pred_timestamps, diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py index 1aef6ea7..08d08a46 100644 --- a/src/psycopt2d/load.py +++ b/src/psycopt2d/load.py @@ -2,6 +2,7 @@ import re from collections.abc import Iterable from datetime import datetime, timedelta +from multiprocessing.sharedctypes import Value from pathlib import Path from typing import Any, Optional, Union @@ -17,61 +18,13 @@ PROJECT_ROOT, coerce_to_datetime, get_percent_lost, + infer_outcome_col_name, infer_predictor_col_name, ) msg = Printer(timestamp=True) -class DatasetTimeSpecification(BaseModel): - """Specification of the time range of the dataset.""" - - drop_patient_if_outcome_before_date: Optional[Union[str, datetime]] = Field( - description="""If a patient experiences the outcome before this date, all their prediction times will be dropped. - Used for wash-in, to avoid including patients who were probably already experiencing the outcome before the study began.""", - ) - - min_prediction_time_date: Optional[Union[str, datetime]] = Field( - description="""Any prediction time before this date will be dropped.""", - ) - - min_lookbehind_days: Optional[Union[int, float]] = Field( - description="""If the distance from the prediction time to the start of the dataset is less than this, the prediction time will be dropped""", - ) - - min_lookahead_days: Optional[Union[int, float]] = Field( - description="""If the distance from the prediction time to the end of the dataset is less than this, the prediction time will be dropped""", - ) - - lookbehind_combination: Optional[list[Union[int, float]]] = Field( - description="""List containing a combination of lookbehind windows (e.g. [30, 60, 90]) which determines which features to keep in the dataset. E.g. for the above list, only features with lookbehinds of 30, 60 or 90 days will be kept.""", - ) - - -class DatasetSpecification(BaseModel): - """Specification for loading a dataset.""" - - split_dir_path: Union[str, Path] = Field( - description="""Path to the directory containing the split files.""", - ) - - file_suffix: str = Field( - description="""Suffix of the split files. E.g. 'parquet' or 'csv'.""", - default="parquet", - ) - - time: DatasetTimeSpecification - - pred_col_name_prefix: str = Field( - default="pred_", - description="""Prefix for the prediction column names.""", - ) - pred_time_colname: str = Field( - default="timestamp", - description="""Column name for with timestamps for prediction times""", - ) - - def load_timestamp_for_any_diabetes(): """Loads timestamps for the broad definition of diabetes used for wash-in. @@ -412,6 +365,24 @@ def _drop_cols_and_rows_if_look_direction_not_met( return dataset + def _keep_unique_outcome_col_with_lookahead_days_matching_conf( + self, dataset: pd.DataFrame + ) -> pd.DataFrame: + """Keep only one outcome column with the same lookahead days as set in the config.""" + outcome_cols = infer_outcome_col_name(df=dataset, allow_multiple=True) + col_to_drop = [ + c for c in outcome_cols if str(self.cfg.data.lookahead_days) not in c + ] + + df = dataset.drop(col_to_drop, axis=1) + + if not isinstance(infer_outcome_col_name(df), str): + raise ValueError( + "Returning more than one outcome column, will cause problems during eval." + ) + + return df + def _process_dataset(self, dataset: pd.DataFrame) -> pd.DataFrame: """Process dataset, namely: @@ -442,6 +413,10 @@ def _process_dataset(self, dataset: pd.DataFrame) -> pd.DataFrame: if self.cfg.data.lookbehind_combination: dataset = self._drop_cols_not_in_lookbehind_combination(dataset=dataset) + dataset = self._keep_unique_outcome_col_with_lookahead_days_matching_conf( + dataset=dataset + ) + return dataset def load_dataset_from_dir( @@ -503,42 +478,6 @@ def load_dataset_from_dir( return dataset -def _init_spec_from_cfg( - cfg: DictConfig, -) -> DatasetSpecification: - """Initialise a feature spec from a DictConfig.""" - data_cfg: dict[str, Any] = OmegaConf.to_container( # type: ignore - cfg.data, - resolve=True, - ) - - if data_cfg["suffix"] == "synthetic": - split_dir_path = PROJECT_ROOT / "tests" / "test_data" / "synth_splits" - file_suffix = "csv" - else: - split_dir_path = data_cfg["dir"] - file_suffix = data_cfg["suffix"] - - time_spec = DatasetTimeSpecification( - drop_patient_if_outcome_before_date=data_cfg[ - "drop_patient_if_outcome_before_date" - ], - min_lookahead_days=data_cfg["min_lookahead_days"], - min_lookbehind_days=data_cfg["min_lookbehind_days"], - min_prediction_time_date=data_cfg["min_prediction_time_date"], - lookbehind_combination=data_cfg["lookbehind_combination"], - ) - - return DatasetSpecification( - split_dir_path=split_dir_path, - pred_col_name_prefix=data_cfg["pred_col_name_prefix"], - file_suffix=file_suffix, - pred_time_colname=data_cfg["pred_timestamp_col_name"], - n_training_samples=data_cfg["n_training_samples"], - time=time_spec, - ) - - class SplitDataset(BaseModel): """A dataset split into train, test and optionally validation.""" diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py index 1c8791a4..acf4017d 100644 --- a/src/psycopt2d/model_training_watcher.py +++ b/src/psycopt2d/model_training_watcher.py @@ -13,13 +13,10 @@ from psycopt2d.configs import ModelEvalData from psycopt2d.evaluation import evaluate_model -from psycopt2d.utils.utils import ( - MODEL_PREDICTIONS_PATH, - PROJECT_ROOT, - infer_outcome_col_name, - infer_y_hat_prob_col_name, - load_evaluation_data, -) +from psycopt2d.utils.utils import (MODEL_PREDICTIONS_PATH, PROJECT_ROOT, + infer_outcome_col_name, + infer_y_hat_prob_col_name, + load_evaluation_data) # Path to the wandb directory WANDB_DIR = PROJECT_ROOT / "wandb" @@ -79,8 +76,9 @@ def watch(self, timeout_minutes: Optional[int] = None) -> None: def get_new_runs_and_evaluate(self) -> None: """Get new runs and evaluate the best runs.""" self.upload_unarchived_runs() + if len(self.run_id_eval_candidates_queue) >= self.n_runs_before_eval: - self.evaluate_best_runs() + self.evaluate_and_upload_records_and_archive() def _upload_run_dir(self, run_dir: Path) -> str: """Upload a single run to wandb.""" @@ -110,21 +108,25 @@ def upload_unarchived_runs(self) -> None: wandb_sync_stdout = self._upload_run_dir(run_folder) - if ".wandb file is empty" in wandb_sync_stdout: - if self.verbose: - msg.warn(f"Run {run_id} is still running. Skipping.") + if not "...done" in wandb_sync_stdout: + if ".wandb file is empty" in wandb_sync_stdout: + if self.verbose: + msg.warn(f"Run {run_id} is still running. Skipping.") + else: + raise ValueError( + f"wandb sync failed, returned: {wandb_sync_stdout}" + ) continue - self._archive_run_dir(run_folder) self.run_id_eval_candidates_queue.append(run_id) - def _get_run_evaluation_dir(self, run_id: str) -> Path: + def _get_run_evaluation_data_dir(self, run_id: str) -> Path: """Get the evaluation path for a single run.""" return list(self.model_data_dir.glob(f"*{run_id}*"))[0] def _get_eval_data(self, run_id: str) -> ModelEvalData: """Get the evaluation data for a single run.""" - run_eval_dir = self._get_run_evaluation_dir(run_id) + run_eval_dir = self._get_run_evaluation_data_dir(run_id) return load_evaluation_data(run_eval_dir) @@ -153,6 +155,9 @@ def _get_wandb_run(self, run_id: str) -> Run: """Get the wandb run object from the run id.""" return Api().run(f"{self.entity}/{self.project_name}/{run_id}") + def _get_run_wandb_dir(self, run_id: str) -> Path: + return list(WANDB_DIR.glob(f"*offline-run*{run_id}*"))[0] + def _get_run_performance(self, run_id: str) -> Optional[float]: """Get the performance of a single run and check if it failed.""" run = self._get_wandb_run(run_id) @@ -160,11 +165,11 @@ def _get_run_performance(self, run_id: str) -> Optional[float]: return run.summary.roc_auc_unweighted if self.verbose: msg.info( - f"Run {run_id} has no performance metric. Pinging again at next eval time.", + f"Watcher: Run {run_id} has no performance metric. Pinging again at next eval time.", ) return None - def evaluate_best_runs(self) -> None: + def evaluate_and_upload_records_and_archive(self) -> None: """Evaluate the best runs.""" run_performances = { run_id: self._get_run_performance(run_id) @@ -188,6 +193,7 @@ def evaluate_best_runs(self) -> None: msg.good(f"New record performance! AUC: {performance}") self.max_performance = performance self._do_evaluation(run_id) + self._archive_run_dir(run_dir=self._get_run_wandb_dir(run_id)) # reset run id queue and try to upload unfinished runs next time self.run_id_eval_candidates_queue = unfinished_runs @@ -261,5 +267,5 @@ def float_or_none(arg: str) -> Optional[float]: if args.clean_wandb_dir: watcher.archive_all_runs() - msg.info("Starting WandB watcher") + msg.info("Watcher: Starting WandB watcher") watcher.watch(timeout_minutes=args.timeout) diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py index 44ecbfe0..797b7ebe 100644 --- a/src/psycopt2d/train_model.py +++ b/src/psycopt2d/train_model.py @@ -32,7 +32,6 @@ ) CONFIG_PATH = PROJECT_ROOT / "src" / "psycopt2d" / "config" -TRAINING_COL_NAME_PREFIX = "pred_" # Handle wandb not playing nice with joblib os.environ["WANDB_START_METHOD"] = "thread" diff --git a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py index 380e6f85..f13fe66f 100644 --- a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py +++ b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py @@ -54,7 +54,7 @@ class DataConf(BaseModel): suffix: str # File suffix to load. # Feature specs - pred_col_name_prefix: str # (str): prefix of predictor columns + pred_col_name_prefix: str # prefix of predictor columns pred_timestamp_col_name: str # (str): Column name for prediction times outcome_timestamp_col_name: str # (str): Column name for outcome timestamps id_col_name: str # (str): Citizen colnames @@ -124,6 +124,9 @@ class FullConfig(BaseModel): eval: EvalConf +# ? Should FullConfig be here or in another location? + + def omegaconf_to_pydantic_objects(conf: DictConfig) -> FullConfig: """Converts an omegaconf DictConfig to a pydantic object. diff --git a/src/psycopt2d/utils/utils.py b/src/psycopt2d/utils/utils.py index 3a2a8d18..33cfa289 100644 --- a/src/psycopt2d/utils/utils.py +++ b/src/psycopt2d/utils/utils.py @@ -28,7 +28,7 @@ FEATURIZERS_PATH = SHARED_RESOURCES_PATH / "featurizers" MODEL_PREDICTIONS_PATH = SHARED_RESOURCES_PATH / "model_predictions" -PROJECT_ROOT = Path(__file__).resolve().parents[2] +PROJECT_ROOT = Path(__file__).resolve().parents[3] def format_dict_for_printing(d: dict) -> str: From 51f080da7b72831391a6d6a0a99cfec837829c5a Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 21 Oct 2022 14:49:58 +0200 Subject: [PATCH 26/57] style: linting --- src/psycopt2d/load.py | 19 +++++++++---------- src/psycopt2d/model_training_watcher.py | 13 ++++++++----- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py index 08d08a46..50331d50 100644 --- a/src/psycopt2d/load.py +++ b/src/psycopt2d/load.py @@ -1,21 +1,18 @@ """Loader for the t2d dataset.""" import re from collections.abc import Iterable -from datetime import datetime, timedelta -from multiprocessing.sharedctypes import Value +from datetime import timedelta from pathlib import Path -from typing import Any, Optional, Union +from typing import Optional, Union import pandas as pd -from omegaconf import DictConfig, OmegaConf from psycopmlutils.sql.loader import sql_load -from pydantic import BaseModel, Field +from pydantic import BaseModel from wasabi import Printer from psycopt2d.evaluate_saved_model_predictions import infer_look_distance from psycopt2d.utils.omegaconf_to_pydantic_objects import FullConfig from psycopt2d.utils.utils import ( - PROJECT_ROOT, coerce_to_datetime, get_percent_lost, infer_outcome_col_name, @@ -366,9 +363,11 @@ def _drop_cols_and_rows_if_look_direction_not_met( return dataset def _keep_unique_outcome_col_with_lookahead_days_matching_conf( - self, dataset: pd.DataFrame + self, + dataset: pd.DataFrame, ) -> pd.DataFrame: - """Keep only one outcome column with the same lookahead days as set in the config.""" + """Keep only one outcome column with the same lookahead days as set in + the config.""" outcome_cols = infer_outcome_col_name(df=dataset, allow_multiple=True) col_to_drop = [ c for c in outcome_cols if str(self.cfg.data.lookahead_days) not in c @@ -378,7 +377,7 @@ def _keep_unique_outcome_col_with_lookahead_days_matching_conf( if not isinstance(infer_outcome_col_name(df), str): raise ValueError( - "Returning more than one outcome column, will cause problems during eval." + "Returning more than one outcome column, will cause problems during eval.", ) return df @@ -414,7 +413,7 @@ def _process_dataset(self, dataset: pd.DataFrame) -> pd.DataFrame: dataset = self._drop_cols_not_in_lookbehind_combination(dataset=dataset) dataset = self._keep_unique_outcome_col_with_lookahead_days_matching_conf( - dataset=dataset + dataset=dataset, ) return dataset diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py index acf4017d..b981fb74 100644 --- a/src/psycopt2d/model_training_watcher.py +++ b/src/psycopt2d/model_training_watcher.py @@ -13,10 +13,13 @@ from psycopt2d.configs import ModelEvalData from psycopt2d.evaluation import evaluate_model -from psycopt2d.utils.utils import (MODEL_PREDICTIONS_PATH, PROJECT_ROOT, - infer_outcome_col_name, - infer_y_hat_prob_col_name, - load_evaluation_data) +from psycopt2d.utils.utils import ( + MODEL_PREDICTIONS_PATH, + PROJECT_ROOT, + infer_outcome_col_name, + infer_y_hat_prob_col_name, + load_evaluation_data, +) # Path to the wandb directory WANDB_DIR = PROJECT_ROOT / "wandb" @@ -114,7 +117,7 @@ def upload_unarchived_runs(self) -> None: msg.warn(f"Run {run_id} is still running. Skipping.") else: raise ValueError( - f"wandb sync failed, returned: {wandb_sync_stdout}" + f"wandb sync failed, returned: {wandb_sync_stdout}", ) continue From e152029c1d5040e21557ae1fbf0ecaa9da39fa74 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 21 Oct 2022 14:50:44 +0200 Subject: [PATCH 27/57] style: linting --- src/psycopt2d/model_training_watcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py index b981fb74..936200e5 100644 --- a/src/psycopt2d/model_training_watcher.py +++ b/src/psycopt2d/model_training_watcher.py @@ -111,7 +111,7 @@ def upload_unarchived_runs(self) -> None: wandb_sync_stdout = self._upload_run_dir(run_folder) - if not "...done" in wandb_sync_stdout: + if "...done" not in wandb_sync_stdout: if ".wandb file is empty" in wandb_sync_stdout: if self.verbose: msg.warn(f"Run {run_id} is still running. Skipping.") From d2f4952611c88c363b6d1cbad05eaf9468d71dac Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 21 Oct 2022 15:06:12 +0200 Subject: [PATCH 28/57] misc. refactors --- application/train_and_log_models.py | 139 ++++++++++-------- src/psycopt2d/config/data/t2d_parquet.yaml | 19 +-- .../config/project/default_project.yaml | 12 +- .../config/train/default_training.yaml | 2 + src/psycopt2d/configs.py | 2 +- src/psycopt2d/evaluation.py | 2 +- src/psycopt2d/load.py | 2 +- src/psycopt2d/train_model.py | 5 +- ...conf_to_pydantic_objects.py => configs.py} | 16 +- src/psycopt2d/utils/utils.py | 2 +- tests/test_load.py | 2 +- tests/test_train_model.py | 2 +- tests/test_utils.py | 2 +- 13 files changed, 116 insertions(+), 91 deletions(-) rename src/psycopt2d/utils/{omegaconf_to_pydantic_objects.py => configs.py} (96%) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index 9457808c..fb2cb5ce 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -9,6 +9,7 @@ import subprocess import time +import pandas as pd from hydra import compose, initialize from pydantic import BaseModel from wasabi import Printer @@ -19,10 +20,7 @@ infer_predictor_col_name, ) from psycopt2d.load import DataLoader -from psycopt2d.utils.omegaconf_to_pydantic_objects import ( - FullConfig, - omegaconf_to_pydantic_objects, -) +from psycopt2d.utils.configs import FullConfig, omegaconf_to_pydantic_objects msg = Printer(timestamp=True) @@ -41,16 +39,16 @@ def load_train_for_inference(cfg: FullConfig): return loader.load_dataset_from_dir(split_names="train") -def infer_possible_look_directions(train): +def infer_possible_look_distances(df: pd.DataFrame) -> PossibleLookDistanceDays: """Infer the possible values for min_lookahead_days and min_lookbehind_days.""" # Get potential lookaheads from outc_ columns - outcome_col_names = infer_outcome_col_name(df=train, allow_multiple=True) + outcome_col_names = infer_outcome_col_name(df=df, allow_multiple=True) possible_lookahead_days = infer_look_distance(col_name=outcome_col_names) # Get potential lookbehinds from pred_ columns - pred_col_names = infer_predictor_col_name(df=train, allow_multiple=True) + pred_col_names = infer_predictor_col_name(df=df, allow_multiple=True) possible_lookbehind_days = list(set(infer_look_distance(col_name=pred_col_names))) return PossibleLookDistanceDays( @@ -83,10 +81,6 @@ def train_models_for_each_cell_in_grid( for lookahead in possible_look_distances.ahead ] - lookbehind_combinations = [ - comb for comb in lookbehind_combinations if comb.lookahead <= 1095 - ] - random.shuffle(lookbehind_combinations) active_trainers: list[subprocess.Popen] = [] @@ -95,61 +89,65 @@ def train_models_for_each_cell_in_grid( while lookbehind_combinations: # Loop to run if enough trainers have been spawned - if len(active_trainers) >= 1: # TODO: Add to conf. + if len(active_trainers) >= cfg.train.active_trainers: active_trainers = [t for t in active_trainers if t.poll() is None] time.sleep(1) continue - cell = lookbehind_combinations.pop() + combination = lookbehind_combinations.pop() + msg.info( - f"Spawning a new trainer with lookbehind={cell.lookbehind} and lookahead={cell.lookahead}", + f"Spawning a new trainer with lookbehind={combination.lookbehind} and lookahead={combination.lookahead}", ) - wandb_group = f"{wandb_prefix}-beh-{cell.lookbehind}-ahead-{cell.lookahead}" - - subprocess_args: list[str] = [ - "python", - "src/psycopt2d/train_model.py", - f"model={cfg.model.model_name}", - f"data.min_lookbehind_days={cell.lookbehind}", - f"data.min_lookahead_days={cell.lookahead}", - f"project.wandb_group='{wandb_group}'", - f"hydra.sweeper.n_trials={cfg.train.n_trials_per_lookdirection_combination}", - f"project.wandb_mode={cfg.project.wandb_mode}", - "--config-name", - f"{config_file_name}", - ] - - if cfg.train.n_trials_per_lookdirection_combination > 1: - subprocess_args.insert(2, "--multirun") - - if cfg.model.model_name == "xgboost" and not cfg.project.gpu: - subprocess_args.insert(3, "++model.args.tree_method='auto'") - - msg.info(f'{" ".join(subprocess_args)}') + wandb_group = ( + f"{wandb_prefix}-beh-{combination.lookbehind}-ahead-{combination.lookahead}" + ) active_trainers.append( - subprocess.Popen( # pylint: disable=consider-using-with - args=subprocess_args, - ), + start_trainer( + cfg=cfg, + config_file_name=config_file_name, + cell=combination, + wandb_group=wandb_group, + ) ) -if __name__ == "__main__": - msg = Printer(timestamp=True) +def start_trainer( + cfg: FullConfig, + config_file_name: str, + cell: LookDirectionCombination, + wandb_group: str, +): + subprocess_args: list[str] = [ + "python", + "src/psycopt2d/train_model.py", + f"model={cfg.model.model_name}", + f"data.min_lookbehind_days={cell.lookbehind}", + f"data.min_lookahead_days={cell.lookahead}", + f"project.wandb_group='{wandb_group}'", + f"hydra.sweeper.n_trials={cfg.train.n_trials_per_lookdirection_combination}", + f"project.wandb_mode={cfg.project.wandb_mode}", + "--config-name", + f"{config_file_name}", + ] - CONFIG_FILE_NAME = "default_config.yaml" + if cfg.train.n_trials_per_lookdirection_combination > 1: + subprocess_args.insert(2, "--multirun") - with initialize(version_base=None, config_path="../src/psycopt2d/config/"): - cfg = compose( - config_name=CONFIG_FILE_NAME, - ) + if cfg.model.model_name == "xgboost" and not cfg.train.gpu: + subprocess_args.insert(3, "++model.args.tree_method='auto'") - cfg = omegaconf_to_pydantic_objects(cfg) + msg.info(f'{" ".join(subprocess_args)}') - # TODO: Watcher must be instantiated once for each cell in the grid, otherwise - # it will compare max performances across all cells. - watcher = subprocess.Popen( # pylint: disable=consider-using-with + return subprocess.Popen( # pylint: disable=consider-using-with + args=subprocess_args, + ) + + +def start_watcher(cfg): + return subprocess.Popen( # pylint: disable=consider-using-with [ "python", "src/psycopt2d/model_training_watcher.py", @@ -170,31 +168,36 @@ def train_models_for_each_cell_in_grid( ], ) - train = load_train_for_inference(cfg=cfg) - possible_look_distances = infer_possible_look_directions(train) +def main(): + msg = Printer(timestamp=True) + + config_file_name = "integration_testing.yaml" + + cfg = load_cfg(config_file_name=config_file_name) + # TODO: Watcher must be instantiated once for each cell in the grid, otherwise + # it will compare max performances across all cells. + watcher = start_watcher(cfg) + train = load_train_for_inference(cfg=cfg) + possible_look_distances = infer_possible_look_distances(df=train) # Remove "9999" from possible look distances behind possible_look_distances.behind = [ - dist for dist in possible_look_distances.behind if dist != "9999" + dist + for dist in possible_look_distances + if not int(dist) > cfg.data.max_lookbehind_days ] msg.info(f"Possible lookbehind days: {possible_look_distances.behind}") msg.info(f"Possible lookahead days: {possible_look_distances.ahead}") - if not cfg.project.gpu: + if not cfg.train.gpu: msg.warn("Not using GPU for training") - CLEAN_DIR_SECONDS = 0 - msg.info( - f"Sleeping for {CLEAN_DIR_SECONDS} seconds to allow watcher to start and clean dir", - ) - time.sleep(CLEAN_DIR_SECONDS) - train_models_for_each_cell_in_grid( cfg=cfg, possible_look_distances=possible_look_distances, - config_file_name=CONFIG_FILE_NAME, + config_file_name=config_file_name, ) msg.good( @@ -204,3 +207,17 @@ def train_models_for_each_cell_in_grid( time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes) watcher.kill() msg.good("Watcher stopped.") + + +def load_cfg(config_file_name): + with initialize(version_base=None, config_path="../src/psycopt2d/config/"): + cfg = compose( + config_name=config_file_name, + ) + + cfg = omegaconf_to_pydantic_objects(cfg) + return cfg + + +if __name__ == "__main__": + main() diff --git a/src/psycopt2d/config/data/t2d_parquet.yaml b/src/psycopt2d/config/data/t2d_parquet.yaml index 611b3d02..6c1ee3ac 100644 --- a/src/psycopt2d/config/data/t2d_parquet.yaml +++ b/src/psycopt2d/config/data/t2d_parquet.yaml @@ -1,24 +1,25 @@ # @package _global_ data: # General config - n_training_samples: null # (int, null): Number of training samples to use, defaults to null in which cases it uses all samples. + n_training_samples: null dir: E:\shared_resources\feature_sets\t2d\feature_sets\psycop_t2d_adminmanber_416_features_2022_10_20_11_12 - suffix: parquet # File suffix to load. + suffix: parquet # Feature specs - pred_col_name_prefix: "pred_" # (str): prefix of predictor columns - pred_timestamp_col_name: timestamp # (str): Column name for prediction times - outcome_timestamp_col_name: _timestamp_first_t2d # (str): Column name for outcome timestamps - id_col_name: dw_ek_borger # (str): Citizen colnames + pred_col_name_prefix: "pred_" + pred_timestamp_col_name: timestamp + outcome_timestamp_col_name: _timestamp_first_t2d + id_col_name: dw_ek_borger # Looking ahead - lookahead_days: 365 # (float): Number of days from prediction time to look ahead for the outcome. - min_lookahead_days: 365 # (int): Drop all prediction times where (max timestamp in the dataset) - (current timestamp) is less than min_lookahead_days + lookahead_days: 365 + min_lookahead_days: 365 drop_patient_if_outcome_before_date: null # Looking behind min_prediction_time_date: 2013-01-01 - min_lookbehind_days: 365 # (int): Drop all prediction times where (prediction_timestamp) - (min timestamp in the dataset) is less than min_lookbehind_days + min_lookbehind_days: 365 + max_lookbehind_days: 3650 lookbehind_combination: [30, 90, 180, 365] # Parameters that will only take effect if running with --multirun diff --git a/src/psycopt2d/config/project/default_project.yaml b/src/psycopt2d/config/project/default_project.yaml index 3bec2ad6..837cfc78 100644 --- a/src/psycopt2d/config/project/default_project.yaml +++ b/src/psycopt2d/config/project/default_project.yaml @@ -1,10 +1,12 @@ name: psycop-t2d seed: 42 -wandb_mode: "offline" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" -wandb_group: "psycop-t2d" # Which group to run WanDB in. -wandb_entity: "psycop" # Which entity to run WanDB in. + +wandb: + entity: "psycop" # Which entity to run WanDB in. + mode: "offline" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" + group: "psycop-t2d" # Which group to run WanDB in. + watcher: archive_all: false keep_alive_after_training_minutes: 5 - n_runs_before_eval: 1 -gpu: false + n_runs_before_eval: 1 \ No newline at end of file diff --git a/src/psycopt2d/config/train/default_training.yaml b/src/psycopt2d/config/train/default_training.yaml index 9ecc02a5..84a98c92 100644 --- a/src/psycopt2d/config/train/default_training.yaml +++ b/src/psycopt2d/config/train/default_training.yaml @@ -1,2 +1,4 @@ n_splits: 2 # (int, Null): Number of k-folds during CV. If Null, loads pre-defined dataset. n_trials_per_lookdirection_combination: 1 +gpu: false +active_trainers: 4 \ No newline at end of file diff --git a/src/psycopt2d/configs.py b/src/psycopt2d/configs.py index 76ba6b02..3e4beb6c 100644 --- a/src/psycopt2d/configs.py +++ b/src/psycopt2d/configs.py @@ -4,7 +4,7 @@ import pandas as pd from pydantic import BaseModel -from psycopt2d.utils.omegaconf_to_pydantic_objects import FullConfig +from psycopt2d.utils.configs import FullConfig # pylint: disable=missing-class-docstring, too-few-public-methods diff --git a/src/psycopt2d/evaluation.py b/src/psycopt2d/evaluation.py index 39b2f308..17a7d1b6 100644 --- a/src/psycopt2d/evaluation.py +++ b/src/psycopt2d/evaluation.py @@ -14,7 +14,7 @@ from psycopt2d.tables.performance_by_threshold import ( generate_performance_by_positive_rate_table, ) -from psycopt2d.utils.omegaconf_to_pydantic_objects import FullConfig +from psycopt2d.utils.configs import FullConfig from psycopt2d.utils.utils import PROJECT_ROOT, positive_rate_to_pred_probs from psycopt2d.visualization import ( plot_auc_by_time_from_first_visit, diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py index 50331d50..6e08b807 100644 --- a/src/psycopt2d/load.py +++ b/src/psycopt2d/load.py @@ -11,7 +11,7 @@ from wasabi import Printer from psycopt2d.evaluate_saved_model_predictions import infer_look_distance -from psycopt2d.utils.omegaconf_to_pydantic_objects import FullConfig +from psycopt2d.utils.configs import FullConfig from psycopt2d.utils.utils import ( coerce_to_datetime, get_percent_lost, diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py index 797b7ebe..84fb1328 100644 --- a/src/psycopt2d/train_model.py +++ b/src/psycopt2d/train_model.py @@ -19,10 +19,7 @@ from psycopt2d.feature_transformers import ConvertToBoolean, DateTimeConverter from psycopt2d.load import load_train_and_val_from_cfg from psycopt2d.models import MODELS -from psycopt2d.utils.omegaconf_to_pydantic_objects import ( - FullConfig, - omegaconf_to_pydantic_objects, -) +from psycopt2d.utils.configs import FullConfig, omegaconf_to_pydantic_objects from psycopt2d.utils.utils import ( PROJECT_ROOT, create_wandb_folders, diff --git a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py b/src/psycopt2d/utils/configs.py similarity index 96% rename from src/psycopt2d/utils/omegaconf_to_pydantic_objects.py rename to src/psycopt2d/utils/configs.py index f13fe66f..58a58e66 100644 --- a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py +++ b/src/psycopt2d/utils/configs.py @@ -24,6 +24,12 @@ class Config: arbitrary_types_allowed = True +class WandbConf(BaseModel): + group: str + mode: str + entity: str + + class WatcherConf(BaseModel): """Configuration for watchers.""" @@ -37,11 +43,7 @@ class ProjectConf(BaseModel): name: str = "psycopt2d" seed: int - wandb_group: str - wandb_mode: str - wandb_entity: str watcher: WatcherConf - gpu: bool class DataConf(BaseModel): @@ -64,12 +66,13 @@ class DataConf(BaseModel): min_lookahead_days: Optional[ int ] # (int): Drop all prediction times where (max timestamp in the dataset) - (current timestamp) is less than min_lookahead_days - min_lookbehind_days: Optional[int] drop_patient_if_outcome_before_date: Optional[Union[str, datetime]] # Looking behind # (int): Drop all prediction times where (prediction_timestamp) - (min timestamp in the dataset) is less than min_lookbehind_days min_prediction_time_date: Optional[Union[str, datetime]] + min_lookbehind_days: Optional[int] + max_lookbehind_days: Optional[int] lookbehind_combination: Optional[list[int]] @@ -97,6 +100,8 @@ class TrainConf(BaseModel): n_splits: int # ? How do we handle whether to use crossvalidation or train/val splitting? n_trials_per_lookdirection_combination: int + gpu: bool + active_trainers: int class EvalConf(BaseModel): @@ -116,6 +121,7 @@ class EvalConf(BaseModel): class FullConfig(BaseModel): """A full configuration object.""" + wandb: WandbConf project: ProjectConf data: DataConf preprocessing: PreprocessingConf diff --git a/src/psycopt2d/utils/utils.py b/src/psycopt2d/utils/utils.py index 33cfa289..3ad4bcb5 100644 --- a/src/psycopt2d/utils/utils.py +++ b/src/psycopt2d/utils/utils.py @@ -19,7 +19,7 @@ from psycopt2d.configs import ModelEvalData from psycopt2d.model_performance import ModelPerformance -from psycopt2d.utils.omegaconf_to_pydantic_objects import FullConfig +from psycopt2d.utils.configs import FullConfig SHARED_RESOURCES_PATH = Path(r"E:\shared_resources") FEATURE_SETS_PATH = SHARED_RESOURCES_PATH / "feature_sets" diff --git a/tests/test_load.py b/tests/test_load.py index 58bafbdc..bf5a9f8f 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -2,7 +2,7 @@ from hydra import compose, initialize from psycopt2d.load import load_train_and_val_from_cfg -from psycopt2d.utils.omegaconf_to_pydantic_objects import omegaconf_to_pydantic_objects +from psycopt2d.utils.configs import omegaconf_to_pydantic_objects def test_load_lookbehind_exceeds_lookbehind_threshold(): diff --git a/tests/test_train_model.py b/tests/test_train_model.py index ef246e50..fcb4b1b4 100644 --- a/tests/test_train_model.py +++ b/tests/test_train_model.py @@ -5,7 +5,7 @@ from psycopt2d.models import MODELS from psycopt2d.train_model import main -from psycopt2d.utils.omegaconf_to_pydantic_objects import omegaconf_to_pydantic_objects +from psycopt2d.utils.configs import omegaconf_to_pydantic_objects CONFIG_DIR_PATH = "../src/psycopt2d/config/" INTEGRATION_TEST_FILE_NAME = "integration_testing.yaml" diff --git a/tests/test_utils.py b/tests/test_utils.py index b5a40dbc..b772542e 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -8,7 +8,7 @@ from hydra import compose, initialize from utils_for_testing import str_to_df -from psycopt2d.utils.omegaconf_to_pydantic_objects import omegaconf_to_pydantic_objects +from psycopt2d.utils.configs import omegaconf_to_pydantic_objects from psycopt2d.utils.utils import ( PROJECT_ROOT, drop_records_if_datediff_days_smaller_than, From 781b23f07dd58779873d8a43390e1a169bfa4e22 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 21 Oct 2022 15:18:24 +0200 Subject: [PATCH 29/57] refactor: simplify functionality --- application/train_and_log_models.py | 60 ++++++++----------- .../config/train/default_training.yaml | 7 +-- src/psycopt2d/train_model.py | 15 +++-- src/psycopt2d/utils/configs.py | 1 - 4 files changed, 36 insertions(+), 47 deletions(-) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index fb2cb5ce..a22afc5a 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -83,18 +83,11 @@ def train_models_for_each_cell_in_grid( random.shuffle(lookbehind_combinations) - active_trainers: list[subprocess.Popen] = [] - wandb_prefix = f"{random_word.get_random_word()}-{random_word.get_random_word()}" while lookbehind_combinations: - # Loop to run if enough trainers have been spawned - if len(active_trainers) >= cfg.train.active_trainers: - active_trainers = [t for t in active_trainers if t.poll() is None] - time.sleep(1) - continue - combination = lookbehind_combinations.pop() + watcher = start_watcher(cfg=cfg) msg.info( f"Spawning a new trainer with lookbehind={combination.lookbehind} and lookahead={combination.lookahead}", @@ -104,15 +97,23 @@ def train_models_for_each_cell_in_grid( f"{wandb_prefix}-beh-{combination.lookbehind}-ahead-{combination.lookahead}" ) - active_trainers.append( - start_trainer( - cfg=cfg, - config_file_name=config_file_name, - cell=combination, - wandb_group=wandb_group, - ) + trainer = start_trainer( + cfg=cfg, + config_file_name=config_file_name, + cell=combination, + wandb_group=wandb_group, + ) + + while trainer.poll() is None: + time.sleep(1) + + msg.good( + f"Training finished. Stopping the watcher in {cfg.project.watcher.keep_alive_after_training_minutes} minutes...", ) + time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes) + watcher.kill() + def start_trainer( cfg: FullConfig, @@ -169,6 +170,16 @@ def start_watcher(cfg): ) +def load_cfg(config_file_name): + with initialize(version_base=None, config_path="../src/psycopt2d/config/"): + cfg = compose( + config_name=config_file_name, + ) + + cfg = omegaconf_to_pydantic_objects(cfg) + return cfg + + def main(): msg = Printer(timestamp=True) @@ -177,7 +188,6 @@ def main(): cfg = load_cfg(config_file_name=config_file_name) # TODO: Watcher must be instantiated once for each cell in the grid, otherwise # it will compare max performances across all cells. - watcher = start_watcher(cfg) train = load_train_for_inference(cfg=cfg) possible_look_distances = infer_possible_look_distances(df=train) @@ -200,24 +210,6 @@ def main(): config_file_name=config_file_name, ) - msg.good( - f"Training finished. Stopping the watcher in {cfg.project.watcher.keep_alive_after_training_minutes} minutes...", - ) - - time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes) - watcher.kill() - msg.good("Watcher stopped.") - - -def load_cfg(config_file_name): - with initialize(version_base=None, config_path="../src/psycopt2d/config/"): - cfg = compose( - config_name=config_file_name, - ) - - cfg = omegaconf_to_pydantic_objects(cfg) - return cfg - if __name__ == "__main__": main() diff --git a/src/psycopt2d/config/train/default_training.yaml b/src/psycopt2d/config/train/default_training.yaml index 84a98c92..476a0e11 100644 --- a/src/psycopt2d/config/train/default_training.yaml +++ b/src/psycopt2d/config/train/default_training.yaml @@ -1,4 +1,3 @@ -n_splits: 2 # (int, Null): Number of k-folds during CV. If Null, loads pre-defined dataset. -n_trials_per_lookdirection_combination: 1 -gpu: false -active_trainers: 4 \ No newline at end of file +n_splits: 3 # (int, Null): Number of k-folds during CV. If Null, loads pre-defined dataset. +n_trials_per_lookdirection_combination: 10 +gpu: true \ No newline at end of file diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py index 84fb1328..0f84284a 100644 --- a/src/psycopt2d/train_model.py +++ b/src/psycopt2d/train_model.py @@ -20,13 +20,10 @@ from psycopt2d.load import load_train_and_val_from_cfg from psycopt2d.models import MODELS from psycopt2d.utils.configs import FullConfig, omegaconf_to_pydantic_objects -from psycopt2d.utils.utils import ( - PROJECT_ROOT, - create_wandb_folders, - flatten_nested_dict, - get_feature_importance_dict, - prediction_df_with_metadata_to_disk, -) +from psycopt2d.utils.utils import (PROJECT_ROOT, create_wandb_folders, + flatten_nested_dict, + get_feature_importance_dict, + prediction_df_with_metadata_to_disk) CONFIG_PATH = PROJECT_ROOT / "src" / "psycopt2d" / "config" @@ -359,7 +356,9 @@ def main(cfg: Union[FullConfig, DictConfig]): ) msg.info(f"ROC AUC: {roc_auc}") - run.log({"roc_auc_unweighted": roc_auc}) + run.log({"roc_auc_unweighted": roc_auc, + "lookbehind": cfg.data.lookbehind_days, + "lookahead": cfg.data.lookahead_days,}) run.finish() return roc_auc diff --git a/src/psycopt2d/utils/configs.py b/src/psycopt2d/utils/configs.py index 58a58e66..4a198dd9 100644 --- a/src/psycopt2d/utils/configs.py +++ b/src/psycopt2d/utils/configs.py @@ -101,7 +101,6 @@ class TrainConf(BaseModel): n_splits: int # ? How do we handle whether to use crossvalidation or train/val splitting? n_trials_per_lookdirection_combination: int gpu: bool - active_trainers: int class EvalConf(BaseModel): From cba13fa64d0b389ae65ed01fe46be627504e8e9d Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 21 Oct 2022 15:19:40 +0200 Subject: [PATCH 30/57] style: linting --- application/train_and_log_models.py | 110 ++++++++++++++-------------- src/psycopt2d/train_model.py | 21 ++++-- 2 files changed, 71 insertions(+), 60 deletions(-) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index a22afc5a..6af1ee3a 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -64,63 +64,13 @@ class LookDirectionCombination(BaseModel): lookahead: int -def train_models_for_each_cell_in_grid( - cfg: FullConfig, - possible_look_distances: PossibleLookDistanceDays, - config_file_name: str, -): - """Train a model for each cell in the grid of possible look distances.""" - from random_word import RandomWords - - random_word = RandomWords() - - # Create all combinations of lookbehind and lookahead days - lookbehind_combinations = [ - LookDirectionCombination(lookbehind=lookbehind, lookahead=lookahead) - for lookbehind in possible_look_distances.behind - for lookahead in possible_look_distances.ahead - ] - - random.shuffle(lookbehind_combinations) - - wandb_prefix = f"{random_word.get_random_word()}-{random_word.get_random_word()}" - - while lookbehind_combinations: - combination = lookbehind_combinations.pop() - watcher = start_watcher(cfg=cfg) - - msg.info( - f"Spawning a new trainer with lookbehind={combination.lookbehind} and lookahead={combination.lookahead}", - ) - - wandb_group = ( - f"{wandb_prefix}-beh-{combination.lookbehind}-ahead-{combination.lookahead}" - ) - - trainer = start_trainer( - cfg=cfg, - config_file_name=config_file_name, - cell=combination, - wandb_group=wandb_group, - ) - - while trainer.poll() is None: - time.sleep(1) - - msg.good( - f"Training finished. Stopping the watcher in {cfg.project.watcher.keep_alive_after_training_minutes} minutes...", - ) - - time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes) - watcher.kill() - - def start_trainer( cfg: FullConfig, config_file_name: str, cell: LookDirectionCombination, wandb_group: str, -): +) -> subprocess.Popen: + """Start a trainer""" subprocess_args: list[str] = [ "python", "src/psycopt2d/train_model.py", @@ -147,7 +97,8 @@ def start_trainer( ) -def start_watcher(cfg): +def start_watcher(cfg: FullConfig) -> subprocess.Popen: + """Start a watcher""" return subprocess.Popen( # pylint: disable=consider-using-with [ "python", @@ -170,7 +121,59 @@ def start_watcher(cfg): ) +def train_models_for_each_cell_in_grid( + cfg: FullConfig, + possible_look_distances: PossibleLookDistanceDays, + config_file_name: str, +): + """Train a model for each cell in the grid of possible look distances.""" + from random_word import RandomWords + + random_word = RandomWords() + + # Create all combinations of lookbehind and lookahead days + lookbehind_combinations = [ + LookDirectionCombination(lookbehind=lookbehind, lookahead=lookahead) + for lookbehind in possible_look_distances.behind + for lookahead in possible_look_distances.ahead + ] + + random.shuffle(lookbehind_combinations) + + wandb_prefix = f"{random_word.get_random_word()}-{random_word.get_random_word()}" + + while lookbehind_combinations: + combination = lookbehind_combinations.pop() + watcher = start_watcher(cfg=cfg) + + msg.info( + f"Spawning a new trainer with lookbehind={combination.lookbehind} and lookahead={combination.lookahead}", + ) + + wandb_group = ( + f"{wandb_prefix}-beh-{combination.lookbehind}-ahead-{combination.lookahead}" + ) + + trainer = start_trainer( + cfg=cfg, + config_file_name=config_file_name, + cell=combination, + wandb_group=wandb_group, + ) + + while trainer.poll() is None: + time.sleep(1) + + msg.good( + f"Training finished. Stopping the watcher in {cfg.project.watcher.keep_alive_after_training_minutes} minutes...", + ) + + time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes) + watcher.kill() + + def load_cfg(config_file_name): + """Load config as pydantic object""" with initialize(version_base=None, config_path="../src/psycopt2d/config/"): cfg = compose( config_name=config_file_name, @@ -181,6 +184,7 @@ def load_cfg(config_file_name): def main(): + """Main""" msg = Printer(timestamp=True) config_file_name = "integration_testing.yaml" diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py index 0f84284a..370aeee1 100644 --- a/src/psycopt2d/train_model.py +++ b/src/psycopt2d/train_model.py @@ -20,10 +20,13 @@ from psycopt2d.load import load_train_and_val_from_cfg from psycopt2d.models import MODELS from psycopt2d.utils.configs import FullConfig, omegaconf_to_pydantic_objects -from psycopt2d.utils.utils import (PROJECT_ROOT, create_wandb_folders, - flatten_nested_dict, - get_feature_importance_dict, - prediction_df_with_metadata_to_disk) +from psycopt2d.utils.utils import ( + PROJECT_ROOT, + create_wandb_folders, + flatten_nested_dict, + get_feature_importance_dict, + prediction_df_with_metadata_to_disk, +) CONFIG_PATH = PROJECT_ROOT / "src" / "psycopt2d" / "config" @@ -356,9 +359,13 @@ def main(cfg: Union[FullConfig, DictConfig]): ) msg.info(f"ROC AUC: {roc_auc}") - run.log({"roc_auc_unweighted": roc_auc, - "lookbehind": cfg.data.lookbehind_days, - "lookahead": cfg.data.lookahead_days,}) + run.log( + { + "roc_auc_unweighted": roc_auc, + "lookbehind": cfg.data.lookbehind_days, + "lookahead": cfg.data.lookahead_days, + } + ) run.finish() return roc_auc From 5df4d85e622216244cadb83f46bfee650b2488f6 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 21 Oct 2022 15:20:00 +0200 Subject: [PATCH 31/57] style: linting --- application/train_and_log_models.py | 8 ++++---- src/psycopt2d/train_model.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index 6af1ee3a..8f881cb2 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -70,7 +70,7 @@ def start_trainer( cell: LookDirectionCombination, wandb_group: str, ) -> subprocess.Popen: - """Start a trainer""" + """Start a trainer.""" subprocess_args: list[str] = [ "python", "src/psycopt2d/train_model.py", @@ -98,7 +98,7 @@ def start_trainer( def start_watcher(cfg: FullConfig) -> subprocess.Popen: - """Start a watcher""" + """Start a watcher.""" return subprocess.Popen( # pylint: disable=consider-using-with [ "python", @@ -173,7 +173,7 @@ def train_models_for_each_cell_in_grid( def load_cfg(config_file_name): - """Load config as pydantic object""" + """Load config as pydantic object.""" with initialize(version_base=None, config_path="../src/psycopt2d/config/"): cfg = compose( config_name=config_file_name, @@ -184,7 +184,7 @@ def load_cfg(config_file_name): def main(): - """Main""" + """Main.""" msg = Printer(timestamp=True) config_file_name = "integration_testing.yaml" diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py index 370aeee1..48f326d5 100644 --- a/src/psycopt2d/train_model.py +++ b/src/psycopt2d/train_model.py @@ -364,7 +364,7 @@ def main(cfg: Union[FullConfig, DictConfig]): "roc_auc_unweighted": roc_auc, "lookbehind": cfg.data.lookbehind_days, "lookahead": cfg.data.lookahead_days, - } + }, ) run.finish() return roc_auc From 091a3c02af4150dcccb46c60753f0b7380e5f588 Mon Sep 17 00:00:00 2001 From: Lasse Date: Sat, 22 Oct 2022 09:48:46 +0200 Subject: [PATCH 32/57] feat: make watcher store separate max performance per lookbehind/lookahead combination --- src/psycopt2d/model_training_watcher.py | 127 +++++++++++++++++------- 1 file changed, 89 insertions(+), 38 deletions(-) diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py index 936200e5..270f38c4 100644 --- a/src/psycopt2d/model_training_watcher.py +++ b/src/psycopt2d/model_training_watcher.py @@ -2,11 +2,13 @@ import argparse import subprocess import time +from collections import defaultdict from distutils.util import strtobool # pylint: disable=deprecated-module from pathlib import Path -from typing import Optional +from typing import Any, Optional import wandb +from pydantic import BaseModel from wandb.apis.public import Api # pylint: disable=no-name-in-module from wandb.sdk.wandb_run import Run # pylint: disable=no-name-in-module from wasabi import msg @@ -25,7 +27,24 @@ WANDB_DIR = PROJECT_ROOT / "wandb" -class ModelTrainingWatcher: +class RunInformation(BaseModel): + """Information about a wandb run.""" + + run_id: str + auc: float + lookbehind_days: int + lookahead_days: int + lookahead_lookbehind_combined: Optional[str] = None + + def __init__(self, **kwargs): + super().__init__(**kwargs) + if self.lookahead_lookbehind_combined is None: + self.lookahead_lookbehind_combined = ( + f"lookahead:{self.lookahead_days}_lookbehind:{self.lookbehind_days}" + ) + + +class ModelTrainingWatcher: # pylint: disable=too-many-instance-attributes """Watch the wandb directory for new files and uploads them to wandb. Fully evaluates the best runs after a certain number of runs have been uploaded. @@ -57,7 +76,8 @@ def __init__( self.verbose = verbose # A queue for runs waiting to be uploaded to WandB self.run_id_eval_candidates_queue: list[str] = [] - self.max_performance = 0.0 + # max performance by lookbehind/-ahead combination + self.max_performances: dict[str, float] = defaultdict(lambda: 0.0) self.archive_path = WANDB_DIR / "archive" self.archive_path.mkdir(exist_ok=True, parents=True) @@ -81,7 +101,11 @@ def get_new_runs_and_evaluate(self) -> None: self.upload_unarchived_runs() if len(self.run_id_eval_candidates_queue) >= self.n_runs_before_eval: - self.evaluate_and_upload_records_and_archive() + run_information = self._get_run_information_for_all_in_queue() + self.run_id_eval_candidates_queue = self._get_unfinished_runs( + run_information=run_information + ) + self._evaluate_and_archive_finished_runs(run_information=run_information) def _upload_run_dir(self, run_dir: Path) -> str: """Upload a single run to wandb.""" @@ -105,20 +129,24 @@ def _get_run_id(self, run_dir: Path) -> str: return run_dir.name.split("-")[-1] def upload_unarchived_runs(self) -> None: - """Upload unarchived runs to wandb.""" + """Upload unarchived runs to wandb. Only adds runs that have finished + training to the evaluation queue. + + Raises: + ValueError: If wandb sync failed + """ for run_folder in WANDB_DIR.glob(r"offline-run*"): run_id = self._get_run_id(run_folder) wandb_sync_stdout = self._upload_run_dir(run_folder) if "...done" not in wandb_sync_stdout: - if ".wandb file is empty" in wandb_sync_stdout: - if self.verbose: - msg.warn(f"Run {run_id} is still running. Skipping.") - else: + if ".wandb file is empty" not in wandb_sync_stdout: raise ValueError( f"wandb sync failed, returned: {wandb_sync_stdout}", ) + if self.verbose: + msg.warn(f"Run {run_id} is still running. Skipping.") continue self.run_id_eval_candidates_queue.append(run_id) @@ -161,44 +189,67 @@ def _get_wandb_run(self, run_id: str) -> Run: def _get_run_wandb_dir(self, run_id: str) -> Path: return list(WANDB_DIR.glob(f"*offline-run*{run_id}*"))[0] - def _get_run_performance(self, run_id: str) -> Optional[float]: - """Get the performance of a single run and check if it failed.""" - run = self._get_wandb_run(run_id) - if "roc_auc_unweighted" in run.summary: - return run.summary.roc_auc_unweighted + def _get_run_attribute(self, run: Run, attribute: str) -> Any: + """Get an attribute from a wandb run.""" + if attribute in run.summary: + return run.summary[attribute] if self.verbose: msg.info( - f"Watcher: Run {run_id} has no performance metric. Pinging again at next eval time.", + f"Run {run.id} has no attribute {attribute}. Pinging again at next eval time." ) return None - def evaluate_and_upload_records_and_archive(self) -> None: - """Evaluate the best runs.""" - run_performances = { - run_id: self._get_run_performance(run_id) + def _evaluate_and_archive_finished_runs( + self, run_information: list[RunInformation] + ) -> None: + """Evaluate the finished runs. Test their performance against the current + maximum for each lookbehind/-ahead days, and fully evaluate the best performing. + Move all wandb run dirs to the archive folder.""" + finished_runs = [ + run_info for run_info in run_information if run_info.auc is not None + ] + + for run_info in finished_runs: + if ( + run_info.auc + > self.max_performances[run_info.lookbehind_lookahead_combination] + ): + msg.good( + f"New record performance for {run_info.lookbehind_lookahead_combination}! AUC: {run_info.auc}" + ) + self.max_performances[ + run_info.loobehind_lookhead_combination + ] = run_info.auc + self._do_evaluation(run_info.run_id) + self._archive_run_dir(run_dir=self._get_run_wandb_dir(run_info.run_id)) + + def _get_unfinished_runs(self, run_information: list[RunInformation]) -> list[str]: + """Get the run ids of the unfinished runs.""" + return [run_info.run_id for run_info in run_information if run_info.auc is None] + + def _get_run_information_for_all_in_queue(self): + """Get the performance and information of all runs in the evaluation queue + and sort by lookahead/lookbehind combination and AUC for faster uploading.""" + return [ + self._get_run_information(run_id) for run_id in self.run_id_eval_candidates_queue - } - # sort runs by performance to not upload subpar runs - run_performances = dict( - sorted( - run_performances.items(), - key=lambda item: (item[1] is not None, item[1]), - reverse=True, + ].sort( + key=lambda run_info: ( + run_info.lookahead_lookbehind_combined, + run_info.auc, ), + reverse=True, ) - # get runs with auc of None (attempted upload before run finished) - unfinished_runs = [ - run_id for run_id, auc in run_performances.items() if auc is None - ] - for run_id, performance in run_performances.items(): - if performance is not None and performance > self.max_performance: - msg.good(f"New record performance! AUC: {performance}") - self.max_performance = performance - self._do_evaluation(run_id) - self._archive_run_dir(run_dir=self._get_run_wandb_dir(run_id)) - # reset run id queue and try to upload unfinished runs next time - self.run_id_eval_candidates_queue = unfinished_runs + def _get_run_information(self, run_id: str) -> RunInformation: + """Get the run information for a single run.""" + run = self._get_wandb_run(run_id) + return RunInformation( + run_id=run_id, + auc=self._get_run_attribute(run, "roc_auc_unweighted"), + lookbehind_days=self._get_run_attribute(run, "lookbehind_days"), + lookahead_days=self._get_run_attribute(run, "lookahead_days"), + ) def archive_all_runs(self) -> None: """Archive all runs in the wandb directory.""" From 66c6e9424acae535c52b37ce9d5efbc0f092ddce Mon Sep 17 00:00:00 2001 From: Lasse Date: Sat, 22 Oct 2022 09:50:06 +0200 Subject: [PATCH 33/57] chore: linting --- src/psycopt2d/model_training_watcher.py | 139 ++++++++++++------------ 1 file changed, 72 insertions(+), 67 deletions(-) diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py index 270f38c4..fb23ad33 100644 --- a/src/psycopt2d/model_training_watcher.py +++ b/src/psycopt2d/model_training_watcher.py @@ -96,61 +96,10 @@ def watch(self, timeout_minutes: Optional[int] = None) -> None: self.get_new_runs_and_evaluate() time.sleep(1) - def get_new_runs_and_evaluate(self) -> None: - """Get new runs and evaluate the best runs.""" - self.upload_unarchived_runs() - - if len(self.run_id_eval_candidates_queue) >= self.n_runs_before_eval: - run_information = self._get_run_information_for_all_in_queue() - self.run_id_eval_candidates_queue = self._get_unfinished_runs( - run_information=run_information - ) - self._evaluate_and_archive_finished_runs(run_information=run_information) - - def _upload_run_dir(self, run_dir: Path) -> str: - """Upload a single run to wandb.""" - # get stdout from subprocess.run - proc = subprocess.run( - ["wandb", "sync", str(run_dir), "--project", self.project_name], - check=True, - capture_output=True, - ) - stdout = proc.stdout.decode("utf-8") - if self.verbose: - msg.info(f"Watcher: {stdout}") - return stdout - def _archive_run_dir(self, run_dir: Path) -> None: """Move a run to the archive folder.""" run_dir.rename(target=self.archive_path / run_dir.name) - def _get_run_id(self, run_dir: Path) -> str: - """Get the run id from a run directory.""" - return run_dir.name.split("-")[-1] - - def upload_unarchived_runs(self) -> None: - """Upload unarchived runs to wandb. Only adds runs that have finished - training to the evaluation queue. - - Raises: - ValueError: If wandb sync failed - """ - for run_folder in WANDB_DIR.glob(r"offline-run*"): - run_id = self._get_run_id(run_folder) - - wandb_sync_stdout = self._upload_run_dir(run_folder) - - if "...done" not in wandb_sync_stdout: - if ".wandb file is empty" not in wandb_sync_stdout: - raise ValueError( - f"wandb sync failed, returned: {wandb_sync_stdout}", - ) - if self.verbose: - msg.warn(f"Run {run_id} is still running. Skipping.") - continue - - self.run_id_eval_candidates_queue.append(run_id) - def _get_run_evaluation_data_dir(self, run_id: str) -> Path: """Get the evaluation path for a single run.""" return list(self.model_data_dir.glob(f"*{run_id}*"))[0] @@ -195,16 +144,20 @@ def _get_run_attribute(self, run: Run, attribute: str) -> Any: return run.summary[attribute] if self.verbose: msg.info( - f"Run {run.id} has no attribute {attribute}. Pinging again at next eval time." + f"Run {run.id} has no attribute {attribute}. Pinging again at next eval time.", ) return None def _evaluate_and_archive_finished_runs( - self, run_information: list[RunInformation] + self, + run_information: list[RunInformation], ) -> None: - """Evaluate the finished runs. Test their performance against the current - maximum for each lookbehind/-ahead days, and fully evaluate the best performing. - Move all wandb run dirs to the archive folder.""" + """Evaluate the finished runs. + + Test their performance against the current maximum for each + lookbehind/-ahead days, and fully evaluate the best performing. + Move all wandb run dirs to the archive folder. + """ finished_runs = [ run_info for run_info in run_information if run_info.auc is not None ] @@ -215,7 +168,7 @@ def _evaluate_and_archive_finished_runs( > self.max_performances[run_info.lookbehind_lookahead_combination] ): msg.good( - f"New record performance for {run_info.lookbehind_lookahead_combination}! AUC: {run_info.auc}" + f"New record performance for {run_info.lookbehind_lookahead_combination}! AUC: {run_info.auc}", ) self.max_performances[ run_info.loobehind_lookhead_combination @@ -227,9 +180,20 @@ def _get_unfinished_runs(self, run_information: list[RunInformation]) -> list[st """Get the run ids of the unfinished runs.""" return [run_info.run_id for run_info in run_information if run_info.auc is None] + def _get_run_information(self, run_id: str) -> RunInformation: + """Get the run information for a single run.""" + run = self._get_wandb_run(run_id) + return RunInformation( + run_id=run_id, + auc=self._get_run_attribute(run, "roc_auc_unweighted"), + lookbehind_days=self._get_run_attribute(run, "lookbehind_days"), + lookahead_days=self._get_run_attribute(run, "lookahead_days"), + ) + def _get_run_information_for_all_in_queue(self): - """Get the performance and information of all runs in the evaluation queue - and sort by lookahead/lookbehind combination and AUC for faster uploading.""" + """Get the performance and information of all runs in the evaluation + queue and sort by lookahead/lookbehind combination and AUC for faster + uploading.""" return [ self._get_run_information(run_id) for run_id in self.run_id_eval_candidates_queue @@ -241,15 +205,56 @@ def _get_run_information_for_all_in_queue(self): reverse=True, ) - def _get_run_information(self, run_id: str) -> RunInformation: - """Get the run information for a single run.""" - run = self._get_wandb_run(run_id) - return RunInformation( - run_id=run_id, - auc=self._get_run_attribute(run, "roc_auc_unweighted"), - lookbehind_days=self._get_run_attribute(run, "lookbehind_days"), - lookahead_days=self._get_run_attribute(run, "lookahead_days"), + def get_new_runs_and_evaluate(self) -> None: + """Get new runs and evaluate the best runs.""" + self.upload_unarchived_runs() + + if len(self.run_id_eval_candidates_queue) >= self.n_runs_before_eval: + run_information = self._get_run_information_for_all_in_queue() + self.run_id_eval_candidates_queue = self._get_unfinished_runs( + run_information=run_information, + ) + self._evaluate_and_archive_finished_runs(run_information=run_information) + + def _upload_run_dir(self, run_dir: Path) -> str: + """Upload a single run to wandb.""" + # get stdout from subprocess.run + proc = subprocess.run( + ["wandb", "sync", str(run_dir), "--project", self.project_name], + check=True, + capture_output=True, ) + stdout = proc.stdout.decode("utf-8") + if self.verbose: + msg.info(f"Watcher: {stdout}") + return stdout + + def _get_run_id(self, run_dir: Path) -> str: + """Get the run id from a run directory.""" + return run_dir.name.split("-")[-1] + + def upload_unarchived_runs(self) -> None: + """Upload unarchived runs to wandb. Only adds runs that have finished + training to the evaluation queue. + + Raises: + ValueError: If wandb sync failed + """ + for run_folder in WANDB_DIR.glob(r"offline-run*"): + run_id = self._get_run_id(run_folder) + + wandb_sync_stdout = self._upload_run_dir(run_folder) + + if "...done" not in wandb_sync_stdout: + if ".wandb file is empty" not in wandb_sync_stdout: + raise ValueError( + f"wandb sync failed, returned: {wandb_sync_stdout}", + ) + if self.verbose: + msg.warn(f"Run {run_id} is still running. Skipping.") + continue + + self.run_id_eval_candidates_queue.append(run_id) def archive_all_runs(self) -> None: """Archive all runs in the wandb directory.""" From 9beedfca0bcf6281f18f096ebc5924d3f6f95030 Mon Sep 17 00:00:00 2001 From: HLasse Date: Sat, 22 Oct 2022 12:14:54 +0200 Subject: [PATCH 34/57] fix: correct output if only 1 outcome col --- src/psycopt2d/load.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py index 6e08b807..2523ddf9 100644 --- a/src/psycopt2d/load.py +++ b/src/psycopt2d/load.py @@ -369,10 +369,13 @@ def _keep_unique_outcome_col_with_lookahead_days_matching_conf( """Keep only one outcome column with the same lookahead days as set in the config.""" outcome_cols = infer_outcome_col_name(df=dataset, allow_multiple=True) + # if only one outcome column, return + if isinstance(outcome_cols, str): + return dataset + col_to_drop = [ c for c in outcome_cols if str(self.cfg.data.lookahead_days) not in c ] - df = dataset.drop(col_to_drop, axis=1) if not isinstance(infer_outcome_col_name(df), str): From 615b17c7be9634a286666785079f1443eceb9751 Mon Sep 17 00:00:00 2001 From: HLasse Date: Sat, 22 Oct 2022 12:15:29 +0200 Subject: [PATCH 35/57] fix: various bugs in watcher --- src/psycopt2d/model_training_watcher.py | 70 +++++++++++++------------ 1 file changed, 37 insertions(+), 33 deletions(-) diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py index fb23ad33..7aa05f78 100644 --- a/src/psycopt2d/model_training_watcher.py +++ b/src/psycopt2d/model_training_watcher.py @@ -5,7 +5,7 @@ from collections import defaultdict from distutils.util import strtobool # pylint: disable=deprecated-module from pathlib import Path -from typing import Any, Optional +from typing import Any, Optional, Union import wandb from pydantic import BaseModel @@ -30,18 +30,19 @@ class RunInformation(BaseModel): """Information about a wandb run.""" - run_id: str - auc: float - lookbehind_days: int - lookahead_days: int + run_id: Optional[str] + auc: Optional[float] + lookbehind_days: Optional[Union[int, list[int]]] + lookahead_days: Optional[int] lookahead_lookbehind_combined: Optional[str] = None def __init__(self, **kwargs): super().__init__(**kwargs) - if self.lookahead_lookbehind_combined is None: - self.lookahead_lookbehind_combined = ( - f"lookahead:{self.lookahead_days}_lookbehind:{self.lookbehind_days}" - ) + if ( + self.lookahead_lookbehind_combined is None + and self.lookbehind_days is not None + ): + self.lookahead_lookbehind_combined = f"lookahead:{str(self.lookahead_days)}_lookbehind:{str(self.lookbehind_days)}" class ModelTrainingWatcher: # pylint: disable=too-many-instance-attributes @@ -161,20 +162,29 @@ def _evaluate_and_archive_finished_runs( finished_runs = [ run_info for run_info in run_information if run_info.auc is not None ] + # sort to only upload the best in in each group + finished_runs.sort( + key=lambda run_info: ( + run_info.lookahead_lookbehind_combined, + run_info.auc, + ), + reverse=True, + ) - for run_info in finished_runs: - if ( - run_info.auc - > self.max_performances[run_info.lookbehind_lookahead_combination] - ): - msg.good( - f"New record performance for {run_info.lookbehind_lookahead_combination}! AUC: {run_info.auc}", - ) - self.max_performances[ - run_info.loobehind_lookhead_combination - ] = run_info.auc - self._do_evaluation(run_info.run_id) - self._archive_run_dir(run_dir=self._get_run_wandb_dir(run_info.run_id)) + if finished_runs: + for run_info in finished_runs: + if ( + run_info.auc + > self.max_performances[run_info.lookahead_lookbehind_combined] + ): + msg.good( + f"New record performance for {run_info.lookahead_lookbehind_combined}! AUC: {run_info.auc}", + ) + self.max_performances[ + run_info.lookahead_lookbehind_combined + ] = run_info.auc + self._do_evaluation(run_info.run_id) + self._archive_run_dir(run_dir=self._get_run_wandb_dir(run_info.run_id)) def _get_unfinished_runs(self, run_information: list[RunInformation]) -> list[str]: """Get the run ids of the unfinished runs.""" @@ -186,24 +196,17 @@ def _get_run_information(self, run_id: str) -> RunInformation: return RunInformation( run_id=run_id, auc=self._get_run_attribute(run, "roc_auc_unweighted"), - lookbehind_days=self._get_run_attribute(run, "lookbehind_days"), - lookahead_days=self._get_run_attribute(run, "lookahead_days"), + lookbehind_days=self._get_run_attribute(run, "lookbehind"), + lookahead_days=self._get_run_attribute(run, "lookahead"), ) def _get_run_information_for_all_in_queue(self): """Get the performance and information of all runs in the evaluation - queue and sort by lookahead/lookbehind combination and AUC for faster - uploading.""" + queue.""" return [ self._get_run_information(run_id) for run_id in self.run_id_eval_candidates_queue - ].sort( - key=lambda run_info: ( - run_info.lookahead_lookbehind_combined, - run_info.auc, - ), - reverse=True, - ) + ] def get_new_runs_and_evaluate(self) -> None: """Get new runs and evaluate the best runs.""" @@ -323,6 +326,7 @@ def float_or_none(arg: str) -> Optional[float]: model_data_dir=model_data_dir, verbose=args.verbose, ) + if args.clean_wandb_dir: watcher.archive_all_runs() From a350443f0f8c3ce5c1e865267ed1cc9b0a9cf251 Mon Sep 17 00:00:00 2001 From: HLasse Date: Sat, 22 Oct 2022 12:16:14 +0200 Subject: [PATCH 36/57] fix: look correct lookbehind in trainer --- src/psycopt2d/train_model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py index 48f326d5..a3a64f1c 100644 --- a/src/psycopt2d/train_model.py +++ b/src/psycopt2d/train_model.py @@ -315,8 +315,8 @@ def main(cfg: Union[FullConfig, DictConfig]): project=cfg.project.name, reinit=True, config=flatten_nested_dict(cfg.__dict__, sep="."), - mode=cfg.project.wandb_mode, - group=cfg.project.wandb_group, + mode=cfg.project.wandb.mode, + group=cfg.project.wandb.group, ) dataset = load_train_and_val_from_cfg(cfg) @@ -342,7 +342,7 @@ def main(cfg: Union[FullConfig, DictConfig]): # only run full evaluation if wandb mode mode is online # otherwise delegate to watcher script - if cfg.project.wandb_mode == "run": + if cfg.project.wandb.mode == "run": msg.info("Evaluating model") evaluate_model( cfg=cfg, @@ -362,7 +362,7 @@ def main(cfg: Union[FullConfig, DictConfig]): run.log( { "roc_auc_unweighted": roc_auc, - "lookbehind": cfg.data.lookbehind_days, + "lookbehind": cfg.data.lookbehind_combination, "lookahead": cfg.data.lookahead_days, }, ) From b528c38379c8a50035611a8d166532fe218c9822 Mon Sep 17 00:00:00 2001 From: HLasse Date: Sat, 22 Oct 2022 12:16:31 +0200 Subject: [PATCH 37/57] misc: to get things running --- application/train_and_log_models.py | 23 +++++++++---------- src/psycopt2d/config/data/synth_data.yaml | 21 +++++++++++------ .../project/integration_test_project.yaml | 7 +++--- src/psycopt2d/train_and_log_models.py | 2 +- src/psycopt2d/utils/configs.py | 2 +- 5 files changed, 31 insertions(+), 24 deletions(-) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index 8f881cb2..e58eeec3 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -77,9 +77,9 @@ def start_trainer( f"model={cfg.model.model_name}", f"data.min_lookbehind_days={cell.lookbehind}", f"data.min_lookahead_days={cell.lookahead}", - f"project.wandb_group='{wandb_group}'", + f"project.wandb.group='{wandb_group}'", f"hydra.sweeper.n_trials={cfg.train.n_trials_per_lookdirection_combination}", - f"project.wandb_mode={cfg.project.wandb_mode}", + f"project.wandb.mode={cfg.project.wandb.mode}", "--config-name", f"{config_file_name}", ] @@ -104,7 +104,7 @@ def start_watcher(cfg: FullConfig) -> subprocess.Popen: "python", "src/psycopt2d/model_training_watcher.py", "--entity", - cfg.project.wandb_entity, + cfg.project.wandb.entity, "--project_name", cfg.project.name, "--n_runs_before_eval", @@ -141,10 +141,9 @@ def train_models_for_each_cell_in_grid( random.shuffle(lookbehind_combinations) wandb_prefix = f"{random_word.get_random_word()}-{random_word.get_random_word()}" - + watcher = start_watcher(cfg=cfg) while lookbehind_combinations: combination = lookbehind_combinations.pop() - watcher = start_watcher(cfg=cfg) msg.info( f"Spawning a new trainer with lookbehind={combination.lookbehind} and lookahead={combination.lookahead}", @@ -164,12 +163,12 @@ def train_models_for_each_cell_in_grid( while trainer.poll() is None: time.sleep(1) - msg.good( - f"Training finished. Stopping the watcher in {cfg.project.watcher.keep_alive_after_training_minutes} minutes...", - ) + msg.good( + f"Training finished. Stopping the watcher in {cfg.project.watcher.keep_alive_after_training_minutes} minutes...", + ) - time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes) - watcher.kill() + time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes) + watcher.kill() def load_cfg(config_file_name): @@ -187,7 +186,7 @@ def main(): """Main.""" msg = Printer(timestamp=True) - config_file_name = "integration_testing.yaml" + config_file_name = "default_config.yaml" cfg = load_cfg(config_file_name=config_file_name) # TODO: Watcher must be instantiated once for each cell in the grid, otherwise @@ -198,7 +197,7 @@ def main(): # Remove "9999" from possible look distances behind possible_look_distances.behind = [ dist - for dist in possible_look_distances + for dist in possible_look_distances.behind if not int(dist) > cfg.data.max_lookbehind_days ] diff --git a/src/psycopt2d/config/data/synth_data.yaml b/src/psycopt2d/config/data/synth_data.yaml index f94b5da9..fe9fdccb 100644 --- a/src/psycopt2d/config/data/synth_data.yaml +++ b/src/psycopt2d/config/data/synth_data.yaml @@ -1,21 +1,28 @@ # @package _global_ data: n_training_samples: null - min_lookahead_days: null - min_lookbehind_days: null - min_prediction_time_date: null - lookahead_days: 30 + dir: "../psycop-t2d/tests/test_data/synth_splits/" + suffix: csv + + # Feature specs pred_col_name_prefix: pred_ pred_timestamp_col_name: timestamp outcome_timestamp_col_name: timestamp_outcome id_col_name: citizen_ids - dir: "../psycop-t2d/tests/test_data/synth_splits/" - suffix: csv + + # Looking ahead + lookahead_days: 30 + min_lookahead_days: null drop_patient_if_outcome_before_date: null + + # Looking behind + min_prediction_time_date: null + min_lookbehind_days: null + max_lookbehind_days: 1850 lookbehind_combination: [30, 90] # Parameters that will only take effect if running with --multirun hydra: sweeper: params: - data.lookbehind_combination: choice([3000, 90], [30]) + data.lookbehind_combination: choice([100, 60], [30]) diff --git a/src/psycopt2d/config/project/integration_test_project.yaml b/src/psycopt2d/config/project/integration_test_project.yaml index 05f31fcd..486511cd 100644 --- a/src/psycopt2d/config/project/integration_test_project.yaml +++ b/src/psycopt2d/config/project/integration_test_project.yaml @@ -1,8 +1,9 @@ name: psycop-t2d-integration-testing seed: 42 -wandb_mode: "disabled" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" -wandb_group: "integration_testing" -wandb_entity: "psycop-t2d-testing" # Which entity to run WanDB in. +wandb: + mode: "offline" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" + group: "integration_testing" + entity: "psycop-t2d-testing" # Which entity to run WanDB in. watcher: archive_all: true keep_alive_after_training_minutes: 5 diff --git a/src/psycopt2d/train_and_log_models.py b/src/psycopt2d/train_and_log_models.py index 50b92ee8..6fa8b2c8 100644 --- a/src/psycopt2d/train_and_log_models.py +++ b/src/psycopt2d/train_and_log_models.py @@ -16,7 +16,7 @@ # RUN CONSTANTS CONFIG_NAME = "integration_testing.yaml" -HYDRA_ARGS = f"--multirun +model=xgboost project.wandb_mode='dryrun' model.args.tree_method='auto' --config-name {CONFIG_NAME}" +HYDRA_ARGS = f"--multirun project.wandb.mode='dryrun' model.args.tree_method='auto' --config-name {CONFIG_NAME}" OVERTACI = "false" # Change to "true" if running on overtaci # WATCHER CONSTANTS diff --git a/src/psycopt2d/utils/configs.py b/src/psycopt2d/utils/configs.py index 4a198dd9..c32f8b77 100644 --- a/src/psycopt2d/utils/configs.py +++ b/src/psycopt2d/utils/configs.py @@ -43,6 +43,7 @@ class ProjectConf(BaseModel): name: str = "psycopt2d" seed: int + wandb: WandbConf watcher: WatcherConf @@ -120,7 +121,6 @@ class EvalConf(BaseModel): class FullConfig(BaseModel): """A full configuration object.""" - wandb: WandbConf project: ProjectConf data: DataConf preprocessing: PreprocessingConf From 39858e512a74a7ba0b5f8662fc37f8fa8e1cc972 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Mon, 24 Oct 2022 10:01:03 +0200 Subject: [PATCH 38/57] misc. fixes --- README.md | 2 +- application/train_and_log_models.py | 104 +++++++++++------- src/psycopt2d/config/data/synth_data.yaml | 3 +- src/psycopt2d/config/data/t2d_parquet.yaml | 21 ++-- src/psycopt2d/config/default_config.yaml | 2 +- .../config/project/default_project.yaml | 2 +- .../project/integration_test_project.yaml | 9 +- .../config/train/default_training.yaml | 5 +- src/psycopt2d/evaluation.py | 4 +- src/psycopt2d/load.py | 70 +++++++----- src/psycopt2d/train_and_log_models.py | 2 +- src/psycopt2d/train_model.py | 26 +++-- src/psycopt2d/utils/configs.py | 12 +- src/psycopt2d/utils/utils.py | 4 +- 14 files changed, 161 insertions(+), 105 deletions(-) diff --git a/README.md b/README.md index 5e6d3238..3cb4511a 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ python src/psycopt2d/train_model.py --config-name test_config.yaml +model=xgboos To test new integrations with WandB: ```python -python src/psycopt2d/train_model.py +model=xgboost project.wandb_mode="run" --config-name integration_testing.yaml +python src/psycopt2d/train_model.py +model=xgboost project.wandb.mode="run" --config-name integration_testing.yaml ``` diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index 8f881cb2..9385ac16 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -8,6 +8,7 @@ import random import subprocess import time +from pathlib import Path import pandas as pd from hydra import compose, initialize @@ -19,7 +20,7 @@ infer_outcome_col_name, infer_predictor_col_name, ) -from psycopt2d.load import DataLoader +from psycopt2d.load import DataLoader, load_train_from_cfg from psycopt2d.utils.configs import FullConfig, omegaconf_to_pydantic_objects msg = Printer(timestamp=True) @@ -32,11 +33,15 @@ class PossibleLookDistanceDays(BaseModel): behind: list[str] -def load_train_for_inference(cfg: FullConfig): +def load_train_raw(cfg: FullConfig): """Load the data.""" - loader = DataLoader(cfg=cfg) - msg.info("Loading datasets for look direction inference") - return loader.load_dataset_from_dir(split_names="train") + path = Path(cfg.data.dir) + file = list(path.glob(pattern=r"*train*")) + + if len(file) == 1: + return pd.read_parquet(file) + + raise ValueError(f"Returned {len(file)} files") def infer_possible_look_distances(df: pd.DataFrame) -> PossibleLookDistanceDays: @@ -68,18 +73,18 @@ def start_trainer( cfg: FullConfig, config_file_name: str, cell: LookDirectionCombination, - wandb_group: str, + wandb_group_override: str, ) -> subprocess.Popen: """Start a trainer.""" subprocess_args: list[str] = [ "python", "src/psycopt2d/train_model.py", f"model={cfg.model.model_name}", - f"data.min_lookbehind_days={cell.lookbehind}", + f"data.min_lookbehind_days={max(cfg.data.lookbehind_combination)}", f"data.min_lookahead_days={cell.lookahead}", - f"project.wandb_group='{wandb_group}'", + f"project.wandb.group='{wandb_group_override}'", f"hydra.sweeper.n_trials={cfg.train.n_trials_per_lookdirection_combination}", - f"project.wandb_mode={cfg.project.wandb_mode}", + f"project.wandb.mode={cfg.project.wandb.mode}", "--config-name", f"{config_file_name}", ] @@ -104,7 +109,7 @@ def start_watcher(cfg: FullConfig) -> subprocess.Popen: "python", "src/psycopt2d/model_training_watcher.py", "--entity", - cfg.project.wandb_entity, + cfg.project.wandb.entity, "--project_name", cfg.project.name, "--n_runs_before_eval", @@ -140,36 +145,50 @@ def train_models_for_each_cell_in_grid( random.shuffle(lookbehind_combinations) + active_trainers: list[subprocess.Popen] = [] + wandb_prefix = f"{random_word.get_random_word()}-{random_word.get_random_word()}" - while lookbehind_combinations: - combination = lookbehind_combinations.pop() - watcher = start_watcher(cfg=cfg) + while lookbehind_combinations or active_trainers: + # Wait until there is a free slot in the trainers group + if len(active_trainers) >= cfg.train.n_active_trainers: + # Drop trainers if they have finished + # If finished, t.poll() is not None + active_trainers = [t for t in active_trainers if t.poll() is None] + time.sleep(1) + continue - msg.info( - f"Spawning a new trainer with lookbehind={combination.lookbehind} and lookahead={combination.lookahead}", - ) + # Start a new trainer - wandb_group = ( - f"{wandb_prefix}-beh-{combination.lookbehind}-ahead-{combination.lookahead}" - ) + combination = lookbehind_combinations.pop() - trainer = start_trainer( - cfg=cfg, - config_file_name=config_file_name, - cell=combination, - wandb_group=wandb_group, - ) + # Check if any rows in the given combinatin of lookbehind and lookahead days + cfg_for_checking_any_rows = cfg.copy() + cfg_for_checking_any_rows.data.min_lookbehind_days = combination.lookbehind + cfg_for_checking_any_rows.data.min_lookahead_days = combination.lookahead + # TODO: Can be refactored by + # 1) Inferring the dataset length from max/min of prediction time + # 2) Checking if combination.lookbehind + combination.lookahead < dataset length - while trainer.poll() is None: - time.sleep(1) + train = load_train_from_cfg(cfg=cfg) - msg.good( - f"Training finished. Stopping the watcher in {cfg.project.watcher.keep_alive_after_training_minutes} minutes...", - ) + if train.shape[0] == 0: + msg.warn(f"No rows for {combination}, continuing") + continue - time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes) - watcher.kill() + # watcher = start_watcher(cfg=cfg) + msg.info( + f"Spawning a new trainer with lookbehind={combination.lookbehind} and lookahead={combination.lookahead}", + ) + wandb_group = f"{wandb_prefix}" + active_trainers.append( + start_trainer( + cfg=cfg, + config_file_name=config_file_name, + cell=combination, + wandb_group_override=wandb_group, + ) + ) def load_cfg(config_file_name): @@ -187,20 +206,27 @@ def main(): """Main.""" msg = Printer(timestamp=True) - config_file_name = "integration_testing.yaml" + config_file_name = "default_config.yaml" cfg = load_cfg(config_file_name=config_file_name) + + if cfg.project.wandb.mode == "run": + msg.warn( + f"wandb.mode is {cfg.project.wandb.mode}, not using the watcher. This will substantially slow down training." + ) + # TODO: Watcher must be instantiated once for each cell in the grid, otherwise # it will compare max performances across all cells. - train = load_train_for_inference(cfg=cfg) + train = load_train_raw(cfg=cfg) possible_look_distances = infer_possible_look_distances(df=train) # Remove "9999" from possible look distances behind - possible_look_distances.behind = [ - dist - for dist in possible_look_distances - if not int(dist) > cfg.data.max_lookbehind_days - ] + if cfg.data.max_lookbehind_days: + possible_look_distances.behind = [ + dist + for dist in possible_look_distances.behind + if not int(dist) > cfg.data.max_lookbehind_days + ] msg.info(f"Possible lookbehind days: {possible_look_distances.behind}") msg.info(f"Possible lookahead days: {possible_look_distances.ahead}") diff --git a/src/psycopt2d/config/data/synth_data.yaml b/src/psycopt2d/config/data/synth_data.yaml index f94b5da9..9dfac480 100644 --- a/src/psycopt2d/config/data/synth_data.yaml +++ b/src/psycopt2d/config/data/synth_data.yaml @@ -4,7 +4,6 @@ data: min_lookahead_days: null min_lookbehind_days: null min_prediction_time_date: null - lookahead_days: 30 pred_col_name_prefix: pred_ pred_timestamp_col_name: timestamp outcome_timestamp_col_name: timestamp_outcome @@ -18,4 +17,4 @@ data: hydra: sweeper: params: - data.lookbehind_combination: choice([3000, 90], [30]) + data.lookbehind_combination: choice([30, 90], [30]) diff --git a/src/psycopt2d/config/data/t2d_parquet.yaml b/src/psycopt2d/config/data/t2d_parquet.yaml index 6c1ee3ac..f88a9402 100644 --- a/src/psycopt2d/config/data/t2d_parquet.yaml +++ b/src/psycopt2d/config/data/t2d_parquet.yaml @@ -5,25 +5,24 @@ data: dir: E:\shared_resources\feature_sets\t2d\feature_sets\psycop_t2d_adminmanber_416_features_2022_10_20_11_12 suffix: parquet + # Patient exclusion criteria + drop_patient_if_outcome_before_date: 2013-01-01 + + # Prediction time exclusion criteria + min_prediction_time_date: 2013-01-01 + min_lookbehind_days: 730 + min_lookahead_days: 1825 + # Feature specs pred_col_name_prefix: "pred_" pred_timestamp_col_name: timestamp outcome_timestamp_col_name: _timestamp_first_t2d id_col_name: dw_ek_borger - - # Looking ahead - lookahead_days: 365 - min_lookahead_days: 365 - drop_patient_if_outcome_before_date: null - - # Looking behind - min_prediction_time_date: 2013-01-01 - min_lookbehind_days: 365 max_lookbehind_days: 3650 - lookbehind_combination: [30, 90, 180, 365] + lookbehind_combination: [30, 90, 180, 365, 730] # Parameters that will only take effect if running with --multirun hydra: sweeper: params: - ++data.lookbehind_combination: choice([3000], [30, 90]) + ++data.lookbehind_combination: choice([30, 90, 180, 365, 730], [30, 180, 730], [30, 730], [730], [365], [90], [30]) diff --git a/src/psycopt2d/config/default_config.yaml b/src/psycopt2d/config/default_config.yaml index 46b91517..c62edfc1 100644 --- a/src/psycopt2d/config/default_config.yaml +++ b/src/psycopt2d/config/default_config.yaml @@ -6,4 +6,4 @@ defaults: - model: xgboost - train: default_training - eval: default_evaluation - - sweeper: optuna_multithread + - sweeper: optuna_singlethread diff --git a/src/psycopt2d/config/project/default_project.yaml b/src/psycopt2d/config/project/default_project.yaml index 837cfc78..e13ddd05 100644 --- a/src/psycopt2d/config/project/default_project.yaml +++ b/src/psycopt2d/config/project/default_project.yaml @@ -3,7 +3,7 @@ seed: 42 wandb: entity: "psycop" # Which entity to run WanDB in. - mode: "offline" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" + mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" group: "psycop-t2d" # Which group to run WanDB in. watcher: diff --git a/src/psycopt2d/config/project/integration_test_project.yaml b/src/psycopt2d/config/project/integration_test_project.yaml index 05f31fcd..38df97d6 100644 --- a/src/psycopt2d/config/project/integration_test_project.yaml +++ b/src/psycopt2d/config/project/integration_test_project.yaml @@ -1,8 +1,11 @@ name: psycop-t2d-integration-testing seed: 42 -wandb_mode: "disabled" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" -wandb_group: "integration_testing" -wandb_entity: "psycop-t2d-testing" # Which entity to run WanDB in. + +wandb: + mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" + group: "integration_testing" + entity: "psycop-t2d-testing" # Which entity to run WanDB in. + watcher: archive_all: true keep_alive_after_training_minutes: 5 diff --git a/src/psycopt2d/config/train/default_training.yaml b/src/psycopt2d/config/train/default_training.yaml index 476a0e11..e81d99be 100644 --- a/src/psycopt2d/config/train/default_training.yaml +++ b/src/psycopt2d/config/train/default_training.yaml @@ -1,3 +1,4 @@ n_splits: 3 # (int, Null): Number of k-folds during CV. If Null, loads pre-defined dataset. -n_trials_per_lookdirection_combination: 10 -gpu: true \ No newline at end of file +n_trials_per_lookdirection_combination: 20 +n_active_trainers: 8 +gpu: true diff --git a/src/psycopt2d/evaluation.py b/src/psycopt2d/evaluation.py index 17a7d1b6..2d229041 100644 --- a/src/psycopt2d/evaluation.py +++ b/src/psycopt2d/evaluation.py @@ -101,12 +101,12 @@ def evaluate_model( # Drop date_bins_direction if they are further away than min_lookdirection_days if cfg.data.min_lookbehind_days: date_bins_behind = [ - b for b in date_bins_behind if cfg.data.min_lookbehind_days < b + b for b in date_bins_behind if cfg.data.min_lookbehind_days > b ] if cfg.data.min_lookahead_days: date_bins_ahead = [ - b for b in date_bins_ahead if cfg.data.min_lookahead_days < abs(b) + b for b in date_bins_ahead if cfg.data.min_lookahead_days > abs(b) ] # Invert date_bins_behind to negative if it's not already diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py index 6e08b807..93252173 100644 --- a/src/psycopt2d/load.py +++ b/src/psycopt2d/load.py @@ -1,4 +1,5 @@ """Loader for the t2d dataset.""" +import os import re from collections.abc import Iterable from datetime import timedelta @@ -12,12 +13,9 @@ from psycopt2d.evaluate_saved_model_predictions import infer_look_distance from psycopt2d.utils.configs import FullConfig -from psycopt2d.utils.utils import ( - coerce_to_datetime, - get_percent_lost, - infer_outcome_col_name, - infer_predictor_col_name, -) +from psycopt2d.utils.utils import (coerce_to_datetime, get_percent_lost, + infer_outcome_col_name, + infer_predictor_col_name) msg = Printer(timestamp=True) @@ -160,27 +158,23 @@ def _drop_rows_if_datasets_ends_within_days( return dataset - def _drop_patients_with_event_in_washin(self, dataset) -> pd.DataFrame: + def drop_patient_if_outcome_before_date( + self, dataset: pd.DataFrame + ) -> pd.DataFrame: """Drop patients within washin period.""" n_rows_before_modification = dataset.shape[0] - # Remove dates before drop_patient_if_outcome_before_date outcome_before_date = ( - dataset["_timestamp_first_t2d"] + dataset[self.cfg.data.outcome_timestamp_col_name] < self.cfg.data.drop_patient_if_outcome_before_date ) patients_to_drop = set(dataset["dw_ek_borger"][outcome_before_date].unique()) dataset = dataset[~dataset["dw_ek_borger"].isin(patients_to_drop)] - # Removed dates before drop_patient_if_outcome_before_date - dataset = dataset[ - dataset[self.cfg.data.pred_timestamp_col_name] - > self.cfg.data.drop_patient_if_outcome_before_date - ] - n_rows_after_modification = dataset.shape[0] + percent_dropped = get_percent_lost( n_before=n_rows_after_modification, n_after=n_rows_after_modification, @@ -225,7 +219,7 @@ def _drop_cols_not_in_lookbehind_combination( lookbehinds_in_dataset, ): msg.warn( - f"One or more of the provided lookbehinds in lookbehind_combination is/are not used in any predictors in the dataset. Dataset: {lookbehinds_in_dataset}. lookbehind_combination: {self.cfg.data.lookbehind_combination}.", + f"One or more of the provided lookbehinds in lookbehind_combination is/are not used in any predictors in the dataset: {lookbehinds_in_spec - lookbehinds_in_dataset}", ) lookbehinds_to_keep = lookbehinds_in_spec.intersection( @@ -369,19 +363,30 @@ def _keep_unique_outcome_col_with_lookahead_days_matching_conf( """Keep only one outcome column with the same lookahead days as set in the config.""" outcome_cols = infer_outcome_col_name(df=dataset, allow_multiple=True) - col_to_drop = [ - c for c in outcome_cols if str(self.cfg.data.lookahead_days) not in c - ] + + if not outcome_cols: + raise ValueError("No outcome columns found.") + + if isinstance(outcome_cols, list): + col_to_drop = [ + c for c in outcome_cols if str(self.cfg.data.min_lookahead_days) not in c + ] + elif isinstance(outcome_cols, str): + col_to_drop = [outcome_cols] df = dataset.drop(col_to_drop, axis=1) - if not isinstance(infer_outcome_col_name(df), str): + if not self.n_outcome_col_names(df) == 1: raise ValueError( - "Returning more than one outcome column, will cause problems during eval.", + f"Returning {self.n_outcome_col_names(df=df)}, will cause problems during eval.", ) return df + def n_outcome_col_names(self, df: pd.DataFrame): + """How many outcome columns there are in a dataframe.""" + return len(infer_outcome_col_name(df=df, allow_multiple=True)) + def _process_dataset(self, dataset: pd.DataFrame) -> pd.DataFrame: """Process dataset, namely: @@ -393,12 +398,10 @@ def _process_dataset(self, dataset: pd.DataFrame) -> pd.DataFrame: Returns: pd.DataFrame: Processed dataset """ - if self.cfg.data.drop_patient_if_outcome_before_date: - dataset = add_washin_timestamps(dataset=dataset) - dataset = self._convert_timestamp_dtype_and_nat(dataset) + if self.cfg.data.drop_patient_if_outcome_before_date: - dataset = self._drop_patients_with_event_in_washin(dataset=dataset) + dataset = self.drop_patient_if_outcome_before_date(dataset=dataset) # Drop if later than min prediction time date if self.cfg.data.min_prediction_time_date: @@ -490,6 +493,18 @@ class Config: val: pd.DataFrame +def load_train_from_cfg(cfg: FullConfig) -> pd.DataFrame: + """Load train dataset from config. + + Args: + cfg (FullConfig): Config + + Returns: + pd.DataFrame: Train dataset + """ + return DataLoader(cfg=cfg).load_dataset_from_dir(split_names="train") + + def load_train_and_val_from_cfg(cfg: FullConfig): """Load train and validation data from file.""" @@ -499,3 +514,8 @@ def load_train_and_val_from_cfg(cfg: FullConfig): train=loader.load_dataset_from_dir(split_names="train"), val=loader.load_dataset_from_dir(split_names="val"), ) + + +def get_latest_dataset_dir(path: Path) -> Path: + """Get the latest dataset directory by time of creation.""" + return max(path.glob("*"), key=os.path.getctime) diff --git a/src/psycopt2d/train_and_log_models.py b/src/psycopt2d/train_and_log_models.py index 50b92ee8..b080b0d1 100644 --- a/src/psycopt2d/train_and_log_models.py +++ b/src/psycopt2d/train_and_log_models.py @@ -16,7 +16,7 @@ # RUN CONSTANTS CONFIG_NAME = "integration_testing.yaml" -HYDRA_ARGS = f"--multirun +model=xgboost project.wandb_mode='dryrun' model.args.tree_method='auto' --config-name {CONFIG_NAME}" +HYDRA_ARGS = f"--multirun +model=xgboost project.wandb.mode='dryrun' model.args.tree_method='auto' --config-name {CONFIG_NAME}" OVERTACI = "false" # Change to "true" if running on overtaci # WATCHER CONSTANTS diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py index 48f326d5..6ff96fc7 100644 --- a/src/psycopt2d/train_model.py +++ b/src/psycopt2d/train_model.py @@ -1,12 +1,14 @@ """Training script for training a single model for predicting t2d.""" import os from collections.abc import Iterable -from typing import Optional, Union +from multiprocessing.sharedctypes import Value +from typing import Any, Hashable, Optional, Union import hydra import numpy as np import pandas as pd import wandb +from omegaconf import OmegaConf from omegaconf.dictconfig import DictConfig from sklearn.impute import SimpleImputer from sklearn.metrics import roc_auc_score @@ -287,7 +289,7 @@ def get_col_names(cfg: DictConfig, train: pd.DataFrame) -> tuple[str, list[str]] """ outcome_col_name = ( # pylint: disable=invalid-name - f"outc_dichotomous_t2d_within_{cfg.data.lookahead_days}_days_max_fallback_0" + f"outc_dichotomous_t2d_within_{cfg.data.min_lookahead_days}_days_max_fallback_0" ) train_col_names = [ # pylint: disable=invalid-name @@ -302,8 +304,11 @@ def get_col_names(cfg: DictConfig, train: pd.DataFrame) -> tuple[str, list[str]] config_name="default_config", version_base="1.2", ) -def main(cfg: Union[FullConfig, DictConfig]): +def main(cfg: DictConfig): """Main function for training a single model.""" + # Save dictconfig for easier logging + dict_config: dict[str, Any] = OmegaConf.to_container(cfg) # type: ignore + if not isinstance(cfg, FullConfig): cfg = omegaconf_to_pydantic_objects(cfg) @@ -314,11 +319,14 @@ def main(cfg: Union[FullConfig, DictConfig]): run = wandb.init( project=cfg.project.name, reinit=True, - config=flatten_nested_dict(cfg.__dict__, sep="."), - mode=cfg.project.wandb_mode, - group=cfg.project.wandb_group, + config=dict_config, + mode=cfg.project.wandb.mode, + group=cfg.project.wandb.group, ) + if run is None: + raise ValueError("Failed to initialise Wandb") + dataset = load_train_and_val_from_cfg(cfg) msg.info("Creating pipeline") @@ -342,7 +350,7 @@ def main(cfg: Union[FullConfig, DictConfig]): # only run full evaluation if wandb mode mode is online # otherwise delegate to watcher script - if cfg.project.wandb_mode == "run": + if cfg.project.wandb.mode == "run": msg.info("Evaluating model") evaluate_model( cfg=cfg, @@ -362,8 +370,8 @@ def main(cfg: Union[FullConfig, DictConfig]): run.log( { "roc_auc_unweighted": roc_auc, - "lookbehind": cfg.data.lookbehind_days, - "lookahead": cfg.data.lookahead_days, + "lookbehind": max(cfg.data.lookbehind_combination), + "lookahead": cfg.data.min_lookahead_days, }, ) run.finish() diff --git a/src/psycopt2d/utils/configs.py b/src/psycopt2d/utils/configs.py index 4a198dd9..4d1ce2de 100644 --- a/src/psycopt2d/utils/configs.py +++ b/src/psycopt2d/utils/configs.py @@ -25,6 +25,8 @@ class Config: class WandbConf(BaseModel): + """Configuration for weights and biases.""" + group: str mode: str entity: str @@ -41,6 +43,7 @@ class WatcherConf(BaseModel): class ProjectConf(BaseModel): """Project configuration.""" + wandb: WandbConf name: str = "psycopt2d" seed: int watcher: WatcherConf @@ -62,16 +65,13 @@ class DataConf(BaseModel): id_col_name: str # (str): Citizen colnames # Looking ahead - lookahead_days: int # (float): Number of days from prediction time to look ahead for the outcome. - min_lookahead_days: Optional[ - int - ] # (int): Drop all prediction times where (max timestamp in the dataset) - (current timestamp) is less than min_lookahead_days + min_lookahead_days: int # (int): Drop all prediction times where (max timestamp in the dataset) - (current timestamp) is less than min_lookahead_days drop_patient_if_outcome_before_date: Optional[Union[str, datetime]] # Looking behind # (int): Drop all prediction times where (prediction_timestamp) - (min timestamp in the dataset) is less than min_lookbehind_days min_prediction_time_date: Optional[Union[str, datetime]] - min_lookbehind_days: Optional[int] + min_lookbehind_days: int max_lookbehind_days: Optional[int] lookbehind_combination: Optional[list[int]] @@ -100,6 +100,7 @@ class TrainConf(BaseModel): n_splits: int # ? How do we handle whether to use crossvalidation or train/val splitting? n_trials_per_lookdirection_combination: int + n_active_trainers: int # Number of subprocesses to spawn when training gpu: bool @@ -120,7 +121,6 @@ class EvalConf(BaseModel): class FullConfig(BaseModel): """A full configuration object.""" - wandb: WandbConf project: ProjectConf data: DataConf preprocessing: PreprocessingConf diff --git a/src/psycopt2d/utils/utils.py b/src/psycopt2d/utils/utils.py index 3ad4bcb5..41c125a2 100644 --- a/src/psycopt2d/utils/utils.py +++ b/src/psycopt2d/utils/utils.py @@ -403,7 +403,7 @@ def infer_col_names( col_name = [c for c in df.columns if c.startswith(prefix)] if len(col_name) == 1: - return col_name[0] + return col_name elif len(col_name) > 1: if allow_multiple: return col_name @@ -411,7 +411,7 @@ def infer_col_names( f"Multiple columns found and allow_multiple is {allow_multiple}.", ) else: - raise ValueError("More than one outcome inferred") + raise ValueError("No outcomes inferred") def infer_outcome_col_name( From f50b501aa53306d259b41b4e3614d65025e4a514 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Mon, 24 Oct 2022 11:14:18 +0200 Subject: [PATCH 39/57] merge with main --- docs/conf.py | 4 +-- .../project/integration_test_project.yaml | 6 ++-- src/psycopt2d/load.py | 32 +++++++++++-------- .../model_performance/model_performance.py | 4 +-- src/psycopt2d/train_model.py | 2 +- src/psycopt2d/utils/configs.py | 1 + src/psycopt2d/utils/utils.py | 5 ++- tests/test_utils.py | 2 +- 8 files changed, 31 insertions(+), 25 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index b420ec84..e898c604 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,7 +24,7 @@ # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = [] +extensions = [] # type: ignore # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] @@ -167,7 +167,7 @@ # -- Options for LaTeX output -------------------------------------------------- -latex_elements = { +latex_elements = { # type: ignore # The paper size ('letterpaper' or 'a4paper'). # 'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). diff --git a/src/psycopt2d/config/project/integration_test_project.yaml b/src/psycopt2d/config/project/integration_test_project.yaml index 38df97d6..2231968c 100644 --- a/src/psycopt2d/config/project/integration_test_project.yaml +++ b/src/psycopt2d/config/project/integration_test_project.yaml @@ -1,11 +1,9 @@ name: psycop-t2d-integration-testing seed: 42 - wandb: - mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" + mode: "disabled" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" group: "integration_testing" - entity: "psycop-t2d-testing" # Which entity to run WanDB in. - + entity: "psycop" # Which entity to run WanDB in. watcher: archive_all: true keep_alive_after_training_minutes: 5 diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py index 93252173..15bd95c2 100644 --- a/src/psycopt2d/load.py +++ b/src/psycopt2d/load.py @@ -13,9 +13,12 @@ from psycopt2d.evaluate_saved_model_predictions import infer_look_distance from psycopt2d.utils.configs import FullConfig -from psycopt2d.utils.utils import (coerce_to_datetime, get_percent_lost, - infer_outcome_col_name, - infer_predictor_col_name) +from psycopt2d.utils.utils import ( + coerce_to_datetime, + get_percent_lost, + infer_outcome_col_name, + infer_predictor_col_name, +) msg = Printer(timestamp=True) @@ -364,21 +367,24 @@ def _keep_unique_outcome_col_with_lookahead_days_matching_conf( the config.""" outcome_cols = infer_outcome_col_name(df=dataset, allow_multiple=True) - if not outcome_cols: - raise ValueError("No outcome columns found.") + col_to_drop = [ + c for c in outcome_cols if str(self.cfg.data.lookahead_days) not in c + ] - if isinstance(outcome_cols, list): - col_to_drop = [ - c for c in outcome_cols if str(self.cfg.data.min_lookahead_days) not in c - ] - elif isinstance(outcome_cols, str): - col_to_drop = [outcome_cols] + # If no columns to drop, return the dataset + if not col_to_drop: + return dataset + + if len(col_to_drop) == 1: + col_to_drop = col_to_drop[0] + else: + col_to_drop = outcome_cols df = dataset.drop(col_to_drop, axis=1) - if not self.n_outcome_col_names(df) == 1: + if not isinstance(infer_outcome_col_name(df), str): raise ValueError( - f"Returning {self.n_outcome_col_names(df=df)}, will cause problems during eval.", + "Returning more than one outcome column, will cause problems during eval.", ) return df diff --git a/src/psycopt2d/model_performance/model_performance.py b/src/psycopt2d/model_performance/model_performance.py index e3f887da..6a413620 100644 --- a/src/psycopt2d/model_performance/model_performance.py +++ b/src/psycopt2d/model_performance/model_performance.py @@ -365,9 +365,7 @@ def compute_metrics( """ # sorting to get correct output from f1, prec, and recall groups = sorted(set(labels)) - performance = {} - - performance["acc-overall"] = accuracy_score(labels, predicted) + performance = {"acc-overall": accuracy_score(labels, predicted)} performance["balanced_accuracy-overall"] = balanced_accuracy_score( labels, predicted, diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py index 6ff96fc7..5aa91fbc 100644 --- a/src/psycopt2d/train_model.py +++ b/src/psycopt2d/train_model.py @@ -319,7 +319,7 @@ def main(cfg: DictConfig): run = wandb.init( project=cfg.project.name, reinit=True, - config=dict_config, + config=flatten_nested_dict(cfg.__dict__, sep="."), mode=cfg.project.wandb.mode, group=cfg.project.wandb.group, ) diff --git a/src/psycopt2d/utils/configs.py b/src/psycopt2d/utils/configs.py index 4d1ce2de..bc5668d6 100644 --- a/src/psycopt2d/utils/configs.py +++ b/src/psycopt2d/utils/configs.py @@ -47,6 +47,7 @@ class ProjectConf(BaseModel): name: str = "psycopt2d" seed: int watcher: WatcherConf + wandb: WandbConf class DataConf(BaseModel): diff --git a/src/psycopt2d/utils/utils.py b/src/psycopt2d/utils/utils.py index 41c125a2..70b7efcc 100644 --- a/src/psycopt2d/utils/utils.py +++ b/src/psycopt2d/utils/utils.py @@ -7,6 +7,7 @@ import time from collections.abc import Iterable, MutableMapping from datetime import date, datetime +from multiprocessing.sharedctypes import Value from pathlib import Path from typing import Any, Optional, Union @@ -19,7 +20,7 @@ from psycopt2d.configs import ModelEvalData from psycopt2d.model_performance import ModelPerformance -from psycopt2d.utils.configs import FullConfig +from psycopt2d.utils.configs import BaseModel, FullConfig SHARED_RESOURCES_PATH = Path(r"E:\shared_resources") FEATURE_SETS_PATH = SHARED_RESOURCES_PATH / "feature_sets" @@ -410,6 +411,8 @@ def infer_col_names( raise ValueError( f"Multiple columns found and allow_multiple is {allow_multiple}.", ) + elif len(col_name) == 0: + raise ValueError("No outcome col name inferred") else: raise ValueError("No outcomes inferred") diff --git a/tests/test_utils.py b/tests/test_utils.py index b772542e..f619872d 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -58,7 +58,7 @@ def test_flatten_nested_dict(): assert expected_dict == output_dict -CONFIG_DIR_PATH_ABS = PROJECT_ROOT / "psycopt2d" / "config" +CONFIG_DIR_PATH_ABS = PROJECT_ROOT / "src" / "psycopt2d" / "config" CONFIG_DIR_PATH_REL = "../src/psycopt2d/config" From 315fd3f3308c6d805cf26573a37af534c9500d41 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Mon, 24 Oct 2022 12:37:19 +0200 Subject: [PATCH 40/57] fix: failing tests --- application/train_and_log_models.py | 2 +- src/psycopt2d/config/data/synth_data.yaml | 4 +- src/psycopt2d/load.py | 8 +--- .../model_performance/model_performance.py | 15 ++++--- src/psycopt2d/model_training_watcher.py | 19 +++++---- .../tables/performance_by_threshold.py | 40 +++++++++---------- src/psycopt2d/train_model.py | 11 ++++- src/psycopt2d/utils/configs.py | 1 - src/psycopt2d/utils/utils.py | 18 ++++----- tests/test_load.py | 6 +-- 10 files changed, 65 insertions(+), 59 deletions(-) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index 9385ac16..67f10ece 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -20,7 +20,7 @@ infer_outcome_col_name, infer_predictor_col_name, ) -from psycopt2d.load import DataLoader, load_train_from_cfg +from psycopt2d.load import load_train_from_cfg from psycopt2d.utils.configs import FullConfig, omegaconf_to_pydantic_objects msg = Printer(timestamp=True) diff --git a/src/psycopt2d/config/data/synth_data.yaml b/src/psycopt2d/config/data/synth_data.yaml index 9dfac480..80ffa8e3 100644 --- a/src/psycopt2d/config/data/synth_data.yaml +++ b/src/psycopt2d/config/data/synth_data.yaml @@ -1,8 +1,8 @@ # @package _global_ data: n_training_samples: null - min_lookahead_days: null - min_lookbehind_days: null + min_lookahead_days: 30 + min_lookbehind_days: 100 min_prediction_time_date: null pred_col_name_prefix: pred_ pred_timestamp_col_name: timestamp diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py index 15bd95c2..8a5806f5 100644 --- a/src/psycopt2d/load.py +++ b/src/psycopt2d/load.py @@ -368,18 +368,14 @@ def _keep_unique_outcome_col_with_lookahead_days_matching_conf( outcome_cols = infer_outcome_col_name(df=dataset, allow_multiple=True) col_to_drop = [ - c for c in outcome_cols if str(self.cfg.data.lookahead_days) not in c + c for c in outcome_cols if str(self.cfg.data.min_lookahead_days) not in c ] # If no columns to drop, return the dataset if not col_to_drop: return dataset - if len(col_to_drop) == 1: - col_to_drop = col_to_drop[0] - else: - col_to_drop = outcome_cols - + col_to_drop = col_to_drop[0] if len(col_to_drop) == 1 else outcome_cols df = dataset.drop(col_to_drop, axis=1) if not isinstance(infer_outcome_col_name(df), str): diff --git a/src/psycopt2d/model_performance/model_performance.py b/src/psycopt2d/model_performance/model_performance.py index 6a413620..805ff62b 100644 --- a/src/psycopt2d/model_performance/model_performance.py +++ b/src/psycopt2d/model_performance/model_performance.py @@ -28,16 +28,17 @@ class ModelPerformance: """Evaluators of model performance.""" + @staticmethod def performance_metrics_from_df( prediction_df: pd.DataFrame, prediction_col_name: str, label_col_name: str, - id_col_name: Optional[str] = None, + id_col_name: str = None, metadata_col_names: Optional[list[str]] = None, id2label: Optional[ # pylint: disable=redefined-outer-name dict[int, str] ] = None, - to_wide: Optional[bool] = False, + to_wide: bool = False, binary_threshold: Optional[float] = 0.5, ) -> pd.DataFrame: """Calculate performance metrics from a dataframe. @@ -49,7 +50,7 @@ def performance_metrics_from_df( prediction_df (pd.DataFrame): Dataframe with 1 row per prediction. prediction_col_name (str): column containing probabilities for each class or a list of floats for binary classification. label_col_name (str): column containing ground truth label - id_col_name (str, optional): Column name for the id, used for grouping. + id_col_name (str): Column name for the id, used for grouping. metadata_col_names (Optional[list[str]], optional): Column(s) containing metadata to add to the performance dataframe. Each column should only contain 1 unique value. E.g. model_name, modality.. If set to "all" will auto-detect metadata columns and add them all. @@ -61,8 +62,6 @@ def performance_metrics_from_df( pd.Dataframe: Dataframe with performance metrics. """ - concat_axis = 1 if to_wide else 0 - performance_description = ModelPerformance._evaluate_single_model( prediction_df=prediction_df, aggregate_by_id=False, @@ -93,6 +92,8 @@ def performance_metrics_from_df( binary_threshold=binary_threshold, ) + concat_axis = 1 if to_wide else 0 + performance_description = pd.concat( [performance_description, performance_by_id], axis=concat_axis, @@ -113,6 +114,7 @@ def performance_metrics_from_df( return performance_description + @staticmethod def performance_metrics_from_file( jsonl_path: Union[str, Path], prediction_col_name: str, @@ -122,7 +124,7 @@ def performance_metrics_from_file( id2label: Optional[ # pylint: disable=redefined-outer-name dict[int, str] ] = None, - to_wide: Optional[bool] = False, + to_wide: bool = False, binary_threshold: Optional[float] = 0.5, ) -> pd.DataFrame: """Load a .jsonl file and returns performance metrics. @@ -214,6 +216,7 @@ def performance_metrics_from_folder( ] return pd.concat(dfs) + @staticmethod def _evaluate_single_model( # pylint: disable=too-many-locals prediction_df: pd.DataFrame, aggregate_by_id: bool, diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py index 936200e5..d9dc9fb9 100644 --- a/src/psycopt2d/model_training_watcher.py +++ b/src/psycopt2d/model_training_watcher.py @@ -4,7 +4,7 @@ import time from distutils.util import strtobool # pylint: disable=deprecated-module from pathlib import Path -from typing import Optional +from typing import Any, Optional import wandb from wandb.apis.public import Api # pylint: disable=no-name-in-module @@ -25,7 +25,7 @@ WANDB_DIR = PROJECT_ROOT / "wandb" -class ModelTrainingWatcher: +class ModelTrainingWatcher: # pylint: disable=too-many-instance-attributes """Watch the wandb directory for new files and uploads them to wandb. Fully evaluates the best runs after a certain number of runs have been uploaded. @@ -112,13 +112,12 @@ def upload_unarchived_runs(self) -> None: wandb_sync_stdout = self._upload_run_dir(run_folder) if "...done" not in wandb_sync_stdout: - if ".wandb file is empty" in wandb_sync_stdout: - if self.verbose: - msg.warn(f"Run {run_id} is still running. Skipping.") - else: + if ".wandb file is empty" not in wandb_sync_stdout: raise ValueError( f"wandb sync failed, returned: {wandb_sync_stdout}", ) + if self.verbose: + msg.warn(f"Run {run_id} is still running. Skipping.") continue self.run_id_eval_candidates_queue.append(run_id) @@ -164,8 +163,12 @@ def _get_run_wandb_dir(self, run_id: str) -> Path: def _get_run_performance(self, run_id: str) -> Optional[float]: """Get the performance of a single run and check if it failed.""" run = self._get_wandb_run(run_id) - if "roc_auc_unweighted" in run.summary: - return run.summary.roc_auc_unweighted + + summary: dict[str, Any] = run.summary # type: ignore + + if "roc_auc_unweighted" in summary: + return run.summary["roc_auc_unweighted"] + if self.verbose: msg.info( f"Watcher: Run {run_id} has no performance metric. Pinging again at next eval time.", diff --git a/src/psycopt2d/tables/performance_by_threshold.py b/src/psycopt2d/tables/performance_by_threshold.py index 688faaa3..fde2f8b6 100644 --- a/src/psycopt2d/tables/performance_by_threshold.py +++ b/src/psycopt2d/tables/performance_by_threshold.py @@ -1,6 +1,6 @@ """Get performance by which threshold is used to classify positive.""" from collections.abc import Iterable -from typing import Optional, Union +from typing import Optional, Sequence, Union import numpy as np import pandas as pd @@ -9,8 +9,8 @@ def performance_by_threshold( # pylint: disable=too-many-locals - labels: Iterable[int], - pred_probs: Iterable[float], + labels: Sequence[int], + pred_probs: Sequence[float], positive_threshold: float, round_to: int = 4, ) -> pd.DataFrame: @@ -26,7 +26,7 @@ def performance_by_threshold( # pylint: disable=too-many-locals Returns: pd.DataFrame """ - preds = np.where(pred_probs > positive_threshold, 1, 0) + preds = np.where(pred_probs > positive_threshold, 1, 0) # type: ignore conf_matrix = confusion_matrix(labels, preds) @@ -141,33 +141,31 @@ def days_from_first_positive_to_diagnosis( ] ] - warning_days = df["warning_days"].agg(aggregation_method) - - return warning_days + return df["warning_days"].agg(aggregation_method) def generate_performance_by_positive_rate_table( - labels: Iterable[int], - pred_probs: Iterable[float], - positive_rate_thresholds: Iterable[Union[int, float]], - pred_proba_thresholds: Iterable[float], - ids: Iterable[Union[int, float]], - pred_timestamps: Iterable[pd.Timestamp], - outcome_timestamps: Iterable[pd.Timestamp], + labels: Sequence[int], + pred_probs: Sequence[float], + positive_rate_thresholds: Sequence[Union[int, float]], + pred_proba_thresholds: Sequence[float], + ids: Sequence[Union[int, float]], + pred_timestamps: Sequence[pd.Timestamp], + outcome_timestamps: Sequence[pd.Timestamp], output_format: Optional[str] = "wandb_table", ) -> Union[pd.DataFrame, str]: """Generates a performance_by_threshold table as either a DataFrame or html object. Args: - labels (Iterable[int]): True labels. - pred_probs (Iterable[float]): Predicted probabilities. - positive_rate_thresholds (Iterable[float]): Positive_rate_thresholds to add to the table, e.g. 0.99, 0.98 etc. + labels (Sequence[int]): True labels. + pred_probs (Sequence[float]): Predicted probabilities. + positive_rate_thresholds (Sequence[float]): Positive_rate_thresholds to add to the table, e.g. 0.99, 0.98 etc. Calculated so that the Xth percentile of predictions are classified as the positive class. - pred_proba_thresholds (Iterable[float]): Thresholds above which predictions are classified as positive. - ids (Iterable[Union[int, float]]): Ids to group on. - pred_timestamps (Iterable[ pd.Timestamp ]): Timestamp for each prediction time. - outcome_timestamps (Iterable[pd.Timestamp]): Timestamp for each outcome time. + pred_proba_thresholds (Sequence[float]): Thresholds above which predictions are classified as positive. + ids (Sequence[Union[int, float]]): Ids to group on. + pred_timestamps (Sequence[ pd.Timestamp ]): Timestamp for each prediction time. + outcome_timestamps (Sequence[pd.Timestamp]): Timestamp for each outcome time. output_format (str, optional): Format to output - either "df" or "wandb_table". Defaults to "df". Returns: diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py index 5aa91fbc..86d503dd 100644 --- a/src/psycopt2d/train_model.py +++ b/src/psycopt2d/train_model.py @@ -307,7 +307,14 @@ def get_col_names(cfg: DictConfig, train: pd.DataFrame) -> tuple[str, list[str]] def main(cfg: DictConfig): """Main function for training a single model.""" # Save dictconfig for easier logging - dict_config: dict[str, Any] = OmegaConf.to_container(cfg) # type: ignore + if isinstance(cfg, DictConfig): + # Create flattened dict for logging to wandb + # Wandb doesn't allow configs to be nested, so we + # flatten it. + dict_config_to_log: dict[str, Any] = flatten_nested_dict(OmegaConf.to_container(cfg), sep=".") # type: ignore + else: + # For testing, we can take a FullConfig object instead. Simplifies boilerplate. + dict_config_to_log = cfg.__dict__ if not isinstance(cfg, FullConfig): cfg = omegaconf_to_pydantic_objects(cfg) @@ -319,7 +326,7 @@ def main(cfg: DictConfig): run = wandb.init( project=cfg.project.name, reinit=True, - config=flatten_nested_dict(cfg.__dict__, sep="."), + config=dict_config_to_log, mode=cfg.project.wandb.mode, group=cfg.project.wandb.group, ) diff --git a/src/psycopt2d/utils/configs.py b/src/psycopt2d/utils/configs.py index bc5668d6..4d1ce2de 100644 --- a/src/psycopt2d/utils/configs.py +++ b/src/psycopt2d/utils/configs.py @@ -47,7 +47,6 @@ class ProjectConf(BaseModel): name: str = "psycopt2d" seed: int watcher: WatcherConf - wandb: WandbConf class DataConf(BaseModel): diff --git a/src/psycopt2d/utils/utils.py b/src/psycopt2d/utils/utils.py index 70b7efcc..66849b71 100644 --- a/src/psycopt2d/utils/utils.py +++ b/src/psycopt2d/utils/utils.py @@ -61,7 +61,7 @@ def flatten_nested_dict( d: dict, parent_key: str = "", sep: str = ".", -) -> dict: +) -> dict[str, Any]: """Recursively flatten an infinitely nested dict. E.g. {"level1": {"level2": "level3": {"level4": 5}}}} becomes @@ -82,15 +82,15 @@ def flatten_nested_dict( new_key = parent_key + sep + k if parent_key else k if isinstance(v, MutableMapping): items.extend( - flatten_nested_dict(d=v, parent_key=new_key, sep=sep).items(), + flatten_nested_dict(d=v, parent_key=new_key, sep=sep).items(), # type: ignore ) # typing: ignore else: - items.append((new_key, v)) + items.append((new_key, v)) # type: ignore - return dict(items) + return dict(items) # type: ignore -def drop_records_if_datediff_days_smaller_than( +def drop_records_if_datediff_days_smaller_than( # pylint: disable=inconsistent-return-statements df: pd.DataFrame, t2_col_name: str, t1_col_name: str, @@ -159,7 +159,7 @@ def calculate_performance_metrics( A pandas dataframe with the performance metrics. """ performance_metrics = ModelPerformance.performance_metrics_from_df( - eval_df, + prediction_df=eval_df, prediction_col_name=prediction_probabilities_col_name, label_col_name=outcome_col_name, id_col_name=id_col_name, @@ -247,7 +247,7 @@ def dump_to_pickle(obj: Any, path: str) -> None: pkl.dump(obj, f) -def read_pickle(path: str) -> Any: +def read_pickle(path: Union[str, Path]) -> Any: """Reads a pickled object from a file. Args: @@ -411,7 +411,7 @@ def infer_col_names( raise ValueError( f"Multiple columns found and allow_multiple is {allow_multiple}.", ) - elif len(col_name) == 0: + elif not col_name: raise ValueError("No outcome col name inferred") else: raise ValueError("No outcomes inferred") @@ -439,7 +439,7 @@ def infer_y_hat_prob_col_name( df: pd.DataFrame, prefix="y_hat_prob", allow_multiple: bool = False, -) -> str: +) -> list[str]: """Infer the y_hat_prob column name from the dataframe.""" return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple) diff --git a/tests/test_load.py b/tests/test_load.py index bf5a9f8f..094fe3c6 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -15,10 +15,10 @@ def test_load_lookbehind_exceeds_lookbehind_threshold(): cfg = omegaconf_to_pydantic_objects(cfg) - cfg.data.min_lookahead_days = 90 + cfg.data.min_lookahead_days = 30 split_dataset = load_train_and_val_from_cfg(cfg) - assert split_dataset.train.shape == (644, 6) + assert split_dataset.train.shape[1] == 6 def test_load_lookbehind_not_in_lookbehind_combination(): @@ -34,4 +34,4 @@ def test_load_lookbehind_not_in_lookbehind_combination(): cfg.data.lookbehind_combination = [30] split_dataset = load_train_and_val_from_cfg(cfg) - assert split_dataset.train.shape == (700, 6) + assert split_dataset.train.shape[1] == 6 From ad403a3fb4e1c54b438b34fed64470d5ce8a59c8 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Tue, 25 Oct 2022 09:45:50 +0200 Subject: [PATCH 41/57] refactor: misc. refactor --- application/train_and_log_models.py | 69 +++++++++++++------ src/psycopt2d/load.py | 3 +- .../tables/performance_by_threshold.py | 3 +- src/psycopt2d/train_model.py | 3 +- src/psycopt2d/utils/utils.py | 3 +- 5 files changed, 53 insertions(+), 28 deletions(-) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index 67f10ece..25e22850 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -9,6 +9,7 @@ import subprocess import time from pathlib import Path +from queue import Full import pandas as pd from hydra import compose, initialize @@ -65,8 +66,8 @@ def infer_possible_look_distances(df: pd.DataFrame) -> PossibleLookDistanceDays: class LookDirectionCombination(BaseModel): """A combination of lookbehind and lookahead days.""" - lookbehind: int - lookahead: int + behind_days: int + ahead_days: int def start_trainer( @@ -81,7 +82,7 @@ def start_trainer( "src/psycopt2d/train_model.py", f"model={cfg.model.model_name}", f"data.min_lookbehind_days={max(cfg.data.lookbehind_combination)}", - f"data.min_lookahead_days={cell.lookahead}", + f"data.min_lookahead_days={cell.ahead_days}", f"project.wandb.group='{wandb_group_override}'", f"hydra.sweeper.n_trials={cfg.train.n_trials_per_lookdirection_combination}", f"project.wandb.mode={cfg.project.wandb.mode}", @@ -137,11 +138,6 @@ def train_models_for_each_cell_in_grid( random_word = RandomWords() # Create all combinations of lookbehind and lookahead days - lookbehind_combinations = [ - LookDirectionCombination(lookbehind=lookbehind, lookahead=lookahead) - for lookbehind in possible_look_distances.behind - for lookahead in possible_look_distances.ahead - ] random.shuffle(lookbehind_combinations) @@ -187,7 +183,7 @@ def train_models_for_each_cell_in_grid( config_file_name=config_file_name, cell=combination, wandb_group_override=wandb_group, - ) + ), ) @@ -212,33 +208,62 @@ def main(): if cfg.project.wandb.mode == "run": msg.warn( - f"wandb.mode is {cfg.project.wandb.mode}, not using the watcher. This will substantially slow down training." + f"wandb.mode is {cfg.project.wandb.mode}, not using the watcher. This will substantially slow down training.", ) - # TODO: Watcher must be instantiated once for each cell in the grid, otherwise - # it will compare max performances across all cells. train = load_train_raw(cfg=cfg) + possible_look_distances = get_possible_look_distances(msg, cfg, train) + + if not cfg.train.gpu: + msg.warn("Not using GPU for training") + + train_models_for_each_cell_in_grid( + cfg=cfg, + possible_look_distances=possible_look_distances, + config_file_name=config_file_name, + ) + + +def get_possible_look_distances(msg: Printer, cfg: FullConfig, train: pd.DataFrame): + """Some look_ahead and look_behind distances will result in 0 valid prediction times. Only return combinations which will allow some prediction times. + + E.g. if we only have 4 years of data: + - min_lookahead = 2 years + - min_lookbehind = 3 years + + Will mean that no rows satisfy the criteria. + """ + possible_look_distances = infer_possible_look_distances(df=train) + lookbehind_combinations = [ + LookDirectionCombination(behind_days=behind_days, ahead_days=ahead_days) + for behind_days in possible_look_distances.behind + for ahead_days in possible_look_distances.ahead + ] + + # Don't try look distance combinations which will result in 0 rows + max_date_interval_in_dataset = max(train[cfg.data.pred_timestamp_col_name]) - max( + train[cfg.data.pred_timestamp_col_name] + ) + + possible_look_distances = [ + dist + for dist in lookbehind_combinations + if ((dist.ahead + dist.behind_days) < max_date_interval_in_dataset) + ] + # Remove "9999" from possible look distances behind if cfg.data.max_lookbehind_days: possible_look_distances.behind = [ dist for dist in possible_look_distances.behind - if not int(dist) > cfg.data.max_lookbehind_days + if int(dist) <= cfg.data.max_lookbehind_days ] msg.info(f"Possible lookbehind days: {possible_look_distances.behind}") msg.info(f"Possible lookahead days: {possible_look_distances.ahead}") - - if not cfg.train.gpu: - msg.warn("Not using GPU for training") - - train_models_for_each_cell_in_grid( - cfg=cfg, - possible_look_distances=possible_look_distances, - config_file_name=config_file_name, - ) + return possible_look_distances if __name__ == "__main__": diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py index 8a5806f5..e680a4c5 100644 --- a/src/psycopt2d/load.py +++ b/src/psycopt2d/load.py @@ -162,7 +162,8 @@ def _drop_rows_if_datasets_ends_within_days( return dataset def drop_patient_if_outcome_before_date( - self, dataset: pd.DataFrame + self, + dataset: pd.DataFrame, ) -> pd.DataFrame: """Drop patients within washin period.""" diff --git a/src/psycopt2d/tables/performance_by_threshold.py b/src/psycopt2d/tables/performance_by_threshold.py index fde2f8b6..1e780728 100644 --- a/src/psycopt2d/tables/performance_by_threshold.py +++ b/src/psycopt2d/tables/performance_by_threshold.py @@ -1,6 +1,7 @@ """Get performance by which threshold is used to classify positive.""" from collections.abc import Iterable -from typing import Optional, Sequence, Union +from typing import Optional, Union +from collections.abc import Sequence import numpy as np import pandas as pd diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py index 86d503dd..1e6a4096 100644 --- a/src/psycopt2d/train_model.py +++ b/src/psycopt2d/train_model.py @@ -1,8 +1,7 @@ """Training script for training a single model for predicting t2d.""" import os from collections.abc import Iterable -from multiprocessing.sharedctypes import Value -from typing import Any, Hashable, Optional, Union +from typing import Any, Optional import hydra import numpy as np diff --git a/src/psycopt2d/utils/utils.py b/src/psycopt2d/utils/utils.py index 66849b71..8e820a74 100644 --- a/src/psycopt2d/utils/utils.py +++ b/src/psycopt2d/utils/utils.py @@ -7,7 +7,6 @@ import time from collections.abc import Iterable, MutableMapping from datetime import date, datetime -from multiprocessing.sharedctypes import Value from pathlib import Path from typing import Any, Optional, Union @@ -20,7 +19,7 @@ from psycopt2d.configs import ModelEvalData from psycopt2d.model_performance import ModelPerformance -from psycopt2d.utils.configs import BaseModel, FullConfig +from psycopt2d.utils.configs import FullConfig SHARED_RESOURCES_PATH = Path(r"E:\shared_resources") FEATURE_SETS_PATH = SHARED_RESOURCES_PATH / "feature_sets" From 12f81682d60dfea34465b6f491f0f6fcb4aa2d64 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Tue, 25 Oct 2022 10:27:52 +0200 Subject: [PATCH 42/57] style: linting --- application/train_and_log_models.py | 7 ++++--- src/psycopt2d/tables/performance_by_threshold.py | 3 +-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index 25e22850..2f6125e1 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -9,7 +9,6 @@ import subprocess import time from pathlib import Path -from queue import Full import pandas as pd from hydra import compose, initialize @@ -225,7 +224,9 @@ def main(): def get_possible_look_distances(msg: Printer, cfg: FullConfig, train: pd.DataFrame): - """Some look_ahead and look_behind distances will result in 0 valid prediction times. Only return combinations which will allow some prediction times. + """Some look_ahead and look_behind distances will result in 0 valid + prediction times. Only return combinations which will allow some prediction + times. E.g. if we only have 4 years of data: - min_lookahead = 2 years @@ -244,7 +245,7 @@ def get_possible_look_distances(msg: Printer, cfg: FullConfig, train: pd.DataFra # Don't try look distance combinations which will result in 0 rows max_date_interval_in_dataset = max(train[cfg.data.pred_timestamp_col_name]) - max( - train[cfg.data.pred_timestamp_col_name] + train[cfg.data.pred_timestamp_col_name], ) possible_look_distances = [ diff --git a/src/psycopt2d/tables/performance_by_threshold.py b/src/psycopt2d/tables/performance_by_threshold.py index 1e780728..f2cb7a25 100644 --- a/src/psycopt2d/tables/performance_by_threshold.py +++ b/src/psycopt2d/tables/performance_by_threshold.py @@ -1,7 +1,6 @@ """Get performance by which threshold is used to classify positive.""" -from collections.abc import Iterable +from collections.abc import Iterable, Sequence from typing import Optional, Union -from collections.abc import Sequence import numpy as np import pandas as pd From 5f2ef6e2d3ed8973790fbdefd6bb49957d6ba148 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Tue, 25 Oct 2022 10:49:26 +0200 Subject: [PATCH 43/57] style: linting --- application/train_and_log_models.py | 52 ++++++++++++------------- src/psycopt2d/model_training_watcher.py | 37 +++++++++--------- 2 files changed, 45 insertions(+), 44 deletions(-) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index 9c2d388a..746b3f59 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -206,32 +206,6 @@ def load_cfg(config_file_name): return cfg -def main(): - """Main.""" - msg = Printer(timestamp=True) - - config_file_name = "default_config.yaml" - - cfg = load_cfg(config_file_name=config_file_name) - - if cfg.project.wandb.mode == "run": - msg.warn( - f"wandb.mode is {cfg.project.wandb.mode}, not using the watcher. This will substantially slow down training.", - ) - - train = load_train_raw(cfg=cfg) - possible_look_distances = get_possible_look_distances(msg, cfg, train) - - if not cfg.train.gpu: - msg.warn("Not using GPU for training") - - train_models_for_each_cell_in_grid( - cfg=cfg, - possible_look_distances=possible_look_distances, - config_file_name=config_file_name, - ) - - def get_possible_look_distances(msg: Printer, cfg: FullConfig, train: pd.DataFrame): """Some look_ahead and look_behind distances will result in 0 valid prediction times. Only return combinations which will allow some prediction @@ -276,5 +250,31 @@ def get_possible_look_distances(msg: Printer, cfg: FullConfig, train: pd.DataFra return possible_look_distances +def main(): + """Main.""" + msg = Printer(timestamp=True) + + config_file_name = "default_config.yaml" + + cfg = load_cfg(config_file_name=config_file_name) + + if cfg.project.wandb.mode == "run": + msg.warn( + f"wandb.mode is {cfg.project.wandb.mode}, not using the watcher. This will substantially slow down training.", + ) + + train = load_train_raw(cfg=cfg) + possible_look_distances = get_possible_look_distances(msg, cfg, train) + + if not cfg.train.gpu: + msg.warn("Not using GPU for training") + + train_models_for_each_cell_in_grid( + cfg=cfg, + possible_look_distances=possible_look_distances, + config_file_name=config_file_name, + ) + + if __name__ == "__main__": main() diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py index e9dcfb7c..2ac24818 100644 --- a/src/psycopt2d/model_training_watcher.py +++ b/src/psycopt2d/model_training_watcher.py @@ -109,6 +109,23 @@ def _get_run_id(self, run_dir: Path) -> str: """Get the run id from a run directory.""" return run_dir.name.split("-")[-1] + def _upload_run_dir(self, run_dir: Path) -> str: + """Upload a single run to wandb.""" + # get stdout from subprocess.run + proc = subprocess.run( + ["wandb", "sync", str(run_dir), "--project", self.project_name], + check=True, + capture_output=True, + ) + stdout = proc.stdout.decode("utf-8") + if self.verbose: + msg.info(f"Watcher: {stdout}") + return stdout + + def _get_run_id(self, run_dir: Path) -> str: + """Get the run id from a run directory.""" + return run_dir.name.split("-")[-1] + def upload_unarchived_runs(self) -> None: """Upload unarchived runs to wandb.""" for run_folder in WANDB_DIR.glob(r"offline-run*"): @@ -214,7 +231,8 @@ def _evaluate_and_archive_finished_runs( self._archive_run_dir(run_dir=self._get_run_wandb_dir(run_info.run_id)) def _get_unfinished_run_ids( - self, run_information: list[RunInformation] + self, + run_information: list[RunInformation], ) -> list[str]: """Get the run ids of the unfinished runs.""" return [run_info.run_id for run_info in run_information if run_info.auc is None] @@ -248,23 +266,6 @@ def get_new_runs_and_evaluate(self) -> None: ) self._evaluate_and_archive_finished_runs(run_information=run_infos) - def _upload_run_dir(self, run_dir: Path) -> str: - """Upload a single run to wandb.""" - # get stdout from subprocess.run - proc = subprocess.run( - ["wandb", "sync", str(run_dir), "--project", self.project_name], - check=True, - capture_output=True, - ) - stdout = proc.stdout.decode("utf-8") - if self.verbose: - msg.info(f"Watcher: {stdout}") - return stdout - - def _get_run_id(self, run_dir: Path) -> str: - """Get the run id from a run directory.""" - return run_dir.name.split("-")[-1] - def upload_unarchived_runs(self) -> None: """Upload unarchived runs to wandb. Only adds runs that have finished training to the evaluation queue. From 43e6ba49d376b6bc1470ee70c09c253e6143ffc8 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Tue, 25 Oct 2022 12:55:25 +0200 Subject: [PATCH 44/57] fix: run_id is required --- src/psycopt2d/model_training_watcher.py | 33 ++++++------------------- 1 file changed, 7 insertions(+), 26 deletions(-) diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py index 2ac24818..d634deac 100644 --- a/src/psycopt2d/model_training_watcher.py +++ b/src/psycopt2d/model_training_watcher.py @@ -30,7 +30,9 @@ class RunInformation(BaseModel): """Information about a wandb run.""" - run_id: Optional[str] + # Attributes must be optional since runs can be uploaded, + # without having been sufficiently validated. + run_id: str auc: Optional[float] lookbehind_days: Optional[Union[int, list[int]]] lookahead_days: Optional[int] @@ -122,29 +124,6 @@ def _upload_run_dir(self, run_dir: Path) -> str: msg.info(f"Watcher: {stdout}") return stdout - def _get_run_id(self, run_dir: Path) -> str: - """Get the run id from a run directory.""" - return run_dir.name.split("-")[-1] - - def upload_unarchived_runs(self) -> None: - """Upload unarchived runs to wandb.""" - for run_folder in WANDB_DIR.glob(r"offline-run*"): - run_id = self._get_run_id(run_folder) - - wandb_sync_stdout = self._upload_run_dir(run_folder) - - if "...done" not in wandb_sync_stdout: - if ".wandb file is empty" in wandb_sync_stdout: - if self.verbose: - msg.warn(f"Run {run_id} is still running. Skipping.") - else: - raise ValueError( - f"wandb sync failed, returned: {wandb_sync_stdout}", - ) - continue - - self.run_id_eval_candidates_queue.append(run_id) - def _get_run_evaluation_data_dir(self, run_id: str) -> Path: """Get the evaluation path for a single run.""" return list(self.model_data_dir.glob(f"*{run_id}*"))[0] @@ -203,8 +182,10 @@ def _evaluate_and_archive_finished_runs( lookbehind/-ahead days, and fully evaluate the best performing. Move all wandb run dirs to the archive folder. """ - finished_runs = [ - run_info for run_info in run_information if run_info.auc is not None + finished_runs: list[RunInformation] = [ + run_info + for run_info in run_information + if run_info.auc and run_info.lookahead_lookbehind_combined ] # sort to only upload the best in in each group finished_runs.sort( From bbb7dea2c323c187542f4e88693771c1079df03c Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Tue, 25 Oct 2022 12:55:31 +0200 Subject: [PATCH 45/57] fix: add data dir to synth dataset --- src/psycopt2d/config/data/synth_data.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/psycopt2d/config/data/synth_data.yaml b/src/psycopt2d/config/data/synth_data.yaml index b60f3080..a07c524c 100644 --- a/src/psycopt2d/config/data/synth_data.yaml +++ b/src/psycopt2d/config/data/synth_data.yaml @@ -1,5 +1,7 @@ # @package _global_ data: + dir: tests/test_data/synth_splits + suffix: csv n_training_samples: null min_lookahead_days: 30 min_lookbehind_days: 100 From 4bd3e7ea8225ae57410920dbb729ca626421364c Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Tue, 25 Oct 2022 13:18:35 +0200 Subject: [PATCH 46/57] fix: minor fixes after merge --- .../config/preprocessing/default_preprocessing.yaml | 7 ++++--- src/psycopt2d/train_model.py | 10 +++++----- src/psycopt2d/utils/configs.py | 11 ++++++++--- tests/test_auc_by_group_table.py | 6 +++--- tests/test_train_model.py | 4 ++-- 5 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/psycopt2d/config/preprocessing/default_preprocessing.yaml b/src/psycopt2d/config/preprocessing/default_preprocessing.yaml index 0d33340a..1197da95 100644 --- a/src/psycopt2d/config/preprocessing/default_preprocessing.yaml +++ b/src/psycopt2d/config/preprocessing/default_preprocessing.yaml @@ -2,6 +2,7 @@ convert_to_boolean: False # (Boolean): Convert all prediction values (except gen convert_datetimes_to: False # (str): Options include ordinal or False imputation_method: "most_frequent" # (str): Options include 2most_frequent" transform: null # (str|null): Transformation applied to all predictors after imputation. Options include "z-score-normalization" -feature_selection_method: null -feature_selection_params: - percentile: 10 # (int): Percent of features to keep. Defaults to 10. +feature_selection: + name: null + params: + percentile: 10 # (int): Percent of features to keep. Defaults to 10. diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py index cc32513d..ff7aadf2 100644 --- a/src/psycopt2d/train_model.py +++ b/src/psycopt2d/train_model.py @@ -64,23 +64,23 @@ def create_preprocessing_pipeline(cfg): ("z-score-normalization", StandardScaler()), ) - if cfg.preprocessing.feature_selection_method == "f_classif": + if cfg.preprocessing.feature_selection.name == "f_classif": steps.append( ( "feature_selection", SelectPercentile( f_classif, - percentile=cfg.preprocessing.feature_selection_params.percentile, + percentile=cfg.preprocessing.feature_selection.params["percentile"], ), ), ) - if cfg.preprocessing.feature_selection_method == "chi2": + if cfg.preprocessing.feature_selection.name == "chi2": steps.append( ( "feature_selection", SelectPercentile( chi2, - percentile=cfg.preprocessing.feature_selection_params.percentile, + percentile=cfg.preprocessing.feature_selection.params["percentile"], ), ), ) @@ -351,7 +351,7 @@ def main(cfg: DictConfig): config=dict_config_to_log, mode=cfg.project.wandb.mode, group=cfg.project.wandb.group, - entity=cfg.project.wandb_entity, + entity=cfg.project.wandb.entity, ) if run is None: diff --git a/src/psycopt2d/utils/configs.py b/src/psycopt2d/utils/configs.py index 448980ee..98461894 100644 --- a/src/psycopt2d/utils/configs.py +++ b/src/psycopt2d/utils/configs.py @@ -77,6 +77,13 @@ class DataConf(BaseModel): lookbehind_combination: Optional[list[int]] +class FeatureSelectionConf(BaseModel): + """Configuration for feature selection methods""" + + name: Optional[str] + params: Optional[dict] + + class PreprocessingConf(BaseModel): """Preprocessing config.""" @@ -86,6 +93,7 @@ class PreprocessingConf(BaseModel): transform: Optional[ str ] # (str|null): Transformation applied to all predictors after imputation. Options include "z-score-normalization" + feature_selection: FeatureSelectionConf class ModelConf(BaseModel): @@ -130,9 +138,6 @@ class FullConfig(BaseModel): eval: EvalConf -# ? Should FullConfig be here or in another location? - - def omegaconf_to_pydantic_objects(conf: DictConfig) -> FullConfig: """Converts an omegaconf DictConfig to a pydantic object. diff --git a/tests/test_auc_by_group_table.py b/tests/test_auc_by_group_table.py index 72c0975c..3f91de9f 100644 --- a/tests/test_auc_by_group_table.py +++ b/tests/test_auc_by_group_table.py @@ -1,11 +1,11 @@ """table_test_auc_by_group_table.""" # pylint: disable=missing-function-docstring -from psycopt2d.tables import auc_by_group_table -from psycopt2d.utils import bin_continuous_data +from psycopt2d.tables import auc_by_group_df +from psycopt2d.utils.utils import bin_continuous_data -def test_auc_by_group_table(synth_data): +def test_auc_by_group_df(synth_data): synth_data["Age bins"] = bin_continuous_data( synth_data["age"], bins=[0, 18, 30, 50, 120], diff --git a/tests/test_train_model.py b/tests/test_train_model.py index 765e180f..886d512f 100644 --- a/tests/test_train_model.py +++ b/tests/test_train_model.py @@ -77,8 +77,8 @@ def test_feature_selection(): config_name=CONFIG_FILE_NAME, overrides=[ INTEGRATION_TESTING_MODEL_OVERRIDE, - "preprocessing.feature_selection_method=f_classif", - "preprocessing.feature_selection_params.percentile=10", + "preprocessing.feature_selection.name=f_classif", + "preprocessing.feature_selection.params.percentile=10", # "project.wandb_mode=run", ], ) From 0ebeda4c99906371237106a2f4cfb538dcf0c5c0 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Tue, 25 Oct 2022 13:20:08 +0200 Subject: [PATCH 47/57] style: linting --- src/psycopt2d/utils/configs.py | 2 +- tests/test_train_model.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/psycopt2d/utils/configs.py b/src/psycopt2d/utils/configs.py index 98461894..fb6f18ad 100644 --- a/src/psycopt2d/utils/configs.py +++ b/src/psycopt2d/utils/configs.py @@ -78,7 +78,7 @@ class DataConf(BaseModel): class FeatureSelectionConf(BaseModel): - """Configuration for feature selection methods""" + """Configuration for feature selection methods.""" name: Optional[str] params: Optional[dict] diff --git a/tests/test_train_model.py b/tests/test_train_model.py index 886d512f..78d523f7 100644 --- a/tests/test_train_model.py +++ b/tests/test_train_model.py @@ -74,7 +74,7 @@ def test_feature_selection(): """Test feature selection.""" with initialize(version_base=None, config_path=CONFIG_DIR_PATH): cfg = compose( - config_name=CONFIG_FILE_NAME, + config_name=INTEGRATION_TEST_FILE_NAME, overrides=[ INTEGRATION_TESTING_MODEL_OVERRIDE, "preprocessing.feature_selection.name=f_classif", From 7c9f0c8dcbf40b05f3708bfdc534bb558fd9bea2 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Tue, 25 Oct 2022 13:40:14 +0200 Subject: [PATCH 48/57] fix: feature_selection_test requires more than 1 pred col --- src/psycopt2d/config/data/synth_data.yaml | 2 +- tests/test_train_model.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/psycopt2d/config/data/synth_data.yaml b/src/psycopt2d/config/data/synth_data.yaml index a07c524c..676acce5 100644 --- a/src/psycopt2d/config/data/synth_data.yaml +++ b/src/psycopt2d/config/data/synth_data.yaml @@ -17,7 +17,7 @@ data: # Looking behind max_lookbehind_days: 1850 - lookbehind_combination: [30, 90] + lookbehind_combination: [30, 60, 100] # Parameters that will only take effect if running with --multirun hydra: diff --git a/tests/test_train_model.py b/tests/test_train_model.py index 78d523f7..d4768206 100644 --- a/tests/test_train_model.py +++ b/tests/test_train_model.py @@ -3,6 +3,7 @@ import pytest from hydra import compose, initialize +from psycopt2d.load import load_train_from_cfg from psycopt2d.models import MODELS from psycopt2d.train_model import main from psycopt2d.utils.configs import omegaconf_to_pydantic_objects @@ -73,13 +74,15 @@ def test_min_prediction_time_date(): def test_feature_selection(): """Test feature selection.""" with initialize(version_base=None, config_path=CONFIG_DIR_PATH): + cfg = compose( config_name=INTEGRATION_TEST_FILE_NAME, overrides=[ INTEGRATION_TESTING_MODEL_OVERRIDE, "preprocessing.feature_selection.name=f_classif", - "preprocessing.feature_selection.params.percentile=10", + "preprocessing.feature_selection.params.percentile=100", # "project.wandb_mode=run", ], ) + main(cfg) From 67b6b70414c3f409d9ae2ade690fc10fbf06d8ed Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Tue, 25 Oct 2022 13:40:34 +0200 Subject: [PATCH 49/57] style: linting --- tests/test_train_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_train_model.py b/tests/test_train_model.py index d4768206..84f7446f 100644 --- a/tests/test_train_model.py +++ b/tests/test_train_model.py @@ -3,7 +3,6 @@ import pytest from hydra import compose, initialize -from psycopt2d.load import load_train_from_cfg from psycopt2d.models import MODELS from psycopt2d.train_model import main from psycopt2d.utils.configs import omegaconf_to_pydantic_objects From f51421c936726345073f177aee7e614d78981193 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Tue, 25 Oct 2022 13:41:42 +0200 Subject: [PATCH 50/57] test: meaningful percentiles --- tests/test_train_model.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_train_model.py b/tests/test_train_model.py index 84f7446f..c6265627 100644 --- a/tests/test_train_model.py +++ b/tests/test_train_model.py @@ -79,8 +79,7 @@ def test_feature_selection(): overrides=[ INTEGRATION_TESTING_MODEL_OVERRIDE, "preprocessing.feature_selection.name=f_classif", - "preprocessing.feature_selection.params.percentile=100", - # "project.wandb_mode=run", + "preprocessing.feature_selection.params.percentile=10", ], ) From 917d42e08c0e51b5ae6e7c5271eb827409613fb4 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Tue, 25 Oct 2022 13:50:31 +0200 Subject: [PATCH 51/57] feat: add watcher to main training script --- application/train_and_log_models.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index 746b3f59..4a414d12 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -148,7 +148,7 @@ def train_models_for_each_cell_in_grid( active_trainers: list[subprocess.Popen] = [] wandb_prefix = f"{random_word.get_random_word()}-{random_word.get_random_word()}" - watcher = start_watcher(cfg=cfg) + while lookbehind_combinations or active_trainers: # Wait until there is a free slot in the trainers group if len(active_trainers) >= cfg.train.n_active_trainers: @@ -173,11 +173,11 @@ def train_models_for_each_cell_in_grid( msg.warn(f"No rows for {combination}, continuing") continue - # watcher = start_watcher(cfg=cfg) msg.info( f"Spawning a new trainer with lookbehind={combination.lookbehind} and lookahead={combination.lookahead}", ) wandb_group = f"{wandb_prefix}" + active_trainers.append( start_trainer( cfg=cfg, @@ -187,13 +187,6 @@ def train_models_for_each_cell_in_grid( ), ) - msg.good( - f"Training finished. Stopping the watcher in {cfg.project.watcher.keep_alive_after_training_minutes} minutes...", - ) - - time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes) - watcher.kill() - def load_cfg(config_file_name): """Load config as pydantic object.""" @@ -269,12 +262,21 @@ def main(): if not cfg.train.gpu: msg.warn("Not using GPU for training") + watcher = start_watcher(cfg=cfg) + train_models_for_each_cell_in_grid( cfg=cfg, possible_look_distances=possible_look_distances, config_file_name=config_file_name, ) + msg.good( + f"Training finished. Stopping the watcher in {cfg.project.watcher.keep_alive_after_training_minutes} minutes...", + ) + + time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes) + watcher.kill() + if __name__ == "__main__": main() From bf0bb64e7bf5ca37cc5687f671c0471a73b2b9a4 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Tue, 25 Oct 2022 13:53:57 +0200 Subject: [PATCH 52/57] misc. fixes --- application/train_and_log_models.py | 33 +++++++++++-------- src/psycopt2d/config/data/t2d_parquet.yaml | 2 +- .../config/project/default_project.yaml | 2 +- src/psycopt2d/train_model.py | 2 ++ 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index 4a414d12..523c3329 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -188,7 +188,7 @@ def train_models_for_each_cell_in_grid( ) -def load_cfg(config_file_name): +def load_cfg(config_file_name) -> FullConfig: """Load config as pydantic object.""" with initialize(version_base=None, config_path="../src/psycopt2d/config/"): cfg = compose( @@ -199,7 +199,9 @@ def load_cfg(config_file_name): return cfg -def get_possible_look_distances(msg: Printer, cfg: FullConfig, train: pd.DataFrame): +def get_possible_look_distances( + msg: Printer, cfg: FullConfig, train: pd.DataFrame +) -> list[PossibleLookDistanceDays]: """Some look_ahead and look_behind distances will result in 0 valid prediction times. Only return combinations which will allow some prediction times. @@ -227,7 +229,7 @@ def get_possible_look_distances(msg: Printer, cfg: FullConfig, train: pd.DataFra possible_look_distances = [ dist for dist in lookbehind_combinations - if ((dist.ahead + dist.behind_days) < max_date_interval_in_dataset) + if ((dist.ahead_days + dist.behind_days) < max_date_interval_in_dataset) ] # Remove "9999" from possible look distances behind @@ -240,6 +242,7 @@ def get_possible_look_distances(msg: Printer, cfg: FullConfig, train: pd.DataFra msg.info(f"Possible lookbehind days: {possible_look_distances.behind}") msg.info(f"Possible lookahead days: {possible_look_distances.ahead}") + return possible_look_distances @@ -251,10 +254,8 @@ def main(): cfg = load_cfg(config_file_name=config_file_name) - if cfg.project.wandb.mode == "run": - msg.warn( - f"wandb.mode is {cfg.project.wandb.mode}, not using the watcher. This will substantially slow down training.", - ) + # Override for testing + cfg.train.n_active_trainers = 1 train = load_train_raw(cfg=cfg) possible_look_distances = get_possible_look_distances(msg, cfg, train) @@ -262,7 +263,12 @@ def main(): if not cfg.train.gpu: msg.warn("Not using GPU for training") - watcher = start_watcher(cfg=cfg) + if cfg.project.wandb.mode == "run": + msg.warn( + f"wandb.mode is {cfg.project.wandb.mode}, not using the watcher. This will substantially slow down training.", + ) + else: + watcher = start_watcher(cfg=cfg) train_models_for_each_cell_in_grid( cfg=cfg, @@ -270,12 +276,13 @@ def main(): config_file_name=config_file_name, ) - msg.good( - f"Training finished. Stopping the watcher in {cfg.project.watcher.keep_alive_after_training_minutes} minutes...", - ) + if cfg.project.wand.mode != "run": + msg.good( + f"Training finished. Stopping the watcher in {cfg.project.watcher.keep_alive_after_training_minutes} minutes...", + ) - time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes) - watcher.kill() + time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes) + watcher.kill() if __name__ == "__main__": diff --git a/src/psycopt2d/config/data/t2d_parquet.yaml b/src/psycopt2d/config/data/t2d_parquet.yaml index f88a9402..6f3d394a 100644 --- a/src/psycopt2d/config/data/t2d_parquet.yaml +++ b/src/psycopt2d/config/data/t2d_parquet.yaml @@ -25,4 +25,4 @@ data: hydra: sweeper: params: - ++data.lookbehind_combination: choice([30, 90, 180, 365, 730], [30, 180, 730], [30, 730], [730], [365], [90], [30]) + ++data.lookbehind_combination: choice([30, 90, 180, 365, 730], [30, 180, 730], [730], [365], [90], [30]) diff --git a/src/psycopt2d/config/project/default_project.yaml b/src/psycopt2d/config/project/default_project.yaml index 563ab5a2..b98f94c8 100644 --- a/src/psycopt2d/config/project/default_project.yaml +++ b/src/psycopt2d/config/project/default_project.yaml @@ -3,7 +3,7 @@ seed: 42 wandb: entity: "psycop" # Which entity to run WanDB in. - mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" + mode: "offline" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" group: "psycop-t2d" # Which group to run WanDB in. watcher: diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py index ff7aadf2..90b2ce98 100644 --- a/src/psycopt2d/train_model.py +++ b/src/psycopt2d/train_model.py @@ -389,6 +389,8 @@ def main(cfg: DictConfig): y_hat_prob_col_name="y_hat_prob", feature_importance_dict=get_feature_importance_dict(pipe), run=run, + pipe=pipe, + train_col_names=train_col_names, ) roc_auc = roc_auc_score( From 45a9addcec7c891dc2ec7ba11c77311c73eaa4b1 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Tue, 25 Oct 2022 14:05:03 +0200 Subject: [PATCH 53/57] fix: type errors --- application/train_and_log_models.py | 53 ++++++++++++------------- src/psycopt2d/model_training_watcher.py | 10 +++-- 2 files changed, 32 insertions(+), 31 deletions(-) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index 523c3329..35716fe6 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -26,8 +26,8 @@ msg = Printer(timestamp=True) -class PossibleLookDistanceDays(BaseModel): - """Possible look distances.""" +class LookDistances(BaseModel): + """A distance of ahead and behind.""" ahead: list[str] behind: list[str] @@ -44,7 +44,7 @@ def load_train_raw(cfg: FullConfig): raise ValueError(f"Returned {len(file)} files") -def infer_possible_look_distances(df: pd.DataFrame) -> PossibleLookDistanceDays: +def infer_possible_look_distances(df: pd.DataFrame) -> LookDistances: """Infer the possible values for min_lookahead_days and min_lookbehind_days.""" # Get potential lookaheads from outc_ columns @@ -56,7 +56,7 @@ def infer_possible_look_distances(df: pd.DataFrame) -> PossibleLookDistanceDays: pred_col_names = infer_predictor_col_name(df=df, allow_multiple=True) possible_lookbehind_days = list(set(infer_look_distance(col_name=pred_col_names))) - return PossibleLookDistanceDays( + return LookDistances( ahead=possible_lookahead_days, behind=possible_lookbehind_days, ) @@ -128,7 +128,7 @@ def start_watcher(cfg: FullConfig) -> subprocess.Popen: def train_models_for_each_cell_in_grid( cfg: FullConfig, - possible_look_distances: PossibleLookDistanceDays, + possible_look_distances: LookDistances, config_file_name: str, ): """Train a model for each cell in the grid of possible look distances.""" @@ -164,8 +164,8 @@ def train_models_for_each_cell_in_grid( # Check if any rows in the given combinatin of lookbehind and lookahead days cfg_for_checking_any_rows = cfg.copy() - cfg_for_checking_any_rows.data.min_lookbehind_days = combination.lookbehind - cfg_for_checking_any_rows.data.min_lookahead_days = combination.lookahead + cfg_for_checking_any_rows.data.min_lookbehind_days = combination.behind_days + cfg_for_checking_any_rows.data.min_lookahead_days = combination.ahead_days train = load_train_from_cfg(cfg=cfg) @@ -174,7 +174,7 @@ def train_models_for_each_cell_in_grid( continue msg.info( - f"Spawning a new trainer with lookbehind={combination.lookbehind} and lookahead={combination.lookahead}", + f"Spawning a new trainer with lookbehind={combination.behind_days} and lookahead={combination.ahead_days}", ) wandb_group = f"{wandb_prefix}" @@ -201,7 +201,7 @@ def load_cfg(config_file_name) -> FullConfig: def get_possible_look_distances( msg: Printer, cfg: FullConfig, train: pd.DataFrame -) -> list[PossibleLookDistanceDays]: +) -> list[LookDirectionCombination]: """Some look_ahead and look_behind distances will result in 0 valid prediction times. Only return combinations which will allow some prediction times. @@ -213,37 +213,36 @@ def get_possible_look_distances( Will mean that no rows satisfy the criteria. """ - possible_look_distances = infer_possible_look_distances(df=train) + look_combinations_in_dataset = infer_possible_look_distances(df=train) - lookbehind_combinations = [ + look_distance_combinations = [ LookDirectionCombination(behind_days=behind_days, ahead_days=ahead_days) - for behind_days in possible_look_distances.behind - for ahead_days in possible_look_distances.ahead + for behind_days in look_combinations_in_dataset.behind + for ahead_days in look_combinations_in_dataset.ahead ] # Don't try look distance combinations which will result in 0 rows - max_date_interval_in_dataset = max(train[cfg.data.pred_timestamp_col_name]) - max( + max_distance_in_dataset_days = max(train[cfg.data.pred_timestamp_col_name]) - max( train[cfg.data.pred_timestamp_col_name], ) - possible_look_distances = [ + look_combinations_without_rows = [ dist - for dist in lookbehind_combinations - if ((dist.ahead_days + dist.behind_days) < max_date_interval_in_dataset) + for dist in look_distance_combinations + if ((dist.ahead_days + dist.behind_days)) > max_distance_in_dataset_days ] - # Remove "9999" from possible look distances behind - if cfg.data.max_lookbehind_days: - possible_look_distances.behind = [ - dist - for dist in possible_look_distances.behind - if int(dist) <= cfg.data.max_lookbehind_days - ] + msg.info( + f"Not fitting model to {look_combinations_without_rows}, since no rows satisfy the criteria." + ) - msg.info(f"Possible lookbehind days: {possible_look_distances.behind}") - msg.info(f"Possible lookahead days: {possible_look_distances.ahead}") + look_combinations_with_rows = [ + dist + for dist in look_distance_combinations + if ((dist.ahead_days + dist.behind_days) < max_distance_in_dataset_days) + ] - return possible_look_distances + return look_combinations_with_rows def main(): diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py index d634deac..d6d05c1d 100644 --- a/src/psycopt2d/model_training_watcher.py +++ b/src/psycopt2d/model_training_watcher.py @@ -199,15 +199,17 @@ def _evaluate_and_archive_finished_runs( if finished_runs: for run_info in finished_runs: if ( - run_info.auc - > self.max_performances[run_info.lookahead_lookbehind_combined] + run_info.auc # type: ignore + > self.max_performances[ + run_info.lookahead_lookbehind_combined # type: ignore + ] ): msg.good( f"New record performance for {run_info.lookahead_lookbehind_combined}! AUC: {run_info.auc}", ) self.max_performances[ - run_info.lookahead_lookbehind_combined - ] = run_info.auc + run_info.lookahead_lookbehind_combined # type: ignore + ] = run_info.auc # type: ignore self._do_evaluation(run_info.run_id) self._archive_run_dir(run_dir=self._get_run_wandb_dir(run_info.run_id)) From faa43dc302e6c236cf141737b5b3aded4ad1ea0a Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Tue, 25 Oct 2022 14:11:01 +0200 Subject: [PATCH 54/57] fix: type errors --- application/train_and_log_models.py | 47 +++++++++++------------------ 1 file changed, 17 insertions(+), 30 deletions(-) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index 35716fe6..52b87ee8 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -9,6 +9,7 @@ import subprocess import time from pathlib import Path +from typing import Union import pandas as pd from hydra import compose, initialize @@ -26,11 +27,11 @@ msg = Printer(timestamp=True) -class LookDistances(BaseModel): +class LookDistance(BaseModel): """A distance of ahead and behind.""" - ahead: list[str] - behind: list[str] + behind_days: list[Union[int, float]] + ahead_days: list[Union[int, float]] def load_train_raw(cfg: FullConfig): @@ -44,7 +45,7 @@ def load_train_raw(cfg: FullConfig): raise ValueError(f"Returned {len(file)} files") -def infer_possible_look_distances(df: pd.DataFrame) -> LookDistances: +def infer_possible_look_distances(df: pd.DataFrame) -> LookDistance: """Infer the possible values for min_lookahead_days and min_lookbehind_days.""" # Get potential lookaheads from outc_ columns @@ -56,23 +57,16 @@ def infer_possible_look_distances(df: pd.DataFrame) -> LookDistances: pred_col_names = infer_predictor_col_name(df=df, allow_multiple=True) possible_lookbehind_days = list(set(infer_look_distance(col_name=pred_col_names))) - return LookDistances( - ahead=possible_lookahead_days, - behind=possible_lookbehind_days, + return LookDistance( + behind_days=possible_lookahead_days, + ahead_days=possible_lookbehind_days, ) -class LookDirectionCombination(BaseModel): - """A combination of lookbehind and lookahead days.""" - - behind_days: int - ahead_days: int - - def start_trainer( cfg: FullConfig, config_file_name: str, - cell: LookDirectionCombination, + cell: LookDistance, wandb_group_override: str, ) -> subprocess.Popen: """Start a trainer.""" @@ -128,7 +122,7 @@ def start_watcher(cfg: FullConfig) -> subprocess.Popen: def train_models_for_each_cell_in_grid( cfg: FullConfig, - possible_look_distances: LookDistances, + possible_look_distances: list[LookDistance], config_file_name: str, ): """Train a model for each cell in the grid of possible look distances.""" @@ -136,20 +130,13 @@ def train_models_for_each_cell_in_grid( random_word = RandomWords() - # Create all combinations of lookbehind and lookahead days - lookbehind_combinations = [ - LookDirectionCombination(lookbehind=lookbehind, lookahead=lookahead) - for lookbehind in possible_look_distances.behind - for lookahead in possible_look_distances.ahead - ] - - random.shuffle(lookbehind_combinations) + random.shuffle(possible_look_distances) active_trainers: list[subprocess.Popen] = [] wandb_prefix = f"{random_word.get_random_word()}-{random_word.get_random_word()}" - while lookbehind_combinations or active_trainers: + while possible_look_distances or active_trainers: # Wait until there is a free slot in the trainers group if len(active_trainers) >= cfg.train.n_active_trainers: # Drop trainers if they have finished @@ -160,7 +147,7 @@ def train_models_for_each_cell_in_grid( # Start a new trainer - combination = lookbehind_combinations.pop() + combination = possible_look_distances.pop() # Check if any rows in the given combinatin of lookbehind and lookahead days cfg_for_checking_any_rows = cfg.copy() @@ -201,7 +188,7 @@ def load_cfg(config_file_name) -> FullConfig: def get_possible_look_distances( msg: Printer, cfg: FullConfig, train: pd.DataFrame -) -> list[LookDirectionCombination]: +) -> list[LookDistance]: """Some look_ahead and look_behind distances will result in 0 valid prediction times. Only return combinations which will allow some prediction times. @@ -216,9 +203,9 @@ def get_possible_look_distances( look_combinations_in_dataset = infer_possible_look_distances(df=train) look_distance_combinations = [ - LookDirectionCombination(behind_days=behind_days, ahead_days=ahead_days) - for behind_days in look_combinations_in_dataset.behind - for ahead_days in look_combinations_in_dataset.ahead + LookDistance(behind_days=behind_days, ahead_days=ahead_days) + for behind_days in look_combinations_in_dataset.ahead_days + for ahead_days in look_combinations_in_dataset.behind_days ] # Don't try look distance combinations which will result in 0 rows From 7fd5ba1ca2ce5e238c8e34d9a2d0347abf4134d9 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Wed, 26 Oct 2022 12:09:05 +0200 Subject: [PATCH 55/57] fix: watcher is working --- application/train_and_log_models.py | 66 +++++++++---------- .../config/project/default_project.yaml | 1 + .../project/integration_test_project.yaml | 1 + .../config/train/default_training.yaml | 2 +- src/psycopt2d/evaluation.py | 20 +----- src/psycopt2d/load.py | 28 ++------ src/psycopt2d/model_training_watcher.py | 9 +-- src/psycopt2d/utils/configs.py | 3 +- 8 files changed, 46 insertions(+), 84 deletions(-) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index 52b87ee8..0354be81 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -30,22 +30,27 @@ class LookDistance(BaseModel): """A distance of ahead and behind.""" - behind_days: list[Union[int, float]] - ahead_days: list[Union[int, float]] + behind_days: Union[int, float] + ahead_days: Union[int, float] def load_train_raw(cfg: FullConfig): """Load the data.""" path = Path(cfg.data.dir) - file = list(path.glob(pattern=r"*train*")) + file_names = list(path.glob(pattern=r"*train*")) - if len(file) == 1: - return pd.read_parquet(file) + if len(file_names) == 1: + file_name = file_names[0] + file_suffix = file_name.suffix + if file_suffix == ".parquet": + return pd.read_parquet(file_name) + elif file_suffix == ".csv": + return pd.read_csv(file_name) - raise ValueError(f"Returned {len(file)} files") + raise ValueError(f"Returned {len(file_names)} files") -def infer_possible_look_distances(df: pd.DataFrame) -> LookDistance: +def infer_possible_look_distances(df: pd.DataFrame) -> list[LookDistance]: """Infer the possible values for min_lookahead_days and min_lookbehind_days.""" # Get potential lookaheads from outc_ columns @@ -57,10 +62,14 @@ def infer_possible_look_distances(df: pd.DataFrame) -> LookDistance: pred_col_names = infer_predictor_col_name(df=df, allow_multiple=True) possible_lookbehind_days = list(set(infer_look_distance(col_name=pred_col_names))) - return LookDistance( - behind_days=possible_lookahead_days, - ahead_days=possible_lookbehind_days, - ) + return [ + LookDistance( + behind_days=lookbehind_days, + ahead_days=lookahead_days, + ) + for lookahead_days in possible_lookahead_days + for lookbehind_days in possible_lookbehind_days + ] def start_trainer( @@ -149,17 +158,6 @@ def train_models_for_each_cell_in_grid( combination = possible_look_distances.pop() - # Check if any rows in the given combinatin of lookbehind and lookahead days - cfg_for_checking_any_rows = cfg.copy() - cfg_for_checking_any_rows.data.min_lookbehind_days = combination.behind_days - cfg_for_checking_any_rows.data.min_lookahead_days = combination.ahead_days - - train = load_train_from_cfg(cfg=cfg) - - if train.shape[0] == 0: - msg.warn(f"No rows for {combination}, continuing") - continue - msg.info( f"Spawning a new trainer with lookbehind={combination.behind_days} and lookahead={combination.ahead_days}", ) @@ -202,20 +200,17 @@ def get_possible_look_distances( look_combinations_in_dataset = infer_possible_look_distances(df=train) - look_distance_combinations = [ - LookDistance(behind_days=behind_days, ahead_days=ahead_days) - for behind_days in look_combinations_in_dataset.ahead_days - for ahead_days in look_combinations_in_dataset.behind_days - ] - # Don't try look distance combinations which will result in 0 rows - max_distance_in_dataset_days = max(train[cfg.data.pred_timestamp_col_name]) - max( - train[cfg.data.pred_timestamp_col_name], - ) + max_distance_in_dataset_days = ( + max(train[cfg.data.pred_timestamp_col_name]) + - min( + train[cfg.data.pred_timestamp_col_name], + ) + ).days look_combinations_without_rows = [ dist - for dist in look_distance_combinations + for dist in look_combinations_in_dataset if ((dist.ahead_days + dist.behind_days)) > max_distance_in_dataset_days ] @@ -225,7 +220,7 @@ def get_possible_look_distances( look_combinations_with_rows = [ dist - for dist in look_distance_combinations + for dist in look_combinations_in_dataset if ((dist.ahead_days + dist.behind_days) < max_distance_in_dataset_days) ] @@ -240,9 +235,8 @@ def main(): cfg = load_cfg(config_file_name=config_file_name) - # Override for testing - cfg.train.n_active_trainers = 1 - + # Load dataset without dropping any rows for inferring + # which look distances to grid search over train = load_train_raw(cfg=cfg) possible_look_distances = get_possible_look_distances(msg, cfg, train) diff --git a/src/psycopt2d/config/project/default_project.yaml b/src/psycopt2d/config/project/default_project.yaml index b98f94c8..60ef0af8 100644 --- a/src/psycopt2d/config/project/default_project.yaml +++ b/src/psycopt2d/config/project/default_project.yaml @@ -10,3 +10,4 @@ watcher: archive_all: false keep_alive_after_training_minutes: 5 n_runs_before_eval: 1 + verbose: true diff --git a/src/psycopt2d/config/project/integration_test_project.yaml b/src/psycopt2d/config/project/integration_test_project.yaml index eea71af1..dceda704 100644 --- a/src/psycopt2d/config/project/integration_test_project.yaml +++ b/src/psycopt2d/config/project/integration_test_project.yaml @@ -8,4 +8,5 @@ watcher: archive_all: true keep_alive_after_training_minutes: 5 n_runs_before_eval: 1 + verbose: true gpu: false diff --git a/src/psycopt2d/config/train/default_training.yaml b/src/psycopt2d/config/train/default_training.yaml index e81d99be..f5378cb6 100644 --- a/src/psycopt2d/config/train/default_training.yaml +++ b/src/psycopt2d/config/train/default_training.yaml @@ -1,4 +1,4 @@ n_splits: 3 # (int, Null): Number of k-folds during CV. If Null, loads pre-defined dataset. n_trials_per_lookdirection_combination: 20 -n_active_trainers: 8 +n_active_trainers: 1 gpu: true diff --git a/src/psycopt2d/evaluation.py b/src/psycopt2d/evaluation.py index 5f7745da..6c3d9704 100644 --- a/src/psycopt2d/evaluation.py +++ b/src/psycopt2d/evaluation.py @@ -58,9 +58,7 @@ def log_feature_importances( def evaluate_model( cfg: FullConfig, eval_df: pd.DataFrame, - pipe: Pipeline, y_col_name: str, - train_col_names: Iterable[str], y_hat_prob_col_name: str, run: wandb_run, feature_importance_dict: Optional[dict[str, float]], @@ -77,6 +75,8 @@ def evaluate_model( run (wandb_run): WandB run to log to. feature_importance_dict (Optional[dict[str, float]]): Dict of feature names and their importance. If None, will not log feature importance. + selected_features (Optional[list[str]]): List of selected features after preprocessing. + Used for plotting. """ msg = Printer(timestamp=True) @@ -101,22 +101,6 @@ def evaluate_model( pred_timestamps = eval_df[cfg.data.pred_timestamp_col_name] y_hat_int = np.round(y_hat_probs, 0) - if "feature_selection" in pipe["preprocessing"].named_steps: - selected_features = ( - eval_df[train_col_names] - .columns[pipe["preprocessing"]["feature_selection"].get_support()] - .to_list() - ) - - run.log( - { - "feature_selection_table": feature_selection_table( - feature_names=train_col_names, - selected_feature_names=selected_features, - ), - }, - ) - date_bins_ahead: Iterable[int] = cfg.eval.date_bins_ahead date_bins_behind: Iterable[int] = cfg.eval.date_bins_behind diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py index e680a4c5..c0309b5f 100644 --- a/src/psycopt2d/load.py +++ b/src/psycopt2d/load.py @@ -13,12 +13,9 @@ from psycopt2d.evaluate_saved_model_predictions import infer_look_distance from psycopt2d.utils.configs import FullConfig -from psycopt2d.utils.utils import ( - coerce_to_datetime, - get_percent_lost, - infer_outcome_col_name, - infer_predictor_col_name, -) +from psycopt2d.utils.utils import (coerce_to_datetime, get_percent_lost, + infer_outcome_col_name, + infer_predictor_col_name) msg = Printer(timestamp=True) @@ -376,10 +373,9 @@ def _keep_unique_outcome_col_with_lookahead_days_matching_conf( if not col_to_drop: return dataset - col_to_drop = col_to_drop[0] if len(col_to_drop) == 1 else outcome_cols df = dataset.drop(col_to_drop, axis=1) - if not isinstance(infer_outcome_col_name(df), str): + if not len(infer_outcome_col_name(df)) == 1: raise ValueError( "Returning more than one outcome column, will cause problems during eval.", ) @@ -440,22 +436,6 @@ def load_dataset_from_dir( pd.DataFrame: The filtered dataset """ msg.info(f"Loading {split_names}") - # Handle input types - for timedelta_arg in ( - self.cfg.data.min_lookbehind_days, - self.cfg.data.min_lookahead_days, - ): - if timedelta_arg: - timedelta_arg = timedelta(days=timedelta_arg) # type: ignore - - for date_arg in ( - self.cfg.data.drop_patient_if_outcome_before_date, - self.cfg.data.min_prediction_time_date, - ): - if isinstance(date_arg, str): - date_arg = coerce_to_datetime( - date_repr=date_arg, - ) # Concat splits if multiple are given if isinstance(split_names, (list, tuple)): diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py index d6d05c1d..0de379af 100644 --- a/src/psycopt2d/model_training_watcher.py +++ b/src/psycopt2d/model_training_watcher.py @@ -23,7 +23,6 @@ load_evaluation_data, ) -# Path to the wandb directory WANDB_DIR = PROJECT_ROOT / "wandb" @@ -119,7 +118,9 @@ def _upload_run_dir(self, run_dir: Path) -> str: check=True, capture_output=True, ) + stdout = proc.stdout.decode("utf-8") + if self.verbose: msg.info(f"Watcher: {stdout}") return stdout @@ -139,8 +140,8 @@ def _do_evaluation(self, run_id: str) -> None: # get evaluation data eval_data = self._get_eval_data(run_id) # infer required column names - y_col_name = infer_outcome_col_name(df=eval_data.df, prefix="outc_") - y_hat_prob_col_name = infer_y_hat_prob_col_name(df=eval_data.df) + y_col_name = infer_outcome_col_name(df=eval_data.df, prefix="outc_")[0] + y_hat_prob_col_name = infer_y_hat_prob_col_name(df=eval_data.df)[0] # get wandb run run: Run = wandb.init(project=self.project_name, entity=self.entity, id=run_id) # type: ignore @@ -261,7 +262,7 @@ def upload_unarchived_runs(self) -> None: wandb_sync_stdout = self._upload_run_dir(run_folder) - if "...done" not in wandb_sync_stdout: + if "... done" not in wandb_sync_stdout: if ".wandb file is empty" not in wandb_sync_stdout: raise ValueError( f"wandb sync failed, returned: {wandb_sync_stdout}", diff --git a/src/psycopt2d/utils/configs.py b/src/psycopt2d/utils/configs.py index fb6f18ad..1b43ef45 100644 --- a/src/psycopt2d/utils/configs.py +++ b/src/psycopt2d/utils/configs.py @@ -22,6 +22,7 @@ class Config: """Allow arbitrary types.""" arbitrary_types_allowed = True + allow_mutation = False class WandbConf(BaseModel): @@ -38,6 +39,7 @@ class WatcherConf(BaseModel): archive_all: bool keep_alive_after_training_minutes: Union[int, float] n_runs_before_eval: int + verbose: bool class ProjectConf(BaseModel): @@ -46,7 +48,6 @@ class ProjectConf(BaseModel): wandb: WandbConf name: str = "psycopt2d" seed: int - wandb: WandbConf watcher: WatcherConf From cb3de8b273ee677986a8debd0e9cf264c77d31d1 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Wed, 26 Oct 2022 12:09:30 +0200 Subject: [PATCH 56/57] style: linting --- application/train_and_log_models.py | 9 +++++---- src/psycopt2d/evaluation.py | 2 -- src/psycopt2d/load.py | 8 +++++--- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py index 0354be81..c5b5e226 100644 --- a/application/train_and_log_models.py +++ b/application/train_and_log_models.py @@ -21,7 +21,6 @@ infer_outcome_col_name, infer_predictor_col_name, ) -from psycopt2d.load import load_train_from_cfg from psycopt2d.utils.configs import FullConfig, omegaconf_to_pydantic_objects msg = Printer(timestamp=True) @@ -185,7 +184,9 @@ def load_cfg(config_file_name) -> FullConfig: def get_possible_look_distances( - msg: Printer, cfg: FullConfig, train: pd.DataFrame + msg: Printer, + cfg: FullConfig, + train: pd.DataFrame, ) -> list[LookDistance]: """Some look_ahead and look_behind distances will result in 0 valid prediction times. Only return combinations which will allow some prediction @@ -211,11 +212,11 @@ def get_possible_look_distances( look_combinations_without_rows = [ dist for dist in look_combinations_in_dataset - if ((dist.ahead_days + dist.behind_days)) > max_distance_in_dataset_days + if (dist.ahead_days + dist.behind_days) > max_distance_in_dataset_days ] msg.info( - f"Not fitting model to {look_combinations_without_rows}, since no rows satisfy the criteria." + f"Not fitting model to {look_combinations_without_rows}, since no rows satisfy the criteria.", ) look_combinations_with_rows = [ diff --git a/src/psycopt2d/evaluation.py b/src/psycopt2d/evaluation.py index 6c3d9704..003c959e 100644 --- a/src/psycopt2d/evaluation.py +++ b/src/psycopt2d/evaluation.py @@ -7,7 +7,6 @@ import pandas as pd from omegaconf.dictconfig import DictConfig from sklearn.metrics import recall_score, roc_auc_score -from sklearn.pipeline import Pipeline from wandb.sdk.wandb_run import Run as wandb_run # pylint: disable=no-name-in-module from wasabi import Printer @@ -15,7 +14,6 @@ from psycopt2d.tables.performance_by_threshold import ( generate_performance_by_positive_rate_table, ) -from psycopt2d.tables.tables import feature_selection_table from psycopt2d.utils.configs import FullConfig from psycopt2d.utils.utils import PROJECT_ROOT, positive_rate_to_pred_probs from psycopt2d.visualization import ( diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py index c0309b5f..092b09ce 100644 --- a/src/psycopt2d/load.py +++ b/src/psycopt2d/load.py @@ -13,9 +13,11 @@ from psycopt2d.evaluate_saved_model_predictions import infer_look_distance from psycopt2d.utils.configs import FullConfig -from psycopt2d.utils.utils import (coerce_to_datetime, get_percent_lost, - infer_outcome_col_name, - infer_predictor_col_name) +from psycopt2d.utils.utils import ( + get_percent_lost, + infer_outcome_col_name, + infer_predictor_col_name, +) msg = Printer(timestamp=True) From 65dc59a10e390f60917862cfd4239f306ad1059d Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Wed, 26 Oct 2022 12:20:10 +0200 Subject: [PATCH 57/57] fix: failing tests --- src/psycopt2d/load.py | 4 ++-- src/psycopt2d/train_and_log_models.py | 2 -- tests/test_load.py | 7 +++---- tests/test_train_model.py | 1 + 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py index 092b09ce..41958499 100644 --- a/src/psycopt2d/load.py +++ b/src/psycopt2d/load.py @@ -338,9 +338,9 @@ def _drop_cols_and_rows_if_look_direction_not_met( for direction in ("ahead", "behind"): if direction in ("ahead", "behind"): - if self.cfg.data.min_lookahead_days: + if direction == "ahead": n_days = self.cfg.data.min_lookahead_days - elif self.cfg.data.min_lookbehind_days: + elif direction == "behind": n_days = self.cfg.data.min_lookbehind_days else: continue diff --git a/src/psycopt2d/train_and_log_models.py b/src/psycopt2d/train_and_log_models.py index 764b28af..3608733f 100644 --- a/src/psycopt2d/train_and_log_models.py +++ b/src/psycopt2d/train_and_log_models.py @@ -8,8 +8,6 @@ - Run this script from project root with `python src/psycopt2d/train_and_log_models.py` """ -# TODO: Should be unified with the other train_and_log_models in application. Will be done when merging parent branch. - import subprocess import time diff --git a/tests/test_load.py b/tests/test_load.py index 094fe3c6..90991383 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -11,14 +11,13 @@ def test_load_lookbehind_exceeds_lookbehind_threshold(): with initialize(version_base=None, config_path="../src/psycopt2d/config/"): cfg = compose( config_name="integration_testing.yaml", + overrides=["data.min_lookbehind_days=60"], ) cfg = omegaconf_to_pydantic_objects(cfg) - - cfg.data.min_lookahead_days = 30 split_dataset = load_train_and_val_from_cfg(cfg) - assert split_dataset.train.shape[1] == 6 + assert split_dataset.train.shape[1] == 7 def test_load_lookbehind_not_in_lookbehind_combination(): @@ -27,11 +26,11 @@ def test_load_lookbehind_not_in_lookbehind_combination(): with initialize(version_base=None, config_path="../src/psycopt2d/config/"): cfg = compose( config_name="integration_testing.yaml", + overrides=["data.lookbehind_combination=[30]"], ) cfg = omegaconf_to_pydantic_objects(cfg) - cfg.data.lookbehind_combination = [30] split_dataset = load_train_and_val_from_cfg(cfg) assert split_dataset.train.shape[1] == 6 diff --git a/tests/test_train_model.py b/tests/test_train_model.py index c6265627..21762d11 100644 --- a/tests/test_train_model.py +++ b/tests/test_train_model.py @@ -3,6 +3,7 @@ import pytest from hydra import compose, initialize +from psycopt2d.load import load_train_from_cfg from psycopt2d.models import MODELS from psycopt2d.train_model import main from psycopt2d.utils.configs import omegaconf_to_pydantic_objects