From c82fa61c984fe999b35ed86a6bc4e035cd8507bb Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Thu, 20 Oct 2022 11:02:16 +0200
Subject: [PATCH 01/57] feat: intermediate refactor

---
 application/train_and_log_models.py           | 67 +++++++++++++++++++
 .../evaluate_saved_model_predictions.py       | 61 +++++++++++++----
 2 files changed, 114 insertions(+), 14 deletions(-)
 create mode 100644 application/train_and_log_models.py

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
new file mode 100644
index 00000000..1a3e0541
--- /dev/null
+++ b/application/train_and_log_models.py
@@ -0,0 +1,67 @@
+"""Example script to train multiple models and subsequently log the results to
+wandb.
+
+Usage:
+- Replace the HYDRA_ARGS string with the desired arguments for `train_model.py`
+- Run this script from project root with `python src/psycopt2d/train_and_log_models.py`
+-
+"""
+import os
+from pathlib import Path
+
+from psycopt2d.evaluate_saved_model_predictions import (
+    infer_look_distance,
+    infer_outcome_col_name,
+    infer_predictor_col_names,
+)
+from psycopt2d.load import DataLoader, DatasetSpecification, DatasetTimeSpecification
+
+BASE_CONF_FILE_NAME = f"default_config.yaml"
+DATA_DIR = (
+    Path("E:")
+    / "shared_resources"
+    / "feature_sets"
+    / "t2d"
+    / "feature_sets"
+    / "psycop_t2d_adminmanber_201_features_2022_10_05_15_14"
+)
+
+BASE_ARGS = f"--multirun +model=xgboost --config-name {BASE_CONF_FILE_NAME}"
+WANDB_PROJECT = "psycopt2d-testing"
+
+if __name__ == "__main__":
+    time_spec = DatasetTimeSpecification(
+        drop_patient_if_outcome_before_date="1979-01-01",
+        min_prediction_time_date="1979-01-01",
+        min_lookbehind_days=0,
+        min_lookahead_days=0,
+    )
+
+    dataset_spec = DatasetSpecification(
+        file_suffix="parquet",
+        time_spec=time_spec,
+        pred_col_name_prefix="pred_",
+        pred_time_colname="timestamp",
+        split_dir_path=DATA_DIR,
+        time=time_spec,
+    )
+
+    loader = DataLoader(dataset_spec)
+    train = loader.load_dataset_from_dir(split_names="train")
+
+    # Get potential lookaheads from outc_ columns
+    outcome_col_names = infer_outcome_col_name(df=train, allow_multiple=True)
+    possible_lookahead_days = infer_look_distance(
+        col_names=outcome_col_names, allow_multiple=True
+    )
+
+    # Get potential lookbehinds from pred_ columns
+    pred_col_names = infer_predictor_col_names(df=train, allow_multiple=True)
+    possible_lookbehind_days = infer_look_distance(col_names=pred_col_names)
+
+    # Override wandb group name with these
+
+    # Iterate over them
+
+    # Add feature subsetting subsetting to args
+    os.system(f"python src/psycopt2d/train_model.py {BASE_ARGS} ")
diff --git a/src/psycopt2d/evaluate_saved_model_predictions.py b/src/psycopt2d/evaluate_saved_model_predictions.py
index 0505ff86..771c351e 100644
--- a/src/psycopt2d/evaluate_saved_model_predictions.py
+++ b/src/psycopt2d/evaluate_saved_model_predictions.py
@@ -6,7 +6,9 @@
 - Evaluate all models in 'evaluation_results' folder
 - CLI for evaluating a model
 """
+import re
 from pathlib import Path
+from typing import Iterable, Union
 
 import pandas as pd
 from omegaconf.dictconfig import DictConfig
@@ -15,26 +17,57 @@
 from psycopt2d.visualization import plot_auc_by_time_from_first_visit
 
 
-def infer_outcome_col_name(df: pd.DataFrame, prefix: str = "outc_") -> str:
-    """Infer the outcome column name from the dataframe."""
-    outcome_name = [c for c in df.columns if c.startswith(prefix)]
-    if len(outcome_name) == 1:
-        return outcome_name[0]
+def infer_col_names(
+    df: pd.DataFrame, prefix: str, allow_multiple: bool = True
+) -> Union[str, list[str]]:
+    """Infer col names based on prefix"""
+    col_name = [c for c in df.columns if c.startswith(prefix)]
+
+    if len(col_name) == 1:
+        return col_name[0]
+    elif len(col_name) > 1:
+        if allow_multiple:
+            return col_name
+        raise ValueError(
+            f"Multipel columns found and allow_multiple is {allow_multiple}."
+        )
     else:
         raise ValueError("More than one outcome inferred")
 
 
-def infer_predictor_col_names(df: pd.DataFrame, cfg: DictConfig) -> list[str]:
-    """Get the predictors that are used in the model.
+def infer_outcome_col_name(
+    df: pd.DataFrame, prefix: str = "outc_", allow_multiple: bool = True
+) -> Union[str, list[str]]:
+    """Infer the outcome column name from the dataframe."""
+    return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple)
 
-    Args:
-        df (pd.Dataframe): Dataframe with model predictions
-        cfg (DictConfig): Config file
 
-    Returns:
-        list[str]: list of predictors
-    """
-    return [c for c in df.columns if c.startswith(cfg.data.pred_col_name_prefix)]
+def infer_predictor_col_names(
+    df: pd.DataFrame, prefix: str = "outc_", allow_multiple: bool = True
+) -> Union[str, list[str]]:
+    """Get the predictors that are used in the model."""
+    return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple)
+
+
+def infer_look_distance(
+    col_name: Union[Iterable[str], str], regex_pattern=r"within_(\d)_days"
+):
+    """Infer look distances from col names"""
+    # E.g. "outc_within_1_days" = 1
+    # E.g. "outc_within_2_days" = 2
+    # E.g. "pred_within_3_days" = 3
+    # E.g. "pred_within_3_days" = 3
+
+    look_distances = []
+
+    if isinstance(col_name, Iterable):
+        look_distances.append(
+            infer_look_distance(col_name=col_name, regex_pattern=regex_pattern)
+        )
+    else:
+        look_distances = re.findall(regex_pattern, col_name)
+
+    return look_distances
 
 
 def load_model_predictions_and_cfg(path: Path) -> tuple[pd.DataFrame, DictConfig]:

From 1ff62ffd59f82470b33dcc8a4f2308508078a36d Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Thu, 20 Oct 2022 13:04:41 +0200
Subject: [PATCH 02/57] feat: init training script

---
 application/train_and_log_models.py           | 54 ++++++++++---------
 pyproject.toml                                |  1 +
 src/psycopt2d/config/data/synth_data.yaml     | 25 +++++----
 src/psycopt2d/config/default_config.yaml      |  2 +-
 .../evaluate_saved_model_predictions.py       | 51 +++++++++++-------
 src/psycopt2d/train_model.py                  |  5 +-
 6 files changed, 80 insertions(+), 58 deletions(-)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index 1a3e0541..d9a3cd10 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -3,47 +3,43 @@
 
 Usage:
 - Replace the HYDRA_ARGS string with the desired arguments for `train_model.py`
-- Run this script from project root with `python src/psycopt2d/train_and_log_models.py`
--
+- Run this script from project root with `python src/psycopt2d/train_and_log_models.py
 """
 import os
 from pathlib import Path
 
+from random_word import RandomWords
+from wasabi import msg
+
 from psycopt2d.evaluate_saved_model_predictions import (
     infer_look_distance,
     infer_outcome_col_name,
-    infer_predictor_col_names,
+    infer_predictor_col_name,
 )
 from psycopt2d.load import DataLoader, DatasetSpecification, DatasetTimeSpecification
 
-BASE_CONF_FILE_NAME = f"default_config.yaml"
-DATA_DIR = (
-    Path("E:")
-    / "shared_resources"
-    / "feature_sets"
-    / "t2d"
-    / "feature_sets"
-    / "psycop_t2d_adminmanber_201_features_2022_10_05_15_14"
-)
+BASE_CONF_FILE_NAME = "integration_testing.yaml"
 
-BASE_ARGS = f"--multirun +model=xgboost --config-name {BASE_CONF_FILE_NAME}"
+DATA_DIR = Path("/Users/au484925/Desktop/psycop-t2d/tests/test_data/synth_splits/")
+
+BASE_ARGS = "--multirun +model=xgboost"
 WANDB_PROJECT = "psycopt2d-testing"
+N_TRIALS_PER_CELL_IN_GRID = 50
 
 if __name__ == "__main__":
     time_spec = DatasetTimeSpecification(
-        drop_patient_if_outcome_before_date="1979-01-01",
+        drop_patient_if_outcome_before_date=None,
         min_prediction_time_date="1979-01-01",
         min_lookbehind_days=0,
         min_lookahead_days=0,
     )
 
     dataset_spec = DatasetSpecification(
-        file_suffix="parquet",
-        time_spec=time_spec,
+        file_suffix="csv",
+        time=time_spec,
         pred_col_name_prefix="pred_",
         pred_time_colname="timestamp",
         split_dir_path=DATA_DIR,
-        time=time_spec,
     )
 
     loader = DataLoader(dataset_spec)
@@ -51,17 +47,27 @@
 
     # Get potential lookaheads from outc_ columns
     outcome_col_names = infer_outcome_col_name(df=train, allow_multiple=True)
-    possible_lookahead_days = infer_look_distance(
-        col_names=outcome_col_names, allow_multiple=True
+    possible_lookahead_days = set(
+        infer_look_distance(
+            col_name=outcome_col_names,
+        )
     )
 
     # Get potential lookbehinds from pred_ columns
-    pred_col_names = infer_predictor_col_names(df=train, allow_multiple=True)
-    possible_lookbehind_days = infer_look_distance(col_names=pred_col_names)
+    pred_col_names = infer_predictor_col_name(df=train, allow_multiple=True)
+    possible_lookbehind_days = set(infer_look_distance(col_name=pred_col_names))
 
     # Override wandb group name with these
+    # Generate random word-word string
+    r = RandomWords()
+
+    for lookbehind in possible_lookbehind_days:
+        for lookahead in possible_lookahead_days:
+            wandb_group = f"{r.get_random_word()}-{r.get_random_word()}-beh-{lookbehind}-ahead-{lookahead}"
+
+            command = f"python src/psycopt2d/train_model.py {BASE_ARGS} data.min_lookbehind_days={lookbehind} data.min_lookahead_days={lookahead} +project.wandb_group={wandb_group} hydra.sweeper.n_trials={N_TRIALS_PER_CELL_IN_GRID} --config-name {BASE_CONF_FILE_NAME}"
 
-    # Iterate over them
+            msg.info("Sending command")
+            msg.info(command)
 
-    # Add feature subsetting subsetting to args
-    os.system(f"python src/psycopt2d/train_model.py {BASE_ARGS} ")
+            os.system(command)
diff --git a/pyproject.toml b/pyproject.toml
index c8927054..c2eecb63 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,6 +25,7 @@ selenium = ">=4.2.0,<4.6.0"
 # See https://github.com/Aarhus-Psychiatry-Research/psycop-t2d/pull/194 for thoughts on root cause
 seaborn = ">=0.12.0, <0.12.1"
 pyarrow = ">=9.0.0, <9.1.0"
+Random-Word = "^1.0.11"
 
 
 [tool.poetry.dev-dependencies]
diff --git a/src/psycopt2d/config/data/synth_data.yaml b/src/psycopt2d/config/data/synth_data.yaml
index 4ce1f9a3..04443461 100644
--- a/src/psycopt2d/config/data/synth_data.yaml
+++ b/src/psycopt2d/config/data/synth_data.yaml
@@ -1,11 +1,14 @@
-n_training_samples: null
-min_lookahead_days: null
-min_prediction_time_date: null
-lookahead_days: 30
-pred_col_name_prefix: "pred_"
-pred_timestamp_col_name: timestamp
-outcome_timestamp_col_name: timestamp_outcome
-id_col_name: citizen_ids
-source: synthetic
-min_lookbehind_days: null
-drop_patient_if_outcome_before_date: null
\ No newline at end of file
+# @package _global_
+
+data:
+  n_training_samples: null
+  min_lookahead_days: null
+  min_prediction_time_date: null
+  lookahead_days: 30
+  pred_col_name_prefix: "pred_"
+  pred_timestamp_col_name: timestamp
+  outcome_timestamp_col_name: timestamp_outcome
+  id_col_name: citizen_ids
+  source: synthetic
+  min_lookbehind_days: null
+  drop_patient_if_outcome_before_date: null
diff --git a/src/psycopt2d/config/default_config.yaml b/src/psycopt2d/config/default_config.yaml
index 5e318e27..2d081f25 100644
--- a/src/psycopt2d/config/default_config.yaml
+++ b/src/psycopt2d/config/default_config.yaml
@@ -5,4 +5,4 @@ defaults:
   - preprocessing: default_preprocessing
   - training: default_training
   - evaluation: default_evaluation
-  - sweeper: optuna_singlethread
\ No newline at end of file
+  - sweeper: optuna_multithread
diff --git a/src/psycopt2d/evaluate_saved_model_predictions.py b/src/psycopt2d/evaluate_saved_model_predictions.py
index 771c351e..30b0e84b 100644
--- a/src/psycopt2d/evaluate_saved_model_predictions.py
+++ b/src/psycopt2d/evaluate_saved_model_predictions.py
@@ -7,8 +7,9 @@
 - CLI for evaluating a model
 """
 import re
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Union
+from typing import Union
 
 import pandas as pd
 from omegaconf.dictconfig import DictConfig
@@ -18,9 +19,11 @@
 
 
 def infer_col_names(
-    df: pd.DataFrame, prefix: str, allow_multiple: bool = True
+    df: pd.DataFrame,
+    prefix: str,
+    allow_multiple: bool = True,
 ) -> Union[str, list[str]]:
-    """Infer col names based on prefix"""
+    """Infer col names based on prefix."""
     col_name = [c for c in df.columns if c.startswith(prefix)]
 
     if len(col_name) == 1:
@@ -29,43 +32,55 @@ def infer_col_names(
         if allow_multiple:
             return col_name
         raise ValueError(
-            f"Multipel columns found and allow_multiple is {allow_multiple}."
+            f"Multipel columns found and allow_multiple is {allow_multiple}.",
         )
     else:
         raise ValueError("More than one outcome inferred")
 
 
 def infer_outcome_col_name(
-    df: pd.DataFrame, prefix: str = "outc_", allow_multiple: bool = True
+    df: pd.DataFrame,
+    prefix: str = "outc_",
+    allow_multiple: bool = True,
 ) -> Union[str, list[str]]:
     """Infer the outcome column name from the dataframe."""
     return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple)
 
 
-def infer_predictor_col_names(
-    df: pd.DataFrame, prefix: str = "outc_", allow_multiple: bool = True
+def infer_predictor_col_name(
+    df: pd.DataFrame,
+    prefix: str = "pred_",
+    allow_multiple: bool = True,
 ) -> Union[str, list[str]]:
     """Get the predictors that are used in the model."""
     return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple)
 
 
 def infer_look_distance(
-    col_name: Union[Iterable[str], str], regex_pattern=r"within_(\d)_days"
-):
-    """Infer look distances from col names"""
+    col_name: Union[Iterable[str], str],
+    regex_pattern: str = r"within_(\d+)_days",
+    allow_multiple: bool = True,
+) -> list[Union[int, float]]:
+    """Infer look distances from col names."""
     # E.g. "outc_within_1_days" = 1
     # E.g. "outc_within_2_days" = 2
     # E.g. "pred_within_3_days" = 3
     # E.g. "pred_within_3_days" = 3
 
-    look_distances = []
+    look_distances: list[Union[int, float]] = []
 
-    if isinstance(col_name, Iterable):
-        look_distances.append(
-            infer_look_distance(col_name=col_name, regex_pattern=regex_pattern)
-        )
+    if isinstance(col_name, Iterable) and not isinstance(col_name, str):
+        for c_name in col_name:
+            look_distances += infer_look_distance(
+                col_name=c_name, regex_pattern=regex_pattern
+            )
     else:
-        look_distances = re.findall(regex_pattern, col_name)
+        look_distances = re.findall(pattern=regex_pattern, string=col_name)
+
+    if len(look_distances) > 1 and not allow_multiple:
+        raise ValueError(
+            f"Multiple col names provided and allow_multiple is {allow_multiple}.",
+        )
 
     return look_distances
 
@@ -89,8 +104,8 @@ def load_model_predictions_and_cfg(path: Path) -> tuple[pd.DataFrame, DictConfig
         / "eval_model_name-xgboost_require_imputation-True_args-n_estimators-100_tree_method-auto_2022_09_22_10_52.pkl",
     )
 
-    train_col_names = infer_predictor_col_names(eval_df, cfg)
-    y_col_name = infer_outcome_col_name(eval_df)
+    train_col_names = infer_predictor_col_name(df=eval_df)
+    y_col_name = infer_outcome_col_name(df=eval_df)
 
     Y_HAT_PROB_COL_NAME = "y_hat_prob"  # change to 'y_hat_prob_oof' if using cv
 
diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py
index ad635c98..1d865797 100644
--- a/src/psycopt2d/train_model.py
+++ b/src/psycopt2d/train_model.py
@@ -304,15 +304,12 @@ def main(cfg):
 
     create_wandb_folders()
 
-    # Get today's date as str
-    today_str = datetime.now().strftime("%Y-%m-%d")
-
     run = wandb.init(
         project=cfg.project.name,
         reinit=True,
         config=flatten_nested_dict(cfg, sep="."),
         mode=cfg.project.wandb_mode,
-        group=today_str,
+        group=cfg.project.wandb_group,
     )
 
     dataset = load_train_and_val_from_cfg(cfg)

From 8e0506a8af7ec7f057a34fba7bd097cb6906c754 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Thu, 20 Oct 2022 13:04:53 +0200
Subject: [PATCH 03/57] style: linting

---
 application/train_and_log_models.py               | 2 +-
 src/psycopt2d/evaluate_saved_model_predictions.py | 3 ++-
 src/psycopt2d/train_model.py                      | 1 -
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index d9a3cd10..84758f89 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -50,7 +50,7 @@
     possible_lookahead_days = set(
         infer_look_distance(
             col_name=outcome_col_names,
-        )
+        ),
     )
 
     # Get potential lookbehinds from pred_ columns
diff --git a/src/psycopt2d/evaluate_saved_model_predictions.py b/src/psycopt2d/evaluate_saved_model_predictions.py
index 30b0e84b..37526f13 100644
--- a/src/psycopt2d/evaluate_saved_model_predictions.py
+++ b/src/psycopt2d/evaluate_saved_model_predictions.py
@@ -72,7 +72,8 @@ def infer_look_distance(
     if isinstance(col_name, Iterable) and not isinstance(col_name, str):
         for c_name in col_name:
             look_distances += infer_look_distance(
-                col_name=c_name, regex_pattern=regex_pattern
+                col_name=c_name,
+                regex_pattern=regex_pattern,
             )
     else:
         look_distances = re.findall(pattern=regex_pattern, string=col_name)
diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py
index 1d865797..a642e5df 100644
--- a/src/psycopt2d/train_model.py
+++ b/src/psycopt2d/train_model.py
@@ -1,7 +1,6 @@
 """Training script for training a single model for predicting t2d."""
 import os
 from collections.abc import Iterable
-from datetime import datetime
 from pathlib import Path
 from typing import Optional
 

From 3f8ccbb02de841bcefe36cbe861e7924c35ff492 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Thu, 20 Oct 2022 13:06:51 +0200
Subject: [PATCH 04/57] fix: add wandb_group to project struct

---
 src/psycopt2d/config/project/integration_test_project.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/psycopt2d/config/project/integration_test_project.yaml b/src/psycopt2d/config/project/integration_test_project.yaml
index b3d29e04..39402f44 100644
--- a/src/psycopt2d/config/project/integration_test_project.yaml
+++ b/src/psycopt2d/config/project/integration_test_project.yaml
@@ -1,3 +1,4 @@
 name: psycop-t2d-integration-testing
 seed: 42
-wandb_mode: "disabled" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
\ No newline at end of file
+wandb_mode: "disabled" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
+wandb_group: "integration_testing"

From ca99441ff5a0b14f612c7f86e23d76396dc5b6f8 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Thu, 20 Oct 2022 13:14:11 +0200
Subject: [PATCH 05/57] style: lint

---
 src/psycopt2d/utils.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/psycopt2d/utils.py b/src/psycopt2d/utils.py
index 73576489..cee19ff5 100644
--- a/src/psycopt2d/utils.py
+++ b/src/psycopt2d/utils.py
@@ -392,6 +392,7 @@ def load_evaluation_data(model_data_dir: Path) -> ModelEvalData:
         feature_importance_dict=feature_importance_dict,
     )
 
+
 def infer_col_names(
     df: pd.DataFrame,
     prefix: str,
@@ -429,6 +430,11 @@ def infer_predictor_col_name(
     """Get the predictors that are used in the model."""
     return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple)
 
-def infer_y_hat_prob_col_name(df: pd.DataFrame, prefix="y_hat_prob", allow_multiple: False) -> str:
+
+def infer_y_hat_prob_col_name(
+    df: pd.DataFrame,
+    prefix="y_hat_prob",
+    allow_multiple: bool = False,
+) -> str:
     """Infer the y_hat_prob column name from the dataframe."""
-    return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple)
\ No newline at end of file
+    return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple)

From d8bd92df0f0303ab5fa9afa40bb28148fc191f37 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Thu, 20 Oct 2022 13:14:37 +0200
Subject: [PATCH 06/57] docs: typo

---
 src/psycopt2d/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/psycopt2d/utils.py b/src/psycopt2d/utils.py
index cee19ff5..eac545e0 100644
--- a/src/psycopt2d/utils.py
+++ b/src/psycopt2d/utils.py
@@ -407,7 +407,7 @@ def infer_col_names(
         if allow_multiple:
             return col_name
         raise ValueError(
-            f"Multipel columns found and allow_multiple is {allow_multiple}.",
+            f"Multiple columns found and allow_multiple is {allow_multiple}.",
         )
     else:
         raise ValueError("More than one outcome inferred")

From bb63f531cd1405160cc8137bebf8b42428a336f5 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Thu, 20 Oct 2022 13:28:24 +0200
Subject: [PATCH 07/57] feat: add watcher

---
 application/train_and_log_models.py | 146 +++++++++++++++++++++++-----
 1 file changed, 122 insertions(+), 24 deletions(-)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index 84758f89..79d3282d 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -6,10 +6,14 @@
 - Run this script from project root with `python src/psycopt2d/train_and_log_models.py
 """
 import os
+import subprocess
+import time
 from pathlib import Path
+from typing import Iterable, Union
 
-from random_word import RandomWords
-from wasabi import msg
+from hydra import compose, initialize
+from pydantic import BaseModel
+from wasabi import Printer, msg
 
 from psycopt2d.evaluate_saved_model_predictions import (
     infer_look_distance,
@@ -26,27 +30,43 @@
 WANDB_PROJECT = "psycopt2d-testing"
 N_TRIALS_PER_CELL_IN_GRID = 50
 
-if __name__ == "__main__":
-    time_spec = DatasetTimeSpecification(
-        drop_patient_if_outcome_before_date=None,
-        min_prediction_time_date="1979-01-01",
-        min_lookbehind_days=0,
-        min_lookahead_days=0,
-    )
+# RUN CONSTANTS
+CONFIG_NAME = "integration_testing.yaml"
 
-    dataset_spec = DatasetSpecification(
-        file_suffix="csv",
-        time=time_spec,
-        pred_col_name_prefix="pred_",
-        pred_time_colname="timestamp",
-        split_dir_path=DATA_DIR,
-    )
+HYDRA_ARGS = f"--multirun +model=xgboost project.wandb_mode='dryrun' model.args.tree_method='auto' --config-name {CONFIG_NAME}"
+OVERTACI = "false"  # Change to "true" if running on overtaci
 
+# WATCHER CONSTANTS
+WANDB_ENTITY = (
+    "psycop"  # The wandb entity to upload to (e.g. "psycop" or your user name)
+)
+N_RUNS_BEFORE_FIRST_EVAL = (
+    "1"  # The number of runs to upload to wandb before evaluating the best runs.
+)
+KEEP_WATCHER_ALIVE_AFTER_TRAINING_FINISHED_MINUTES = (
+    5  # minutes to wait for the wandb watcher after training
+)
+# has finished. Will kill the watcher after this time.
+ARCHIVE_ALL_WANDB_RUNS = "false"  # whether to archive all runs in the wandb folder
+# before starting model training. Change to "t" to archive all wandb runs
+
+
+def load_data(dataset_spec):
+    """Load the data"""
     loader = DataLoader(dataset_spec)
-    train = loader.load_dataset_from_dir(split_names="train")
+    return loader.load_dataset_from_dir(split_names="train")
+
 
+class PossibleLookDistanceDays(BaseModel):
+    ahead: Iterable[Union[int, float]]
+    behind: Iterable[Union[int, float]]
+
+
+def infer_possible_look_directions(train):
+    """Infer the possible values for min_lookahead_days and min_lookbehind_days"""
     # Get potential lookaheads from outc_ columns
     outcome_col_names = infer_outcome_col_name(df=train, allow_multiple=True)
+
     possible_lookahead_days = set(
         infer_look_distance(
             col_name=outcome_col_names,
@@ -57,17 +77,95 @@
     pred_col_names = infer_predictor_col_name(df=train, allow_multiple=True)
     possible_lookbehind_days = set(infer_look_distance(col_name=pred_col_names))
 
-    # Override wandb group name with these
-    # Generate random word-word string
-    r = RandomWords()
+    return PossibleLookDistanceDays(
+        ahead=possible_lookahead_days, behind=possible_lookbehind_days
+    )
+
+
+def get_dataset_spec(data_dir_path: Path):
+    time_spec = DatasetTimeSpecification(
+        drop_patient_if_outcome_before_date=None,
+        min_prediction_time_date="1979-01-01",
+        min_lookbehind_days=0,
+        min_lookahead_days=0,
+    )
+
+    return DatasetSpecification(
+        file_suffix="csv",
+        time=time_spec,
+        pred_col_name_prefix="pred_",
+        pred_time_colname="timestamp",
+        split_dir_path=data_dir_path,
+    )
 
-    for lookbehind in possible_lookbehind_days:
-        for lookahead in possible_lookahead_days:
-            wandb_group = f"{r.get_random_word()}-{r.get_random_word()}-beh-{lookbehind}-ahead-{lookahead}"
 
-            command = f"python src/psycopt2d/train_model.py {BASE_ARGS} data.min_lookbehind_days={lookbehind} data.min_lookahead_days={lookahead} +project.wandb_group={wandb_group} hydra.sweeper.n_trials={N_TRIALS_PER_CELL_IN_GRID} --config-name {BASE_CONF_FILE_NAME}"
+def train_models_for_each_grid(
+    base_conf_file_name: Union[str, Path],
+    base_args: str,
+    n_trials_per_cell_in_grid: int,
+    possible_look_distances: PossibleLookDistanceDays,
+):
+    """Train a model for each cell in the grid of possible look distances"""
+    from random_word import RandomWords
+
+    random_word = RandomWords()
+
+    for lookbehind in possible_look_distances.behind:
+        for lookahead in possible_look_distances.ahead:
+            wandb_group = f"{random_word.get_random_word()}-{random_word.get_random_word()}-beh-{lookbehind}-ahead-{lookahead}"
+
+            command = f"python src/psycopt2d/train_model.py {base_args} data.min_lookbehind_days={lookbehind} data.min_lookahead_days={lookahead} +project.wandb_group={wandb_group} hydra.sweeper.n_trials={n_trials_per_cell_in_grid} --config-name {base_conf_file_name}"
 
             msg.info("Sending command")
             msg.info(command)
 
             os.system(command)
+
+
+if __name__ == "__main__":
+    msg = Printer(timestamp=True)
+
+    with initialize(version_base=None, config_path="config/"):
+        cfg = compose(
+            config_name=CONFIG_NAME,
+        )
+
+    dataset_spec = get_dataset_spec(data_dir_path=DATA_DIR)
+
+    train = load_data(dataset_spec=dataset_spec)
+
+    possible_look_distance = infer_possible_look_directions(train)
+
+    watcher = subprocess.Popen(  # pylint: disable=consider-using-with
+        [
+            "python",
+            "src/psycopt2d/model_training_watcher.py",
+            "--entity",
+            WANDB_ENTITY,
+            "--project_name",
+            cfg.project.name,
+            "--n_runs_before_eval",
+            N_RUNS_BEFORE_FIRST_EVAL,
+            "--overtaci",
+            OVERTACI,
+            "--timeout",
+            "None",
+            "--clean_wandb_dir",
+            ARCHIVE_ALL_WANDB_RUNS,
+        ],
+    )
+
+    train_models_for_each_grid(
+        base_conf_file_name=BASE_CONF_FILE_NAME,
+        base_args=BASE_ARGS,
+        n_trials_per_cell_in_grid=N_TRIALS_PER_CELL_IN_GRID,
+        possible_look_distances=possible_look_distance,
+    )
+
+    msg.good(
+        f"Training finished. Stopping the watcher in {KEEP_WATCHER_ALIVE_AFTER_TRAINING_FINISHED_MINUTES} minutes...",
+    )
+
+    time.sleep(60 * KEEP_WATCHER_ALIVE_AFTER_TRAINING_FINISHED_MINUTES)
+    watcher.kill()
+    msg.good("Watcher stopped.")

From 012bec76c9f06c13a857e6ef0f6cb0c0b22243f6 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Thu, 20 Oct 2022 13:47:09 +0200
Subject: [PATCH 08/57] refactor: use structs

---
 application/train_and_log_models.py | 154 ++++++++++++++++++----------
 1 file changed, 101 insertions(+), 53 deletions(-)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index 79d3282d..64e94e62 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -8,11 +8,11 @@
 import os
 import subprocess
 import time
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Union
+from typing import Union
 
-from hydra import compose, initialize
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 from wasabi import Printer, msg
 
 from psycopt2d.evaluate_saved_model_predictions import (
@@ -22,48 +22,79 @@
 )
 from psycopt2d.load import DataLoader, DatasetSpecification, DatasetTimeSpecification
 
-BASE_CONF_FILE_NAME = "integration_testing.yaml"
 
-DATA_DIR = Path("/Users/au484925/Desktop/psycop-t2d/tests/test_data/synth_splits/")
+class PossibleLookDistanceDays(BaseModel):
+    """Possible look distances"""
 
-BASE_ARGS = "--multirun +model=xgboost"
-WANDB_PROJECT = "psycopt2d-testing"
-N_TRIALS_PER_CELL_IN_GRID = 50
+    ahead: Iterable[Union[int, float]]
+    behind: Iterable[Union[int, float]]
 
-# RUN CONSTANTS
-CONFIG_NAME = "integration_testing.yaml"
 
-HYDRA_ARGS = f"--multirun +model=xgboost project.wandb_mode='dryrun' model.args.tree_method='auto' --config-name {CONFIG_NAME}"
-OVERTACI = "false"  # Change to "true" if running on overtaci
+class MetaConf(BaseModel):
+    """Meta configuration for the script."""
 
-# WATCHER CONSTANTS
-WANDB_ENTITY = (
-    "psycop"  # The wandb entity to upload to (e.g. "psycop" or your user name)
-)
-N_RUNS_BEFORE_FIRST_EVAL = (
-    "1"  # The number of runs to upload to wandb before evaluating the best runs.
-)
-KEEP_WATCHER_ALIVE_AFTER_TRAINING_FINISHED_MINUTES = (
-    5  # minutes to wait for the wandb watcher after training
-)
-# has finished. Will kill the watcher after this time.
-ARCHIVE_ALL_WANDB_RUNS = "false"  # whether to archive all runs in the wandb folder
-# before starting model training. Change to "t" to archive all wandb runs
+    conf_name: str = Field("integration_testing.yaml")
+    data_dir: Path = Path(
+        "/Users/au484925/Desktop/psycop-t2d/tests/test_data/synth_splits/"
+    )
+    overtaci: str = Field(
+        default="false", description="Change to 'true' if running on overtaci"
+    )
+
+
+class WatcherConf(BaseModel):
+    """Confiugration for the watcher."""
+
+    archive_all: str = Field(
+        default="false",
+        description="Whether to archive all runs in the wandb folder before starting model training. Change to 't' to archive all wandb runs",
+    )
+    n_runs_before_first_eval: int = Field(
+        default="1",
+        description="The number of runs to upload to wandb before evaluating the best runs.",
+    )
+    keep_alive_after_training_minutes: int = Field(
+        default=5,
+        description="minutes to wait for the wandb watcher after training has finished. Will kill the watcher after this time.",
+    )
+
+
+class WandbConf(BaseModel):
+    """Configuration for wandb."""
+
+    project_name: str = "psycopt2d-testing"
+    entity: str = Field(
+        default="psycop",
+        description="The wandb entity to upload to (e.g. 'psycop' or your user name)",
+    )
+
+
+class TrainConf(BaseModel):
+    """Configuration for model training."""
+
+    n_trials_per_cell_in_grid: int = Field(
+        default=50,
+        description="Number of trials per cell in the lookahead/lookbehind grid",
+    )
+
+    conf_name: str = Field(default="integration_testing.yaml")
+
+    base_args: str = Field(
+        default=f"--multirun +model=xgboost project.wandb_mode='dryrun' model.args.tree_method='auto' --config-name {conf_name}"
+    )
+
+    possible_look_distance: PossibleLookDistanceDays
 
 
 def load_data(dataset_spec):
-    """Load the data"""
+    """Load the data."""
     loader = DataLoader(dataset_spec)
     return loader.load_dataset_from_dir(split_names="train")
 
 
-class PossibleLookDistanceDays(BaseModel):
-    ahead: Iterable[Union[int, float]]
-    behind: Iterable[Union[int, float]]
-
-
 def infer_possible_look_directions(train):
-    """Infer the possible values for min_lookahead_days and min_lookbehind_days"""
+    """Infer the possible values for min_lookahead_days and
+    min_lookbehind_days."""
     # Get potential lookaheads from outc_ columns
     outcome_col_names = infer_outcome_col_name(df=train, allow_multiple=True)
 
@@ -78,11 +109,13 @@ def infer_possible_look_directions(train):
     possible_lookbehind_days = set(infer_look_distance(col_name=pred_col_names))
 
     return PossibleLookDistanceDays(
-        ahead=possible_lookahead_days, behind=possible_lookbehind_days
+        ahead=possible_lookahead_days,
+        behind=possible_lookbehind_days,
     )
 
 
 def get_dataset_spec(data_dir_path: Path):
+    """Get dataset specification"""
     time_spec = DatasetTimeSpecification(
         drop_patient_if_outcome_before_date=None,
         min_prediction_time_date="1979-01-01",
@@ -99,13 +132,13 @@ def get_dataset_spec(data_dir_path: Path):
     )
 
 
-def train_models_for_each_grid(
+def train_models_for_each_cell_in_grid(
     base_conf_file_name: Union[str, Path],
     base_args: str,
     n_trials_per_cell_in_grid: int,
     possible_look_distances: PossibleLookDistanceDays,
 ):
-    """Train a model for each cell in the grid of possible look distances"""
+    """Train a model for each cell in the grid of possible look distances."""
     from random_word import RandomWords
 
     random_word = RandomWords()
@@ -125,47 +158,62 @@ def train_models_for_each_grid(
 if __name__ == "__main__":
     msg = Printer(timestamp=True)
 
-    with initialize(version_base=None, config_path="config/"):
-        cfg = compose(
-            config_name=CONFIG_NAME,
-        )
+    meta_conf = MetaConf(
+        conf_name="integration_testing.yaml",
+        overtaci="false",
+        data_dir=Path(
+            "/Users/au484925/Desktop/psycop-t2d/tests/test_data/synth_splits/"
+        ),
+    )
 
-    dataset_spec = get_dataset_spec(data_dir_path=DATA_DIR)
+    wandb_conf = WandbConf(
+        entity="psycop",
+        project_name="psycopt2d-testing",
+    )
 
-    train = load_data(dataset_spec=dataset_spec)
+    watcher_conf = WatcherConf(archive_all="false", keep_alive_after_training_minutes=5)
 
+    dataset_spec = get_dataset_spec(data_dir_path=meta_conf.data_dir)
+
+    train = load_data(dataset_spec=dataset_spec)
     possible_look_distance = infer_possible_look_directions(train)
 
+    train_conf = TrainConf(
+        conf_name=meta_conf.conf_name,
+        base_args=f"--multirun +model=xgboost project.wandb_mode='dryrun' model.args.tree_method='auto' --config-name {meta_conf.conf_name}",
+        n_trials_per_cell_in_grid=50,
+    )
+
     watcher = subprocess.Popen(  # pylint: disable=consider-using-with
         [
             "python",
             "src/psycopt2d/model_training_watcher.py",
             "--entity",
-            WANDB_ENTITY,
+            wandb_conf.entity,
             "--project_name",
-            cfg.project.name,
+            wandb_conf.project_name,
             "--n_runs_before_eval",
-            N_RUNS_BEFORE_FIRST_EVAL,
+            str(watcher_conf.n_runs_before_first_eval),
             "--overtaci",
-            OVERTACI,
+            meta_conf.overtaci,
             "--timeout",
             "None",
             "--clean_wandb_dir",
-            ARCHIVE_ALL_WANDB_RUNS,
+            watcher_conf.archive_all,
         ],
     )
 
-    train_models_for_each_grid(
-        base_conf_file_name=BASE_CONF_FILE_NAME,
-        base_args=BASE_ARGS,
-        n_trials_per_cell_in_grid=N_TRIALS_PER_CELL_IN_GRID,
-        possible_look_distances=possible_look_distance,
+    train_models_for_each_cell_in_grid(
+        base_conf_file_name=train_conf.conf_name,
+        base_args=train_conf.base_args,
+        n_trials_per_cell_in_grid=train_conf.n_trials_per_cell_in_grid,
+        possible_look_distances=train_conf.possible_look_distance,
     )
 
     msg.good(
-        f"Training finished. Stopping the watcher in {KEEP_WATCHER_ALIVE_AFTER_TRAINING_FINISHED_MINUTES} minutes...",
+        f"Training finished. Stopping the watcher in {watcher_conf.keep_alive_after_training_minutes} minutes...",
     )
 
-    time.sleep(60 * KEEP_WATCHER_ALIVE_AFTER_TRAINING_FINISHED_MINUTES)
+    time.sleep(60 * watcher_conf.keep_alive_after_training_minutes)
     watcher.kill()
     msg.good("Watcher stopped.")

From b60f66141d45d1293069457a89e834942cc1365b Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Thu, 20 Oct 2022 13:47:25 +0200
Subject: [PATCH 09/57] style: lint

---
 application/train_and_log_models.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index 64e94e62..01855323 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -13,7 +13,7 @@
 from typing import Union
 
 from pydantic import BaseModel, Field
-from wasabi import Printer, msg
+from wasabi import Printer
 
 from psycopt2d.evaluate_saved_model_predictions import (
     infer_look_distance,
@@ -24,7 +24,7 @@
 
 
 class PossibleLookDistanceDays(BaseModel):
-    """Possible look distances"""
+    """Possible look distances."""
 
     ahead: Iterable[Union[int, float]]
     behind: Iterable[Union[int, float]]
@@ -35,10 +35,11 @@ class MetaConf(BaseModel):
 
     conf_name: str = Field("integration_testing.yaml")
     data_dir: Path = Path(
-        "/Users/au484925/Desktop/psycop-t2d/tests/test_data/synth_splits/"
+        "/Users/au484925/Desktop/psycop-t2d/tests/test_data/synth_splits/",
     )
     overtaci: str = Field(
-        default="false", description="Change to 'true' if running on overtaci"
+        default="false",
+        description="Change to 'true' if running on overtaci",
     )
 
 
@@ -80,7 +81,7 @@ class TrainConf(BaseModel):
     conf_name: str = Field(default="integration_testing.yaml")
 
     base_args: str = Field(
-        default=f"--multirun +model=xgboost project.wandb_mode='dryrun' model.args.tree_method='auto' --config-name {conf_name}"
+        default=f"--multirun +model=xgboost project.wandb_mode='dryrun' model.args.tree_method='auto' --config-name {conf_name}",
     )
 
     possible_look_distance: PossibleLookDistanceDays
@@ -115,7 +116,7 @@ def infer_possible_look_directions(train):
 
 
 def get_dataset_spec(data_dir_path: Path):
-    """Get dataset specification"""
+    """Get dataset specification."""
     time_spec = DatasetTimeSpecification(
         drop_patient_if_outcome_before_date=None,
         min_prediction_time_date="1979-01-01",
@@ -162,7 +163,7 @@ def train_models_for_each_cell_in_grid(
         conf_name="integration_testing.yaml",
         overtaci="false",
         data_dir=Path(
-            "/Users/au484925/Desktop/psycop-t2d/tests/test_data/synth_splits/"
+            "/Users/au484925/Desktop/psycop-t2d/tests/test_data/synth_splits/",
         ),
     )
 

From 0d460e84f7d3cee6e40d90c9f6b8d96542099540 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Thu, 20 Oct 2022 15:48:20 +0200
Subject: [PATCH 10/57] refactor: use objects

---
 application/train_and_log_models.py | 136 ++++++++++++++++------------
 1 file changed, 79 insertions(+), 57 deletions(-)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index 01855323..94a7e8ec 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -5,12 +5,9 @@
 - Replace the HYDRA_ARGS string with the desired arguments for `train_model.py`
 - Run this script from project root with `python src/psycopt2d/train_and_log_models.py
 """
-import os
 import subprocess
 import time
-from collections.abc import Iterable
 from pathlib import Path
-from typing import Union
 
 from pydantic import BaseModel, Field
 from wasabi import Printer
@@ -26,8 +23,8 @@
 class PossibleLookDistanceDays(BaseModel):
     """Possible look distances."""
 
-    ahead: Iterable[Union[int, float]]
-    behind: Iterable[Union[int, float]]
+    ahead: list[str]
+    behind: list[str]
 
 
 class MetaConf(BaseModel):
@@ -44,7 +41,7 @@ class MetaConf(BaseModel):
 
 
 class WatcherConf(BaseModel):
-    """Confiugration for the watcher."""
+    """Configuration for the watcher."""
 
     archive_all: str = Field(
         default="false",
@@ -68,23 +65,31 @@ class WandbConf(BaseModel):
         default="psycop",
         description="The wandb entity to upload to (e.g. 'psycop' or your user name)",
     )
+    mode: str = Field(default="online", description="The wandb mode to use")
 
 
 class TrainConf(BaseModel):
     """Configuration for model training."""
 
+    gpu: bool = Field(default=False, description="Whether to use GPU")
+
     n_trials_per_cell_in_grid: int = Field(
         default=50,
         description="Number of trials per cell in the lookahead/lookbehind grid",
     )
 
+    model_conf: str = Field(
+        default="xgboost",
+        description="The model conf to open. For example, 'xgboost' or 'logistic_regression'.",
+    )
+
     conf_name: str = Field(default="integration_testing.yaml")
 
-    base_args: str = Field(
-        default=f"--multirun +model=xgboost project.wandb_mode='dryrun' model.args.tree_method='auto' --config-name {conf_name}",
+    multirun: bool = Field(
+        default=False, description="Whether to use Hydra to run multiple models."
     )
 
-    possible_look_distance: PossibleLookDistanceDays
+    possible_look_distances: PossibleLookDistanceDays
 
 
 def load_data(dataset_spec):
@@ -99,15 +104,11 @@ def infer_possible_look_directions(train):
     # Get potential lookaheads from outc_ columns
     outcome_col_names = infer_outcome_col_name(df=train, allow_multiple=True)
 
-    possible_lookahead_days = set(
-        infer_look_distance(
-            col_name=outcome_col_names,
-        ),
-    )
+    possible_lookahead_days = infer_look_distance(col_name=outcome_col_names)
 
     # Get potential lookbehinds from pred_ columns
     pred_col_names = infer_predictor_col_name(df=train, allow_multiple=True)
-    possible_lookbehind_days = set(infer_look_distance(col_name=pred_col_names))
+    possible_lookbehind_days = list(set(infer_look_distance(col_name=pred_col_names)))
 
     return PossibleLookDistanceDays(
         ahead=possible_lookahead_days,
@@ -134,26 +135,44 @@ def get_dataset_spec(data_dir_path: Path):
 
 
 def train_models_for_each_cell_in_grid(
-    base_conf_file_name: Union[str, Path],
-    base_args: str,
-    n_trials_per_cell_in_grid: int,
-    possible_look_distances: PossibleLookDistanceDays,
+    train_conf: TrainConf,
 ):
     """Train a model for each cell in the grid of possible look distances."""
     from random_word import RandomWords
 
     random_word = RandomWords()
 
-    for lookbehind in possible_look_distances.behind:
-        for lookahead in possible_look_distances.ahead:
+    for lookbehind in train_conf.possible_look_distances.behind:
+        for lookahead in train_conf.possible_look_distances.ahead:
             wandb_group = f"{random_word.get_random_word()}-{random_word.get_random_word()}-beh-{lookbehind}-ahead-{lookahead}"
 
-            command = f"python src/psycopt2d/train_model.py {base_args} data.min_lookbehind_days={lookbehind} data.min_lookahead_days={lookahead} +project.wandb_group={wandb_group} hydra.sweeper.n_trials={n_trials_per_cell_in_grid} --config-name {base_conf_file_name}"
+            subprocess_args: list[str] = [
+                "python",
+                "src/psycopt2d/train_model.py",
+                f"+model={train_conf.model_conf}",
+                f"data.min_lookbehind_days={lookbehind}",
+                f"data.min_lookahead_days={lookahead}",
+                f"project.wandb_group='{wandb_group}'",
+                f"hydra.sweeper.n_trials={train_conf.n_trials_per_cell_in_grid}",
+                "--config-name",
+                f"{train_conf.conf_name}",
+            ]
+
+            if train_conf.multirun:
+                subprocess_args.insert(2, "--multirun")
 
-            msg.info("Sending command")
-            msg.info(command)
+            if train_conf.model_conf == "xgboost" and not train_conf.gpu:
+                subprocess_args.insert(3, "++model.args.tree_method='auto'")
 
-            os.system(command)
+            msg.info("Starting trainer with command")
+            msg.info(f'{" ".join(subprocess_args)}')
+
+            trainer = subprocess.Popen(  # pylint: disable=consider-using-with
+                args=subprocess_args,
+            )
+
+            while trainer.poll() is None:
+                time.sleep(1)
 
 
 if __name__ == "__main__":
@@ -170,51 +189,54 @@ def train_models_for_each_cell_in_grid(
     wandb_conf = WandbConf(
         entity="psycop",
         project_name="psycopt2d-testing",
+        mode="offline",
     )
 
-    watcher_conf = WatcherConf(archive_all="false", keep_alive_after_training_minutes=5)
-
+    watcher_conf = WatcherConf(archive_all="true", keep_alive_after_training_minutes=5)
     dataset_spec = get_dataset_spec(data_dir_path=meta_conf.data_dir)
-
     train = load_data(dataset_spec=dataset_spec)
-    possible_look_distance = infer_possible_look_directions(train)
+
+    possible_look_distances = infer_possible_look_directions(train)
 
     train_conf = TrainConf(
         conf_name=meta_conf.conf_name,
-        base_args=f"--multirun +model=xgboost project.wandb_mode='dryrun' model.args.tree_method='auto' --config-name {meta_conf.conf_name}",
-        n_trials_per_cell_in_grid=50,
-    )
-
-    watcher = subprocess.Popen(  # pylint: disable=consider-using-with
-        [
-            "python",
-            "src/psycopt2d/model_training_watcher.py",
-            "--entity",
-            wandb_conf.entity,
-            "--project_name",
-            wandb_conf.project_name,
-            "--n_runs_before_eval",
-            str(watcher_conf.n_runs_before_first_eval),
-            "--overtaci",
-            meta_conf.overtaci,
-            "--timeout",
-            "None",
-            "--clean_wandb_dir",
-            watcher_conf.archive_all,
-        ],
-    )
+        multirun=False,
+        model_conf="xgboost",
+        n_trials_per_cell_in_grid=1,
+        possible_look_distances=possible_look_distances,
+        gpu=False,
+    )
+
+    if not train_conf.gpu:
+        msg.warn("Not using GPU for training")
+
+    # watcher = subprocess.Popen(  # pylint: disable=consider-using-with
+    #     [
+    #         "python",
+    #         "src/psycopt2d/model_training_watcher.py",
+    #         "--entity",
+    #         wandb_conf.entity,
+    #         "--project_name",
+    #         wandb_conf.project_name,
+    #         "--n_runs_before_eval",
+    #         str(watcher_conf.n_runs_before_first_eval),
+    #         "--overtaci",
+    #         meta_conf.overtaci,
+    #         "--timeout",
+    #         "None",
+    #         "--clean_wandb_dir",
+    #         watcher_conf.archive_all,
+    #     ],
+    # )
 
     train_models_for_each_cell_in_grid(
-        base_conf_file_name=train_conf.conf_name,
-        base_args=train_conf.base_args,
-        n_trials_per_cell_in_grid=train_conf.n_trials_per_cell_in_grid,
-        possible_look_distances=train_conf.possible_look_distance,
+        train_conf=train_conf,
     )
 
     msg.good(
         f"Training finished. Stopping the watcher in {watcher_conf.keep_alive_after_training_minutes} minutes...",
     )
 
-    time.sleep(60 * watcher_conf.keep_alive_after_training_minutes)
-    watcher.kill()
+    # time.sleep(60 * watcher_conf.keep_alive_after_training_minutes)
+    # watcher.kill()
     msg.good("Watcher stopped.")

From 1dbd900efae2807b1d3525c6c018c29c52e543b0 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Thu, 20 Oct 2022 15:48:47 +0200
Subject: [PATCH 11/57] fix: misc. minor fixes for training

---
 src/psycopt2d/config/data/synth_data.yaml     |  6 +-
 src/psycopt2d/config/data/t2d_parquet.yaml    |  3 +-
 .../evaluate_saved_model_predictions.py       |  4 +-
 src/psycopt2d/load.py                         | 90 +++++++++++--------
 src/psycopt2d/train_model.py                  |  3 +-
 src/psycopt2d/utils.py                        | 11 ++-
 6 files changed, 72 insertions(+), 45 deletions(-)

diff --git a/src/psycopt2d/config/data/synth_data.yaml b/src/psycopt2d/config/data/synth_data.yaml
index 316235ae..d99e3b32 100644
--- a/src/psycopt2d/config/data/synth_data.yaml
+++ b/src/psycopt2d/config/data/synth_data.yaml
@@ -2,6 +2,7 @@
 data:
   n_training_samples: null
   min_lookahead_days: null
+  min_lookbehind_days: null
   min_prediction_time_date: null
   lookahead_days: 30
   pred_col_name_prefix: "pred_"
@@ -9,12 +10,11 @@ data:
   outcome_timestamp_col_name: timestamp_outcome
   id_col_name: citizen_ids
   source: synthetic
-  min_lookbehind_days: null
   drop_patient_if_outcome_before_date: null
-  lookbehind_combination: null
+  lookbehind_combination: [30, 90]
 
 # Parameters that will only take effect if running with --multirun
 hydra:
   sweeper:
     params:
-      ++data.lookbehind_combinations: choice([30, 90], [30])
+      data.lookbehind_combination: choice([3000, 90], [30])
diff --git a/src/psycopt2d/config/data/t2d_parquet.yaml b/src/psycopt2d/config/data/t2d_parquet.yaml
index cd59b629..3255b74d 100644
--- a/src/psycopt2d/config/data/t2d_parquet.yaml
+++ b/src/psycopt2d/config/data/t2d_parquet.yaml
@@ -19,9 +19,10 @@ data:
   # Looking behind
   min_prediction_time_date: 2013-01-01
   min_lookbehind_days: 365 # (int): Drop all prediction times where (prediction_timestamp) - (min timestamp in the dataset) is less than min_lookbehind_days
+  lookbehind_combinations: choice([30, 90, 180, 365, 730], [30, 180, 730], [30, 730], [730])
 
 # Parameters that will only take effect if running with --multirun
 hydra:
   sweeper:
     params:
-      ++data.lookbehind_combinations: choice([30, 90], [30])
+      ++data.lookbehind_combinations: choice([3000], [30, 90])
diff --git a/src/psycopt2d/evaluate_saved_model_predictions.py b/src/psycopt2d/evaluate_saved_model_predictions.py
index bfa96ff1..a665d34f 100644
--- a/src/psycopt2d/evaluate_saved_model_predictions.py
+++ b/src/psycopt2d/evaluate_saved_model_predictions.py
@@ -29,14 +29,14 @@ def infer_look_distance(
     col_name: Union[Iterable[str], str],
     regex_pattern: str = r"within_(\d+)_days",
     allow_multiple: bool = True,
-) -> list[Union[int, float]]:
+) -> list[str]:
     """Infer look distances from col names."""
     # E.g. "outc_within_1_days" = 1
     # E.g. "outc_within_2_days" = 2
     # E.g. "pred_within_3_days" = 3
     # E.g. "pred_within_3_days" = 3
 
-    look_distances: list[Union[int, float]] = []
+    look_distances: list[str] = []
 
     if isinstance(col_name, Iterable) and not isinstance(col_name, str):
         for c_name in col_name:
diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py
index 65d4c34c..459685f8 100644
--- a/src/psycopt2d/load.py
+++ b/src/psycopt2d/load.py
@@ -2,6 +2,7 @@
 import re
 from collections.abc import Iterable
 from datetime import datetime, timedelta
+from multiprocessing.sharedctypes import Value
 from pathlib import Path
 from typing import Any, Optional, Union
 
@@ -11,7 +12,13 @@
 from pydantic import BaseModel, Field
 from wasabi import Printer
 
-from psycopt2d.utils import PROJECT_ROOT, coerce_to_datetime
+from psycopt2d.evaluate_saved_model_predictions import infer_look_distance
+from psycopt2d.utils import (
+    PROJECT_ROOT,
+    coerce_to_datetime,
+    get_percent_lost,
+    infer_predictor_col_name,
+)
 
 msg = Printer(timestamp=True)
 
@@ -168,7 +175,7 @@ def _drop_rows_if_datasets_ends_within_days(
             pd.DataFrame: Dataset with dropped rows.
         """
         if not isinstance(n_days, timedelta):
-            n_days = timedelta(days=n_days)  # type: ignore
+            n_days_timedelt: timedelta = timedelta(days=n_days)  # type: ignore
 
         if direction not in ("ahead", "behind"):
             raise ValueError(f"Direction {direction} not supported.")
@@ -176,23 +183,24 @@ def _drop_rows_if_datasets_ends_within_days(
         n_rows_before_modification = dataset.shape[0]
 
         if direction == "ahead":
-            max_datetime = dataset[self.spec.pred_time_colname].max() - n_days
+            max_datetime = dataset[self.spec.pred_time_colname].max() - n_days_timedelt
             before_max_dt = dataset[self.spec.pred_time_colname] < max_datetime
             dataset = dataset[before_max_dt]
         elif direction == "behind":
-            min_datetime = dataset[self.spec.pred_time_colname].min() + n_days
+            min_datetime = dataset[self.spec.pred_time_colname].min() + n_days_timedelt
             after_min_dt = dataset[self.spec.pred_time_colname] > min_datetime
             dataset = dataset[after_min_dt]
 
         n_rows_after_modification = dataset.shape[0]
-        percent_dropped = (
-            n_rows_before_modification - n_rows_after_modification
-        ) / n_rows_before_modification
-
-        msg.info(
-            f"Dropped {n_rows_before_modification - n_rows_after_modification} ({percent_dropped}%) rows because the end of the dataset was within {n_days} of their prediction time when looking {direction} from their prediction time",
+        percent_dropped = get_percent_lost(
+            n_before=n_rows_after_modification, n_after=n_rows_after_modification
         )
 
+        if n_rows_before_modification - n_rows_after_modification != 0:
+            msg.info(
+                f"Dropped {n_rows_before_modification - n_rows_after_modification} ({percent_dropped}%) rows because the end of the dataset was within {n_days} of their prediction time when looking {direction} from their prediction time",
+            )
+
         return dataset
 
     def _drop_patients_with_event_in_washin(self, dataset) -> pd.DataFrame:
@@ -216,14 +224,15 @@ def _drop_patients_with_event_in_washin(self, dataset) -> pd.DataFrame:
         ]
 
         n_rows_after_modification = dataset.shape[0]
-        percent_dropped = (
-            n_rows_before_modification - n_rows_after_modification
-        ) / n_rows_before_modification
-
-        msg.info(
-            f"Dropped {n_rows_before_modification - n_rows_after_modification} ({percent_dropped}%) rows because patients had diabetes in the washin period.",
+        percent_dropped = get_percent_lost(
+            n_before=n_rows_after_modification, n_after=n_rows_after_modification
         )
 
+        if n_rows_before_modification - n_rows_after_modification != 0:
+            msg.info(
+                f"Dropped {n_rows_before_modification - n_rows_after_modification} ({percent_dropped}%) rows because patients had diabetes in the washin period.",
+            )
+
         return dataset
 
     def _drop_cols_not_in_lookbehind_combination(
@@ -240,33 +249,43 @@ def _drop_cols_not_in_lookbehind_combination(
             pd.DataFrame: Dataset with dropped columns.
         """
 
+        if not self.spec.time.lookbehind_combination:
+            raise ValueError("No lookbehind_combination provided.")
+
         # Extract all unique lookbhehinds in the dataset predictors
         lookbehinds_in_dataset = {
-            int(re.findall(r"within_(\d+)_days", col)[0])
-            for col in dataset.columns
-            if self.pred_col_name_prefix in col
+            int(infer_look_distance(col)[0])
+            for col in infer_predictor_col_name(df=dataset)
         }
 
+        # Convert list to set
+        lookbehinds_in_spec = set(self.spec.time.lookbehind_combination)
+
         # Check that all loobehinds in lookbehind_combination are used in the predictors
-        if not set(self.spec.time.lookbehind_combination).issubset(
+        if not lookbehinds_in_spec.issubset(
             lookbehinds_in_dataset,
         ):
-            raise ValueError(
-                f"One or more of the provided lookbehinds in lookbehind_combination is/are not used in any predictors in the dataset. Lookbehinds in dataset: {lookbehinds_in_dataset}. Lookbehinds in lookbehind_combination: {self.spec.time.lookbehind_combination}.",
+            msg.warn(
+                f"One or more of the provided lookbehinds in lookbehind_combination is/are not used in any predictors in the dataset. Dataset: {lookbehinds_in_dataset}. lookbehind_combination: {self.spec.time.lookbehind_combination}.",
             )
 
+            lookbehinds_to_keep = lookbehinds_in_spec.intersection(
+                lookbehinds_in_dataset
+            )
+
+            if not lookbehinds_to_keep:
+                raise ValueError("No predictors left after dropping lookbehinds.")
+
+            msg.warn(f"Training on {lookbehinds_to_keep}.")
+
         # Create a list of all predictor columns who have a lookbehind window not in lookbehind_combination list
         cols_to_drop = [
-            col
-            for col in dataset.columns
-            if any(
-                str(x) not in col and self.pred_col_name_prefix in col
-                for x in self.spec.time.lookbehind_combination
-            )
+            c
+            for c in infer_predictor_col_name(df=dataset)
+            if any(str(l_beh) not in c for l_beh in lookbehinds_to_keep)
         ]
 
         dataset = dataset.drop(columns=cols_to_drop)
-
         return dataset
 
     def _convert_timestamp_dtype_and_nat(self, dataset: pd.DataFrame) -> pd.DataFrame:
@@ -330,14 +349,15 @@ def _drop_cols_if_exceeds_look_direction_threshold(
                     cols_to_drop.append(col)
 
         n_cols_after_modification = dataset.shape[1]
-        percent_dropped = (
-            n_cols_before_modification - n_cols_after_modification
-        ) / n_cols_before_modification
-
-        msg.info(
-            f"Dropped {n_cols_before_modification - n_cols_after_modification} ({percent_dropped}%) columns because they were looking {direction} further out than {look_direction_threshold} days.",
+        percent_dropped = get_percent_lost(
+            n_before=n_cols_before_modification, n_after=n_cols_after_modification
         )
 
+        if n_cols_before_modification - n_cols_after_modification != 0:
+            msg.info(
+                f"Dropped {n_cols_before_modification - n_cols_after_modification} ({percent_dropped}%) columns because they were looking {direction} further out than {look_direction_threshold} days.",
+            )
+
         return dataset[[c for c in dataset.columns if c not in cols_to_drop]]
 
     def _drop_cols_and_rows_if_look_direction_not_met(
diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py
index 2a79d729..80702815 100644
--- a/src/psycopt2d/train_model.py
+++ b/src/psycopt2d/train_model.py
@@ -21,13 +21,14 @@
 from psycopt2d.load import load_train_and_val_from_cfg
 from psycopt2d.models import MODELS
 from psycopt2d.utils import (
+    PROJECT_ROOT,
     create_wandb_folders,
     flatten_nested_dict,
     get_feature_importance_dict,
     prediction_df_with_metadata_to_disk,
 )
 
-CONFIG_PATH = Path(__file__).parent / "config"
+CONFIG_PATH = PROJECT_ROOT / "src" / "psycopt2d" / "config"
 TRAINING_COL_NAME_PREFIX = "pred_"
 
 # Handle wandb not playing nice with joblib
diff --git a/src/psycopt2d/utils.py b/src/psycopt2d/utils.py
index eac545e0..976e56db 100644
--- a/src/psycopt2d/utils.py
+++ b/src/psycopt2d/utils.py
@@ -397,12 +397,12 @@ def infer_col_names(
     df: pd.DataFrame,
     prefix: str,
     allow_multiple: bool = True,
-) -> Union[str, list[str]]:
+) -> list[str]:
     """Infer col names based on prefix."""
     col_name = [c for c in df.columns if c.startswith(prefix)]
 
     if len(col_name) == 1:
-        return col_name[0]
+        return [col_name[0]]
     elif len(col_name) > 1:
         if allow_multiple:
             return col_name
@@ -417,7 +417,7 @@ def infer_outcome_col_name(
     df: pd.DataFrame,
     prefix: str = "outc_",
     allow_multiple: bool = True,
-) -> Union[str, list[str]]:
+) -> list[str]:
     """Infer the outcome column name from the dataframe."""
     return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple)
 
@@ -438,3 +438,8 @@ def infer_y_hat_prob_col_name(
 ) -> str:
     """Infer the y_hat_prob column name from the dataframe."""
     return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple)
+
+
+def get_percent_lost(n_before: Union[int, float], n_after: Union[int, float]) -> float:
+    """Get the percent lost; returns 0.0 when n_before is 0 instead of dividing by zero."""
+    return round((100 * (1 - n_after / n_before)), 2) if n_before else 0.0

From 3e9fe78aee745d1b142a8b7881c8b1bb40e29b7b Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Thu, 20 Oct 2022 15:49:07 +0200
Subject: [PATCH 12/57] style: linting

---
 application/train_and_log_models.py |  3 ++-
 src/psycopt2d/load.py               | 12 +++++++-----
 src/psycopt2d/train_model.py        |  1 -
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index 94a7e8ec..8c2abe20 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -86,7 +86,8 @@ class TrainConf(BaseModel):
     conf_name: str = Field(default="integration_testing.yaml")
 
     multirun: bool = Field(
-        default=False, description="Whether to use Hydra to run multiple models."
+        default=False,
+        description="Whether to use Hydra to run multiple models.",
     )
 
     possible_look_distances: PossibleLookDistanceDays
diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py
index 459685f8..535220a7 100644
--- a/src/psycopt2d/load.py
+++ b/src/psycopt2d/load.py
@@ -2,7 +2,6 @@
 import re
 from collections.abc import Iterable
 from datetime import datetime, timedelta
-from multiprocessing.sharedctypes import Value
 from pathlib import Path
 from typing import Any, Optional, Union
 
@@ -193,7 +192,8 @@ def _drop_rows_if_datasets_ends_within_days(
 
         n_rows_after_modification = dataset.shape[0]
         percent_dropped = get_percent_lost(
-            n_before=n_rows_after_modification, n_after=n_rows_after_modification
+            n_before=n_rows_before_modification,
+            n_after=n_rows_after_modification,
         )
 
         if n_rows_before_modification - n_rows_after_modification != 0:
@@ -225,7 +225,8 @@ def _drop_patients_with_event_in_washin(self, dataset) -> pd.DataFrame:
 
         n_rows_after_modification = dataset.shape[0]
         percent_dropped = get_percent_lost(
-            n_before=n_rows_after_modification, n_after=n_rows_after_modification
+            n_before=n_rows_before_modification,
+            n_after=n_rows_after_modification,
         )
 
         if n_rows_before_modification - n_rows_after_modification != 0:
@@ -270,7 +271,7 @@ def _drop_cols_not_in_lookbehind_combination(
             )
 
             lookbehinds_to_keep = lookbehinds_in_spec.intersection(
-                lookbehinds_in_dataset
+                lookbehinds_in_dataset,
             )
 
             if not lookbehinds_to_keep:
@@ -350,7 +351,8 @@ def _drop_cols_if_exceeds_look_direction_threshold(
 
         n_cols_after_modification = dataset.shape[1]
         percent_dropped = get_percent_lost(
-            n_before=n_cols_before_modification, n_after=n_cols_after_modification
+            n_before=n_cols_before_modification,
+            n_after=n_cols_after_modification,
         )
 
         if n_cols_before_modification - n_cols_after_modification != 0:
diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py
index 80702815..e9d78bbe 100644
--- a/src/psycopt2d/train_model.py
+++ b/src/psycopt2d/train_model.py
@@ -1,7 +1,6 @@
 """Training script for training a single model for predicting t2d."""
 import os
 from collections.abc import Iterable
-from pathlib import Path
 from typing import Optional
 
 import hydra

From cc2961c965b5ef6963e6d17200b8126f5ff034f4 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <ryqiem@gmail.com>
Date: Fri, 21 Oct 2022 10:15:14 +0200
Subject: [PATCH 13/57] misc.

---
 application/train_and_log_models.py           | 186 +++++++++++-------
 src/psycopt2d/config/data/synth_data.yaml     |   4 +-
 src/psycopt2d/config/data/t2d_parquet.yaml    |  14 +-
 src/psycopt2d/config/default_config.yaml      |   1 +
 .../config/project/default_project.yaml       |   3 +-
 .../config/project/overtaci_test_project.yaml |   3 +-
 .../config/training/default_training.yaml     |   2 +-
 src/psycopt2d/evaluation.py                   |  11 +-
 src/psycopt2d/load.py                         |  24 ++-
 src/psycopt2d/model_training_watcher.py       |  32 +--
 src/psycopt2d/visualization/base_charts.py    |   2 +-
 .../visualization/feature_importance.py       |   2 +-
 tests/test_train_model.py                     |  10 +-
 13 files changed, 176 insertions(+), 118 deletions(-)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index 8c2abe20..80ad6eb9 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -5,10 +5,12 @@
 - Replace the HYDRA_ARGS string with the desired arguments for `train_model.py`
 - Run this script from project root with `python src/psycopt2d/train_and_log_models.py
 """
+import random
 import subprocess
 import time
 from pathlib import Path
 
+from hydra import compose, initialize
 from pydantic import BaseModel, Field
 from wasabi import Printer
 
@@ -18,6 +20,9 @@
     infer_predictor_col_name,
 )
 from psycopt2d.load import DataLoader, DatasetSpecification, DatasetTimeSpecification
+from psycopt2d.utils import PROJECT_ROOT
+
+msg = Printer(timestamp=True)
 
 
 class PossibleLookDistanceDays(BaseModel):
@@ -75,7 +80,7 @@ class TrainConf(BaseModel):
 
     n_trials_per_cell_in_grid: int = Field(
         default=50,
-        description="Number of trials per cell in the lookahead/lookbehind grid",
+        description="Number of trials per cell in the lookahead/lookbehind grid. If n > 1, automatically triggers multirun.",
     )
 
     model_conf: str = Field(
@@ -85,17 +90,13 @@ class TrainConf(BaseModel):
 
     conf_name: str = Field(default="integration_testing.yaml")
 
-    multirun: bool = Field(
-        default=False,
-        description="Whether to use Hydra to run multiple models.",
-    )
-
     possible_look_distances: PossibleLookDistanceDays
 
 
-def load_data(dataset_spec):
+def load_train_for_inference(dataset_spec):
     """Load the data."""
     loader = DataLoader(dataset_spec)
+    msg.info("Loading datasets for look direction inference")
     return loader.load_dataset_from_dir(split_names="train")
 
 
@@ -117,7 +118,7 @@ def infer_possible_look_directions(train):
     )
 
 
-def get_dataset_spec(data_dir_path: Path):
+def get_dataset_spec(data_dir_path: Path, file_suffix: str):
     """Get dataset specification."""
     time_spec = DatasetTimeSpecification(
         drop_patient_if_outcome_before_date=None,
@@ -127,7 +128,7 @@ def get_dataset_spec(data_dir_path: Path):
     )
 
     return DatasetSpecification(
-        file_suffix="csv",
+        file_suffix=file_suffix,
         time=time_spec,
         pred_col_name_prefix="pred_",
         pred_time_colname="timestamp",
@@ -135,109 +136,162 @@ def get_dataset_spec(data_dir_path: Path):
     )
 
 
+class LookDirectionCombination(BaseModel):
+    """One pairing of a lookbehind distance and a lookahead distance, in days."""
+
+    lookbehind: int  # lookbehind distance in days
+    lookahead: int  # lookahead distance in days
+
+
 def train_models_for_each_cell_in_grid(
     train_conf: TrainConf,
+    wandb_conf: WandbConf,
 ):
     """Train a model for each cell in the grid of possible look distances."""
     from random_word import RandomWords
 
     random_word = RandomWords()
 
-    for lookbehind in train_conf.possible_look_distances.behind:
-        for lookahead in train_conf.possible_look_distances.ahead:
-            wandb_group = f"{random_word.get_random_word()}-{random_word.get_random_word()}-beh-{lookbehind}-ahead-{lookahead}"
+    # Create all combinations of lookbehind and lookahead days
+    lookbehind_combinations = [
+        LookDirectionCombination(lookbehind=lookbehind, lookahead=lookahead)
+        for lookbehind in train_conf.possible_look_distances.behind
+        for lookahead in train_conf.possible_look_distances.ahead
+    ]
+
+    lookbehind_combinations = [
+        comb for comb in lookbehind_combinations if comb.lookahead <= 1095
+    ]
+
+    random.shuffle(lookbehind_combinations)
+
+    active_trainers: list[subprocess.Popen] = []
+
+    wandb_prefix = f"{random_word.get_random_word()}-{random_word.get_random_word()}"
+
+    while lookbehind_combinations:
+        # Throttle: while 4 trainers are tracked, drop the finished ones and wait before retrying
+        if len(active_trainers) >= 4:
+            active_trainers = [t for t in active_trainers if t.poll() is None]
+            time.sleep(1)
+            continue
 
-            subprocess_args: list[str] = [
-                "python",
-                "src/psycopt2d/train_model.py",
-                f"+model={train_conf.model_conf}",
-                f"data.min_lookbehind_days={lookbehind}",
-                f"data.min_lookahead_days={lookahead}",
-                f"project.wandb_group='{wandb_group}'",
-                f"hydra.sweeper.n_trials={train_conf.n_trials_per_cell_in_grid}",
-                "--config-name",
-                f"{meta_conf.conf_name}",
-            ]
+        cell = lookbehind_combinations.pop()
+        msg.info(
+            f"Spawning a new trainer with lookbehind={cell.lookbehind} and lookahead={cell.lookahead}"
+        )
 
-            if train_conf.multirun:
-                subprocess_args.insert(2, "--multirun")
+        wandb_group = f"{wandb_prefix}-beh-{cell.lookbehind}-ahead-{cell.lookahead}"
 
-            if train_conf.model_conf == "xgboost" and not train_conf.gpu:
-                subprocess_args.insert(3, "++model.args.tree_method='auto'")
+        subprocess_args: list[str] = [
+            "python",
+            "src/psycopt2d/train_model.py",
+            f"model={train_conf.model_conf}",
+            f"data.min_lookbehind_days={cell.lookbehind}",
+            f"data.min_lookahead_days={cell.lookahead}",
+            f"project.wandb_group='{wandb_group}'",
+            f"hydra.sweeper.n_trials={train_conf.n_trials_per_cell_in_grid}",
+            f"project.wandb_mode={wandb_conf.mode}",
+            "--config-name",
+            f"{meta_conf.conf_name}",
+        ]
 
-            msg.info("Starting trainer with command")
-            msg.info(f'{" ".join(subprocess_args)}')
+        if train_conf.n_trials_per_cell_in_grid > 1:
+            subprocess_args.insert(2, "--multirun")
 
-            trainer = subprocess.Popen(  # pylint: disable=consider-using-with
+        if train_conf.model_conf == "xgboost" and not train_conf.gpu:
+            subprocess_args.insert(3, "++model.args.tree_method='auto'")
+
+        msg.info(f'{" ".join(subprocess_args)}')
+
+        active_trainers.append(
+            subprocess.Popen(  # pylint: disable=consider-using-with
                 args=subprocess_args,
             )
-
-            while trainer.poll() is None:
-                time.sleep(1)
+        )
 
 
 if __name__ == "__main__":
     msg = Printer(timestamp=True)
 
+    CONFIG_FILE_NAME = "default_config.yaml"
+
+    with initialize(version_base=None, config_path="../src/psycopt2d/config/"):
+        cfg = compose(
+            config_name=CONFIG_FILE_NAME,
+        )
+
     meta_conf = MetaConf(
-        conf_name="integration_testing.yaml",
+        conf_name=CONFIG_FILE_NAME,
         overtaci="false",
-        data_dir=Path(
-            "/Users/au484925/Desktop/psycop-t2d/tests/test_data/synth_splits/",
-        ),
+        data_dir=cfg.data.dir,
     )
 
     wandb_conf = WandbConf(
         entity="psycop",
         project_name="psycopt2d-testing",
-        mode="offline",
+        mode=cfg.project.wandb_mode,
     )
 
-    watcher_conf = WatcherConf(archive_all="true", keep_alive_after_training_minutes=5)
-    dataset_spec = get_dataset_spec(data_dir_path=meta_conf.data_dir)
-    train = load_data(dataset_spec=dataset_spec)
+    watcher_conf = WatcherConf(archive_all="false", keep_alive_after_training_minutes=5)
+
+    watcher = subprocess.Popen(  # pylint: disable=consider-using-with
+        [
+            "python",
+            "src/psycopt2d/model_training_watcher.py",
+            "--entity",
+            wandb_conf.entity,
+            "--project_name",
+            wandb_conf.project_name,
+            "--n_runs_before_eval",
+            str(watcher_conf.n_runs_before_first_eval),
+            "--overtaci",
+            meta_conf.overtaci,
+            "--timeout",
+            "None",
+            "--clean_wandb_dir",
+            watcher_conf.archive_all,
+        ],
+    )
+
+    dataset_spec = get_dataset_spec(
+        data_dir_path=meta_conf.data_dir, file_suffix=cfg.data.suffix
+    )
+    train = load_train_for_inference(dataset_spec=dataset_spec)
 
     possible_look_distances = infer_possible_look_directions(train)
 
+    # Remove "9999" from possible look distances behind
+    possible_look_distances.behind = [
+        dist for dist in possible_look_distances.behind if dist != "9999"
+    ]
+
+    msg.info(f"Possible lookbehind days: {possible_look_distances.behind}")
+    msg.info(f"Possible lookahead days: {possible_look_distances.ahead}")
+
     train_conf = TrainConf(
         conf_name=meta_conf.conf_name,
-        multirun=False,
         model_conf="xgboost",
         n_trials_per_cell_in_grid=1,
         possible_look_distances=possible_look_distances,
-        gpu=False,
+        gpu=True,
     )
 
     if not train_conf.gpu:
         msg.warn("Not using GPU for training")
 
-    # watcher = subprocess.Popen(  # pylint: disable=consider-using-with
-    #     [
-    #         "python",
-    #         "src/psycopt2d/model_training_watcher.py",
-    #         "--entity",
-    #         wandb_conf.entity,
-    #         "--project_name",
-    #         wandb_conf.project_name,
-    #         "--n_runs_before_eval",
-    #         str(watcher_conf.n_runs_before_first_eval),
-    #         "--overtaci",
-    #         meta_conf.overtaci,
-    #         "--timeout",
-    #         "None",
-    #         "--clean_wandb_dir",
-    #         watcher_conf.archive_all,
-    #     ],
-    # )
-
-    train_models_for_each_cell_in_grid(
-        train_conf=train_conf,
+    clean_dir_seconds = 0
+    msg.info(
+        f"Sleeping for {clean_dir_seconds} seconds to allow watcher to start and clean dir"
     )
+    time.sleep(clean_dir_seconds)
+
+    train_models_for_each_cell_in_grid(train_conf=train_conf, wandb_conf=wandb_conf)
 
     msg.good(
         f"Training finished. Stopping the watcher in {watcher_conf.keep_alive_after_training_minutes} minutes...",
     )
 
-    # time.sleep(60 * watcher_conf.keep_alive_after_training_minutes)
-    # watcher.kill()
+    time.sleep(60 * watcher_conf.keep_alive_after_training_minutes)
+    watcher.kill()
     msg.good("Watcher stopped.")
diff --git a/src/psycopt2d/config/data/synth_data.yaml b/src/psycopt2d/config/data/synth_data.yaml
index d99e3b32..8089f440 100644
--- a/src/psycopt2d/config/data/synth_data.yaml
+++ b/src/psycopt2d/config/data/synth_data.yaml
@@ -5,11 +5,11 @@ data:
   min_lookbehind_days: null
   min_prediction_time_date: null
   lookahead_days: 30
-  pred_col_name_prefix: "pred_"
+  pred_col_name_prefix: pred_
   pred_timestamp_col_name: timestamp
   outcome_timestamp_col_name: timestamp_outcome
   id_col_name: citizen_ids
-  source: synthetic
+  suffix: synthetic
   drop_patient_if_outcome_before_date: null
   lookbehind_combination: [30, 90]
 
diff --git a/src/psycopt2d/config/data/t2d_parquet.yaml b/src/psycopt2d/config/data/t2d_parquet.yaml
index 3255b74d..611b3d02 100644
--- a/src/psycopt2d/config/data/t2d_parquet.yaml
+++ b/src/psycopt2d/config/data/t2d_parquet.yaml
@@ -2,8 +2,8 @@
 data:
   # General config
   n_training_samples: null # (int, null): Number of training samples to use, defaults to null in which cases it uses all samples.
-  dir: E:\shared_resources\feature_sets\t2d\feature_sets\psycop_t2d_adminmanber_201_features_2022_10_05_15_14
-  source: parquet # Where to load data from. Takes "sql" or "synthetic"
+  dir: E:\shared_resources\feature_sets\t2d\feature_sets\psycop_t2d_adminmanber_416_features_2022_10_20_11_12
+  suffix: parquet # File suffix to load.
 
   # Feature specs
   pred_col_name_prefix: "pred_" # (str): prefix of predictor columns
@@ -12,17 +12,17 @@ data:
   id_col_name: dw_ek_borger # (str): Citizen colnames
 
   # Looking ahead
-  lookahead_days: 1825 # (float): Number of days from prediction time to look ahead for the outcome.
-  min_lookahead_days: 1825 # (int): Drop all prediction times where (max timestamp in the dataset) - (current timestamp) is less than min_lookahead_days
-  drop_patient_if_outcome_before_date: 2013-01-01
+  lookahead_days: 365 # (float): Number of days from prediction time to look ahead for the outcome.
+  min_lookahead_days: 365 # (int): Drop all prediction times where (max timestamp in the dataset) - (current timestamp) is less than min_lookahead_days
+  drop_patient_if_outcome_before_date: null
 
   # Looking behind
   min_prediction_time_date: 2013-01-01
   min_lookbehind_days: 365 # (int): Drop all prediction times where (prediction_timestamp) - (min timestamp in the dataset) is less than min_lookbehind_days
-  lookbehind_combinations: choice([30, 90, 180, 365, 730], [30, 180, 730], [30, 730], [730])
+  lookbehind_combination: [30, 90, 180, 365]
 
 # Parameters that will only take effect if running with --multirun
 hydra:
   sweeper:
     params:
-      ++data.lookbehind_combinations: choice([3000], [30, 90])
+      ++data.lookbehind_combination: choice([3000], [30, 90])
diff --git a/src/psycopt2d/config/default_config.yaml b/src/psycopt2d/config/default_config.yaml
index 2d081f25..e67a67ec 100644
--- a/src/psycopt2d/config/default_config.yaml
+++ b/src/psycopt2d/config/default_config.yaml
@@ -3,6 +3,7 @@ defaults:
   - project: overtaci_test_project
   - data: t2d_parquet
   - preprocessing: default_preprocessing
+  - model: xgboost
   - training: default_training
   - evaluation: default_evaluation
   - sweeper: optuna_multithread
diff --git a/src/psycopt2d/config/project/default_project.yaml b/src/psycopt2d/config/project/default_project.yaml
index f3684005..404397fa 100644
--- a/src/psycopt2d/config/project/default_project.yaml
+++ b/src/psycopt2d/config/project/default_project.yaml
@@ -1,3 +1,4 @@
 name: psycop-t2d
 seed: 42
-wandb_mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
\ No newline at end of file
+wandb_mode: "run" # Which mode to run WandB in. Takes "run", "dryrun", "offline" and "disabled"
+wandb_group: "psycop-t2d" # Which group to run WandB in.
\ No newline at end of file
diff --git a/src/psycopt2d/config/project/overtaci_test_project.yaml b/src/psycopt2d/config/project/overtaci_test_project.yaml
index 22dedaeb..ae8e6c6c 100644
--- a/src/psycopt2d/config/project/overtaci_test_project.yaml
+++ b/src/psycopt2d/config/project/overtaci_test_project.yaml
@@ -1,3 +1,4 @@
 name: psycop-t2d-testing
 seed: 42
-wandb_mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
\ No newline at end of file
+wandb_mode: "offline" # Which mode to run WandB in. Takes "run", "dryrun", "offline" and "disabled"
+wandb_group: "psycop-t2d"
\ No newline at end of file
diff --git a/src/psycopt2d/config/training/default_training.yaml b/src/psycopt2d/config/training/default_training.yaml
index 932506fc..56014ceb 100644
--- a/src/psycopt2d/config/training/default_training.yaml
+++ b/src/psycopt2d/config/training/default_training.yaml
@@ -1 +1 @@
-n_splits: 3 # (int, Null): Number of k-folds during CV. If Null, loads pre-defined dataset.
+n_splits: 2 # (int, Null): Number of k-folds during CV. If Null, loads pre-defined dataset.
diff --git a/src/psycopt2d/evaluation.py b/src/psycopt2d/evaluation.py
index 12f37a35..ff4a7235 100644
--- a/src/psycopt2d/evaluation.py
+++ b/src/psycopt2d/evaluation.py
@@ -61,14 +61,6 @@ def evaluate_model(
     feature_importance_dict: Optional[dict[str, float]],
 ) -> None:
     """Runs the evaluation suite on the model and logs to WandB.
-    At present, this includes:
-    1. AUC
-    2. Table of performance by pred_proba threshold
-    3. Feature importance
-    4. Sensitivity by time to outcome
-    5. AUC by calendar time
-    6. AUC by time from first visit
-    7. F1 by time until diagnosis
 
     Args:
         cfg (OmegaConf): The hydra config from the run
@@ -84,7 +76,8 @@ def evaluate_model(
     msg.info("Starting model evaluation")
 
     SAVE_DIR = PROJECT_ROOT / ".tmp"  # pylint: disable=invalid-name
-    # When parallelising tests, this causes issues since multiple processes
+    # When running tests in parallel with pytest-xdist,
+    # this causes issues since multiple processes
     # override the same dir at once.
     # Can be solved by allowing config to override this
     # and using tmp_dir in pytest. Not worth refactoring
diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py
index 535220a7..1f6ddf11 100644
--- a/src/psycopt2d/load.py
+++ b/src/psycopt2d/load.py
@@ -192,7 +192,7 @@ def _drop_rows_if_datasets_ends_within_days(
 
         n_rows_after_modification = dataset.shape[0]
         percent_dropped = get_percent_lost(
-            n_before=n_rows_after_modification,
+            n_before=n_rows_before_modification,
             n_after=n_rows_after_modification,
         )
 
@@ -210,7 +210,7 @@ def _drop_patients_with_event_in_washin(self, dataset) -> pd.DataFrame:
 
         # Remove dates before drop_patient_if_outcome_before_date
         outcome_before_date = (
-            dataset["timestamp_first_diabetes_any"]
+            dataset["_timestamp_first_t2d"]
             < self.spec.time.drop_patient_if_outcome_before_date
         )
 
@@ -257,6 +257,7 @@ def _drop_cols_not_in_lookbehind_combination(
         lookbehinds_in_dataset = {
             int(infer_look_distance(col)[0])
             for col in infer_predictor_col_name(df=dataset)
+            if len(infer_look_distance(col)) > 0
         }
 
         # Convert list to set
@@ -278,14 +279,19 @@ def _drop_cols_not_in_lookbehind_combination(
                 raise ValueError("No predictors left after dropping lookbehinds.")
 
             msg.warn(f"Training on {lookbehinds_to_keep}.")
+        else:
+            lookbehinds_to_keep = lookbehinds_in_spec
 
         # Create a list of all predictor columns who have a lookbehind window not in lookbehind_combination list
         cols_to_drop = [
             c
             for c in infer_predictor_col_name(df=dataset)
-            if any(str(l_beh) not in c for l_beh in lookbehinds_to_keep)
+            if all(str(l_beh) not in c for l_beh in lookbehinds_to_keep)
         ]
 
+        cols_to_drop = [c for c in cols_to_drop if "within" in c]
+        # TODO: Add some specification of within_x_days indicating how to parse columns to find lookbehinds. Or, alternatively, use the column spec.
+
         dataset = dataset.drop(columns=cols_to_drop)
         return dataset
 
@@ -331,9 +337,7 @@ def _drop_cols_if_exceeds_look_direction_threshold(
         n_cols_before_modification = dataset.shape[1]
 
         if direction == "behind":
-            cols_to_process = [
-                c for c in dataset.columns if self.pred_col_name_prefix in c
-            ]
+            cols_to_process = infer_predictor_col_name(df=dataset)
 
             for col in cols_to_process:
                 # Extract lookbehind days from column name use regex
@@ -344,7 +348,8 @@ def _drop_cols_if_exceeds_look_direction_threshold(
                 if len(lookbehind_days_strs) > 0:
                     lookbehind_days = int(lookbehind_days_strs[0])
                 else:
-                    raise ValueError(f"Could not extract lookbehind days from {col}")
+                    msg.warn(f"Could not extract lookbehind days from {col}")
+                    continue
 
                 if lookbehind_days > look_direction_threshold:
                     cols_to_drop.append(col)
@@ -447,6 +452,7 @@ def load_dataset_from_dir(
         Returns:
             pd.DataFrame: The filtered dataset
         """
+        msg.info(f"Loading {split_names}")
         # Handle input types
         for timedelta_arg in (
             self.spec.time.min_lookbehind_days,
@@ -499,12 +505,12 @@ def _init_spec_from_cfg(
         resolve=True,
     )
 
-    if data_cfg["source"] == "synthetic":
+    if data_cfg["suffix"] == "synthetic":
         split_dir_path = PROJECT_ROOT / "tests" / "test_data" / "synth_splits"
         file_suffix = "csv"
     else:
         split_dir_path = data_cfg["dir"]
-        file_suffix = data_cfg["source"]
+        file_suffix = data_cfg["suffix"]
 
     time_spec = DatasetTimeSpecification(
         drop_patient_if_outcome_before_date=data_cfg[
diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py
index 5ac58127..3305122c 100644
--- a/src/psycopt2d/model_training_watcher.py
+++ b/src/psycopt2d/model_training_watcher.py
@@ -12,13 +12,9 @@
 from wasabi import msg
 
 from psycopt2d.evaluation import evaluate_model
-from psycopt2d.utils import (
-    MODEL_PREDICTIONS_PATH,
-    PROJECT_ROOT,
-    infer_outcome_col_name,
-    infer_y_hat_prob_col_name,
-    load_evaluation_data,
-)
+from psycopt2d.utils import (MODEL_PREDICTIONS_PATH, PROJECT_ROOT,
+                             infer_outcome_col_name, infer_y_hat_prob_col_name,
+                             load_evaluation_data)
 
 # Path to the wandb directory
 WANDB_DIR = PROJECT_ROOT / "wandb"
@@ -52,11 +48,11 @@ def __init__(
         self.n_runs_before_eval = n_runs_before_eval
 
         # A queue for runs waiting to be uploaded to WandB
-        self.run_id_upload_queue = []
+        self.run_id_eval_candidates_queue = []
         self.max_performance = 0
 
         self.archive_path = WANDB_DIR / "archive"
-        self.archive_path.mkdir(exist_ok=True)
+        self.archive_path.mkdir(exist_ok=True, parents=True)
 
     def watch(self, timeout_minutes: Optional[int] = None) -> None:
         """Watch the wandb directory for new runs.
@@ -70,12 +66,12 @@ def watch(self, timeout_minutes: Optional[int] = None) -> None:
             timeout_minutes is None or start_time + timeout_minutes * 60 > time.time()
         ):
             self.get_new_runs_and_evaluate()
-            time.sleep(10)
+            time.sleep(1)
 
     def get_new_runs_and_evaluate(self) -> None:
         """Get new runs and evaluate the best runs."""
         self.upload_unarchived_runs()
-        if len(self.run_id_upload_queue) >= self.n_runs_before_eval:
+        if len(self.run_id_eval_candidates_queue) >= self.n_runs_before_eval:
             self.evaluate_best_runs()
 
     def _upload_run_dir(self, run_dir: Path) -> None:
@@ -87,7 +83,7 @@ def _upload_run_dir(self, run_dir: Path) -> None:
 
     def _archive_run_dir(self, run_dir: Path) -> None:
         """Move a run to the archive folder."""
-        run_dir.rename(self.archive_path / run_dir.name)
+        run_dir.rename(target=self.archive_path / run_dir.name)
 
     def _get_run_id(self, run_dir: Path) -> str:
         """Get the run id from a run directory."""
@@ -96,11 +92,17 @@ def _get_run_id(self, run_dir: Path) -> str:
     def upload_unarchived_runs(self) -> None:
         """Upload unarchived runs to wandb."""
         for run_folder in WANDB_DIR.glob(r"offline-run*"):
+            # TODO: We need some kind of test here to figure out if the run is
+            # still running or not. If it is still running, we should wait
+            # until it is finished. Otherwise, we get a "permission denied" error.
             run_id = self._get_run_id(run_folder)
 
             self._upload_run_dir(run_folder)
+
+            # TODO: If upload_run_dir fails, we should not archive the run.
+            # use return from subprocess.run to check if it failed. See docs: https://docs.python.org/3/library/subprocess.html
             self._archive_run_dir(run_folder)
-            self.run_id_upload_queue.append(run_id)
+            self.run_id_eval_candidates_queue.append(run_id)
 
     def _get_run_evaluation_dir(self, run_id: str) -> Path:
         """Get the evaluation path for a single run."""
@@ -151,7 +153,7 @@ def evaluate_best_runs(self) -> None:
         """Evaluate the best runs."""
         run_performances = {
             run_id: self._get_run_performance(run_id)
-            for run_id in self.run_id_upload_queue
+            for run_id in self.run_id_eval_candidates_queue
         }
         # sort runs by performance to not upload subpar runs
         run_performances = dict(
@@ -168,7 +170,7 @@ def evaluate_best_runs(self) -> None:
                 self.max_performance = performance
                 self._do_evaluation(run_id)
         # reset run id queue and try to upload unfinished runs next time
-        self.run_id_upload_queue = unfinished_runs
+        self.run_id_eval_candidates_queue = unfinished_runs
 
     def archive_all_runs(self) -> None:
         """Archive all runs in the wandb directory."""
diff --git a/src/psycopt2d/visualization/base_charts.py b/src/psycopt2d/visualization/base_charts.py
index 8f1188e4..bbad7fe6 100644
--- a/src/psycopt2d/visualization/base_charts.py
+++ b/src/psycopt2d/visualization/base_charts.py
@@ -12,7 +12,7 @@ def plot_basic_chart(
     y_values: Iterable,
     x_title: str,
     y_title: str,
-    plot_type: Optional[Union[list[str], str]],
+    plot_type: Union[list[str], str],
     sort_x: Optional[Iterable[int]] = None,
     sort_y: Optional[Iterable[int]] = None,
     fig_size: Optional[tuple] = (10, 10),
diff --git a/src/psycopt2d/visualization/feature_importance.py b/src/psycopt2d/visualization/feature_importance.py
index 59577105..1b7d9084 100644
--- a/src/psycopt2d/visualization/feature_importance.py
+++ b/src/psycopt2d/visualization/feature_importance.py
@@ -47,7 +47,7 @@ def plot_feature_importances(
         y_values=df["feature_importances"].tolist(),
         x_title="Feature importance (gain)",
         y_title="Feature name",
-        sort_x=np.flip(np.arange(len(feature_importances))),
+        sort_x=np.flip(np.arange(len(df["feature_importances"]))),
         plot_type="hbar",
         fig_size=(16, 10),
         save_path=save_path,
diff --git a/tests/test_train_model.py b/tests/test_train_model.py
index 0339f72d..4538dc14 100644
--- a/tests/test_train_model.py
+++ b/tests/test_train_model.py
@@ -7,7 +7,7 @@
 from psycopt2d.train_model import main
 
 CONFIG_DIR_PATH = "../src/psycopt2d/config/"
-CONFIG_FILE_NAME = "integration_testing.yaml"
+INTEGRATION_TEST_FILE_NAME = "integration_testing.yaml"
 INTEGRATION_TESTING_MODEL_OVERRIDE = "+model=logistic-regression"
 
 
@@ -17,7 +17,7 @@ def test_main(model_name):
     with initialize(version_base=None, config_path=CONFIG_DIR_PATH):
 
         cfg = compose(
-            config_name=CONFIG_FILE_NAME,
+            config_name=INTEGRATION_TEST_FILE_NAME,
             overrides=[f"+model={model_name}"],
         )
 
@@ -38,7 +38,7 @@ def test_integration_test():
     with initialize(version_base=None, config_path=CONFIG_DIR_PATH):
 
         cfg = compose(
-            config_name=CONFIG_FILE_NAME,
+            config_name=INTEGRATION_TEST_FILE_NAME,
             overrides=[INTEGRATION_TESTING_MODEL_OVERRIDE],
         )
         main(cfg)
@@ -48,7 +48,7 @@ def test_crossvalidation():
     """Test crossvalidation."""
     with initialize(version_base=None, config_path=CONFIG_DIR_PATH):
         cfg = compose(
-            config_name=CONFIG_FILE_NAME,
+            config_name=INTEGRATION_TEST_FILE_NAME,
             overrides=[INTEGRATION_TESTING_MODEL_OVERRIDE, "+data.n_splits=2"],
         )
         main(cfg)
@@ -58,7 +58,7 @@ def test_min_prediction_time_date():
     """Test crossvalidation."""
     with initialize(version_base=None, config_path=CONFIG_DIR_PATH):
         cfg = compose(
-            config_name=CONFIG_FILE_NAME,
+            config_name=INTEGRATION_TEST_FILE_NAME,
             overrides=[
                 INTEGRATION_TESTING_MODEL_OVERRIDE,
                 "+data.min_prediction_time_date=1972-01-01",

From a64ebe8fe30ae651962a63e8b4e12b4285a1bb17 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <ryqiem@gmail.com>
Date: Fri, 21 Oct 2022 10:33:18 +0200
Subject: [PATCH 14/57] Begin refactoring

---
 src/psycopt2d/dataclasses/configs.py | 19 -------------------
 1 file changed, 19 deletions(-)
 delete mode 100644 src/psycopt2d/dataclasses/configs.py

diff --git a/src/psycopt2d/dataclasses/configs.py b/src/psycopt2d/dataclasses/configs.py
deleted file mode 100644
index e809d072..00000000
--- a/src/psycopt2d/dataclasses/configs.py
+++ /dev/null
@@ -1,19 +0,0 @@
-"""Dataclasses used in the project."""
-from typing import Optional
-
-import pandas as pd
-from omegaconf import DictConfig
-from pydantic import BaseModel
-
-# pylint: disable=missing-class-docstring, too-few-public-methods
-
-
-class ModelEvalData(BaseModel):
-    """Dataclass for model evaluation data."""
-
-    class Config:
-        arbitrary_types_allowed = True
-
-    df: pd.DataFrame
-    cfg: DictConfig
-    feature_importance_dict: Optional[dict[str, float]] = None

From 917b043a96eca4467276756d8f31cb7ea80989cd Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <ryqiem@gmail.com>
Date: Fri, 21 Oct 2022 10:33:31 +0200
Subject: [PATCH 15/57] Begin refactoring

---
 src/psycopt2d/configs.py                      | 19 +++++++++++
 .../utils/omegaconf_to_pydantic_objects.py    | 32 +++++++++++++++++++
 2 files changed, 51 insertions(+)
 create mode 100644 src/psycopt2d/configs.py
 create mode 100644 src/psycopt2d/utils/omegaconf_to_pydantic_objects.py

diff --git a/src/psycopt2d/configs.py b/src/psycopt2d/configs.py
new file mode 100644
index 00000000..e809d072
--- /dev/null
+++ b/src/psycopt2d/configs.py
@@ -0,0 +1,19 @@
+"""Dataclasses used in the project."""
+from typing import Optional
+
+import pandas as pd
+from omegaconf import DictConfig
+from pydantic import BaseModel
+
+# pylint: disable=missing-class-docstring, too-few-public-methods
+
+
+class ModelEvalData(BaseModel):
+    """Model evaluation data: a dataframe, a config and optional feature importances."""
+
+    class Config:
+        arbitrary_types_allowed = True  # pd.DataFrame / DictConfig are not pydantic types
+
+    df: pd.DataFrame
+    cfg: DictConfig
+    feature_importance_dict: Optional[dict[str, float]] = None  # optional; defaults to None
diff --git a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py
new file mode 100644
index 00000000..bd476e95
--- /dev/null
+++ b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py
@@ -0,0 +1,32 @@
+"""Utilities for converting config yamls to pydantic objects. 
+
+Helpful because it makes them:
+- Addressable with intellisense,
+- Refactorable with IDEs, 
+- Easier to document with docstrings and 
+- Type checkable
+"""
+
+import pydantic
+from hydra import compose, initialize
+from omegaconf import DictConfig
+
+
+def omegaconf_to_pydantic_cfg(cfg: DictConfig) -> pydantic.BaseModel:
+    """Convert OmegaConf to pydantic config."""
+    return pydantic.parse_obj_as(pydantic.BaseModel, cfg)
+
+
+def main():
+    with initialize(version_base=None, config_path="../src/psycopt2d/config/"):
+        cfg = compose(
+            config_name="defualt_config.yaml",
+        )
+
+    pydantic_obj = omegaconf_to_pydantic_cfg(cfg)
+
+    pass
+
+
+if __name__ == "__main__":
+    main()

From 58bd9fdab475ccb3d12a4d816a79bcac7197a9cb Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Fri, 21 Oct 2022 11:44:15 +0200
Subject: [PATCH 16/57] refactor: convert omegaconf to pydantic objs

---
 application/train_and_log_models.py           |   2 +-
 reports/render_report.py                      |   2 +-
 src/psycopt2d/config/data/synth_data.yaml     |   3 +-
 src/psycopt2d/config/default_config.yaml      |   4 +-
 .../default_evaluation.yaml                   |   0
 .../evaluation_synth.yaml                     |   0
 src/psycopt2d/config/integration_testing.yaml |   5 +-
 src/psycopt2d/config/overtaci_testing.yaml    |   8 --
 src/psycopt2d/config/sweep_xgboost.yaml       |   8 --
 .../{training => train}/default_training.yaml |   0
 .../evaluate_saved_model_predictions.py       |   2 +-
 src/psycopt2d/evaluation.py                   |   2 +-
 src/psycopt2d/load.py                         |  80 ++++++------
 src/psycopt2d/model_training_watcher.py       |  10 +-
 src/psycopt2d/train_model.py                  |  17 ++-
 src/psycopt2d/utils/__init__.py               |   0
 .../utils/omegaconf_to_pydantic_objects.py    | 121 ++++++++++++++++--
 src/psycopt2d/{ => utils}/utils.py            |   8 +-
 .../visualization/performance_over_time.py    |   2 +-
 src/psycopt2d/visualization/prob_over_time.py |   2 +-
 src/psycopt2d/visualization/sens_over_time.py |   6 +-
 tests/test_auc_by_group_table.py              |   2 +-
 tests/test_calculate_performance_metrics.py   |   2 +-
 tests/test_load.py                            |  15 ++-
 tests/test_performance_by_threshold.py        |   2 +-
 tests/test_train_model.py                     |   9 +-
 tests/test_utils.py                           |  28 +++-
 tests/test_visualizations.py                  |   2 +-
 28 files changed, 237 insertions(+), 105 deletions(-)
 rename src/psycopt2d/config/{evaluation => eval}/default_evaluation.yaml (100%)
 rename src/psycopt2d/config/{evaluation => eval}/evaluation_synth.yaml (100%)
 delete mode 100644 src/psycopt2d/config/overtaci_testing.yaml
 delete mode 100644 src/psycopt2d/config/sweep_xgboost.yaml
 rename src/psycopt2d/config/{training => train}/default_training.yaml (100%)
 create mode 100644 src/psycopt2d/utils/__init__.py
 rename src/psycopt2d/{ => utils}/utils.py (98%)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index 80ad6eb9..adc8bcab 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -20,7 +20,7 @@
     infer_predictor_col_name,
 )
 from psycopt2d.load import DataLoader, DatasetSpecification, DatasetTimeSpecification
-from psycopt2d.utils import PROJECT_ROOT
+from psycopt2d.utils.utils import PROJECT_ROOT
 
 msg = Printer(timestamp=True)
 
diff --git a/reports/render_report.py b/reports/render_report.py
index e62d906f..0204127e 100644
--- a/reports/render_report.py
+++ b/reports/render_report.py
@@ -9,7 +9,7 @@
 
 import pandas as pd
 
-from psycopt2d.utils import PROJECT_ROOT
+from psycopt2d.utils.utils import PROJECT_ROOT
 
 # import pandoc
 # See comment in pyproject.toml on Pandoc, not currently in use. Should work now, see: https://github.com/boisgera/pandoc/pull/49#issuecomment-1265983279
diff --git a/src/psycopt2d/config/data/synth_data.yaml b/src/psycopt2d/config/data/synth_data.yaml
index 8089f440..f94b5da9 100644
--- a/src/psycopt2d/config/data/synth_data.yaml
+++ b/src/psycopt2d/config/data/synth_data.yaml
@@ -9,7 +9,8 @@ data:
   pred_timestamp_col_name: timestamp
   outcome_timestamp_col_name: timestamp_outcome
   id_col_name: citizen_ids
-  suffix: synthetic
+  dir: "../psycop-t2d/tests/test_data/synth_splits/"
+  suffix: csv
   drop_patient_if_outcome_before_date: null
   lookbehind_combination: [30, 90]
 
diff --git a/src/psycopt2d/config/default_config.yaml b/src/psycopt2d/config/default_config.yaml
index e67a67ec..5590c26f 100644
--- a/src/psycopt2d/config/default_config.yaml
+++ b/src/psycopt2d/config/default_config.yaml
@@ -4,6 +4,6 @@ defaults:
   - data: t2d_parquet
   - preprocessing: default_preprocessing
   - model: xgboost
-  - training: default_training
-  - evaluation: default_evaluation
+  - train: default_training
+  - eval: default_evaluation
   - sweeper: optuna_multithread
diff --git a/src/psycopt2d/config/evaluation/default_evaluation.yaml b/src/psycopt2d/config/eval/default_evaluation.yaml
similarity index 100%
rename from src/psycopt2d/config/evaluation/default_evaluation.yaml
rename to src/psycopt2d/config/eval/default_evaluation.yaml
diff --git a/src/psycopt2d/config/evaluation/evaluation_synth.yaml b/src/psycopt2d/config/eval/evaluation_synth.yaml
similarity index 100%
rename from src/psycopt2d/config/evaluation/evaluation_synth.yaml
rename to src/psycopt2d/config/eval/evaluation_synth.yaml
diff --git a/src/psycopt2d/config/integration_testing.yaml b/src/psycopt2d/config/integration_testing.yaml
index 5a3c6d52..6b860e1b 100644
--- a/src/psycopt2d/config/integration_testing.yaml
+++ b/src/psycopt2d/config/integration_testing.yaml
@@ -3,6 +3,7 @@ defaults:
   - project: integration_test_project
   - data: synth_data
   - preprocessing: default_preprocessing
-  - training: default_training
-  - evaluation: evaluation_synth
+  - train: default_training
+  - model: xgboost
+  - eval: evaluation_synth
   - sweeper: optuna_singlethread
diff --git a/src/psycopt2d/config/overtaci_testing.yaml b/src/psycopt2d/config/overtaci_testing.yaml
deleted file mode 100644
index 39aca780..00000000
--- a/src/psycopt2d/config/overtaci_testing.yaml
+++ /dev/null
@@ -1,8 +0,0 @@
-# @package _global_
-defaults:
-  - project: overtaci_test_project
-  - data: t2d_parquet
-  - preprocessing: default_preprocessing
-  - training: default_training
-  - evaluation: default_evaluation
-  - sweeper: optuna_singlethread
diff --git a/src/psycopt2d/config/sweep_xgboost.yaml b/src/psycopt2d/config/sweep_xgboost.yaml
deleted file mode 100644
index 0295eb9a..00000000
--- a/src/psycopt2d/config/sweep_xgboost.yaml
+++ /dev/null
@@ -1,8 +0,0 @@
-# @package _global_
-defaults:
-  - project: default_project
-  - data: all_csv
-  - preprocessing: default_preprocessing
-  - training: default_training
-  - evaluation: default_evaluation
-  - sweeper: optuna_singlethread
\ No newline at end of file
diff --git a/src/psycopt2d/config/training/default_training.yaml b/src/psycopt2d/config/train/default_training.yaml
similarity index 100%
rename from src/psycopt2d/config/training/default_training.yaml
rename to src/psycopt2d/config/train/default_training.yaml
diff --git a/src/psycopt2d/evaluate_saved_model_predictions.py b/src/psycopt2d/evaluate_saved_model_predictions.py
index a665d34f..2b312724 100644
--- a/src/psycopt2d/evaluate_saved_model_predictions.py
+++ b/src/psycopt2d/evaluate_saved_model_predictions.py
@@ -14,7 +14,7 @@
 import pandas as pd
 from omegaconf import DictConfig
 
-from psycopt2d.utils import (
+from psycopt2d.utils.utils import (
     PROJECT_ROOT,
     infer_outcome_col_name,
     infer_predictor_col_name,
diff --git a/src/psycopt2d/evaluation.py b/src/psycopt2d/evaluation.py
index ff4a7235..cf345975 100644
--- a/src/psycopt2d/evaluation.py
+++ b/src/psycopt2d/evaluation.py
@@ -14,7 +14,7 @@
 from psycopt2d.tables.performance_by_threshold import (
     generate_performance_by_positive_rate_table,
 )
-from psycopt2d.utils import PROJECT_ROOT, positive_rate_to_pred_probs
+from psycopt2d.utils.utils import PROJECT_ROOT, positive_rate_to_pred_probs
 from psycopt2d.visualization import (
     plot_auc_by_time_from_first_visit,
     plot_feature_importances,
diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py
index 1f6ddf11..48a0b7c1 100644
--- a/src/psycopt2d/load.py
+++ b/src/psycopt2d/load.py
@@ -3,6 +3,7 @@
 from collections.abc import Iterable
 from datetime import datetime, timedelta
 from pathlib import Path
+from queue import Full
 from typing import Any, Optional, Union
 
 import pandas as pd
@@ -12,7 +13,8 @@
 from wasabi import Printer
 
 from psycopt2d.evaluate_saved_model_predictions import infer_look_distance
-from psycopt2d.utils import (
+from psycopt2d.utils.omegaconf_to_pydantic_objects import FullConfig
+from psycopt2d.utils.utils import (
     PROJECT_ROOT,
     coerce_to_datetime,
     get_percent_lost,
@@ -111,16 +113,16 @@ class DataLoader:
 
     def __init__(
         self,
-        spec: DatasetSpecification,
+        cfg: FullConfig,
     ):
-        self.spec = spec
+        self.cfg = cfg
 
         # File handling
-        self.dir_path = Path(spec.split_dir_path)
-        self.file_suffix = spec.file_suffix
+        self.dir_path = Path(cfg.data.dir)
+        self.file_suffix = cfg.data.suffix
 
         # Column specifications
-        self.pred_col_name_prefix = spec.pred_col_name_prefix
+        self.pred_col_name_prefix = cfg.data.pred_col_name_prefix
 
     def _load_dataset_file(  # pylint: disable=inconsistent-return-statements
         self,
@@ -182,12 +184,18 @@ def _drop_rows_if_datasets_ends_within_days(
         n_rows_before_modification = dataset.shape[0]
 
         if direction == "ahead":
-            max_datetime = dataset[self.spec.pred_time_colname].max() - n_days_timedelt
-            before_max_dt = dataset[self.spec.pred_time_colname] < max_datetime
+            max_datetime = (
+                dataset[self.cfg.data.pred_timestamp_col_name].max() - n_days_timedelt
+            )
+            before_max_dt = (
+                dataset[self.cfg.data.pred_timestamp_col_name] < max_datetime
+            )
             dataset = dataset[before_max_dt]
         elif direction == "behind":
-            min_datetime = dataset[self.spec.pred_time_colname].min() + n_days_timedelt
-            after_min_dt = dataset[self.spec.pred_time_colname] > min_datetime
+            min_datetime = (
+                dataset[self.cfg.data.pred_timestamp_col_name].min() + n_days_timedelt
+            )
+            after_min_dt = dataset[self.cfg.data.pred_timestamp_col_name] > min_datetime
             dataset = dataset[after_min_dt]
 
         n_rows_after_modification = dataset.shape[0]
@@ -211,7 +219,7 @@ def _drop_patients_with_event_in_washin(self, dataset) -> pd.DataFrame:
         # Remove dates before drop_patient_if_outcome_before_date
         outcome_before_date = (
             dataset["_timestamp_first_t2d"]
-            < self.spec.time.drop_patient_if_outcome_before_date
+            < self.cfg.data.drop_patient_if_outcome_before_date
         )
 
         patients_to_drop = set(dataset["dw_ek_borger"][outcome_before_date].unique())
@@ -219,8 +227,8 @@ def _drop_patients_with_event_in_washin(self, dataset) -> pd.DataFrame:
 
         # Removed dates before drop_patient_if_outcome_before_date
         dataset = dataset[
-            dataset[self.spec.pred_time_colname]
-            > self.spec.time.drop_patient_if_outcome_before_date
+            dataset[self.cfg.data.pred_timestamp_col_name]
+            > self.cfg.data.drop_patient_if_outcome_before_date
         ]
 
         n_rows_after_modification = dataset.shape[0]
@@ -250,7 +258,7 @@ def _drop_cols_not_in_lookbehind_combination(
             pd.DataFrame: Dataset with dropped columns.
         """
 
-        if not self.spec.time.lookbehind_combination:
+        if not self.cfg.data.lookbehind_combination:
             raise ValueError("No lookbehind_combination provided.")
 
         # Extract all unique lookbhehinds in the dataset predictors
@@ -261,14 +269,14 @@ def _drop_cols_not_in_lookbehind_combination(
         }
 
         # Convert list to set
-        lookbehinds_in_spec = set(self.spec.time.lookbehind_combination)
+        lookbehinds_in_spec = set(self.cfg.data.lookbehind_combination)
 
         # Check that all loobehinds in lookbehind_combination are used in the predictors
         if not lookbehinds_in_spec.issubset(
             lookbehinds_in_dataset,
         ):
             msg.warn(
-                f"One or more of the provided lookbehinds in lookbehind_combination is/are not used in any predictors in the dataset. Dataset: {lookbehinds_in_dataset}. lookbehind_combination: {self.spec.time.lookbehind_combination}.",
+                f"One or more of the provided lookbehinds in lookbehind_combination is/are not used in any predictors in the dataset. Dataset: {lookbehinds_in_dataset}. lookbehind_combination: {self.cfg.data.lookbehind_combination}.",
             )
 
             lookbehinds_to_keep = lookbehinds_in_spec.intersection(
@@ -384,10 +392,10 @@ def _drop_cols_and_rows_if_look_direction_not_met(
         for direction in ("ahead", "behind"):
 
             if direction in ("ahead", "behind"):
-                if self.spec.time.min_lookahead_days:
-                    n_days = self.spec.time.min_lookahead_days
-                elif self.spec.time.min_lookbehind_days:
-                    n_days = self.spec.time.min_lookbehind_days
+                if self.cfg.data.min_lookahead_days:
+                    n_days = self.cfg.data.min_lookahead_days
+                elif self.cfg.data.min_lookbehind_days:
+                    n_days = self.cfg.data.min_lookbehind_days
                 else:
                     continue
 
@@ -416,23 +424,23 @@ def _process_dataset(self, dataset: pd.DataFrame) -> pd.DataFrame:
         Returns:
             pd.DataFrame: Processed dataset
         """
-        if self.spec.time.drop_patient_if_outcome_before_date:
+        if self.cfg.data.drop_patient_if_outcome_before_date:
             dataset = add_washin_timestamps(dataset=dataset)
 
         dataset = self._convert_timestamp_dtype_and_nat(dataset)
-        if self.spec.time.drop_patient_if_outcome_before_date:
+        if self.cfg.data.drop_patient_if_outcome_before_date:
             dataset = self._drop_patients_with_event_in_washin(dataset=dataset)
 
         # Drop if later than min prediction time date
-        if self.spec.time.min_prediction_time_date:
+        if self.cfg.data.min_prediction_time_date:
             dataset = dataset[
-                dataset[self.spec.pred_time_colname]
-                > self.spec.time.min_prediction_time_date
+                dataset[self.cfg.data.pred_timestamp_col_name]
+                > self.cfg.data.min_prediction_time_date
             ]
 
         dataset = self._drop_cols_and_rows_if_look_direction_not_met(dataset=dataset)
 
-        if self.spec.time.lookbehind_combination:
+        if self.cfg.data.lookbehind_combination:
             dataset = self._drop_cols_not_in_lookbehind_combination(dataset=dataset)
 
         return dataset
@@ -455,15 +463,15 @@ def load_dataset_from_dir(
         msg.info(f"Loading {split_names}")
         # Handle input types
         for timedelta_arg in (
-            self.spec.time.min_lookbehind_days,
-            self.spec.time.min_lookahead_days,
+            self.cfg.data.min_lookbehind_days,
+            self.cfg.data.min_lookahead_days,
         ):
             if timedelta_arg:
                 timedelta_arg = timedelta(days=timedelta_arg)  # type: ignore
 
         for date_arg in (
-            self.spec.time.drop_patient_if_outcome_before_date,
-            self.spec.time.min_prediction_time_date,
+            self.cfg.data.drop_patient_if_outcome_before_date,
+            self.cfg.data.min_prediction_time_date,
         ):
             if isinstance(date_arg, str):
                 date_arg = coerce_to_datetime(
@@ -545,16 +553,12 @@ class Config:
     val: pd.DataFrame
 
 
-def load_train_and_val_from_cfg(cfg: DictConfig):
+def load_train_and_val_from_cfg(cfg: FullConfig):
     """Load train and validation data from file."""
 
-    data_specification = _init_spec_from_cfg(
-        cfg,
-    )
-
-    split = DataLoader(spec=data_specification)
+    loader = DataLoader(cfg=cfg)
 
     return SplitDataset(
-        train=split.load_dataset_from_dir(split_names="train"),
-        val=split.load_dataset_from_dir(split_names="val"),
+        train=loader.load_dataset_from_dir(split_names="train"),
+        val=loader.load_dataset_from_dir(split_names="val"),
     )
diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py
index 3305122c..d4ee469f 100644
--- a/src/psycopt2d/model_training_watcher.py
+++ b/src/psycopt2d/model_training_watcher.py
@@ -12,9 +12,13 @@
 from wasabi import msg
 
 from psycopt2d.evaluation import evaluate_model
-from psycopt2d.utils import (MODEL_PREDICTIONS_PATH, PROJECT_ROOT,
-                             infer_outcome_col_name, infer_y_hat_prob_col_name,
-                             load_evaluation_data)
+from psycopt2d.utils.utils import (
+    MODEL_PREDICTIONS_PATH,
+    PROJECT_ROOT,
+    infer_outcome_col_name,
+    infer_y_hat_prob_col_name,
+    load_evaluation_data,
+)
 
 # Path to the wandb directory
 WANDB_DIR = PROJECT_ROOT / "wandb"
diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py
index e9d78bbe..44ecbfe0 100644
--- a/src/psycopt2d/train_model.py
+++ b/src/psycopt2d/train_model.py
@@ -1,7 +1,7 @@
 """Training script for training a single model for predicting t2d."""
 import os
 from collections.abc import Iterable
-from typing import Optional
+from typing import Optional, Union
 
 import hydra
 import numpy as np
@@ -19,7 +19,11 @@
 from psycopt2d.feature_transformers import ConvertToBoolean, DateTimeConverter
 from psycopt2d.load import load_train_and_val_from_cfg
 from psycopt2d.models import MODELS
-from psycopt2d.utils import (
+from psycopt2d.utils.omegaconf_to_pydantic_objects import (
+    FullConfig,
+    omegaconf_to_pydantic_objects,
+)
+from psycopt2d.utils.utils import (
     PROJECT_ROOT,
     create_wandb_folders,
     flatten_nested_dict,
@@ -302,8 +306,11 @@ def get_col_names(cfg: DictConfig, train: pd.DataFrame) -> tuple[str, list[str]]
     config_name="default_config",
     version_base="1.2",
 )
-def main(cfg):
+def main(cfg: Union[FullConfig, DictConfig]):
     """Main function for training a single model."""
+    if not isinstance(cfg, FullConfig):
+        cfg = omegaconf_to_pydantic_objects(cfg)
+
     msg = Printer(timestamp=True)
 
     create_wandb_folders()
@@ -311,7 +318,7 @@ def main(cfg):
     run = wandb.init(
         project=cfg.project.name,
         reinit=True,
-        config=flatten_nested_dict(cfg, sep="."),
+        config=flatten_nested_dict(cfg.__dict__, sep="."),
         mode=cfg.project.wandb_mode,
         group=cfg.project.wandb_group,
     )
@@ -331,7 +338,7 @@ def main(cfg):
         pipe=pipe,
         outcome_col_name=outcome_col_name,
         train_col_names=train_col_names,
-        n_splits=cfg.training.n_splits,
+        n_splits=cfg.train.n_splits,
     )
 
     # Save model predictions, feature importance, and config to disk
diff --git a/src/psycopt2d/utils/__init__.py b/src/psycopt2d/utils/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py
index bd476e95..e1d1fa95 100644
--- a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py
+++ b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py
@@ -7,25 +7,124 @@
 - Type checkable
 """
 
+from datetime import datetime
+from pathlib import Path
+from typing import Optional, Union
+
 import pydantic
 from hydra import compose, initialize
-from omegaconf import DictConfig
+from omegaconf import DictConfig, OmegaConf
+from pydantic import BaseModel as PydanticBaseModel
+
+
+class BaseModel(PydanticBaseModel):
+    """Allow arbitrary types in all pydantic models."""
+
+    class Config:
+        """Allow arbitrary types"""
+
+        arbitrary_types_allowed = True
+
+
+class ProjectConf(BaseModel):
+    """Project configuration."""
+
+    name: str = "psycopt2d"
+    seed: int
+    wandb_group: str
+    wandb_mode: str
+
+
+class DataConf(BaseModel):
+    """Data configuration: where the dataset splits are read from and how they are filtered."""
+
+    n_training_samples: Optional[
+        int
+    ]  # (int, null): Number of training samples to use; null means all samples are used.
+    dir: Union[Path, str]  # (Path|str): Directory the dataset split files are loaded from.
+    suffix: str  # (str): File suffix to load.
+
+    # Feature specs
+    pred_col_name_prefix: str  # (str): Prefix of predictor columns
+    pred_timestamp_col_name: str  # (str): Column name for prediction times
+    outcome_timestamp_col_name: str  # (str): Column name for outcome timestamps
+    id_col_name: str  # (str): Column name for citizen ids
+
+    # Looking ahead
+    lookahead_days: int  # (int): Number of days from prediction time to look ahead for the outcome.
+    min_lookahead_days: Optional[
+        int
+    ]  # (int, null): Drop all prediction times where (max timestamp in the dataset) - (current timestamp) is less than min_lookahead_days
+    min_lookbehind_days: Optional[int]  # (int, null): Drop all prediction times where (prediction_timestamp) - (min timestamp in the dataset) is less than min_lookbehind_days
+    drop_patient_if_outcome_before_date: Optional[Union[str, datetime]]  # (str|datetime, null): Drop patients whose outcome is before this date, and prediction times that are not after it.
+
+    # Looking behind
+    # A null value below disables the corresponding filter in DataLoader.
+    min_prediction_time_date: Optional[Union[str, datetime]]  # (str|datetime, null): Keep only prediction times after this date.
+    lookbehind_combination: Optional[list[int]]  # (list[int], null): Lookbehinds whose predictor columns are kept; predictors with other lookbehinds are dropped.
+
+
+class PreprocessingConf(BaseModel):
+    """Preprocessing configuration: type conversion, imputation and scaling of predictors."""
+
+    convert_to_boolean: bool  # (bool): Convert all prediction values (except gender) to boolean. Defaults to False
+    convert_datetimes_to: Union[bool, str]  # (str|bool): "ordinal" or False. A plain `bool` field rejects "ordinal"; bool is listed first so False stays a bool instead of being coerced to the string "False".
+    imputation_method: Optional[str]  # (str|null): Options include "most_frequent"
+    transform: Optional[
+        str
+    ]  # (str|null): Transformation applied to all predictors after imputation. Options include "z-score-normalization"
+
+
+class ModelConf(BaseModel):
+    """Model configuration"""
+
+    model_name: str  # (str): Model, can currently take xgboost
+    require_imputation: bool  # (bool): Whether the model requires imputation. (shouldn't this be false?)
+    args: dict
+
+
+class TrainConf(BaseModel):
+    """Training configuration"""
+
+    n_splits: int  # TODO: How do we handle whether to use crossvalidation or train/val splitting?
+
+
+class EvalConf(BaseModel):
+    """Evaluation config"""
+
+    threshold_percentiles: list[int]
+
+    # top n features to plot. A table with all features is also logged
+    top_n_feature_importances: int
+
+    positive_rate_thresholds: list[int]
+    save_model_predictions_on_overtaci: bool
+    date_bins_ahead: list[int]
+    date_bins_behind: list[int]
+
 
+class FullConfig(BaseModel):
+    """A full configuration object."""
 
-def omegaconf_to_pydantic_cfg(cfg: DictConfig) -> pydantic.BaseModel:
-    """Convert OmegaConf to pydantic config."""
-    return pydantic.parse_obj_as(pydantic.BaseModel, cfg)
+    project: ProjectConf
+    data: DataConf
+    preprocessing: PreprocessingConf
+    model: ModelConf
+    train: TrainConf
+    eval: EvalConf
 
 
-def main():
-    with initialize(version_base=None, config_path="../src/psycopt2d/config/"):
-        cfg = compose(
-            config_name="defualt_config.yaml",
-        )
+def omegaconf_to_pydantic_objects(conf: DictConfig) -> FullConfig:
+    """Converts an omegaconf DictConfig to a pydantic object.
 
-    pydantic_obj = omegaconf_to_pydantic_cfg(cfg)
+    Args:
+        conf (DictConfig): Omegaconf DictConfig
 
-    pass
+    Returns:
+        FullConfig: Pydantic object
+    """
+    conf = OmegaConf.to_container(conf, resolve=True)  # type: ignore
+    return FullConfig(**conf)
 
 
 if __name__ == "__main__":
diff --git a/src/psycopt2d/utils.py b/src/psycopt2d/utils/utils.py
similarity index 98%
rename from src/psycopt2d/utils.py
rename to src/psycopt2d/utils/utils.py
index 976e56db..1ad2519a 100644
--- a/src/psycopt2d/utils.py
+++ b/src/psycopt2d/utils/utils.py
@@ -18,8 +18,9 @@
 from wandb.sdk.wandb_run import Run  # pylint: disable=no-name-in-module
 from wasabi import msg
 
-from psycopt2d.dataclasses.configs import ModelEvalData
+from psycopt2d.configs import ModelEvalData
 from psycopt2d.model_performance import ModelPerformance
+from psycopt2d.utils.omegaconf_to_pydantic_objects import FullConfig
 
 SHARED_RESOURCES_PATH = Path(r"E:\shared_resources")
 FEATURE_SETS_PATH = SHARED_RESOURCES_PATH / "feature_sets"
@@ -27,6 +28,7 @@
 RAW_DATA_VALIDATION_PATH = SHARED_RESOURCES_PATH / "raw_data_validation"
 FEATURIZERS_PATH = SHARED_RESOURCES_PATH / "featurizers"
 MODEL_PREDICTIONS_PATH = SHARED_RESOURCES_PATH / "model_predictions"
+
 PROJECT_ROOT = Path(__file__).resolve().parents[2]
 
 
@@ -299,7 +301,7 @@ def get_feature_importance_dict(pipe: Pipeline) -> Union[None, dict[str, float]]
 
 def prediction_df_with_metadata_to_disk(
     df: pd.DataFrame,
-    cfg: DictConfig,
+    cfg: FullConfig,
     pipe: Pipeline,
     run: Optional[Run] = None,
 ) -> None:
@@ -321,7 +323,7 @@ def prediction_df_with_metadata_to_disk(
     else:
         run_descriptor = f"{timestamp}_{model_args}"[:100]
 
-    if cfg.evaluation.save_model_predictions_on_overtaci:
+    if cfg.eval.save_model_predictions_on_overtaci:
         # Save to overtaci
         dir_path = MODEL_PREDICTIONS_PATH / cfg.project.name / run_descriptor
     else:
diff --git a/src/psycopt2d/visualization/performance_over_time.py b/src/psycopt2d/visualization/performance_over_time.py
index e5ca1dac..b5c1d265 100644
--- a/src/psycopt2d/visualization/performance_over_time.py
+++ b/src/psycopt2d/visualization/performance_over_time.py
@@ -11,7 +11,7 @@
 import pandas as pd
 from sklearn.metrics import f1_score, roc_auc_score
 
-from psycopt2d.utils import bin_continuous_data, round_floats_to_edge
+from psycopt2d.utils.utils import bin_continuous_data, round_floats_to_edge
 from psycopt2d.visualization.base_charts import plot_basic_chart
 
 
diff --git a/src/psycopt2d/visualization/prob_over_time.py b/src/psycopt2d/visualization/prob_over_time.py
index 7adba6a3..4a9fc2ac 100644
--- a/src/psycopt2d/visualization/prob_over_time.py
+++ b/src/psycopt2d/visualization/prob_over_time.py
@@ -141,7 +141,7 @@ def plot_prob_over_time(
 
 
 if __name__ == "__main__":
-    from psycopt2d.utils import PROJECT_ROOT
+    from psycopt2d.utils.utils import PROJECT_ROOT
 
     path = PROJECT_ROOT / "tests" / "test_data" / "synth_eval_data.csv"
     df = pd.read_csv(path)
diff --git a/src/psycopt2d/visualization/sens_over_time.py b/src/psycopt2d/visualization/sens_over_time.py
index c7f6be5a..306164ab 100644
--- a/src/psycopt2d/visualization/sens_over_time.py
+++ b/src/psycopt2d/visualization/sens_over_time.py
@@ -9,7 +9,7 @@
 import numpy as np
 import pandas as pd
 
-from psycopt2d.utils import PROJECT_ROOT, round_floats_to_edge
+from psycopt2d.utils.utils import PROJECT_ROOT, round_floats_to_edge
 
 
 def create_sensitivity_by_time_to_outcome_df(
@@ -303,7 +303,7 @@ def plot_sensitivity_by_time_to_outcome_heatmap(
 
     Examples:
         >>> from pathlib import Path
-        >>> from psycopt2d.utils import positive_rate_to_pred_probs
+        >>> from psycopt2d.utils.utils import positive_rate_to_pred_probs
 
         >>> repo_path = Path(__file__).parent.parent.parent.parent
         >>> path = repo_path / "tests" / "test_data" / "synth_eval_data.csv"
@@ -381,7 +381,7 @@ def plot_sensitivity_by_time_to_outcome_heatmap(
 
 
 if __name__ == "__main__":
-    from psycopt2d.utils import positive_rate_to_pred_probs
+    from psycopt2d.utils.utils import positive_rate_to_pred_probs
 
     path = PROJECT_ROOT / "tests" / "test_data" / "synth_eval_data.csv"
     df = pd.read_csv(path)
diff --git a/tests/test_auc_by_group_table.py b/tests/test_auc_by_group_table.py
index b4103307..4f01fcef 100644
--- a/tests/test_auc_by_group_table.py
+++ b/tests/test_auc_by_group_table.py
@@ -2,7 +2,7 @@
 # pylint: disable=missing-function-docstring
 
 from psycopt2d.tables import auc_by_group_table
-from psycopt2d.utils import bin_continuous_data
+from psycopt2d.utils.utils import bin_continuous_data
 
 
 def test_auc_by_group_table(synth_data):
diff --git a/tests/test_calculate_performance_metrics.py b/tests/test_calculate_performance_metrics.py
index db630a4f..0f053e71 100644
--- a/tests/test_calculate_performance_metrics.py
+++ b/tests/test_calculate_performance_metrics.py
@@ -1,5 +1,5 @@
 # import wandb
-# from psycopt2d.utils import calculate_performance_metrics
+# from psycopt2d.utils.utils import calculate_performance_metrics
 
 
 # def test_log_performance_metrics(synth_data):
diff --git a/tests/test_load.py b/tests/test_load.py
index c0be3a58..58bafbdc 100644
--- a/tests/test_load.py
+++ b/tests/test_load.py
@@ -2,6 +2,7 @@
 from hydra import compose, initialize
 
 from psycopt2d.load import load_train_and_val_from_cfg
+from psycopt2d.utils.omegaconf_to_pydantic_objects import omegaconf_to_pydantic_objects
 
 
 def test_load_lookbehind_exceeds_lookbehind_threshold():
@@ -10,14 +11,14 @@ def test_load_lookbehind_exceeds_lookbehind_threshold():
     with initialize(version_base=None, config_path="../src/psycopt2d/config/"):
         cfg = compose(
             config_name="integration_testing.yaml",
-            overrides=[
-                "++data.min_lookbehind_days=90",
-            ],
         )
 
+        cfg = omegaconf_to_pydantic_objects(cfg)
+
+        cfg.data.min_lookahead_days = 90
         split_dataset = load_train_and_val_from_cfg(cfg)
 
-        assert split_dataset.train.shape == (644, 7)
+        assert split_dataset.train.shape == (644, 6)
 
 
 def test_load_lookbehind_not_in_lookbehind_combination():
@@ -26,11 +27,11 @@ def test_load_lookbehind_not_in_lookbehind_combination():
     with initialize(version_base=None, config_path="../src/psycopt2d/config/"):
         cfg = compose(
             config_name="integration_testing.yaml",
-            overrides=[
-                "++data.lookbehind_combination=[30]",
-            ],
         )
 
+        cfg = omegaconf_to_pydantic_objects(cfg)
+
+        cfg.data.lookbehind_combination = [30]
         split_dataset = load_train_and_val_from_cfg(cfg)
 
         assert split_dataset.train.shape == (700, 6)
diff --git a/tests/test_performance_by_threshold.py b/tests/test_performance_by_threshold.py
index 6d3b0630..499640e0 100644
--- a/tests/test_performance_by_threshold.py
+++ b/tests/test_performance_by_threshold.py
@@ -14,7 +14,7 @@
     days_from_first_positive_to_diagnosis,
     generate_performance_by_positive_rate_table,
 )
-from psycopt2d.utils import positive_rate_to_pred_probs
+from psycopt2d.utils.utils import positive_rate_to_pred_probs
 
 
 @pytest.fixture(scope="function")
diff --git a/tests/test_train_model.py b/tests/test_train_model.py
index 4538dc14..ef246e50 100644
--- a/tests/test_train_model.py
+++ b/tests/test_train_model.py
@@ -5,10 +5,11 @@
 
 from psycopt2d.models import MODELS
 from psycopt2d.train_model import main
+from psycopt2d.utils.omegaconf_to_pydantic_objects import omegaconf_to_pydantic_objects
 
 CONFIG_DIR_PATH = "../src/psycopt2d/config/"
 INTEGRATION_TEST_FILE_NAME = "integration_testing.yaml"
-INTEGRATION_TESTING_MODEL_OVERRIDE = "+model=logistic-regression"
+INTEGRATION_TESTING_MODEL_OVERRIDE = "model=logistic-regression"
 
 
 @pytest.mark.parametrize("model_name", MODELS.keys())
@@ -18,13 +19,15 @@ def test_main(model_name):
 
         cfg = compose(
             config_name=INTEGRATION_TEST_FILE_NAME,
-            overrides=[f"+model={model_name}"],
+            overrides=[f"model={model_name}"],
         )
 
+        cfg = omegaconf_to_pydantic_objects(cfg)
+
         # XGBoost should train on GPU on Overtaci,
         # but CPU during integration testing
         if model_name == "xgboost":
-            cfg.model.args.tree_method = "auto"
+            cfg.model.args["tree_method"] = "auto"
 
         main(cfg)
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index b3ddf1fe..eac256d4 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,10 +1,16 @@
 """Testing of the utils module."""
 # pylint: disable=missing-function-docstring
+from pathlib import Path
+
 import numpy as np
 import pandas as pd
+import pytest
+from hydra import compose, initialize
 from utils_for_testing import str_to_df
 
-from psycopt2d.utils import (
+from psycopt2d.utils.omegaconf_to_pydantic_objects import omegaconf_to_pydantic_objects
+from psycopt2d.utils.utils import (
+    PROJECT_ROOT,
     drop_records_if_datediff_days_smaller_than,
     flatten_nested_dict,
 )
@@ -50,3 +56,23 @@ def test_flatten_nested_dict():
     output_dict = flatten_nested_dict(input_dict)
 
     assert expected_dict == output_dict
+
+
+CONFIG_DIR_PATH_ABS = PROJECT_ROOT / "psycopt2d" / "config"
+CONFIG_DIR_PATH_REL = "../src/psycopt2d/config"
+
+
+def get_config_file_names() -> list[str]:
+    """Get all config file names"""
+    config_file_paths: list[Path] = list(CONFIG_DIR_PATH_ABS.glob("*.yaml"))
+    return [f"{path.stem}.yaml" for path in config_file_paths]
+
+
+@pytest.mark.parametrize("config_file_name", get_config_file_names())
+def test_configs(config_file_name):
+    with initialize(version_base=None, config_path=CONFIG_DIR_PATH_REL):
+        cfg = compose(
+            config_name=config_file_name,
+        )
+
+    cfg = omegaconf_to_pydantic_objects(conf=cfg)
diff --git a/tests/test_visualizations.py b/tests/test_visualizations.py
index b108e04f..963fa5d1 100644
--- a/tests/test_visualizations.py
+++ b/tests/test_visualizations.py
@@ -10,7 +10,7 @@
 import pytest
 from sklearn.metrics import f1_score, roc_auc_score
 
-from psycopt2d.utils import positive_rate_to_pred_probs
+from psycopt2d.utils.utils import positive_rate_to_pred_probs
 from psycopt2d.visualization import plot_prob_over_time
 from psycopt2d.visualization.base_charts import plot_basic_chart
 from psycopt2d.visualization.feature_importance import plot_feature_importances

From aaa25c165b190ab4d89aeb8a5543779c28c10765 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Fri, 21 Oct 2022 11:45:18 +0200
Subject: [PATCH 17/57] style: linting

---
 application/train_and_log_models.py           | 14 ++++++------
 src/psycopt2d/load.py                         |  1 -
 .../utils/omegaconf_to_pydantic_objects.py    | 22 +++++++------------
 src/psycopt2d/utils/utils.py                  |  1 -
 tests/test_utils.py                           |  2 +-
 5 files changed, 16 insertions(+), 24 deletions(-)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index adc8bcab..2e340fa5 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -20,7 +20,6 @@
     infer_predictor_col_name,
 )
 from psycopt2d.load import DataLoader, DatasetSpecification, DatasetTimeSpecification
-from psycopt2d.utils.utils import PROJECT_ROOT
 
 msg = Printer(timestamp=True)
 
@@ -178,7 +177,7 @@ def train_models_for_each_cell_in_grid(
 
         cell = lookbehind_combinations.pop()
         msg.info(
-            f"Spawning a new trainer with lookbehind={cell.lookbehind} and lookahead={cell.lookahead}"
+            f"Spawning a new trainer with lookbehind={cell.lookbehind} and lookahead={cell.lookahead}",
         )
 
         wandb_group = f"{wandb_prefix}-beh-{cell.lookbehind}-ahead-{cell.lookahead}"
@@ -207,7 +206,7 @@ def train_models_for_each_cell_in_grid(
         active_trainers.append(
             subprocess.Popen(  # pylint: disable=consider-using-with
                 args=subprocess_args,
-            )
+            ),
         )
 
 
@@ -255,7 +254,8 @@ def train_models_for_each_cell_in_grid(
     )
 
     dataset_spec = get_dataset_spec(
-        data_dir_path=meta_conf.data_dir, file_suffix=cfg.data.suffix
+        data_dir_path=meta_conf.data_dir,
+        file_suffix=cfg.data.suffix,
     )
     train = load_train_for_inference(dataset_spec=dataset_spec)
 
@@ -280,11 +280,11 @@ def train_models_for_each_cell_in_grid(
     if not train_conf.gpu:
         msg.warn("Not using GPU for training")
 
-    clean_dir_seconds = 0
+    CLEAN_DIR_SECONDS = 0
     msg.info(
-        f"Sleeping for {clean_dir_seconds} seconds to allow watcher to start and clean dir"
+        f"Sleeping for {CLEAN_DIR_SECONDS} seconds to allow watcher to start and clean dir",
     )
-    time.sleep(clean_dir_seconds)
+    time.sleep(CLEAN_DIR_SECONDS)
 
     train_models_for_each_cell_in_grid(train_conf=train_conf, wandb_conf=wandb_conf)
 
diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py
index 48a0b7c1..1403df98 100644
--- a/src/psycopt2d/load.py
+++ b/src/psycopt2d/load.py
@@ -3,7 +3,6 @@
 from collections.abc import Iterable
 from datetime import datetime, timedelta
 from pathlib import Path
-from queue import Full
 from typing import Any, Optional, Union
 
 import pandas as pd
diff --git a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py
index e1d1fa95..340ca6f2 100644
--- a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py
+++ b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py
@@ -1,9 +1,9 @@
-"""Utilities for converting config yamls to pydantic objects. 
+"""Utilities for converting config yamls to pydantic objects.
 
 Helpful because it makes them:
 - Addressable with intellisense,
-- Refactorable with IDEs, 
-- Easier to document with docstrings and 
+- Refactorable with IDEs,
+- Easier to document with docstrings and
 - Type checkable
 """
 
@@ -11,8 +11,6 @@
 from pathlib import Path
 from typing import Optional, Union
 
-import pydantic
-from hydra import compose, initialize
 from omegaconf import DictConfig, OmegaConf
 from pydantic import BaseModel as PydanticBaseModel
 
@@ -21,7 +19,7 @@ class BaseModel(PydanticBaseModel):
     """Allow arbitrary types in all pydantic models."""
 
     class Config:
-        """Allow arbitrary types"""
+        """Allow arbitrary types."""
 
         arbitrary_types_allowed = True
 
@@ -65,7 +63,7 @@ class DataConf(BaseModel):
 
 
 class PreprocessingConf(BaseModel):
-    """Preprocessing config"""
+    """Preprocessing config."""
 
     convert_to_boolean: bool  # (Boolean): Convert all prediction values (except gender) to boolean. Defaults to False
     convert_datetimes_to: bool  # (str): Options include ordinal or False
@@ -76,7 +74,7 @@ class PreprocessingConf(BaseModel):
 
 
 class ModelConf(BaseModel):
-    """Model configuration"""
+    """Model configuration."""
 
     model_name: str  # (str): Model, can currently take xgboost
     require_imputation: bool  # (bool): Whether the model requires imputation. (shouldn't this be false?)
@@ -84,13 +82,13 @@ class ModelConf(BaseModel):
 
 
 class TrainConf(BaseModel):
-    """Training configuration"""
+    """Training configuration."""
 
     n_splits: int  # TODO: How do we handle whether to use crossvalidation or train/val splitting?
 
 
 class EvalConf(BaseModel):
-    """Evaluation config"""
+    """Evaluation config."""
 
     threshold_percentiles: list[int]
 
@@ -125,7 +123,3 @@ def omegaconf_to_pydantic_objects(conf: DictConfig) -> FullConfig:
     """
     conf = OmegaConf.to_container(conf, resolve=True)  # type: ignore
     return FullConfig(**conf)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/psycopt2d/utils/utils.py b/src/psycopt2d/utils/utils.py
index 1ad2519a..e13ae035 100644
--- a/src/psycopt2d/utils/utils.py
+++ b/src/psycopt2d/utils/utils.py
@@ -13,7 +13,6 @@
 import dill as pkl
 import numpy as np
 import pandas as pd
-from omegaconf.dictconfig import DictConfig
 from sklearn.pipeline import Pipeline
 from wandb.sdk.wandb_run import Run  # pylint: disable=no-name-in-module
 from wasabi import msg
diff --git a/tests/test_utils.py b/tests/test_utils.py
index eac256d4..b5a40dbc 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -63,7 +63,7 @@ def test_flatten_nested_dict():
 
 
 def get_config_file_names() -> list[str]:
-    """Get all config file names"""
+    """Get all config file names."""
     config_file_paths: list[Path] = list(CONFIG_DIR_PATH_ABS.glob("*.yaml"))
     return [f"{path.stem}.yaml" for path in config_file_paths]
 

From 76ba99d42c2ec8957642ae14615d19543dfa3fe3 Mon Sep 17 00:00:00 2001
From: HLasse <lasseh0310@gmail.com>
Date: Fri, 21 Oct 2022 12:03:54 +0200
Subject: [PATCH 18/57] fix: make watcher not archive runs that haven't
 finished

---
 src/psycopt2d/model_training_watcher.py | 53 +++++++++++++++++++------
 1 file changed, 40 insertions(+), 13 deletions(-)

diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py
index 3305122c..8d7408d2 100644
--- a/src/psycopt2d/model_training_watcher.py
+++ b/src/psycopt2d/model_training_watcher.py
@@ -12,9 +12,13 @@
 from wasabi import msg
 
 from psycopt2d.evaluation import evaluate_model
-from psycopt2d.utils import (MODEL_PREDICTIONS_PATH, PROJECT_ROOT,
-                             infer_outcome_col_name, infer_y_hat_prob_col_name,
-                             load_evaluation_data)
+from psycopt2d.utils import (
+    MODEL_PREDICTIONS_PATH,
+    PROJECT_ROOT,
+    infer_outcome_col_name,
+    infer_y_hat_prob_col_name,
+    load_evaluation_data,
+)
 
 # Path to the wandb directory
 WANDB_DIR = PROJECT_ROOT / "wandb"
@@ -32,6 +36,7 @@ class ModelTrainingWatcher:
         model_data_dir: Where to look for evaluation results.
         overtaci: Whether the script is running on overtaci. Determines where
             to look for the evaluation results.
+        verbose: Whether to print verbose output.
     """
 
     def __init__(
@@ -40,6 +45,7 @@ def __init__(
         project_name: str,
         n_runs_before_eval: int,
         model_data_dir: Path,
+        verbose: bool = False,
     ):
         self.entity = entity
         self.project_name = project_name
@@ -47,6 +53,7 @@ def __init__(
 
         self.n_runs_before_eval = n_runs_before_eval
 
+        self.verbose = verbose
         # A queue for runs waiting to be uploaded to WandB
         self.run_id_eval_candidates_queue = []
         self.max_performance = 0
@@ -76,10 +83,16 @@ def get_new_runs_and_evaluate(self) -> None:
 
     def _upload_run_dir(self, run_dir: Path) -> None:
         """Upload a single run to wandb."""
-        subprocess.run(
+        # get stdout from subprocess.run
+        proc = subprocess.run(
             ["wandb", "sync", str(run_dir), "--project", self.project_name],
             check=True,
+            capture_output=True,
         )
+        stdout = proc.stdout.decode("utf-8")
+        if self.verbose:
+            msg.info(f"Watcher: {stdout}")
+        return stdout
 
     def _archive_run_dir(self, run_dir: Path) -> None:
         """Move a run to the archive folder."""
@@ -92,15 +105,16 @@ def _get_run_id(self, run_dir: Path) -> str:
     def upload_unarchived_runs(self) -> None:
         """Upload unarchived runs to wandb."""
         for run_folder in WANDB_DIR.glob(r"offline-run*"):
-            # TODO: We need some kind of test here to figure out if the run is
-            # still running or not. If it is still running, we should wait
-            # until it is finished. Otherwise, we get a "permission denied" error.
             run_id = self._get_run_id(run_folder)
 
-            self._upload_run_dir(run_folder)
+            wandb_sync_stdout = self._upload_run_dir(run_folder)
 
             # TODO: If upload_run_dir fails, we should not archive the run.
             # use return from subprocess.run to check if it failed. See docs: https://docs.python.org/3/library/subprocess.html
+            if ".wandb file is empty" in wandb_sync_stdout:
+                if self.verbose:
+                    msg.warn(f"Run {run_id} is still running. Skipping.")
+                continue
             self._archive_run_dir(run_folder)
             self.run_id_eval_candidates_queue.append(run_id)
 
@@ -144,9 +158,10 @@ def _get_run_performance(self, run_id: str) -> float:
         run = self._get_wandb_run(run_id)
         if "roc_auc_unweighted" in run.summary:
             return run.summary.roc_auc_unweighted
-        msg.info(
-            f"Run {run_id} has no performance metric. Pinging again at next eval time.",
-        )
+        if self.verbose:
+            msg.info(
+                f"Run {run_id} has no performance metric. Pinging again at next eval time.",
+            )
         return None
 
     def evaluate_best_runs(self) -> None:
@@ -157,7 +172,11 @@ def evaluate_best_runs(self) -> None:
         }
         # sort runs by performance to not upload subpar runs
         run_performances = dict(
-            sorted(run_performances.items(), key=lambda item: item[1], reverse=True),
+            sorted(
+                run_performances.items(),
+                key=lambda item: (item[1] is not None, item[1]),
+                reverse=True,
+            ),
         )
         # get runs with auc of None (attempted upload before run finished)
         unfinished_runs = [
@@ -165,7 +184,7 @@ def evaluate_best_runs(self) -> None:
         ]
 
         for run_id, performance in run_performances.items():
-            if performance > self.max_performance:
+            if performance is not None and performance > self.max_performance:
                 msg.good(f"New record performance! AUC: {performance}")
                 self.max_performance = performance
                 self._do_evaluation(run_id)
@@ -217,6 +236,13 @@ def float_or_none(arg: str) -> Optional[float]:
         help="Archive all runs in the wandb dir before starting",
         required=True,
     )
+    parser.add_argument(
+        "--verbose",
+        type=lambda x: bool(strtobool(x)),
+        help="Whether to print verbose messages (default: False)",
+        required=False,
+        default=False,
+    )
     args = parser.parse_args()
 
     model_data_dir = (
@@ -230,6 +256,7 @@ def float_or_none(arg: str) -> Optional[float]:
         project_name=args.project_name,
         n_runs_before_eval=args.n_runs_before_eval,
         model_data_dir=model_data_dir,
+        verbose=args.verbose,
     )
     if args.clean_wandb_dir:
         watcher.archive_all_runs()

From ab2291d4fa8d584f9f0b03a27d839b4a9f00a240 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Fri, 21 Oct 2022 12:04:03 +0200
Subject: [PATCH 19/57] refactor: refactor train_and_log_models

---
 application/train_and_log_models.py           | 161 ++++--------------
 .../config/project/default_project.yaml       |   5 +-
 .../project/integration_test_project.yaml     |   1 +
 .../project/watcher/default_watcher.yaml      |   2 +
 .../config/train/default_training.yaml        |   1 +
 .../utils/omegaconf_to_pydantic_objects.py    |  12 ++
 6 files changed, 53 insertions(+), 129 deletions(-)
 create mode 100644 src/psycopt2d/config/project/watcher/default_watcher.yaml

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index 2e340fa5..a0a8b3a4 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -8,10 +8,9 @@
 import random
 import subprocess
 import time
-from pathlib import Path
 
 from hydra import compose, initialize
-from pydantic import BaseModel, Field
+from pydantic import BaseModel
 from wasabi import Printer
 
 from psycopt2d.evaluate_saved_model_predictions import (
@@ -19,7 +18,11 @@
     infer_outcome_col_name,
     infer_predictor_col_name,
 )
-from psycopt2d.load import DataLoader, DatasetSpecification, DatasetTimeSpecification
+from psycopt2d.load import DataLoader
+from psycopt2d.utils.omegaconf_to_pydantic_objects import (
+    FullConfig,
+    omegaconf_to_pydantic_objects,
+)
 
 msg = Printer(timestamp=True)
 
@@ -31,70 +34,9 @@ class PossibleLookDistanceDays(BaseModel):
     behind: list[str]
 
 
-class MetaConf(BaseModel):
-    """Meta configuration for the script."""
-
-    conf_name: str = Field("integration_testing.yaml")
-    data_dir: Path = Path(
-        "/Users/au484925/Desktop/psycop-t2d/tests/test_data/synth_splits/",
-    )
-    overtaci: str = Field(
-        default="false",
-        description="Change to 'true' if running on overtaci",
-    )
-
-
-class WatcherConf(BaseModel):
-    """Configuration for the watcher."""
-
-    archive_all: str = Field(
-        default="false",
-        description="Whether to archive all runs in the wandb folder before starting model training. Change to 't' to archive all wandb runs",
-    )
-    n_runs_before_first_eval: int = Field(
-        default="1",
-        description="The number of runs to upload to wandb before evaluating the best runs.",
-    )
-    keep_alive_after_training_minutes: int = Field(
-        default=5,
-        description="minutes to wait for the wandb watcher after training has finished. Will kill the watcher after this time.",
-    )
-
-
-class WandbConf(BaseModel):
-    """Configuration for wandb."""
-
-    project_name: str = "psycopt2d-testing"
-    entity: str = Field(
-        default="psycop",
-        description="The wandb entity to upload to (e.g. 'psycop' or your user name)",
-    )
-    mode: str = Field(default="online", description="The wandb mode to use")
-
-
-class TrainConf(BaseModel):
-    """Configuration for model training."""
-
-    gpu: bool = Field(default="false", description="Whether to use GPU")
-
-    n_trials_per_cell_in_grid: int = Field(
-        default=50,
-        description="Number of trials per cell in the lookahead/lookbehind grid. If n > 1, automatically triggers multirun.",
-    )
-
-    model_conf: str = Field(
-        default="xgboost",
-        description="The model conf to open. For example, 'xgboost' or 'logistic_regression'.",
-    )
-
-    conf_name: str = Field(default="integration_testing.yaml")
-
-    possible_look_distances: PossibleLookDistanceDays
-
-
-def load_train_for_inference(dataset_spec):
+def load_train_for_inference(cfg: FullConfig):
     """Load the data."""
-    loader = DataLoader(dataset_spec)
+    loader = DataLoader(cfg=cfg)
     msg.info("Loading datasets for look direction inference")
     return loader.load_dataset_from_dir(split_names="train")
 
@@ -117,24 +59,6 @@ def infer_possible_look_directions(train):
     )
 
 
-def get_dataset_spec(data_dir_path: Path, file_suffix: str):
-    """Get dataset specification."""
-    time_spec = DatasetTimeSpecification(
-        drop_patient_if_outcome_before_date=None,
-        min_prediction_time_date="1979-01-01",
-        min_lookbehind_days=0,
-        min_lookahead_days=0,
-    )
-
-    return DatasetSpecification(
-        file_suffix=file_suffix,
-        time=time_spec,
-        pred_col_name_prefix="pred_",
-        pred_time_colname="timestamp",
-        split_dir_path=data_dir_path,
-    )
-
-
 class LookDirectionCombination(BaseModel):
     """A combination of lookbehind and lookahead days."""
 
@@ -143,8 +67,9 @@ class LookDirectionCombination(BaseModel):
 
 
 def train_models_for_each_cell_in_grid(
-    train_conf: TrainConf,
-    wandb_conf: WandbConf,
+    cfg: FullConfig,
+    possible_look_distances: PossibleLookDistanceDays,
+    config_file_name: str,
 ):
     """Train a model for each cell in the grid of possible look distances."""
     from random_word import RandomWords
@@ -154,8 +79,8 @@ def train_models_for_each_cell_in_grid(
     # Create all combinations of lookbehind and lookahead days
     lookbehind_combinations = [
         LookDirectionCombination(lookbehind=lookbehind, lookahead=lookahead)
-        for lookbehind in train_conf.possible_look_distances.behind
-        for lookahead in train_conf.possible_look_distances.ahead
+        for lookbehind in possible_look_distances.behind
+        for lookahead in possible_look_distances.ahead
     ]
 
     lookbehind_combinations = [
@@ -185,20 +110,20 @@ def train_models_for_each_cell_in_grid(
         subprocess_args: list[str] = [
             "python",
             "src/psycopt2d/train_model.py",
-            f"model={train_conf.model_conf}",
+            f"model={cfg.model.model_name}",
             f"data.min_lookbehind_days={cell.lookbehind}",
             f"data.min_lookahead_days={cell.lookahead}",
             f"project.wandb_group='{wandb_group}'",
-            f"hydra.sweeper.n_trials={train_conf.n_trials_per_cell_in_grid}",
-            f"project.wandb_mode={wandb_conf.mode}",
+            f"hydra.sweeper.n_trials={cfg.train.n_trials_per_lookdirection_combination}",
+            f"project.wandb_mode={cfg.project.wandb_mode}",
             "--config-name",
-            f"{meta_conf.conf_name}",
+            f"{config_file_name}",
         ]
 
-        if train_conf.n_trials_per_cell_in_grid > 1:
+        if cfg.train.n_trials_per_lookdirection_combination > 1:
             subprocess_args.insert(2, "--multirun")
 
-        if train_conf.model_conf == "xgboost" and not train_conf.gpu:
+        if cfg.model.model_name == "xgboost" and not cfg.project.gpu:
             subprocess_args.insert(3, "++model.args.tree_method='auto'")
 
         msg.info(f'{" ".join(subprocess_args)}')
@@ -220,44 +145,28 @@ def train_models_for_each_cell_in_grid(
             config_name=CONFIG_FILE_NAME,
         )
 
-    meta_conf = MetaConf(
-        conf_name=CONFIG_FILE_NAME,
-        overtaci="false",
-        data_dir=cfg.data.dir,
-    )
-
-    wandb_conf = WandbConf(
-        entity="psycop",
-        project_name="psycopt2d-testing",
-        mode=cfg.project.wandb_mode,
-    )
-
-    watcher_conf = WatcherConf(archive_all="false", keep_alive_after_training_minutes=5)
+        cfg = omegaconf_to_pydantic_objects(cfg)
 
     watcher = subprocess.Popen(  # pylint: disable=consider-using-with
         [
             "python",
             "src/psycopt2d/model_training_watcher.py",
             "--entity",
-            wandb_conf.entity,
+            cfg.project.wandb_entity,
             "--project_name",
-            wandb_conf.project_name,
+            cfg.project.name,
             "--n_runs_before_eval",
-            str(watcher_conf.n_runs_before_first_eval),
+            str(cfg.project.watcher.n_runs_before_eval),
             "--overtaci",
-            meta_conf.overtaci,
+            cfg.eval.save_model_predictions_on_overtaci,
             "--timeout",
             "None",
             "--clean_wandb_dir",
-            watcher_conf.archive_all,
+            cfg.project.watcher.archive_all,
         ],
     )
 
-    dataset_spec = get_dataset_spec(
-        data_dir_path=meta_conf.data_dir,
-        file_suffix=cfg.data.suffix,
-    )
-    train = load_train_for_inference(dataset_spec=dataset_spec)
+    train = load_train_for_inference(cfg=cfg)
 
     possible_look_distances = infer_possible_look_directions(train)
 
@@ -269,15 +178,7 @@ def train_models_for_each_cell_in_grid(
     msg.info(f"Possible lookbehind days: {possible_look_distances.behind}")
     msg.info(f"Possible lookahead days: {possible_look_distances.ahead}")
 
-    train_conf = TrainConf(
-        conf_name=meta_conf.conf_name,
-        model_conf="xgboost",
-        n_trials_per_cell_in_grid=1,
-        possible_look_distances=possible_look_distances,
-        gpu=True,
-    )
-
-    if not train_conf.gpu:
+    if not cfg.project.gpu:
         msg.warn("Not using GPU for training")
 
     CLEAN_DIR_SECONDS = 0
@@ -286,12 +187,16 @@ def train_models_for_each_cell_in_grid(
     )
     time.sleep(CLEAN_DIR_SECONDS)
 
-    train_models_for_each_cell_in_grid(train_conf=train_conf, wandb_conf=wandb_conf)
+    train_models_for_each_cell_in_grid(
+        cfg=cfg,
+        possible_look_distances=possible_look_distances,
+        config_file_name=CONFIG_FILE_NAME,
+    )
 
     msg.good(
         f"Training finished. Stopping the watcher in {watcher_conf.keep_alive_after_training_minutes} minutes...",
     )
 
-    time.sleep(60 * watcher_conf.keep_alive_after_training_minutes)
+    time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes)
     watcher.kill()
     msg.good("Watcher stopped.")
diff --git a/src/psycopt2d/config/project/default_project.yaml b/src/psycopt2d/config/project/default_project.yaml
index 404397fa..99af8b3f 100644
--- a/src/psycopt2d/config/project/default_project.yaml
+++ b/src/psycopt2d/config/project/default_project.yaml
@@ -1,4 +1,7 @@
 name: psycop-t2d
 seed: 42
 wandb_mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
-wandb_group: "psycop-t2d" # Which group to run WanDB in.
\ No newline at end of file
+wandb_group: "psycop-t2d" # Which group to run WanDB in.
+wandb_entity: "psycop-t2d-testing" # Which entity to run WanDB in.
+watcher: default_watcher
+gpu: false
diff --git a/src/psycopt2d/config/project/integration_test_project.yaml b/src/psycopt2d/config/project/integration_test_project.yaml
index 39402f44..9be0e502 100644
--- a/src/psycopt2d/config/project/integration_test_project.yaml
+++ b/src/psycopt2d/config/project/integration_test_project.yaml
@@ -2,3 +2,4 @@ name: psycop-t2d-integration-testing
 seed: 42
 wandb_mode: "disabled" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
 wandb_group: "integration_testing"
+watcher: default_watcher
diff --git a/src/psycopt2d/config/project/watcher/default_watcher.yaml b/src/psycopt2d/config/project/watcher/default_watcher.yaml
new file mode 100644
index 00000000..f76bc9a4
--- /dev/null
+++ b/src/psycopt2d/config/project/watcher/default_watcher.yaml
@@ -0,0 +1,2 @@
+archive_all: true
+keep_alive_after_training_minutes: 5
diff --git a/src/psycopt2d/config/train/default_training.yaml b/src/psycopt2d/config/train/default_training.yaml
index 56014ceb..9ecc02a5 100644
--- a/src/psycopt2d/config/train/default_training.yaml
+++ b/src/psycopt2d/config/train/default_training.yaml
@@ -1 +1,2 @@
 n_splits: 2 # (int, Null): Number of k-folds during CV. If Null, loads pre-defined dataset.
+n_trials_per_lookdirection_combination: 1
diff --git a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py
index 340ca6f2..4c762df8 100644
--- a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py
+++ b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py
@@ -24,6 +24,14 @@ class Config:
         arbitrary_types_allowed = True
 
 
+class WatcherConf(BaseModel):
+    """Configuration for watchers"""
+
+    archive_all: bool
+    keep_alive_after_training_minutes: Union[int, float]
+    n_runs_before_eval: int
+
+
 class ProjectConf(BaseModel):
     """Project configuration."""
 
@@ -31,6 +39,9 @@ class ProjectConf(BaseModel):
     seed: int
     wandb_group: str
     wandb_mode: str
+    wandb_entity: str
+    watcher: WatcherConf
+    gpu: bool
 
 
 class DataConf(BaseModel):
@@ -85,6 +96,7 @@ class TrainConf(BaseModel):
     """Training configuration."""
 
     n_splits: int  # TODO: How do we handle whether to use crossvalidation or train/val splitting?
+    n_trials_per_lookdirection_combination: int
 
 
 class EvalConf(BaseModel):

From 5c0503b091ab386840a9b45446e3850a52355606 Mon Sep 17 00:00:00 2001
From: HLasse <lasseh0310@gmail.com>
Date: Fri, 21 Oct 2022 12:04:20 +0200
Subject: [PATCH 20/57] fix: infer_col_names returns str, not list, if len 1

---
 src/psycopt2d/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/psycopt2d/utils.py b/src/psycopt2d/utils.py
index 976e56db..fcdbe2b0 100644
--- a/src/psycopt2d/utils.py
+++ b/src/psycopt2d/utils.py
@@ -18,7 +18,7 @@
 from wandb.sdk.wandb_run import Run  # pylint: disable=no-name-in-module
 from wasabi import msg
 
-from psycopt2d.dataclasses.configs import ModelEvalData
+from psycopt2d.configs import ModelEvalData
 from psycopt2d.model_performance import ModelPerformance
 
 SHARED_RESOURCES_PATH = Path(r"E:\shared_resources")
@@ -402,7 +402,7 @@ def infer_col_names(
     col_name = [c for c in df.columns if c.startswith(prefix)]
 
     if len(col_name) == 1:
-        return [col_name[0]]
+        return col_name[0]
     elif len(col_name) > 1:
         if allow_multiple:
             return col_name

From dce563e61c1b9579759aaffced60a1f2d4ec5a75 Mon Sep 17 00:00:00 2001
From: HLasse <lasseh0310@gmail.com>
Date: Fri, 21 Oct 2022 12:04:28 +0200
Subject: [PATCH 21/57] fix: remove artefact code

---
 src/psycopt2d/train_and_log_models.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/src/psycopt2d/train_and_log_models.py b/src/psycopt2d/train_and_log_models.py
index 499c539a..50b92ee8 100644
--- a/src/psycopt2d/train_and_log_models.py
+++ b/src/psycopt2d/train_and_log_models.py
@@ -70,15 +70,3 @@
     time.sleep(60 * KEEP_WATCHER_ALIVE_AFTER_TRAINING_FINISHED_MINUTES)
     watcher.kill()
     msg.good("Watcher stopped.")
-
-    # any_process_done = False  # pylint: disable=invalid-name
-    # for process in (trainer, watcher):
-    #     while process.poll() is None:
-    #         if any_process_done:
-    #             # kill the watcher if the trainer is done
-    #             # but allow some time to finish evaluation
-    #             time.sleep(KEEP_WATCHER_ALIVE_AFTER_TRAINING_FINISHED_MINUTES * 60)
-    #             process.kill()
-    #         time.sleep(1)
-    #     any_process_done = True  # pylint: disable=invalid-name
-    #     process.kill()

From 1999b6554df20757e6eb473ebf7f539851311657 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Fri, 21 Oct 2022 12:09:56 +0200
Subject: [PATCH 22/57] fix: failing tests

---
 src/psycopt2d/config/default_config.yaml                   | 2 +-
 src/psycopt2d/config/project/default_project.yaml          | 4 +++-
 src/psycopt2d/config/project/integration_test_project.yaml | 6 +++++-
 src/psycopt2d/config/project/watcher/default_watcher.yaml  | 2 --
 src/psycopt2d/utils/omegaconf_to_pydantic_objects.py       | 4 ++--
 5 files changed, 11 insertions(+), 7 deletions(-)
 delete mode 100644 src/psycopt2d/config/project/watcher/default_watcher.yaml

diff --git a/src/psycopt2d/config/default_config.yaml b/src/psycopt2d/config/default_config.yaml
index 5590c26f..46b91517 100644
--- a/src/psycopt2d/config/default_config.yaml
+++ b/src/psycopt2d/config/default_config.yaml
@@ -1,6 +1,6 @@
 # @package _global_
 defaults:
-  - project: overtaci_test_project
+  - project: default_project
   - data: t2d_parquet
   - preprocessing: default_preprocessing
   - model: xgboost
diff --git a/src/psycopt2d/config/project/default_project.yaml b/src/psycopt2d/config/project/default_project.yaml
index 99af8b3f..ee44de65 100644
--- a/src/psycopt2d/config/project/default_project.yaml
+++ b/src/psycopt2d/config/project/default_project.yaml
@@ -3,5 +3,7 @@ seed: 42
 wandb_mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
 wandb_group: "psycop-t2d" # Which group to run WanDB in.
 wandb_entity: "psycop-t2d-testing" # Which entity to run WanDB in.
-watcher: default_watcher
+watcher:
+  archive_all: true
+  keep_alive_after_training_minutes: 5
 gpu: false
diff --git a/src/psycopt2d/config/project/integration_test_project.yaml b/src/psycopt2d/config/project/integration_test_project.yaml
index 9be0e502..23f9eff2 100644
--- a/src/psycopt2d/config/project/integration_test_project.yaml
+++ b/src/psycopt2d/config/project/integration_test_project.yaml
@@ -2,4 +2,8 @@ name: psycop-t2d-integration-testing
 seed: 42
 wandb_mode: "disabled" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
 wandb_group: "integration_testing"
-watcher: default_watcher
+wandb_entity: "psycop-t2d-testing" # Which entity to run WanDB in.
+watcher:
+  archive_all: true
+  keep_alive_after_training_minutes: 5
+gpu: false
diff --git a/src/psycopt2d/config/project/watcher/default_watcher.yaml b/src/psycopt2d/config/project/watcher/default_watcher.yaml
deleted file mode 100644
index f76bc9a4..00000000
--- a/src/psycopt2d/config/project/watcher/default_watcher.yaml
+++ /dev/null
@@ -1,2 +0,0 @@
-archive_all: true
-keep_alive_after_training_minutes: 5
diff --git a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py
index 4c762df8..c1e90875 100644
--- a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py
+++ b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py
@@ -25,7 +25,7 @@ class Config:
 
 
 class WatcherConf(BaseModel):
-    """Configuration for watchers"""
+    """Configuration for watchers."""
 
     archive_all: bool
     keep_alive_after_training_minutes: Union[int, float]
@@ -40,7 +40,7 @@ class ProjectConf(BaseModel):
     wandb_group: str
     wandb_mode: str
     wandb_entity: str
-    watcher: WatcherConf
+    watcher: dict
     gpu: bool
 
 

From f9db2e884f76b824c59f9f5ddbbb9362bcdc0a1d Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Fri, 21 Oct 2022 12:12:21 +0200
Subject: [PATCH 23/57] style: linting

---
 application/train_and_log_models.py                        | 2 +-
 src/psycopt2d/config/project/default_project.yaml          | 1 +
 src/psycopt2d/config/project/integration_test_project.yaml | 1 +
 src/psycopt2d/utils/omegaconf_to_pydantic_objects.py       | 2 +-
 4 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index a0a8b3a4..b3df5c9f 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -194,7 +194,7 @@ def train_models_for_each_cell_in_grid(
     )
 
     msg.good(
-        f"Training finished. Stopping the watcher in {watcher_conf.keep_alive_after_training_minutes} minutes...",
+        f"Training finished. Stopping the watcher in {cfg.project.watcher.keep_alive_after_training_minutes} minutes...",
     )
 
     time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes)
diff --git a/src/psycopt2d/config/project/default_project.yaml b/src/psycopt2d/config/project/default_project.yaml
index ee44de65..e66a50b4 100644
--- a/src/psycopt2d/config/project/default_project.yaml
+++ b/src/psycopt2d/config/project/default_project.yaml
@@ -6,4 +6,5 @@ wandb_entity: "psycop-t2d-testing" # Which entity to run WanDB in.
 watcher:
   archive_all: true
   keep_alive_after_training_minutes: 5
+  n_runs_before_eval: 1
 gpu: false
diff --git a/src/psycopt2d/config/project/integration_test_project.yaml b/src/psycopt2d/config/project/integration_test_project.yaml
index 23f9eff2..05f31fcd 100644
--- a/src/psycopt2d/config/project/integration_test_project.yaml
+++ b/src/psycopt2d/config/project/integration_test_project.yaml
@@ -6,4 +6,5 @@ wandb_entity: "psycop-t2d-testing" # Which entity to run WanDB in.
 watcher:
   archive_all: true
   keep_alive_after_training_minutes: 5
+  n_runs_before_eval: 1
 gpu: false
diff --git a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py
index c1e90875..a7946edd 100644
--- a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py
+++ b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py
@@ -40,7 +40,7 @@ class ProjectConf(BaseModel):
     wandb_group: str
     wandb_mode: str
     wandb_entity: str
-    watcher: dict
+    watcher: WatcherConf
     gpu: bool
 
 

From 1095109843a31a591a3518671d321903481db4c4 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Fri, 21 Oct 2022 13:04:46 +0200
Subject: [PATCH 24/57] style: linting

---
 src/psycopt2d/load.py                                | 2 +-
 src/psycopt2d/model_training_watcher.py              | 2 +-
 src/psycopt2d/utils/omegaconf_to_pydantic_objects.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py
index 1403df98..1aef6ea7 100644
--- a/src/psycopt2d/load.py
+++ b/src/psycopt2d/load.py
@@ -297,7 +297,7 @@ def _drop_cols_not_in_lookbehind_combination(
         ]
 
         cols_to_drop = [c for c in cols_to_drop if "within" in c]
-        # TODO: Add some specification of within_x_days indicating how to parse columns to find lookbehinds. Or, alternatively, use the column spec.
+        # ? Add some specification of within_x_days indicating how to parse columns to find lookbehinds. Or, alternatively, use the column spec.
 
         dataset = dataset.drop(columns=cols_to_drop)
         return dataset
diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py
index 9eb427f8..1c8791a4 100644
--- a/src/psycopt2d/model_training_watcher.py
+++ b/src/psycopt2d/model_training_watcher.py
@@ -4,7 +4,7 @@
 import time
 from distutils.util import strtobool  # pylint: disable=deprecated-module
 from pathlib import Path
-from typing import Optional, Union
+from typing import Optional
 
 import wandb
 from wandb.apis.public import Api  # pylint: disable=no-name-in-module
diff --git a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py
index a7946edd..380e6f85 100644
--- a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py
+++ b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py
@@ -95,7 +95,7 @@ class ModelConf(BaseModel):
 class TrainConf(BaseModel):
     """Training configuration."""
 
-    n_splits: int  # TODO: How do we handle whether to use crossvalidation or train/val splitting?
+    n_splits: int  # ? How do we handle whether to use crossvalidation or train/val splitting?
     n_trials_per_lookdirection_combination: int
 
 

From 420da4cb4af70ab28a1504da861acd8b7d3c3f86 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <ryqiem@gmail.com>
Date: Fri, 21 Oct 2022 14:48:30 +0200
Subject: [PATCH 25/57] fix: misc. fixes and refactor

---
 application/train_and_log_models.py           |  10 +-
 .../config/project/default_project.yaml       |   6 +-
 src/psycopt2d/configs.py                      |   5 +-
 src/psycopt2d/evaluation.py                   |  13 ++-
 src/psycopt2d/load.py                         | 109 ++++--------------
 src/psycopt2d/model_training_watcher.py       |  40 ++++---
 src/psycopt2d/train_model.py                  |   1 -
 .../utils/omegaconf_to_pydantic_objects.py    |   5 +-
 src/psycopt2d/utils/utils.py                  |   2 +-
 9 files changed, 72 insertions(+), 119 deletions(-)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index b3df5c9f..9457808c 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -95,7 +95,7 @@ def train_models_for_each_cell_in_grid(
 
     while lookbehind_combinations:
         # Loop to run if enough trainers have been spawned
-        if len(active_trainers) >= 4:
+        if len(active_trainers) >= 1:  # TODO: Add to conf.
             active_trainers = [t for t in active_trainers if t.poll() is None]
             time.sleep(1)
             continue
@@ -147,6 +147,8 @@ def train_models_for_each_cell_in_grid(
 
         cfg = omegaconf_to_pydantic_objects(cfg)
 
+    # TODO: Watcher must be instantiated once for each cell in the grid, otherwise
+    # it will compare max performances across all cells.
     watcher = subprocess.Popen(  # pylint: disable=consider-using-with
         [
             "python",
@@ -158,11 +160,13 @@ def train_models_for_each_cell_in_grid(
             "--n_runs_before_eval",
             str(cfg.project.watcher.n_runs_before_eval),
             "--overtaci",
-            cfg.eval.save_model_predictions_on_overtaci,
+            str(cfg.eval.save_model_predictions_on_overtaci),
             "--timeout",
             "None",
             "--clean_wandb_dir",
-            cfg.project.watcher.archive_all,
+            str(cfg.project.watcher.archive_all),
+            "--verbose",
+            "True",
         ],
     )
 
diff --git a/src/psycopt2d/config/project/default_project.yaml b/src/psycopt2d/config/project/default_project.yaml
index e66a50b4..3bec2ad6 100644
--- a/src/psycopt2d/config/project/default_project.yaml
+++ b/src/psycopt2d/config/project/default_project.yaml
@@ -1,10 +1,10 @@
 name: psycop-t2d
 seed: 42
-wandb_mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
+wandb_mode: "offline" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
 wandb_group: "psycop-t2d" # Which group to run WanDB in.
-wandb_entity: "psycop-t2d-testing" # Which entity to run WanDB in.
+wandb_entity: "psycop" # Which entity to run WanDB in.
 watcher:
-  archive_all: true
+  archive_all: false
   keep_alive_after_training_minutes: 5
   n_runs_before_eval: 1
 gpu: false
diff --git a/src/psycopt2d/configs.py b/src/psycopt2d/configs.py
index e809d072..76ba6b02 100644
--- a/src/psycopt2d/configs.py
+++ b/src/psycopt2d/configs.py
@@ -2,9 +2,10 @@
 from typing import Optional
 
 import pandas as pd
-from omegaconf import DictConfig
 from pydantic import BaseModel
 
+from psycopt2d.utils.omegaconf_to_pydantic_objects import FullConfig
+
 # pylint: disable=missing-class-docstring, too-few-public-methods
 
 
@@ -15,5 +16,5 @@ class Config:
         arbitrary_types_allowed = True
 
     df: pd.DataFrame
-    cfg: DictConfig
+    cfg: FullConfig
     feature_importance_dict: Optional[dict[str, float]] = None
diff --git a/src/psycopt2d/evaluation.py b/src/psycopt2d/evaluation.py
index cf345975..39b2f308 100644
--- a/src/psycopt2d/evaluation.py
+++ b/src/psycopt2d/evaluation.py
@@ -14,6 +14,7 @@
 from psycopt2d.tables.performance_by_threshold import (
     generate_performance_by_positive_rate_table,
 )
+from psycopt2d.utils.omegaconf_to_pydantic_objects import FullConfig
 from psycopt2d.utils.utils import PROJECT_ROOT, positive_rate_to_pred_probs
 from psycopt2d.visualization import (
     plot_auc_by_time_from_first_visit,
@@ -37,7 +38,7 @@ def log_feature_importances(
     feature_importance_plot_path = plot_feature_importances(
         feature_names=feature_importance_dict.keys(),
         feature_importances=feature_importance_dict.values(),
-        top_n_feature_importances=cfg.evaluation.top_n_feature_importances,
+        top_n_feature_importances=cfg.eval.top_n_feature_importances,
         save_path=save_path,
     )
 
@@ -53,7 +54,7 @@ def log_feature_importances(
 
 
 def evaluate_model(
-    cfg,
+    cfg: FullConfig,
     eval_df: pd.DataFrame,
     y_col_name: str,
     y_hat_prob_col_name: str,
@@ -94,8 +95,8 @@ def evaluate_model(
     pred_timestamps = eval_df[cfg.data.pred_timestamp_col_name]
     y_hat_int = np.round(y_hat_probs, 0)
 
-    date_bins_ahead: Iterable[int] = cfg.evaluation.date_bins_ahead
-    date_bins_behind: Iterable[int] = cfg.evaluation.date_bins_behind
+    date_bins_ahead: Iterable[int] = cfg.eval.date_bins_ahead
+    date_bins_behind: Iterable[int] = cfg.eval.date_bins_behind
 
     # Drop date_bins_direction if they are further away than min_lookdirection_days
     if cfg.data.min_lookbehind_days:
@@ -121,7 +122,7 @@ def evaluate_model(
 
     pred_proba_thresholds = positive_rate_to_pred_probs(
         pred_probs=y_hat_probs,
-        positive_rate_thresholds=cfg.evaluation.positive_rate_thresholds,
+        positive_rate_thresholds=cfg.eval.positive_rate_thresholds,
     )
 
     msg.info(f"AUC: {auc}")
@@ -132,7 +133,7 @@ def evaluate_model(
     performance_by_threshold_df = generate_performance_by_positive_rate_table(
         labels=y,
         pred_probs=y_hat_probs,
-        positive_rate_thresholds=cfg.evaluation.positive_rate_thresholds,
+        positive_rate_thresholds=cfg.eval.positive_rate_thresholds,
         pred_proba_thresholds=pred_proba_thresholds,
         ids=eval_df[cfg.data.id_col_name],
         pred_timestamps=pred_timestamps,
diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py
index 1aef6ea7..08d08a46 100644
--- a/src/psycopt2d/load.py
+++ b/src/psycopt2d/load.py
@@ -2,6 +2,7 @@
 import re
 from collections.abc import Iterable
 from datetime import datetime, timedelta
+from multiprocessing.sharedctypes import Value
 from pathlib import Path
 from typing import Any, Optional, Union
 
@@ -17,61 +18,13 @@
     PROJECT_ROOT,
     coerce_to_datetime,
     get_percent_lost,
+    infer_outcome_col_name,
     infer_predictor_col_name,
 )
 
 msg = Printer(timestamp=True)
 
 
-class DatasetTimeSpecification(BaseModel):
-    """Specification of the time range of the dataset."""
-
-    drop_patient_if_outcome_before_date: Optional[Union[str, datetime]] = Field(
-        description="""If a patient experiences the outcome before this date, all their prediction times will be dropped.
-        Used for wash-in, to avoid including patients who were probably already experiencing the outcome before the study began.""",
-    )
-
-    min_prediction_time_date: Optional[Union[str, datetime]] = Field(
-        description="""Any prediction time before this date will be dropped.""",
-    )
-
-    min_lookbehind_days: Optional[Union[int, float]] = Field(
-        description="""If the distance from the prediction time to the start of the dataset is less than this, the prediction time will be dropped""",
-    )
-
-    min_lookahead_days: Optional[Union[int, float]] = Field(
-        description="""If the distance from the prediction time to the end of the dataset is less than this, the prediction time will be dropped""",
-    )
-
-    lookbehind_combination: Optional[list[Union[int, float]]] = Field(
-        description="""List containing a combination of lookbehind windows (e.g. [30, 60, 90]) which determines which features to keep in the dataset. E.g. for the above list, only features with lookbehinds of 30, 60 or 90 days will be kept.""",
-    )
-
-
-class DatasetSpecification(BaseModel):
-    """Specification for loading a dataset."""
-
-    split_dir_path: Union[str, Path] = Field(
-        description="""Path to the directory containing the split files.""",
-    )
-
-    file_suffix: str = Field(
-        description="""Suffix of the split files. E.g. 'parquet' or 'csv'.""",
-        default="parquet",
-    )
-
-    time: DatasetTimeSpecification
-
-    pred_col_name_prefix: str = Field(
-        default="pred_",
-        description="""Prefix for the prediction column names.""",
-    )
-    pred_time_colname: str = Field(
-        default="timestamp",
-        description="""Column name for with timestamps for prediction times""",
-    )
-
-
 def load_timestamp_for_any_diabetes():
     """Loads timestamps for the broad definition of diabetes used for wash-in.
 
@@ -412,6 +365,24 @@ def _drop_cols_and_rows_if_look_direction_not_met(
 
         return dataset
 
+    def _keep_unique_outcome_col_with_lookahead_days_matching_conf(
+        self, dataset: pd.DataFrame
+    ) -> pd.DataFrame:
+        """Keep only one outcome column with the same lookahead days as set in the config."""
+        outcome_cols = infer_outcome_col_name(df=dataset, allow_multiple=True)
+        col_to_drop = [
+            c for c in outcome_cols if str(self.cfg.data.lookahead_days) not in c
+        ]
+
+        df = dataset.drop(col_to_drop, axis=1)
+
+        if not isinstance(infer_outcome_col_name(df), str):
+            raise ValueError(
+                "Returning more than one outcome column, will cause problems during eval."
+            )
+
+        return df
+
     def _process_dataset(self, dataset: pd.DataFrame) -> pd.DataFrame:
         """Process dataset, namely:
 
@@ -442,6 +413,10 @@ def _process_dataset(self, dataset: pd.DataFrame) -> pd.DataFrame:
         if self.cfg.data.lookbehind_combination:
             dataset = self._drop_cols_not_in_lookbehind_combination(dataset=dataset)
 
+        dataset = self._keep_unique_outcome_col_with_lookahead_days_matching_conf(
+            dataset=dataset
+        )
+
         return dataset
 
     def load_dataset_from_dir(
@@ -503,42 +478,6 @@ def load_dataset_from_dir(
         return dataset
 
 
-def _init_spec_from_cfg(
-    cfg: DictConfig,
-) -> DatasetSpecification:
-    """Initialise a feature spec from a DictConfig."""
-    data_cfg: dict[str, Any] = OmegaConf.to_container(  # type: ignore
-        cfg.data,
-        resolve=True,
-    )
-
-    if data_cfg["suffix"] == "synthetic":
-        split_dir_path = PROJECT_ROOT / "tests" / "test_data" / "synth_splits"
-        file_suffix = "csv"
-    else:
-        split_dir_path = data_cfg["dir"]
-        file_suffix = data_cfg["suffix"]
-
-    time_spec = DatasetTimeSpecification(
-        drop_patient_if_outcome_before_date=data_cfg[
-            "drop_patient_if_outcome_before_date"
-        ],
-        min_lookahead_days=data_cfg["min_lookahead_days"],
-        min_lookbehind_days=data_cfg["min_lookbehind_days"],
-        min_prediction_time_date=data_cfg["min_prediction_time_date"],
-        lookbehind_combination=data_cfg["lookbehind_combination"],
-    )
-
-    return DatasetSpecification(
-        split_dir_path=split_dir_path,
-        pred_col_name_prefix=data_cfg["pred_col_name_prefix"],
-        file_suffix=file_suffix,
-        pred_time_colname=data_cfg["pred_timestamp_col_name"],
-        n_training_samples=data_cfg["n_training_samples"],
-        time=time_spec,
-    )
-
-
 class SplitDataset(BaseModel):
     """A dataset split into train, test and optionally validation."""
 
diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py
index 1c8791a4..acf4017d 100644
--- a/src/psycopt2d/model_training_watcher.py
+++ b/src/psycopt2d/model_training_watcher.py
@@ -13,13 +13,10 @@
 
 from psycopt2d.configs import ModelEvalData
 from psycopt2d.evaluation import evaluate_model
-from psycopt2d.utils.utils import (
-    MODEL_PREDICTIONS_PATH,
-    PROJECT_ROOT,
-    infer_outcome_col_name,
-    infer_y_hat_prob_col_name,
-    load_evaluation_data,
-)
+from psycopt2d.utils.utils import (MODEL_PREDICTIONS_PATH, PROJECT_ROOT,
+                                   infer_outcome_col_name,
+                                   infer_y_hat_prob_col_name,
+                                   load_evaluation_data)
 
 # Path to the wandb directory
 WANDB_DIR = PROJECT_ROOT / "wandb"
@@ -79,8 +76,9 @@ def watch(self, timeout_minutes: Optional[int] = None) -> None:
     def get_new_runs_and_evaluate(self) -> None:
         """Get new runs and evaluate the best runs."""
         self.upload_unarchived_runs()
+
         if len(self.run_id_eval_candidates_queue) >= self.n_runs_before_eval:
-            self.evaluate_best_runs()
+            self.evaluate_and_upload_records_and_archive()
 
     def _upload_run_dir(self, run_dir: Path) -> str:
         """Upload a single run to wandb."""
@@ -110,21 +108,25 @@ def upload_unarchived_runs(self) -> None:
 
             wandb_sync_stdout = self._upload_run_dir(run_folder)
 
-            if ".wandb file is empty" in wandb_sync_stdout:
-                if self.verbose:
-                    msg.warn(f"Run {run_id} is still running. Skipping.")
+            if not "...done" in wandb_sync_stdout:
+                if ".wandb file is empty" in wandb_sync_stdout:
+                    if self.verbose:
+                        msg.warn(f"Run {run_id} is still running. Skipping.")
+                else:
+                    raise ValueError(
+                        f"wandb sync failed, returned: {wandb_sync_stdout}"
+                    )
                 continue
 
-            self._archive_run_dir(run_folder)
             self.run_id_eval_candidates_queue.append(run_id)
 
-    def _get_run_evaluation_dir(self, run_id: str) -> Path:
+    def _get_run_evaluation_data_dir(self, run_id: str) -> Path:
         """Get the evaluation path for a single run."""
         return list(self.model_data_dir.glob(f"*{run_id}*"))[0]
 
     def _get_eval_data(self, run_id: str) -> ModelEvalData:
         """Get the evaluation data for a single run."""
-        run_eval_dir = self._get_run_evaluation_dir(run_id)
+        run_eval_dir = self._get_run_evaluation_data_dir(run_id)
 
         return load_evaluation_data(run_eval_dir)
 
@@ -153,6 +155,9 @@ def _get_wandb_run(self, run_id: str) -> Run:
         """Get the wandb run object from the run id."""
         return Api().run(f"{self.entity}/{self.project_name}/{run_id}")
 
+    def _get_run_wandb_dir(self, run_id: str) -> Path:
+        return list(WANDB_DIR.glob(f"*offline-run*{run_id}*"))[0]
+
     def _get_run_performance(self, run_id: str) -> Optional[float]:
         """Get the performance of a single run and check if it failed."""
         run = self._get_wandb_run(run_id)
@@ -160,11 +165,11 @@ def _get_run_performance(self, run_id: str) -> Optional[float]:
             return run.summary.roc_auc_unweighted
         if self.verbose:
             msg.info(
-                f"Run {run_id} has no performance metric. Pinging again at next eval time.",
+                f"Watcher: Run {run_id} has no performance metric. Pinging again at next eval time.",
             )
         return None
 
-    def evaluate_best_runs(self) -> None:
+    def evaluate_and_upload_records_and_archive(self) -> None:
         """Evaluate the best runs."""
         run_performances = {
             run_id: self._get_run_performance(run_id)
@@ -188,6 +193,7 @@ def evaluate_best_runs(self) -> None:
                 msg.good(f"New record performance! AUC: {performance}")
                 self.max_performance = performance
                 self._do_evaluation(run_id)
+            self._archive_run_dir(run_dir=self._get_run_wandb_dir(run_id))
         # reset run id queue and try to upload unfinished runs next time
         self.run_id_eval_candidates_queue = unfinished_runs
 
@@ -261,5 +267,5 @@ def float_or_none(arg: str) -> Optional[float]:
     if args.clean_wandb_dir:
         watcher.archive_all_runs()
 
-    msg.info("Starting WandB watcher")
+    msg.info("Watcher: Starting WandB watcher")
     watcher.watch(timeout_minutes=args.timeout)
diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py
index 44ecbfe0..797b7ebe 100644
--- a/src/psycopt2d/train_model.py
+++ b/src/psycopt2d/train_model.py
@@ -32,7 +32,6 @@
 )
 
 CONFIG_PATH = PROJECT_ROOT / "src" / "psycopt2d" / "config"
-TRAINING_COL_NAME_PREFIX = "pred_"
 
 # Handle wandb not playing nice with joblib
 os.environ["WANDB_START_METHOD"] = "thread"
diff --git a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py
index 380e6f85..f13fe66f 100644
--- a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py
+++ b/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py
@@ -54,7 +54,7 @@ class DataConf(BaseModel):
     suffix: str  # File suffix to load.
 
     # Feature specs
-    pred_col_name_prefix: str  # (str): prefix of predictor columns
+    pred_col_name_prefix: str  # prefix of predictor columns
     pred_timestamp_col_name: str  # (str): Column name for prediction times
     outcome_timestamp_col_name: str  # (str): Column name for outcome timestamps
     id_col_name: str  # (str): Citizen colnames
@@ -124,6 +124,9 @@ class FullConfig(BaseModel):
     eval: EvalConf
 
 
+# ? Should FullConfig be here or in another location?
+
+
 def omegaconf_to_pydantic_objects(conf: DictConfig) -> FullConfig:
     """Converts an omegaconf DictConfig to a pydantic object.
 
diff --git a/src/psycopt2d/utils/utils.py b/src/psycopt2d/utils/utils.py
index 3a2a8d18..33cfa289 100644
--- a/src/psycopt2d/utils/utils.py
+++ b/src/psycopt2d/utils/utils.py
@@ -28,7 +28,7 @@
 FEATURIZERS_PATH = SHARED_RESOURCES_PATH / "featurizers"
 MODEL_PREDICTIONS_PATH = SHARED_RESOURCES_PATH / "model_predictions"
 
-PROJECT_ROOT = Path(__file__).resolve().parents[2]
+PROJECT_ROOT = Path(__file__).resolve().parents[3]
 
 
 def format_dict_for_printing(d: dict) -> str:

From 51f080da7b72831391a6d6a0a99cfec837829c5a Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <ryqiem@gmail.com>
Date: Fri, 21 Oct 2022 14:49:58 +0200
Subject: [PATCH 26/57] style: linting

---
 src/psycopt2d/load.py                   | 19 +++++++++----------
 src/psycopt2d/model_training_watcher.py | 13 ++++++++-----
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py
index 08d08a46..50331d50 100644
--- a/src/psycopt2d/load.py
+++ b/src/psycopt2d/load.py
@@ -1,21 +1,18 @@
 """Loader for the t2d dataset."""
 import re
 from collections.abc import Iterable
-from datetime import datetime, timedelta
-from multiprocessing.sharedctypes import Value
+from datetime import timedelta
 from pathlib import Path
-from typing import Any, Optional, Union
+from typing import Optional, Union
 
 import pandas as pd
-from omegaconf import DictConfig, OmegaConf
 from psycopmlutils.sql.loader import sql_load
-from pydantic import BaseModel, Field
+from pydantic import BaseModel
 from wasabi import Printer
 
 from psycopt2d.evaluate_saved_model_predictions import infer_look_distance
 from psycopt2d.utils.omegaconf_to_pydantic_objects import FullConfig
 from psycopt2d.utils.utils import (
-    PROJECT_ROOT,
     coerce_to_datetime,
     get_percent_lost,
     infer_outcome_col_name,
@@ -366,9 +363,11 @@ def _drop_cols_and_rows_if_look_direction_not_met(
         return dataset
 
     def _keep_unique_outcome_col_with_lookahead_days_matching_conf(
-        self, dataset: pd.DataFrame
+        self,
+        dataset: pd.DataFrame,
     ) -> pd.DataFrame:
-        """Keep only one outcome column with the same lookahead days as set in the config."""
+        """Keep only one outcome column with the same lookahead days as set in
+        the config."""
         outcome_cols = infer_outcome_col_name(df=dataset, allow_multiple=True)
         col_to_drop = [
             c for c in outcome_cols if str(self.cfg.data.lookahead_days) not in c
@@ -378,7 +377,7 @@ def _keep_unique_outcome_col_with_lookahead_days_matching_conf(
 
         if not isinstance(infer_outcome_col_name(df), str):
             raise ValueError(
-                "Returning more than one outcome column, will cause problems during eval."
+                "Returning more than one outcome column, will cause problems during eval.",
             )
 
         return df
@@ -414,7 +413,7 @@ def _process_dataset(self, dataset: pd.DataFrame) -> pd.DataFrame:
             dataset = self._drop_cols_not_in_lookbehind_combination(dataset=dataset)
 
         dataset = self._keep_unique_outcome_col_with_lookahead_days_matching_conf(
-            dataset=dataset
+            dataset=dataset,
         )
 
         return dataset
diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py
index acf4017d..b981fb74 100644
--- a/src/psycopt2d/model_training_watcher.py
+++ b/src/psycopt2d/model_training_watcher.py
@@ -13,10 +13,13 @@
 
 from psycopt2d.configs import ModelEvalData
 from psycopt2d.evaluation import evaluate_model
-from psycopt2d.utils.utils import (MODEL_PREDICTIONS_PATH, PROJECT_ROOT,
-                                   infer_outcome_col_name,
-                                   infer_y_hat_prob_col_name,
-                                   load_evaluation_data)
+from psycopt2d.utils.utils import (
+    MODEL_PREDICTIONS_PATH,
+    PROJECT_ROOT,
+    infer_outcome_col_name,
+    infer_y_hat_prob_col_name,
+    load_evaluation_data,
+)
 
 # Path to the wandb directory
 WANDB_DIR = PROJECT_ROOT / "wandb"
@@ -114,7 +117,7 @@ def upload_unarchived_runs(self) -> None:
                         msg.warn(f"Run {run_id} is still running. Skipping.")
                 else:
                     raise ValueError(
-                        f"wandb sync failed, returned: {wandb_sync_stdout}"
+                        f"wandb sync failed, returned: {wandb_sync_stdout}",
                     )
                 continue
 

From e152029c1d5040e21557ae1fbf0ecaa9da39fa74 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <ryqiem@gmail.com>
Date: Fri, 21 Oct 2022 14:50:44 +0200
Subject: [PATCH 27/57] style: linting

---
 src/psycopt2d/model_training_watcher.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py
index b981fb74..936200e5 100644
--- a/src/psycopt2d/model_training_watcher.py
+++ b/src/psycopt2d/model_training_watcher.py
@@ -111,7 +111,7 @@ def upload_unarchived_runs(self) -> None:
 
             wandb_sync_stdout = self._upload_run_dir(run_folder)
 
-            if not "...done" in wandb_sync_stdout:
+            if "...done" not in wandb_sync_stdout:
                 if ".wandb file is empty" in wandb_sync_stdout:
                     if self.verbose:
                         msg.warn(f"Run {run_id} is still running. Skipping.")

From d2f4952611c88c363b6d1cbad05eaf9468d71dac Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <ryqiem@gmail.com>
Date: Fri, 21 Oct 2022 15:06:12 +0200
Subject: [PATCH 28/57] refactor: misc. refactors

---
 application/train_and_log_models.py           | 139 ++++++++++--------
 src/psycopt2d/config/data/t2d_parquet.yaml    |  19 +--
 .../config/project/default_project.yaml       |  12 +-
 .../config/train/default_training.yaml        |   2 +
 src/psycopt2d/configs.py                      |   2 +-
 src/psycopt2d/evaluation.py                   |   2 +-
 src/psycopt2d/load.py                         |   2 +-
 src/psycopt2d/train_model.py                  |   5 +-
 ...conf_to_pydantic_objects.py => configs.py} |  16 +-
 src/psycopt2d/utils/utils.py                  |   2 +-
 tests/test_load.py                            |   2 +-
 tests/test_train_model.py                     |   2 +-
 tests/test_utils.py                           |   2 +-
 13 files changed, 116 insertions(+), 91 deletions(-)
 rename src/psycopt2d/utils/{omegaconf_to_pydantic_objects.py => configs.py} (96%)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index 9457808c..fb2cb5ce 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -9,6 +9,7 @@
 import subprocess
 import time
 
+import pandas as pd
 from hydra import compose, initialize
 from pydantic import BaseModel
 from wasabi import Printer
@@ -19,10 +20,7 @@
     infer_predictor_col_name,
 )
 from psycopt2d.load import DataLoader
-from psycopt2d.utils.omegaconf_to_pydantic_objects import (
-    FullConfig,
-    omegaconf_to_pydantic_objects,
-)
+from psycopt2d.utils.configs import FullConfig, omegaconf_to_pydantic_objects
 
 msg = Printer(timestamp=True)
 
@@ -41,16 +39,16 @@ def load_train_for_inference(cfg: FullConfig):
     return loader.load_dataset_from_dir(split_names="train")
 
 
-def infer_possible_look_directions(train):
+def infer_possible_look_distances(df: pd.DataFrame) -> PossibleLookDistanceDays:
     """Infer the possible values for min_lookahead_days and
     min_lookbehind_days."""
     # Get potential lookaheads from outc_ columns
-    outcome_col_names = infer_outcome_col_name(df=train, allow_multiple=True)
+    outcome_col_names = infer_outcome_col_name(df=df, allow_multiple=True)
 
     possible_lookahead_days = infer_look_distance(col_name=outcome_col_names)
 
     # Get potential lookbehinds from pred_ columns
-    pred_col_names = infer_predictor_col_name(df=train, allow_multiple=True)
+    pred_col_names = infer_predictor_col_name(df=df, allow_multiple=True)
     possible_lookbehind_days = list(set(infer_look_distance(col_name=pred_col_names)))
 
     return PossibleLookDistanceDays(
@@ -83,10 +81,6 @@ def train_models_for_each_cell_in_grid(
         for lookahead in possible_look_distances.ahead
     ]
 
-    lookbehind_combinations = [
-        comb for comb in lookbehind_combinations if comb.lookahead <= 1095
-    ]
-
     random.shuffle(lookbehind_combinations)
 
     active_trainers: list[subprocess.Popen] = []
@@ -95,61 +89,65 @@ def train_models_for_each_cell_in_grid(
 
     while lookbehind_combinations:
         # Loop to run if enough trainers have been spawned
-        if len(active_trainers) >= 1:  # TODO: Add to conf.
+        if len(active_trainers) >= cfg.train.active_trainers:
             active_trainers = [t for t in active_trainers if t.poll() is None]
             time.sleep(1)
             continue
 
-        cell = lookbehind_combinations.pop()
+        combination = lookbehind_combinations.pop()
+
         msg.info(
-            f"Spawning a new trainer with lookbehind={cell.lookbehind} and lookahead={cell.lookahead}",
+            f"Spawning a new trainer with lookbehind={combination.lookbehind} and lookahead={combination.lookahead}",
         )
 
-        wandb_group = f"{wandb_prefix}-beh-{cell.lookbehind}-ahead-{cell.lookahead}"
-
-        subprocess_args: list[str] = [
-            "python",
-            "src/psycopt2d/train_model.py",
-            f"model={cfg.model.model_name}",
-            f"data.min_lookbehind_days={cell.lookbehind}",
-            f"data.min_lookahead_days={cell.lookahead}",
-            f"project.wandb_group='{wandb_group}'",
-            f"hydra.sweeper.n_trials={cfg.train.n_trials_per_lookdirection_combination}",
-            f"project.wandb_mode={cfg.project.wandb_mode}",
-            "--config-name",
-            f"{config_file_name}",
-        ]
-
-        if cfg.train.n_trials_per_lookdirection_combination > 1:
-            subprocess_args.insert(2, "--multirun")
-
-        if cfg.model.model_name == "xgboost" and not cfg.project.gpu:
-            subprocess_args.insert(3, "++model.args.tree_method='auto'")
-
-        msg.info(f'{" ".join(subprocess_args)}')
+        wandb_group = (
+            f"{wandb_prefix}-beh-{combination.lookbehind}-ahead-{combination.lookahead}"
+        )
 
         active_trainers.append(
-            subprocess.Popen(  # pylint: disable=consider-using-with
-                args=subprocess_args,
-            ),
+            start_trainer(
+                cfg=cfg,
+                config_file_name=config_file_name,
+                cell=combination,
+                wandb_group=wandb_group,
+            )
         )
 
 
-if __name__ == "__main__":
-    msg = Printer(timestamp=True)
+def start_trainer(
+    cfg: FullConfig,
+    config_file_name: str,
+    cell: LookDirectionCombination,
+    wandb_group: str,
+):
+    subprocess_args: list[str] = [
+        "python",
+        "src/psycopt2d/train_model.py",
+        f"model={cfg.model.model_name}",
+        f"data.min_lookbehind_days={cell.lookbehind}",
+        f"data.min_lookahead_days={cell.lookahead}",
+        f"project.wandb_group='{wandb_group}'",
+        f"hydra.sweeper.n_trials={cfg.train.n_trials_per_lookdirection_combination}",
+        f"project.wandb_mode={cfg.project.wandb_mode}",
+        "--config-name",
+        f"{config_file_name}",
+    ]
 
-    CONFIG_FILE_NAME = "default_config.yaml"
+    if cfg.train.n_trials_per_lookdirection_combination > 1:
+        subprocess_args.insert(2, "--multirun")
 
-    with initialize(version_base=None, config_path="../src/psycopt2d/config/"):
-        cfg = compose(
-            config_name=CONFIG_FILE_NAME,
-        )
+    if cfg.model.model_name == "xgboost" and not cfg.train.gpu:
+        subprocess_args.insert(3, "++model.args.tree_method='auto'")
 
-        cfg = omegaconf_to_pydantic_objects(cfg)
+    msg.info(f'{" ".join(subprocess_args)}')
 
-    # TODO: Watcher must be instantiated once for each cell in the grid, otherwise
-    # it will compare max performances across all cells.
-    watcher = subprocess.Popen(  # pylint: disable=consider-using-with
+    return subprocess.Popen(  # pylint: disable=consider-using-with
+        args=subprocess_args,
+    )
+
+
+def start_watcher(cfg):
+    return subprocess.Popen(  # pylint: disable=consider-using-with
         [
             "python",
             "src/psycopt2d/model_training_watcher.py",
@@ -170,31 +168,36 @@ def train_models_for_each_cell_in_grid(
         ],
     )
 
-    train = load_train_for_inference(cfg=cfg)
 
-    possible_look_distances = infer_possible_look_directions(train)
+def main():
+    msg = Printer(timestamp=True)
+
+    config_file_name = "integration_testing.yaml"
+
+    cfg = load_cfg(config_file_name=config_file_name)
+    # TODO: Watcher must be instantiated once for each cell in the grid, otherwise
+    # it will compare max performances across all cells.
+    watcher = start_watcher(cfg)
+    train = load_train_for_inference(cfg=cfg)
+    possible_look_distances = infer_possible_look_distances(df=train)
 
     # Remove "9999" from possible look distances behind
     possible_look_distances.behind = [
-        dist for dist in possible_look_distances.behind if dist != "9999"
+        dist
+        for dist in possible_look_distances
+        if not int(dist) > cfg.data.max_lookbehind_days
     ]
 
     msg.info(f"Possible lookbehind days: {possible_look_distances.behind}")
     msg.info(f"Possible lookahead days: {possible_look_distances.ahead}")
 
-    if not cfg.project.gpu:
+    if not cfg.train.gpu:
         msg.warn("Not using GPU for training")
 
-    CLEAN_DIR_SECONDS = 0
-    msg.info(
-        f"Sleeping for {CLEAN_DIR_SECONDS} seconds to allow watcher to start and clean dir",
-    )
-    time.sleep(CLEAN_DIR_SECONDS)
-
     train_models_for_each_cell_in_grid(
         cfg=cfg,
         possible_look_distances=possible_look_distances,
-        config_file_name=CONFIG_FILE_NAME,
+        config_file_name=config_file_name,
     )
 
     msg.good(
@@ -204,3 +207,17 @@ def train_models_for_each_cell_in_grid(
     time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes)
     watcher.kill()
     msg.good("Watcher stopped.")
+
+
+def load_cfg(config_file_name):
+    with initialize(version_base=None, config_path="../src/psycopt2d/config/"):
+        cfg = compose(
+            config_name=config_file_name,
+        )
+
+        cfg = omegaconf_to_pydantic_objects(cfg)
+    return cfg
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/psycopt2d/config/data/t2d_parquet.yaml b/src/psycopt2d/config/data/t2d_parquet.yaml
index 611b3d02..6c1ee3ac 100644
--- a/src/psycopt2d/config/data/t2d_parquet.yaml
+++ b/src/psycopt2d/config/data/t2d_parquet.yaml
@@ -1,24 +1,25 @@
 # @package _global_
 data:
   # General config
-  n_training_samples: null # (int, null): Number of training samples to use, defaults to null in which cases it uses all samples.
+  n_training_samples: null
   dir: E:\shared_resources\feature_sets\t2d\feature_sets\psycop_t2d_adminmanber_416_features_2022_10_20_11_12
-  suffix: parquet # File suffix to load.
+  suffix: parquet
 
   # Feature specs
-  pred_col_name_prefix: "pred_" # (str): prefix of predictor columns
-  pred_timestamp_col_name: timestamp # (str): Column name for prediction times
-  outcome_timestamp_col_name: _timestamp_first_t2d # (str): Column name for outcome timestamps
-  id_col_name: dw_ek_borger # (str): Citizen colnames
+  pred_col_name_prefix: "pred_"
+  pred_timestamp_col_name: timestamp
+  outcome_timestamp_col_name: _timestamp_first_t2d
+  id_col_name: dw_ek_borger
 
   # Looking ahead
-  lookahead_days: 365 # (float): Number of days from prediction time to look ahead for the outcome.
-  min_lookahead_days: 365 # (int): Drop all prediction times where (max timestamp in the dataset) - (current timestamp) is less than min_lookahead_days
+  lookahead_days: 365
+  min_lookahead_days: 365
   drop_patient_if_outcome_before_date: null
 
   # Looking behind
   min_prediction_time_date: 2013-01-01
-  min_lookbehind_days: 365 # (int): Drop all prediction times where (prediction_timestamp) - (min timestamp in the dataset) is less than min_lookbehind_days
+  min_lookbehind_days: 365
+  max_lookbehind_days: 3650
   lookbehind_combination: [30, 90, 180, 365]
 
 # Parameters that will only take effect if running with --multirun
diff --git a/src/psycopt2d/config/project/default_project.yaml b/src/psycopt2d/config/project/default_project.yaml
index 3bec2ad6..837cfc78 100644
--- a/src/psycopt2d/config/project/default_project.yaml
+++ b/src/psycopt2d/config/project/default_project.yaml
@@ -1,10 +1,12 @@
 name: psycop-t2d
 seed: 42
-wandb_mode: "offline" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
-wandb_group: "psycop-t2d" # Which group to run WanDB in.
-wandb_entity: "psycop" # Which entity to run WanDB in.
+
+wandb:
+  entity: "psycop" # Which entity to run WanDB in.
+  mode: "offline" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
+  group: "psycop-t2d" # Which group to run WanDB in.
+
 watcher:
   archive_all: false
   keep_alive_after_training_minutes: 5
-  n_runs_before_eval: 1
-gpu: false
+  n_runs_before_eval: 1
\ No newline at end of file
diff --git a/src/psycopt2d/config/train/default_training.yaml b/src/psycopt2d/config/train/default_training.yaml
index 9ecc02a5..84a98c92 100644
--- a/src/psycopt2d/config/train/default_training.yaml
+++ b/src/psycopt2d/config/train/default_training.yaml
@@ -1,2 +1,4 @@
 n_splits: 2 # (int, Null): Number of k-folds during CV. If Null, loads pre-defined dataset.
 n_trials_per_lookdirection_combination: 1
+gpu: false
+active_trainers: 4
\ No newline at end of file
diff --git a/src/psycopt2d/configs.py b/src/psycopt2d/configs.py
index 76ba6b02..3e4beb6c 100644
--- a/src/psycopt2d/configs.py
+++ b/src/psycopt2d/configs.py
@@ -4,7 +4,7 @@
 import pandas as pd
 from pydantic import BaseModel
 
-from psycopt2d.utils.omegaconf_to_pydantic_objects import FullConfig
+from psycopt2d.utils.configs import FullConfig
 
 # pylint: disable=missing-class-docstring, too-few-public-methods
 
diff --git a/src/psycopt2d/evaluation.py b/src/psycopt2d/evaluation.py
index 39b2f308..17a7d1b6 100644
--- a/src/psycopt2d/evaluation.py
+++ b/src/psycopt2d/evaluation.py
@@ -14,7 +14,7 @@
 from psycopt2d.tables.performance_by_threshold import (
     generate_performance_by_positive_rate_table,
 )
-from psycopt2d.utils.omegaconf_to_pydantic_objects import FullConfig
+from psycopt2d.utils.configs import FullConfig
 from psycopt2d.utils.utils import PROJECT_ROOT, positive_rate_to_pred_probs
 from psycopt2d.visualization import (
     plot_auc_by_time_from_first_visit,
diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py
index 50331d50..6e08b807 100644
--- a/src/psycopt2d/load.py
+++ b/src/psycopt2d/load.py
@@ -11,7 +11,7 @@
 from wasabi import Printer
 
 from psycopt2d.evaluate_saved_model_predictions import infer_look_distance
-from psycopt2d.utils.omegaconf_to_pydantic_objects import FullConfig
+from psycopt2d.utils.configs import FullConfig
 from psycopt2d.utils.utils import (
     coerce_to_datetime,
     get_percent_lost,
diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py
index 797b7ebe..84fb1328 100644
--- a/src/psycopt2d/train_model.py
+++ b/src/psycopt2d/train_model.py
@@ -19,10 +19,7 @@
 from psycopt2d.feature_transformers import ConvertToBoolean, DateTimeConverter
 from psycopt2d.load import load_train_and_val_from_cfg
 from psycopt2d.models import MODELS
-from psycopt2d.utils.omegaconf_to_pydantic_objects import (
-    FullConfig,
-    omegaconf_to_pydantic_objects,
-)
+from psycopt2d.utils.configs import FullConfig, omegaconf_to_pydantic_objects
 from psycopt2d.utils.utils import (
     PROJECT_ROOT,
     create_wandb_folders,
diff --git a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py b/src/psycopt2d/utils/configs.py
similarity index 96%
rename from src/psycopt2d/utils/omegaconf_to_pydantic_objects.py
rename to src/psycopt2d/utils/configs.py
index f13fe66f..58a58e66 100644
--- a/src/psycopt2d/utils/omegaconf_to_pydantic_objects.py
+++ b/src/psycopt2d/utils/configs.py
@@ -24,6 +24,12 @@ class Config:
         arbitrary_types_allowed = True
 
 
+class WandbConf(BaseModel):
+    group: str
+    mode: str
+    entity: str
+
+
 class WatcherConf(BaseModel):
     """Configuration for watchers."""
 
@@ -37,11 +43,7 @@ class ProjectConf(BaseModel):
 
     name: str = "psycopt2d"
     seed: int
-    wandb_group: str
-    wandb_mode: str
-    wandb_entity: str
     watcher: WatcherConf
-    gpu: bool
 
 
 class DataConf(BaseModel):
@@ -64,12 +66,13 @@ class DataConf(BaseModel):
     min_lookahead_days: Optional[
         int
     ]  # (int): Drop all prediction times where (max timestamp in the dataset) - (current timestamp) is less than min_lookahead_days
-    min_lookbehind_days: Optional[int]
     drop_patient_if_outcome_before_date: Optional[Union[str, datetime]]
 
     # Looking behind
     # (int): Drop all prediction times where (prediction_timestamp) - (min timestamp in the dataset) is less than min_lookbehind_days
     min_prediction_time_date: Optional[Union[str, datetime]]
+    min_lookbehind_days: Optional[int]
+    max_lookbehind_days: Optional[int]
     lookbehind_combination: Optional[list[int]]
 
 
@@ -97,6 +100,8 @@ class TrainConf(BaseModel):
 
     n_splits: int  # ? How do we handle whether to use crossvalidation or train/val splitting?
     n_trials_per_lookdirection_combination: int
+    gpu: bool
+    active_trainers: int
 
 
 class EvalConf(BaseModel):
@@ -116,6 +121,7 @@ class EvalConf(BaseModel):
 class FullConfig(BaseModel):
     """A full configuration object."""
 
+    wandb: WandbConf
     project: ProjectConf
     data: DataConf
     preprocessing: PreprocessingConf
diff --git a/src/psycopt2d/utils/utils.py b/src/psycopt2d/utils/utils.py
index 33cfa289..3ad4bcb5 100644
--- a/src/psycopt2d/utils/utils.py
+++ b/src/psycopt2d/utils/utils.py
@@ -19,7 +19,7 @@
 
 from psycopt2d.configs import ModelEvalData
 from psycopt2d.model_performance import ModelPerformance
-from psycopt2d.utils.omegaconf_to_pydantic_objects import FullConfig
+from psycopt2d.utils.configs import FullConfig
 
 SHARED_RESOURCES_PATH = Path(r"E:\shared_resources")
 FEATURE_SETS_PATH = SHARED_RESOURCES_PATH / "feature_sets"
diff --git a/tests/test_load.py b/tests/test_load.py
index 58bafbdc..bf5a9f8f 100644
--- a/tests/test_load.py
+++ b/tests/test_load.py
@@ -2,7 +2,7 @@
 from hydra import compose, initialize
 
 from psycopt2d.load import load_train_and_val_from_cfg
-from psycopt2d.utils.omegaconf_to_pydantic_objects import omegaconf_to_pydantic_objects
+from psycopt2d.utils.configs import omegaconf_to_pydantic_objects
 
 
 def test_load_lookbehind_exceeds_lookbehind_threshold():
diff --git a/tests/test_train_model.py b/tests/test_train_model.py
index ef246e50..fcb4b1b4 100644
--- a/tests/test_train_model.py
+++ b/tests/test_train_model.py
@@ -5,7 +5,7 @@
 
 from psycopt2d.models import MODELS
 from psycopt2d.train_model import main
-from psycopt2d.utils.omegaconf_to_pydantic_objects import omegaconf_to_pydantic_objects
+from psycopt2d.utils.configs import omegaconf_to_pydantic_objects
 
 CONFIG_DIR_PATH = "../src/psycopt2d/config/"
 INTEGRATION_TEST_FILE_NAME = "integration_testing.yaml"
diff --git a/tests/test_utils.py b/tests/test_utils.py
index b5a40dbc..b772542e 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -8,7 +8,7 @@
 from hydra import compose, initialize
 from utils_for_testing import str_to_df
 
-from psycopt2d.utils.omegaconf_to_pydantic_objects import omegaconf_to_pydantic_objects
+from psycopt2d.utils.configs import omegaconf_to_pydantic_objects
 from psycopt2d.utils.utils import (
     PROJECT_ROOT,
     drop_records_if_datediff_days_smaller_than,

From 781b23f07dd58779873d8a43390e1a169bfa4e22 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <ryqiem@gmail.com>
Date: Fri, 21 Oct 2022 15:18:24 +0200
Subject: [PATCH 29/57] refactor: simplify functionality

---
 application/train_and_log_models.py           | 60 ++++++++-----------
 .../config/train/default_training.yaml        |  7 +--
 src/psycopt2d/train_model.py                  | 15 +++--
 src/psycopt2d/utils/configs.py                |  1 -
 4 files changed, 36 insertions(+), 47 deletions(-)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index fb2cb5ce..a22afc5a 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -83,18 +83,11 @@ def train_models_for_each_cell_in_grid(
 
     random.shuffle(lookbehind_combinations)
 
-    active_trainers: list[subprocess.Popen] = []
-
     wandb_prefix = f"{random_word.get_random_word()}-{random_word.get_random_word()}"
 
     while lookbehind_combinations:
-        # Loop to run if enough trainers have been spawned
-        if len(active_trainers) >= cfg.train.active_trainers:
-            active_trainers = [t for t in active_trainers if t.poll() is None]
-            time.sleep(1)
-            continue
-
         combination = lookbehind_combinations.pop()
+        watcher = start_watcher(cfg=cfg)
 
         msg.info(
             f"Spawning a new trainer with lookbehind={combination.lookbehind} and lookahead={combination.lookahead}",
@@ -104,15 +97,23 @@ def train_models_for_each_cell_in_grid(
             f"{wandb_prefix}-beh-{combination.lookbehind}-ahead-{combination.lookahead}"
         )
 
-        active_trainers.append(
-            start_trainer(
-                cfg=cfg,
-                config_file_name=config_file_name,
-                cell=combination,
-                wandb_group=wandb_group,
-            )
+        trainer = start_trainer(
+            cfg=cfg,
+            config_file_name=config_file_name,
+            cell=combination,
+            wandb_group=wandb_group,
+        )
+
+        while trainer.poll() is None:
+            time.sleep(1)
+
+        msg.good(
+            f"Training finished. Stopping the watcher in {cfg.project.watcher.keep_alive_after_training_minutes} minutes...",
         )
 
+        time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes)
+        watcher.kill()
+
 
 def start_trainer(
     cfg: FullConfig,
@@ -169,6 +170,16 @@ def start_watcher(cfg):
     )
 
 
+def load_cfg(config_file_name):
+    with initialize(version_base=None, config_path="../src/psycopt2d/config/"):
+        cfg = compose(
+            config_name=config_file_name,
+        )
+
+        cfg = omegaconf_to_pydantic_objects(cfg)
+    return cfg
+
+
 def main():
     msg = Printer(timestamp=True)
 
@@ -177,7 +188,6 @@ def main():
     cfg = load_cfg(config_file_name=config_file_name)
     # TODO: Watcher must be instantiated once for each cell in the grid, otherwise
     # it will compare max performances across all cells.
-    watcher = start_watcher(cfg)
     train = load_train_for_inference(cfg=cfg)
     possible_look_distances = infer_possible_look_distances(df=train)
 
@@ -200,24 +210,6 @@ def main():
         config_file_name=config_file_name,
     )
 
-    msg.good(
-        f"Training finished. Stopping the watcher in {cfg.project.watcher.keep_alive_after_training_minutes} minutes...",
-    )
-
-    time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes)
-    watcher.kill()
-    msg.good("Watcher stopped.")
-
-
-def load_cfg(config_file_name):
-    with initialize(version_base=None, config_path="../src/psycopt2d/config/"):
-        cfg = compose(
-            config_name=config_file_name,
-        )
-
-        cfg = omegaconf_to_pydantic_objects(cfg)
-    return cfg
-
 
 if __name__ == "__main__":
     main()
diff --git a/src/psycopt2d/config/train/default_training.yaml b/src/psycopt2d/config/train/default_training.yaml
index 84a98c92..476a0e11 100644
--- a/src/psycopt2d/config/train/default_training.yaml
+++ b/src/psycopt2d/config/train/default_training.yaml
@@ -1,4 +1,3 @@
-n_splits: 2 # (int, Null): Number of k-folds during CV. If Null, loads pre-defined dataset.
-n_trials_per_lookdirection_combination: 1
-gpu: false
-active_trainers: 4
\ No newline at end of file
+n_splits: 3 # (int, Null): Number of k-folds during CV. If Null, loads pre-defined dataset.
+n_trials_per_lookdirection_combination: 10
+gpu: true
\ No newline at end of file
diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py
index 84fb1328..0f84284a 100644
--- a/src/psycopt2d/train_model.py
+++ b/src/psycopt2d/train_model.py
@@ -20,13 +20,10 @@
 from psycopt2d.load import load_train_and_val_from_cfg
 from psycopt2d.models import MODELS
 from psycopt2d.utils.configs import FullConfig, omegaconf_to_pydantic_objects
-from psycopt2d.utils.utils import (
-    PROJECT_ROOT,
-    create_wandb_folders,
-    flatten_nested_dict,
-    get_feature_importance_dict,
-    prediction_df_with_metadata_to_disk,
-)
+from psycopt2d.utils.utils import (PROJECT_ROOT, create_wandb_folders,
+                                   flatten_nested_dict,
+                                   get_feature_importance_dict,
+                                   prediction_df_with_metadata_to_disk)
 
 CONFIG_PATH = PROJECT_ROOT / "src" / "psycopt2d" / "config"
 
@@ -359,7 +356,9 @@ def main(cfg: Union[FullConfig, DictConfig]):
     )
 
     msg.info(f"ROC AUC: {roc_auc}")
-    run.log({"roc_auc_unweighted": roc_auc})
+    run.log({"roc_auc_unweighted": roc_auc,
+             "lookbehind": cfg.data.lookbehind_days,
+             "lookahead": cfg.data.lookahead_days,})
     run.finish()
     return roc_auc
 
diff --git a/src/psycopt2d/utils/configs.py b/src/psycopt2d/utils/configs.py
index 58a58e66..4a198dd9 100644
--- a/src/psycopt2d/utils/configs.py
+++ b/src/psycopt2d/utils/configs.py
@@ -101,7 +101,6 @@ class TrainConf(BaseModel):
     n_splits: int  # ? How do we handle whether to use crossvalidation or train/val splitting?
     n_trials_per_lookdirection_combination: int
     gpu: bool
-    active_trainers: int
 
 
 class EvalConf(BaseModel):

From cba13fa64d0b389ae65ed01fe46be627504e8e9d Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <ryqiem@gmail.com>
Date: Fri, 21 Oct 2022 15:19:40 +0200
Subject: [PATCH 30/57] style: linting

---
 application/train_and_log_models.py | 110 ++++++++++++++--------------
 src/psycopt2d/train_model.py        |  21 ++++--
 2 files changed, 71 insertions(+), 60 deletions(-)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index a22afc5a..6af1ee3a 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -64,63 +64,13 @@ class LookDirectionCombination(BaseModel):
     lookahead: int
 
 
-def train_models_for_each_cell_in_grid(
-    cfg: FullConfig,
-    possible_look_distances: PossibleLookDistanceDays,
-    config_file_name: str,
-):
-    """Train a model for each cell in the grid of possible look distances."""
-    from random_word import RandomWords
-
-    random_word = RandomWords()
-
-    # Create all combinations of lookbehind and lookahead days
-    lookbehind_combinations = [
-        LookDirectionCombination(lookbehind=lookbehind, lookahead=lookahead)
-        for lookbehind in possible_look_distances.behind
-        for lookahead in possible_look_distances.ahead
-    ]
-
-    random.shuffle(lookbehind_combinations)
-
-    wandb_prefix = f"{random_word.get_random_word()}-{random_word.get_random_word()}"
-
-    while lookbehind_combinations:
-        combination = lookbehind_combinations.pop()
-        watcher = start_watcher(cfg=cfg)
-
-        msg.info(
-            f"Spawning a new trainer with lookbehind={combination.lookbehind} and lookahead={combination.lookahead}",
-        )
-
-        wandb_group = (
-            f"{wandb_prefix}-beh-{combination.lookbehind}-ahead-{combination.lookahead}"
-        )
-
-        trainer = start_trainer(
-            cfg=cfg,
-            config_file_name=config_file_name,
-            cell=combination,
-            wandb_group=wandb_group,
-        )
-
-        while trainer.poll() is None:
-            time.sleep(1)
-
-        msg.good(
-            f"Training finished. Stopping the watcher in {cfg.project.watcher.keep_alive_after_training_minutes} minutes...",
-        )
-
-        time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes)
-        watcher.kill()
-
-
 def start_trainer(
     cfg: FullConfig,
     config_file_name: str,
     cell: LookDirectionCombination,
     wandb_group: str,
-):
+) -> subprocess.Popen:
+    """Start a trainer"""
     subprocess_args: list[str] = [
         "python",
         "src/psycopt2d/train_model.py",
@@ -147,7 +97,8 @@ def start_trainer(
     )
 
 
-def start_watcher(cfg):
+def start_watcher(cfg: FullConfig) -> subprocess.Popen:
+    """Start a watcher"""
     return subprocess.Popen(  # pylint: disable=consider-using-with
         [
             "python",
@@ -170,7 +121,59 @@ def start_watcher(cfg):
     )
 
 
+def train_models_for_each_cell_in_grid(
+    cfg: FullConfig,
+    possible_look_distances: PossibleLookDistanceDays,
+    config_file_name: str,
+):
+    """Train a model for each cell in the grid of possible look distances."""
+    from random_word import RandomWords
+
+    random_word = RandomWords()
+
+    # Create all combinations of lookbehind and lookahead days
+    lookbehind_combinations = [
+        LookDirectionCombination(lookbehind=lookbehind, lookahead=lookahead)
+        for lookbehind in possible_look_distances.behind
+        for lookahead in possible_look_distances.ahead
+    ]
+
+    random.shuffle(lookbehind_combinations)
+
+    wandb_prefix = f"{random_word.get_random_word()}-{random_word.get_random_word()}"
+
+    while lookbehind_combinations:
+        combination = lookbehind_combinations.pop()
+        watcher = start_watcher(cfg=cfg)
+
+        msg.info(
+            f"Spawning a new trainer with lookbehind={combination.lookbehind} and lookahead={combination.lookahead}",
+        )
+
+        wandb_group = (
+            f"{wandb_prefix}-beh-{combination.lookbehind}-ahead-{combination.lookahead}"
+        )
+
+        trainer = start_trainer(
+            cfg=cfg,
+            config_file_name=config_file_name,
+            cell=combination,
+            wandb_group=wandb_group,
+        )
+
+        while trainer.poll() is None:
+            time.sleep(1)
+
+        msg.good(
+            f"Training finished. Stopping the watcher in {cfg.project.watcher.keep_alive_after_training_minutes} minutes...",
+        )
+
+        time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes)
+        watcher.kill()
+
+
 def load_cfg(config_file_name):
+    """Load config as pydantic object"""
     with initialize(version_base=None, config_path="../src/psycopt2d/config/"):
         cfg = compose(
             config_name=config_file_name,
@@ -181,6 +184,7 @@ def load_cfg(config_file_name):
 
 
 def main():
+    """Main"""
     msg = Printer(timestamp=True)
 
     config_file_name = "integration_testing.yaml"
diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py
index 0f84284a..370aeee1 100644
--- a/src/psycopt2d/train_model.py
+++ b/src/psycopt2d/train_model.py
@@ -20,10 +20,13 @@
 from psycopt2d.load import load_train_and_val_from_cfg
 from psycopt2d.models import MODELS
 from psycopt2d.utils.configs import FullConfig, omegaconf_to_pydantic_objects
-from psycopt2d.utils.utils import (PROJECT_ROOT, create_wandb_folders,
-                                   flatten_nested_dict,
-                                   get_feature_importance_dict,
-                                   prediction_df_with_metadata_to_disk)
+from psycopt2d.utils.utils import (
+    PROJECT_ROOT,
+    create_wandb_folders,
+    flatten_nested_dict,
+    get_feature_importance_dict,
+    prediction_df_with_metadata_to_disk,
+)
 
 CONFIG_PATH = PROJECT_ROOT / "src" / "psycopt2d" / "config"
 
@@ -356,9 +359,13 @@ def main(cfg: Union[FullConfig, DictConfig]):
     )
 
     msg.info(f"ROC AUC: {roc_auc}")
-    run.log({"roc_auc_unweighted": roc_auc,
-             "lookbehind": cfg.data.lookbehind_days,
-             "lookahead": cfg.data.lookahead_days,})
+    run.log(
+        {
+            "roc_auc_unweighted": roc_auc,
+            "lookbehind": cfg.data.lookbehind_days,
+            "lookahead": cfg.data.lookahead_days,
+        }
+    )
     run.finish()
     return roc_auc
 

From 5df4d85e622216244cadb83f46bfee650b2488f6 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <ryqiem@gmail.com>
Date: Fri, 21 Oct 2022 15:20:00 +0200
Subject: [PATCH 31/57] style: linting

---
 application/train_and_log_models.py | 8 ++++----
 src/psycopt2d/train_model.py        | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index 6af1ee3a..8f881cb2 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -70,7 +70,7 @@ def start_trainer(
     cell: LookDirectionCombination,
     wandb_group: str,
 ) -> subprocess.Popen:
-    """Start a trainer"""
+    """Start a trainer."""
     subprocess_args: list[str] = [
         "python",
         "src/psycopt2d/train_model.py",
@@ -98,7 +98,7 @@ def start_trainer(
 
 
 def start_watcher(cfg: FullConfig) -> subprocess.Popen:
-    """Start a watcher"""
+    """Start a watcher."""
     return subprocess.Popen(  # pylint: disable=consider-using-with
         [
             "python",
@@ -173,7 +173,7 @@ def train_models_for_each_cell_in_grid(
 
 
 def load_cfg(config_file_name):
-    """Load config as pydantic object"""
+    """Load config as pydantic object."""
     with initialize(version_base=None, config_path="../src/psycopt2d/config/"):
         cfg = compose(
             config_name=config_file_name,
@@ -184,7 +184,7 @@ def load_cfg(config_file_name):
 
 
 def main():
-    """Main"""
+    """Main."""
     msg = Printer(timestamp=True)
 
     config_file_name = "integration_testing.yaml"
diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py
index 370aeee1..48f326d5 100644
--- a/src/psycopt2d/train_model.py
+++ b/src/psycopt2d/train_model.py
@@ -364,7 +364,7 @@ def main(cfg: Union[FullConfig, DictConfig]):
             "roc_auc_unweighted": roc_auc,
             "lookbehind": cfg.data.lookbehind_days,
             "lookahead": cfg.data.lookahead_days,
-        }
+        },
     )
     run.finish()
     return roc_auc

From 091a3c02af4150dcccb46c60753f0b7380e5f588 Mon Sep 17 00:00:00 2001
From: Lasse <lasseh0310@gmail.com>
Date: Sat, 22 Oct 2022 09:48:46 +0200
Subject: [PATCH 32/57] feat: make watcher store separate max performance per
 lookbehind/lookahead combination

---
 src/psycopt2d/model_training_watcher.py | 127 +++++++++++++++++-------
 1 file changed, 89 insertions(+), 38 deletions(-)

diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py
index 936200e5..270f38c4 100644
--- a/src/psycopt2d/model_training_watcher.py
+++ b/src/psycopt2d/model_training_watcher.py
@@ -2,11 +2,13 @@
 import argparse
 import subprocess
 import time
+from collections import defaultdict
 from distutils.util import strtobool  # pylint: disable=deprecated-module
 from pathlib import Path
-from typing import Optional
+from typing import Any, Optional
 
 import wandb
+from pydantic import BaseModel
 from wandb.apis.public import Api  # pylint: disable=no-name-in-module
 from wandb.sdk.wandb_run import Run  # pylint: disable=no-name-in-module
 from wasabi import msg
@@ -25,7 +27,24 @@
 WANDB_DIR = PROJECT_ROOT / "wandb"
 
 
-class ModelTrainingWatcher:
+class RunInformation(BaseModel):
+    """Information about a wandb run."""
+
+    run_id: str
+    auc: float
+    lookbehind_days: int
+    lookahead_days: int
+    lookahead_lookbehind_combined: Optional[str] = None
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        if self.lookahead_lookbehind_combined is None:
+            self.lookahead_lookbehind_combined = (
+                f"lookahead:{self.lookahead_days}_lookbehind:{self.lookbehind_days}"
+            )
+
+
+class ModelTrainingWatcher:  # pylint: disable=too-many-instance-attributes
     """Watch the wandb directory for new files and uploads them to wandb. Fully
     evaluates the best runs after a certain number of runs have been uploaded.
 
@@ -57,7 +76,8 @@ def __init__(
         self.verbose = verbose
         # A queue for runs waiting to be uploaded to WandB
         self.run_id_eval_candidates_queue: list[str] = []
-        self.max_performance = 0.0
+        # max performance by lookbehind/-ahead combination
+        self.max_performances: dict[str, float] = defaultdict(lambda: 0.0)
 
         self.archive_path = WANDB_DIR / "archive"
         self.archive_path.mkdir(exist_ok=True, parents=True)
@@ -81,7 +101,11 @@ def get_new_runs_and_evaluate(self) -> None:
         self.upload_unarchived_runs()
 
         if len(self.run_id_eval_candidates_queue) >= self.n_runs_before_eval:
-            self.evaluate_and_upload_records_and_archive()
+            run_information = self._get_run_information_for_all_in_queue()
+            self.run_id_eval_candidates_queue = self._get_unfinished_runs(
+                run_information=run_information
+            )
+            self._evaluate_and_archive_finished_runs(run_information=run_information)
 
     def _upload_run_dir(self, run_dir: Path) -> str:
         """Upload a single run to wandb."""
@@ -105,20 +129,24 @@ def _get_run_id(self, run_dir: Path) -> str:
         return run_dir.name.split("-")[-1]
 
     def upload_unarchived_runs(self) -> None:
-        """Upload unarchived runs to wandb."""
+        """Upload unarchived runs to wandb. Only adds runs that have finished
+        training to the evaluation queue.
+
+        Raises:
+            ValueError: If wandb sync failed
+        """
         for run_folder in WANDB_DIR.glob(r"offline-run*"):
             run_id = self._get_run_id(run_folder)
 
             wandb_sync_stdout = self._upload_run_dir(run_folder)
 
             if "...done" not in wandb_sync_stdout:
-                if ".wandb file is empty" in wandb_sync_stdout:
-                    if self.verbose:
-                        msg.warn(f"Run {run_id} is still running. Skipping.")
-                else:
+                if ".wandb file is empty" not in wandb_sync_stdout:
                     raise ValueError(
                         f"wandb sync failed, returned: {wandb_sync_stdout}",
                     )
+                if self.verbose:
+                    msg.warn(f"Run {run_id} is still running. Skipping.")
                 continue
 
             self.run_id_eval_candidates_queue.append(run_id)
@@ -161,44 +189,67 @@ def _get_wandb_run(self, run_id: str) -> Run:
     def _get_run_wandb_dir(self, run_id: str) -> Path:
         return list(WANDB_DIR.glob(f"*offline-run*{run_id}*"))[0]
 
-    def _get_run_performance(self, run_id: str) -> Optional[float]:
-        """Get the performance of a single run and check if it failed."""
-        run = self._get_wandb_run(run_id)
-        if "roc_auc_unweighted" in run.summary:
-            return run.summary.roc_auc_unweighted
+    def _get_run_attribute(self, run: Run, attribute: str) -> Any:
+        """Get an attribute from a wandb run."""
+        if attribute in run.summary:
+            return run.summary[attribute]
         if self.verbose:
             msg.info(
-                f"Watcher: Run {run_id} has no performance metric. Pinging again at next eval time.",
+                f"Run {run.id} has no attribute {attribute}. Pinging again at next eval time."
             )
         return None
 
-    def evaluate_and_upload_records_and_archive(self) -> None:
-        """Evaluate the best runs."""
-        run_performances = {
-            run_id: self._get_run_performance(run_id)
+    def _evaluate_and_archive_finished_runs(
+        self, run_information: list[RunInformation]
+    ) -> None:
+        """Evaluate the finished runs. Test their performance against the current
+        maximum for each lookbehind/-ahead days, and fully evaluate the best performing.
+        Move all wandb run dirs to the archive folder."""
+        finished_runs = [
+            run_info for run_info in run_information if run_info.auc is not None
+        ]
+
+        for run_info in finished_runs:
+            if (
+                run_info.auc
+                > self.max_performances[run_info.lookbehind_lookahead_combination]
+            ):
+                msg.good(
+                    f"New record performance for {run_info.lookbehind_lookahead_combination}! AUC: {run_info.auc}"
+                )
+                self.max_performances[
+                    run_info.loobehind_lookhead_combination
+                ] = run_info.auc
+                self._do_evaluation(run_info.run_id)
+            self._archive_run_dir(run_dir=self._get_run_wandb_dir(run_info.run_id))
+
+    def _get_unfinished_runs(self, run_information: list[RunInformation]) -> list[str]:
+        """Get the run ids of the unfinished runs."""
+        return [run_info.run_id for run_info in run_information if run_info.auc is None]
+
+    def _get_run_information_for_all_in_queue(self):
+        """Get the performance and information of all runs in the evaluation queue
+        and sort by lookahead/lookbehind combination and AUC for faster uploading."""
+        return [
+            self._get_run_information(run_id)
             for run_id in self.run_id_eval_candidates_queue
-        }
-        # sort runs by performance to not upload subpar runs
-        run_performances = dict(
-            sorted(
-                run_performances.items(),
-                key=lambda item: (item[1] is not None, item[1]),
-                reverse=True,
+        ].sort(
+            key=lambda run_info: (
+                run_info.lookahead_lookbehind_combined,
+                run_info.auc,
             ),
+            reverse=True,
         )
-        # get runs with auc of None (attempted upload before run finished)
-        unfinished_runs = [
-            run_id for run_id, auc in run_performances.items() if auc is None
-        ]
 
-        for run_id, performance in run_performances.items():
-            if performance is not None and performance > self.max_performance:
-                msg.good(f"New record performance! AUC: {performance}")
-                self.max_performance = performance
-                self._do_evaluation(run_id)
-            self._archive_run_dir(run_dir=self._get_run_wandb_dir(run_id))
-        # reset run id queue and try to upload unfinished runs next time
-        self.run_id_eval_candidates_queue = unfinished_runs
+    def _get_run_information(self, run_id: str) -> RunInformation:
+        """Get the run information for a single run."""
+        run = self._get_wandb_run(run_id)
+        return RunInformation(
+            run_id=run_id,
+            auc=self._get_run_attribute(run, "roc_auc_unweighted"),
+            lookbehind_days=self._get_run_attribute(run, "lookbehind_days"),
+            lookahead_days=self._get_run_attribute(run, "lookahead_days"),
+        )
 
     def archive_all_runs(self) -> None:
         """Archive all runs in the wandb directory."""

From 66c6e9424acae535c52b37ce9d5efbc0f092ddce Mon Sep 17 00:00:00 2001
From: Lasse <lasseh0310@gmail.com>
Date: Sat, 22 Oct 2022 09:50:06 +0200
Subject: [PATCH 33/57] chore: linting

---
 src/psycopt2d/model_training_watcher.py | 139 ++++++++++++------------
 1 file changed, 72 insertions(+), 67 deletions(-)

diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py
index 270f38c4..fb23ad33 100644
--- a/src/psycopt2d/model_training_watcher.py
+++ b/src/psycopt2d/model_training_watcher.py
@@ -96,61 +96,10 @@ def watch(self, timeout_minutes: Optional[int] = None) -> None:
             self.get_new_runs_and_evaluate()
             time.sleep(1)
 
-    def get_new_runs_and_evaluate(self) -> None:
-        """Get new runs and evaluate the best runs."""
-        self.upload_unarchived_runs()
-
-        if len(self.run_id_eval_candidates_queue) >= self.n_runs_before_eval:
-            run_information = self._get_run_information_for_all_in_queue()
-            self.run_id_eval_candidates_queue = self._get_unfinished_runs(
-                run_information=run_information
-            )
-            self._evaluate_and_archive_finished_runs(run_information=run_information)
-
-    def _upload_run_dir(self, run_dir: Path) -> str:
-        """Upload a single run to wandb."""
-        # get stdout from subprocess.run
-        proc = subprocess.run(
-            ["wandb", "sync", str(run_dir), "--project", self.project_name],
-            check=True,
-            capture_output=True,
-        )
-        stdout = proc.stdout.decode("utf-8")
-        if self.verbose:
-            msg.info(f"Watcher: {stdout}")
-        return stdout
-
     def _archive_run_dir(self, run_dir: Path) -> None:
         """Move a run to the archive folder."""
         run_dir.rename(target=self.archive_path / run_dir.name)
 
-    def _get_run_id(self, run_dir: Path) -> str:
-        """Get the run id from a run directory."""
-        return run_dir.name.split("-")[-1]
-
-    def upload_unarchived_runs(self) -> None:
-        """Upload unarchived runs to wandb. Only adds runs that have finished
-        training to the evaluation queue.
-
-        Raises:
-            ValueError: If wandb sync failed
-        """
-        for run_folder in WANDB_DIR.glob(r"offline-run*"):
-            run_id = self._get_run_id(run_folder)
-
-            wandb_sync_stdout = self._upload_run_dir(run_folder)
-
-            if "...done" not in wandb_sync_stdout:
-                if ".wandb file is empty" not in wandb_sync_stdout:
-                    raise ValueError(
-                        f"wandb sync failed, returned: {wandb_sync_stdout}",
-                    )
-                if self.verbose:
-                    msg.warn(f"Run {run_id} is still running. Skipping.")
-                continue
-
-            self.run_id_eval_candidates_queue.append(run_id)
-
     def _get_run_evaluation_data_dir(self, run_id: str) -> Path:
         """Get the evaluation path for a single run."""
         return list(self.model_data_dir.glob(f"*{run_id}*"))[0]
@@ -195,16 +144,20 @@ def _get_run_attribute(self, run: Run, attribute: str) -> Any:
             return run.summary[attribute]
         if self.verbose:
             msg.info(
-                f"Run {run.id} has no attribute {attribute}. Pinging again at next eval time."
+                f"Run {run.id} has no attribute {attribute}. Pinging again at next eval time.",
             )
         return None
 
     def _evaluate_and_archive_finished_runs(
-        self, run_information: list[RunInformation]
+        self,
+        run_information: list[RunInformation],
     ) -> None:
-        """Evaluate the finished runs. Test their performance against the current
-        maximum for each lookbehind/-ahead days, and fully evaluate the best performing.
-        Move all wandb run dirs to the archive folder."""
+        """Evaluate the finished runs.
+
+        Test their performance against the current maximum for each
+        lookbehind/-ahead days, and fully evaluate the best performing.
+        Move all wandb run dirs to the archive folder.
+        """
         finished_runs = [
             run_info for run_info in run_information if run_info.auc is not None
         ]
@@ -215,7 +168,7 @@ def _evaluate_and_archive_finished_runs(
                 > self.max_performances[run_info.lookbehind_lookahead_combination]
             ):
                 msg.good(
-                    f"New record performance for {run_info.lookbehind_lookahead_combination}! AUC: {run_info.auc}"
+                    f"New record performance for {run_info.lookbehind_lookahead_combination}! AUC: {run_info.auc}",
                 )
                 self.max_performances[
                     run_info.loobehind_lookhead_combination
@@ -227,9 +180,20 @@ def _get_unfinished_runs(self, run_information: list[RunInformation]) -> list[st
         """Get the run ids of the unfinished runs."""
         return [run_info.run_id for run_info in run_information if run_info.auc is None]
 
+    def _get_run_information(self, run_id: str) -> RunInformation:
+        """Get the run information for a single run."""
+        run = self._get_wandb_run(run_id)
+        return RunInformation(
+            run_id=run_id,
+            auc=self._get_run_attribute(run, "roc_auc_unweighted"),
+            lookbehind_days=self._get_run_attribute(run, "lookbehind_days"),
+            lookahead_days=self._get_run_attribute(run, "lookahead_days"),
+        )
+
     def _get_run_information_for_all_in_queue(self):
-        """Get the performance and information of all runs in the evaluation queue
-        and sort by lookahead/lookbehind combination and AUC for faster uploading."""
+        """Get the performance and information of all runs in the evaluation
+        queue and sort by lookahead/lookbehind combination and AUC for faster
+        uploading."""
         return [
             self._get_run_information(run_id)
             for run_id in self.run_id_eval_candidates_queue
@@ -241,15 +205,56 @@ def _get_run_information_for_all_in_queue(self):
             reverse=True,
         )
 
-    def _get_run_information(self, run_id: str) -> RunInformation:
-        """Get the run information for a single run."""
-        run = self._get_wandb_run(run_id)
-        return RunInformation(
-            run_id=run_id,
-            auc=self._get_run_attribute(run, "roc_auc_unweighted"),
-            lookbehind_days=self._get_run_attribute(run, "lookbehind_days"),
-            lookahead_days=self._get_run_attribute(run, "lookahead_days"),
+    def get_new_runs_and_evaluate(self) -> None:
+        """Get new runs and evaluate the best runs."""
+        self.upload_unarchived_runs()
+
+        if len(self.run_id_eval_candidates_queue) >= self.n_runs_before_eval:
+            run_information = self._get_run_information_for_all_in_queue()
+            self.run_id_eval_candidates_queue = self._get_unfinished_runs(
+                run_information=run_information,
+            )
+            self._evaluate_and_archive_finished_runs(run_information=run_information)
+
+    def _upload_run_dir(self, run_dir: Path) -> str:
+        """Upload a single run to wandb."""
+        # get stdout from subprocess.run
+        proc = subprocess.run(
+            ["wandb", "sync", str(run_dir), "--project", self.project_name],
+            check=True,
+            capture_output=True,
         )
+        stdout = proc.stdout.decode("utf-8")
+        if self.verbose:
+            msg.info(f"Watcher: {stdout}")
+        return stdout
+
+    def _get_run_id(self, run_dir: Path) -> str:
+        """Get the run id from a run directory."""
+        return run_dir.name.split("-")[-1]
+
+    def upload_unarchived_runs(self) -> None:
+        """Upload unarchived runs to wandb. Only adds runs that have finished
+        training to the evaluation queue.
+
+        Raises:
+            ValueError: If wandb sync failed
+        """
+        for run_folder in WANDB_DIR.glob(r"offline-run*"):
+            run_id = self._get_run_id(run_folder)
+
+            wandb_sync_stdout = self._upload_run_dir(run_folder)
+
+            if "...done" not in wandb_sync_stdout:
+                if ".wandb file is empty" not in wandb_sync_stdout:
+                    raise ValueError(
+                        f"wandb sync failed, returned: {wandb_sync_stdout}",
+                    )
+                if self.verbose:
+                    msg.warn(f"Run {run_id} is still running. Skipping.")
+                continue
+
+            self.run_id_eval_candidates_queue.append(run_id)
 
     def archive_all_runs(self) -> None:
         """Archive all runs in the wandb directory."""

From 9beedfca0bcf6281f18f096ebc5924d3f6f95030 Mon Sep 17 00:00:00 2001
From: HLasse <lasseh0310@gmail.com>
Date: Sat, 22 Oct 2022 12:14:54 +0200
Subject: [PATCH 34/57] fix: correct output if only 1 outcome col

---
 src/psycopt2d/load.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py
index 6e08b807..2523ddf9 100644
--- a/src/psycopt2d/load.py
+++ b/src/psycopt2d/load.py
@@ -369,10 +369,13 @@ def _keep_unique_outcome_col_with_lookahead_days_matching_conf(
         """Keep only one outcome column with the same lookahead days as set in
         the config."""
         outcome_cols = infer_outcome_col_name(df=dataset, allow_multiple=True)
+        # if only one outcome column, return
+        if isinstance(outcome_cols, str):
+            return dataset
+        
         col_to_drop = [
             c for c in outcome_cols if str(self.cfg.data.lookahead_days) not in c
         ]
-
         df = dataset.drop(col_to_drop, axis=1)
 
         if not isinstance(infer_outcome_col_name(df), str):

From 615b17c7be9634a286666785079f1443eceb9751 Mon Sep 17 00:00:00 2001
From: HLasse <lasseh0310@gmail.com>
Date: Sat, 22 Oct 2022 12:15:29 +0200
Subject: [PATCH 35/57] fix: various bugs in watcher

---
 src/psycopt2d/model_training_watcher.py | 70 +++++++++++++------------
 1 file changed, 37 insertions(+), 33 deletions(-)

diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py
index fb23ad33..7aa05f78 100644
--- a/src/psycopt2d/model_training_watcher.py
+++ b/src/psycopt2d/model_training_watcher.py
@@ -5,7 +5,7 @@
 from collections import defaultdict
 from distutils.util import strtobool  # pylint: disable=deprecated-module
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any, Optional, Union
 
 import wandb
 from pydantic import BaseModel
@@ -30,18 +30,19 @@
 class RunInformation(BaseModel):
     """Information about a wandb run."""
 
-    run_id: str
-    auc: float
-    lookbehind_days: int
-    lookahead_days: int
+    run_id: Optional[str]
+    auc: Optional[float]
+    lookbehind_days: Optional[Union[int, list[int]]]
+    lookahead_days: Optional[int]
     lookahead_lookbehind_combined: Optional[str] = None
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-        if self.lookahead_lookbehind_combined is None:
-            self.lookahead_lookbehind_combined = (
-                f"lookahead:{self.lookahead_days}_lookbehind:{self.lookbehind_days}"
-            )
+        if (
+            self.lookahead_lookbehind_combined is None
+            and self.lookbehind_days is not None
+        ):
+            self.lookahead_lookbehind_combined = f"lookahead:{str(self.lookahead_days)}_lookbehind:{str(self.lookbehind_days)}"
 
 
 class ModelTrainingWatcher:  # pylint: disable=too-many-instance-attributes
@@ -161,20 +162,29 @@ def _evaluate_and_archive_finished_runs(
         finished_runs = [
             run_info for run_info in run_information if run_info.auc is not None
         ]
+        # sort to only upload the best in in each group
+        finished_runs.sort(
+            key=lambda run_info: (
+                run_info.lookahead_lookbehind_combined,
+                run_info.auc,
+            ),
+            reverse=True,
+        )
 
-        for run_info in finished_runs:
-            if (
-                run_info.auc
-                > self.max_performances[run_info.lookbehind_lookahead_combination]
-            ):
-                msg.good(
-                    f"New record performance for {run_info.lookbehind_lookahead_combination}! AUC: {run_info.auc}",
-                )
-                self.max_performances[
-                    run_info.loobehind_lookhead_combination
-                ] = run_info.auc
-                self._do_evaluation(run_info.run_id)
-            self._archive_run_dir(run_dir=self._get_run_wandb_dir(run_info.run_id))
+        if finished_runs:
+            for run_info in finished_runs:
+                if (
+                    run_info.auc
+                    > self.max_performances[run_info.lookahead_lookbehind_combined]
+                ):
+                    msg.good(
+                        f"New record performance for {run_info.lookahead_lookbehind_combined}! AUC: {run_info.auc}",
+                    )
+                    self.max_performances[
+                        run_info.lookahead_lookbehind_combined
+                    ] = run_info.auc
+                    self._do_evaluation(run_info.run_id)
+                self._archive_run_dir(run_dir=self._get_run_wandb_dir(run_info.run_id))
 
     def _get_unfinished_runs(self, run_information: list[RunInformation]) -> list[str]:
         """Get the run ids of the unfinished runs."""
@@ -186,24 +196,17 @@ def _get_run_information(self, run_id: str) -> RunInformation:
         return RunInformation(
             run_id=run_id,
             auc=self._get_run_attribute(run, "roc_auc_unweighted"),
-            lookbehind_days=self._get_run_attribute(run, "lookbehind_days"),
-            lookahead_days=self._get_run_attribute(run, "lookahead_days"),
+            lookbehind_days=self._get_run_attribute(run, "lookbehind"),
+            lookahead_days=self._get_run_attribute(run, "lookahead"),
         )
 
     def _get_run_information_for_all_in_queue(self):
         """Get the performance and information of all runs in the evaluation
-        queue and sort by lookahead/lookbehind combination and AUC for faster
-        uploading."""
+        queue."""
         return [
             self._get_run_information(run_id)
             for run_id in self.run_id_eval_candidates_queue
-        ].sort(
-            key=lambda run_info: (
-                run_info.lookahead_lookbehind_combined,
-                run_info.auc,
-            ),
-            reverse=True,
-        )
+        ]
 
     def get_new_runs_and_evaluate(self) -> None:
         """Get new runs and evaluate the best runs."""
@@ -323,6 +326,7 @@ def float_or_none(arg: str) -> Optional[float]:
         model_data_dir=model_data_dir,
         verbose=args.verbose,
     )
+
     if args.clean_wandb_dir:
         watcher.archive_all_runs()
 

From a350443f0f8c3ce5c1e865267ed1cc9b0a9cf251 Mon Sep 17 00:00:00 2001
From: HLasse <lasseh0310@gmail.com>
Date: Sat, 22 Oct 2022 12:16:14 +0200
Subject: [PATCH 36/57] fix: log correct lookbehind in trainer

---
 src/psycopt2d/train_model.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py
index 48f326d5..a3a64f1c 100644
--- a/src/psycopt2d/train_model.py
+++ b/src/psycopt2d/train_model.py
@@ -315,8 +315,8 @@ def main(cfg: Union[FullConfig, DictConfig]):
         project=cfg.project.name,
         reinit=True,
         config=flatten_nested_dict(cfg.__dict__, sep="."),
-        mode=cfg.project.wandb_mode,
-        group=cfg.project.wandb_group,
+        mode=cfg.project.wandb.mode,
+        group=cfg.project.wandb.group,
     )
 
     dataset = load_train_and_val_from_cfg(cfg)
@@ -342,7 +342,7 @@ def main(cfg: Union[FullConfig, DictConfig]):
 
     # only run full evaluation if wandb mode mode is online
     # otherwise delegate to watcher script
-    if cfg.project.wandb_mode == "run":
+    if cfg.project.wandb.mode == "run":
         msg.info("Evaluating model")
         evaluate_model(
             cfg=cfg,
@@ -362,7 +362,7 @@ def main(cfg: Union[FullConfig, DictConfig]):
     run.log(
         {
             "roc_auc_unweighted": roc_auc,
-            "lookbehind": cfg.data.lookbehind_days,
+            "lookbehind": cfg.data.lookbehind_combination,
             "lookahead": cfg.data.lookahead_days,
         },
     )

From b528c38379c8a50035611a8d166532fe218c9822 Mon Sep 17 00:00:00 2001
From: HLasse <lasseh0310@gmail.com>
Date: Sat, 22 Oct 2022 12:16:31 +0200
Subject: [PATCH 37/57] misc: to get things running

---
 application/train_and_log_models.py           | 23 +++++++++----------
 src/psycopt2d/config/data/synth_data.yaml     | 21 +++++++++++------
 .../project/integration_test_project.yaml     |  7 +++---
 src/psycopt2d/train_and_log_models.py         |  2 +-
 src/psycopt2d/utils/configs.py                |  2 +-
 5 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index 8f881cb2..e58eeec3 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -77,9 +77,9 @@ def start_trainer(
         f"model={cfg.model.model_name}",
         f"data.min_lookbehind_days={cell.lookbehind}",
         f"data.min_lookahead_days={cell.lookahead}",
-        f"project.wandb_group='{wandb_group}'",
+        f"project.wandb.group='{wandb_group}'",
         f"hydra.sweeper.n_trials={cfg.train.n_trials_per_lookdirection_combination}",
-        f"project.wandb_mode={cfg.project.wandb_mode}",
+        f"project.wandb.mode={cfg.project.wandb.mode}",
         "--config-name",
         f"{config_file_name}",
     ]
@@ -104,7 +104,7 @@ def start_watcher(cfg: FullConfig) -> subprocess.Popen:
             "python",
             "src/psycopt2d/model_training_watcher.py",
             "--entity",
-            cfg.project.wandb_entity,
+            cfg.project.wandb.entity,
             "--project_name",
             cfg.project.name,
             "--n_runs_before_eval",
@@ -141,10 +141,9 @@ def train_models_for_each_cell_in_grid(
     random.shuffle(lookbehind_combinations)
 
     wandb_prefix = f"{random_word.get_random_word()}-{random_word.get_random_word()}"
-
+    watcher = start_watcher(cfg=cfg)
     while lookbehind_combinations:
         combination = lookbehind_combinations.pop()
-        watcher = start_watcher(cfg=cfg)
 
         msg.info(
             f"Spawning a new trainer with lookbehind={combination.lookbehind} and lookahead={combination.lookahead}",
@@ -164,12 +163,12 @@ def train_models_for_each_cell_in_grid(
         while trainer.poll() is None:
             time.sleep(1)
 
-        msg.good(
-            f"Training finished. Stopping the watcher in {cfg.project.watcher.keep_alive_after_training_minutes} minutes...",
-        )
+    msg.good(
+        f"Training finished. Stopping the watcher in {cfg.project.watcher.keep_alive_after_training_minutes} minutes...",
+    )
 
-        time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes)
-        watcher.kill()
+    time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes)
+    watcher.kill()
 
 
 def load_cfg(config_file_name):
@@ -187,7 +186,7 @@ def main():
     """Main."""
     msg = Printer(timestamp=True)
 
-    config_file_name = "integration_testing.yaml"
+    config_file_name = "default_config.yaml"
 
     cfg = load_cfg(config_file_name=config_file_name)
     # TODO: Watcher must be instantiated once for each cell in the grid, otherwise
@@ -198,7 +197,7 @@ def main():
     # Remove "9999" from possible look distances behind
     possible_look_distances.behind = [
         dist
-        for dist in possible_look_distances
+        for dist in possible_look_distances.behind
         if not int(dist) > cfg.data.max_lookbehind_days
     ]
 
diff --git a/src/psycopt2d/config/data/synth_data.yaml b/src/psycopt2d/config/data/synth_data.yaml
index f94b5da9..fe9fdccb 100644
--- a/src/psycopt2d/config/data/synth_data.yaml
+++ b/src/psycopt2d/config/data/synth_data.yaml
@@ -1,21 +1,28 @@
 # @package _global_
 data:
   n_training_samples: null
-  min_lookahead_days: null
-  min_lookbehind_days: null
-  min_prediction_time_date: null
-  lookahead_days: 30
+  dir: "../psycop-t2d/tests/test_data/synth_splits/"
+  suffix: csv
+
+  # Feature specs
   pred_col_name_prefix: pred_
   pred_timestamp_col_name: timestamp
   outcome_timestamp_col_name: timestamp_outcome
   id_col_name: citizen_ids
-  dir: "../psycop-t2d/tests/test_data/synth_splits/"
-  suffix: csv
+  
+  # Looking ahead
+  lookahead_days: 30
+  min_lookahead_days: null
   drop_patient_if_outcome_before_date: null
+  
+  # Looking behind
+  min_prediction_time_date: null
+  min_lookbehind_days: null
+  max_lookbehind_days: 1850
   lookbehind_combination: [30, 90]
 
 # Parameters that will only take effect if running with --multirun
 hydra:
   sweeper:
     params:
-      data.lookbehind_combination: choice([3000, 90], [30])
+      data.lookbehind_combination: choice([100, 60], [30])
diff --git a/src/psycopt2d/config/project/integration_test_project.yaml b/src/psycopt2d/config/project/integration_test_project.yaml
index 05f31fcd..486511cd 100644
--- a/src/psycopt2d/config/project/integration_test_project.yaml
+++ b/src/psycopt2d/config/project/integration_test_project.yaml
@@ -1,8 +1,9 @@
 name: psycop-t2d-integration-testing
 seed: 42
-wandb_mode: "disabled" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
-wandb_group: "integration_testing"
-wandb_entity: "psycop-t2d-testing" # Which entity to run WanDB in.
+wandb:
+  mode: "offline" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
+  group: "integration_testing"
+  entity: "psycop-t2d-testing" # Which entity to run WanDB in.
 watcher:
   archive_all: true
   keep_alive_after_training_minutes: 5
diff --git a/src/psycopt2d/train_and_log_models.py b/src/psycopt2d/train_and_log_models.py
index 50b92ee8..6fa8b2c8 100644
--- a/src/psycopt2d/train_and_log_models.py
+++ b/src/psycopt2d/train_and_log_models.py
@@ -16,7 +16,7 @@
 # RUN CONSTANTS
 CONFIG_NAME = "integration_testing.yaml"
 
-HYDRA_ARGS = f"--multirun +model=xgboost project.wandb_mode='dryrun' model.args.tree_method='auto' --config-name {CONFIG_NAME}"
+HYDRA_ARGS = f"--multirun project.wandb.mode='dryrun' model.args.tree_method='auto' --config-name {CONFIG_NAME}"
 OVERTACI = "false"  # Change to "true" if running on overtaci
 
 # WATCHER CONSTANTS
diff --git a/src/psycopt2d/utils/configs.py b/src/psycopt2d/utils/configs.py
index 4a198dd9..c32f8b77 100644
--- a/src/psycopt2d/utils/configs.py
+++ b/src/psycopt2d/utils/configs.py
@@ -43,6 +43,7 @@ class ProjectConf(BaseModel):
 
     name: str = "psycopt2d"
     seed: int
+    wandb: WandbConf
     watcher: WatcherConf
 
 
@@ -120,7 +121,6 @@ class EvalConf(BaseModel):
 class FullConfig(BaseModel):
     """A full configuration object."""
 
-    wandb: WandbConf
     project: ProjectConf
     data: DataConf
     preprocessing: PreprocessingConf

From 39858e512a74a7ba0b5f8662fc37f8fa8e1cc972 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <ryqiem@gmail.com>
Date: Mon, 24 Oct 2022 10:01:03 +0200
Subject: [PATCH 38/57] misc. fixes

---
 README.md                                     |   2 +-
 application/train_and_log_models.py           | 104 +++++++++++-------
 src/psycopt2d/config/data/synth_data.yaml     |   3 +-
 src/psycopt2d/config/data/t2d_parquet.yaml    |  21 ++--
 src/psycopt2d/config/default_config.yaml      |   2 +-
 .../config/project/default_project.yaml       |   2 +-
 .../project/integration_test_project.yaml     |   9 +-
 .../config/train/default_training.yaml        |   5 +-
 src/psycopt2d/evaluation.py                   |   4 +-
 src/psycopt2d/load.py                         |  70 +++++++-----
 src/psycopt2d/train_and_log_models.py         |   2 +-
 src/psycopt2d/train_model.py                  |  26 +++--
 src/psycopt2d/utils/configs.py                |  12 +-
 src/psycopt2d/utils/utils.py                  |   4 +-
 14 files changed, 161 insertions(+), 105 deletions(-)

diff --git a/README.md b/README.md
index 5e6d3238..3cb4511a 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ python src/psycopt2d/train_model.py --config-name test_config.yaml +model=xgboos
 
 To test new integrations with WandB:
 ```python
-python src/psycopt2d/train_model.py +model=xgboost project.wandb_mode="run" --config-name integration_testing.yaml
+python src/psycopt2d/train_model.py +model=xgboost project.wandb.mode="run" --config-name integration_testing.yaml
 ```
 
 
diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index 8f881cb2..9385ac16 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -8,6 +8,7 @@
 import random
 import subprocess
 import time
+from pathlib import Path
 
 import pandas as pd
 from hydra import compose, initialize
@@ -19,7 +20,7 @@
     infer_outcome_col_name,
     infer_predictor_col_name,
 )
-from psycopt2d.load import DataLoader
+from psycopt2d.load import DataLoader, load_train_from_cfg
 from psycopt2d.utils.configs import FullConfig, omegaconf_to_pydantic_objects
 
 msg = Printer(timestamp=True)
@@ -32,11 +33,15 @@ class PossibleLookDistanceDays(BaseModel):
     behind: list[str]
 
 
-def load_train_for_inference(cfg: FullConfig):
+def load_train_raw(cfg: FullConfig):
     """Load the data."""
-    loader = DataLoader(cfg=cfg)
-    msg.info("Loading datasets for look direction inference")
-    return loader.load_dataset_from_dir(split_names="train")
+    path = Path(cfg.data.dir)
+    file = list(path.glob(pattern=r"*train*"))
+
+    if len(file) == 1:
+        return pd.read_parquet(file)
+
+    raise ValueError(f"Returned {len(file)} files")
 
 
 def infer_possible_look_distances(df: pd.DataFrame) -> PossibleLookDistanceDays:
@@ -68,18 +73,18 @@ def start_trainer(
     cfg: FullConfig,
     config_file_name: str,
     cell: LookDirectionCombination,
-    wandb_group: str,
+    wandb_group_override: str,
 ) -> subprocess.Popen:
     """Start a trainer."""
     subprocess_args: list[str] = [
         "python",
         "src/psycopt2d/train_model.py",
         f"model={cfg.model.model_name}",
-        f"data.min_lookbehind_days={cell.lookbehind}",
+        f"data.min_lookbehind_days={max(cfg.data.lookbehind_combination)}",
         f"data.min_lookahead_days={cell.lookahead}",
-        f"project.wandb_group='{wandb_group}'",
+        f"project.wandb.group='{wandb_group_override}'",
         f"hydra.sweeper.n_trials={cfg.train.n_trials_per_lookdirection_combination}",
-        f"project.wandb_mode={cfg.project.wandb_mode}",
+        f"project.wandb.mode={cfg.project.wandb.mode}",
         "--config-name",
         f"{config_file_name}",
     ]
@@ -104,7 +109,7 @@ def start_watcher(cfg: FullConfig) -> subprocess.Popen:
             "python",
             "src/psycopt2d/model_training_watcher.py",
             "--entity",
-            cfg.project.wandb_entity,
+            cfg.project.wandb.entity,
             "--project_name",
             cfg.project.name,
             "--n_runs_before_eval",
@@ -140,36 +145,50 @@ def train_models_for_each_cell_in_grid(
 
     random.shuffle(lookbehind_combinations)
 
+    active_trainers: list[subprocess.Popen] = []
+
     wandb_prefix = f"{random_word.get_random_word()}-{random_word.get_random_word()}"
 
-    while lookbehind_combinations:
-        combination = lookbehind_combinations.pop()
-        watcher = start_watcher(cfg=cfg)
+    while lookbehind_combinations or active_trainers:
+        # Wait until there is a free slot in the trainers group
+        if len(active_trainers) >= cfg.train.n_active_trainers:
+            # Drop trainers if they have finished
+            # If finished, t.poll() is not None
+            active_trainers = [t for t in active_trainers if t.poll() is None]
+            time.sleep(1)
+            continue
 
-        msg.info(
-            f"Spawning a new trainer with lookbehind={combination.lookbehind} and lookahead={combination.lookahead}",
-        )
+        # Start a new trainer
 
-        wandb_group = (
-            f"{wandb_prefix}-beh-{combination.lookbehind}-ahead-{combination.lookahead}"
-        )
+        combination = lookbehind_combinations.pop()
 
-        trainer = start_trainer(
-            cfg=cfg,
-            config_file_name=config_file_name,
-            cell=combination,
-            wandb_group=wandb_group,
-        )
+        # Check if there are any rows for the given combination of lookbehind and lookahead days
+        cfg_for_checking_any_rows = cfg.copy()
+        cfg_for_checking_any_rows.data.min_lookbehind_days = combination.lookbehind
+        cfg_for_checking_any_rows.data.min_lookahead_days = combination.lookahead
+        # TODO: Can be refactored by
+        # 1) Inferring the dataset length from max/min of prediction time
+        # 2) Checking if combination.lookbehind + combination.lookahead < dataset length
 
-        while trainer.poll() is None:
-            time.sleep(1)
+        train = load_train_from_cfg(cfg=cfg)
 
-        msg.good(
-            f"Training finished. Stopping the watcher in {cfg.project.watcher.keep_alive_after_training_minutes} minutes...",
-        )
+        if train.shape[0] == 0:
+            msg.warn(f"No rows for {combination}, continuing")
+            continue
 
-        time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes)
-        watcher.kill()
+        # watcher = start_watcher(cfg=cfg)
+        msg.info(
+            f"Spawning a new trainer with lookbehind={combination.lookbehind} and lookahead={combination.lookahead}",
+        )
+        wandb_group = f"{wandb_prefix}"
+        active_trainers.append(
+            start_trainer(
+                cfg=cfg,
+                config_file_name=config_file_name,
+                cell=combination,
+                wandb_group_override=wandb_group,
+            )
+        )
 
 
 def load_cfg(config_file_name):
@@ -187,20 +206,27 @@ def main():
     """Main."""
     msg = Printer(timestamp=True)
 
-    config_file_name = "integration_testing.yaml"
+    config_file_name = "default_config.yaml"
 
     cfg = load_cfg(config_file_name=config_file_name)
+
+    if cfg.project.wandb.mode == "run":
+        msg.warn(
+            f"wandb.mode is {cfg.project.wandb.mode}, not using the watcher. This will substantially slow down training."
+        )
+
     # TODO: Watcher must be instantiated once for each cell in the grid, otherwise
     # it will compare max performances across all cells.
-    train = load_train_for_inference(cfg=cfg)
+    train = load_train_raw(cfg=cfg)
     possible_look_distances = infer_possible_look_distances(df=train)
 
     # Remove "9999" from possible look distances behind
-    possible_look_distances.behind = [
-        dist
-        for dist in possible_look_distances
-        if not int(dist) > cfg.data.max_lookbehind_days
-    ]
+    if cfg.data.max_lookbehind_days:
+        possible_look_distances.behind = [
+            dist
+            for dist in possible_look_distances.behind
+            if not int(dist) > cfg.data.max_lookbehind_days
+        ]
 
     msg.info(f"Possible lookbehind days: {possible_look_distances.behind}")
     msg.info(f"Possible lookahead days: {possible_look_distances.ahead}")
diff --git a/src/psycopt2d/config/data/synth_data.yaml b/src/psycopt2d/config/data/synth_data.yaml
index f94b5da9..9dfac480 100644
--- a/src/psycopt2d/config/data/synth_data.yaml
+++ b/src/psycopt2d/config/data/synth_data.yaml
@@ -4,7 +4,6 @@ data:
   min_lookahead_days: null
   min_lookbehind_days: null
   min_prediction_time_date: null
-  lookahead_days: 30
   pred_col_name_prefix: pred_
   pred_timestamp_col_name: timestamp
   outcome_timestamp_col_name: timestamp_outcome
@@ -18,4 +17,4 @@ data:
 hydra:
   sweeper:
     params:
-      data.lookbehind_combination: choice([3000, 90], [30])
+      data.lookbehind_combination: choice([30, 90], [30])
diff --git a/src/psycopt2d/config/data/t2d_parquet.yaml b/src/psycopt2d/config/data/t2d_parquet.yaml
index 6c1ee3ac..f88a9402 100644
--- a/src/psycopt2d/config/data/t2d_parquet.yaml
+++ b/src/psycopt2d/config/data/t2d_parquet.yaml
@@ -5,25 +5,24 @@ data:
   dir: E:\shared_resources\feature_sets\t2d\feature_sets\psycop_t2d_adminmanber_416_features_2022_10_20_11_12
   suffix: parquet
 
+  # Patient exclusion criteria
+  drop_patient_if_outcome_before_date: 2013-01-01
+
+  # Prediction time exclusion criteria
+  min_prediction_time_date: 2013-01-01
+  min_lookbehind_days: 730
+  min_lookahead_days: 1825
+
   # Feature specs
   pred_col_name_prefix: "pred_"
   pred_timestamp_col_name: timestamp
   outcome_timestamp_col_name: _timestamp_first_t2d
   id_col_name: dw_ek_borger
-
-  # Looking ahead
-  lookahead_days: 365
-  min_lookahead_days: 365
-  drop_patient_if_outcome_before_date: null
-
-  # Looking behind
-  min_prediction_time_date: 2013-01-01
-  min_lookbehind_days: 365
   max_lookbehind_days: 3650
-  lookbehind_combination: [30, 90, 180, 365]
+  lookbehind_combination: [30, 90, 180, 365, 730]
 
 # Parameters that will only take effect if running with --multirun
 hydra:
   sweeper:
     params:
-      ++data.lookbehind_combination: choice([3000], [30, 90])
+      ++data.lookbehind_combination: choice([30, 90, 180, 365, 730], [30, 180, 730], [30, 730], [730], [365], [90], [30])
diff --git a/src/psycopt2d/config/default_config.yaml b/src/psycopt2d/config/default_config.yaml
index 46b91517..c62edfc1 100644
--- a/src/psycopt2d/config/default_config.yaml
+++ b/src/psycopt2d/config/default_config.yaml
@@ -6,4 +6,4 @@ defaults:
   - model: xgboost
   - train: default_training
   - eval: default_evaluation
-  - sweeper: optuna_multithread
+  - sweeper: optuna_singlethread
diff --git a/src/psycopt2d/config/project/default_project.yaml b/src/psycopt2d/config/project/default_project.yaml
index 837cfc78..e13ddd05 100644
--- a/src/psycopt2d/config/project/default_project.yaml
+++ b/src/psycopt2d/config/project/default_project.yaml
@@ -3,7 +3,7 @@ seed: 42
 
 wandb:
   entity: "psycop" # Which entity to run WanDB in.
-  mode: "offline" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
+  mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
   group: "psycop-t2d" # Which group to run WanDB in.
 
 watcher:
diff --git a/src/psycopt2d/config/project/integration_test_project.yaml b/src/psycopt2d/config/project/integration_test_project.yaml
index 05f31fcd..38df97d6 100644
--- a/src/psycopt2d/config/project/integration_test_project.yaml
+++ b/src/psycopt2d/config/project/integration_test_project.yaml
@@ -1,8 +1,11 @@
 name: psycop-t2d-integration-testing
 seed: 42
-wandb_mode: "disabled" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
-wandb_group: "integration_testing"
-wandb_entity: "psycop-t2d-testing" # Which entity to run WanDB in.
+
+wandb:
+  mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
+  group: "integration_testing"
+  entity: "psycop-t2d-testing" # Which entity to run WanDB in.
+
 watcher:
   archive_all: true
   keep_alive_after_training_minutes: 5
diff --git a/src/psycopt2d/config/train/default_training.yaml b/src/psycopt2d/config/train/default_training.yaml
index 476a0e11..e81d99be 100644
--- a/src/psycopt2d/config/train/default_training.yaml
+++ b/src/psycopt2d/config/train/default_training.yaml
@@ -1,3 +1,4 @@
 n_splits: 3 # (int, Null): Number of k-folds during CV. If Null, loads pre-defined dataset.
-n_trials_per_lookdirection_combination: 10
-gpu: true
\ No newline at end of file
+n_trials_per_lookdirection_combination: 20
+n_active_trainers: 8
+gpu: true
diff --git a/src/psycopt2d/evaluation.py b/src/psycopt2d/evaluation.py
index 17a7d1b6..2d229041 100644
--- a/src/psycopt2d/evaluation.py
+++ b/src/psycopt2d/evaluation.py
@@ -101,12 +101,12 @@ def evaluate_model(
     # Drop date_bins_direction if they are further away than min_lookdirection_days
     if cfg.data.min_lookbehind_days:
         date_bins_behind = [
-            b for b in date_bins_behind if cfg.data.min_lookbehind_days < b
+            b for b in date_bins_behind if cfg.data.min_lookbehind_days > b
         ]
 
     if cfg.data.min_lookahead_days:
         date_bins_ahead = [
-            b for b in date_bins_ahead if cfg.data.min_lookahead_days < abs(b)
+            b for b in date_bins_ahead if cfg.data.min_lookahead_days > abs(b)
         ]
 
     # Invert date_bins_behind to negative if it's not already
diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py
index 6e08b807..93252173 100644
--- a/src/psycopt2d/load.py
+++ b/src/psycopt2d/load.py
@@ -1,4 +1,5 @@
 """Loader for the t2d dataset."""
+import os
 import re
 from collections.abc import Iterable
 from datetime import timedelta
@@ -12,12 +13,9 @@
 
 from psycopt2d.evaluate_saved_model_predictions import infer_look_distance
 from psycopt2d.utils.configs import FullConfig
-from psycopt2d.utils.utils import (
-    coerce_to_datetime,
-    get_percent_lost,
-    infer_outcome_col_name,
-    infer_predictor_col_name,
-)
+from psycopt2d.utils.utils import (coerce_to_datetime, get_percent_lost,
+                                   infer_outcome_col_name,
+                                   infer_predictor_col_name)
 
 msg = Printer(timestamp=True)
 
@@ -160,27 +158,23 @@ def _drop_rows_if_datasets_ends_within_days(
 
         return dataset
 
-    def _drop_patients_with_event_in_washin(self, dataset) -> pd.DataFrame:
+    def drop_patient_if_outcome_before_date(
+        self, dataset: pd.DataFrame
+    ) -> pd.DataFrame:
         """Drop patients within washin period."""
 
         n_rows_before_modification = dataset.shape[0]
 
-        # Remove dates before drop_patient_if_outcome_before_date
         outcome_before_date = (
-            dataset["_timestamp_first_t2d"]
+            dataset[self.cfg.data.outcome_timestamp_col_name]
             < self.cfg.data.drop_patient_if_outcome_before_date
         )
 
         patients_to_drop = set(dataset["dw_ek_borger"][outcome_before_date].unique())
         dataset = dataset[~dataset["dw_ek_borger"].isin(patients_to_drop)]
 
-        # Removed dates before drop_patient_if_outcome_before_date
-        dataset = dataset[
-            dataset[self.cfg.data.pred_timestamp_col_name]
-            > self.cfg.data.drop_patient_if_outcome_before_date
-        ]
-
         n_rows_after_modification = dataset.shape[0]
+
         percent_dropped = get_percent_lost(
             n_before=n_rows_after_modification,
             n_after=n_rows_after_modification,
@@ -225,7 +219,7 @@ def _drop_cols_not_in_lookbehind_combination(
             lookbehinds_in_dataset,
         ):
             msg.warn(
-                f"One or more of the provided lookbehinds in lookbehind_combination is/are not used in any predictors in the dataset. Dataset: {lookbehinds_in_dataset}. lookbehind_combination: {self.cfg.data.lookbehind_combination}.",
+                f"One or more of the provided lookbehinds in lookbehind_combination is/are not used in any predictors in the dataset: {lookbehinds_in_spec - lookbehinds_in_dataset}",
             )
 
             lookbehinds_to_keep = lookbehinds_in_spec.intersection(
@@ -369,19 +363,30 @@ def _keep_unique_outcome_col_with_lookahead_days_matching_conf(
         """Keep only one outcome column with the same lookahead days as set in
         the config."""
         outcome_cols = infer_outcome_col_name(df=dataset, allow_multiple=True)
-        col_to_drop = [
-            c for c in outcome_cols if str(self.cfg.data.lookahead_days) not in c
-        ]
+
+        if not outcome_cols:
+            raise ValueError("No outcome columns found.")
+
+        if isinstance(outcome_cols, list):
+            col_to_drop = [
+                c for c in outcome_cols if str(self.cfg.data.min_lookahead_days) not in c
+            ]
+        elif isinstance(outcome_cols, str):
+            col_to_drop = [outcome_cols]
 
         df = dataset.drop(col_to_drop, axis=1)
 
-        if not isinstance(infer_outcome_col_name(df), str):
+        if not self.n_outcome_col_names(df) == 1:
             raise ValueError(
-                "Returning more than one outcome column, will cause problems during eval.",
+                f"Returning {self.n_outcome_col_names(df=df)}, will cause problems during eval.",
             )
 
         return df
 
+    def n_outcome_col_names(self, df: pd.DataFrame):
+        """How many outcome columns there are in a dataframe."""
+        return len(infer_outcome_col_name(df=df, allow_multiple=True))
+
     def _process_dataset(self, dataset: pd.DataFrame) -> pd.DataFrame:
         """Process dataset, namely:
 
@@ -393,12 +398,10 @@ def _process_dataset(self, dataset: pd.DataFrame) -> pd.DataFrame:
         Returns:
             pd.DataFrame: Processed dataset
         """
-        if self.cfg.data.drop_patient_if_outcome_before_date:
-            dataset = add_washin_timestamps(dataset=dataset)
-
         dataset = self._convert_timestamp_dtype_and_nat(dataset)
+
         if self.cfg.data.drop_patient_if_outcome_before_date:
-            dataset = self._drop_patients_with_event_in_washin(dataset=dataset)
+            dataset = self.drop_patient_if_outcome_before_date(dataset=dataset)
 
         # Drop if later than min prediction time date
         if self.cfg.data.min_prediction_time_date:
@@ -490,6 +493,18 @@ class Config:
     val: pd.DataFrame
 
 
+def load_train_from_cfg(cfg: FullConfig) -> pd.DataFrame:
+    """Load train dataset from config.
+
+    Args:
+        cfg (FullConfig): Config
+
+    Returns:
+        pd.DataFrame: Train dataset
+    """
+    return DataLoader(cfg=cfg).load_dataset_from_dir(split_names="train")
+
+
 def load_train_and_val_from_cfg(cfg: FullConfig):
     """Load train and validation data from file."""
 
@@ -499,3 +514,8 @@ def load_train_and_val_from_cfg(cfg: FullConfig):
         train=loader.load_dataset_from_dir(split_names="train"),
         val=loader.load_dataset_from_dir(split_names="val"),
     )
+
+
+def get_latest_dataset_dir(path: Path) -> Path:
+    """Get the latest dataset directory by time of creation."""
+    return max(path.glob("*"), key=os.path.getctime)
diff --git a/src/psycopt2d/train_and_log_models.py b/src/psycopt2d/train_and_log_models.py
index 50b92ee8..b080b0d1 100644
--- a/src/psycopt2d/train_and_log_models.py
+++ b/src/psycopt2d/train_and_log_models.py
@@ -16,7 +16,7 @@
 # RUN CONSTANTS
 CONFIG_NAME = "integration_testing.yaml"
 
-HYDRA_ARGS = f"--multirun +model=xgboost project.wandb_mode='dryrun' model.args.tree_method='auto' --config-name {CONFIG_NAME}"
+HYDRA_ARGS = f"--multirun +model=xgboost project.wandb.mode='dryrun' model.args.tree_method='auto' --config-name {CONFIG_NAME}"
 OVERTACI = "false"  # Change to "true" if running on overtaci
 
 # WATCHER CONSTANTS
diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py
index 48f326d5..6ff96fc7 100644
--- a/src/psycopt2d/train_model.py
+++ b/src/psycopt2d/train_model.py
@@ -1,12 +1,14 @@
 """Training script for training a single model for predicting t2d."""
 import os
 from collections.abc import Iterable
-from typing import Optional, Union
+from multiprocessing.sharedctypes import Value
+from typing import Any, Hashable, Optional, Union
 
 import hydra
 import numpy as np
 import pandas as pd
 import wandb
+from omegaconf import OmegaConf
 from omegaconf.dictconfig import DictConfig
 from sklearn.impute import SimpleImputer
 from sklearn.metrics import roc_auc_score
@@ -287,7 +289,7 @@ def get_col_names(cfg: DictConfig, train: pd.DataFrame) -> tuple[str, list[str]]
     """
 
     outcome_col_name = (  # pylint: disable=invalid-name
-        f"outc_dichotomous_t2d_within_{cfg.data.lookahead_days}_days_max_fallback_0"
+        f"outc_dichotomous_t2d_within_{cfg.data.min_lookahead_days}_days_max_fallback_0"
     )
 
     train_col_names = [  # pylint: disable=invalid-name
@@ -302,8 +304,11 @@ def get_col_names(cfg: DictConfig, train: pd.DataFrame) -> tuple[str, list[str]]
     config_name="default_config",
     version_base="1.2",
 )
-def main(cfg: Union[FullConfig, DictConfig]):
+def main(cfg: DictConfig):
     """Main function for training a single model."""
+    # Save dictconfig for easier logging
+    dict_config: dict[str, Any] = OmegaConf.to_container(cfg)  # type: ignore
+
     if not isinstance(cfg, FullConfig):
         cfg = omegaconf_to_pydantic_objects(cfg)
 
@@ -314,11 +319,14 @@ def main(cfg: Union[FullConfig, DictConfig]):
     run = wandb.init(
         project=cfg.project.name,
         reinit=True,
-        config=flatten_nested_dict(cfg.__dict__, sep="."),
-        mode=cfg.project.wandb_mode,
-        group=cfg.project.wandb_group,
+        config=dict_config,
+        mode=cfg.project.wandb.mode,
+        group=cfg.project.wandb.group,
     )
 
+    if run is None:
+        raise ValueError("Failed to initialise Wandb")
+
     dataset = load_train_and_val_from_cfg(cfg)
 
     msg.info("Creating pipeline")
@@ -342,7 +350,7 @@ def main(cfg: Union[FullConfig, DictConfig]):
 
     # only run full evaluation if wandb mode mode is online
     # otherwise delegate to watcher script
-    if cfg.project.wandb_mode == "run":
+    if cfg.project.wandb.mode == "run":
         msg.info("Evaluating model")
         evaluate_model(
             cfg=cfg,
@@ -362,8 +370,8 @@ def main(cfg: Union[FullConfig, DictConfig]):
     run.log(
         {
             "roc_auc_unweighted": roc_auc,
-            "lookbehind": cfg.data.lookbehind_days,
-            "lookahead": cfg.data.lookahead_days,
+            "lookbehind": max(cfg.data.lookbehind_combination),
+            "lookahead": cfg.data.min_lookahead_days,
         },
     )
     run.finish()
diff --git a/src/psycopt2d/utils/configs.py b/src/psycopt2d/utils/configs.py
index 4a198dd9..4d1ce2de 100644
--- a/src/psycopt2d/utils/configs.py
+++ b/src/psycopt2d/utils/configs.py
@@ -25,6 +25,8 @@ class Config:
 
 
 class WandbConf(BaseModel):
+    """Configuration for weights and biases."""
+
     group: str
     mode: str
     entity: str
@@ -41,6 +43,7 @@ class WatcherConf(BaseModel):
 class ProjectConf(BaseModel):
     """Project configuration."""
 
+    wandb: WandbConf
     name: str = "psycopt2d"
     seed: int
     watcher: WatcherConf
@@ -62,16 +65,13 @@ class DataConf(BaseModel):
     id_col_name: str  # (str): Citizen colnames
 
     # Looking ahead
-    lookahead_days: int  # (float): Number of days from prediction time to look ahead for the outcome.
-    min_lookahead_days: Optional[
-        int
-    ]  # (int): Drop all prediction times where (max timestamp in the dataset) - (current timestamp) is less than min_lookahead_days
+    min_lookahead_days: int  # (int): Drop all prediction times where (max timestamp in the dataset) - (current timestamp) is less than min_lookahead_days
     drop_patient_if_outcome_before_date: Optional[Union[str, datetime]]
 
     # Looking behind
     # (int): Drop all prediction times where (prediction_timestamp) - (min timestamp in the dataset) is less than min_lookbehind_days
     min_prediction_time_date: Optional[Union[str, datetime]]
-    min_lookbehind_days: Optional[int]
+    min_lookbehind_days: int
     max_lookbehind_days: Optional[int]
     lookbehind_combination: Optional[list[int]]
 
@@ -100,6 +100,7 @@ class TrainConf(BaseModel):
 
     n_splits: int  # ? How do we handle whether to use crossvalidation or train/val splitting?
     n_trials_per_lookdirection_combination: int
+    n_active_trainers: int  # Number of subprocesses to spawn when training
     gpu: bool
 
 
@@ -120,7 +121,6 @@ class EvalConf(BaseModel):
 class FullConfig(BaseModel):
     """A full configuration object."""
 
-    wandb: WandbConf
     project: ProjectConf
     data: DataConf
     preprocessing: PreprocessingConf
diff --git a/src/psycopt2d/utils/utils.py b/src/psycopt2d/utils/utils.py
index 3ad4bcb5..41c125a2 100644
--- a/src/psycopt2d/utils/utils.py
+++ b/src/psycopt2d/utils/utils.py
@@ -403,7 +403,7 @@ def infer_col_names(
     col_name = [c for c in df.columns if c.startswith(prefix)]
 
     if len(col_name) == 1:
-        return col_name[0]
+        return col_name
     elif len(col_name) > 1:
         if allow_multiple:
             return col_name
@@ -411,7 +411,7 @@ def infer_col_names(
             f"Multiple columns found and allow_multiple is {allow_multiple}.",
         )
     else:
-        raise ValueError("More than one outcome inferred")
+        raise ValueError("No outcomes inferred")
 
 
 def infer_outcome_col_name(

From f50b501aa53306d259b41b4e3614d65025e4a514 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Mon, 24 Oct 2022 11:14:18 +0200
Subject: [PATCH 39/57] merge with main

---
 docs/conf.py                                  |  4 +--
 .../project/integration_test_project.yaml     |  6 ++--
 src/psycopt2d/load.py                         | 32 +++++++++++--------
 .../model_performance/model_performance.py    |  4 +--
 src/psycopt2d/train_model.py                  |  2 +-
 src/psycopt2d/utils/configs.py                |  1 +
 src/psycopt2d/utils/utils.py                  |  5 ++-
 tests/test_utils.py                           |  2 +-
 8 files changed, 31 insertions(+), 25 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index b420ec84..e898c604 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -24,7 +24,7 @@
 
 # Add any Sphinx extension module names here, as strings. They can be extensions
 # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
-extensions = []
+extensions = []  # type: ignore
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ["_templates"]
@@ -167,7 +167,7 @@
 
 # -- Options for LaTeX output --------------------------------------------------
 
-latex_elements = {
+latex_elements = {  # type: ignore
     # The paper size ('letterpaper' or 'a4paper').
     # 'papersize': 'letterpaper',
     # The font size ('10pt', '11pt' or '12pt').
diff --git a/src/psycopt2d/config/project/integration_test_project.yaml b/src/psycopt2d/config/project/integration_test_project.yaml
index 38df97d6..2231968c 100644
--- a/src/psycopt2d/config/project/integration_test_project.yaml
+++ b/src/psycopt2d/config/project/integration_test_project.yaml
@@ -1,11 +1,9 @@
 name: psycop-t2d-integration-testing
 seed: 42
-
 wandb:
-  mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
+  mode: "disabled" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
   group: "integration_testing"
-  entity: "psycop-t2d-testing" # Which entity to run WanDB in.
-
+  entity: "psycop" # Which entity to run WanDB in.
 watcher:
   archive_all: true
   keep_alive_after_training_minutes: 5
diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py
index 93252173..15bd95c2 100644
--- a/src/psycopt2d/load.py
+++ b/src/psycopt2d/load.py
@@ -13,9 +13,12 @@
 
 from psycopt2d.evaluate_saved_model_predictions import infer_look_distance
 from psycopt2d.utils.configs import FullConfig
-from psycopt2d.utils.utils import (coerce_to_datetime, get_percent_lost,
-                                   infer_outcome_col_name,
-                                   infer_predictor_col_name)
+from psycopt2d.utils.utils import (
+    coerce_to_datetime,
+    get_percent_lost,
+    infer_outcome_col_name,
+    infer_predictor_col_name,
+)
 
 msg = Printer(timestamp=True)
 
@@ -364,21 +367,24 @@ def _keep_unique_outcome_col_with_lookahead_days_matching_conf(
         the config."""
         outcome_cols = infer_outcome_col_name(df=dataset, allow_multiple=True)
 
-        if not outcome_cols:
-            raise ValueError("No outcome columns found.")
+        col_to_drop = [
+            c for c in outcome_cols if str(self.cfg.data.lookahead_days) not in c
+        ]
 
-        if isinstance(outcome_cols, list):
-            col_to_drop = [
-                c for c in outcome_cols if str(self.cfg.data.min_lookahead_days) not in c
-            ]
-        elif isinstance(outcome_cols, str):
-            col_to_drop = [outcome_cols]
+        # If no columns to drop, return the dataset
+        if not col_to_drop:
+            return dataset
+
+        if len(col_to_drop) == 1:
+            col_to_drop = col_to_drop[0]
+        else:
+            col_to_drop = outcome_cols
 
         df = dataset.drop(col_to_drop, axis=1)
 
-        if not self.n_outcome_col_names(df) == 1:
+        if not isinstance(infer_outcome_col_name(df), str):
             raise ValueError(
-                f"Returning {self.n_outcome_col_names(df=df)}, will cause problems during eval.",
+                "Returning more than one outcome column, will cause problems during eval.",
             )
 
         return df
diff --git a/src/psycopt2d/model_performance/model_performance.py b/src/psycopt2d/model_performance/model_performance.py
index e3f887da..6a413620 100644
--- a/src/psycopt2d/model_performance/model_performance.py
+++ b/src/psycopt2d/model_performance/model_performance.py
@@ -365,9 +365,7 @@ def compute_metrics(
         """
         # sorting to get correct output from f1, prec, and recall
         groups = sorted(set(labels))
-        performance = {}
-
-        performance["acc-overall"] = accuracy_score(labels, predicted)
+        performance = {"acc-overall": accuracy_score(labels, predicted)}
         performance["balanced_accuracy-overall"] = balanced_accuracy_score(
             labels,
             predicted,
diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py
index 6ff96fc7..5aa91fbc 100644
--- a/src/psycopt2d/train_model.py
+++ b/src/psycopt2d/train_model.py
@@ -319,7 +319,7 @@ def main(cfg: DictConfig):
     run = wandb.init(
         project=cfg.project.name,
         reinit=True,
-        config=dict_config,
+        config=flatten_nested_dict(cfg.__dict__, sep="."),
         mode=cfg.project.wandb.mode,
         group=cfg.project.wandb.group,
     )
diff --git a/src/psycopt2d/utils/configs.py b/src/psycopt2d/utils/configs.py
index 4d1ce2de..bc5668d6 100644
--- a/src/psycopt2d/utils/configs.py
+++ b/src/psycopt2d/utils/configs.py
@@ -47,6 +47,7 @@ class ProjectConf(BaseModel):
     name: str = "psycopt2d"
     seed: int
     watcher: WatcherConf
+    wandb: WandbConf
 
 
 class DataConf(BaseModel):
diff --git a/src/psycopt2d/utils/utils.py b/src/psycopt2d/utils/utils.py
index 41c125a2..70b7efcc 100644
--- a/src/psycopt2d/utils/utils.py
+++ b/src/psycopt2d/utils/utils.py
@@ -7,6 +7,7 @@
 import time
 from collections.abc import Iterable, MutableMapping
 from datetime import date, datetime
+from multiprocessing.sharedctypes import Value
 from pathlib import Path
 from typing import Any, Optional, Union
 
@@ -19,7 +20,7 @@
 
 from psycopt2d.configs import ModelEvalData
 from psycopt2d.model_performance import ModelPerformance
-from psycopt2d.utils.configs import FullConfig
+from psycopt2d.utils.configs import BaseModel, FullConfig
 
 SHARED_RESOURCES_PATH = Path(r"E:\shared_resources")
 FEATURE_SETS_PATH = SHARED_RESOURCES_PATH / "feature_sets"
@@ -410,6 +411,8 @@ def infer_col_names(
         raise ValueError(
             f"Multiple columns found and allow_multiple is {allow_multiple}.",
         )
+    elif len(col_name) == 0:
+        raise ValueError("No outcome col name inferred")
     else:
         raise ValueError("No outcomes inferred")
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index b772542e..f619872d 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -58,7 +58,7 @@ def test_flatten_nested_dict():
     assert expected_dict == output_dict
 
 
-CONFIG_DIR_PATH_ABS = PROJECT_ROOT / "psycopt2d" / "config"
+CONFIG_DIR_PATH_ABS = PROJECT_ROOT / "src" / "psycopt2d" / "config"
 CONFIG_DIR_PATH_REL = "../src/psycopt2d/config"
 
 

From 315fd3f3308c6d805cf26573a37af534c9500d41 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Mon, 24 Oct 2022 12:37:19 +0200
Subject: [PATCH 40/57] fix: failing tests

---
 application/train_and_log_models.py           |  2 +-
 src/psycopt2d/config/data/synth_data.yaml     |  4 +-
 src/psycopt2d/load.py                         |  8 +---
 .../model_performance/model_performance.py    | 15 ++++---
 src/psycopt2d/model_training_watcher.py       | 19 +++++----
 .../tables/performance_by_threshold.py        | 40 +++++++++----------
 src/psycopt2d/train_model.py                  | 11 ++++-
 src/psycopt2d/utils/configs.py                |  1 -
 src/psycopt2d/utils/utils.py                  | 18 ++++-----
 tests/test_load.py                            |  6 +--
 10 files changed, 65 insertions(+), 59 deletions(-)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index 9385ac16..67f10ece 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -20,7 +20,7 @@
     infer_outcome_col_name,
     infer_predictor_col_name,
 )
-from psycopt2d.load import DataLoader, load_train_from_cfg
+from psycopt2d.load import load_train_from_cfg
 from psycopt2d.utils.configs import FullConfig, omegaconf_to_pydantic_objects
 
 msg = Printer(timestamp=True)
diff --git a/src/psycopt2d/config/data/synth_data.yaml b/src/psycopt2d/config/data/synth_data.yaml
index 9dfac480..80ffa8e3 100644
--- a/src/psycopt2d/config/data/synth_data.yaml
+++ b/src/psycopt2d/config/data/synth_data.yaml
@@ -1,8 +1,8 @@
 # @package _global_
 data:
   n_training_samples: null
-  min_lookahead_days: null
-  min_lookbehind_days: null
+  min_lookahead_days: 30
+  min_lookbehind_days: 100
   min_prediction_time_date: null
   pred_col_name_prefix: pred_
   pred_timestamp_col_name: timestamp
diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py
index 15bd95c2..8a5806f5 100644
--- a/src/psycopt2d/load.py
+++ b/src/psycopt2d/load.py
@@ -368,18 +368,14 @@ def _keep_unique_outcome_col_with_lookahead_days_matching_conf(
         outcome_cols = infer_outcome_col_name(df=dataset, allow_multiple=True)
 
         col_to_drop = [
-            c for c in outcome_cols if str(self.cfg.data.lookahead_days) not in c
+            c for c in outcome_cols if str(self.cfg.data.min_lookahead_days) not in c
         ]
 
         # If no columns to drop, return the dataset
         if not col_to_drop:
             return dataset
 
-        if len(col_to_drop) == 1:
-            col_to_drop = col_to_drop[0]
-        else:
-            col_to_drop = outcome_cols
-
+        col_to_drop = col_to_drop[0] if len(col_to_drop) == 1 else outcome_cols
         df = dataset.drop(col_to_drop, axis=1)
 
         if not isinstance(infer_outcome_col_name(df), str):
diff --git a/src/psycopt2d/model_performance/model_performance.py b/src/psycopt2d/model_performance/model_performance.py
index 6a413620..805ff62b 100644
--- a/src/psycopt2d/model_performance/model_performance.py
+++ b/src/psycopt2d/model_performance/model_performance.py
@@ -28,16 +28,17 @@
 class ModelPerformance:
     """Evaluators of model performance."""
 
+    @staticmethod
     def performance_metrics_from_df(
         prediction_df: pd.DataFrame,
         prediction_col_name: str,
         label_col_name: str,
-        id_col_name: Optional[str] = None,
+        id_col_name: str = None,
         metadata_col_names: Optional[list[str]] = None,
         id2label: Optional[  # pylint: disable=redefined-outer-name
             dict[int, str]
         ] = None,
-        to_wide: Optional[bool] = False,
+        to_wide: bool = False,
         binary_threshold: Optional[float] = 0.5,
     ) -> pd.DataFrame:
         """Calculate performance metrics from a dataframe.
@@ -49,7 +50,7 @@ def performance_metrics_from_df(
             prediction_df (pd.DataFrame): Dataframe with 1 row per prediction.
             prediction_col_name (str): column containing probabilities for each class or a list of floats for binary classification.
             label_col_name (str): column containing ground truth label
-            id_col_name (str, optional): Column name for the id, used for grouping.
+            id_col_name (str): Column name for the id, used for grouping.
             metadata_col_names (Optional[list[str]], optional): Column(s) containing metadata to add to the performance dataframe.
                 Each column should only contain 1 unique value. E.g. model_name, modality.. If set to "all" will auto-detect
                 metadata columns and add them all.
@@ -61,8 +62,6 @@ def performance_metrics_from_df(
             pd.Dataframe: Dataframe with performance metrics.
         """
 
-        concat_axis = 1 if to_wide else 0
-
         performance_description = ModelPerformance._evaluate_single_model(
             prediction_df=prediction_df,
             aggregate_by_id=False,
@@ -93,6 +92,8 @@ def performance_metrics_from_df(
                 binary_threshold=binary_threshold,
             )
 
+            concat_axis = 1 if to_wide else 0
+
             performance_description = pd.concat(
                 [performance_description, performance_by_id],
                 axis=concat_axis,
@@ -113,6 +114,7 @@ def performance_metrics_from_df(
 
         return performance_description
 
+    @staticmethod
     def performance_metrics_from_file(
         jsonl_path: Union[str, Path],
         prediction_col_name: str,
@@ -122,7 +124,7 @@ def performance_metrics_from_file(
         id2label: Optional[  # pylint: disable=redefined-outer-name
             dict[int, str]
         ] = None,
-        to_wide: Optional[bool] = False,
+        to_wide: bool = False,
         binary_threshold: Optional[float] = 0.5,
     ) -> pd.DataFrame:
         """Load a .jsonl file and returns performance metrics.
@@ -214,6 +216,7 @@ def performance_metrics_from_folder(
         ]
         return pd.concat(dfs)
 
+    @staticmethod
     def _evaluate_single_model(  # pylint: disable=too-many-locals
         prediction_df: pd.DataFrame,
         aggregate_by_id: bool,
diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py
index 936200e5..d9dc9fb9 100644
--- a/src/psycopt2d/model_training_watcher.py
+++ b/src/psycopt2d/model_training_watcher.py
@@ -4,7 +4,7 @@
 import time
 from distutils.util import strtobool  # pylint: disable=deprecated-module
 from pathlib import Path
-from typing import Optional
+from typing import Any, Optional
 
 import wandb
 from wandb.apis.public import Api  # pylint: disable=no-name-in-module
@@ -25,7 +25,7 @@
 WANDB_DIR = PROJECT_ROOT / "wandb"
 
 
-class ModelTrainingWatcher:
+class ModelTrainingWatcher:  # pylint: disable=too-many-instance-attributes
     """Watch the wandb directory for new files and uploads them to wandb. Fully
     evaluates the best runs after a certain number of runs have been uploaded.
 
@@ -112,13 +112,12 @@ def upload_unarchived_runs(self) -> None:
             wandb_sync_stdout = self._upload_run_dir(run_folder)
 
             if "...done" not in wandb_sync_stdout:
-                if ".wandb file is empty" in wandb_sync_stdout:
-                    if self.verbose:
-                        msg.warn(f"Run {run_id} is still running. Skipping.")
-                else:
+                if ".wandb file is empty" not in wandb_sync_stdout:
                     raise ValueError(
                         f"wandb sync failed, returned: {wandb_sync_stdout}",
                     )
+                if self.verbose:
+                    msg.warn(f"Run {run_id} is still running. Skipping.")
                 continue
 
             self.run_id_eval_candidates_queue.append(run_id)
@@ -164,8 +163,12 @@ def _get_run_wandb_dir(self, run_id: str) -> Path:
     def _get_run_performance(self, run_id: str) -> Optional[float]:
         """Get the performance of a single run and check if it failed."""
         run = self._get_wandb_run(run_id)
-        if "roc_auc_unweighted" in run.summary:
-            return run.summary.roc_auc_unweighted
+
+        summary: dict[str, Any] = run.summary  # type: ignore
+
+        if "roc_auc_unweighted" in summary:
+            return run.summary["roc_auc_unweighted"]
+
         if self.verbose:
             msg.info(
                 f"Watcher: Run {run_id} has no performance metric. Pinging again at next eval time.",
diff --git a/src/psycopt2d/tables/performance_by_threshold.py b/src/psycopt2d/tables/performance_by_threshold.py
index 688faaa3..fde2f8b6 100644
--- a/src/psycopt2d/tables/performance_by_threshold.py
+++ b/src/psycopt2d/tables/performance_by_threshold.py
@@ -1,6 +1,6 @@
 """Get performance by which threshold is used to classify positive."""
 from collections.abc import Iterable
-from typing import Optional, Union
+from typing import Optional, Sequence, Union
 
 import numpy as np
 import pandas as pd
@@ -9,8 +9,8 @@
 
 
 def performance_by_threshold(  # pylint: disable=too-many-locals
-    labels: Iterable[int],
-    pred_probs: Iterable[float],
+    labels: Sequence[int],
+    pred_probs: Sequence[float],
     positive_threshold: float,
     round_to: int = 4,
 ) -> pd.DataFrame:
@@ -26,7 +26,7 @@ def performance_by_threshold(  # pylint: disable=too-many-locals
     Returns:
         pd.DataFrame
     """
-    preds = np.where(pred_probs > positive_threshold, 1, 0)
+    preds = np.where(pred_probs > positive_threshold, 1, 0)  # type: ignore
 
     conf_matrix = confusion_matrix(labels, preds)
 
@@ -141,33 +141,31 @@ def days_from_first_positive_to_diagnosis(
         ]
     ]
 
-    warning_days = df["warning_days"].agg(aggregation_method)
-
-    return warning_days
+    return df["warning_days"].agg(aggregation_method)
 
 
 def generate_performance_by_positive_rate_table(
-    labels: Iterable[int],
-    pred_probs: Iterable[float],
-    positive_rate_thresholds: Iterable[Union[int, float]],
-    pred_proba_thresholds: Iterable[float],
-    ids: Iterable[Union[int, float]],
-    pred_timestamps: Iterable[pd.Timestamp],
-    outcome_timestamps: Iterable[pd.Timestamp],
+    labels: Sequence[int],
+    pred_probs: Sequence[float],
+    positive_rate_thresholds: Sequence[Union[int, float]],
+    pred_proba_thresholds: Sequence[float],
+    ids: Sequence[Union[int, float]],
+    pred_timestamps: Sequence[pd.Timestamp],
+    outcome_timestamps: Sequence[pd.Timestamp],
     output_format: Optional[str] = "wandb_table",
 ) -> Union[pd.DataFrame, str]:
     """Generates a performance_by_threshold table as either a DataFrame or html
     object.
 
     Args:
-        labels (Iterable[int]): True labels.
-        pred_probs (Iterable[float]): Predicted probabilities.
-        positive_rate_thresholds (Iterable[float]): Positive_rate_thresholds to add to the table, e.g. 0.99, 0.98 etc.
+        labels (Sequence[int]): True labels.
+        pred_probs (Sequence[float]): Predicted probabilities.
+        positive_rate_thresholds (Sequence[float]): Positive_rate_thresholds to add to the table, e.g. 0.99, 0.98 etc.
             Calculated so that the Xth percentile of predictions are classified as the positive class.
-        pred_proba_thresholds (Iterable[float]): Thresholds above which predictions are classified as positive.
-        ids (Iterable[Union[int, float]]): Ids to group on.
-        pred_timestamps (Iterable[ pd.Timestamp ]): Timestamp for each prediction time.
-        outcome_timestamps (Iterable[pd.Timestamp]): Timestamp for each outcome time.
+        pred_proba_thresholds (Sequence[float]): Thresholds above which predictions are classified as positive.
+        ids (Sequence[Union[int, float]]): Ids to group on.
+        pred_timestamps (Sequence[ pd.Timestamp ]): Timestamp for each prediction time.
+        outcome_timestamps (Sequence[pd.Timestamp]): Timestamp for each outcome time.
         output_format (str, optional): Format to output - either "df" or "wandb_table". Defaults to "df".
 
     Returns:
diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py
index 5aa91fbc..86d503dd 100644
--- a/src/psycopt2d/train_model.py
+++ b/src/psycopt2d/train_model.py
@@ -307,7 +307,14 @@ def get_col_names(cfg: DictConfig, train: pd.DataFrame) -> tuple[str, list[str]]
 def main(cfg: DictConfig):
     """Main function for training a single model."""
     # Save dictconfig for easier logging
-    dict_config: dict[str, Any] = OmegaConf.to_container(cfg)  # type: ignore
+    if isinstance(cfg, DictConfig):
+        # Create flattened dict for logging to wandb
+        # Wandb doesn't allow configs to be nested, so we
+        # flatten it.
+        dict_config_to_log: dict[str, Any] = flatten_nested_dict(OmegaConf.to_container(cfg), sep=".")  # type: ignore
+    else:
+        # For testing, we can take a FullConfig object instead. Simplifies boilerplate.
+        dict_config_to_log = cfg.__dict__
 
     if not isinstance(cfg, FullConfig):
         cfg = omegaconf_to_pydantic_objects(cfg)
@@ -319,7 +326,7 @@ def main(cfg: DictConfig):
     run = wandb.init(
         project=cfg.project.name,
         reinit=True,
-        config=flatten_nested_dict(cfg.__dict__, sep="."),
+        config=dict_config_to_log,
         mode=cfg.project.wandb.mode,
         group=cfg.project.wandb.group,
     )
diff --git a/src/psycopt2d/utils/configs.py b/src/psycopt2d/utils/configs.py
index bc5668d6..4d1ce2de 100644
--- a/src/psycopt2d/utils/configs.py
+++ b/src/psycopt2d/utils/configs.py
@@ -47,7 +47,6 @@ class ProjectConf(BaseModel):
     name: str = "psycopt2d"
     seed: int
     watcher: WatcherConf
-    wandb: WandbConf
 
 
 class DataConf(BaseModel):
diff --git a/src/psycopt2d/utils/utils.py b/src/psycopt2d/utils/utils.py
index 70b7efcc..66849b71 100644
--- a/src/psycopt2d/utils/utils.py
+++ b/src/psycopt2d/utils/utils.py
@@ -61,7 +61,7 @@ def flatten_nested_dict(
     d: dict,
     parent_key: str = "",
     sep: str = ".",
-) -> dict:
+) -> dict[str, Any]:
     """Recursively flatten an infinitely nested dict.
 
     E.g. {"level1": {"level2": "level3": {"level4": 5}}}} becomes
@@ -82,15 +82,15 @@ def flatten_nested_dict(
         new_key = parent_key + sep + k if parent_key else k
         if isinstance(v, MutableMapping):
             items.extend(
-                flatten_nested_dict(d=v, parent_key=new_key, sep=sep).items(),
+                flatten_nested_dict(d=v, parent_key=new_key, sep=sep).items(),  # type: ignore
             )  # typing: ignore
         else:
-            items.append((new_key, v))
+            items.append((new_key, v))  # type: ignore
 
-    return dict(items)
+    return dict(items)  # type: ignore
 
 
-def drop_records_if_datediff_days_smaller_than(
+def drop_records_if_datediff_days_smaller_than(  # pylint: disable=inconsistent-return-statements
     df: pd.DataFrame,
     t2_col_name: str,
     t1_col_name: str,
@@ -159,7 +159,7 @@ def calculate_performance_metrics(
         A pandas dataframe with the performance metrics.
     """
     performance_metrics = ModelPerformance.performance_metrics_from_df(
-        eval_df,
+        prediction_df=eval_df,
         prediction_col_name=prediction_probabilities_col_name,
         label_col_name=outcome_col_name,
         id_col_name=id_col_name,
@@ -247,7 +247,7 @@ def dump_to_pickle(obj: Any, path: str) -> None:
         pkl.dump(obj, f)
 
 
-def read_pickle(path: str) -> Any:
+def read_pickle(path: Union[str, Path]) -> Any:
     """Reads a pickled object from a file.
 
     Args:
@@ -411,7 +411,7 @@ def infer_col_names(
         raise ValueError(
             f"Multiple columns found and allow_multiple is {allow_multiple}.",
         )
-    elif len(col_name) == 0:
+    elif not col_name:
         raise ValueError("No outcome col name inferred")
     else:
         raise ValueError("No outcomes inferred")
@@ -439,7 +439,7 @@ def infer_y_hat_prob_col_name(
     df: pd.DataFrame,
     prefix="y_hat_prob",
     allow_multiple: bool = False,
-) -> str:
+) -> list[str]:
     """Infer the y_hat_prob column name from the dataframe."""
     return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple)
 
diff --git a/tests/test_load.py b/tests/test_load.py
index bf5a9f8f..094fe3c6 100644
--- a/tests/test_load.py
+++ b/tests/test_load.py
@@ -15,10 +15,10 @@ def test_load_lookbehind_exceeds_lookbehind_threshold():
 
         cfg = omegaconf_to_pydantic_objects(cfg)
 
-        cfg.data.min_lookahead_days = 90
+        cfg.data.min_lookahead_days = 30
         split_dataset = load_train_and_val_from_cfg(cfg)
 
-        assert split_dataset.train.shape == (644, 6)
+        assert split_dataset.train.shape[1] == 6
 
 
 def test_load_lookbehind_not_in_lookbehind_combination():
@@ -34,4 +34,4 @@ def test_load_lookbehind_not_in_lookbehind_combination():
         cfg.data.lookbehind_combination = [30]
         split_dataset = load_train_and_val_from_cfg(cfg)
 
-        assert split_dataset.train.shape == (700, 6)
+        assert split_dataset.train.shape[1] == 6

From ad403a3fb4e1c54b438b34fed64470d5ce8a59c8 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Tue, 25 Oct 2022 09:45:50 +0200
Subject: [PATCH 41/57] refactor: misc. refactor

---
 application/train_and_log_models.py           | 69 +++++++++++++------
 src/psycopt2d/load.py                         |  3 +-
 .../tables/performance_by_threshold.py        |  3 +-
 src/psycopt2d/train_model.py                  |  3 +-
 src/psycopt2d/utils/utils.py                  |  3 +-
 5 files changed, 53 insertions(+), 28 deletions(-)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index 67f10ece..25e22850 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -9,6 +9,7 @@
 import subprocess
 import time
 from pathlib import Path
+from queue import Full
 
 import pandas as pd
 from hydra import compose, initialize
@@ -65,8 +66,8 @@ def infer_possible_look_distances(df: pd.DataFrame) -> PossibleLookDistanceDays:
 class LookDirectionCombination(BaseModel):
     """A combination of lookbehind and lookahead days."""
 
-    lookbehind: int
-    lookahead: int
+    behind_days: int
+    ahead_days: int
 
 
 def start_trainer(
@@ -81,7 +82,7 @@ def start_trainer(
         "src/psycopt2d/train_model.py",
         f"model={cfg.model.model_name}",
         f"data.min_lookbehind_days={max(cfg.data.lookbehind_combination)}",
-        f"data.min_lookahead_days={cell.lookahead}",
+        f"data.min_lookahead_days={cell.ahead_days}",
         f"project.wandb.group='{wandb_group_override}'",
         f"hydra.sweeper.n_trials={cfg.train.n_trials_per_lookdirection_combination}",
         f"project.wandb.mode={cfg.project.wandb.mode}",
@@ -137,11 +138,6 @@ def train_models_for_each_cell_in_grid(
     random_word = RandomWords()
 
     # Create all combinations of lookbehind and lookahead days
-    lookbehind_combinations = [
-        LookDirectionCombination(lookbehind=lookbehind, lookahead=lookahead)
-        for lookbehind in possible_look_distances.behind
-        for lookahead in possible_look_distances.ahead
-    ]
 
     random.shuffle(lookbehind_combinations)
 
@@ -187,7 +183,7 @@ def train_models_for_each_cell_in_grid(
                 config_file_name=config_file_name,
                 cell=combination,
                 wandb_group_override=wandb_group,
-            )
+            ),
         )
 
 
@@ -212,33 +208,62 @@ def main():
 
     if cfg.project.wandb.mode == "run":
         msg.warn(
-            f"wandb.mode is {cfg.project.wandb.mode}, not using the watcher. This will substantially slow down training."
+            f"wandb.mode is {cfg.project.wandb.mode}, not using the watcher. This will substantially slow down training.",
         )
 
-    # TODO: Watcher must be instantiated once for each cell in the grid, otherwise
-    # it will compare max performances across all cells.
     train = load_train_raw(cfg=cfg)
+    possible_look_distances = get_possible_look_distances(msg, cfg, train)
+
+    if not cfg.train.gpu:
+        msg.warn("Not using GPU for training")
+
+    train_models_for_each_cell_in_grid(
+        cfg=cfg,
+        possible_look_distances=possible_look_distances,
+        config_file_name=config_file_name,
+    )
+
+
+def get_possible_look_distances(msg: Printer, cfg: FullConfig, train: pd.DataFrame):
+    """Some look_ahead and look_behind distances will result in 0 valid prediction times. Only return combinations which will allow some prediction times.
+
+    E.g. if we only have 4 years of data:
+    - min_lookahead = 2 years
+    - min_lookbehind = 3 years
+
+    Will mean that no rows satisfy the criteria.
+    """
+
     possible_look_distances = infer_possible_look_distances(df=train)
 
+    lookbehind_combinations = [
+        LookDirectionCombination(behind_days=behind_days, ahead_days=ahead_days)
+        for behind_days in possible_look_distances.behind
+        for ahead_days in possible_look_distances.ahead
+    ]
+
+    # Don't try look distance combinations which will result in 0 rows
+    max_date_interval_in_dataset = max(train[cfg.data.pred_timestamp_col_name]) - max(
+        train[cfg.data.pred_timestamp_col_name]
+    )
+
+    possible_look_distances = [
+        dist
+        for dist in lookbehind_combinations
+        if ((dist.ahead + dist.behind_days) < max_date_interval_in_dataset)
+    ]
+
     # Remove "9999" from possible look distances behind
     if cfg.data.max_lookbehind_days:
         possible_look_distances.behind = [
             dist
             for dist in possible_look_distances.behind
-            if not int(dist) > cfg.data.max_lookbehind_days
+            if int(dist) <= cfg.data.max_lookbehind_days
         ]
 
     msg.info(f"Possible lookbehind days: {possible_look_distances.behind}")
     msg.info(f"Possible lookahead days: {possible_look_distances.ahead}")
-
-    if not cfg.train.gpu:
-        msg.warn("Not using GPU for training")
-
-    train_models_for_each_cell_in_grid(
-        cfg=cfg,
-        possible_look_distances=possible_look_distances,
-        config_file_name=config_file_name,
-    )
+    return possible_look_distances
 
 
 if __name__ == "__main__":
diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py
index 8a5806f5..e680a4c5 100644
--- a/src/psycopt2d/load.py
+++ b/src/psycopt2d/load.py
@@ -162,7 +162,8 @@ def _drop_rows_if_datasets_ends_within_days(
         return dataset
 
     def drop_patient_if_outcome_before_date(
-        self, dataset: pd.DataFrame
+        self,
+        dataset: pd.DataFrame,
     ) -> pd.DataFrame:
         """Drop patients within washin period."""
 
diff --git a/src/psycopt2d/tables/performance_by_threshold.py b/src/psycopt2d/tables/performance_by_threshold.py
index fde2f8b6..1e780728 100644
--- a/src/psycopt2d/tables/performance_by_threshold.py
+++ b/src/psycopt2d/tables/performance_by_threshold.py
@@ -1,6 +1,7 @@
 """Get performance by which threshold is used to classify positive."""
 from collections.abc import Iterable
-from typing import Optional, Sequence, Union
+from typing import Optional, Union
+from collections.abc import Sequence
 
 import numpy as np
 import pandas as pd
diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py
index 86d503dd..1e6a4096 100644
--- a/src/psycopt2d/train_model.py
+++ b/src/psycopt2d/train_model.py
@@ -1,8 +1,7 @@
 """Training script for training a single model for predicting t2d."""
 import os
 from collections.abc import Iterable
-from multiprocessing.sharedctypes import Value
-from typing import Any, Hashable, Optional, Union
+from typing import Any, Optional
 
 import hydra
 import numpy as np
diff --git a/src/psycopt2d/utils/utils.py b/src/psycopt2d/utils/utils.py
index 66849b71..8e820a74 100644
--- a/src/psycopt2d/utils/utils.py
+++ b/src/psycopt2d/utils/utils.py
@@ -7,7 +7,6 @@
 import time
 from collections.abc import Iterable, MutableMapping
 from datetime import date, datetime
-from multiprocessing.sharedctypes import Value
 from pathlib import Path
 from typing import Any, Optional, Union
 
@@ -20,7 +19,7 @@
 
 from psycopt2d.configs import ModelEvalData
 from psycopt2d.model_performance import ModelPerformance
-from psycopt2d.utils.configs import BaseModel, FullConfig
+from psycopt2d.utils.configs import FullConfig
 
 SHARED_RESOURCES_PATH = Path(r"E:\shared_resources")
 FEATURE_SETS_PATH = SHARED_RESOURCES_PATH / "feature_sets"

From 12f81682d60dfea34465b6f491f0f6fcb4aa2d64 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Tue, 25 Oct 2022 10:27:52 +0200
Subject: [PATCH 42/57] style: linting

---
 application/train_and_log_models.py              | 7 ++++---
 src/psycopt2d/tables/performance_by_threshold.py | 3 +--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index 25e22850..2f6125e1 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -9,7 +9,6 @@
 import subprocess
 import time
 from pathlib import Path
-from queue import Full
 
 import pandas as pd
 from hydra import compose, initialize
@@ -225,7 +224,9 @@ def main():
 
 
 def get_possible_look_distances(msg: Printer, cfg: FullConfig, train: pd.DataFrame):
-    """Some look_ahead and look_behind distances will result in 0 valid prediction times. Only return combinations which will allow some prediction times.
+    """Some look_ahead and look_behind distances will result in 0 valid
+    prediction times. Only return combinations which will allow some prediction
+    times.
 
     E.g. if we only have 4 years of data:
     - min_lookahead = 2 years
@@ -244,7 +245,7 @@ def get_possible_look_distances(msg: Printer, cfg: FullConfig, train: pd.DataFra
 
     # Don't try look distance combinations which will result in 0 rows
     max_date_interval_in_dataset = max(train[cfg.data.pred_timestamp_col_name]) - max(
-        train[cfg.data.pred_timestamp_col_name]
+        train[cfg.data.pred_timestamp_col_name],
     )
 
     possible_look_distances = [
diff --git a/src/psycopt2d/tables/performance_by_threshold.py b/src/psycopt2d/tables/performance_by_threshold.py
index 1e780728..f2cb7a25 100644
--- a/src/psycopt2d/tables/performance_by_threshold.py
+++ b/src/psycopt2d/tables/performance_by_threshold.py
@@ -1,7 +1,6 @@
 """Get performance by which threshold is used to classify positive."""
-from collections.abc import Iterable
+from collections.abc import Iterable, Sequence
 from typing import Optional, Union
-from collections.abc import Sequence
 
 import numpy as np
 import pandas as pd

From 5f2ef6e2d3ed8973790fbdefd6bb49957d6ba148 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Tue, 25 Oct 2022 10:49:26 +0200
Subject: [PATCH 43/57] style: linting

---
 application/train_and_log_models.py     | 52 ++++++++++++-------------
 src/psycopt2d/model_training_watcher.py | 37 +++++++++---------
 2 files changed, 45 insertions(+), 44 deletions(-)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index 9c2d388a..746b3f59 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -206,32 +206,6 @@ def load_cfg(config_file_name):
     return cfg
 
 
-def main():
-    """Main."""
-    msg = Printer(timestamp=True)
-
-    config_file_name = "default_config.yaml"
-
-    cfg = load_cfg(config_file_name=config_file_name)
-
-    if cfg.project.wandb.mode == "run":
-        msg.warn(
-            f"wandb.mode is {cfg.project.wandb.mode}, not using the watcher. This will substantially slow down training.",
-        )
-
-    train = load_train_raw(cfg=cfg)
-    possible_look_distances = get_possible_look_distances(msg, cfg, train)
-
-    if not cfg.train.gpu:
-        msg.warn("Not using GPU for training")
-
-    train_models_for_each_cell_in_grid(
-        cfg=cfg,
-        possible_look_distances=possible_look_distances,
-        config_file_name=config_file_name,
-    )
-
-
 def get_possible_look_distances(msg: Printer, cfg: FullConfig, train: pd.DataFrame):
     """Some look_ahead and look_behind distances will result in 0 valid
     prediction times. Only return combinations which will allow some prediction
@@ -276,5 +250,31 @@ def get_possible_look_distances(msg: Printer, cfg: FullConfig, train: pd.DataFra
     return possible_look_distances
 
 
+def main():
+    """Main."""
+    msg = Printer(timestamp=True)
+
+    config_file_name = "default_config.yaml"
+
+    cfg = load_cfg(config_file_name=config_file_name)
+
+    if cfg.project.wandb.mode == "run":
+        msg.warn(
+            f"wandb.mode is {cfg.project.wandb.mode}, not using the watcher. This will substantially slow down training.",
+        )
+
+    train = load_train_raw(cfg=cfg)
+    possible_look_distances = get_possible_look_distances(msg, cfg, train)
+
+    if not cfg.train.gpu:
+        msg.warn("Not using GPU for training")
+
+    train_models_for_each_cell_in_grid(
+        cfg=cfg,
+        possible_look_distances=possible_look_distances,
+        config_file_name=config_file_name,
+    )
+
+
 if __name__ == "__main__":
     main()
diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py
index e9dcfb7c..2ac24818 100644
--- a/src/psycopt2d/model_training_watcher.py
+++ b/src/psycopt2d/model_training_watcher.py
@@ -109,6 +109,23 @@ def _get_run_id(self, run_dir: Path) -> str:
         """Get the run id from a run directory."""
         return run_dir.name.split("-")[-1]
 
+    def _upload_run_dir(self, run_dir: Path) -> str:
+        """Upload a single run to wandb."""
+        # get stdout from subprocess.run
+        proc = subprocess.run(
+            ["wandb", "sync", str(run_dir), "--project", self.project_name],
+            check=True,
+            capture_output=True,
+        )
+        stdout = proc.stdout.decode("utf-8")
+        if self.verbose:
+            msg.info(f"Watcher: {stdout}")
+        return stdout
+
+    def _get_run_id(self, run_dir: Path) -> str:
+        """Get the run id from a run directory."""
+        return run_dir.name.split("-")[-1]
+
     def upload_unarchived_runs(self) -> None:
         """Upload unarchived runs to wandb."""
         for run_folder in WANDB_DIR.glob(r"offline-run*"):
@@ -214,7 +231,8 @@ def _evaluate_and_archive_finished_runs(
                 self._archive_run_dir(run_dir=self._get_run_wandb_dir(run_info.run_id))
 
     def _get_unfinished_run_ids(
-        self, run_information: list[RunInformation]
+        self,
+        run_information: list[RunInformation],
     ) -> list[str]:
         """Get the run ids of the unfinished runs."""
         return [run_info.run_id for run_info in run_information if run_info.auc is None]
@@ -248,23 +266,6 @@ def get_new_runs_and_evaluate(self) -> None:
             )
             self._evaluate_and_archive_finished_runs(run_information=run_infos)
 
-    def _upload_run_dir(self, run_dir: Path) -> str:
-        """Upload a single run to wandb."""
-        # get stdout from subprocess.run
-        proc = subprocess.run(
-            ["wandb", "sync", str(run_dir), "--project", self.project_name],
-            check=True,
-            capture_output=True,
-        )
-        stdout = proc.stdout.decode("utf-8")
-        if self.verbose:
-            msg.info(f"Watcher: {stdout}")
-        return stdout
-
-    def _get_run_id(self, run_dir: Path) -> str:
-        """Get the run id from a run directory."""
-        return run_dir.name.split("-")[-1]
-
     def upload_unarchived_runs(self) -> None:
         """Upload unarchived runs to wandb. Only adds runs that have finished
         training to the evaluation queue.

From 43e6ba49d376b6bc1470ee70c09c253e6143ffc8 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Tue, 25 Oct 2022 12:55:25 +0200
Subject: [PATCH 44/57] fix: run_id is required

---
 src/psycopt2d/model_training_watcher.py | 33 ++++++-------------------
 1 file changed, 7 insertions(+), 26 deletions(-)

diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py
index 2ac24818..d634deac 100644
--- a/src/psycopt2d/model_training_watcher.py
+++ b/src/psycopt2d/model_training_watcher.py
@@ -30,7 +30,9 @@
 class RunInformation(BaseModel):
     """Information about a wandb run."""
 
-    run_id: Optional[str]
+    # All attributes except run_id are optional, since runs can be uploaded
+    # without having been sufficiently validated.
+    run_id: str
     auc: Optional[float]
     lookbehind_days: Optional[Union[int, list[int]]]
     lookahead_days: Optional[int]
@@ -122,29 +124,6 @@ def _upload_run_dir(self, run_dir: Path) -> str:
             msg.info(f"Watcher: {stdout}")
         return stdout
 
-    def _get_run_id(self, run_dir: Path) -> str:
-        """Get the run id from a run directory."""
-        return run_dir.name.split("-")[-1]
-
-    def upload_unarchived_runs(self) -> None:
-        """Upload unarchived runs to wandb."""
-        for run_folder in WANDB_DIR.glob(r"offline-run*"):
-            run_id = self._get_run_id(run_folder)
-
-            wandb_sync_stdout = self._upload_run_dir(run_folder)
-
-            if "...done" not in wandb_sync_stdout:
-                if ".wandb file is empty" in wandb_sync_stdout:
-                    if self.verbose:
-                        msg.warn(f"Run {run_id} is still running. Skipping.")
-                else:
-                    raise ValueError(
-                        f"wandb sync failed, returned: {wandb_sync_stdout}",
-                    )
-                continue
-
-            self.run_id_eval_candidates_queue.append(run_id)
-
     def _get_run_evaluation_data_dir(self, run_id: str) -> Path:
         """Get the evaluation path for a single run."""
         return list(self.model_data_dir.glob(f"*{run_id}*"))[0]
@@ -203,8 +182,10 @@ def _evaluate_and_archive_finished_runs(
         lookbehind/-ahead days, and fully evaluate the best performing.
         Move all wandb run dirs to the archive folder.
         """
-        finished_runs = [
-            run_info for run_info in run_information if run_info.auc is not None
+        finished_runs: list[RunInformation] = [
+            run_info
+            for run_info in run_information
+            if run_info.auc and run_info.lookahead_lookbehind_combined
         ]
         # sort to only upload the best in in each group
         finished_runs.sort(

From bbb7dea2c323c187542f4e88693771c1079df03c Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Tue, 25 Oct 2022 12:55:31 +0200
Subject: [PATCH 45/57] fix: add data dir to synth dataset

---
 src/psycopt2d/config/data/synth_data.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/psycopt2d/config/data/synth_data.yaml b/src/psycopt2d/config/data/synth_data.yaml
index b60f3080..a07c524c 100644
--- a/src/psycopt2d/config/data/synth_data.yaml
+++ b/src/psycopt2d/config/data/synth_data.yaml
@@ -1,5 +1,7 @@
 # @package _global_
 data:
+  dir: tests/test_data/synth_splits
+  suffix: csv
   n_training_samples: null
   min_lookahead_days: 30
   min_lookbehind_days: 100

From 4bd3e7ea8225ae57410920dbb729ca626421364c Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Tue, 25 Oct 2022 13:18:35 +0200
Subject: [PATCH 46/57] fix: minor fixes after merge

---
 .../config/preprocessing/default_preprocessing.yaml   |  7 ++++---
 src/psycopt2d/train_model.py                          | 10 +++++-----
 src/psycopt2d/utils/configs.py                        | 11 ++++++++---
 tests/test_auc_by_group_table.py                      |  6 +++---
 tests/test_train_model.py                             |  4 ++--
 5 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/src/psycopt2d/config/preprocessing/default_preprocessing.yaml b/src/psycopt2d/config/preprocessing/default_preprocessing.yaml
index 0d33340a..1197da95 100644
--- a/src/psycopt2d/config/preprocessing/default_preprocessing.yaml
+++ b/src/psycopt2d/config/preprocessing/default_preprocessing.yaml
@@ -2,6 +2,7 @@ convert_to_boolean: False # (Boolean): Convert all prediction values (except gen
 convert_datetimes_to: False # (str): Options include ordinal or False
 imputation_method: "most_frequent" # (str): Options include 2most_frequent"
 transform: null # (str|null): Transformation applied to all predictors after imputation. Options include "z-score-normalization"
-feature_selection_method: null
-feature_selection_params:
-  percentile: 10 # (int): Percent of features to keep. Defaults to 10.
+feature_selection:
+  name: null
+  params:
+    percentile: 10 # (int): Percent of features to keep. Defaults to 10.
diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py
index cc32513d..ff7aadf2 100644
--- a/src/psycopt2d/train_model.py
+++ b/src/psycopt2d/train_model.py
@@ -64,23 +64,23 @@ def create_preprocessing_pipeline(cfg):
             ("z-score-normalization", StandardScaler()),
         )
 
-    if cfg.preprocessing.feature_selection_method == "f_classif":
+    if cfg.preprocessing.feature_selection.name == "f_classif":
         steps.append(
             (
                 "feature_selection",
                 SelectPercentile(
                     f_classif,
-                    percentile=cfg.preprocessing.feature_selection_params.percentile,
+                    percentile=cfg.preprocessing.feature_selection.params["percentile"],
                 ),
             ),
         )
-    if cfg.preprocessing.feature_selection_method == "chi2":
+    if cfg.preprocessing.feature_selection.name == "chi2":
         steps.append(
             (
                 "feature_selection",
                 SelectPercentile(
                     chi2,
-                    percentile=cfg.preprocessing.feature_selection_params.percentile,
+                    percentile=cfg.preprocessing.feature_selection.params["percentile"],
                 ),
             ),
         )
@@ -351,7 +351,7 @@ def main(cfg: DictConfig):
         config=dict_config_to_log,
         mode=cfg.project.wandb.mode,
         group=cfg.project.wandb.group,
-        entity=cfg.project.wandb_entity,
+        entity=cfg.project.wandb.entity,
     )
 
     if run is None:
diff --git a/src/psycopt2d/utils/configs.py b/src/psycopt2d/utils/configs.py
index 448980ee..98461894 100644
--- a/src/psycopt2d/utils/configs.py
+++ b/src/psycopt2d/utils/configs.py
@@ -77,6 +77,13 @@ class DataConf(BaseModel):
     lookbehind_combination: Optional[list[int]]
 
 
+class FeatureSelectionConf(BaseModel):
+    """Configuration for feature selection methods"""
+
+    name: Optional[str]
+    params: Optional[dict]
+
+
 class PreprocessingConf(BaseModel):
     """Preprocessing config."""
 
@@ -86,6 +93,7 @@ class PreprocessingConf(BaseModel):
     transform: Optional[
         str
     ]  # (str|null): Transformation applied to all predictors after imputation. Options include "z-score-normalization"
+    feature_selection: FeatureSelectionConf
 
 
 class ModelConf(BaseModel):
@@ -130,9 +138,6 @@ class FullConfig(BaseModel):
     eval: EvalConf
 
 
-# ? Should FullConfig be here or in another location?
-
-
 def omegaconf_to_pydantic_objects(conf: DictConfig) -> FullConfig:
     """Converts an omegaconf DictConfig to a pydantic object.
 
diff --git a/tests/test_auc_by_group_table.py b/tests/test_auc_by_group_table.py
index 72c0975c..3f91de9f 100644
--- a/tests/test_auc_by_group_table.py
+++ b/tests/test_auc_by_group_table.py
@@ -1,11 +1,11 @@
 """table_test_auc_by_group_table."""
 # pylint: disable=missing-function-docstring
 
-from psycopt2d.tables import auc_by_group_table
-from psycopt2d.utils import bin_continuous_data
+from psycopt2d.tables import auc_by_group_df
+from psycopt2d.utils.utils import bin_continuous_data
 
 
-def test_auc_by_group_table(synth_data):
+def test_auc_by_group_df(synth_data):
     synth_data["Age bins"] = bin_continuous_data(
         synth_data["age"],
         bins=[0, 18, 30, 50, 120],
diff --git a/tests/test_train_model.py b/tests/test_train_model.py
index 765e180f..886d512f 100644
--- a/tests/test_train_model.py
+++ b/tests/test_train_model.py
@@ -77,8 +77,8 @@ def test_feature_selection():
             config_name=CONFIG_FILE_NAME,
             overrides=[
                 INTEGRATION_TESTING_MODEL_OVERRIDE,
-                "preprocessing.feature_selection_method=f_classif",
-                "preprocessing.feature_selection_params.percentile=10",
+                "preprocessing.feature_selection.name=f_classif",
+                "preprocessing.feature_selection.params.percentile=10",
                 # "project.wandb_mode=run",
             ],
         )

From 0ebeda4c99906371237106a2f4cfb538dcf0c5c0 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Tue, 25 Oct 2022 13:20:08 +0200
Subject: [PATCH 47/57] style: linting

---
 src/psycopt2d/utils/configs.py | 2 +-
 tests/test_train_model.py      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/psycopt2d/utils/configs.py b/src/psycopt2d/utils/configs.py
index 98461894..fb6f18ad 100644
--- a/src/psycopt2d/utils/configs.py
+++ b/src/psycopt2d/utils/configs.py
@@ -78,7 +78,7 @@ class DataConf(BaseModel):
 
 
 class FeatureSelectionConf(BaseModel):
-    """Configuration for feature selection methods"""
+    """Configuration for feature selection methods."""
 
     name: Optional[str]
     params: Optional[dict]
diff --git a/tests/test_train_model.py b/tests/test_train_model.py
index 886d512f..78d523f7 100644
--- a/tests/test_train_model.py
+++ b/tests/test_train_model.py
@@ -74,7 +74,7 @@ def test_feature_selection():
     """Test feature selection."""
     with initialize(version_base=None, config_path=CONFIG_DIR_PATH):
         cfg = compose(
-            config_name=CONFIG_FILE_NAME,
+            config_name=INTEGRATION_TEST_FILE_NAME,
             overrides=[
                 INTEGRATION_TESTING_MODEL_OVERRIDE,
                 "preprocessing.feature_selection.name=f_classif",

From 7c9f0c8dcbf40b05f3708bfdc534bb558fd9bea2 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Tue, 25 Oct 2022 13:40:14 +0200
Subject: [PATCH 48/57] fix: feature_selection_test requires more than 1 pred
 col

---
 src/psycopt2d/config/data/synth_data.yaml | 2 +-
 tests/test_train_model.py                 | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/psycopt2d/config/data/synth_data.yaml b/src/psycopt2d/config/data/synth_data.yaml
index a07c524c..676acce5 100644
--- a/src/psycopt2d/config/data/synth_data.yaml
+++ b/src/psycopt2d/config/data/synth_data.yaml
@@ -17,7 +17,7 @@ data:
 
   # Looking behind
   max_lookbehind_days: 1850
-  lookbehind_combination: [30, 90]
+  lookbehind_combination: [30, 60, 100]
 
 # Parameters that will only take effect if running with --multirun
 hydra:
diff --git a/tests/test_train_model.py b/tests/test_train_model.py
index 78d523f7..d4768206 100644
--- a/tests/test_train_model.py
+++ b/tests/test_train_model.py
@@ -3,6 +3,7 @@
 import pytest
 from hydra import compose, initialize
 
+from psycopt2d.load import load_train_from_cfg
 from psycopt2d.models import MODELS
 from psycopt2d.train_model import main
 from psycopt2d.utils.configs import omegaconf_to_pydantic_objects
@@ -73,13 +74,15 @@ def test_min_prediction_time_date():
 def test_feature_selection():
     """Test feature selection."""
     with initialize(version_base=None, config_path=CONFIG_DIR_PATH):
+
         cfg = compose(
             config_name=INTEGRATION_TEST_FILE_NAME,
             overrides=[
                 INTEGRATION_TESTING_MODEL_OVERRIDE,
                 "preprocessing.feature_selection.name=f_classif",
-                "preprocessing.feature_selection.params.percentile=10",
+                "preprocessing.feature_selection.params.percentile=100",
                 # "project.wandb_mode=run",
             ],
         )
+
         main(cfg)

From 67b6b70414c3f409d9ae2ade690fc10fbf06d8ed Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Tue, 25 Oct 2022 13:40:34 +0200
Subject: [PATCH 49/57] style: linting

---
 tests/test_train_model.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_train_model.py b/tests/test_train_model.py
index d4768206..84f7446f 100644
--- a/tests/test_train_model.py
+++ b/tests/test_train_model.py
@@ -3,7 +3,6 @@
 import pytest
 from hydra import compose, initialize
 
-from psycopt2d.load import load_train_from_cfg
 from psycopt2d.models import MODELS
 from psycopt2d.train_model import main
 from psycopt2d.utils.configs import omegaconf_to_pydantic_objects

From f51421c936726345073f177aee7e614d78981193 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Tue, 25 Oct 2022 13:41:42 +0200
Subject: [PATCH 50/57] test: meaningful percentiles

---
 tests/test_train_model.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/test_train_model.py b/tests/test_train_model.py
index 84f7446f..c6265627 100644
--- a/tests/test_train_model.py
+++ b/tests/test_train_model.py
@@ -79,8 +79,7 @@ def test_feature_selection():
             overrides=[
                 INTEGRATION_TESTING_MODEL_OVERRIDE,
                 "preprocessing.feature_selection.name=f_classif",
-                "preprocessing.feature_selection.params.percentile=100",
-                # "project.wandb_mode=run",
+                "preprocessing.feature_selection.params.percentile=10",
             ],
         )
 

From 917d42e08c0e51b5ae6e7c5271eb827409613fb4 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Tue, 25 Oct 2022 13:50:31 +0200
Subject: [PATCH 51/57] feat: add watcher to main training script

---
 application/train_and_log_models.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index 746b3f59..4a414d12 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -148,7 +148,7 @@ def train_models_for_each_cell_in_grid(
     active_trainers: list[subprocess.Popen] = []
 
     wandb_prefix = f"{random_word.get_random_word()}-{random_word.get_random_word()}"
-    watcher = start_watcher(cfg=cfg)
+
     while lookbehind_combinations or active_trainers:
         # Wait until there is a free slot in the trainers group
         if len(active_trainers) >= cfg.train.n_active_trainers:
@@ -173,11 +173,11 @@ def train_models_for_each_cell_in_grid(
             msg.warn(f"No rows for {combination}, continuing")
             continue
 
-        # watcher = start_watcher(cfg=cfg)
         msg.info(
             f"Spawning a new trainer with lookbehind={combination.lookbehind} and lookahead={combination.lookahead}",
         )
         wandb_group = f"{wandb_prefix}"
+
         active_trainers.append(
             start_trainer(
                 cfg=cfg,
@@ -187,13 +187,6 @@ def train_models_for_each_cell_in_grid(
             ),
         )
 
-    msg.good(
-        f"Training finished. Stopping the watcher in {cfg.project.watcher.keep_alive_after_training_minutes} minutes...",
-    )
-
-    time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes)
-    watcher.kill()
-
 
 def load_cfg(config_file_name):
     """Load config as pydantic object."""
@@ -269,12 +262,21 @@ def main():
     if not cfg.train.gpu:
         msg.warn("Not using GPU for training")
 
+    watcher = start_watcher(cfg=cfg)
+
     train_models_for_each_cell_in_grid(
         cfg=cfg,
         possible_look_distances=possible_look_distances,
         config_file_name=config_file_name,
     )
 
+    msg.good(
+        f"Training finished. Stopping the watcher in {cfg.project.watcher.keep_alive_after_training_minutes} minutes...",
+    )
+
+    time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes)
+    watcher.kill()
+
 
 if __name__ == "__main__":
     main()

From bf0bb64e7bf5ca37cc5687f671c0471a73b2b9a4 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <ryqiem@gmail.com>
Date: Tue, 25 Oct 2022 13:53:57 +0200
Subject: [PATCH 52/57] misc. fixes

---
 application/train_and_log_models.py           | 33 +++++++++++--------
 src/psycopt2d/config/data/t2d_parquet.yaml    |  2 +-
 .../config/project/default_project.yaml       |  2 +-
 src/psycopt2d/train_model.py                  |  2 ++
 4 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index 4a414d12..523c3329 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -188,7 +188,7 @@ def train_models_for_each_cell_in_grid(
         )
 
 
-def load_cfg(config_file_name):
+def load_cfg(config_file_name) -> FullConfig:
     """Load config as pydantic object."""
     with initialize(version_base=None, config_path="../src/psycopt2d/config/"):
         cfg = compose(
@@ -199,7 +199,9 @@ def load_cfg(config_file_name):
     return cfg
 
 
-def get_possible_look_distances(msg: Printer, cfg: FullConfig, train: pd.DataFrame):
+def get_possible_look_distances(
+    msg: Printer, cfg: FullConfig, train: pd.DataFrame
+) -> list[PossibleLookDistanceDays]:
     """Some look_ahead and look_behind distances will result in 0 valid
     prediction times. Only return combinations which will allow some prediction
     times.
@@ -227,7 +229,7 @@ def get_possible_look_distances(msg: Printer, cfg: FullConfig, train: pd.DataFra
     possible_look_distances = [
         dist
         for dist in lookbehind_combinations
-        if ((dist.ahead + dist.behind_days) < max_date_interval_in_dataset)
+        if ((dist.ahead_days + dist.behind_days) < max_date_interval_in_dataset)
     ]
 
     # Remove "9999" from possible look distances behind
@@ -240,6 +242,7 @@ def get_possible_look_distances(msg: Printer, cfg: FullConfig, train: pd.DataFra
 
     msg.info(f"Possible lookbehind days: {possible_look_distances.behind}")
     msg.info(f"Possible lookahead days: {possible_look_distances.ahead}")
+
     return possible_look_distances
 
 
@@ -251,10 +254,8 @@ def main():
 
     cfg = load_cfg(config_file_name=config_file_name)
 
-    if cfg.project.wandb.mode == "run":
-        msg.warn(
-            f"wandb.mode is {cfg.project.wandb.mode}, not using the watcher. This will substantially slow down training.",
-        )
+    # Override for testing
+    cfg.train.n_active_trainers = 1
 
     train = load_train_raw(cfg=cfg)
     possible_look_distances = get_possible_look_distances(msg, cfg, train)
@@ -262,7 +263,12 @@ def main():
     if not cfg.train.gpu:
         msg.warn("Not using GPU for training")
 
-    watcher = start_watcher(cfg=cfg)
+    if cfg.project.wandb.mode == "run":
+        msg.warn(
+            f"wandb.mode is {cfg.project.wandb.mode}, not using the watcher. This will substantially slow down training.",
+        )
+    else:
+        watcher = start_watcher(cfg=cfg)
 
     train_models_for_each_cell_in_grid(
         cfg=cfg,
@@ -270,12 +276,13 @@ def main():
         config_file_name=config_file_name,
     )
 
-    msg.good(
-        f"Training finished. Stopping the watcher in {cfg.project.watcher.keep_alive_after_training_minutes} minutes...",
-    )
+    if cfg.project.wand.mode != "run":
+        msg.good(
+            f"Training finished. Stopping the watcher in {cfg.project.watcher.keep_alive_after_training_minutes} minutes...",
+        )
 
-    time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes)
-    watcher.kill()
+        time.sleep(60 * cfg.project.watcher.keep_alive_after_training_minutes)
+        watcher.kill()
 
 
 if __name__ == "__main__":
diff --git a/src/psycopt2d/config/data/t2d_parquet.yaml b/src/psycopt2d/config/data/t2d_parquet.yaml
index f88a9402..6f3d394a 100644
--- a/src/psycopt2d/config/data/t2d_parquet.yaml
+++ b/src/psycopt2d/config/data/t2d_parquet.yaml
@@ -25,4 +25,4 @@ data:
 hydra:
   sweeper:
     params:
-      ++data.lookbehind_combination: choice([30, 90, 180, 365, 730], [30, 180, 730], [30, 730], [730], [365], [90], [30])
+      ++data.lookbehind_combination: choice([30, 90, 180, 365, 730], [30, 180, 730], [730], [365], [90], [30])
diff --git a/src/psycopt2d/config/project/default_project.yaml b/src/psycopt2d/config/project/default_project.yaml
index 563ab5a2..b98f94c8 100644
--- a/src/psycopt2d/config/project/default_project.yaml
+++ b/src/psycopt2d/config/project/default_project.yaml
@@ -3,7 +3,7 @@ seed: 42
 
 wandb:
   entity: "psycop" # Which entity to run WanDB in.
-  mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
+  mode: "offline" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
   group: "psycop-t2d" # Which group to run WanDB in.
 
 watcher:
diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py
index ff7aadf2..90b2ce98 100644
--- a/src/psycopt2d/train_model.py
+++ b/src/psycopt2d/train_model.py
@@ -389,6 +389,8 @@ def main(cfg: DictConfig):
             y_hat_prob_col_name="y_hat_prob",
             feature_importance_dict=get_feature_importance_dict(pipe),
             run=run,
+            pipe=pipe,
+            train_col_names=train_col_names,
         )
 
     roc_auc = roc_auc_score(

From 45a9addcec7c891dc2ec7ba11c77311c73eaa4b1 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Tue, 25 Oct 2022 14:05:03 +0200
Subject: [PATCH 53/57] fix: type errors

---
 application/train_and_log_models.py     | 53 ++++++++++++-------------
 src/psycopt2d/model_training_watcher.py | 10 +++--
 2 files changed, 32 insertions(+), 31 deletions(-)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index 523c3329..35716fe6 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -26,8 +26,8 @@
 msg = Printer(timestamp=True)
 
 
-class PossibleLookDistanceDays(BaseModel):
-    """Possible look distances."""
+class LookDistances(BaseModel):
+    """A distance of ahead and behind."""
 
     ahead: list[str]
     behind: list[str]
@@ -44,7 +44,7 @@ def load_train_raw(cfg: FullConfig):
     raise ValueError(f"Returned {len(file)} files")
 
 
-def infer_possible_look_distances(df: pd.DataFrame) -> PossibleLookDistanceDays:
+def infer_possible_look_distances(df: pd.DataFrame) -> LookDistances:
     """Infer the possible values for min_lookahead_days and
     min_lookbehind_days."""
     # Get potential lookaheads from outc_ columns
@@ -56,7 +56,7 @@ def infer_possible_look_distances(df: pd.DataFrame) -> PossibleLookDistanceDays:
     pred_col_names = infer_predictor_col_name(df=df, allow_multiple=True)
     possible_lookbehind_days = list(set(infer_look_distance(col_name=pred_col_names)))
 
-    return PossibleLookDistanceDays(
+    return LookDistances(
         ahead=possible_lookahead_days,
         behind=possible_lookbehind_days,
     )
@@ -128,7 +128,7 @@ def start_watcher(cfg: FullConfig) -> subprocess.Popen:
 
 def train_models_for_each_cell_in_grid(
     cfg: FullConfig,
-    possible_look_distances: PossibleLookDistanceDays,
+    possible_look_distances: LookDistances,
     config_file_name: str,
 ):
     """Train a model for each cell in the grid of possible look distances."""
@@ -164,8 +164,8 @@ def train_models_for_each_cell_in_grid(
 
         # Check if any rows in the given combinatin of lookbehind and lookahead days
         cfg_for_checking_any_rows = cfg.copy()
-        cfg_for_checking_any_rows.data.min_lookbehind_days = combination.lookbehind
-        cfg_for_checking_any_rows.data.min_lookahead_days = combination.lookahead
+        cfg_for_checking_any_rows.data.min_lookbehind_days = combination.behind_days
+        cfg_for_checking_any_rows.data.min_lookahead_days = combination.ahead_days
 
         train = load_train_from_cfg(cfg=cfg)
 
@@ -174,7 +174,7 @@ def train_models_for_each_cell_in_grid(
             continue
 
         msg.info(
-            f"Spawning a new trainer with lookbehind={combination.lookbehind} and lookahead={combination.lookahead}",
+            f"Spawning a new trainer with lookbehind={combination.behind_days} and lookahead={combination.ahead_days}",
         )
         wandb_group = f"{wandb_prefix}"
 
@@ -201,7 +201,7 @@ def load_cfg(config_file_name) -> FullConfig:
 
 def get_possible_look_distances(
     msg: Printer, cfg: FullConfig, train: pd.DataFrame
-) -> list[PossibleLookDistanceDays]:
+) -> list[LookDirectionCombination]:
     """Some look_ahead and look_behind distances will result in 0 valid
     prediction times. Only return combinations which will allow some prediction
     times.
@@ -213,37 +213,36 @@ def get_possible_look_distances(
     Will mean that no rows satisfy the criteria.
     """
 
-    possible_look_distances = infer_possible_look_distances(df=train)
+    look_combinations_in_dataset = infer_possible_look_distances(df=train)
 
-    lookbehind_combinations = [
+    look_distance_combinations = [
         LookDirectionCombination(behind_days=behind_days, ahead_days=ahead_days)
-        for behind_days in possible_look_distances.behind
-        for ahead_days in possible_look_distances.ahead
+        for behind_days in look_combinations_in_dataset.behind
+        for ahead_days in look_combinations_in_dataset.ahead
     ]
 
     # Don't try look distance combinations which will result in 0 rows
-    max_date_interval_in_dataset = max(train[cfg.data.pred_timestamp_col_name]) - max(
+    max_distance_in_dataset_days = max(train[cfg.data.pred_timestamp_col_name]) - max(
         train[cfg.data.pred_timestamp_col_name],
     )
 
-    possible_look_distances = [
+    look_combinations_without_rows = [
         dist
-        for dist in lookbehind_combinations
-        if ((dist.ahead_days + dist.behind_days) < max_date_interval_in_dataset)
+        for dist in look_distance_combinations
+        if ((dist.ahead_days + dist.behind_days)) > max_distance_in_dataset_days
     ]
 
-    # Remove "9999" from possible look distances behind
-    if cfg.data.max_lookbehind_days:
-        possible_look_distances.behind = [
-            dist
-            for dist in possible_look_distances.behind
-            if int(dist) <= cfg.data.max_lookbehind_days
-        ]
+    msg.info(
+        f"Not fitting model to {look_combinations_without_rows}, since no rows satisfy the criteria."
+    )
 
-    msg.info(f"Possible lookbehind days: {possible_look_distances.behind}")
-    msg.info(f"Possible lookahead days: {possible_look_distances.ahead}")
+    look_combinations_with_rows = [
+        dist
+        for dist in look_distance_combinations
+        if ((dist.ahead_days + dist.behind_days) < max_distance_in_dataset_days)
+    ]
 
-    return possible_look_distances
+    return look_combinations_with_rows
 
 
 def main():
diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py
index d634deac..d6d05c1d 100644
--- a/src/psycopt2d/model_training_watcher.py
+++ b/src/psycopt2d/model_training_watcher.py
@@ -199,15 +199,17 @@ def _evaluate_and_archive_finished_runs(
         if finished_runs:
             for run_info in finished_runs:
                 if (
-                    run_info.auc
-                    > self.max_performances[run_info.lookahead_lookbehind_combined]
+                    run_info.auc  # type: ignore
+                    > self.max_performances[
+                        run_info.lookahead_lookbehind_combined  # type: ignore
+                    ]
                 ):
                     msg.good(
                         f"New record performance for {run_info.lookahead_lookbehind_combined}! AUC: {run_info.auc}",
                     )
                     self.max_performances[
-                        run_info.lookahead_lookbehind_combined
-                    ] = run_info.auc
+                        run_info.lookahead_lookbehind_combined  # type: ignore
+                    ] = run_info.auc  # type: ignore
                     self._do_evaluation(run_info.run_id)
                 self._archive_run_dir(run_dir=self._get_run_wandb_dir(run_info.run_id))
 

From faa43dc302e6c236cf141737b5b3aded4ad1ea0a Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Tue, 25 Oct 2022 14:11:01 +0200
Subject: [PATCH 54/57] fix: type errors

---
 application/train_and_log_models.py | 47 +++++++++++------------------
 1 file changed, 17 insertions(+), 30 deletions(-)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index 35716fe6..52b87ee8 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -9,6 +9,7 @@
 import subprocess
 import time
 from pathlib import Path
+from typing import Union
 
 import pandas as pd
 from hydra import compose, initialize
@@ -26,11 +27,11 @@
 msg = Printer(timestamp=True)
 
 
-class LookDistances(BaseModel):
+class LookDistance(BaseModel):
     """A distance of ahead and behind."""
 
-    ahead: list[str]
-    behind: list[str]
+    behind_days: list[Union[int, float]]
+    ahead_days: list[Union[int, float]]
 
 
 def load_train_raw(cfg: FullConfig):
@@ -44,7 +45,7 @@ def load_train_raw(cfg: FullConfig):
     raise ValueError(f"Returned {len(file)} files")
 
 
-def infer_possible_look_distances(df: pd.DataFrame) -> LookDistances:
+def infer_possible_look_distances(df: pd.DataFrame) -> LookDistance:
     """Infer the possible values for min_lookahead_days and
     min_lookbehind_days."""
     # Get potential lookaheads from outc_ columns
@@ -56,23 +57,16 @@ def infer_possible_look_distances(df: pd.DataFrame) -> LookDistances:
     pred_col_names = infer_predictor_col_name(df=df, allow_multiple=True)
     possible_lookbehind_days = list(set(infer_look_distance(col_name=pred_col_names)))
 
-    return LookDistances(
-        ahead=possible_lookahead_days,
-        behind=possible_lookbehind_days,
+    return LookDistance(
+        behind_days=possible_lookahead_days,
+        ahead_days=possible_lookbehind_days,
     )
 
 
-class LookDirectionCombination(BaseModel):
-    """A combination of lookbehind and lookahead days."""
-
-    behind_days: int
-    ahead_days: int
-
-
 def start_trainer(
     cfg: FullConfig,
     config_file_name: str,
-    cell: LookDirectionCombination,
+    cell: LookDistance,
     wandb_group_override: str,
 ) -> subprocess.Popen:
     """Start a trainer."""
@@ -128,7 +122,7 @@ def start_watcher(cfg: FullConfig) -> subprocess.Popen:
 
 def train_models_for_each_cell_in_grid(
     cfg: FullConfig,
-    possible_look_distances: LookDistances,
+    possible_look_distances: list[LookDistance],
     config_file_name: str,
 ):
     """Train a model for each cell in the grid of possible look distances."""
@@ -136,20 +130,13 @@ def train_models_for_each_cell_in_grid(
 
     random_word = RandomWords()
 
-    # Create all combinations of lookbehind and lookahead days
-    lookbehind_combinations = [
-        LookDirectionCombination(lookbehind=lookbehind, lookahead=lookahead)
-        for lookbehind in possible_look_distances.behind
-        for lookahead in possible_look_distances.ahead
-    ]
-
-    random.shuffle(lookbehind_combinations)
+    random.shuffle(possible_look_distances)
 
     active_trainers: list[subprocess.Popen] = []
 
     wandb_prefix = f"{random_word.get_random_word()}-{random_word.get_random_word()}"
 
-    while lookbehind_combinations or active_trainers:
+    while possible_look_distances or active_trainers:
         # Wait until there is a free slot in the trainers group
         if len(active_trainers) >= cfg.train.n_active_trainers:
             # Drop trainers if they have finished
@@ -160,7 +147,7 @@ def train_models_for_each_cell_in_grid(
 
         # Start a new trainer
 
-        combination = lookbehind_combinations.pop()
+        combination = possible_look_distances.pop()
 
         # Check if any rows in the given combinatin of lookbehind and lookahead days
         cfg_for_checking_any_rows = cfg.copy()
@@ -201,7 +188,7 @@ def load_cfg(config_file_name) -> FullConfig:
 
 def get_possible_look_distances(
     msg: Printer, cfg: FullConfig, train: pd.DataFrame
-) -> list[LookDirectionCombination]:
+) -> list[LookDistance]:
     """Some look_ahead and look_behind distances will result in 0 valid
     prediction times. Only return combinations which will allow some prediction
     times.
@@ -216,9 +203,9 @@ def get_possible_look_distances(
     look_combinations_in_dataset = infer_possible_look_distances(df=train)
 
     look_distance_combinations = [
-        LookDirectionCombination(behind_days=behind_days, ahead_days=ahead_days)
-        for behind_days in look_combinations_in_dataset.behind
-        for ahead_days in look_combinations_in_dataset.ahead
+        LookDistance(behind_days=behind_days, ahead_days=ahead_days)
+        for behind_days in look_combinations_in_dataset.ahead_days
+        for ahead_days in look_combinations_in_dataset.behind_days
     ]
 
     # Don't try look distance combinations which will result in 0 rows

From 7fd5ba1ca2ce5e238c8e34d9a2d0347abf4134d9 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <ryqiem@gmail.com>
Date: Wed, 26 Oct 2022 12:09:05 +0200
Subject: [PATCH 55/57] fix: watcher is working

---
 application/train_and_log_models.py           | 66 +++++++++----------
 .../config/project/default_project.yaml       |  1 +
 .../project/integration_test_project.yaml     |  1 +
 .../config/train/default_training.yaml        |  2 +-
 src/psycopt2d/evaluation.py                   | 20 +-----
 src/psycopt2d/load.py                         | 28 ++------
 src/psycopt2d/model_training_watcher.py       |  9 +--
 src/psycopt2d/utils/configs.py                |  3 +-
 8 files changed, 46 insertions(+), 84 deletions(-)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index 52b87ee8..0354be81 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -30,22 +30,27 @@
 class LookDistance(BaseModel):
     """A distance of ahead and behind."""
 
-    behind_days: list[Union[int, float]]
-    ahead_days: list[Union[int, float]]
+    behind_days: Union[int, float]
+    ahead_days: Union[int, float]
 
 
 def load_train_raw(cfg: FullConfig):
     """Load the data."""
     path = Path(cfg.data.dir)
-    file = list(path.glob(pattern=r"*train*"))
+    file_names = list(path.glob(pattern=r"*train*"))
 
-    if len(file) == 1:
-        return pd.read_parquet(file)
+    if len(file_names) == 1:
+        file_name = file_names[0]
+        file_suffix = file_name.suffix
+        if file_suffix == ".parquet":
+            return pd.read_parquet(file_name)
+        elif file_suffix == ".csv":
+            return pd.read_csv(file_name)
 
-    raise ValueError(f"Returned {len(file)} files")
+    raise ValueError(f"Returned {len(file_names)} files")
 
 
-def infer_possible_look_distances(df: pd.DataFrame) -> LookDistance:
+def infer_possible_look_distances(df: pd.DataFrame) -> list[LookDistance]:
     """Infer the possible values for min_lookahead_days and
     min_lookbehind_days."""
     # Get potential lookaheads from outc_ columns
@@ -57,10 +62,14 @@ def infer_possible_look_distances(df: pd.DataFrame) -> LookDistance:
     pred_col_names = infer_predictor_col_name(df=df, allow_multiple=True)
     possible_lookbehind_days = list(set(infer_look_distance(col_name=pred_col_names)))
 
-    return LookDistance(
-        behind_days=possible_lookahead_days,
-        ahead_days=possible_lookbehind_days,
-    )
+    return [
+        LookDistance(
+            behind_days=lookbehind_days,
+            ahead_days=lookahead_days,
+        )
+        for lookahead_days in possible_lookahead_days
+        for lookbehind_days in possible_lookbehind_days
+    ]
 
 
 def start_trainer(
@@ -149,17 +158,6 @@ def train_models_for_each_cell_in_grid(
 
         combination = possible_look_distances.pop()
 
-        # Check if any rows in the given combinatin of lookbehind and lookahead days
-        cfg_for_checking_any_rows = cfg.copy()
-        cfg_for_checking_any_rows.data.min_lookbehind_days = combination.behind_days
-        cfg_for_checking_any_rows.data.min_lookahead_days = combination.ahead_days
-
-        train = load_train_from_cfg(cfg=cfg)
-
-        if train.shape[0] == 0:
-            msg.warn(f"No rows for {combination}, continuing")
-            continue
-
         msg.info(
             f"Spawning a new trainer with lookbehind={combination.behind_days} and lookahead={combination.ahead_days}",
         )
@@ -202,20 +200,17 @@ def get_possible_look_distances(
 
     look_combinations_in_dataset = infer_possible_look_distances(df=train)
 
-    look_distance_combinations = [
-        LookDistance(behind_days=behind_days, ahead_days=ahead_days)
-        for behind_days in look_combinations_in_dataset.ahead_days
-        for ahead_days in look_combinations_in_dataset.behind_days
-    ]
-
     # Don't try look distance combinations which will result in 0 rows
-    max_distance_in_dataset_days = max(train[cfg.data.pred_timestamp_col_name]) - max(
-        train[cfg.data.pred_timestamp_col_name],
-    )
+    max_distance_in_dataset_days = (
+        max(train[cfg.data.pred_timestamp_col_name])
+        - min(
+            train[cfg.data.pred_timestamp_col_name],
+        )
+    ).days
 
     look_combinations_without_rows = [
         dist
-        for dist in look_distance_combinations
+        for dist in look_combinations_in_dataset
         if ((dist.ahead_days + dist.behind_days)) > max_distance_in_dataset_days
     ]
 
@@ -225,7 +220,7 @@ def get_possible_look_distances(
 
     look_combinations_with_rows = [
         dist
-        for dist in look_distance_combinations
+        for dist in look_combinations_in_dataset
         if ((dist.ahead_days + dist.behind_days) < max_distance_in_dataset_days)
     ]
 
@@ -240,9 +235,8 @@ def main():
 
     cfg = load_cfg(config_file_name=config_file_name)
 
-    # Override for testing
-    cfg.train.n_active_trainers = 1
-
+    # Load dataset without dropping any rows for inferring
+    # which look distances to grid search over
     train = load_train_raw(cfg=cfg)
     possible_look_distances = get_possible_look_distances(msg, cfg, train)
 
diff --git a/src/psycopt2d/config/project/default_project.yaml b/src/psycopt2d/config/project/default_project.yaml
index b98f94c8..60ef0af8 100644
--- a/src/psycopt2d/config/project/default_project.yaml
+++ b/src/psycopt2d/config/project/default_project.yaml
@@ -10,3 +10,4 @@ watcher:
   archive_all: false
   keep_alive_after_training_minutes: 5
   n_runs_before_eval: 1
+  verbose: true
diff --git a/src/psycopt2d/config/project/integration_test_project.yaml b/src/psycopt2d/config/project/integration_test_project.yaml
index eea71af1..dceda704 100644
--- a/src/psycopt2d/config/project/integration_test_project.yaml
+++ b/src/psycopt2d/config/project/integration_test_project.yaml
@@ -8,4 +8,5 @@ watcher:
   archive_all: true
   keep_alive_after_training_minutes: 5
   n_runs_before_eval: 1
+  verbose: true
 gpu: false
diff --git a/src/psycopt2d/config/train/default_training.yaml b/src/psycopt2d/config/train/default_training.yaml
index e81d99be..f5378cb6 100644
--- a/src/psycopt2d/config/train/default_training.yaml
+++ b/src/psycopt2d/config/train/default_training.yaml
@@ -1,4 +1,4 @@
 n_splits: 3 # (int, Null): Number of k-folds during CV. If Null, loads pre-defined dataset.
 n_trials_per_lookdirection_combination: 20
-n_active_trainers: 8
+n_active_trainers: 1
 gpu: true
diff --git a/src/psycopt2d/evaluation.py b/src/psycopt2d/evaluation.py
index 5f7745da..6c3d9704 100644
--- a/src/psycopt2d/evaluation.py
+++ b/src/psycopt2d/evaluation.py
@@ -58,9 +58,7 @@ def log_feature_importances(
 def evaluate_model(
     cfg: FullConfig,
     eval_df: pd.DataFrame,
-    pipe: Pipeline,
     y_col_name: str,
-    train_col_names: Iterable[str],
     y_hat_prob_col_name: str,
     run: wandb_run,
     feature_importance_dict: Optional[dict[str, float]],
@@ -77,6 +75,8 @@ def evaluate_model(
         run (wandb_run): WandB run to log to.
         feature_importance_dict (Optional[dict[str, float]]): Dict of feature
             names and their importance. If None, will not log feature importance.
+        selected_features (Optional[list[str]]): List of selected features after preprocessing.
+            Used for plotting.
     """
     msg = Printer(timestamp=True)
 
@@ -101,22 +101,6 @@ def evaluate_model(
     pred_timestamps = eval_df[cfg.data.pred_timestamp_col_name]
     y_hat_int = np.round(y_hat_probs, 0)
 
-    if "feature_selection" in pipe["preprocessing"].named_steps:
-        selected_features = (
-            eval_df[train_col_names]
-            .columns[pipe["preprocessing"]["feature_selection"].get_support()]
-            .to_list()
-        )
-
-        run.log(
-            {
-                "feature_selection_table": feature_selection_table(
-                    feature_names=train_col_names,
-                    selected_feature_names=selected_features,
-                ),
-            },
-        )
-
     date_bins_ahead: Iterable[int] = cfg.eval.date_bins_ahead
     date_bins_behind: Iterable[int] = cfg.eval.date_bins_behind
 
diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py
index e680a4c5..c0309b5f 100644
--- a/src/psycopt2d/load.py
+++ b/src/psycopt2d/load.py
@@ -13,12 +13,9 @@
 
 from psycopt2d.evaluate_saved_model_predictions import infer_look_distance
 from psycopt2d.utils.configs import FullConfig
-from psycopt2d.utils.utils import (
-    coerce_to_datetime,
-    get_percent_lost,
-    infer_outcome_col_name,
-    infer_predictor_col_name,
-)
+from psycopt2d.utils.utils import (coerce_to_datetime, get_percent_lost,
+                                   infer_outcome_col_name,
+                                   infer_predictor_col_name)
 
 msg = Printer(timestamp=True)
 
@@ -376,10 +373,9 @@ def _keep_unique_outcome_col_with_lookahead_days_matching_conf(
         if not col_to_drop:
             return dataset
 
-        col_to_drop = col_to_drop[0] if len(col_to_drop) == 1 else outcome_cols
         df = dataset.drop(col_to_drop, axis=1)
 
-        if not isinstance(infer_outcome_col_name(df), str):
+        if not len(infer_outcome_col_name(df)) == 1:
             raise ValueError(
                 "Returning more than one outcome column, will cause problems during eval.",
             )
@@ -440,22 +436,6 @@ def load_dataset_from_dir(
             pd.DataFrame: The filtered dataset
         """
         msg.info(f"Loading {split_names}")
-        # Handle input types
-        for timedelta_arg in (
-            self.cfg.data.min_lookbehind_days,
-            self.cfg.data.min_lookahead_days,
-        ):
-            if timedelta_arg:
-                timedelta_arg = timedelta(days=timedelta_arg)  # type: ignore
-
-        for date_arg in (
-            self.cfg.data.drop_patient_if_outcome_before_date,
-            self.cfg.data.min_prediction_time_date,
-        ):
-            if isinstance(date_arg, str):
-                date_arg = coerce_to_datetime(
-                    date_repr=date_arg,
-                )
 
         # Concat splits if multiple are given
         if isinstance(split_names, (list, tuple)):
diff --git a/src/psycopt2d/model_training_watcher.py b/src/psycopt2d/model_training_watcher.py
index d6d05c1d..0de379af 100644
--- a/src/psycopt2d/model_training_watcher.py
+++ b/src/psycopt2d/model_training_watcher.py
@@ -23,7 +23,6 @@
     load_evaluation_data,
 )
 
-# Path to the wandb directory
 WANDB_DIR = PROJECT_ROOT / "wandb"
 
 
@@ -119,7 +118,9 @@ def _upload_run_dir(self, run_dir: Path) -> str:
             check=True,
             capture_output=True,
         )
+
         stdout = proc.stdout.decode("utf-8")
+
         if self.verbose:
             msg.info(f"Watcher: {stdout}")
         return stdout
@@ -139,8 +140,8 @@ def _do_evaluation(self, run_id: str) -> None:
         # get evaluation data
         eval_data = self._get_eval_data(run_id)
         # infer required column names
-        y_col_name = infer_outcome_col_name(df=eval_data.df, prefix="outc_")
-        y_hat_prob_col_name = infer_y_hat_prob_col_name(df=eval_data.df)
+        y_col_name = infer_outcome_col_name(df=eval_data.df, prefix="outc_")[0]
+        y_hat_prob_col_name = infer_y_hat_prob_col_name(df=eval_data.df)[0]
         # get wandb run
         run: Run = wandb.init(project=self.project_name, entity=self.entity, id=run_id)  # type: ignore
 
@@ -261,7 +262,7 @@ def upload_unarchived_runs(self) -> None:
 
             wandb_sync_stdout = self._upload_run_dir(run_folder)
 
-            if "...done" not in wandb_sync_stdout:
+            if "... done" not in wandb_sync_stdout:
                 if ".wandb file is empty" not in wandb_sync_stdout:
                     raise ValueError(
                         f"wandb sync failed, returned: {wandb_sync_stdout}",
diff --git a/src/psycopt2d/utils/configs.py b/src/psycopt2d/utils/configs.py
index fb6f18ad..1b43ef45 100644
--- a/src/psycopt2d/utils/configs.py
+++ b/src/psycopt2d/utils/configs.py
@@ -22,6 +22,7 @@ class Config:
         """Allow arbitrary types."""
 
         arbitrary_types_allowed = True
+        allow_mutation = False
 
 
 class WandbConf(BaseModel):
@@ -38,6 +39,7 @@ class WatcherConf(BaseModel):
     archive_all: bool
     keep_alive_after_training_minutes: Union[int, float]
     n_runs_before_eval: int
+    verbose: bool
 
 
 class ProjectConf(BaseModel):
@@ -46,7 +48,6 @@ class ProjectConf(BaseModel):
     wandb: WandbConf
     name: str = "psycopt2d"
     seed: int
-    wandb: WandbConf
     watcher: WatcherConf
 
 

From cb3de8b273ee677986a8debd0e9cf264c77d31d1 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <ryqiem@gmail.com>
Date: Wed, 26 Oct 2022 12:09:30 +0200
Subject: [PATCH 56/57] style: linting

---
 application/train_and_log_models.py | 9 +++++----
 src/psycopt2d/evaluation.py         | 2 --
 src/psycopt2d/load.py               | 8 +++++---
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/application/train_and_log_models.py b/application/train_and_log_models.py
index 0354be81..c5b5e226 100644
--- a/application/train_and_log_models.py
+++ b/application/train_and_log_models.py
@@ -21,7 +21,6 @@
     infer_outcome_col_name,
     infer_predictor_col_name,
 )
-from psycopt2d.load import load_train_from_cfg
 from psycopt2d.utils.configs import FullConfig, omegaconf_to_pydantic_objects
 
 msg = Printer(timestamp=True)
@@ -185,7 +184,9 @@ def load_cfg(config_file_name) -> FullConfig:
 
 
 def get_possible_look_distances(
-    msg: Printer, cfg: FullConfig, train: pd.DataFrame
+    msg: Printer,
+    cfg: FullConfig,
+    train: pd.DataFrame,
 ) -> list[LookDistance]:
     """Some look_ahead and look_behind distances will result in 0 valid
     prediction times. Only return combinations which will allow some prediction
@@ -211,11 +212,11 @@ def get_possible_look_distances(
     look_combinations_without_rows = [
         dist
         for dist in look_combinations_in_dataset
-        if ((dist.ahead_days + dist.behind_days)) > max_distance_in_dataset_days
+        if (dist.ahead_days + dist.behind_days) > max_distance_in_dataset_days
     ]
 
     msg.info(
-        f"Not fitting model to {look_combinations_without_rows}, since no rows satisfy the criteria."
+        f"Not fitting model to {look_combinations_without_rows}, since no rows satisfy the criteria.",
     )
 
     look_combinations_with_rows = [
diff --git a/src/psycopt2d/evaluation.py b/src/psycopt2d/evaluation.py
index 6c3d9704..003c959e 100644
--- a/src/psycopt2d/evaluation.py
+++ b/src/psycopt2d/evaluation.py
@@ -7,7 +7,6 @@
 import pandas as pd
 from omegaconf.dictconfig import DictConfig
 from sklearn.metrics import recall_score, roc_auc_score
-from sklearn.pipeline import Pipeline
 from wandb.sdk.wandb_run import Run as wandb_run  # pylint: disable=no-name-in-module
 from wasabi import Printer
 
@@ -15,7 +14,6 @@
 from psycopt2d.tables.performance_by_threshold import (
     generate_performance_by_positive_rate_table,
 )
-from psycopt2d.tables.tables import feature_selection_table
 from psycopt2d.utils.configs import FullConfig
 from psycopt2d.utils.utils import PROJECT_ROOT, positive_rate_to_pred_probs
 from psycopt2d.visualization import (
diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py
index c0309b5f..092b09ce 100644
--- a/src/psycopt2d/load.py
+++ b/src/psycopt2d/load.py
@@ -13,9 +13,11 @@
 
 from psycopt2d.evaluate_saved_model_predictions import infer_look_distance
 from psycopt2d.utils.configs import FullConfig
-from psycopt2d.utils.utils import (coerce_to_datetime, get_percent_lost,
-                                   infer_outcome_col_name,
-                                   infer_predictor_col_name)
+from psycopt2d.utils.utils import (
+    get_percent_lost,
+    infer_outcome_col_name,
+    infer_predictor_col_name,
+)
 
 msg = Printer(timestamp=True)
 

From 65dc59a10e390f60917862cfd4239f306ad1059d Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Wed, 26 Oct 2022 12:20:10 +0200
Subject: [PATCH 57/57] fix: failing tests

---
 src/psycopt2d/load.py                 | 4 ++--
 src/psycopt2d/train_and_log_models.py | 2 --
 tests/test_load.py                    | 7 +++----
 tests/test_train_model.py             | 1 +
 4 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py
index 092b09ce..41958499 100644
--- a/src/psycopt2d/load.py
+++ b/src/psycopt2d/load.py
@@ -338,9 +338,9 @@ def _drop_cols_and_rows_if_look_direction_not_met(
         for direction in ("ahead", "behind"):
 
             if direction in ("ahead", "behind"):
-                if self.cfg.data.min_lookahead_days:
+                if direction == "ahead":
                     n_days = self.cfg.data.min_lookahead_days
-                elif self.cfg.data.min_lookbehind_days:
+                elif direction == "behind":
                     n_days = self.cfg.data.min_lookbehind_days
                 else:
                     continue
diff --git a/src/psycopt2d/train_and_log_models.py b/src/psycopt2d/train_and_log_models.py
index 764b28af..3608733f 100644
--- a/src/psycopt2d/train_and_log_models.py
+++ b/src/psycopt2d/train_and_log_models.py
@@ -8,8 +8,6 @@
 - Run this script from project root with `python src/psycopt2d/train_and_log_models.py`
 """
 
-# TODO: Should be unified with the other train_and_log_models in application. Will be done when merging parent branch.
-
 import subprocess
 import time
 
diff --git a/tests/test_load.py b/tests/test_load.py
index 094fe3c6..90991383 100644
--- a/tests/test_load.py
+++ b/tests/test_load.py
@@ -11,14 +11,13 @@ def test_load_lookbehind_exceeds_lookbehind_threshold():
     with initialize(version_base=None, config_path="../src/psycopt2d/config/"):
         cfg = compose(
             config_name="integration_testing.yaml",
+            overrides=["data.min_lookbehind_days=60"],
         )
 
         cfg = omegaconf_to_pydantic_objects(cfg)
-
-        cfg.data.min_lookahead_days = 30
         split_dataset = load_train_and_val_from_cfg(cfg)
 
-        assert split_dataset.train.shape[1] == 6
+        assert split_dataset.train.shape[1] == 7
 
 
 def test_load_lookbehind_not_in_lookbehind_combination():
@@ -27,11 +26,11 @@ def test_load_lookbehind_not_in_lookbehind_combination():
     with initialize(version_base=None, config_path="../src/psycopt2d/config/"):
         cfg = compose(
             config_name="integration_testing.yaml",
+            overrides=["data.lookbehind_combination=[30]"],
         )
 
         cfg = omegaconf_to_pydantic_objects(cfg)
 
-        cfg.data.lookbehind_combination = [30]
         split_dataset = load_train_and_val_from_cfg(cfg)
 
         assert split_dataset.train.shape[1] == 6
diff --git a/tests/test_train_model.py b/tests/test_train_model.py
index c6265627..21762d11 100644
--- a/tests/test_train_model.py
+++ b/tests/test_train_model.py
@@ -3,6 +3,7 @@
 import pytest
 from hydra import compose, initialize
 
+from psycopt2d.load import load_train_from_cfg
 from psycopt2d.models import MODELS
 from psycopt2d.train_model import main
 from psycopt2d.utils.configs import omegaconf_to_pydantic_objects