This repository has been archived by the owner on May 1, 2023. It is now read-only.

Merge pull request #348 from Aarhus-Psychiatry-Research/marbern/refactor_load

Marbern/refactor load
MartinBernstorff authored Dec 22, 2022
2 parents 0a6f407 + 0b09b88 commit 3047536
Showing 118 changed files with 1,875 additions and 3,232 deletions.
17 changes: 0 additions & 17 deletions .pre-commit-config.yaml
@@ -13,17 +13,6 @@ repos:
        pass_filenames: false
        always_run: true

-  - repo: https://github.com/PyCQA/autoflake
-    rev: v1.7.6
-    hooks:
-      - id: autoflake
-        args:
-          [
-            "--in-place",
-            "--remove-all-unused-imports",
-            "--ignore-init-module-imports",
-          ]
-
  - repo: https://github.com/pycqa/isort
    rev: 5.10.1
    hooks:
@@ -58,12 +47,6 @@ repos:
    hooks:
      - id: black

-  - repo: https://github.com/PyCQA/flake8
-    rev: 5.0.4
-    hooks:
-      - id: flake8
-        args: [--config, .flake8]
-
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.3.0 # Use the ref you want to point at
    hooks:
File renamed without changes.
Empty file added application/config/__init__.py
Empty file.
19 changes: 19 additions & 0 deletions application/config/data/default_data.yaml
@@ -0,0 +1,19 @@
# @package _global_
data:
  # General config
  n_training_samples: null
  dir: E:\shared_resources\t2d\feature_sets\psycop_t2d_adminmanber_features_2022_12_19_15_36
  suffix: parquet

  # Feature specs
  pred_prefix: pred_
  outc_prefix: outc_

  col_name:
    pred_timestamp: timestamp
    outcome_timestamp: timestamp_first_t2d_hba1c
    id: dw_ek_borger
    age: pred_age_in_years
    exclusion_timestamp: timestamp_exclusion
    custom:
      n_hba1c: eval_hba1c_within_9999_days_count_fallback_nan
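
Because the file is tagged # @package _global_, these keys land at the top level of the composed config, so cfg.data.* is addressable directly. A minimal sketch of composing and reading it — assuming Hydra >= 1.2 and a config root of application/config; the repo itself goes through its own load_*_cfg_as_pydantic helpers, shown further down:

# Minimal sketch (not project code): compose the config tree with Hydra and
# read the data block defined above. The paths and version_base are assumptions.
from pathlib import Path

from hydra import compose, initialize_config_dir

config_dir = str(Path("application/config").resolve())  # Hydra wants an absolute path
with initialize_config_dir(config_dir=config_dir, version_base=None):
    cfg = compose(config_name="default_config")

print(cfg.data.suffix)  # -> "parquet"
print(cfg.data.col_name.id)  # -> "dw_ek_borger"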
@@ -1,7 +1,7 @@
# @package _global_
defaults:
  - project: default_project
-  - data: t2d_parquet
+  - data: default_data
  - preprocessing: default_preprocessing
  - model: xgboost
  - train: default_training
@@ -22,4 +22,4 @@ hydra:
      ++model.args.C: interval(1e-5, 1.0)
      ++model.args.l1_ratio: interval(1e-5, 1.0)
      # preprocessing
-      ++preprocessing.scaling: choice("null", "z-score-normalization")
+      ++preprocessing.post_split.scaling: choice("null", "z-score-normalization")
@@ -10,4 +10,4 @@ hydra:
  sweeper:
    params:
      # preprocessing
-      ++preprocessing.scaling: choice(null, "z-score-normalization")
+      ++preprocessing.post_split.scaling: choice(null, "z-score-normalization")
30 changes: 30 additions & 0 deletions application/config/preprocessing/default_preprocessing.yaml
@@ -0,0 +1,30 @@
# @package _global_
preprocessing:
  pre_split:
    convert_to_boolean: false
    convert_booleans_to_int: true
    drop_datetime_predictor_columns: true
    convert_datetimes_to_ordinal: false
    drop_patient_if_exclusion_before_date: 2013-01-01
    min_prediction_time_date: 2013-01-01
    min_lookahead_days: 1825
    lookbehind_combination: [30, 90, 180, 365, 730]
    min_age: 18
  post_split:
    imputation_method: most_frequent
    scaling: z-score-normalization
    feature_selection:
      name: chi2
      params:
        percentile: 20 # (int): Percent of features to keep. Defaults to 10.

# Parameters that will only take effect if running with --multirun
hydra:
  sweeper:
    params:
      ++preprocessing.post_split.imputation_method: choice("most_frequent", "mean", "median", "null")
      ++preprocessing.post_split.scaling: choice("z-score-normalization", "null")
      ++preprocessing.post_split.feature_selection.name: choice("chi2", "null")
      ++preprocessing.post_split.feature_selection.params.percentile: int(tag(log, interval(1, 90)))
      ++preprocessing.pre_split.lookbehind_combination: choice([30, 90, 180, 365, 730], [30, 180, 730], [730], [365], [90], [30])
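
For intuition, the post_split block maps onto standard scikit-learn components. A hedged sketch of an equivalent pipeline — illustrative only, not the project's actual preprocessing code; chi2 requires non-negative inputs, so feature selection is placed before the z-scoring step here:

# Illustrative sketch only: a scikit-learn pipeline mirroring the post_split
# settings above (imputation_method, feature_selection, scaling).
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

post_split_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),  # imputation_method
        ("select", SelectPercentile(chi2, percentile=20)),  # feature_selection: chi2
        ("scale", StandardScaler()),  # scaling: z-score-normalization
    ],
)
# post_split_pipeline.fit(X_train, y_train) would impute, select, then scale.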

9 changes: 9 additions & 0 deletions application/config/project/default_project.yaml
@@ -0,0 +1,9 @@
name: t2d
seed: 42

wandb:
  entity: psycop # Which WandB entity to log to.
  mode: run # Which mode to run WandB in. Takes "run", "dryrun", "offline" and "disabled".
  group: t2d # Which WandB group to log to.

gpu: true
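
These keys feed wandb.init; a sketch of the assumed mapping, with the values written as literals so it runs standalone (the project's actual call appears in application/main.py below, with a modified project name):

# Values taken from default_project.yaml above; in the project they come from
# the composed config rather than literals.
import wandb

wandb.init(
    project="t2d",  # name
    entity="psycop",  # wandb.entity
    mode="run",  # wandb.mode; requires a W&B login, use "offline" for local testing
    group="t2d",  # wandb.group
)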
22 changes: 22 additions & 0 deletions application/inspect_dataset.py
@@ -0,0 +1,22 @@
"""Example of how to inspect a dataset using the configs."""
from psycop_model_training.data_loader.utils import (
load_and_filter_train_from_cfg,
load_train_raw,
)
from psycop_model_training.utils.config_schemas import load_test_cfg_as_pydantic


def main():
"""Main."""
config_file_name = "default_config.yaml"

cfg = load_test_cfg_as_pydantic(config_file_name=config_file_name)
df = load_train_raw(cfg=cfg) # pylint: disable=unused-variable

df_filtered = load_and_filter_train_from_cfg( # pylint: disable=unused-variable
cfg=cfg,
)


if __name__ == "__main__":
main()
Empty file added application/loaders/__init__.py
Empty file.
37 changes: 37 additions & 0 deletions application/loaders/preprocessing_loaders.py
@@ -0,0 +1,37 @@
import pandas as pd
from psycopmlutils.sql.loader import sql_load


def load_timestamp_for_any_diabetes() -> pd.DataFrame:
    """Load timestamps for the broad definition of diabetes used for wash-in.

    See the R files for details.
    """
    timestamp_any_diabetes = sql_load(
        query="SELECT * FROM [fct].[psycop_t2d_first_diabetes_any]",
        format_timestamp_cols_to_datetime=False,
    )[["dw_ek_borger", "datotid_first_diabetes_any"]]

    timestamp_any_diabetes = timestamp_any_diabetes.rename(
        columns={"datotid_first_diabetes_any": "timestamp_washin"},
    )

    return timestamp_any_diabetes


def add_washin_timestamps(dataset: pd.DataFrame) -> pd.DataFrame:
    """Add wash-in timestamps to dataset.

    Wash-in is an exclusion criterion: if the patient has any visit
    that looks like diabetes before the study start (i.e. during
    wash-in), they are excluded.
    """
    timestamp_washin = load_timestamp_for_any_diabetes()

    dataset = dataset.merge(
        timestamp_washin,
        on="dw_ek_borger",
        how="left",
    )

    return dataset
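
A toy illustration of the merge semantics (the frames below are made up, since the real loader needs the production SQL database):

# Made-up data: demonstrates that add_washin_timestamps' left merge keeps
# patients without a wash-in timestamp (they get NaT).
import pandas as pd

dataset = pd.DataFrame(
    {
        "dw_ek_borger": [1, 2],
        "timestamp": pd.to_datetime(["2014-01-01", "2014-03-01"]),
    },
)
timestamp_washin = pd.DataFrame(
    {
        "dw_ek_borger": [1],
        "timestamp_washin": pd.to_datetime(["2012-05-01"]),
    },
)

merged = dataset.merge(timestamp_washin, on="dw_ek_borger", how="left")
print(merged)  # patient 2 has NaT in timestamp_washin, so is not excluded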
36 changes: 18 additions & 18 deletions application/t2d/train_and_log_models.py → application/main.py
@@ -12,19 +12,20 @@

import pandas as pd
import wandb
+from psycopmlutils.wandb.wandb_try_except_decorator import wandb_alert_on_exception
from random_word import RandomWords
from wasabi import Printer

-from psycop_model_training.config.schemas import (
-    BaseModel,
-    FullConfigSchema,
-    load_cfg_as_pydantic,
-)
-from psycop_model_training.load import load_train_raw
-from psycop_model_training.model_eval.evaluate_model import (
+from psycop_model_training.data_loader.data_loader import DataLoader
+from psycop_model_training.utils.col_name_inference import (
    infer_look_distance,
    infer_outcome_col_name,
)
+from psycop_model_training.utils.config_schemas.conf_utils import (
+    BaseModel,
+    load_app_cfg_as_pydantic,
+)
+from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema


def start_trainer(
@@ -39,13 +40,13 @@ def start_trainer(

    subprocess_args: list[str] = [
        "python",
-        "src/psycop_model_training/train_model.py",
+        "application/train_model.py",
        f"project.wandb.group='{wandb_group_override}'",
        f"project.wandb.mode={cfg.project.wandb.mode}",
        f"hydra.sweeper.n_trials={cfg.train.n_trials_per_lookahead}",
        f"hydra.sweeper.n_jobs={cfg.train.n_jobs_per_trainer}",
        f"model={model_name}",
-        f"data.min_lookahead_days={lookahead_days}",
+        f"preprocessing.pre_split.min_lookahead_days={lookahead_days}",
        "--config-name",
        f"{config_file_name}",
    ]
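
The hunk above only assembles the argument list; the launching code sits outside this excerpt. A hedged sketch of how such a trainer subprocess might be started:

# Assumption: the surrounding code launches the trainer roughly like this;
# the actual launch logic is not part of the excerpt above.
import subprocess


def launch_trainer(subprocess_args: list[str]) -> subprocess.Popen:
    # Hydra parses the key=value overrides; --config-name selects the entrypoint YAML.
    return subprocess.Popen(args=subprocess_args)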
@@ -148,6 +149,9 @@ def train_models_for_each_cell_in_grid(
            ),
        )

+        # Sleep a bit to avoid segfaults
+        time.sleep(10)
+

def get_possible_lookaheads(
    msg: Printer,
@@ -191,24 +195,20 @@ def get_possible_lookaheads(
    return list(set(possible_lookahead_days) - set(lookaheads_without_rows))


+@wandb_alert_on_exception
def main():
    """Main."""
    msg = Printer(timestamp=True)

-    debug = False
-
-    if debug:
-        config_file_name = "integration_config.yaml"
-    else:
-        config_file_name = "default_config.yaml"
+    config_file_name = "default_config.yaml"

-    cfg = load_cfg_as_pydantic(config_file_name=config_file_name)
+    cfg = load_app_cfg_as_pydantic(config_file_name=config_file_name)

    random_word = RandomWords()
    wandb_group = f"{random_word.get_random_word()}-{random_word.get_random_word()}"

    wandb.init(
-        project=cfg.project.name,
+        project=f"{cfg.project.name}-baseline-model-training",
        mode=cfg.project.wandb.mode,
        group=wandb_group,
        entity=cfg.project.wandb.entity,
@@ -217,7 +217,7 @@ def main():

    # Load dataset without dropping any rows for inferring
    # which look distances to grid search over
-    train = load_train_raw(cfg=cfg)
+    train = DataLoader(cfg=cfg).load_dataset_from_dir(split_names="train")

    possible_lookaheads = get_possible_lookaheads(
        msg=msg,
17 changes: 0 additions & 17 deletions application/t2d/inspect_dataset.py

This file was deleted.

This file was deleted.

This file was deleted.

