diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9e05136a..78bf3dca 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -13,17 +13,6 @@ repos: pass_filenames: false always_run: true - - repo: https://github.com/PyCQA/autoflake - rev: v1.7.6 - hooks: - - id: autoflake - args: - [ - "--in-place", - "--remove-all-unused-imports", - "--ignore-init-module-imports", - ] - - repo: https://github.com/pycqa/isort rev: 5.10.1 hooks: @@ -58,12 +47,6 @@ repos: hooks: - id: black - - repo: https://github.com/PyCQA/flake8 - rev: 5.0.4 - hooks: - - id: flake8 - args: [--config, .flake8] - - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.3.0 # Use the ref you want to point at hooks: diff --git a/src/psycop_model_training/config/__init__.py b/application/__init__.py similarity index 100% rename from src/psycop_model_training/config/__init__.py rename to application/__init__.py diff --git a/application/config/__init__.py b/application/config/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/application/config/data/default_data.yaml b/application/config/data/default_data.yaml new file mode 100644 index 00000000..192d0f3f --- /dev/null +++ b/application/config/data/default_data.yaml @@ -0,0 +1,19 @@ +# @package _global_ +data: + # General config + n_training_samples: null + dir: E:\shared_resources\t2d\feature_sets\psycop_t2d_adminmanber_features_2022_12_19_15_36 + suffix: parquet + + # Feature specs + pred_prefix: pred_ + outc_prefix: outc_ + + col_name: + pred_timestamp: timestamp + outcome_timestamp: timestamp_first_t2d_hba1c + id: dw_ek_borger + age: pred_age_in_years + exclusion_timestamp: timestamp_exclusion + custom: + n_hba1c: eval_hba1c_within_9999_days_count_fallback_nan \ No newline at end of file diff --git a/src/psycop_model_training/config/default_config.yaml b/application/config/default_config.yaml similarity index 90% rename from src/psycop_model_training/config/default_config.yaml rename to 
application/config/default_config.yaml index 46b91517..46192816 100644 --- a/src/psycop_model_training/config/default_config.yaml +++ b/application/config/default_config.yaml @@ -1,7 +1,7 @@ # @package _global_ defaults: - project: default_project - - data: t2d_parquet + - data: default_data - preprocessing: default_preprocessing - model: xgboost - train: default_training diff --git a/src/psycop_model_training/config/eval/default_evaluation.yaml b/application/config/eval/default_evaluation.yaml similarity index 100% rename from src/psycop_model_training/config/eval/default_evaluation.yaml rename to application/config/eval/default_evaluation.yaml diff --git a/src/psycop_model_training/config/model/ebm.yaml b/application/config/model/ebm.yaml similarity index 100% rename from src/psycop_model_training/config/model/ebm.yaml rename to application/config/model/ebm.yaml diff --git a/src/psycop_model_training/config/model/logistic-regression.yaml b/application/config/model/logistic-regression.yaml similarity index 90% rename from src/psycop_model_training/config/model/logistic-regression.yaml rename to application/config/model/logistic-regression.yaml index 30ab21c4..e4b7817c 100644 --- a/src/psycop_model_training/config/model/logistic-regression.yaml +++ b/application/config/model/logistic-regression.yaml @@ -22,4 +22,4 @@ hydra: ++model.args.C: interval(1e-5, 1.0) ++model.args.l1_ratio: interval(1e-5, 1.0) # preprocessing - ++preprocessing.scaling: choice("null", "z-score-normalization") + ++preprocessing.post_split.scaling: choice("null", "z-score-normalization") diff --git a/src/psycop_model_training/config/model/naive-bayes.yaml b/application/config/model/naive-bayes.yaml similarity index 82% rename from src/psycop_model_training/config/model/naive-bayes.yaml rename to application/config/model/naive-bayes.yaml index cd605228..23899ce4 100644 --- a/src/psycop_model_training/config/model/naive-bayes.yaml +++ b/application/config/model/naive-bayes.yaml @@ -10,4 +10,4 @@ 
hydra: sweeper: params: # preprocessing - ++preprocessing.scaling: choice(null, "z-score-normalization") + ++preprocessing.post_split.scaling: choice(null, "z-score-normalization") diff --git a/src/psycop_model_training/config/model/xgboost.yaml b/application/config/model/xgboost.yaml similarity index 100% rename from src/psycop_model_training/config/model/xgboost.yaml rename to application/config/model/xgboost.yaml diff --git a/application/config/preprocessing/default_preprocessing.yaml b/application/config/preprocessing/default_preprocessing.yaml new file mode 100644 index 00000000..dcda18c1 --- /dev/null +++ b/application/config/preprocessing/default_preprocessing.yaml @@ -0,0 +1,30 @@ +# @package _global_ +preprocessing: + pre_split: + convert_to_boolean: false + convert_booleans_to_int: true + drop_datetime_predictor_columns: true + convert_datetimes_to_ordinal: false + drop_patient_if_exclusion_before_date: 2013-01-01 + min_prediction_time_date: 2013-01-01 + min_lookahead_days: 1825 + lookbehind_combination: [30, 90, 180, 365, 730] + min_age: 18 + post_split: + imputation_method: most_frequent + scaling: z-score-normalisation + feature_selection: + name: chi2 + params: + percentile: 20 # (int): Percent of features to keep. Defaults to 10. 
+ +# Parameters that will only take effect if running with --multirun +hydra: + sweeper: + params: + ++preprocessing.post_split.imputation_method: choice("most_frequent", "mean", "median", "null") + ++preprocessing.post_split.scaling: choice("z-score-normalization", "null") + ++preprocessing.post_split.feature_selection.name: choice("chi2", "null") + ++preprocessing.post_split.feature_selection.params.percentile: int(tag(log, interval(1, 90))) + ++preprocessing.pre_split.lookbehind_combination: choice([30, 90, 180, 365, 730], [30, 180, 730], [730], [365], [90], [30]) + diff --git a/application/config/project/default_project.yaml b/application/config/project/default_project.yaml new file mode 100644 index 00000000..0447cf73 --- /dev/null +++ b/application/config/project/default_project.yaml @@ -0,0 +1,9 @@ +name: t2d +seed: 42 + +wandb: + entity: psycop # Which entity to run WanDB in. + mode: run # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" + group: t2d # Which group to run WanDB in. 
+ +gpu: true diff --git a/src/psycop_model_training/config/sweeper/optuna_multithread.yaml b/application/config/sweeper/optuna_multithread.yaml similarity index 100% rename from src/psycop_model_training/config/sweeper/optuna_multithread.yaml rename to application/config/sweeper/optuna_multithread.yaml diff --git a/src/psycop_model_training/config/sweeper/optuna_singlethread.yaml b/application/config/sweeper/optuna_singlethread.yaml similarity index 100% rename from src/psycop_model_training/config/sweeper/optuna_singlethread.yaml rename to application/config/sweeper/optuna_singlethread.yaml diff --git a/src/psycop_model_training/config/train/default_training.yaml b/application/config/train/default_training.yaml similarity index 100% rename from src/psycop_model_training/config/train/default_training.yaml rename to application/config/train/default_training.yaml diff --git a/application/inspect_dataset.py b/application/inspect_dataset.py new file mode 100644 index 00000000..511cab4e --- /dev/null +++ b/application/inspect_dataset.py @@ -0,0 +1,22 @@ +"""Example of how to inspect a dataset using the configs.""" +from psycop_model_training.data_loader.utils import ( + load_and_filter_train_from_cfg, + load_train_raw, +) +from psycop_model_training.utils.config_schemas import load_test_cfg_as_pydantic + + +def main(): + """Main.""" + config_file_name = "default_config.yaml" + + cfg = load_test_cfg_as_pydantic(config_file_name=config_file_name) + df = load_train_raw(cfg=cfg) # pylint: disable=unused-variable + + df_filtered = load_and_filter_train_from_cfg( # pylint: disable=unused-variable + cfg=cfg, + ) + + +if __name__ == "__main__": + main() diff --git a/application/loaders/__init__.py b/application/loaders/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/application/loaders/preprocessing_loaders.py b/application/loaders/preprocessing_loaders.py new file mode 100644 index 00000000..e321b777 --- /dev/null +++ 
b/application/loaders/preprocessing_loaders.py @@ -0,0 +1,37 @@ +import pandas as pd +from psycopmlutils.sql.loader import sql_load + + +def load_timestamp_for_any_diabetes(): + """Loads timestamps for the broad definition of diabetes used for wash-in. + + See R files for details. + """ + timestamp_any_diabetes = sql_load( + query="SELECT * FROM [fct].[psycop_t2d_first_diabetes_any]", + format_timestamp_cols_to_datetime=False, + )[["dw_ek_borger", "datotid_first_diabetes_any"]] + + timestamp_any_diabetes = timestamp_any_diabetes.rename( + columns={"datotid_first_diabetes_any": "timestamp_washin"}, + ) + + return timestamp_any_diabetes + + +def add_washin_timestamps(dataset: pd.DataFrame) -> pd.DataFrame: + """Add washin timestamps to dataset. + + Washin is an exclusion criterion. E.g. if the patient has any visit + that looks like diabetes before the study starts (i.e. during + washin), they are excluded. + """ + timestamp_washin = load_timestamp_for_any_diabetes() + + dataset = dataset.merge( + timestamp_washin, + on="dw_ek_borger", + how="left", + ) + + return dataset diff --git a/application/t2d/train_and_log_models.py b/application/main.py similarity index 88% rename from application/t2d/train_and_log_models.py rename to application/main.py index 23ded385..0cac19e4 100644 --- a/application/t2d/train_and_log_models.py +++ b/application/main.py @@ -12,19 +12,20 @@ import pandas as pd import wandb +from psycopmlutils.wandb.wandb_try_except_decorator import wandb_alert_on_exception from random_word import RandomWords from wasabi import Printer -from psycop_model_training.config.schemas import ( - BaseModel, - FullConfigSchema, - load_cfg_as_pydantic, -) -from psycop_model_training.load import load_train_raw -from psycop_model_training.model_eval.evaluate_model import ( +from psycop_model_training.data_loader.data_loader import DataLoader +from psycop_model_training.utils.col_name_inference import ( infer_look_distance, infer_outcome_col_name, ) +from 
psycop_model_training.utils.config_schemas.conf_utils import ( + BaseModel, + load_app_cfg_as_pydantic, +) +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema def start_trainer( @@ -39,13 +40,13 @@ def start_trainer( subprocess_args: list[str] = [ "python", - "src/psycop_model_training/train_model.py", + "application/train_model.py", f"project.wandb.group='{wandb_group_override}'", f"project.wandb.mode={cfg.project.wandb.mode}", f"hydra.sweeper.n_trials={cfg.train.n_trials_per_lookahead}", f"hydra.sweeper.n_jobs={cfg.train.n_jobs_per_trainer}", f"model={model_name}", - f"data.min_lookahead_days={lookahead_days}", + f"preprocessing.pre_split.min_lookahead_days={lookahead_days}", "--config-name", f"{config_file_name}", ] @@ -148,6 +149,9 @@ def train_models_for_each_cell_in_grid( ), ) + # Sleep a bit to avoid segfaults + time.sleep(10) + def get_possible_lookaheads( msg: Printer, @@ -191,24 +195,20 @@ def get_possible_lookaheads( return list(set(possible_lookahead_days) - set(lookaheads_without_rows)) +@wandb_alert_on_exception def main(): """Main.""" msg = Printer(timestamp=True) - debug = False - - if debug: - config_file_name = "integration_config.yaml" - else: - config_file_name = "default_config.yaml" + config_file_name = "default_config.yaml" - cfg = load_cfg_as_pydantic(config_file_name=config_file_name) + cfg = load_app_cfg_as_pydantic(config_file_name=config_file_name) random_word = RandomWords() wandb_group = f"{random_word.get_random_word()}-{random_word.get_random_word()}" wandb.init( - project=cfg.project.name, + project=f"{cfg.project.name}-baseline-model-training", mode=cfg.project.wandb.mode, group=wandb_group, entity=cfg.project.wandb.entity, @@ -217,7 +217,7 @@ def main(): # Load dataset without dropping any rows for inferring # which look distances to grid search over - train = load_train_raw(cfg=cfg) + train = DataLoader(cfg=cfg).load_dataset_from_dir(split_names="train") possible_lookaheads = 
get_possible_lookaheads( msg=msg, diff --git a/application/t2d/inspect_dataset.py b/application/t2d/inspect_dataset.py deleted file mode 100644 index 53c21401..00000000 --- a/application/t2d/inspect_dataset.py +++ /dev/null @@ -1,17 +0,0 @@ -"""Example of how to inspect a dataset using the configs.""" -from psycop_model_training.config.schemas import load_cfg_as_pydantic -from psycop_model_training.load import load_train_from_cfg, load_train_raw - - -def main(): - """Main.""" - config_file_name = "default_config.yaml" - - cfg = load_cfg_as_pydantic(config_file_name=config_file_name) - df = load_train_raw(cfg=cfg) # noqa pylint: disable=unused-variable - - df_filtered = load_train_from_cfg(cfg=cfg) # noqa pylint: disable=unused-variable - - -if __name__ == "__main__": - main() diff --git a/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/10_medication.rmd b/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/10_medication.rmd deleted file mode 100644 index 88ef6c59..00000000 --- a/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/10_medication.rmd +++ /dev/null @@ -1,58 +0,0 @@ -Find first occurrence of hospital prescription or hospital redemption of diabetic medication. 
- -```{r} -library("pacman") - -p_load(tidyverse, here, future) -source(here("psycop-r-utilities", "import_from_sql.r")) -source(here("functions.r")) - -``` - -```{r} -con <- DBI::dbConnect( - odbc::odbc(), - Driver = "SQL Server", - Server = "BI-DPA-PROD", - database = "USR_PS_Forsk", - Trusted_Connection = "TRUE" -) -``` - -# Get first date of prescribed antidiabetic medication for each patient -## From only that administered -```{r} -df_first_administered_t2d_medication <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_Medicin_administreret_inkl_2021")) %>% - select(dw_ek_borger, datotid_ordination_start, atc) %>% - filter(substr(atc, 1, 3) == "A10") %>% # A10 is all antidiabetic medication - group_by(dw_ek_borger) %>% - filter(datotid_ordination_start == min(datotid_ordination_start)) %>% - collect %>% - format_sql_import() %>% - distinct(dw_ek_borger, datotid_ordination_start) -``` - -## From only that prescribed -```{r} -df_first_prescribed_t2d_medication <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_Medicin_ordineret_inkl_2021")) %>% - filter(substr(atc, 1, 3) == "A10") %>% - select(dw_ek_borger, datotid_ordinationstart, atc) %>% - rename(datotid_ordination_start = datotid_ordinationstart) %>% - group_by(dw_ek_borger) %>% - filter(datotid_ordination_start == min(datotid_ordination_start)) %>% - collect %>% - format_sql_import() %>% - distinct(dw_ek_borger, datotid_ordination_start) -``` - -## Combined -```{r} -df_first_date_of_t2d_medication_prescription <- df_first_administered_t2d_medication %>% - bind_rows(df_first_prescribed_t2d_medication) %>% - group_by(dw_ek_borger) %>% - filter(datotid_ordination_start == min(datotid_ordination_start)) %>% - filter(row_number() == 1) %>% - rename(datotid_first_t2d_medication=datotid_ordination_start) -``` \ No newline at end of file diff --git a/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/20_hba1c.rmd b/application/t2d/outcome_specification - move to 
t2d-feature-gen-repo/00_generate_dfs/20_hba1c.rmd deleted file mode 100644 index 9e6382b8..00000000 --- a/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/20_hba1c.rmd +++ /dev/null @@ -1,33 +0,0 @@ -Find the first date where a patient gets a diabetic hba1c-blood-sample. - -```{r} -library("pacman") - -p_load(tidyverse, here, future) - -``` - -```{r} -con <- DBI::dbConnect( - odbc::odbc(), - Driver = "SQL Server", - Server = "BI-DPA-PROD", - database = "USR_PS_Forsk", - Trusted_Connection = "TRUE" -) -``` - -# Get first date of hba1c above threshold -## From only that administered -```{r} -df_first_t2d_blood_sample <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_LABKA_NPU27300_HbA1c_inkl_2021")) %>% - select(dw_ek_borger, datotid_proevemodtagelse, numerisksvar, analysenavn) %>% - filter(numerisksvar >= 48) %>% - group_by(dw_ek_borger) %>% - filter(datotid_proevemodtagelse == min(datotid_proevemodtagelse)) %>% - rename(datotid_start = datotid_proevemodtagelse) %>% - collect %>% - distinct(dw_ek_borger, datotid_start) %>% - format_sql_import() -``` \ No newline at end of file diff --git a/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/30_diagnoses.rmd b/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/30_diagnoses.rmd deleted file mode 100644 index 65e9e32c..00000000 --- a/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/30_diagnoses.rmd +++ /dev/null @@ -1,67 +0,0 @@ -Find the first date where a patient gets a t2d-diagnosis in the hospital system. 
- -```{r} -library("pacman") -``` - -```{r} -con <- DBI::dbConnect( - odbc::odbc(), - Driver = "SQL Server", - Server = "BI-DPA-PROD", - database = "USR_PS_Forsk", - Trusted_Connection = "TRUE" -) -``` - -# A-diagnoses -## LPR3 -```{r} -df_lpr3_diagnoses_roughly_selected <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_LPR3kontakter_psyk_somatik_inkl_2021")) %>% - rename(datotid_start = datotid_lpr3kontaktstart) %>% - select(dw_ek_borger, datotid_start, diagnosegruppestreng) %>% - collect %>% - format_sql_import() -``` - -## LPR2 -### Inpatient visits -```{r} -df_lpr2_diagnoses_inpatient_roughly_selected <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_indlaeggelser_psyk_somatik_LPR2_inkl_2021")) %>% - rename(datotid_start = datotid_indlaeggelse) %>% - select(dw_ek_borger, datotid_start, diagnosegruppestreng) %>% - collect %>% - format_sql_import() -``` - -### Outpatient visits -```{r} -df_lpr2_diagnoses_outpatient_roughly_selected <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_besoeg_psyk_somatik_LPR2_inkl_2021")) %>% - select(dw_ek_borger, datotid_start, diagnosegruppestreng) %>% - collect %>% - format_sql_import() -``` - -## Combined -```{r} -df_all_visits_combined <- df_lpr3_diagnoses_roughly_selected %>% - bind_rows(df_lpr2_diagnoses_inpatient_roughly_selected) %>% - bind_rows(df_lpr2_diagnoses_outpatient_roughly_selected) -``` - -### T2D -```{r} -df_first_t2d_diagnoses_combined <- df_all_visits_combined %>% - keep_only_first_t2d_by_diag(date_col_string="datotid_start") %>% - rename(datotid_first_t2d_diagnosis = datotid_start) -``` - -### T1D -```{r} -df_first_t1d_diagnoses_combined <- df_all_visits_combined %>% - keep_only_first_t1d_by_diag(date_col_string="datotid_start") %>% - select(dw_ek_borger, datotid_start) -``` \ No newline at end of file diff --git a/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/41_find_first_p.rmd b/application/t2d/outcome_specification - move to 
t2d-feature-gen-repo/00_generate_dfs/41_find_first_p.rmd deleted file mode 100644 index 4f6a293a..00000000 --- a/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/41_find_first_p.rmd +++ /dev/null @@ -1,52 +0,0 @@ -```{r} -source(here("psycop-r-utilities", "import_from_sql.r")) -p_load(tidyverse) -``` - -# Remove patients with incidence before first psych-contact -## LPR3, both in and outpatient -```{r} -pt_types = c("Ambulant", "Indlagt") - -df_lpr3_preproc <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_LPR3kontakter_psyk_somatik_inkl_2021")) %>% - filter(pt_type %in% pt_types) %>% - filter(substr(shakkode_lpr3kontaktophold, 1, 4) == "6600") %>% # Only psychiatry in RM - select(dw_ek_borger, datotid_lpr3kontaktstart) %>% - rename(datotid_start = datotid_lpr3kontaktstart) %>% - collect %>% - format_sql_import() -``` - -## LPR2 -### LPR2 inpatient -```{r} -df_lpr2_inp_preproc <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_indlaeggelser_psyk_somatik_LPR2_inkl_2021")) %>% - filter(substr(shakkode_kontaktansvarlig, 1, 4) == "6600") %>% # Only psychiatry in RM - rename(datotid_start = datotid_indlaeggelse) %>% - select(dw_ek_borger, datotid_start) %>% - collect %>% - format_sql_import() -``` - -### LPR2 outpatient -```{r} -df_lpr2_outp_preproc <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_besoeg_psyk_somatik_LPR2_inkl_2021")) %>% - filter(substr(shakafskode, 1, 4) == "6600") %>% # Only psychiatry in RM - select(dw_ek_borger, datotid_start) %>% - collect %>% - format_sql_import() -``` - -# Combine all -```{r} -df_first_psych_visit <- df_lpr2_inp_preproc %>% - bind_rows(df_lpr2_outp_preproc) %>% - bind_rows(df_lpr3_preproc) %>% - group_by(dw_ek_borger) %>% - filter(datotid_start == min(datotid_start)) %>% - rename(datotid_first_psych_visit = datotid_start) %>% - select(dw_ek_borger, datotid_first_psych_visit) -``` \ No newline at end of file diff --git a/application/t2d/outcome_specification - move to 
t2d-feature-gen-repo/00_generate_dfs/49_combined.rmd b/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/49_combined.rmd deleted file mode 100644 index ebb1976d..00000000 --- a/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/49_combined.rmd +++ /dev/null @@ -1,53 +0,0 @@ -Combine medication, hba1c and diagnoses to find first date where the patient has t2d. - -```{r} -source(here("functions.r")) -p_load(odbc, dbplyr, DBI) -``` - -## Find "any" diabetes incidence (maximise sensitivity). For use in wash-in (i.e. exclusion). -```{r} -df_first_diabetes_any <- df_first_t2d_blood_sample %>% - rename(datotid_first_t2d_bs = datotid_start) %>% # Add BS - left_join(df_first_date_of_t2d_medication_prescription) %>% - left_join(df_first_t2d_diagnoses_combined) %>% - mutate(datotid_first_diabetes_any = pmin(datotid_first_t2d_medication, datotid_first_t2d_diagnosis, datotid_first_t2d_bs)) %>% - group_by(dw_ek_borger) %>% - filter(datotid_first_diabetes_any == min(datotid_first_diabetes_any)) %>% # Make sure to only have one record per patient - select(dw_ek_borger, datotid_first_diabetes_any) %>% - distinct(dw_ek_borger, datotid_first_diabetes_any) %>% - left_join(df_first_t1d_diagnoses_combined, by = "dw_ek_borger") %>% - rename(datotid_first_t1d_diagnosis = datotid_start) %>% - mutate(datotid_first_diabetes_any = if_else(is.na(datotid_first_t1d_diagnosis), datotid_first_diabetes_any, min(datotid_first_t1d_diagnosis, datotid_first_diabetes_any))) %>% - select(dw_ek_borger, datotid_first_diabetes_any) # Keep only if no t1d diagnosis before t2d: 601 - -copy_to(con, df_first_diabetes_any, name = in_schema("fct", "psycop_t2d_first_diabetes_any"), overwrite = TRUE, temporary = FALSE) - -dim(df_first_diabetes_any) -``` - -## Find "true" incidences (maximise specificity.). For use when training and evaluating model. Try to exclude anyone that is incident due to other causes. See issue #12 regarding reasoning. 
-```{r} -df_first_t2d_bs_only <- df_first_t2d_blood_sample %>% - rename(datotid_first_t2d_bs = datotid_start) %>% # Add BS - left_join(df_first_date_of_t2d_medication_prescription) %>% - filter(datotid_first_t2d_bs < datotid_first_t2d_medication) %>% - left_join(df_first_t2d_diagnoses_combined) %>% - filter(datotid_first_t2d_bs < datotid_first_t2d_diagnosis) %>% - group_by(dw_ek_borger) %>% - filter(datotid_first_t2d_bs == min(datotid_first_t2d_bs)) %>% # Make sure to only have one record per patient - select(dw_ek_borger, datotid_first_t2d_bs) %>% - distinct(dw_ek_borger, datotid_first_t2d_bs) %>% - left_join(df_first_psych_visit) %>% # 3010 - filter(datotid_first_psych_visit < datotid_first_t2d_bs) %>% # Keep only if diabetes is diagnosed after first psych visit: 810 - left_join(rename(df_first_t1d_diagnoses_combined, datotid_first_t1d_diagnosis = datotid_start), by = "dw_ek_borger") %>% - mutate(!(datotid_first_t1d_diagnosis < datotid_first_t2d_bs)) %>% - filter(is.na(datotid_first_t1d_diagnosis) | !(datotid_first_t1d_diagnosis < datotid_first_t2d_bs)) %>% # Keep only if no t1d diagnosis before t2d: 601 - select(dw_ek_borger, datotid_first_t2d_bs) %>% - rename(timestamp = datotid_first_t2d_bs) - -copy_to(con, df_first_t2d_bs_only, name = in_schema("fct", "psycop_t2d_first_diabetes_t2d"), overwrite = TRUE, temporary = FALSE) - -dim(df_first_t2d_bs_only) -``` - diff --git a/application/t2d/outcome_specification - move to t2d-feature-gen-repo/10_descriptive_stats.rmd b/application/t2d/outcome_specification - move to t2d-feature-gen-repo/10_descriptive_stats.rmd deleted file mode 100644 index 2994a54d..00000000 --- a/application/t2d/outcome_specification - move to t2d-feature-gen-repo/10_descriptive_stats.rmd +++ /dev/null @@ -1,139 +0,0 @@ -```{r} -library(pacman) -p_load(ggplot2, ggbeeswarm, tidyverse, here, ggbeeswarm) - -source(here("psycop_r_utils", "import_from_sql.r")) -``` - - -```{r} -df_demographics <- get_fct("FOR_kohorte_demografi_inkl_2021") %>% - 
format_sql_import() %>% - mutate(foedselsdato = ymd(foedselsdato)) -``` - -```{r} -df_first_t2d_processed <- read_csv(here("csv", "df_first_t2d_bs_only.csv")) - -df_first_psych_visit <- read_csv(here("csv", "df_first_psych_visit.csv")) -``` - -# Age at first t2d for patients with "true"" positives in cohort time -```{r} -df_age_at_first_t2d <- df_first_t2d_processed %>% - left_join(df_demographics) %>% - mutate(age_at_first_t2d = time_length(difftime(datotid_first_t2d, foedselsdato), "years")) -``` - -## Raincloud -```{r} -ggplot(df_age_at_first_t2d %>% mutate(group=1), aes(x = age_at_first_t2d, y = group)) + - ggdist::stat_halfeye( - adjust = .5, - width = .6, - .width = 0, - justification = -.3, - point_colour = NA) + - geom_boxplot( - width = .1, - outlier.shape = NA - ) + - geom_quasirandom( - size = 1, - alpha = .3, - position = position_jitter( - seed = 1, width = .05 - ), - groupOnX = FALSE - ) + - coord_cartesian(xlim = c(1.2, NA), clip = "off") + - scale_x_continuous(breaks = seq(5, 100, by = 5)) + - ggtitle("Age at first t2d for patients with 'true' positives in cohort time") -``` - -## Cumulative distribution -```{r} -ggplot(df_age_at_first_t2d, aes(x = age_at_first_t2d)) + - stat_ecdf(geom = "step") + - scale_x_continuous(breaks = seq(5, 100, by = 5)) + - scale_y_continuous(breaks = seq(0, 1, by = 0.05)) + - ggtitle("Age at first t2d for patients with 'true' positives in cohort time") -``` - -```{r} -df_without_children <- df_age_at_first_t2d %>% # 3284 - filter(age_at_first_t2d > 30) %>% # 2883 - filter(age_at_first_t2d < 90) # 2804 -``` - -# Number of potentially true-positives that can generate predictions for increasing ∆t -```{r} -df_all_visits_combined <- read_csv(here("csv", "all_visits_combined.csv")) -``` - -```{r} -df_visits_for_size_of_prediction_window <- df_all_visits_combined %>% - rename(datotid_besoeg = datotid_start) %>% - inner_join(df_first_t2d_processed, by="dw_ek_borger") %>% - select(datotid_besoeg, datotid_first_t2d, dw_ek_borger) 
%>% - mutate(years_from_visit_to_t2d = time_length(difftime(datotid_first_t2d, datotid_besoeg), "years")) %>% - mutate(years_to_end_of_follow_up = time_length(difftime(max(datotid_besoeg), datotid_besoeg), "years")) %>% - filter(years_from_visit_to_t2d > 0) # Drop all visits that are before event %>% -``` - -```{r} -df_size_of_prediction_window_with_selected_cols <- df_visits_for_size_of_prediction_window - -for (i in 1:100) { - colname = paste0("window_", i) - - df_size_of_prediction_window_with_selected_cols <- df_size_of_prediction_window_with_selected_cols %>% - mutate({{colname}} := if_else(((years_from_visit_to_t2di/12)), 1, 0)) -} -``` - -## For each visit -```{r} -df_size_of_prediction_window_summarised <- df_size_of_prediction_window_with_selected_cols %>% - summarise(across(starts_with("window"), mean, .names = "mean_{.col}")) %>% - pivot_longer(cols = starts_with("mean_"), - names_to = "window_size_months", - values_to = "percent_included") %>% - mutate(window_size_months = as.numeric(gsub("mean_window_", "", window_size_months))) -``` - -```{r} -filepath <- here("figures", "window_size_visits.png") - -plot <- ggplot(df_size_of_prediction_window_summarised, aes(x = window_size_months, y = percent_included)) + - geom_point() + - scale_x_continuous(breaks = seq(0, 100, by = 2)) + - ggtitle("Proportion of potentially true-positive visits that are positive as a function of window size") - -ggsave(filepath, plot, dpi = 300, width = 5, height = 3) -``` - -## For each patient -```{r} -df_predict_window_size_patients <- df_size_of_prediction_window_with_selected_cols %>% - group_by(dw_ek_borger) %>% - summarise(across(starts_with("window"), max, .names = "{.col}")) %>% - ungroup() %>% - summarise(across(starts_with("window"), mean, .names = "mean_{.col}")) %>% - pivot_longer(cols = starts_with("mean_"), - names_to = "window_size_months", - values_to = "percent_included") %>% - mutate(window_size_months = as.numeric(gsub("mean_window_", "", window_size_months))) 
-``` - -```{r} -filepath <- here("figures", "window_size_patients.png") - -plot <- ggplot(df_predict_window_size_patients, aes(x = window_size_months, y = percent_included)) + - geom_point() + - scale_x_continuous(breaks = seq(0, 100, by = 2)) + - scale_y_continuous(breaks = seq(0, 1, by = 0.05)) + - ggtitle("Proportion of potentially true-positive patients that are positive as a function of window size") - -ggsave(filepath, plot, dpi = 300, width = 5, height = 3) -``` diff --git a/application/t2d/outcome_specification - move to t2d-feature-gen-repo/functions.r b/application/t2d/outcome_specification - move to t2d-feature-gen-repo/functions.r deleted file mode 100644 index 583f9e58..00000000 --- a/application/t2d/outcome_specification - move to t2d-feature-gen-repo/functions.r +++ /dev/null @@ -1,49 +0,0 @@ -event_start_date <- ymd("2014-01-01") - -str_contains_t2d_diag <- function(str) { - t2d_regex_pattern <- "(:DE1[1-5].*)|(:DE16[0-2].*)|(:DO24.*)|(:DT383A.*)|(:DM142.*)|(:DG590.*)|(:DG632*)|(:DH280.*)|(:DH334.*)|(:DH360.*)|(:DH450.*)|(:DN083.*)" - - if (isTRUE(str_detect(str, t2d_regex_pattern))) { - return(TRUE) - } - - return(FALSE) -} - -keep_only_first_t2d_by_diag <- function(df, date_col_string) { - str_contains_t2d_diag_vecced <- Vectorize(str_contains_t2d_diag) - - df %>% - filter(str_contains_t2d_diag_vecced((diagnosegruppestreng))) %>% - group_by(dw_ek_borger) %>% - filter(date_col_string == min(date_col_string)) %>% - filter(row_number() == 1) %>% - ungroup() -} - -str_contains_t1d_diag <- function(str) { - t1d_regex_pattern <- "(:DE10.*)|(:DO240.*)" - - if (isTRUE(str_detect(str, t1d_regex_pattern))) { - return(TRUE) - } - - return(FALSE) -} - -keep_only_first_t1d_by_diag <- function(df, date_col_string) { - str_contains_t1d_diag_vecced <- Vectorize(str_contains_t1d_diag) - - df %>% - filter(str_contains_t1d_diag_vecced((diagnosegruppestreng))) %>% - group_by(dw_ek_borger) %>% - filter(date_col_string == min(date_col_string)) %>% - filter(row_number() 
== 1) %>% - ungroup() -} - -visit_can_generate_prediction <- function(col1, col2, window_width_years) { - if_else(({{col1}}% - rename_with(tolower) - -df_planned_psych_visits <- df_planned_visits_raw %>% - filter(substr(shakafskode_besoeg, 1, 4) == "6600") %>% - filter(psykambbesoeg == 1) %>% - select(dw_ek_borger, datotid_start) %>% - arrange(datotid_start) - -df_p_samp <- df_planned_psych_visits %>% - filter(dw_ek_borger == 31) %>% - arrange(datotid_start) - -# Iterate over planned visits to only keep those, that are not within 3 months from last prediction -drop_within_3_months_from_prediction <- function(df) { - # Only takes as an input a dataframe that is already sorted by date (!!!) - current_CPR <- 0 - patient_i <- 0 - last_selected_date <- 0 - indeces_to_drop <- c() - - for (i in 1:nrow(df)) { - # print(str_c("Row_CPR, Current CPR: ", df$dw_ek_borger[i], ", ", current_CPR)) - - if (df$dw_ek_borger[i] != current_CPR) { # Handle switching to new person - current_CPR = df$dw_ek_borger[i] - last_selected_date = ymd_hms(df$datotid_start[i]) - - if (patient_i %% 100 == 0 ) { - print(str_c("Processing patient nr. 
", patient_i)) - } - - patient_i <- patient_i + 1 - - next() - } - - if (df$dw_ek_borger[i] == current_CPR) { # Handle comparison of current visit to previous selected date - if (ymd_hms(df$datotid_start[i]) < (as.Date(last_selected_date) + 90)) { - indeces_to_drop <- c(indeces_to_drop, i) - } else { - last_selected_date <- df$datotid_start[i] - } - } - } - - return(df %>% slice(-indeces_to_drop)) -} - -df_planned_with_3m_spacing <- drop_within_3_months_from_prediction(df_planned_psych_visits) - - - -####### -# Age # -####### -df_demo_raw <- dbGetQuery(conn, "SELECT * FROM [fct].FOR_kohorte_demografi") %>% - rename_with(tolower) - -df_demo <- df_demo_raw %>% - select(foedselsdato, dw_ek_borger) %>% - mutate(foedselsdato = ymd(foedselsdato)) - -############### -# First psych # -############### -df_psyk_raw <- dbGetQuery(conn, "SELECT * FROM [fct].FOR_besoeg_fysiske_fremmoeder") %>% - rename_with(tolower) - -df_first_p <- df_psyk_raw %>% - select(dw_ek_borger, datotid_start) %>% - group_by(dw_ek_borger) %>% - arrange(datotid_start, .by_group=TRUE) %>% - filter(row_number() == 1) %>% - rename(datotid_f_psych = datotid_start) - - -############### -# T2D samples # -############### -# Raw -df_hba1c_raw <- dbGetQuery(conn, "SELECT * FROM [fct].FOR_LABKA_NPU27300_HbA1c") %>% - rename_with(tolower) - -df_maybe_t2d <- df_hba1c_raw %>% - select(datotid_godkendtsvar, svar, dw_ek_borger) %>% - mutate(svar = as.numeric(svar)) %>% - filter(svar > 47) %>% - select(-svar) %>% #Remove incidences that are before first psych contact - left_join(df_first_p) %>% - filter(datotid_f_psych < datotid_godkendtsvar) %>% - group_by(dw_ek_borger) %>% - arrange(datotid_godkendtsvar, .by_group = TRUE) %>% # Keep only first row - filter(row_number() == 1) %>% - rename(datotid_maybe_t2d = datotid_godkendtsvar) %>% - mutate(datotid_maybe_t2d = ymd_hms(datotid_maybe_t2d)) - - -df_probably_t2d <- df_hba1c_raw %>% - select(datotid_godkendtsvar, svar, dw_ek_borger) %>% - mutate(svar = as.numeric(svar)) 
%>% - filter(is.na(svar) == FALSE) %>% #Remove incidences that are before first psych contact - left_join(df_first_p) %>% - filter(datotid_f_psych < datotid_godkendtsvar) %>% - group_by(dw_ek_borger) %>% # Check if first HbA1c was normal - arrange(datotid_godkendtsvar, .by_group=TRUE) %>% - mutate(first_hba1c_normal = svar[1] < 48) %>% - filter(svar > 47 & first_hba1c_normal == TRUE) %>% - group_by(dw_ek_borger) %>% - arrange(datotid_godkendtsvar, .by_group = TRUE) %>% - filter(row_number() == 1) %>% # Keep only first match - select(datotid_godkendtsvar) %>% - rename(datotid_probably_t2d = datotid_godkendtsvar) %>% - mutate(datotid_probably_t2d = ymd_hms(datotid_probably_t2d)) - -######## -# Plot # -######## -setwd("E:/Users/adminmanber/Desktop/T2D") - -############## -# Age at T2D # -############## -gen_plot_age_df <- function(df, outcome) { - df_out <- df %>% - left_join(df_demo) %>% - mutate(age_at_t2d = interval(foedselsdato, {{outcome}}) / years(1)) - - return(df_out) -} - -df_plot_probably_t2d <- gen_plot_age_df(df_probably_t2d, datotid_probably_t2d) - -df_plot_maybe_t2d <- gen_plot_age_df(df_maybe_t2d, datotid_maybe_t2d) - -save_histogram <- function(df, x_var, filename) { - gg <- ggplot(df, aes(x={{x_var}})) + - geom_histogram(binwidth=1) + - labs( - title = filename, - x = "Age at incident T2D (years)", - y = "Count" - ) + - scale_x_continuous( - breaks = seq(15, 100, by=5), - limits = c(15, 100) - ) - - ggsave(str_c("figures/", filename, ".png"), width = 20, height = 10, dpi = 100, units = "in") - - gg -} - -save_histogram(df_plot_probably_t2d, age_at_t2d, "age_at_first_t2d_hba1c_after_normal_hba1c") -save_histogram(df_plot_maybe_t2d, age_at_t2d, "age_at_first_t2d_hba1c") - - -################################## -# Time from planned visit to T2D # -################################## -gen_planned_to_event_df <- function(df_event, event_col, df_planned_visits) { - df_out <- df_event %>% - inner_join(df_planned_visits) %>% - rename(visit_start = 
datotid_start) %>% - mutate(years_since_visit = interval(visit_start, {{event_col}}) / years(1)) %>% - filter(years_since_visit > 0.25) - -} - -df_planned_to_probable_t2d <- gen_planned_to_event_df(df_event=df_probably_t2d, - event_col=datotid_probably_t2d, - df_planned_visits=df_planned_psych_visits) - -df_planned_to_maybe_t2d <- gen_planned_to_event_df(df_event=df_maybe_t2d, - event_col=datotid_maybe_t2d, - df_planned_visits=df_planned_psych_visits) - -save_time_from_visit <- function(df, x_var, filename) { - gg <- ggplot(df, aes(x={{x_var}})) + - scale_x_continuous( - breaks = seq(0, 10, by=0.25), - limits = c(0, 10) - ) - - hist <- gg + - geom_histogram( - binwidth = 0.25 - ) - - box <- gg + - geom_boxplot() - - combined <- hist + box + plot_layout(nrow = 2, height = c(2, 1)) - - ggsave(str_c("figures/", filename, "_histogram.png"), width = 20, height = 20, dpi = 100, units = "in") - - combined -} - -save_time_from_visit(df_planned_to_maybe_t2d, years_since_visit, "years_until_maybe_t2d_for_visit_histogram") -save_time_from_visit(df_planned_to_probable_t2d, years_since_visit, "years_until_probable_t2d_for_visit_histogram") diff --git a/application/t2d/outcome_specification - move to t2d-feature-gen-repo/t2d.Rproj b/application/t2d/outcome_specification - move to t2d-feature-gen-repo/t2d.Rproj deleted file mode 100644 index 8e3c2ebc..00000000 --- a/application/t2d/outcome_specification - move to t2d-feature-gen-repo/t2d.Rproj +++ /dev/null @@ -1,13 +0,0 @@ -Version: 1.0 - -RestoreWorkspace: Default -SaveWorkspace: Default -AlwaysSaveHistory: Default - -EnableCodeIndexing: Yes -UseSpacesForTab: Yes -NumSpacesForTab: 2 -Encoding: UTF-8 - -RnwWeave: Sweave -LaTeX: pdfLaTeX diff --git a/application/t2d/outcome_specification - move to t2d-feature-gen-repo/tests/tests.r b/application/t2d/outcome_specification - move to t2d-feature-gen-repo/tests/tests.r deleted file mode 100644 index f6d1b22d..00000000 --- a/application/t2d/outcome_specification - move to 
t2d-feature-gen-repo/tests/tests.r +++ /dev/null @@ -1,49 +0,0 @@ - -library("pacman") - -p_load(testthat, here, xpectr) - -source(here("src", "functions.r")) - -test_df <- tribble( - ~diagnosegruppestreng, ~datotid_lpr3kontaktstart, ~dw_ek_borger, - "A:DE14#+:ALFC3", "2021-06-30 09:00:00.0000000", 1, - "A:DE14#+:ALFC3", "2021-05-30 09:00:00.0000000", 1, - "A:DE14#+:ALFC3", "2021-04-30 09:00:00.0000000", 1 -) - -source(here("src", "functions.r")) -output_df <- keep_only_first_t2d_by_diag(test_df, "datotid_lpr3kontaktstart") - -test_that("Correct diagnosegruppe-matching",{ - # Testing column values - expect_equal( - output_df[["diagnosegruppestreng"]], - "A:DE14#+:ALFC3", - fixed = TRUE) - expect_equal( - output_df[["dw_ek_borger"]], - 1, - tolerance = 1e-4) -}) - -test_window_gen_df <- tribble( - ~years_from_visit_to_t2d, ~years_to_end_of_follow_up, ~dw_ek_borger, - 1, 1, 1, - 1, 2, 2 -) - -output_df_window <- mutate(test_window_gen_df, window_1 = visit_can_generate_prediction_vecced(years_from_visit_to_t2d, years_from_visit_to_t2d, 1)) - -test_window_grouped_df <- tribble( - ~dw_ek_borger, ~window_1, ~window_2, - 1, 1, 0, - 1, 0, 0, - 1, 0, 1, - 2, 0, 0, - 2, 1, 0 -) - -df_out_window_group <- test_window_grouped_df %>% - group_by(dw_ek_borger) %>% - summarise(across(starts_with("window"), max, .names = "{.col}")) \ No newline at end of file diff --git a/application/train_model.py b/application/train_model.py new file mode 100644 index 00000000..f57fa4b9 --- /dev/null +++ b/application/train_model.py @@ -0,0 +1,150 @@ +"""Train a single model and evaluate it.""" +import time +from typing import Any + +import hydra +import numpy as np +import wandb +from omegaconf import DictConfig, OmegaConf +from sklearn.metrics import roc_auc_score +from wasabi import Printer + +from psycop_model_training.data_loader.utils import ( + load_and_filter_train_and_val_from_cfg, +) +from psycop_model_training.model_eval.dataclasses import PipeMetadata +from 
psycop_model_training.model_eval.evaluate_model import run_full_evaluation +from psycop_model_training.preprocessing.post_split.pipeline import ( + create_post_split_pipeline, +) +from psycop_model_training.training.train_and_eval import ( + CONFIG_PATH, + train_and_get_model_eval_df, +) +from psycop_model_training.utils.col_name_inference import get_col_names +from psycop_model_training.utils.config_schemas.conf_utils import ( + convert_omegaconf_to_pydantic_object, +) +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema +from psycop_model_training.utils.utils import ( + PROJECT_ROOT, + create_wandb_folders, + eval_ds_cfg_pipe_to_disk, + flatten_nested_dict, + get_feature_importance_dict, + get_selected_features_dict, +) + + +@hydra.main( + config_path=str(CONFIG_PATH), + config_name="default_config", + version_base="1.2", +) +def main(cfg: DictConfig): + """Main function for training a single model.""" + # Save dictconfig for easier logging + if isinstance(cfg, DictConfig): + # Create flattened dict for logging to wandb + # Wandb doesn't allow configs to be nested, so we + # flatten it. + dict_config_to_log: dict[str, Any] = flatten_nested_dict(OmegaConf.to_container(cfg), sep=".") # type: ignore + else: + # For testing, we can take a FullConfig object instead. Simplifies boilerplate. 
+ dict_config_to_log = cfg.__dict__ + + if not isinstance(cfg, FullConfigSchema): + cfg = convert_omegaconf_to_pydantic_object(cfg) + + msg = Printer(timestamp=True) + + create_wandb_folders() + + run = wandb.init( + project=f"{cfg.project.name}-baseline-model-training", + reinit=True, + config=dict_config_to_log, + mode=cfg.project.wandb.mode, + group=cfg.project.wandb.group, + entity=cfg.project.wandb.entity, + ) + + if run is None: + raise ValueError("Failed to initialise Wandb") + + # Add random delay based on cfg.train.random_delay_per_job to avoid + # each job needing the same resources (GPU, disk, network) at the same time + if cfg.train.random_delay_per_job_seconds: + delay = np.random.randint(0, cfg.train.random_delay_per_job_seconds) + msg.info(f"Delaying job by {delay} seconds to avoid resource competition") + time.sleep(delay) + + dataset = load_and_filter_train_and_val_from_cfg(cfg) + + msg.info("Creating pipeline") + pipe = create_post_split_pipeline(cfg) + + outcome_col_name, train_col_names = get_col_names(cfg, dataset.train) + + msg.info("Training model") + eval_ds = train_and_get_model_eval_df( + cfg=cfg, + train=dataset.train, + val=dataset.val, + pipe=pipe, + outcome_col_name=outcome_col_name, + train_col_names=train_col_names, + n_splits=cfg.train.n_splits, + ) + + pipe_metadata = PipeMetadata() + + if hasattr(pipe["model"], "feature_importances_"): + pipe_metadata.feature_importances = get_feature_importance_dict(pipe=pipe) + if hasattr(pipe["preprocessing"].named_steps, "feature_selection"): + pipe_metadata.selected_features = get_selected_features_dict( + pipe=pipe, + train_col_names=train_col_names, + ) + + # Save model predictions, feature importance, and config to disk + eval_ds_cfg_pipe_to_disk( + eval_dataset=eval_ds, + cfg=cfg, + pipe_metadata=pipe_metadata, + run=run, + ) + + if cfg.project.wandb.mode == "run" or cfg.eval.force: + msg.info("Evaluating model.") + + upload_to_wandb = cfg.project.wandb.mode == "run" + + 
run_full_evaluation( + cfg=cfg, + eval_dataset=eval_ds, + run=run, + pipe_metadata=pipe_metadata, + save_dir=PROJECT_ROOT / "wandb" / "plots" / run.name, + upload_to_wandb=upload_to_wandb, + ) + + roc_auc = roc_auc_score( + eval_ds.y, + eval_ds.y_hat_probs, + ) + + msg.info(f"ROC AUC: {roc_auc}") + run.log( + { + "roc_auc_unweighted": roc_auc, + "lookbehind": max(cfg.preprocessing.pre_split.lookbehind_combination), + "lookahead": cfg.preprocessing.pre_split.min_lookahead_days, + }, + ) + run.finish() + return roc_auc + + +if __name__ == "__main__": + main() # pylint: disable=no-value-for-parameter diff --git a/pyproject.toml b/pyproject.toml index c6edf27c..8114690b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ pyarrow = ">=9.0.0,<10.1.0" Random-Word = ">=1.0.11, <=1.0.12" torch = ">=1.12.1,<1.13.2" diskcache = "^5.4.0" +python-Levenshtein = ">=0.10.2,<0.20.9" [tool.poetry.dev-dependencies] @@ -72,7 +73,7 @@ ignore_missing_imports = true [tool.pylint] load-plugins = "pylint.extensions.docparams,pylint.extensions.code_style,pylint.extensions.for_any_all,pylint.extensions.typing" good-names = "df,p,f,d,e,n,k,i,v,y_,X,y" -disable = "too-many-lines,line-too-long,missing-raises-doc,no-self-argument,unused-wildcard-import,wildcard-import,no-else-return,too-many-arguments,redefined-outer-name,c-extension-no-member,wrong-import-order,import-outside-toplevel,unused-import" +disable = "too-many-lines,line-too-long,missing-raises-doc,no-self-argument,unused-wildcard-import,wildcard-import,no-else-return,too-many-arguments,redefined-outer-name,c-extension-no-member,wrong-import-order,import-outside-toplevel,unused-import,wrong-import-position,deprecated-typing-alias,missing-module-docstring" extension-pkg-allow-list = "wandb,pydantic" [tool.pylint.'MESSAGES CONTROL'] diff --git a/src/application/t2d/inspect_dataset.py b/src/application/t2d/inspect_dataset.py deleted file mode 100644 index 53c21401..00000000 --- a/src/application/t2d/inspect_dataset.py +++ 
/dev/null @@ -1,17 +0,0 @@ -"""Example of how to inspect a dataset using the configs.""" -from psycop_model_training.config.schemas import load_cfg_as_pydantic -from psycop_model_training.load import load_train_from_cfg, load_train_raw - - -def main(): - """Main.""" - config_file_name = "default_config.yaml" - - cfg = load_cfg_as_pydantic(config_file_name=config_file_name) - df = load_train_raw(cfg=cfg) # noqa pylint: disable=unused-variable - - df_filtered = load_train_from_cfg(cfg=cfg) # noqa pylint: disable=unused-variable - - -if __name__ == "__main__": - main() diff --git a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/10_medication.rmd b/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/10_medication.rmd deleted file mode 100644 index 88ef6c59..00000000 --- a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/10_medication.rmd +++ /dev/null @@ -1,58 +0,0 @@ -Find first occurrence of hospital prescription or hospital redemption of diabetic medication. 
- -```{r} -library("pacman") - -p_load(tidyverse, here, future) -source(here("psycop-r-utilities", "import_from_sql.r")) -source(here("functions.r")) - -``` - -```{r} -con <- DBI::dbConnect( - odbc::odbc(), - Driver = "SQL Server", - Server = "BI-DPA-PROD", - database = "USR_PS_Forsk", - Trusted_Connection = "TRUE" -) -``` - -# Get first date of prescribed antidiabetic medication for each patient -## From only that administered -```{r} -df_first_administered_t2d_medication <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_Medicin_administreret_inkl_2021")) %>% - select(dw_ek_borger, datotid_ordination_start, atc) %>% - filter(substr(atc, 1, 3) == "A10") %>% # A10 is all antidiabetic medication - group_by(dw_ek_borger) %>% - filter(datotid_ordination_start == min(datotid_ordination_start)) %>% - collect %>% - format_sql_import() %>% - distinct(dw_ek_borger, datotid_ordination_start) -``` - -## From only that prescribed -```{r} -df_first_prescribed_t2d_medication <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_Medicin_ordineret_inkl_2021")) %>% - filter(substr(atc, 1, 3) == "A10") %>% - select(dw_ek_borger, datotid_ordinationstart, atc) %>% - rename(datotid_ordination_start = datotid_ordinationstart) %>% - group_by(dw_ek_borger) %>% - filter(datotid_ordination_start == min(datotid_ordination_start)) %>% - collect %>% - format_sql_import() %>% - distinct(dw_ek_borger, datotid_ordination_start) -``` - -## Combined -```{r} -df_first_date_of_t2d_medication_prescription <- df_first_administered_t2d_medication %>% - bind_rows(df_first_prescribed_t2d_medication) %>% - group_by(dw_ek_borger) %>% - filter(datotid_ordination_start == min(datotid_ordination_start)) %>% - filter(row_number() == 1) %>% - rename(datotid_first_t2d_medication=datotid_ordination_start) -``` \ No newline at end of file diff --git a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/20_hba1c.rmd b/src/application/t2d/outcome_specification - move to 
t2d-feature-gen-repo/00_generate_dfs/20_hba1c.rmd deleted file mode 100644 index 9e6382b8..00000000 --- a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/20_hba1c.rmd +++ /dev/null @@ -1,33 +0,0 @@ -Find the first date where a patient gets a diabetic hba1c-blood-sample. - -```{r} -library("pacman") - -p_load(tidyverse, here, future) - -``` - -```{r} -con <- DBI::dbConnect( - odbc::odbc(), - Driver = "SQL Server", - Server = "BI-DPA-PROD", - database = "USR_PS_Forsk", - Trusted_Connection = "TRUE" -) -``` - -# Get first date of hba1c above threshold -## From only that administered -```{r} -df_first_t2d_blood_sample <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_LABKA_NPU27300_HbA1c_inkl_2021")) %>% - select(dw_ek_borger, datotid_proevemodtagelse, numerisksvar, analysenavn) %>% - filter(numerisksvar >= 48) %>% - group_by(dw_ek_borger) %>% - filter(datotid_proevemodtagelse == min(datotid_proevemodtagelse)) %>% - rename(datotid_start = datotid_proevemodtagelse) %>% - collect %>% - distinct(dw_ek_borger, datotid_start) %>% - format_sql_import() -``` \ No newline at end of file diff --git a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/30_diagnoses.rmd b/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/30_diagnoses.rmd deleted file mode 100644 index 65e9e32c..00000000 --- a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/30_diagnoses.rmd +++ /dev/null @@ -1,67 +0,0 @@ -Find the first date where a patient gets a t2d-diagnosis in the hospital system. 
- -```{r} -library("pacman") -``` - -```{r} -con <- DBI::dbConnect( - odbc::odbc(), - Driver = "SQL Server", - Server = "BI-DPA-PROD", - database = "USR_PS_Forsk", - Trusted_Connection = "TRUE" -) -``` - -# A-diagnoses -## LPR3 -```{r} -df_lpr3_diagnoses_roughly_selected <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_LPR3kontakter_psyk_somatik_inkl_2021")) %>% - rename(datotid_start = datotid_lpr3kontaktstart) %>% - select(dw_ek_borger, datotid_start, diagnosegruppestreng) %>% - collect %>% - format_sql_import() -``` - -## LPR2 -### Inpatient visits -```{r} -df_lpr2_diagnoses_inpatient_roughly_selected <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_indlaeggelser_psyk_somatik_LPR2_inkl_2021")) %>% - rename(datotid_start = datotid_indlaeggelse) %>% - select(dw_ek_borger, datotid_start, diagnosegruppestreng) %>% - collect %>% - format_sql_import() -``` - -### Outpatient visits -```{r} -df_lpr2_diagnoses_outpatient_roughly_selected <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_besoeg_psyk_somatik_LPR2_inkl_2021")) %>% - select(dw_ek_borger, datotid_start, diagnosegruppestreng) %>% - collect %>% - format_sql_import() -``` - -## Combined -```{r} -df_all_visits_combined <- df_lpr3_diagnoses_roughly_selected %>% - bind_rows(df_lpr2_diagnoses_inpatient_roughly_selected) %>% - bind_rows(df_lpr2_diagnoses_outpatient_roughly_selected) -``` - -### T2D -```{r} -df_first_t2d_diagnoses_combined <- df_all_visits_combined %>% - keep_only_first_t2d_by_diag(date_col_string="datotid_start") %>% - rename(datotid_first_t2d_diagnosis = datotid_start) -``` - -### T1D -```{r} -df_first_t1d_diagnoses_combined <- df_all_visits_combined %>% - keep_only_first_t1d_by_diag(date_col_string="datotid_start") %>% - select(dw_ek_borger, datotid_start) -``` \ No newline at end of file diff --git a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/41_find_first_p.rmd b/src/application/t2d/outcome_specification - move to 
t2d-feature-gen-repo/00_generate_dfs/41_find_first_p.rmd deleted file mode 100644 index 4f6a293a..00000000 --- a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/41_find_first_p.rmd +++ /dev/null @@ -1,52 +0,0 @@ -```{r} -source(here("psycop-r-utilities", "import_from_sql.r")) -p_load(tidyverse) -``` - -# Remove patients with incidence before first psych-contact -## LPR3, both in and outpatient -```{r} -pt_types = c("Ambulant", "Indlagt") - -df_lpr3_preproc <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_LPR3kontakter_psyk_somatik_inkl_2021")) %>% - filter(pt_type %in% pt_types) %>% - filter(substr(shakkode_lpr3kontaktophold, 1, 4) == "6600") %>% # Only psychiatry in RM - select(dw_ek_borger, datotid_lpr3kontaktstart) %>% - rename(datotid_start = datotid_lpr3kontaktstart) %>% - collect %>% - format_sql_import() -``` - -## LPR2 -### LPR2 inpatient -```{r} -df_lpr2_inp_preproc <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_indlaeggelser_psyk_somatik_LPR2_inkl_2021")) %>% - filter(substr(shakkode_kontaktansvarlig, 1, 4) == "6600") %>% # Only psychiatry in RM - rename(datotid_start = datotid_indlaeggelse) %>% - select(dw_ek_borger, datotid_start) %>% - collect %>% - format_sql_import() -``` - -### LPR2 outpatient -```{r} -df_lpr2_outp_preproc <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_besoeg_psyk_somatik_LPR2_inkl_2021")) %>% - filter(substr(shakafskode, 1, 4) == "6600") %>% # Only psychiatry in RM - select(dw_ek_borger, datotid_start) %>% - collect %>% - format_sql_import() -``` - -# Combine all -```{r} -df_first_psych_visit <- df_lpr2_inp_preproc %>% - bind_rows(df_lpr2_outp_preproc) %>% - bind_rows(df_lpr3_preproc) %>% - group_by(dw_ek_borger) %>% - filter(datotid_start == min(datotid_start)) %>% - rename(datotid_first_psych_visit = datotid_start) %>% - select(dw_ek_borger, datotid_first_psych_visit) -``` \ No newline at end of file diff --git a/src/application/t2d/outcome_specification - move to 
t2d-feature-gen-repo/00_generate_dfs/49_combined.rmd b/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/49_combined.rmd deleted file mode 100644 index ebb1976d..00000000 --- a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/49_combined.rmd +++ /dev/null @@ -1,53 +0,0 @@ -Combine medication, hba1c and diagnoses to find first date where the patient has t2d. - -```{r} -source(here("functions.r")) -p_load(odbc, dbplyr, DBI) -``` - -## Find "any" diabetes incidence (maximise sensitivity). For use in wash-in (i.e. exclusion). -```{r} -df_first_diabetes_any <- df_first_t2d_blood_sample %>% - rename(datotid_first_t2d_bs = datotid_start) %>% # Add BS - left_join(df_first_date_of_t2d_medication_prescription) %>% - left_join(df_first_t2d_diagnoses_combined) %>% - mutate(datotid_first_diabetes_any = pmin(datotid_first_t2d_medication, datotid_first_t2d_diagnosis, datotid_first_t2d_bs)) %>% - group_by(dw_ek_borger) %>% - filter(datotid_first_diabetes_any == min(datotid_first_diabetes_any)) %>% # Make sure to only have one record per patient - select(dw_ek_borger, datotid_first_diabetes_any) %>% - distinct(dw_ek_borger, datotid_first_diabetes_any) %>% - left_join(df_first_t1d_diagnoses_combined, by = "dw_ek_borger") %>% - rename(datotid_first_t1d_diagnosis = datotid_start) %>% - mutate(datotid_first_diabetes_any = if_else(is.na(datotid_first_t1d_diagnosis), datotid_first_diabetes_any, min(datotid_first_t1d_diagnosis, datotid_first_diabetes_any))) %>% - select(dw_ek_borger, datotid_first_diabetes_any) # Keep only if no t1d diagnosis before t2d: 601 - -copy_to(con, df_first_diabetes_any, name = in_schema("fct", "psycop_t2d_first_diabetes_any"), overwrite = TRUE, temporary = FALSE) - -dim(df_first_diabetes_any) -``` - -## Find "true" incidences (maximise specificity.). For use when training and evaluating model. Try to exclude anyone that is incident due to other causes. 
See issue #12 regarding reasoning. -```{r} -df_first_t2d_bs_only <- df_first_t2d_blood_sample %>% - rename(datotid_first_t2d_bs = datotid_start) %>% # Add BS - left_join(df_first_date_of_t2d_medication_prescription) %>% - filter(datotid_first_t2d_bs < datotid_first_t2d_medication) %>% - left_join(df_first_t2d_diagnoses_combined) %>% - filter(datotid_first_t2d_bs < datotid_first_t2d_diagnosis) %>% - group_by(dw_ek_borger) %>% - filter(datotid_first_t2d_bs == min(datotid_first_t2d_bs)) %>% # Make sure to only have one record per patient - select(dw_ek_borger, datotid_first_t2d_bs) %>% - distinct(dw_ek_borger, datotid_first_t2d_bs) %>% - left_join(df_first_psych_visit) %>% # 3010 - filter(datotid_first_psych_visit < datotid_first_t2d_bs) %>% # Keep only if diabetes is diagnosed after first psych visit: 810 - left_join(rename(df_first_t1d_diagnoses_combined, datotid_first_t1d_diagnosis = datotid_start), by = "dw_ek_borger") %>% - mutate(!(datotid_first_t1d_diagnosis < datotid_first_t2d_bs)) %>% - filter(is.na(datotid_first_t1d_diagnosis) | !(datotid_first_t1d_diagnosis < datotid_first_t2d_bs)) %>% # Keep only if no t1d diagnosis before t2d: 601 - select(dw_ek_borger, datotid_first_t2d_bs) %>% - rename(timestamp = datotid_first_t2d_bs) - -copy_to(con, df_first_t2d_bs_only, name = in_schema("fct", "psycop_t2d_first_diabetes_t2d"), overwrite = TRUE, temporary = FALSE) - -dim(df_first_t2d_bs_only) -``` - diff --git a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/10_descriptive_stats.rmd b/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/10_descriptive_stats.rmd deleted file mode 100644 index 2994a54d..00000000 --- a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/10_descriptive_stats.rmd +++ /dev/null @@ -1,139 +0,0 @@ -```{r} -library(pacman) -p_load(ggplot2, ggbeeswarm, tidyverse, here, ggbeeswarm) - -source(here("psycop_r_utils", "import_from_sql.r")) -``` - - -```{r} -df_demographics <- 
get_fct("FOR_kohorte_demografi_inkl_2021") %>% - format_sql_import() %>% - mutate(foedselsdato = ymd(foedselsdato)) -``` - -```{r} -df_first_t2d_processed <- read_csv(here("csv", "df_first_t2d_bs_only.csv")) - -df_first_psych_visit <- read_csv(here("csv", "df_first_psych_visit.csv")) -``` - -# Age at first t2d for patients with "true"" positives in cohort time -```{r} -df_age_at_first_t2d <- df_first_t2d_processed %>% - left_join(df_demographics) %>% - mutate(age_at_first_t2d = time_length(difftime(datotid_first_t2d, foedselsdato), "years")) -``` - -## Raincloud -```{r} -ggplot(df_age_at_first_t2d %>% mutate(group=1), aes(x = age_at_first_t2d, y = group)) + - ggdist::stat_halfeye( - adjust = .5, - width = .6, - .width = 0, - justification = -.3, - point_colour = NA) + - geom_boxplot( - width = .1, - outlier.shape = NA - ) + - geom_quasirandom( - size = 1, - alpha = .3, - position = position_jitter( - seed = 1, width = .05 - ), - groupOnX = FALSE - ) + - coord_cartesian(xlim = c(1.2, NA), clip = "off") + - scale_x_continuous(breaks = seq(5, 100, by = 5)) + - ggtitle("Age at first t2d for patients with 'true' positives in cohort time") -``` - -## Cumulative distribution -```{r} -ggplot(df_age_at_first_t2d, aes(x = age_at_first_t2d)) + - stat_ecdf(geom = "step") + - scale_x_continuous(breaks = seq(5, 100, by = 5)) + - scale_y_continuous(breaks = seq(0, 1, by = 0.05)) + - ggtitle("Age at first t2d for patients with 'true' positives in cohort time") -``` - -```{r} -df_without_children <- df_age_at_first_t2d %>% # 3284 - filter(age_at_first_t2d > 30) %>% # 2883 - filter(age_at_first_t2d < 90) # 2804 -``` - -# Number of potentially true-positives that can generate predictions for increasing ∆t -```{r} -df_all_visits_combined <- read_csv(here("csv", "all_visits_combined.csv")) -``` - -```{r} -df_visits_for_size_of_prediction_window <- df_all_visits_combined %>% - rename(datotid_besoeg = datotid_start) %>% - inner_join(df_first_t2d_processed, by="dw_ek_borger") %>% - 
select(datotid_besoeg, datotid_first_t2d, dw_ek_borger) %>% - mutate(years_from_visit_to_t2d = time_length(difftime(datotid_first_t2d, datotid_besoeg), "years")) %>% - mutate(years_to_end_of_follow_up = time_length(difftime(max(datotid_besoeg), datotid_besoeg), "years")) %>% - filter(years_from_visit_to_t2d > 0) # Drop all visits that are before event %>% -``` - -```{r} -df_size_of_prediction_window_with_selected_cols <- df_visits_for_size_of_prediction_window - -for (i in 1:100) { - colname = paste0("window_", i) - - df_size_of_prediction_window_with_selected_cols <- df_size_of_prediction_window_with_selected_cols %>% - mutate({{colname}} := if_else(((years_from_visit_to_t2di/12)), 1, 0)) -} -``` - -## For each visit -```{r} -df_size_of_prediction_window_summarised <- df_size_of_prediction_window_with_selected_cols %>% - summarise(across(starts_with("window"), mean, .names = "mean_{.col}")) %>% - pivot_longer(cols = starts_with("mean_"), - names_to = "window_size_months", - values_to = "percent_included") %>% - mutate(window_size_months = as.numeric(gsub("mean_window_", "", window_size_months))) -``` - -```{r} -filepath <- here("figures", "window_size_visits.png") - -plot <- ggplot(df_size_of_prediction_window_summarised, aes(x = window_size_months, y = percent_included)) + - geom_point() + - scale_x_continuous(breaks = seq(0, 100, by = 2)) + - ggtitle("Proportion of potentially true-positive visits that are positive as a function of window size") - -ggsave(filepath, plot, dpi = 300, width = 5, height = 3) -``` - -## For each patient -```{r} -df_predict_window_size_patients <- df_size_of_prediction_window_with_selected_cols %>% - group_by(dw_ek_borger) %>% - summarise(across(starts_with("window"), max, .names = "{.col}")) %>% - ungroup() %>% - summarise(across(starts_with("window"), mean, .names = "mean_{.col}")) %>% - pivot_longer(cols = starts_with("mean_"), - names_to = "window_size_months", - values_to = "percent_included") %>% - mutate(window_size_months = 
as.numeric(gsub("mean_window_", "", window_size_months))) -``` - -```{r} -filepath <- here("figures", "window_size_patients.png") - -plot <- ggplot(df_predict_window_size_patients, aes(x = window_size_months, y = percent_included)) + - geom_point() + - scale_x_continuous(breaks = seq(0, 100, by = 2)) + - scale_y_continuous(breaks = seq(0, 1, by = 0.05)) + - ggtitle("Proportion of potentially true-positive patients that are positive as a function of window size") - -ggsave(filepath, plot, dpi = 300, width = 5, height = 3) -``` diff --git a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/functions.r b/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/functions.r deleted file mode 100644 index 583f9e58..00000000 --- a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/functions.r +++ /dev/null @@ -1,49 +0,0 @@ -event_start_date <- ymd("2014-01-01") - -str_contains_t2d_diag <- function(str) { - t2d_regex_pattern <- "(:DE1[1-5].*)|(:DE16[0-2].*)|(:DO24.*)|(:DT383A.*)|(:DM142.*)|(:DG590.*)|(:DG632*)|(:DH280.*)|(:DH334.*)|(:DH360.*)|(:DH450.*)|(:DN083.*)" - - if (isTRUE(str_detect(str, t2d_regex_pattern))) { - return(TRUE) - } - - return(FALSE) -} - -keep_only_first_t2d_by_diag <- function(df, date_col_string) { - str_contains_t2d_diag_vecced <- Vectorize(str_contains_t2d_diag) - - df %>% - filter(str_contains_t2d_diag_vecced((diagnosegruppestreng))) %>% - group_by(dw_ek_borger) %>% - filter(date_col_string == min(date_col_string)) %>% - filter(row_number() == 1) %>% - ungroup() -} - -str_contains_t1d_diag <- function(str) { - t1d_regex_pattern <- "(:DE10.*)|(:DO240.*)" - - if (isTRUE(str_detect(str, t1d_regex_pattern))) { - return(TRUE) - } - - return(FALSE) -} - -keep_only_first_t1d_by_diag <- function(df, date_col_string) { - str_contains_t1d_diag_vecced <- Vectorize(str_contains_t1d_diag) - - df %>% - filter(str_contains_t1d_diag_vecced((diagnosegruppestreng))) %>% - group_by(dw_ek_borger) %>% - 
filter(date_col_string == min(date_col_string)) %>% - filter(row_number() == 1) %>% - ungroup() -} - -visit_can_generate_prediction <- function(col1, col2, window_width_years) { - if_else(({{col1}}% - rename_with(tolower) - -df_planned_psych_visits <- df_planned_visits_raw %>% - filter(substr(shakafskode_besoeg, 1, 4) == "6600") %>% - filter(psykambbesoeg == 1) %>% - select(dw_ek_borger, datotid_start) %>% - arrange(datotid_start) - -df_p_samp <- df_planned_psych_visits %>% - filter(dw_ek_borger == 31) %>% - arrange(datotid_start) - -# Iterate over planned visits to only keep those, that are not within 3 months from last prediction -drop_within_3_months_from_prediction <- function(df) { - # Only takes as an input a dataframe that is already sorted by date (!!!) - current_CPR <- 0 - patient_i <- 0 - last_selected_date <- 0 - indeces_to_drop <- c() - - for (i in 1:nrow(df)) { - # print(str_c("Row_CPR, Current CPR: ", df$dw_ek_borger[i], ", ", current_CPR)) - - if (df$dw_ek_borger[i] != current_CPR) { # Handle switching to new person - current_CPR = df$dw_ek_borger[i] - last_selected_date = ymd_hms(df$datotid_start[i]) - - if (patient_i %% 100 == 0 ) { - print(str_c("Processing patient nr. 
", patient_i)) - } - - patient_i <- patient_i + 1 - - next() - } - - if (df$dw_ek_borger[i] == current_CPR) { # Handle comparison of current visit to previous selected date - if (ymd_hms(df$datotid_start[i]) < (as.Date(last_selected_date) + 90)) { - indeces_to_drop <- c(indeces_to_drop, i) - } else { - last_selected_date <- df$datotid_start[i] - } - } - } - - return(df %>% slice(-indeces_to_drop)) -} - -df_planned_with_3m_spacing <- drop_within_3_months_from_prediction(df_planned_psych_visits) - - - -####### -# Age # -####### -df_demo_raw <- dbGetQuery(conn, "SELECT * FROM [fct].FOR_kohorte_demografi") %>% - rename_with(tolower) - -df_demo <- df_demo_raw %>% - select(foedselsdato, dw_ek_borger) %>% - mutate(foedselsdato = ymd(foedselsdato)) - -############### -# First psych # -############### -df_psyk_raw <- dbGetQuery(conn, "SELECT * FROM [fct].FOR_besoeg_fysiske_fremmoeder") %>% - rename_with(tolower) - -df_first_p <- df_psyk_raw %>% - select(dw_ek_borger, datotid_start) %>% - group_by(dw_ek_borger) %>% - arrange(datotid_start, .by_group=TRUE) %>% - filter(row_number() == 1) %>% - rename(datotid_f_psych = datotid_start) - - -############### -# T2D samples # -############### -# Raw -df_hba1c_raw <- dbGetQuery(conn, "SELECT * FROM [fct].FOR_LABKA_NPU27300_HbA1c") %>% - rename_with(tolower) - -df_maybe_t2d <- df_hba1c_raw %>% - select(datotid_godkendtsvar, svar, dw_ek_borger) %>% - mutate(svar = as.numeric(svar)) %>% - filter(svar > 47) %>% - select(-svar) %>% #Remove incidences that are before first psych contact - left_join(df_first_p) %>% - filter(datotid_f_psych < datotid_godkendtsvar) %>% - group_by(dw_ek_borger) %>% - arrange(datotid_godkendtsvar, .by_group = TRUE) %>% # Keep only first row - filter(row_number() == 1) %>% - rename(datotid_maybe_t2d = datotid_godkendtsvar) %>% - mutate(datotid_maybe_t2d = ymd_hms(datotid_maybe_t2d)) - - -df_probably_t2d <- df_hba1c_raw %>% - select(datotid_godkendtsvar, svar, dw_ek_borger) %>% - mutate(svar = as.numeric(svar)) 
%>% - filter(is.na(svar) == FALSE) %>% #Remove incidences that are before first psych contact - left_join(df_first_p) %>% - filter(datotid_f_psych < datotid_godkendtsvar) %>% - group_by(dw_ek_borger) %>% # Check if first HbA1c was normal - arrange(datotid_godkendtsvar, .by_group=TRUE) %>% - mutate(first_hba1c_normal = svar[1] < 48) %>% - filter(svar > 47 & first_hba1c_normal == TRUE) %>% - group_by(dw_ek_borger) %>% - arrange(datotid_godkendtsvar, .by_group = TRUE) %>% - filter(row_number() == 1) %>% # Keep only first match - select(datotid_godkendtsvar) %>% - rename(datotid_probably_t2d = datotid_godkendtsvar) %>% - mutate(datotid_probably_t2d = ymd_hms(datotid_probably_t2d)) - -######## -# Plot # -######## -setwd("E:/Users/adminmanber/Desktop/T2D") - -############## -# Age at T2D # -############## -gen_plot_age_df <- function(df, outcome) { - df_out <- df %>% - left_join(df_demo) %>% - mutate(age_at_t2d = interval(foedselsdato, {{outcome}}) / years(1)) - - return(df_out) -} - -df_plot_probably_t2d <- gen_plot_age_df(df_probably_t2d, datotid_probably_t2d) - -df_plot_maybe_t2d <- gen_plot_age_df(df_maybe_t2d, datotid_maybe_t2d) - -save_histogram <- function(df, x_var, filename) { - gg <- ggplot(df, aes(x={{x_var}})) + - geom_histogram(binwidth=1) + - labs( - title = filename, - x = "Age at incident T2D (years)", - y = "Count" - ) + - scale_x_continuous( - breaks = seq(15, 100, by=5), - limits = c(15, 100) - ) - - ggsave(str_c("figures/", filename, ".png"), width = 20, height = 10, dpi = 100, units = "in") - - gg -} - -save_histogram(df_plot_probably_t2d, age_at_t2d, "age_at_first_t2d_hba1c_after_normal_hba1c") -save_histogram(df_plot_maybe_t2d, age_at_t2d, "age_at_first_t2d_hba1c") - - -################################## -# Time from planned visit to T2D # -################################## -gen_planned_to_event_df <- function(df_event, event_col, df_planned_visits) { - df_out <- df_event %>% - inner_join(df_planned_visits) %>% - rename(visit_start = 
datotid_start) %>% - mutate(years_since_visit = interval(visit_start, {{event_col}}) / years(1)) %>% - filter(years_since_visit > 0.25) - -} - -df_planned_to_probable_t2d <- gen_planned_to_event_df(df_event=df_probably_t2d, - event_col=datotid_probably_t2d, - df_planned_visits=df_planned_psych_visits) - -df_planned_to_maybe_t2d <- gen_planned_to_event_df(df_event=df_maybe_t2d, - event_col=datotid_maybe_t2d, - df_planned_visits=df_planned_psych_visits) - -save_time_from_visit <- function(df, x_var, filename) { - gg <- ggplot(df, aes(x={{x_var}})) + - scale_x_continuous( - breaks = seq(0, 10, by=0.25), - limits = c(0, 10) - ) - - hist <- gg + - geom_histogram( - binwidth = 0.25 - ) - - box <- gg + - geom_boxplot() - - combined <- hist + box + plot_layout(nrow = 2, height = c(2, 1)) - - ggsave(str_c("figures/", filename, "_histogram.png"), width = 20, height = 20, dpi = 100, units = "in") - - combined -} - -save_time_from_visit(df_planned_to_maybe_t2d, years_since_visit, "years_until_maybe_t2d_for_visit_histogram") -save_time_from_visit(df_planned_to_probable_t2d, years_since_visit, "years_until_probable_t2d_for_visit_histogram") diff --git a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/t2d.Rproj b/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/t2d.Rproj deleted file mode 100644 index 8e3c2ebc..00000000 --- a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/t2d.Rproj +++ /dev/null @@ -1,13 +0,0 @@ -Version: 1.0 - -RestoreWorkspace: Default -SaveWorkspace: Default -AlwaysSaveHistory: Default - -EnableCodeIndexing: Yes -UseSpacesForTab: Yes -NumSpacesForTab: 2 -Encoding: UTF-8 - -RnwWeave: Sweave -LaTeX: pdfLaTeX diff --git a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/tests/tests.r b/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/tests/tests.r deleted file mode 100644 index f6d1b22d..00000000 --- 
a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/tests/tests.r +++ /dev/null @@ -1,49 +0,0 @@ - -library("pacman") - -p_load(testthat, here, xpectr) - -source(here("src", "functions.r")) - -test_df <- tribble( - ~diagnosegruppestreng, ~datotid_lpr3kontaktstart, ~dw_ek_borger, - "A:DE14#+:ALFC3", "2021-06-30 09:00:00.0000000", 1, - "A:DE14#+:ALFC3", "2021-05-30 09:00:00.0000000", 1, - "A:DE14#+:ALFC3", "2021-04-30 09:00:00.0000000", 1 -) - -source(here("src", "functions.r")) -output_df <- keep_only_first_t2d_by_diag(test_df, "datotid_lpr3kontaktstart") - -test_that("Correct diagnosegruppe-matching",{ - # Testing column values - expect_equal( - output_df[["diagnosegruppestreng"]], - "A:DE14#+:ALFC3", - fixed = TRUE) - expect_equal( - output_df[["dw_ek_borger"]], - 1, - tolerance = 1e-4) -}) - -test_window_gen_df <- tribble( - ~years_from_visit_to_t2d, ~years_to_end_of_follow_up, ~dw_ek_borger, - 1, 1, 1, - 1, 2, 2 -) - -output_df_window <- mutate(test_window_gen_df, window_1 = visit_can_generate_prediction_vecced(years_from_visit_to_t2d, years_from_visit_to_t2d, 1)) - -test_window_grouped_df <- tribble( - ~dw_ek_borger, ~window_1, ~window_2, - 1, 1, 0, - 1, 0, 0, - 1, 0, 1, - 2, 0, 0, - 2, 1, 0 -) - -df_out_window_group <- test_window_grouped_df %>% - group_by(dw_ek_borger) %>% - summarise(across(starts_with("window"), max, .names = "{.col}")) \ No newline at end of file diff --git a/src/application/t2d/train_and_log_models.py b/src/application/t2d/train_and_log_models.py deleted file mode 100644 index 23ded385..00000000 --- a/src/application/t2d/train_and_log_models.py +++ /dev/null @@ -1,238 +0,0 @@ -"""Example script to train multiple models and subsequently log the results to -wandb. 
- -Usage: -- Replace the HYDRA_ARGS string with the desired arguments for `train_model.py` -- Run this script from project root with `python src/psycop_model_training/train_and_log_models.py -""" -import random -import subprocess -import time -from typing import Optional - -import pandas as pd -import wandb -from random_word import RandomWords -from wasabi import Printer - -from psycop_model_training.config.schemas import ( - BaseModel, - FullConfigSchema, - load_cfg_as_pydantic, -) -from psycop_model_training.load import load_train_raw -from psycop_model_training.model_eval.evaluate_model import ( - infer_look_distance, - infer_outcome_col_name, -) - - -def start_trainer( - cfg: FullConfigSchema, - config_file_name: str, - lookahead_days: int, - wandb_group_override: str, - model_name: str, -) -> subprocess.Popen: - """Start a trainer.""" - msg = Printer(timestamp=True) - - subprocess_args: list[str] = [ - "python", - "src/psycop_model_training/train_model.py", - f"project.wandb.group='{wandb_group_override}'", - f"project.wandb.mode={cfg.project.wandb.mode}", - f"hydra.sweeper.n_trials={cfg.train.n_trials_per_lookahead}", - f"hydra.sweeper.n_jobs={cfg.train.n_jobs_per_trainer}", - f"model={model_name}", - f"data.min_lookahead_days={lookahead_days}", - "--config-name", - f"{config_file_name}", - ] - - if cfg.train.n_trials_per_lookahead > 1: - subprocess_args.insert(2, "--multirun") - - if model_name == "xgboost": - subprocess_args.insert(3, "++model.args.tree_method='gpu_hist'") - - msg.info(f'{" ".join(subprocess_args)}') - - return subprocess.Popen( # pylint: disable=consider-using-with - args=subprocess_args, - ) - - -class TrainerSpec(BaseModel): - """Specification for starting a trainer. - - Provides overrides for the config file. 
- """ - - lookahead_days: int - model_name: str - - -def combine_lookaheads_and_model_names_to_trainer_specs( - cfg: FullConfigSchema, - possible_lookahead_days: list[int], - model_names: Optional[list[str]] = None, -): - """Generate trainer specs for all combinations of lookaheads and model - names.""" - msg = Printer(timestamp=True) - - random.shuffle(possible_lookahead_days) - - if model_names: - msg.warn( - "model_names was specified in train_models_for_each_cell_in_grid, overriding cfg.model.name", - ) - - model_name_queue = model_names if model_names else cfg.model.name - - # Create all combinations of lookahead_days and models - trainer_combinations_queue = [ - TrainerSpec(lookahead_days=lookahead_days, model_name=model_name) - for lookahead_days in possible_lookahead_days.copy() - for model_name in model_name_queue - ] - - return trainer_combinations_queue - - -def train_models_for_each_cell_in_grid( - cfg: FullConfigSchema, - possible_lookahead_days: list[int], - config_file_name: str, - wandb_prefix: str, - model_names: Optional[list[str]] = None, -): - """Train a model for each cell in the grid of possible look distances.""" - active_trainers: list[subprocess.Popen] = [] - - trainer_combinations_queue = combine_lookaheads_and_model_names_to_trainer_specs( - cfg=cfg, - possible_lookahead_days=possible_lookahead_days, - model_names=model_names, - ) - - while trainer_combinations_queue or active_trainers: - # Wait until there is a free slot in the trainers group - if ( - len(active_trainers) >= cfg.train.n_active_trainers - or len(trainer_combinations_queue) == 0 - ): - # Drop trainers if they have finished - # If finished, t.poll() is not None - active_trainers = [t for t in active_trainers if t.poll() is None] - time.sleep(1) - continue - - # Start a new trainer - trainer_spec = trainer_combinations_queue.pop() - - msg = Printer(timestamp=True) - msg.info( - f"Spawning a new trainer with lookahead={trainer_spec.lookahead_days} days", - ) - wandb_group = 
f"{wandb_prefix}" - - active_trainers.append( - start_trainer( - cfg=cfg, - config_file_name=config_file_name, - lookahead_days=trainer_spec.lookahead_days, - wandb_group_override=wandb_group, - model_name=trainer_spec.model_name, - ), - ) - - -def get_possible_lookaheads( - msg: Printer, - cfg: FullConfigSchema, - train_df: pd.DataFrame, -) -> list[int]: - """Some look_ahead and look_behind distances will result in 0 valid - prediction times. Only return combinations which will allow some prediction - times. - - E.g. if we only have 4 years of data: - - min_lookahead = 2 years - - min_lookbehind = 3 years - - Will mean that no rows satisfy the criteria. - """ - - outcome_col_names = infer_outcome_col_name(df=train_df, allow_multiple=True) - - possible_lookahead_days: list[int] = [ - int(dist) for dist in infer_look_distance(col_name=outcome_col_names) - ] - - # Don't try look distance combinations which will result in 0 rows - max_distance_in_dataset_days = ( - max(train_df[cfg.data.col_name.pred_timestamp]) - - min( - train_df[cfg.data.col_name.pred_timestamp], - ) - ).days - - lookaheads_without_rows: list[int] = [ - dist for dist in possible_lookahead_days if dist > max_distance_in_dataset_days - ] - - if lookaheads_without_rows: - msg.info( - f"Not fitting model to {lookaheads_without_rows}, since no rows satisfy the criteria.", - ) - - return list(set(possible_lookahead_days) - set(lookaheads_without_rows)) - - -def main(): - """Main.""" - msg = Printer(timestamp=True) - - debug = False - - if debug: - config_file_name = "integration_config.yaml" - else: - config_file_name = "default_config.yaml" - - cfg = load_cfg_as_pydantic(config_file_name=config_file_name) - - random_word = RandomWords() - wandb_group = f"{random_word.get_random_word()}-{random_word.get_random_word()}" - - wandb.init( - project=cfg.project.name, - mode=cfg.project.wandb.mode, - group=wandb_group, - entity=cfg.project.wandb.entity, - name="process_manager", - ) - - # Load dataset without 
dropping any rows for inferring - # which look distances to grid search over - train = load_train_raw(cfg=cfg) - - possible_lookaheads = get_possible_lookaheads( - msg=msg, - cfg=cfg, - train_df=train, - ) - - train_models_for_each_cell_in_grid( - cfg=cfg, - possible_lookahead_days=possible_lookaheads, - config_file_name=config_file_name, - wandb_prefix=wandb_group, - model_names=["xgboost", "logistic-regression"], - ) - - -if __name__ == "__main__": - main() diff --git a/src/psycop_model_training/archive/main.py b/src/psycop_model_training/archive/main.py index e92f7690..0ac2cf23 100644 --- a/src/psycop_model_training/archive/main.py +++ b/src/psycop_model_training/archive/main.py @@ -37,7 +37,7 @@ if __name__ == "__main__": msg = Printer(timestamp=True) - with initialize(version_base=None, config_path="../config/"): + with initialize(version_base=None, config_path="../../../application/config/"): cfg = compose( config_name=CONFIG_NAME, ) diff --git a/src/psycop_model_training/archive/model_training_watcher.py b/src/psycop_model_training/archive/model_training_watcher.py index 2ac933fe..1c1f164e 100644 --- a/src/psycop_model_training/archive/model_training_watcher.py +++ b/src/psycop_model_training/archive/model_training_watcher.py @@ -13,9 +13,9 @@ from wandb.sdk.wandb_run import Run # pylint: disable=no-name-in-module from wasabi import msg -from psycop_model_training.config.schemas import FullConfigSchema from psycop_model_training.model_eval.dataclasses import ModelEvalData from psycop_model_training.model_eval.evaluate_model import run_full_evaluation +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema from psycop_model_training.utils.utils import ( MODEL_PREDICTIONS_PATH, PROJECT_ROOT, diff --git a/src/psycop_model_training/config/data/t2d_parquet.yaml b/src/psycop_model_training/config/data/t2d_parquet.yaml deleted file mode 100644 index 8eac4d70..00000000 --- a/src/psycop_model_training/config/data/t2d_parquet.yaml +++ 
/dev/null @@ -1,37 +0,0 @@ -# @package _global_ -data: - # General config - n_training_samples: null - dir: E:\shared_resources\feature_sets\t2d\feature_sets\psycop_t2d_adminmanber_1090_features_2022_11_18_14_10 - suffix: parquet - min_age: 18 - - # Patient exclusion criteria - drop_patient_if_exclusion_before_date: 2013-01-01 - - # Prediction time exclusion criteria - min_prediction_time_date: 2013-01-01 - min_lookahead_days: 1825 - - # Feature specs - pred_prefix: pred_ - outc_prefix: outc_ - - col_name: - pred_timestamp: timestamp - outcome_timestamp: _t2d - id: dw_ek_borger - age: pred_age_in_years - exclusion_timestamp: _timestamp_exclusion - custom: - n_hba1c: eval_hba1c_within_9999_days_count_fallback_nan - - lookbehind_combination: [30, 90, 180, 365, 730] - - - -# Parameters that will only take effect if running with --multirun -hydra: - sweeper: - params: - ++data.lookbehind_combination: choice([30, 90, 180, 365, 730], [30, 180, 730], [730], [365], [90], [30]) diff --git a/src/psycop_model_training/config/preprocessing/default_preprocessing.yaml b/src/psycop_model_training/config/preprocessing/default_preprocessing.yaml deleted file mode 100644 index ad95e66e..00000000 --- a/src/psycop_model_training/config/preprocessing/default_preprocessing.yaml +++ /dev/null @@ -1,20 +0,0 @@ -# @package _global_ -preprocessing: - convert_to_boolean: false - convert_booleans_to_int: true - drop_datetime_predictor_columns: true - convert_datetimes_to_ordinal: false - imputation_method: most_frequent - scaling: z-score-normalisation - feature_selection: - name: chi2 - params: - percentile: 20 # (int): Percent of features to keep. Defaults to 10. 
- -hydra: - sweeper: - params: - ++preprocessing.imputation_method: choice("most_frequent", "mean", "median", "null") - ++preprocessing.scaling: choice("z-score-normalization", "null") - ++preprocessing.feature_selection.name: choice("chi2", "null") - ++preprocessing.feature_selection.params.percentile: int(tag(log, interval(1, 90))) diff --git a/src/psycop_model_training/config/project/default_project.yaml b/src/psycop_model_training/config/project/default_project.yaml deleted file mode 100644 index cb8d6eb6..00000000 --- a/src/psycop_model_training/config/project/default_project.yaml +++ /dev/null @@ -1,15 +0,0 @@ -name: psycop-t2d -seed: 42 - -wandb: - entity: psycop # Which entity to run WanDB in. - mode: run # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" - group: psycop-t2d # Which group to run WanDB in. - -watcher: - archive_all: false - keep_alive_after_training_minutes: 5 - n_runs_before_eval: 1 - verbose: true - -gpu: true diff --git a/src/psycop_model_training/config/project/overtaci_test_project.yaml b/src/psycop_model_training/config/project/overtaci_test_project.yaml deleted file mode 100644 index 716c9943..00000000 --- a/src/psycop_model_training/config/project/overtaci_test_project.yaml +++ /dev/null @@ -1,5 +0,0 @@ -name: psycop-t2d-testing -seed: 42 -wandb_mode: "offline" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" -wandb_group: "psycop-t2d" -wandb_entity: "psycop" # Optional[str] diff --git a/src/psycop_model_training/config/schemas.py b/src/psycop_model_training/config/schemas.py deleted file mode 100644 index f16582eb..00000000 --- a/src/psycop_model_training/config/schemas.py +++ /dev/null @@ -1,282 +0,0 @@ -"""Utilities for converting config yamls to pydantic objects. 
Helpful because -it makes them: - -- Addressable with intellisense, -- Refactorable with IDEs, -- Easier to document with docstrings and -- Type checkable -""" -from datetime import datetime -from pathlib import Path -from typing import Any, Literal, Optional, Union - -from hydra import compose, initialize -from omegaconf import DictConfig, OmegaConf -from pydantic import BaseModel as PydanticBaseModel -from pydantic import Extra - - -class BaseModel(PydanticBaseModel): - """.""" - - class Config: - """An pydantic basemodel, which doesn't allow attributes that are not - defined in the class.""" - - allow_mutation = False - arbitrary_types_allowed = True - extra = Extra.forbid - - def __transform_attributes_with_str_to_object( - self, - output_object: Any, - input_string: str = "str", - ): - for key, value in self.__dict__.items(): - if isinstance(value, str): - if value.lower() == input_string.lower(): - self.__dict__[key] = output_object - - def __init__( - self, - allow_mutation: bool = False, - **kwargs, - ): - super().__init__(**kwargs) - self.Config.allow_mutation = allow_mutation - - self.__transform_attributes_with_str_to_object( - input_string="null", - output_object=None, - ) - self.__transform_attributes_with_str_to_object( - input_string="false", - output_object=False, - ) - self.__transform_attributes_with_str_to_object( - input_string="true", - output_object=True, - ) - - -class WandbSchema(BaseModel): - """Configuration for weights and biases.""" - - group: str - mode: str - entity: str - - -class WatcherSchema(BaseModel): - """Configuration for watchers.""" - - archive_all: bool - keep_alive_after_training_minutes: Union[int, float] - n_runs_before_eval: int - verbose: bool - - -class ProjectSchema(BaseModel): - """Project configuration.""" - - wandb: WandbSchema - name: str = "psycop_model_training" - seed: int - watcher: WatcherSchema - gpu: bool - - -class CustomColNames(BaseModel): - """All custom column names, i.e. 
columns that won't generalise across - projects.""" - - n_hba1c: str - - -class ColumnNamesSchema(BaseModel): - """Column names in the data.""" - - pred_timestamp: str # Column name for prediction times - outcome_timestamp: str # Column name for outcome timestamps - id: str # Citizen colnames - age: str # Name of the age column - exclusion_timestamp: str # Name of the exclusion timestamps column. - # Drops all visits whose pred_timestamp <= exclusion_timestamp. - - custom: Optional[CustomColNames] = None - # Column names that are custom to the given prediction problem. - - -class DataSchema(BaseModel): - """Data configuration.""" - - n_training_samples: Optional[int] - # Number of training samples to use, defaults to null in which cases it uses all samples. - - dir: Union[Path, str] # Location of the dataset - suffix: str # File suffix to load. - - # Feature specs - col_name: ColumnNamesSchema - - pred_prefix: str # prefix of predictor columns - outc_prefix: str # prefix of outcome columns - - min_age: Union[int, float] # Minimum age to include in the dataset - - # Looking ahead - min_lookahead_days: int - # Drop all prediction times where (max timestamp in the dataset) - (current timestamp) is less than min_lookahead_days - - drop_patient_if_exclusion_before_date: Optional[Union[str, datetime]] - # Drop all visits from a patient if the outcome is before this date. If None, no patients are dropped. - - min_prediction_time_date: Optional[Union[str, datetime]] - # Drop all prediction times before this date. - - lookbehind_combination: Optional[list[int]] - # Which combination of features to use. Only uses features that have "within_X_days" in their column name, where X is any of the numbers in this list. - - -class FeatureSelectionSchema(BaseModel): - """Configuration for feature selection methods.""" - - name: Optional[str] = None - # Which feature selection method to use. - - params: Optional[dict] = None - # Parameters for the feature selection method. 
- - -class PreprocessingConfigSchema(BaseModel): - """Preprocessing config.""" - - convert_to_boolean: bool - # Convert all prediction values (except gender) to boolean. Defaults to False. Useful as a sensitivty test, i.e. "is model performance based on whether blood samples are taken, or their values". If based purely on whether blood samples are taken, might indicate that it's just predicting whatever the doctor suspected. - - convert_booleans_to_int: bool - # Whether to convert columns containing booleans to int - - drop_datetime_predictor_columns: bool - # Whether to drop datetime columns prefixed with data.pred_prefix. - # Typically, we don't want to use these as features, since they are unlikely to generalise into the future. - - convert_datetimes_to_ordinal: bool - # Whether to convert datetimes to ordinal. - - imputation_method: Literal["most_frequent", "mean", "median", "null"] - # How to replace missing values. Takes all values from the sklearn.impute.SimpleImputer class. - # https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html - - scaling: Optional[str] - # Scaling applied to all predictors after imputation. Options include "z-score-normalization". - - feature_selection: FeatureSelectionSchema - - -class ModelConfSchema(BaseModel): - """Model configuration.""" - - name: str # Model, can currently take xgboost - require_imputation: bool # Whether the model requires imputation. (shouldn't this be false?) - args: dict - - -class TrainConfSchema(BaseModel): - """Training configuration.""" - - n_splits: int # ? How do we handle whether to use crossvalidation or train/val splitting? 
- n_trials_per_lookahead: int - n_active_trainers: int # Number of lookahead windows to train for at once - n_jobs_per_trainer: int # Number of jobs to run in parallel for each lookahead window - random_delay_per_job_seconds: Optional[ - int - ] = None # Add random delay based on cfg.train.random_delay_per_job to avoid - # each job needing the same resources (GPU, disk, network) at the same time - - -class EvalConfSchema(BaseModel): - """Evaluation config.""" - - force: bool = False - # Whether to force evaluation even if wandb is not "run". Used for testing. - - top_n_feature_importances: int - # How many feature_importances to plot. Plots the most important n features. A table with all features is also logged. - - positive_rate_thresholds: list[int] - # The threshold mapping a model's predicted probability to a binary outcome can be computed if we know, which positive rate we're targeting. We can't know beforehand which positive rate is best, beause it's a trade-off between false-positives and false-negatives. Therefore, we compute performacne for a range of positive rates. - - save_model_predictions_on_overtaci: bool - - lookahead_bins: list[int] - # List of lookahead distances for plotting. Will create bins in between each distances. E.g. if specifying 1, 5, 10, will bin evaluation as follows: [0, 1], [1, 5], [5, 10], [10, inf]. - - lookbehind_bins: list[int] - # List of lookbehidn distances for plotting. Will create bins in between each distances. E.g. if specifying 1, 5, 10, will bin evaluation as follows: [0, 1], [1, 5], [5, 10], [10, inf]. 
- - -class FullConfigSchema(BaseModel): - """A recipe for a full configuration object.""" - - project: ProjectSchema - data: DataSchema - preprocessing: PreprocessingConfigSchema - model: ModelConfSchema - train: TrainConfSchema - eval: EvalConfSchema - - -def convert_omegaconf_to_pydantic_object( - conf: DictConfig, - allow_mutation: bool = False, -) -> FullConfigSchema: - """Converts an omegaconf DictConfig to a pydantic object. - - Args: - conf (DictConfig): Omegaconf DictConfig - allow_mutation (bool, optional): Whether to make the pydantic object mutable. Defaults to False. - Returns: - FullConfig: Pydantic object - """ - conf = OmegaConf.to_container(conf, resolve=True) # type: ignore - return FullConfigSchema(**conf, allow_mutation=allow_mutation) - - -def load_cfg_as_omegaconf( - config_file_name: str, - overrides: Optional[list[str]] = None, -) -> DictConfig: - """Load config as omegaconf object.""" - with initialize(version_base=None, config_path="./"): - if overrides: - cfg = compose( - config_name=config_file_name, - overrides=overrides, - ) - else: - cfg = compose( - config_name=config_file_name, - ) - - # Override the type so we can get autocomplete and renaming - # correctly working - cfg: FullConfigSchema = cfg # type: ignore - - gpu = cfg.project.gpu - - if not gpu and cfg.model.name == "xgboost": - cfg.model.args["tree_method"] = "auto" - - return cfg - - -def load_cfg_as_pydantic( - config_file_name, - allow_mutation: bool = False, - overrides: Optional[list[str]] = None, -) -> FullConfigSchema: - """Load config as pydantic object.""" - cfg = load_cfg_as_omegaconf(config_file_name=config_file_name, overrides=overrides) - - return convert_omegaconf_to_pydantic_object(conf=cfg, allow_mutation=allow_mutation) diff --git a/src/psycop_model_training/data_loader/__init__.py b/src/psycop_model_training/data_loader/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/psycop_model_training/data_loader/col_name_checker.py 
b/src/psycop_model_training/data_loader/col_name_checker.py new file mode 100644 index 00000000..8a4fed4c --- /dev/null +++ b/src/psycop_model_training/data_loader/col_name_checker.py @@ -0,0 +1,74 @@ +"""Check that all columns in the config exist in the dataset.""" + +import Levenshtein +import pandas as pd + +from psycop_model_training.utils.config_schemas.data import ColumnNamesSchema + + +def get_most_likely_str_from_edit_distance( + candidate_strs: list[str], + input_str: str, + n_str_to_return: int, + edit_distance_threshold: int = 15, +) -> list[str]: + """Get most likely string from edit distance. + + Args: + candidate_strs (list[str]): List of candidate strings. + input_str (str): The incorrect string. + n_str_to_return (int): Number of strings to return. + edit_distance_threshold (int, optional): Maximum edit distance to consider. Defaults to 15. + + Returns: + list[str]: Up to n_str_to_return strings from candidate_strs within the threshold, ordered by increasing edit distance from input_str. + """ + # Compute the Levenshtein distance between the input string and each candidate string. + distances = [ + Levenshtein.distance(input_str, candidate) for candidate in candidate_strs + ] + + # Sort the candidate strings by their Levenshtein distance from the input string. + sorted_candidates = [ + x + for distance, x in sorted(zip(distances, candidate_strs)) + if distance <= edit_distance_threshold + ] + + # Return the first `n_str_to_return` elements of the sorted list of candidate strings.
+ return sorted_candidates[:n_str_to_return] + + +def check_columns_exist_in_dataset( + col_name_schema: ColumnNamesSchema, + df: pd.DataFrame, +): + """Check that all columns in the config exist in the dataset.""" + # Iterate over attributes in the config + error_strs = [] + + for attr in dir(col_name_schema): + # Skip private attributes + if attr.startswith("_"): + continue + + col_name = getattr(col_name_schema, attr) + + # Skip col names that are not string + if not isinstance(col_name, str): + continue + + # Check that the column exists in the dataset + if not col_name in df: + most_likely_alternatives = get_most_likely_str_from_edit_distance( + candidate_strs=df.columns, + input_str=col_name, + n_str_to_return=3, + ) + + error_str = f"Column '{col_name}' in config but not in dataset.\n" + error_str += f" Did you mean {most_likely_alternatives}? \n" + error_strs.append(error_str) + + if error_strs: + raise ValueError("\n".join(error_strs)) diff --git a/src/psycop_model_training/data_loader/data_classes.py b/src/psycop_model_training/data_loader/data_classes.py new file mode 100644 index 00000000..2132064a --- /dev/null +++ b/src/psycop_model_training/data_loader/data_classes.py @@ -0,0 +1,18 @@ +from typing import Optional + +import pandas as pd + +from psycop_model_training.utils.basemodel import BaseModel + + +class SplitDataset(BaseModel): + """A dataset split into train, test and optionally validation.""" + + class Config: + """Configuration for the dataclass to allow pd.DataFrame as type.""" + + arbitrary_types_allowed = True + + train: pd.DataFrame + test: Optional[pd.DataFrame] = None + val: pd.DataFrame diff --git a/src/psycop_model_training/data_loader/data_loader.py b/src/psycop_model_training/data_loader/data_loader.py new file mode 100644 index 00000000..9725ca62 --- /dev/null +++ b/src/psycop_model_training/data_loader/data_loader.py @@ -0,0 +1,122 @@ +"""Loader for the t2d dataset.""" +import logging +from collections.abc import Iterable +from 
class DataLoader:
    """Class to handle loading of a datasplit.

    Reads split files ("train", "test", "val") from ``cfg.data.dir`` and,
    when a column name checker is supplied, validates that the columns named
    in ``cfg.data.col_name`` exist in each loaded file.
    """

    def __init__(
        self,
        cfg: FullConfigSchema,
        column_name_checker: Optional[Callable] = check_columns_exist_in_dataset,
    ):
        """Store the config and derive the file-handling settings from it.

        Args:
            cfg (FullConfigSchema): Full config; ``cfg.data.dir``,
                ``cfg.data.suffix``, ``cfg.data.pred_prefix`` and
                ``cfg.data.col_name`` are read.
            column_name_checker (Optional[Callable]): Called as
                ``checker(col_name_schema=..., df=...)`` on every loaded file.
                Pass None to skip the check.
        """
        self.cfg: FullConfigSchema = cfg

        # File handling
        self.dir_path = Path(cfg.data.dir)
        self.file_suffix = cfg.data.suffix
        self.column_name_checker = column_name_checker

        # Column specifications
        self.pred_col_name_prefix = cfg.data.pred_prefix

    def _check_column_names(self, df: pd.DataFrame):
        """Check that all columns in the config exist in the dataset."""
        if self.column_name_checker:
            self.column_name_checker(col_name_schema=self.cfg.data.col_name, df=df)
        else:
            log.debug("No column name checker specified. Skipping column name check.")

    def _load_dataset_file(
        self,
        split_name: str,
        nrows: Optional[int] = None,
    ) -> pd.DataFrame:
        """Load dataset from directory. Finds any file with the matching file
        suffix with the split name in its filename.

        Args:
            split_name (str): Name of split, allowed are ["train", "test", "val"]
            nrows (Optional[int]): Number of rows to load. Defaults to None, in which case
                all rows are loaded. Only supported for csv files.

        Returns:
            pd.DataFrame: The dataset

        Raises:
            ValueError: If the configured file suffix or the split name is not
                supported, or if nrows is given for a parquet file.
            FileNotFoundError: If no file in the directory matches the split
                name and suffix.
        """
        msg.info(f"Loading {split_name}")

        if self.file_suffix not in ("csv", "parquet"):
            raise ValueError(f"File suffix {self.file_suffix} not supported.")

        if split_name not in ("train", "test", "val"):
            raise ValueError(f"Split name {split_name} not supported.")

        # Sort so the chosen file is deterministic if several files match
        matching_paths = sorted(
            self.dir_path.glob(f"*{split_name}*.{self.file_suffix}"),
        )

        if not matching_paths:
            raise FileNotFoundError(
                f"No file matching '*{split_name}*.{self.file_suffix}' in {self.dir_path}.",
            )

        path = matching_paths[0]

        # The suffix was validated above, so it is exactly "parquet" or "csv"
        if self.file_suffix == "parquet":
            if nrows:
                raise ValueError(
                    "nrows is not supported for parquet files. Please use csv files.",
                )

            df = pd.read_parquet(path)
        else:
            df = pd.read_csv(filepath_or_buffer=path, nrows=nrows)

        # _check_column_names handles the "no checker" case itself (and logs it)
        self._check_column_names(df=df)

        return df

    def load_dataset_from_dir(
        self,
        split_names: Union[Iterable[str], str],
        nrows: Optional[int] = None,
    ) -> pd.DataFrame:
        """Load dataset for t2d. Can load multiple splits at once, e.g.
        concatenate train and val for crossvalidation.

        Args:
            split_names (Union[Iterable[str], str]): Name of split, allowed are ["train", "test", "val"].
                Any iterable of names is accepted, not only lists and tuples.
            nrows (Optional[int]): Number of rows to load from dataset. Defaults to None, in which case all rows are loaded.
                When several splits are given, the rows are divided evenly between them.

        Returns:
            pd.DataFrame: The loaded dataset; multiple splits are concatenated
                with a fresh index.

        Raises:
            ValueError: If an empty collection of split names is given.
        """
        # A single split name is itself iterable, so handle it before the general case
        if isinstance(split_names, str):
            return self._load_dataset_file(split_name=split_names, nrows=nrows)

        # Materialise once: supports generators/sets/config lists and allows len()
        split_names = tuple(split_names)

        if not split_names:
            raise ValueError("No split names provided.")

        if nrows is not None:
            nrows = int(
                nrows / len(split_names),
            )

        # Concat splits if multiple are given
        return pd.concat(
            [
                self._load_dataset_file(split_name=split, nrows=nrows)
                for split in split_names
            ],
            ignore_index=True,
        )
def load_train_raw(
    cfg: FullConfigSchema,
    convert_timestamp_types_and_nans: bool = True,
) -> pd.DataFrame:
    """Load the train split from ``cfg.data.dir`` without filtering.

    Exactly one file whose name contains "train" must exist in the directory.

    Args:
        cfg (FullConfigSchema): Config; only ``cfg.data.dir`` is read.
        convert_timestamp_types_and_nans (bool): If True, pass the loaded
            dataframe through
            ``PreSplitValueCleaner.convert_timestamp_dtype_and_nat``.
            Helpful during tests to convert columns with matching names to
            datetime. Defaults to True.

    Returns:
        pd.DataFrame: The loaded train dataset.

    Raises:
        ValueError: If the number of matching files is not exactly one, or if
            the matching file is neither a .parquet nor a .csv file.
    """
    path = Path(cfg.data.dir)
    file_names = list(path.glob("*train*"))

    if len(file_names) != 1:
        raise ValueError(f"Returned {len(file_names)} files")

    file_name = file_names[0]
    file_suffix = file_name.suffix

    if file_suffix == ".parquet":
        df = pd.read_parquet(file_name)
    elif file_suffix == ".csv":
        df = pd.read_csv(file_name)
    else:
        # Fail explicitly rather than hitting an unbound `df` further down
        raise ValueError(
            f"File suffix {file_suffix} not supported. Expected .parquet or .csv.",
        )

    # Helpful during tests to convert columns with matching names to datetime
    if convert_timestamp_types_and_nans:
        df = PreSplitValueCleaner.convert_timestamp_dtype_and_nat(dataset=df)

    return df
psycopmlutils.sql.loader import sql_load -from pydantic import BaseModel -from wasabi import Printer - -from psycop_model_training.config.schemas import FullConfigSchema -from psycop_model_training.utils.col_name_inference import infer_look_distance -from psycop_model_training.utils.decorators import print_df_dimensions_diff -from psycop_model_training.utils.utils import ( - get_percent_lost, - infer_outcome_col_name, - infer_predictor_col_name, -) - -msg = Printer(timestamp=True) - - -def load_timestamp_for_any_diabetes(): - """Loads timestamps for the broad definition of diabetes used for wash-in. - - See R files for details. - """ - timestamp_any_diabetes = sql_load( - query="SELECT * FROM [fct].[psycop_t2d_first_diabetes_any]", - format_timestamp_cols_to_datetime=False, - )[["dw_ek_borger", "datotid_first_diabetes_any"]] - - timestamp_any_diabetes = timestamp_any_diabetes.rename( - columns={"datotid_first_diabetes_any": "timestamp_washin"}, - ) - - return timestamp_any_diabetes - - -def add_washin_timestamps(dataset: pd.DataFrame) -> pd.DataFrame: - """Add washin timestamps to dataset. - - Washin is an exclusion criterion. E.g. if the patient has any visit - that looks like diabetes before the study starts (i.e. during - washin), they are excluded. 
- """ - timestamp_washin = load_timestamp_for_any_diabetes() - - dataset = dataset.merge( - timestamp_washin, - on="dw_ek_borger", - how="left", - ) - - return dataset - - -class DataLoader: - """Class to handle loading of a datasplit.""" - - def __init__( - self, - cfg: FullConfigSchema, - ): - self.cfg: FullConfigSchema = cfg - - # File handling - self.dir_path = Path(cfg.data.dir) - self.file_suffix = cfg.data.suffix - - # Column specifications - self.pred_col_name_prefix = cfg.data.pred_prefix - - def _load_dataset_file( # pylint: disable=inconsistent-return-statements - self, - split_name: str, - nrows: Optional[int] = None, - ) -> pd.DataFrame: # pylint: disable=inconsistent-return-statements - """Load dataset from directory. Finds any file with the matching file - suffix with the split name in its filename. - - Args: - split_name (str): Name of split, allowed are ["train", "test", "val"] - nrows (Optional[int]): Number of rows to load. Defaults to None, in which case - all rows are loaded. - self.file_suffix (str, optional): File suffix of the dataset. Defaults to "parquet". - - Returns: - pd.DataFrame: The dataset - """ - msg.info(f"Loading {split_name}") - - if self.file_suffix not in ("csv", "parquet"): - raise ValueError(f"File suffix {self.file_suffix} not supported.") - - if split_name not in ("train", "test", "val"): - raise ValueError(f"Split name {split_name} not supported.") - - path = list(self.dir_path.glob(f"*{split_name}*.{self.file_suffix}"))[0] - - if "parquet" in self.file_suffix: - if nrows: - raise ValueError( - "nrows is not supported for parquet files. Please use csv files.", - ) - return pd.read_parquet(path) - elif "csv" in self.file_suffix: - return pd.read_csv(filepath_or_buffer=path, nrows=nrows) - - def _drop_rows_if_datasets_ends_within_days( - self, - n_days: Union[int, float], - dataset: pd.DataFrame, - direction: str, - ) -> pd.DataFrame: - """Drop visits that lie within certain amount of days from end of - dataset. 
- - Args: - n_days (Union[float, int]): Number of days. - dataset (pd.DataFrame): Dataset. - direction (str): Direction to look. Allowed are ["before", "after"]. - - Returns: - pd.DataFrame: Dataset with dropped rows. - """ - if not isinstance(n_days, timedelta): - n_days_timedelt: timedelta = timedelta(days=n_days) # type: ignore - - if direction not in ("ahead", "behind"): - raise ValueError(f"Direction {direction} not supported.") - - n_rows_before_modification = dataset.shape[0] - - if direction == "ahead": - max_datetime = ( - dataset[self.cfg.data.col_name.pred_timestamp].max() - n_days_timedelt - ) - before_max_dt = ( - dataset[self.cfg.data.col_name.pred_timestamp] < max_datetime - ) - dataset = dataset[before_max_dt] - elif direction == "behind": - min_datetime = ( - dataset[self.cfg.data.col_name.pred_timestamp].min() + n_days_timedelt - ) - after_min_dt = dataset[self.cfg.data.col_name.pred_timestamp] > min_datetime - dataset = dataset[after_min_dt] - - n_rows_after_modification = dataset.shape[0] - percent_dropped = get_percent_lost( - n_before=n_rows_before_modification, - n_after=n_rows_after_modification, - ) - - if n_rows_before_modification - n_rows_after_modification != 0: - msg.info( - f"Dropped {n_rows_before_modification - n_rows_after_modification} ({percent_dropped}%) rows because the end of the dataset was within {n_days} of their prediction time when looking {direction} from their prediction time", - ) - - return dataset - - @print_df_dimensions_diff - def _drop_patient_if_excluded( - self, - dataset: pd.DataFrame, - ) -> pd.DataFrame: - """Drop patients that have an exclusion event within the washin - period.""" - - n_rows_before_modification = dataset.shape[0] - - outcome_before_date = ( - dataset[self.cfg.data.col_name.exclusion_timestamp] - < self.cfg.data.drop_patient_if_exclusion_before_date - ) - - patients_to_drop = set( - dataset[self.cfg.data.col_name.id][outcome_before_date].unique(), - ) - dataset = 
dataset[~dataset[self.cfg.data.col_name.id].isin(patients_to_drop)] - - n_rows_after_modification = dataset.shape[0] - - percent_dropped = get_percent_lost( - n_before=n_rows_after_modification, - n_after=n_rows_after_modification, - ) - - if n_rows_before_modification - n_rows_after_modification != 0: - msg.info( - f"Dropped {n_rows_before_modification - n_rows_after_modification} ({percent_dropped}%) rows because they met exclusion criteria before {self.cfg.data.drop_patient_if_exclusion_before_date}.", - ) - else: - msg.info( - f"No rows met exclusion criteria before {self.cfg.data.drop_patient_if_exclusion_before_date}. Didn't drop any.", - ) - - return dataset - - @print_df_dimensions_diff - def _drop_cols_not_in_lookbehind_combination( - self, - dataset: pd.DataFrame, - ) -> pd.DataFrame: - """Drop predictor columns that are not in the specified combination of - lookbehind windows. - - Args: - dataset (pd.DataFrame): Dataset. - - Returns: - pd.DataFrame: Dataset with dropped columns. 
- """ - - if not self.cfg.data.lookbehind_combination: - raise ValueError("No lookbehind_combination provided.") - - # Extract all unique lookbhehinds in the dataset predictors - lookbehinds_in_dataset = { - int(infer_look_distance(col)[0]) - for col in infer_predictor_col_name(df=dataset) - if len(infer_look_distance(col)) > 0 - } - - # Convert list to set - lookbehinds_in_spec = set(self.cfg.data.lookbehind_combination) - - # Check that all loobehinds in lookbehind_combination are used in the predictors - if not lookbehinds_in_spec.issubset( - lookbehinds_in_dataset, - ): - msg.warn( - f"One or more of the provided lookbehinds in lookbehind_combination is/are not used in any predictors in the dataset: {lookbehinds_in_spec - lookbehinds_in_dataset}", - ) - - lookbehinds_to_keep = lookbehinds_in_spec.intersection( - lookbehinds_in_dataset, - ) - - if not lookbehinds_to_keep: - raise ValueError("No predictors left after dropping lookbehinds.") - - msg.warn(f"Training on {lookbehinds_to_keep}.") - else: - lookbehinds_to_keep = lookbehinds_in_spec - - # Create a list of all predictor columns who have a lookbehind window not in lookbehind_combination list - cols_to_drop = [ - c - for c in infer_predictor_col_name(df=dataset) - if all(str(l_beh) not in c for l_beh in lookbehinds_to_keep) - ] - - cols_to_drop = [c for c in cols_to_drop if "within" in c] - # ? Add some specification of within_x_days indicating how to parse columns to find lookbehinds. Or, alternatively, use the column spec. 
- - dataset = dataset.drop(columns=cols_to_drop) - return dataset - - @staticmethod - @print_df_dimensions_diff - def convert_timestamp_dtype_and_nat(dataset: pd.DataFrame) -> pd.DataFrame: - """Convert columns with `timestamp`in their name to datetime, and - convert 0's to NaT.""" - timestamp_colnames = [col for col in dataset.columns if "timestamp" in col] - - for colname in timestamp_colnames: - if dataset[colname].dtype != "datetime64[ns]": - # Convert all 0s in colname to NaT - dataset[colname] = dataset[colname].apply( - lambda x: pd.NaT if x == "0" else x, - ) - dataset[colname] = pd.to_datetime(dataset[colname]) - - return dataset - - def _drop_cols_if_exceeds_look_direction_threshold( - self, - dataset: pd.DataFrame, - look_direction_threshold: Union[int, float], - direction: str, - ) -> pd.DataFrame: - """Drop columns if they look behind or ahead longer than a specified - threshold. - - For example, if direction is "ahead", and n_days is 30, then the column - should be dropped if it's trying to look 60 days ahead. This is useful - to avoid some rows having more information than others. - - Args: - dataset (pd.DataFrame): Dataset to process. - look_direction_threshold (Union[int, float]): Number of days to look in the direction. - direction (str): Direction to look. Allowed are ["ahead", "behind"]. - - Returns: - pd.DataFrame: Dataset without the dropped columns. - """ - - cols_to_drop = [] - - n_cols_before_modification = dataset.shape[1] - - if direction == "behind": - cols_to_process = infer_predictor_col_name(df=dataset) - - for col in cols_to_process: - # Extract lookbehind days from column name use regex - # E.g. "column_name_within_90_days" == 90 - # E.g. 
"column_name_within_90_days_fallback_NaN" == 90 - lookbehind_days_strs = re.findall(r"within_(\d+)_days", col) - - if len(lookbehind_days_strs) > 0: - lookbehind_days = int(lookbehind_days_strs[0]) - else: - msg.warn(f"Could not extract lookbehind days from {col}") - continue - - if lookbehind_days > look_direction_threshold: - cols_to_drop.append(col) - - n_cols_after_modification = dataset.shape[1] - percent_dropped = get_percent_lost( - n_before=n_cols_before_modification, - n_after=n_cols_after_modification, - ) - - if n_cols_before_modification - n_cols_after_modification != 0: - msg.info( - f"Dropped {n_cols_before_modification - n_cols_after_modification} ({percent_dropped}%) columns because they were looking {direction} further out than {look_direction_threshold} days.", - ) - - return dataset[[c for c in dataset.columns if c not in cols_to_drop]] - - @print_df_dimensions_diff - def _drop_cols_and_rows_if_look_direction_not_met( - self, - dataset: pd.DataFrame, - ) -> pd.DataFrame: - """Drop columns if they are outside the specification. Specifically: - - - min_lookahead_days is insufficient for the column's lookahead - - The dataset doesn't stretch far enough for the prediction time's lookahead - - The dataset doesn't stretch far enough for the prediction time's lookbehind - - Args: - dataset (pd.DataFrame): Dataset to process. 
- """ - for direction in ("ahead", "behind"): - - if direction in ("ahead", "behind"): - if direction == "ahead": - n_days = self.cfg.data.min_lookahead_days - elif direction == "behind": - n_days = max(self.cfg.data.lookbehind_combination) - else: - continue - - dataset = self._drop_rows_if_datasets_ends_within_days( - n_days=n_days, - dataset=dataset, - direction=direction, - ) - - dataset = self._drop_cols_if_exceeds_look_direction_threshold( - dataset=dataset, - look_direction_threshold=n_days, - direction=direction, - ) - - return dataset - - @print_df_dimensions_diff - def _keep_unique_outcome_col_with_lookahead_days_matching_conf( - self, - dataset: pd.DataFrame, - ) -> pd.DataFrame: - """Keep only one outcome column with the same lookahead days as set in - the config.""" - outcome_cols = infer_outcome_col_name(df=dataset, allow_multiple=True) - - col_to_drop = [ - c for c in outcome_cols if str(self.cfg.data.min_lookahead_days) not in c - ] - - # If no columns to drop, return the dataset - if not col_to_drop: - return dataset - - df = dataset.drop(col_to_drop, axis=1) - - if not len(infer_outcome_col_name(df)) == 1: - raise ValueError( - "Returning more than one outcome column, will cause problems during eval.", - ) - - return df - - def _keep_only_if_older_than_min_age(self, dataset: pd.DataFrame) -> pd.DataFrame: - """Keep only rows that are older than the minimum age specified in the - config.""" - return dataset[dataset[self.cfg.data.col_name.age] >= self.cfg.data.min_age] - - @print_df_dimensions_diff - def n_outcome_col_names(self, df: pd.DataFrame) -> int: - """How many outcome columns there are in a dataframe.""" - return len(infer_outcome_col_name(df=df, allow_multiple=True)) - - @print_df_dimensions_diff - def _drop_rows_after_event_time(self, dataset: pd.DataFrame) -> pd.DataFrame: - """Drop all rows where prediction timestamp is after the outcome.""" - - rows_to_drop = ( - dataset[self.cfg.data.col_name.pred_timestamp] - > 
dataset[self.cfg.data.col_name.outcome_timestamp] - ) - - return dataset[~rows_to_drop] - - @print_df_dimensions_diff - def _convert_boolean_dtypes_to_int(self, dataset: pd.DataFrame) -> pd.DataFrame: - """Convert boolean dtypes to int.""" - for col in dataset.columns: - if dataset[col].dtype == bool: - dataset[col] = dataset[col].astype(int) - - return dataset - - @print_df_dimensions_diff - def _negative_values_to_nan(self, dataset: pd.DataFrame) -> pd.DataFrame: - """Convert negative values to NaN.""" - preds = dataset[infer_predictor_col_name(df=dataset)] - - # Get all columns with negative values - cols_with_numerical_values = preds.select_dtypes(include=["number"]).columns - - numerical_columns_with_negative_values = [ - c for c in cols_with_numerical_values if preds[c].min() < 0 - ] - - df_to_replace = dataset[numerical_columns_with_negative_values] - - # Convert to NaN - df_to_replace[df_to_replace < 0] = np.nan - dataset[numerical_columns_with_negative_values] = df_to_replace - - return dataset - - def _process_dataset(self, dataset: pd.DataFrame) -> pd.DataFrame: - """Process dataset, namely: - - - Drop patients with outcome before drop_patient_if_outcome_before_date - - Process timestamp columns - - Drop visits where min_lookahead, min_lookbehind or min_prediction_time_date are not met - - Drop features with lookbehinds not in lookbehind_combination - - Returns: - pd.DataFrame: Processed dataset - """ - msg = Printer(timestamp=True) - msg.info("Processing dataset") - - # Super hacky rename, needs to be removed before merging. Figure out how to add eval columns when creating the dataset. - dataset = dataset.rename( - { - "pred_hba1c_within_9999_days_count_fallback_nan": self.cfg.data.col_name.custom.n_hba1c, - }, - axis=1, - ) - - # Super hacky transformation of negative weights (?!) for chi-square. - # In the future, we want to: - # 1. Fix this in the feature generation for t2d - # 2a. 
See if there's a way of using feature selection that permits negative values, or - # 2b. Always use z-score normalisation? - dataset = self._negative_values_to_nan(dataset=dataset) - - dataset = self.convert_timestamp_dtype_and_nat(dataset=dataset) - - if self.cfg.preprocessing.convert_booleans_to_int: - dataset = self._convert_boolean_dtypes_to_int(dataset=dataset) - - if self.cfg.data.min_age: - dataset = self._keep_only_if_older_than_min_age(dataset=dataset) - - dataset = self._drop_rows_after_event_time(dataset=dataset) - - if self.cfg.data.drop_patient_if_exclusion_before_date: - dataset = self._drop_patient_if_excluded(dataset=dataset) - - # Drop if later than min prediction time date - if self.cfg.data.min_prediction_time_date: - dataset = dataset[ - dataset[self.cfg.data.col_name.pred_timestamp] - > self.cfg.data.min_prediction_time_date - ] - - dataset = self._drop_cols_and_rows_if_look_direction_not_met(dataset=dataset) - - if self.cfg.data.lookbehind_combination: - dataset = self._drop_cols_not_in_lookbehind_combination(dataset=dataset) - - dataset = self._keep_unique_outcome_col_with_lookahead_days_matching_conf( - dataset=dataset, - ) - - msg.info("Finished processing dataset") - - return dataset - - def load_dataset_from_dir( - self, - split_names: Union[Iterable[str], str], - nrows: Optional[int] = None, - ) -> pd.DataFrame: - """Load dataset for t2d. Can load multiple splits at once, e.g. - concatenate train and val for crossvalidation. - - Args: - split_names (Union[Iterable[str], str]): Name of split, allowed are ["train", "test", "val"] - nrows (Optional[int]): Number of rows to load from dataset. Defaults to None, in which case all rows are loaded. 
- - Returns: - pd.DataFrame: The filtered dataset - """ - msg.info(f"Loading {split_names}") - - # Concat splits if multiple are given - if isinstance(split_names, (list, tuple)): - if isinstance(split_names, Iterable): - split_names = tuple(split_names) - - if nrows is not None: - nrows = int( - nrows / len(split_names), - ) - - return pd.concat( - [ - self._load_dataset_file(split_name=split, nrows=nrows) - for split in split_names - ], - ignore_index=True, - ) - elif isinstance(split_names, str): - dataset = self._load_dataset_file(split_name=split_names, nrows=nrows) - - dataset = self._process_dataset(dataset=dataset) - - msg.good(f"{split_names}: Returning!") - return dataset - - -class SplitDataset(BaseModel): - """A dataset split into train, test and optionally validation.""" - - class Config: - """Configuration for the dataclass to allow pd.DataFrame as type.""" - - arbitrary_types_allowed = True - - train: pd.DataFrame - test: Optional[pd.DataFrame] = None - val: pd.DataFrame - - -def load_train_from_cfg(cfg: FullConfigSchema) -> pd.DataFrame: - """Load train dataset from config. 
- - Args: - cfg (FullConfig): Config - - Returns: - pd.DataFrame: Train dataset - """ - return DataLoader(cfg=cfg).load_dataset_from_dir(split_names="train") - - -def load_train_and_val_from_cfg(cfg: FullConfigSchema): - """Load train and validation data from file.""" - - loader = DataLoader(cfg=cfg) - - return SplitDataset( - train=loader.load_dataset_from_dir(split_names="train"), - val=loader.load_dataset_from_dir(split_names="val"), - ) - - -def get_latest_dataset_dir(path: Path) -> Path: - """Get the latest dataset directory by time of creation.""" - return max(path.glob("*"), key=os.path.getctime) - - -def load_train_raw(cfg: FullConfigSchema): - """Load the data.""" - path = Path(cfg.data.dir) - file_names = list(path.glob(pattern=r"*train*")) - - if len(file_names) == 1: - file_name = file_names[0] - file_suffix = file_name.suffix - if file_suffix == ".parquet": - df = pd.read_parquet(file_name) - elif file_suffix == ".csv": - df = pd.read_csv(file_name) - - df = DataLoader.convert_timestamp_dtype_and_nat(dataset=df) - - return df - - raise ValueError(f"Returned {len(file_names)} files") diff --git a/src/psycop_model_training/model_eval/dataclasses.py b/src/psycop_model_training/model_eval/dataclasses.py index 611175f5..ecabe234 100644 --- a/src/psycop_model_training/model_eval/dataclasses.py +++ b/src/psycop_model_training/model_eval/dataclasses.py @@ -4,7 +4,8 @@ import pandas as pd -from psycop_model_training.config.schemas import BaseModel, FullConfigSchema +from psycop_model_training.utils.basemodel import BaseModel +from psycop_model_training.utils.config_schemas.conf_utils import FullConfigSchema class CustomColumns(BaseModel): diff --git a/src/psycop_model_training/model_eval/evaluate_model.py b/src/psycop_model_training/model_eval/evaluate_model.py index 765b57de..f1cc701d 100644 --- a/src/psycop_model_training/model_eval/evaluate_model.py +++ b/src/psycop_model_training/model_eval/evaluate_model.py @@ -8,40 +8,40 @@ from sklearn.metrics import 
recall_score from wandb.sdk.wandb_run import Run as wandb_run # pylint: disable=no-name-in-module -from psycop_model_training.config.schemas import FullConfigSchema from psycop_model_training.model_eval.dataclasses import ( ArtifactContainer, EvalDataset, PipeMetadata, ) -from psycop_model_training.model_eval.tables.performance_by_threshold import ( - generate_performance_by_positive_rate_table, -) -from psycop_model_training.model_eval.tables.tables import ( - generate_feature_importances_table, - generate_selected_features_table, -) -from psycop_model_training.utils.utils import positive_rate_to_pred_probs -from psycop_model_training.visualization.feature_importance import ( +from psycop_model_training.model_eval.plots.feature_importance import ( plot_feature_importances, ) -from psycop_model_training.visualization.performance_by_age import ( +from psycop_model_training.model_eval.plots.performance_by_age import ( plot_performance_by_age, ) -from psycop_model_training.visualization.performance_by_n_hba1c import ( +from psycop_model_training.model_eval.plots.performance_by_n_hba1c import ( plot_performance_by_n_hba1c, ) -from psycop_model_training.visualization.performance_over_time import ( +from psycop_model_training.model_eval.plots.performance_over_time import ( plot_auc_by_time_from_first_visit, plot_metric_by_calendar_time, plot_metric_by_cyclic_time, plot_metric_by_time_until_diagnosis, ) -from psycop_model_training.visualization.roc_auc import plot_auc_roc -from psycop_model_training.visualization.sens_over_time import ( +from psycop_model_training.model_eval.plots.roc_auc import plot_auc_roc +from psycop_model_training.model_eval.plots.sens_over_time import ( plot_sensitivity_by_time_to_outcome_heatmap, ) -from psycop_model_training.visualization.utils import log_image_to_wandb +from psycop_model_training.model_eval.plots.utils import log_image_to_wandb +from psycop_model_training.model_eval.tables.performance_by_threshold import ( + 
generate_performance_by_positive_rate_table, +) +from psycop_model_training.model_eval.tables.tables import ( + generate_feature_importances_table, + generate_selected_features_table, +) +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema +from psycop_model_training.utils.utils import positive_rate_to_pred_probs def upload_artifacts_to_wandb( @@ -170,7 +170,8 @@ def create_custom_plot_artifacts( eval_dataset: EvalDataset, save_dir: Path, ) -> list[ArtifactContainer]: - """A collection of plots that are always generated.""" + """A collection of plots that are only generated for your specific use + case.""" return [ ArtifactContainer( label="performance_by_n_hba1c", diff --git a/src/psycop_model_training/model_eval/examples/evaluate_model_from_file.py b/src/psycop_model_training/model_eval/examples/evaluate_model_from_file.py index cdb0bf72..0fc601d8 100644 --- a/src/psycop_model_training/model_eval/examples/evaluate_model_from_file.py +++ b/src/psycop_model_training/model_eval/examples/evaluate_model_from_file.py @@ -11,15 +11,17 @@ import pandas as pd from omegaconf import DictConfig -from psycop_model_training.utils.utils import ( - PROJECT_ROOT, +from psycop_model_training.model_eval.plots import plot_auc_by_time_from_first_visit +from psycop_model_training.utils.col_name_inference import ( infer_outcome_col_name, infer_predictor_col_name, infer_y_hat_prob_col_name, +) +from psycop_model_training.utils.utils import ( + PROJECT_ROOT, load_evaluation_data, read_pickle, ) -from psycop_model_training.visualization import plot_auc_by_time_from_first_visit def load_model_predictions_and_cfg(path: Path) -> tuple[pd.DataFrame, DictConfig]: diff --git a/src/psycop_model_training/model_performance/__init__.py b/src/psycop_model_training/model_eval/model_performance/__init__.py similarity index 100% rename from src/psycop_model_training/model_performance/__init__.py rename to src/psycop_model_training/model_eval/model_performance/__init__.py 
diff --git a/src/psycop_model_training/model_performance/model_performance.py b/src/psycop_model_training/model_eval/model_performance/model_performance.py similarity index 99% rename from src/psycop_model_training/model_performance/model_performance.py rename to src/psycop_model_training/model_eval/model_performance/model_performance.py index 89e922db..e330a0a7 100644 --- a/src/psycop_model_training/model_performance/model_performance.py +++ b/src/psycop_model_training/model_eval/model_performance/model_performance.py @@ -15,7 +15,7 @@ roc_auc_score, ) -from psycop_model_training.model_performance.utils import ( +from psycop_model_training.model_eval.model_performance.utils import ( add_metadata_cols, aggregate_predictions, idx_to_class, diff --git a/src/psycop_model_training/model_performance/utils.py b/src/psycop_model_training/model_eval/model_performance/utils.py similarity index 100% rename from src/psycop_model_training/model_performance/utils.py rename to src/psycop_model_training/model_eval/model_performance/utils.py diff --git a/src/psycop_model_training/model_eval/plots/__init__.py b/src/psycop_model_training/model_eval/plots/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/psycop_model_training/visualization/base_charts.py b/src/psycop_model_training/model_eval/plots/base_charts.py similarity index 100% rename from src/psycop_model_training/visualization/base_charts.py rename to src/psycop_model_training/model_eval/plots/base_charts.py diff --git a/src/psycop_model_training/visualization/feature_importance.py b/src/psycop_model_training/model_eval/plots/feature_importance.py similarity index 96% rename from src/psycop_model_training/visualization/feature_importance.py rename to src/psycop_model_training/model_eval/plots/feature_importance.py index 78d8b012..64228512 100644 --- a/src/psycop_model_training/visualization/feature_importance.py +++ b/src/psycop_model_training/model_eval/plots/feature_importance.py @@ -7,7 +7,7 @@ 
import numpy as np import pandas as pd -from psycop_model_training.visualization.base_charts import plot_basic_chart +from psycop_model_training.model_eval.plots.base_charts import plot_basic_chart def plot_feature_importances( diff --git a/src/psycop_model_training/visualization/performance_by_age.py b/src/psycop_model_training/model_eval/plots/performance_by_age.py similarity index 91% rename from src/psycop_model_training/visualization/performance_by_age.py rename to src/psycop_model_training/model_eval/plots/performance_by_age.py index a8a74b20..591b9751 100644 --- a/src/psycop_model_training/visualization/performance_by_age.py +++ b/src/psycop_model_training/model_eval/plots/performance_by_age.py @@ -7,8 +7,8 @@ from sklearn.metrics import roc_auc_score from psycop_model_training.model_eval.dataclasses import EvalDataset -from psycop_model_training.visualization.base_charts import plot_basic_chart -from psycop_model_training.visualization.utils import create_performance_by_input +from psycop_model_training.model_eval.plots.base_charts import plot_basic_chart +from psycop_model_training.model_eval.plots.utils import create_performance_by_input def plot_performance_by_age( diff --git a/src/psycop_model_training/visualization/performance_by_n_hba1c.py b/src/psycop_model_training/model_eval/plots/performance_by_n_hba1c.py similarity index 92% rename from src/psycop_model_training/visualization/performance_by_n_hba1c.py rename to src/psycop_model_training/model_eval/plots/performance_by_n_hba1c.py index f9b54cfd..9a29d775 100644 --- a/src/psycop_model_training/visualization/performance_by_n_hba1c.py +++ b/src/psycop_model_training/model_eval/plots/performance_by_n_hba1c.py @@ -7,8 +7,8 @@ from sklearn.metrics import roc_auc_score from psycop_model_training.model_eval.dataclasses import EvalDataset -from psycop_model_training.visualization.base_charts import plot_basic_chart -from psycop_model_training.visualization.utils import create_performance_by_input +from 
psycop_model_training.model_eval.plots.base_charts import plot_basic_chart +from psycop_model_training.model_eval.plots.utils import create_performance_by_input def plot_performance_by_n_hba1c( diff --git a/src/psycop_model_training/visualization/performance_over_time.py b/src/psycop_model_training/model_eval/plots/performance_over_time.py similarity index 98% rename from src/psycop_model_training/visualization/performance_over_time.py rename to src/psycop_model_training/model_eval/plots/performance_over_time.py index 7cdff65e..60880694 100644 --- a/src/psycop_model_training/visualization/performance_over_time.py +++ b/src/psycop_model_training/model_eval/plots/performance_over_time.py @@ -13,9 +13,9 @@ from sklearn.metrics import f1_score, roc_auc_score from psycop_model_training.model_eval.dataclasses import EvalDataset +from psycop_model_training.model_eval.plots.base_charts import plot_basic_chart +from psycop_model_training.model_eval.plots.utils import calc_performance from psycop_model_training.utils.utils import bin_continuous_data, round_floats_to_edge -from psycop_model_training.visualization.base_charts import plot_basic_chart -from psycop_model_training.visualization.utils import calc_performance def create_performance_by_calendar_time_df( diff --git a/src/psycop_model_training/visualization/prob_over_time.py b/src/psycop_model_training/model_eval/plots/prob_over_time.py similarity index 100% rename from src/psycop_model_training/visualization/prob_over_time.py rename to src/psycop_model_training/model_eval/plots/prob_over_time.py diff --git a/src/psycop_model_training/visualization/roc_auc.py b/src/psycop_model_training/model_eval/plots/roc_auc.py similarity index 100% rename from src/psycop_model_training/visualization/roc_auc.py rename to src/psycop_model_training/model_eval/plots/roc_auc.py diff --git a/src/psycop_model_training/visualization/sens_over_time.py b/src/psycop_model_training/model_eval/plots/sens_over_time.py similarity index 100% 
rename from src/psycop_model_training/visualization/sens_over_time.py rename to src/psycop_model_training/model_eval/plots/sens_over_time.py diff --git a/src/psycop_model_training/visualization/utils.py b/src/psycop_model_training/model_eval/plots/utils.py similarity index 100% rename from src/psycop_model_training/visualization/utils.py rename to src/psycop_model_training/model_eval/plots/utils.py diff --git a/src/psycop_model_training/preprocessing/feature_transformers.py b/src/psycop_model_training/preprocessing/feature_transformers.py deleted file mode 100644 index 5886d6dd..00000000 --- a/src/psycop_model_training/preprocessing/feature_transformers.py +++ /dev/null @@ -1,90 +0,0 @@ -"""Custom transformers for data preprocessing.""" -from datetime import datetime -from typing import Optional - -from sklearn.base import BaseEstimator, TransformerMixin - -# pylint: disable=missing-function-docstring - - -class ConvertToBoolean(BaseEstimator, TransformerMixin): - """Convert all cells with a value to True, otherwise false.""" - - def __init__( - self, - columns_to_include: Optional[tuple[str]] = None, - columns_to_skip: Optional[tuple[str, str]] = ("age_in_years", "sex_female"), - ignore_dtypes: Optional[tuple] = ("datetime64[ns]", " None: - """ - Args: - columns_to_include (list[str], optional): Columns to convert to boolean. - Acts as a whitelist, skipping all columns not in the list. - columns_to_skip (Union(tuple[str], None) : Columns to not convert to boolean. - Acts as a blacklist. - Defaults to ["age_in_years", "male"]. - Default to None in which case all columns are included. - ignore_dtypes (set, optional): Skip columns with these data types. Defaults - to {"datetime64[ns]"}. 
- """ - self.columns_to_skip = columns_to_skip - self.columns_to_include = columns_to_include - self.ignore_dtypes = set(ignore_dtypes) if ignore_dtypes else None - - def fit(self, _, y=None): # pylint: disable=unused-argument - return self - - def transform(self, X, y=None): # pylint: disable=unused-argument - columns = X.columns - - if self.columns_to_include: - columns = [c for c in columns if c in self.columns_to_include] - - cols_to_round = [ - c - for c in columns - if (X[c].dtype not in self.ignore_dtypes) or c in self.columns_to_skip - ] - - for col in cols_to_round: - X[col] = X[col].notnull() - - return X - - -class DateTimeConverter(BaseEstimator, TransformerMixin): - """Convert datetime columns to integers.""" - - valid_types = {"ordinal"} - datetime_dtypes = {"datetime64[ns]", " None: + self.cfg = cfg + + @print_df_dimensions_diff + def _drop_cols_not_in_lookbehind_combination( + self, + dataset: pd.DataFrame, + ) -> pd.DataFrame: + """Drop predictor columns that are not in the specified combination of + lookbehind windows. + + Args: + dataset (pd.DataFrame): Dataset. + + Returns: + pd.DataFrame: Dataset with dropped columns. 
+ """ + + if not self.cfg.preprocessing.pre_split.lookbehind_combination: + raise ValueError("No lookbehind_combination provided.") + + # Extract all unique lookbhehinds in the dataset predictors + lookbehinds_in_dataset = { + int(infer_look_distance(col)[0]) + for col in infer_predictor_col_name(df=dataset) + if len(infer_look_distance(col)) > 0 + } + + # Convert list to set + lookbehinds_in_spec = set( + self.cfg.preprocessing.pre_split.lookbehind_combination, + ) + + # Check that all loobehinds in lookbehind_combination are used in the predictors + if not lookbehinds_in_spec.issubset( + lookbehinds_in_dataset, + ): + msg.warn( + f"One or more of the provided lookbehinds in lookbehind_combination is/are not used in any predictors in the dataset: {lookbehinds_in_spec - lookbehinds_in_dataset}", + ) + + lookbehinds_to_keep = lookbehinds_in_spec.intersection( + lookbehinds_in_dataset, + ) + + if not lookbehinds_to_keep: + raise ValueError("No predictors left after dropping lookbehinds.") + + msg.warn(f"Training on {lookbehinds_to_keep}.") + else: + lookbehinds_to_keep = lookbehinds_in_spec + + # Create a list of all predictor columns who have a lookbehind window not in lookbehind_combination list + cols_to_drop = [ + c + for c in infer_predictor_col_name(df=dataset) + if all(str(l_beh) not in c for l_beh in lookbehinds_to_keep) + ] + + cols_to_drop = [c for c in cols_to_drop if "within" in c] + # ? Add some specification of within_x_days indicating how to parse columns to find lookbehinds. Or, alternatively, use the column spec. + + dataset = dataset.drop(columns=cols_to_drop) + return dataset + + @print_df_dimensions_diff + def _drop_cols_if_exceeds_look_direction_threshold( + self, + dataset: pd.DataFrame, + look_direction_threshold: Union[int, float], + direction: str, + ) -> pd.DataFrame: + """Drop columns if they look behind or ahead longer than a specified + threshold. 
+ + For example, if direction is "ahead", and n_days is 30, then the column + should be dropped if it's trying to look 60 days ahead. This is useful + to avoid some rows having more information than others. + + Args: + dataset (pd.DataFrame): Dataset to process. + look_direction_threshold (Union[int, float]): Number of days to look in the direction. + direction (str): Direction to look. Allowed are ["ahead", "behind"]. + + Returns: + pd.DataFrame: Dataset without the dropped columns. + """ + + cols_to_drop = [] + + n_cols_before_modification = dataset.shape[1] + + if direction == "behind": + cols_to_process = infer_predictor_col_name(df=dataset) + + for col in cols_to_process: + # Extract lookbehind days from column name use regex + # E.g. "column_name_within_90_days" == 90 + # E.g. "column_name_within_90_days_fallback_NaN" == 90 + lookbehind_days_strs = re.findall(r"within_(\d+)_days", col) + + if len(lookbehind_days_strs) > 0: + lookbehind_days = int(lookbehind_days_strs[0]) + else: + msg.warn(f"Could not extract lookbehind days from {col}") + continue + + if lookbehind_days > look_direction_threshold: + cols_to_drop.append(col) + + n_cols_after_modification = dataset.shape[1] + percent_dropped = get_percent_lost( + n_before=n_cols_before_modification, + n_after=n_cols_after_modification, + ) + + if n_cols_before_modification - n_cols_after_modification != 0: + msg.info( + f"Dropped {n_cols_before_modification - n_cols_after_modification} ({percent_dropped}%) columns because they were looking {direction} further out than {look_direction_threshold} days.", + ) + + return dataset[[c for c in dataset.columns if c not in cols_to_drop]] + + @print_df_dimensions_diff + def _keep_unique_outcome_col_with_lookahead_days_matching_conf( + self, + dataset: pd.DataFrame, + ) -> pd.DataFrame: + """Keep only one outcome column with the same lookahead days as set in + the config.""" + outcome_cols = infer_outcome_col_name(df=dataset, allow_multiple=True) + + col_to_drop = [ + c 
+ for c in outcome_cols + if str(self.cfg.preprocessing.pre_split.min_lookahead_days) not in c + ] + + # If no columns to drop, return the dataset + if not col_to_drop: + return dataset + + df = dataset.drop(col_to_drop, axis=1) + + if not len(infer_outcome_col_name(df)) == 1: + raise ValueError( + "Returning more than one outcome column, will cause problems during eval.", + ) + + return df + + @staticmethod + def _drop_datetime_columns( + pred_prefix: str, + dataset: pd.DataFrame, + drop_dtypes: tuple = ("datetime64[ns]", " pd.DataFrame: + """Drop all datetime columns from the dataset.""" + columns_to_drop = [ + c for c in dataset.columns if dataset[c].dtype in drop_dtypes + ] + columns_to_drop = [c for c in columns_to_drop if c.startswith(pred_prefix)] + + return dataset[[c for c in dataset.columns if c not in columns_to_drop]] + + @print_df_dimensions_diff + def n_outcome_col_names(self, df: pd.DataFrame) -> int: + """How many outcome columns there are in a dataframe.""" + return len(infer_outcome_col_name(df=df, allow_multiple=True)) + + def filter(self, dataset: pd.DataFrame) -> pd.DataFrame: + """Filter a dataframe based on the config.""" + for direction in ("ahead", "behind"): + if direction == "ahead": + n_days = self.cfg.preprocessing.pre_split.min_lookahead_days + elif direction == "behind": + n_days = max(self.cfg.preprocessing.pre_split.lookbehind_combination) + + dataset = self._drop_cols_if_exceeds_look_direction_threshold( + dataset=dataset, + look_direction_threshold=n_days, + direction=direction, + ) + + if self.cfg.preprocessing.pre_split.lookbehind_combination: + dataset = self._drop_cols_not_in_lookbehind_combination(dataset=dataset) + + dataset = self._keep_unique_outcome_col_with_lookahead_days_matching_conf( + dataset=dataset, + ) + + if self.cfg.preprocessing.pre_split.drop_datetime_predictor_columns: + dataset = self._drop_datetime_columns( + pred_prefix=self.cfg.data.pred_prefix, + dataset=dataset, + ) + + return dataset diff --git 
a/src/psycop_model_training/preprocessing/pre_split/processors/row_filter.py b/src/psycop_model_training/preprocessing/pre_split/processors/row_filter.py new file mode 100644 index 00000000..0fc38bd1 --- /dev/null +++ b/src/psycop_model_training/preprocessing/pre_split/processors/row_filter.py @@ -0,0 +1,159 @@ +"""Row filter for pre-split data.""" +from datetime import timedelta +from typing import Union + +import pandas as pd + +from psycop_model_training.data_loader.data_loader import msg +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema +from psycop_model_training.utils.decorators import print_df_dimensions_diff +from psycop_model_training.utils.utils import get_percent_lost + + +class PreSplitRowFilter: + """Row filter for pre-split data.""" + + def __init__(self, cfg: FullConfigSchema): + self.cfg = cfg + + @print_df_dimensions_diff + def _drop_rows_if_datasets_ends_within_days( + self, + n_days: Union[int, float], + dataset: pd.DataFrame, + direction: str, + ) -> pd.DataFrame: + """Drop visits that lie within certain amount of days from end of + dataset. + + Args: + n_days (Union[float, int]): Number of days. + dataset (pd.DataFrame): Dataset. + direction (str): Direction to look. Allowed are ["before", "after"]. + + Returns: + pd.DataFrame: Dataset with dropped rows. 
+ """ + if not isinstance(n_days, timedelta): + n_days_timedelt: timedelta = timedelta(days=n_days) # type: ignore + + if direction not in ("ahead", "behind"): + raise ValueError(f"Direction {direction} not supported.") + + n_rows_before_modification = dataset.shape[0] + + if direction == "ahead": + max_datetime = ( + dataset[self.cfg.data.col_name.pred_timestamp].max() - n_days_timedelt + ) + before_max_dt = ( + dataset[self.cfg.data.col_name.pred_timestamp] < max_datetime + ) + dataset = dataset[before_max_dt] + elif direction == "behind": + min_datetime = ( + dataset[self.cfg.data.col_name.pred_timestamp].min() + n_days_timedelt + ) + after_min_dt = dataset[self.cfg.data.col_name.pred_timestamp] > min_datetime + dataset = dataset[after_min_dt] + + n_rows_after_modification = dataset.shape[0] + percent_dropped = get_percent_lost( + n_before=n_rows_before_modification, + n_after=n_rows_after_modification, + ) + + if n_rows_before_modification - n_rows_after_modification != 0: + msg.info( + f"Dropped {n_rows_before_modification - n_rows_after_modification} ({percent_dropped}%) rows because the end of the dataset was within {n_days} of their prediction time when looking {direction} from their prediction time", + ) + + return dataset + + @print_df_dimensions_diff + def _drop_patient_if_excluded_by_date( + self, + dataset: pd.DataFrame, + ) -> pd.DataFrame: + """Drop patients that have an exclusion event within the washin + period.""" + + n_rows_before_modification = dataset.shape[0] + + outcome_before_date = ( + dataset[self.cfg.data.col_name.exclusion_timestamp] + < self.cfg.preprocessing.pre_split.drop_patient_if_exclusion_before_date + ) + + patients_to_drop = set( + dataset[self.cfg.data.col_name.id][outcome_before_date].unique(), + ) + dataset = dataset[~dataset[self.cfg.data.col_name.id].isin(patients_to_drop)] + + n_rows_after_modification = dataset.shape[0] + + percent_dropped = get_percent_lost( + n_before=n_rows_after_modification, + 
n_after=n_rows_after_modification, + ) + + if n_rows_before_modification - n_rows_after_modification != 0: + msg.info( + f"Dropped {n_rows_before_modification - n_rows_after_modification} ({percent_dropped}%) rows because they met exclusion criteria before {self.cfg.preprocessing.pre_split.drop_patient_if_exclusion_before_date}.", + ) + else: + msg.info( + f"No rows met exclusion criteria before {self.cfg.preprocessing.pre_split.drop_patient_if_exclusion_before_date}. Didn't drop any.", + ) + + return dataset + + @print_df_dimensions_diff + def _keep_only_if_older_than_min_age(self, dataset: pd.DataFrame) -> pd.DataFrame: + """Keep only rows that are older than the minimum age specified in the + config.""" + return dataset[ + dataset[self.cfg.data.col_name.age] + >= self.cfg.preprocessing.pre_split.min_age + ] + + @print_df_dimensions_diff + def _drop_rows_after_event_time(self, dataset: pd.DataFrame) -> pd.DataFrame: + """Drop all rows where prediction timestamp is after the outcome.""" + + rows_to_drop = ( + dataset[self.cfg.data.col_name.pred_timestamp] + > dataset[self.cfg.data.col_name.outcome_timestamp] + ) + + return dataset[~rows_to_drop] + + def filter(self, dataset: pd.DataFrame): + """Run filters based on config.""" + for direction in ("ahead", "behind"): + if direction == "ahead": + n_days = self.cfg.preprocessing.pre_split.min_lookahead_days + elif direction == "behind": + n_days = max(self.cfg.preprocessing.pre_split.lookbehind_combination) + + dataset = self._drop_rows_if_datasets_ends_within_days( + n_days=n_days, + dataset=dataset, + direction=direction, + ) + + if self.cfg.preprocessing.pre_split.min_prediction_time_date: + dataset = dataset[ + dataset[self.cfg.data.col_name.pred_timestamp] + > self.cfg.preprocessing.pre_split.min_prediction_time_date + ] + + if self.cfg.preprocessing.pre_split.drop_patient_if_exclusion_before_date: + dataset = self._drop_patient_if_excluded_by_date(dataset) + + if self.cfg.preprocessing.pre_split.min_age: + 
dataset = self._keep_only_if_older_than_min_age(dataset) + + dataset = self._drop_rows_after_event_time(dataset=dataset) + + return dataset diff --git a/src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py b/src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py new file mode 100644 index 00000000..5b2c608c --- /dev/null +++ b/src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py @@ -0,0 +1,66 @@ +"""Class for formatting values before split, e.g. assigning datetime, removing +negative values etc.""" +import numpy as np +import pandas as pd + +from psycop_model_training.utils.col_name_inference import infer_predictor_col_name +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema +from psycop_model_training.utils.decorators import print_df_dimensions_diff + + +class PreSplitValueCleaner: + """Class for cleaning values before split, e.g. assigning datetime, + removing negative values etc.""" + + def __init__(self, cfg: FullConfigSchema) -> None: + self.cfg = cfg + + @staticmethod + @print_df_dimensions_diff + def convert_timestamp_dtype_and_nat(dataset: pd.DataFrame) -> pd.DataFrame: + """Convert columns with `timestamp`in their name to datetime, and + convert 0's to NaT.""" + timestamp_colnames = [col for col in dataset.columns if "timestamp" in col] + + for colname in timestamp_colnames: + if dataset[colname].dtype != "datetime64[ns]": + # Convert all 0s in colname to NaT + dataset[colname] = dataset[colname].apply( + lambda x: pd.NaT if x == "0" else x, + ) + dataset[colname] = pd.to_datetime(dataset[colname]) + + return dataset + + @print_df_dimensions_diff + def _negative_values_to_nan(self, dataset: pd.DataFrame) -> pd.DataFrame: + """Convert negative values to NaN.""" + preds = dataset[infer_predictor_col_name(df=dataset)] + + # Get all columns with negative values + cols_with_numerical_values = preds.select_dtypes(include=["number"]).columns + + 
numerical_columns_with_negative_values = [ + c for c in cols_with_numerical_values if preds[c].min() < 0 + ] + + df_to_replace = dataset[numerical_columns_with_negative_values] + + # Convert to NaN + df_to_replace[df_to_replace < 0] = np.nan + dataset[numerical_columns_with_negative_values] = df_to_replace + + return dataset + + def clean(self, dataset: pd.DataFrame) -> pd.DataFrame: + """Apply the cleaning functions to the dataset.""" + # Super hacky transformation of negative weights (?!) for chi-square. + # In the future, we want to: + # 1. Fix this in the feature generation for t2d + # 2a. See if there's a way of using feature selection that permits negative values, or + # 2b. Always use z-score normalisation? + dataset = self._negative_values_to_nan(dataset=dataset) + + dataset = self.convert_timestamp_dtype_and_nat(dataset=dataset) + + return dataset diff --git a/src/psycop_model_training/preprocessing/pre_split/processors/value_transformer.py b/src/psycop_model_training/preprocessing/pre_split/processors/value_transformer.py new file mode 100644 index 00000000..92161290 --- /dev/null +++ b/src/psycop_model_training/preprocessing/pre_split/processors/value_transformer.py @@ -0,0 +1,83 @@ +"""Pre-split value transformer. These transformations are applied before the +split. + +To avoid test/train leakage, the transformations must not use any +information about the values in the dataset. 
+""" +from datetime import datetime +from typing import Optional + +import pandas as pd +from wasabi import Printer + +from psycop_model_training.utils.col_name_inference import infer_predictor_col_name +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema +from psycop_model_training.utils.decorators import print_df_dimensions_diff + +msg = Printer(timestamp=True) + + +class PreSplitValueTransformer: + """Pre-split value transformer.""" + + def __init__(self, cfg: FullConfigSchema) -> None: + self.cfg = cfg + + @print_df_dimensions_diff + def _convert_boolean_dtypes_to_int(self, dataset: pd.DataFrame) -> pd.DataFrame: + """Convert boolean dtypes to int.""" + for col in dataset.columns: + if dataset[col].dtype == bool: + dataset[col] = dataset[col].astype(int) + + return dataset + + def _convert_datetimes_to_ordinal(self, dataset: pd.DataFrame) -> pd.DataFrame: + """Convert datetime columns to integers.""" + + datetime_dtypes = {"datetime64[ns]", " pd.DataFrame: + """Convert predictors to boolean.""" + columns = infer_predictor_col_name(df=dataset, prefix=self.cfg.data.pred_prefix) + + cols_to_round = [ + c + for c in columns + if (dataset[c].dtype not in ignore_dtypes) or c in columns_to_skip + ] + + for col in cols_to_round: + dataset[col] = dataset[col].notnull() + + return dataset + + def transform(self, dataset: pd.DataFrame) -> pd.DataFrame: + """Transform the dataset.""" + if self.cfg.preprocessing.pre_split.convert_booleans_to_int: + dataset = self._convert_boolean_dtypes_to_int(dataset=dataset) + + if self.cfg.preprocessing.pre_split.convert_datetimes_to_ordinal: + dataset = self._convert_datetimes_to_ordinal(dataset=dataset) + + if self.cfg.preprocessing.pre_split.convert_to_boolean: + dataset = self._convert_predictors_to_boolean(dataset=dataset) + + msg.info("Finished processing dataset") + + return dataset diff --git a/src/psycop_model_training/training/train_model.py b/src/psycop_model_training/training/train_and_eval.py 
similarity index 53% rename from src/psycop_model_training/training/train_model.py rename to src/psycop_model_training/training/train_and_eval.py index 822e58bf..2061d9e7 100644 --- a/src/psycop_model_training/training/train_model.py +++ b/src/psycop_model_training/training/train_and_eval.py @@ -1,44 +1,23 @@ """Training script for training a single model for predicting t2d.""" import os -import time from collections.abc import Iterable -from typing import Any, Optional +from typing import Optional -import hydra import numpy as np import pandas as pd -import wandb -from omegaconf import OmegaConf -from omegaconf.dictconfig import DictConfig from sklearn.metrics import roc_auc_score from sklearn.model_selection import StratifiedGroupKFold from sklearn.pipeline import Pipeline from wasabi import Printer -from psycop_model_training.config.schemas import ( - FullConfigSchema, - convert_omegaconf_to_pydantic_object, -) - # from psycop_model_training.evaluation import evaluate_model -from psycop_model_training.load import load_train_and_val_from_cfg -from psycop_model_training.model_eval.dataclasses import EvalDataset, PipeMetadata -from psycop_model_training.model_eval.evaluate_model import run_full_evaluation -from psycop_model_training.preprocessing.post_split.create_pipeline import ( - create_preprocessing_pipeline, -) +from psycop_model_training.model_eval.dataclasses import EvalDataset from psycop_model_training.training.model_specs import MODELS -from psycop_model_training.utils.col_name_inference import get_col_names -from psycop_model_training.utils.utils import ( - PROJECT_ROOT, - create_wandb_folders, - eval_ds_cfg_pipe_to_disk, - flatten_nested_dict, - get_feature_importance_dict, - get_selected_features_dict, -) - -CONFIG_PATH = PROJECT_ROOT / "src" / "psycop_model_training" / "config" +from psycop_model_training.training.utils import create_eval_dataset +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema +from 
psycop_model_training.utils.utils import PROJECT_ROOT + +CONFIG_PATH = PROJECT_ROOT / "application" / "config" # Handle wandb not playing nice with joblib os.environ["WANDB_START_METHOD"] = "thread" @@ -107,28 +86,6 @@ def stratified_cross_validation( # pylint: disable=too-many-locals return train_df -def create_eval_dataset(cfg: FullConfigSchema, outcome_col_name: str, df: pd.DataFrame): - """Create an evaluation dataset object from a dataframe and - FullConfigSchema.""" - - eval_dataset = EvalDataset( - ids=df[cfg.data.col_name.id], - y=df[outcome_col_name], - y_hat_probs=df["y_hat_prob"], - y_hat_int=df["y_hat_prob"].round(), - pred_timestamps=df[cfg.data.col_name.pred_timestamp], - outcome_timestamps=df[cfg.data.col_name.outcome_timestamp], - age=df[cfg.data.col_name.age], - exclusion_timestamps=df[cfg.data.col_name.exclusion_timestamp], - ) - - if cfg.data.col_name.custom: - if cfg.data.col_name.custom.n_hba1c: - eval_dataset.custom.n_hba1c = df[cfg.data.col_name.custom.n_hba1c] - - return eval_dataset - - def train_and_eval_on_crossvalidation( cfg: FullConfigSchema, train: pd.DataFrame, @@ -262,136 +219,3 @@ def train_and_get_model_eval_df( ) return eval_dataset - - -def create_pipeline(cfg): - """Create pipeline. - - Args: - cfg (DictConfig): Config object - - Returns: - Pipeline - """ - steps = [] - preprocessing_pipe = create_preprocessing_pipeline(cfg) - if len(preprocessing_pipe.steps) != 0: - steps.append(("preprocessing", preprocessing_pipe)) - - mdl = create_model(cfg) - steps.append(("model", mdl)) - return Pipeline(steps) - - -@hydra.main( - config_path=str(CONFIG_PATH), - config_name="default_config", - version_base="1.2", -) -def main(cfg: DictConfig): - """Main function for training a single model.""" - # Save dictconfig for easier logging - if isinstance(cfg, DictConfig): - # Create flattened dict for logging to wandb - # Wandb doesn't allow configs to be nested, so we - # flatten it. 
- dict_config_to_log: dict[str, Any] = flatten_nested_dict(OmegaConf.to_container(cfg), sep=".") # type: ignore - else: - # For testing, we can take a FullConfig object instead. Simplifies boilerplate. - dict_config_to_log = cfg.__dict__ - - if not isinstance(cfg, FullConfigSchema): - cfg = convert_omegaconf_to_pydantic_object(cfg) - - msg = Printer(timestamp=True) - - create_wandb_folders() - - run = wandb.init( - project=cfg.project.name, - reinit=True, - config=dict_config_to_log, - mode=cfg.project.wandb.mode, - group=cfg.project.wandb.group, - entity=cfg.project.wandb.entity, - ) - - if run is None: - raise ValueError("Failed to initialise Wandb") - - # Add random delay based on cfg.train.random_delay_per_job to avoid - # each job needing the same resources (GPU, disk, network) at the same time - if cfg.train.random_delay_per_job_seconds: - delay = np.random.randint(0, cfg.train.random_delay_per_job_seconds) - msg.info(f"Delaying job by {delay} seconds to avoid resource competition") - time.sleep(delay) - - dataset = load_train_and_val_from_cfg(cfg) - - msg.info("Creating pipeline") - pipe = create_pipeline(cfg) - - outcome_col_name, train_col_names = get_col_names(cfg, dataset.train) - - msg.info("Training model") - eval_ds = train_and_get_model_eval_df( - cfg=cfg, - train=dataset.train, - val=dataset.val, - pipe=pipe, - outcome_col_name=outcome_col_name, - train_col_names=train_col_names, - n_splits=cfg.train.n_splits, - ) - - pipe_metadata = PipeMetadata() - - if hasattr(pipe["model"], "feature_importances_"): - pipe_metadata.feature_importances = get_feature_importance_dict(pipe=pipe) - if hasattr(pipe["preprocessing"].named_steps, "feature_selection"): - pipe_metadata.selected_features = get_selected_features_dict( - pipe=pipe, - train_col_names=train_col_names, - ) - - # Save model predictions, feature importance, and config to disk - eval_ds_cfg_pipe_to_disk( - eval_dataset=eval_ds, - cfg=cfg, - pipe_metadata=pipe_metadata, - run=run, - ) - - if 
cfg.project.wandb.mode == "run" or cfg.eval.force: - msg.info("Evaluating model.") - - upload_to_wandb = cfg.project.wandb.mode == "run" - - run_full_evaluation( - cfg=cfg, - eval_dataset=eval_ds, - run=run, - pipe_metadata=pipe_metadata, - save_dir=PROJECT_ROOT / "wandb" / "plots" / run.name, - upload_to_wandb=upload_to_wandb, - ) - - roc_auc = roc_auc_score( - eval_ds.y, - eval_ds.y_hat_probs, - ) - - msg.info(f"ROC AUC: {roc_auc}") - run.log( - { - "roc_auc_unweighted": roc_auc, - "lookbehind": max(cfg.data.lookbehind_combination), - "lookahead": cfg.data.min_lookahead_days, - }, - ) - run.finish() - return roc_auc - - -if __name__ == "__main__": - main() # pylint: disable=no-value-for-parameter diff --git a/src/psycop_model_training/training/utils.py b/src/psycop_model_training/training/utils.py new file mode 100644 index 00000000..962472f2 --- /dev/null +++ b/src/psycop_model_training/training/utils.py @@ -0,0 +1,26 @@ +import pandas as pd + +from psycop_model_training.model_eval.dataclasses import EvalDataset +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema + + +def create_eval_dataset(cfg: FullConfigSchema, outcome_col_name: str, df: pd.DataFrame): + """Create an evaluation dataset object from a dataframe and + FullConfigSchema.""" + + eval_dataset = EvalDataset( + ids=df[cfg.data.col_name.id], + y=df[outcome_col_name], + y_hat_probs=df["y_hat_prob"], + y_hat_int=df["y_hat_prob"].round(), + pred_timestamps=df[cfg.data.col_name.pred_timestamp], + outcome_timestamps=df[cfg.data.col_name.outcome_timestamp], + age=df[cfg.data.col_name.age], + exclusion_timestamps=df[cfg.data.col_name.exclusion_timestamp], + ) + + if cfg.data.col_name.custom: + if cfg.data.col_name.custom.n_hba1c: + eval_dataset.custom.n_hba1c = df[cfg.data.col_name.custom.n_hba1c] + + return eval_dataset diff --git a/src/psycop_model_training/utils/basemodel.py b/src/psycop_model_training/utils/basemodel.py new file mode 100644 index 00000000..058edff3 --- 
/dev/null +++ b/src/psycop_model_training/utils/basemodel.py @@ -0,0 +1,47 @@ +from typing import Any + +from pydantic import BaseModel as PydanticBaseModel +from pydantic import Extra + + +class BaseModel(PydanticBaseModel): + """.""" + + class Config: + """An pydantic basemodel, which doesn't allow attributes that are not + defined in the class.""" + + allow_mutation = False + arbitrary_types_allowed = True + extra = Extra.forbid + + def __transform_attributes_with_str_to_object( + self, + output_object: Any, + input_string: str = "str", + ): + for key, value in self.__dict__.items(): + if isinstance(value, str): + if value.lower() == input_string.lower(): + self.__dict__[key] = output_object + + def __init__( + self, + allow_mutation: bool = False, + **kwargs, + ): + super().__init__(**kwargs) + self.Config.allow_mutation = allow_mutation + + self.__transform_attributes_with_str_to_object( + input_string="null", + output_object=None, + ) + self.__transform_attributes_with_str_to_object( + input_string="false", + output_object=False, + ) + self.__transform_attributes_with_str_to_object( + input_string="true", + output_object=True, + ) diff --git a/src/psycop_model_training/utils/col_name_inference.py b/src/psycop_model_training/utils/col_name_inference.py index b09f5781..334d5dec 100644 --- a/src/psycop_model_training/utils/col_name_inference.py +++ b/src/psycop_model_training/utils/col_name_inference.py @@ -22,7 +22,8 @@ def get_col_names(cfg: DictConfig, train: pd.DataFrame) -> tuple[str, list[str]] potential_outcome_col_names = [ c for c in train.columns - if cfg.data.outc_prefix in c and str(cfg.data.min_lookahead_days) in c + if cfg.data.outc_prefix in c + and str(cfg.preprocessing.pre_split.min_lookahead_days) in c ] if len(potential_outcome_col_names) != 1: @@ -67,3 +68,52 @@ def infer_look_distance( ) return look_distances + + +def infer_col_names( + df: pd.DataFrame, + prefix: str, + allow_multiple: bool = True, +) -> list[str]: + """Infer col names 
based on prefix.""" + col_name = [c for c in df.columns if c.startswith(prefix)] + + if len(col_name) == 1: + return col_name + elif len(col_name) > 1: + if allow_multiple: + return col_name + raise ValueError( + f"Multiple columns found and allow_multiple is {allow_multiple}.", + ) + elif not col_name: + raise ValueError("No outcome col name inferred") + else: + raise ValueError("No outcomes inferred") + + +def infer_outcome_col_name( + df: pd.DataFrame, + prefix: str = "outc_", + allow_multiple: bool = True, +) -> list[str]: + """Infer the outcome column name from the dataframe.""" + return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple) + + +def infer_predictor_col_name( + df: pd.DataFrame, + prefix: str = "pred_", + allow_multiple: bool = True, +) -> list[str]: + """Get the predictors that are used in the model.""" + return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple) + + +def infer_y_hat_prob_col_name( + df: pd.DataFrame, + prefix="y_hat_prob", + allow_multiple: bool = False, +) -> list[str]: + """Infer the y_hat_prob column name from the dataframe.""" + return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple) diff --git a/src/psycop_model_training/utils/config_schemas/__init__.py b/src/psycop_model_training/utils/config_schemas/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/psycop_model_training/utils/config_schemas/conf_utils.py b/src/psycop_model_training/utils/config_schemas/conf_utils.py new file mode 100644 index 00000000..83184dc2 --- /dev/null +++ b/src/psycop_model_training/utils/config_schemas/conf_utils.py @@ -0,0 +1,96 @@ +"""Utilities for handling config objects, e.g. load, change format. + +Very useful when testing. 
+""" +from typing import Optional, Union + +from hydra import compose, initialize +from omegaconf import DictConfig, OmegaConf + +from psycop_model_training.utils.basemodel import BaseModel +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema + + +def convert_omegaconf_to_pydantic_object( + conf: DictConfig, + allow_mutation: bool = False, +) -> FullConfigSchema: + """Converts an omegaconf DictConfig to a pydantic object. + + Args: + conf (DictConfig): Omegaconf DictConfig + allow_mutation (bool, optional): Whether to make the pydantic object mutable. Defaults to False. + + Returns: + FullConfig: Pydantic object + """ + conf = OmegaConf.to_container(conf, resolve=True) # type: ignore + return FullConfigSchema(**conf, allow_mutation=allow_mutation) + + +def load_test_cfg_as_omegaconf( + config_file_name: str = "default_config", + config_dir_path_rel: str = "../../../../tests/config/", + overrides: Optional[list[str]] = None, +) -> DictConfig: + """Load config as omegaconf object.""" + with initialize(version_base=None, config_path=config_dir_path_rel): + if overrides: + cfg = compose( + config_name=config_file_name, + overrides=overrides, + ) + else: + cfg = compose( + config_name=config_file_name, + ) + + # Override the type so we can get autocomplete and renaming + # correctly working + cfg: FullConfigSchema = cfg # type: ignore + + gpu = cfg.project.gpu + + if not gpu and cfg.model.name == "xgboost": + cfg.model.args["tree_method"] = "auto" + + return cfg + + +def load_app_cfg_as_pydantic( + config_file_name: str, + allow_mutation: bool = False, + overrides: Optional[list[str]] = None, +): + """Load application cfg as pydantic object.""" + cfg = load_test_cfg_as_omegaconf( + config_file_name=config_file_name, + overrides=overrides, + config_dir_path_rel="../../../../application/config/", + ) + + return convert_omegaconf_to_pydantic_object(conf=cfg, allow_mutation=allow_mutation) + + +def load_test_cfg_as_pydantic( + 
config_file_name: str, + allow_mutation: bool = False, + overrides: Optional[list[str]] = None, +) -> FullConfigSchema: + """Load config as pydantic object.""" + cfg = load_test_cfg_as_omegaconf( + config_file_name=config_file_name, + overrides=overrides, + config_dir_path_rel="../../../../tests/config/", + ) + + return convert_omegaconf_to_pydantic_object(conf=cfg, allow_mutation=allow_mutation) + + +class WatcherSchema(BaseModel): + """Configuration for watchers.""" + + archive_all: bool + keep_alive_after_training_minutes: Union[int, float] + n_runs_before_eval: int + verbose: bool diff --git a/src/psycop_model_training/utils/config_schemas/data.py b/src/psycop_model_training/utils/config_schemas/data.py new file mode 100644 index 00000000..e7229a25 --- /dev/null +++ b/src/psycop_model_training/utils/config_schemas/data.py @@ -0,0 +1,41 @@ +from pathlib import Path +from typing import Optional, Union + +from psycop_model_training.utils.basemodel import BaseModel + + +class CustomColNames(BaseModel): + """All custom column names, i.e. columns that won't generalise across + projects.""" + + n_hba1c: str + + +class ColumnNamesSchema(BaseModel): + """Column names in the data.""" + + pred_timestamp: str # Column name for prediction times + outcome_timestamp: str # Column name for outcome timestamps + id: str # Citizen colnames + age: str # Name of the age column + exclusion_timestamp: str # Name of the exclusion timestamps column. + # Drops all visits whose pred_timestamp <= exclusion_timestamp. + + custom: Optional[CustomColNames] = None + # Column names that are custom to the given prediction problem. + + +class DataSchema(BaseModel): + """Data configuration.""" + + n_training_samples: Optional[int] + # Number of training samples to use, defaults to null in which cases it uses all samples. + + dir: Union[Path, str] # Location of the dataset + suffix: str # File suffix to load. 
+ + # Feature specs + col_name: ColumnNamesSchema + + pred_prefix: str # prefix of predictor columns + outc_prefix: str # prefix of outcome columns diff --git a/src/psycop_model_training/utils/config_schemas/eval.py b/src/psycop_model_training/utils/config_schemas/eval.py new file mode 100644 index 00000000..ef9dc0ed --- /dev/null +++ b/src/psycop_model_training/utils/config_schemas/eval.py @@ -0,0 +1,23 @@ +"""Eval config schema.""" +from psycop_model_training.utils.basemodel import BaseModel + + +class EvalConfSchema(BaseModel): + """Evaluation config.""" + + force: bool = False + # Whether to force evaluation even if wandb is not "run". Used for testing. + + top_n_feature_importances: int + # How many feature_importances to plot. Plots the most important n features. A table with all features is also logged. + + positive_rate_thresholds: list[int] + # The threshold mapping a model's predicted probability to a binary outcome can be computed if we know which positive rate we're targeting. We can't know beforehand which positive rate is best, because it's a trade-off between false-positives and false-negatives. Therefore, we compute performance for a range of positive rates. + + save_model_predictions_on_overtaci: bool + + lookahead_bins: list[int] + # List of lookahead distances for plotting. Will create bins between consecutive distances. E.g. if specifying 1, 5, 10, will bin evaluation as follows: [0, 1], [1, 5], [5, 10], [10, inf]. + + lookbehind_bins: list[int] + # List of lookbehind distances for plotting. Will create bins between consecutive distances. E.g. if specifying 1, 5, 10, will bin evaluation as follows: [0, 1], [1, 5], [5, 10], [10, inf].
diff --git a/src/psycop_model_training/utils/config_schemas/full_config.py b/src/psycop_model_training/utils/config_schemas/full_config.py new file mode 100644 index 00000000..8f8aa834 --- /dev/null +++ b/src/psycop_model_training/utils/config_schemas/full_config.py @@ -0,0 +1,21 @@ +"""Full configuration schema.""" +from psycop_model_training.utils.basemodel import BaseModel +from psycop_model_training.utils.config_schemas.data import DataSchema +from psycop_model_training.utils.config_schemas.eval import EvalConfSchema +from psycop_model_training.utils.config_schemas.model import ModelConfSchema +from psycop_model_training.utils.config_schemas.preprocessing import ( + PreprocessingConfigSchema, +) +from psycop_model_training.utils.config_schemas.project import ProjectSchema +from psycop_model_training.utils.config_schemas.train import TrainConfSchema + + +class FullConfigSchema(BaseModel): + """A recipe for a full configuration object.""" + + project: ProjectSchema + data: DataSchema + preprocessing: PreprocessingConfigSchema + model: ModelConfSchema + train: TrainConfSchema + eval: EvalConfSchema diff --git a/src/psycop_model_training/utils/config_schemas/model.py b/src/psycop_model_training/utils/config_schemas/model.py new file mode 100644 index 00000000..e750afc2 --- /dev/null +++ b/src/psycop_model_training/utils/config_schemas/model.py @@ -0,0 +1,10 @@ +"""Model configuration schemas.""" +from psycop_model_training.utils.basemodel import BaseModel + + +class ModelConfSchema(BaseModel): + """Model configuration.""" + + name: str # Model, can currently take xgboost + require_imputation: bool # Whether the model requires imputation. (shouldn't this be false?) 
+ args: dict diff --git a/src/psycop_model_training/utils/config_schemas/preprocessing.py b/src/psycop_model_training/utils/config_schemas/preprocessing.py new file mode 100644 index 00000000..c5a7aea1 --- /dev/null +++ b/src/psycop_model_training/utils/config_schemas/preprocessing.py @@ -0,0 +1,67 @@ +"""Preprocessing config schemas.""" +from datetime import datetime +from typing import Literal, Optional, Union + +from psycop_model_training.utils.basemodel import BaseModel + + +class FeatureSelectionSchema(BaseModel): + """Configuration for feature selection methods.""" + + name: Optional[str] = None + # Which feature selection method to use. + + params: Optional[dict] = None + # Parameters for the feature selection method. + + +class PreSplitPreprocessingConfigSchema(BaseModel): + """Pre split preprocessing config.""" + + drop_patient_if_exclusion_before_date: Optional[Union[str, datetime]] + # Drop all visits from a patient if the outcome is before this date. If None, no patients are dropped. + + convert_to_boolean: bool + # Convert all prediction values (except gender) to boolean. Defaults to False. Useful as a sensitivity test, i.e. "is model performance based on whether blood samples are taken, or their values". If based purely on whether blood samples are taken, might indicate that it's just predicting whatever the doctor suspected. + + convert_booleans_to_int: bool + # Whether to convert columns containing booleans to int + + drop_datetime_predictor_columns: bool + # Whether to drop datetime columns prefixed with data.pred_prefix. + # Typically, we don't want to use these as features, since they are unlikely to generalise into the future. + + convert_datetimes_to_ordinal: bool + # Whether to convert datetimes to ordinal.
+ + min_age: Union[int, float] # Minimum age to include in the dataset + + # Looking ahead + min_lookahead_days: int + # Drop all prediction times where (max timestamp in the dataset) - (current timestamp) is less than min_lookahead_days + + min_prediction_time_date: Optional[Union[str, datetime]] + # Drop all prediction times before this date. + + lookbehind_combination: Optional[list[int]] + # Which combination of features to use. Only uses features that have "within_X_days" in their column name, where X is any of the numbers in this list. + + +class PostSplitPreprocessingConfigSchema(BaseModel): + """Post split preprocessing config.""" + + imputation_method: Optional[Literal["most_frequent", "mean", "median", "null"]] + # How to replace missing values. Takes all values from the sklearn.impute.SimpleImputer class. + # https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html + + scaling: Optional[str] + # Scaling applied to all predictors after imputation. Options include "z-score-normalization". 
+ + feature_selection: FeatureSelectionSchema + + +class PreprocessingConfigSchema(BaseModel): + """Preprocessing config.""" + + pre_split: PreSplitPreprocessingConfigSchema + post_split: PostSplitPreprocessingConfigSchema diff --git a/src/psycop_model_training/utils/config_schemas/project.py b/src/psycop_model_training/utils/config_schemas/project.py new file mode 100644 index 00000000..7f71788b --- /dev/null +++ b/src/psycop_model_training/utils/config_schemas/project.py @@ -0,0 +1,19 @@ +"""Project configuration schemas.""" +from psycop_model_training.utils.basemodel import BaseModel + + +class WandbSchema(BaseModel): + """Configuration for weights and biases.""" + + group: str + mode: str + entity: str + + +class ProjectSchema(BaseModel): + """Project configuration.""" + + wandb: WandbSchema + name: str = "psycop_model_training" + seed: int + gpu: bool diff --git a/src/psycop_model_training/utils/config_schemas/train.py b/src/psycop_model_training/utils/config_schemas/train.py new file mode 100644 index 00000000..e91a0fc1 --- /dev/null +++ b/src/psycop_model_training/utils/config_schemas/train.py @@ -0,0 +1,16 @@ +from typing import Optional + +from psycop_model_training.utils.basemodel import BaseModel + + +class TrainConfSchema(BaseModel): + """Training configuration.""" + + n_splits: int # ? How do we handle whether to use crossvalidation or train/val splitting? 
+ n_trials_per_lookahead: int + n_active_trainers: int # Number of lookahead windows to train for at once + n_jobs_per_trainer: int # Number of jobs to run in parallel for each lookahead window + random_delay_per_job_seconds: Optional[ + int + ] = None # Add random delay based on cfg.train.random_delay_per_job to avoid + # each job needing the same resources (GPU, disk, network) at the same time diff --git a/src/psycop_model_training/utils/utils.py b/src/psycop_model_training/utils/utils.py index ab57d871..70931891 100644 --- a/src/psycop_model_training/utils/utils.py +++ b/src/psycop_model_training/utils/utils.py @@ -23,7 +23,7 @@ ModelEvalData, PipeMetadata, ) -from psycop_model_training.model_performance import ModelPerformance +from psycop_model_training.model_eval.model_performance import ModelPerformance SHARED_RESOURCES_PATH = Path(r"E:\shared_resources") FEATURE_SETS_PATH = SHARED_RESOURCES_PATH / "feature_sets" @@ -449,55 +449,6 @@ def load_evaluation_data(model_data_dir: Path) -> ModelEvalData: ) -def infer_col_names( - df: pd.DataFrame, - prefix: str, - allow_multiple: bool = True, -) -> list[str]: - """Infer col names based on prefix.""" - col_name = [c for c in df.columns if c.startswith(prefix)] - - if len(col_name) == 1: - return col_name - elif len(col_name) > 1: - if allow_multiple: - return col_name - raise ValueError( - f"Multiple columns found and allow_multiple is {allow_multiple}.", - ) - elif not col_name: - raise ValueError("No outcome col name inferred") - else: - raise ValueError("No outcomes inferred") - - -def infer_outcome_col_name( - df: pd.DataFrame, - prefix: str = "outc_", - allow_multiple: bool = True, -) -> list[str]: - """Infer the outcome column name from the dataframe.""" - return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple) - - -def infer_predictor_col_name( - df: pd.DataFrame, - prefix: str = "pred_", - allow_multiple: bool = True, -) -> list[str]: - """Get the predictors that are used in the model.""" 
- return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple) - - -def infer_y_hat_prob_col_name( - df: pd.DataFrame, - prefix="y_hat_prob", - allow_multiple: bool = False, -) -> list[str]: - """Infer the y_hat_prob column name from the dataframe.""" - return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple) - - def get_percent_lost(n_before: Union[int, float], n_after: Union[int, float]) -> float: """Get the percent lost.""" return round((100 * (1 - n_after / n_before)), 2) diff --git a/src/psycop_model_training/visualization/__init__.py b/src/psycop_model_training/visualization/__init__.py deleted file mode 100644 index 421566f8..00000000 --- a/src/psycop_model_training/visualization/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -"""Visualisations.""" -from .feature_importance import plot_feature_importances # noqa -from .performance_over_time import ( - plot_auc_by_time_from_first_visit, - plot_metric_by_calendar_time, - plot_metric_by_time_until_diagnosis, -) -from .prob_over_time import plot_prob_over_time # noqa diff --git a/tests/config/__init__.py b/tests/config/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/psycop_model_training/config/data/synth_data.yaml b/tests/config/data/synth_data.yaml similarity index 50% rename from src/psycop_model_training/config/data/synth_data.yaml rename to tests/config/data/synth_data.yaml index 3b891332..7d72c24e 100644 --- a/src/psycop_model_training/config/data/synth_data.yaml +++ b/tests/config/data/synth_data.yaml @@ -3,11 +3,8 @@ data: dir: tests/test_data/synth_splits suffix: csv n_training_samples: null - min_lookahead_days: 30 - min_prediction_time_date: null pred_prefix: pred_ outc_prefix: outc_ - min_age: 18 col_name: pred_timestamp: timestamp @@ -17,15 +14,3 @@ data: exclusion_timestamp: timestamp_exclusion custom: n_hba1c: hba1c_within_9999_days_count_nan - - # Looking ahead - drop_patient_if_exclusion_before_date: 1971-01-01 - - # Looking behind - 
lookbehind_combination: [30, 60, 100] - -# Parameters that will only take effect if running with --multirun -hydra: - sweeper: - params: - data.lookbehind_combination: choice([30, 90], [30]) diff --git a/src/psycop_model_training/config/integration_config.yaml b/tests/config/default_config.yaml similarity index 100% rename from src/psycop_model_training/config/integration_config.yaml rename to tests/config/default_config.yaml diff --git a/src/psycop_model_training/config/eval/evaluation_synth.yaml b/tests/config/eval/evaluation_synth.yaml similarity index 100% rename from src/psycop_model_training/config/eval/evaluation_synth.yaml rename to tests/config/eval/evaluation_synth.yaml diff --git a/tests/config/model/ebm.yaml b/tests/config/model/ebm.yaml new file mode 100644 index 00000000..3d833821 --- /dev/null +++ b/tests/config/model/ebm.yaml @@ -0,0 +1,31 @@ +# @package _global_ +model: + name: ebm # (str): Model name, explainable boosting machine + require_imputation: true # (bool): Whether the model requires imputation. 
+ args: # Documentation: https://interpret.ml/docs/ebm.html#api + max_bins: 256 + max_interaction_bins: 32 + binning: quantile + mains: all + interactions: 10 + outer_bags: 8 + inner_bags: 0 + learning_rate: 0.01 + validation_size: 0.15 + early_stopping_rounds: 50 + early_stopping_tolerance: 0.0001 + max_rounds: 5000 + min_samples_leaf: 2 + max_leaves: 3 + n_jobs: 1 + random_state: ${project.seed} + +# Parameters that will only take effect if running with --multirun +hydra: + sweeper: + params: + ++model.args.interactions: choice(0, 5, 10, 50) + ++model.args.learning_rate: interval(0.001, 0.1) + ++model.args.validation_size: interval(0.20, 0.05) + ++model.args.min_samples_leaf: choice(1, 2) + ++model.args.max_leaves: choice(2, 3, 4, 8, 16) diff --git a/tests/config/model/logistic-regression.yaml b/tests/config/model/logistic-regression.yaml new file mode 100644 index 00000000..e4b7817c --- /dev/null +++ b/tests/config/model/logistic-regression.yaml @@ -0,0 +1,25 @@ +# @package _global_ +model: + name: logistic-regression # (str): Model name + require_imputation: True # (bool): Whether the model requires imputation.
+ args: # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html + dual: False + tol: 0.0001 + C: 1.0 + fit_intercept: True + intercept_scaling: True + class_weight: Null + random_state: ${project.seed} + penalty_solver: "l2_lbfgs" # custom argument is split into penalty and solver + max_iter: 100 + l1_ratio: 0.5 + +# Parameters that will only take effect if running with --multirun +hydra: + sweeper: + params: + ++model.args.penalty_solver: choice("elasticnet_saga") + ++model.args.C: interval(1e-5, 1.0) + ++model.args.l1_ratio: interval(1e-5, 1.0) + # preprocessing + ++preprocessing.post_split.scaling: choice("null", "z-score-normalization") diff --git a/tests/config/model/naive-bayes.yaml b/tests/config/model/naive-bayes.yaml new file mode 100644 index 00000000..23899ce4 --- /dev/null +++ b/tests/config/model/naive-bayes.yaml @@ -0,0 +1,13 @@ +# @package _global_ +model: + name: naive-bayes # (str): Model name + require_imputation: True # (bool): Whether the model requires imputation. 
+ args: # https://scikit-learn.org/stable/modules/naive_bayes.html#gaussian-naive-bayes + var_smoothing: 0.000000001 + +# Parameters that will only take effect if running with --multirun +hydra: + sweeper: + params: + # preprocessing + ++preprocessing.post_split.scaling: choice(null, "z-score-normalization") diff --git a/tests/config/model/xgboost.yaml b/tests/config/model/xgboost.yaml new file mode 100644 index 00000000..b96925a6 --- /dev/null +++ b/tests/config/model/xgboost.yaml @@ -0,0 +1,20 @@ +# @package _global_ +model: + name: xgboost + require_imputation: false + args: + n_estimators: 100 + tree_method: gpu_hist # set to gpu_hist to enable GPU training (default auto) + booster: gbtree + +# Parameters that will only take effect if running with --multirun +hydra: + sweeper: + params: + ++model.args.n_estimators: int(tag(log, interval(100, 1200))) + ++model.args.alpha: tag(log, interval(1e-8, 0.1)) + ++model.args.lambda: tag(log, interval(1e-8, 1.0)) + ++model.args.max_depth: int(interval(1, 10)) + ++model.args.learning_rate: tag(log, interval(1e-8, 1)) # Multiplier during boosting, [0,1]. Lower numbers mean more conservative boosting. Default is 0.3 + ++model.args.gamma: tag(log, interval(1e-8, 0.001)) # Threshold for loss reduction per node split. If lower than threshold, stops adding nodes to branch. 
+ ++model.args.grow_policy: choice("depthwise", "lossguide") diff --git a/tests/config/preprocessing/default_preprocessing.yaml b/tests/config/preprocessing/default_preprocessing.yaml new file mode 100644 index 00000000..ee23e6b5 --- /dev/null +++ b/tests/config/preprocessing/default_preprocessing.yaml @@ -0,0 +1,29 @@ +# @package _global_ +preprocessing: + pre_split: + convert_to_boolean: false + convert_booleans_to_int: true + drop_datetime_predictor_columns: true + convert_datetimes_to_ordinal: false + drop_patient_if_exclusion_before_date: 1971-01-01 + min_prediction_time_date: null + min_lookahead_days: 30 + lookbehind_combination: [30, 60, 100] + min_age: 18 + post_split: + imputation_method: most_frequent + scaling: z-score-normalisation + feature_selection: + name: chi2 + params: + percentile: 20 # (int): Percent of features to keep. Defaults to 10. + +# Parameters that will only take effect if running with --multirun +hydra: + sweeper: + params: + ++preprocessing.post_split.imputation_method: choice("most_frequent", "mean", "median", "null") + ++preprocessing.post_split.scaling: choice("z-score-normalization", "null") + ++preprocessing.post_split.feature_selection.name: choice("chi2", "null") + ++preprocessing.post_split.feature_selection.params.percentile: int(tag(log, interval(1, 90))) + preprocessing.pre_split.lookbehind_combination: choice([30, 90], [30]) diff --git a/src/psycop_model_training/config/project/integration_test_project.yaml b/tests/config/project/integration_test_project.yaml similarity index 67% rename from src/psycop_model_training/config/project/integration_test_project.yaml rename to tests/config/project/integration_test_project.yaml index 382a974e..98a8fdc4 100644 --- a/src/psycop_model_training/config/project/integration_test_project.yaml +++ b/tests/config/project/integration_test_project.yaml @@ -6,10 +6,4 @@ wandb: group: integration_testing entity: psycop # Which entity to run WanDB in. 
-watcher: - archive_all: true - keep_alive_after_training_minutes: 5 - n_runs_before_eval: 1 - verbose: true - gpu: false diff --git a/tests/config/sweeper/optuna_multithread.yaml b/tests/config/sweeper/optuna_multithread.yaml new file mode 100644 index 00000000..e22c8c52 --- /dev/null +++ b/tests/config/sweeper/optuna_multithread.yaml @@ -0,0 +1,12 @@ +# @package _global_ +defaults: + - override /hydra/sweeper: optuna + - override /hydra/sweeper/sampler: tpe + - override /hydra/launcher: joblib + +hydra: + sweeper: + sampler: + seed: 123 + n_jobs: 2 + direction: maximize \ No newline at end of file diff --git a/tests/config/sweeper/optuna_singlethread.yaml b/tests/config/sweeper/optuna_singlethread.yaml new file mode 100644 index 00000000..f40bb5bd --- /dev/null +++ b/tests/config/sweeper/optuna_singlethread.yaml @@ -0,0 +1,10 @@ +# @package _global_ +defaults: + - override /hydra/sweeper: optuna + - override /hydra/sweeper/sampler: tpe + +hydra: + sweeper: + sampler: + seed: 123 + direction: maximize diff --git a/tests/config/train/default_training.yaml b/tests/config/train/default_training.yaml new file mode 100644 index 00000000..074bbb8f --- /dev/null +++ b/tests/config/train/default_training.yaml @@ -0,0 +1,5 @@ +n_splits: 3 # (int, Null): Number of k-folds during CV. If Null, loads pre-defined dataset. 
+n_trials_per_lookahead: 300 +n_jobs_per_trainer: 1 +n_active_trainers: 10 +random_delay_per_job_seconds: 0 diff --git a/tests/conftest.py b/tests/conftest.py index adc5a600..0e52e585 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,13 +5,16 @@ import pandas as pd import pytest -from psycop_model_training.config.schemas import FullConfigSchema, load_cfg_as_pydantic from psycop_model_training.model_eval.dataclasses import EvalDataset +from psycop_model_training.utils.config_schemas.conf_utils import ( + FullConfigSchema, + load_test_cfg_as_pydantic, +) -CONFIG_DIR_PATH_REL = "../src/psycop_model_training/config" +CONFIG_DIR_PATH_REL = "../application/config" -def add_age_gender(df): +def add_age_gender(df: pd.DataFrame): """Add age and gender columns to dataframe. Args: @@ -50,8 +53,8 @@ def synth_eval_dataset() -> EvalDataset: @pytest.fixture(scope="function") def immuteable_test_config() -> FullConfigSchema: """Get an immutable config for testing.""" - return load_cfg_as_pydantic( - config_file_name="integration_config.yaml", + return load_test_cfg_as_pydantic( + config_file_name="default_config.yaml", allow_mutation=False, ) @@ -59,7 +62,7 @@ def immuteable_test_config() -> FullConfigSchema: @pytest.fixture(scope="function") def muteable_test_config() -> FullConfigSchema: """Get a mutable config for testing.""" - return load_cfg_as_pydantic( - config_file_name="integration_config.yaml", + return load_test_cfg_as_pydantic( + config_file_name="default_config.yaml", allow_mutation=True, ) diff --git a/tests/model_evaluation/test_model_performance.py b/tests/model_evaluation/test_model_performance.py index f1cc00a4..b47d1272 100644 --- a/tests/model_evaluation/test_model_performance.py +++ b/tests/model_evaluation/test_model_performance.py @@ -6,7 +6,7 @@ import pandas as pd import pytest -from psycop_model_training.model_performance import ModelPerformance +from psycop_model_training.model_eval.model_performance import ModelPerformance # pylint: 
disable=missing-function-docstring diff --git a/tests/model_evaluation/test_visualizations.py b/tests/model_evaluation/test_visualizations.py index cd52e195..4df88ee0 100644 --- a/tests/model_evaluation/test_visualizations.py +++ b/tests/model_evaluation/test_visualizations.py @@ -11,29 +11,29 @@ from sklearn.metrics import f1_score, roc_auc_score from psycop_model_training.model_eval.dataclasses import EvalDataset -from psycop_model_training.utils.utils import PROJECT_ROOT, positive_rate_to_pred_probs -from psycop_model_training.visualization import plot_prob_over_time -from psycop_model_training.visualization.base_charts import plot_basic_chart -from psycop_model_training.visualization.feature_importance import ( +from psycop_model_training.model_eval.plots.base_charts import plot_basic_chart +from psycop_model_training.model_eval.plots.feature_importance import ( plot_feature_importances, ) -from psycop_model_training.visualization.performance_by_age import ( +from psycop_model_training.model_eval.plots.performance_by_age import ( plot_performance_by_age, ) -from psycop_model_training.visualization.performance_by_n_hba1c import ( +from psycop_model_training.model_eval.plots.performance_by_n_hba1c import ( plot_performance_by_n_hba1c, ) -from psycop_model_training.visualization.performance_over_time import ( +from psycop_model_training.model_eval.plots.performance_over_time import ( plot_auc_by_time_from_first_visit, plot_metric_by_calendar_time, plot_metric_by_cyclic_time, plot_metric_by_time_until_diagnosis, ) -from psycop_model_training.visualization.roc_auc import plot_auc_roc -from psycop_model_training.visualization.sens_over_time import ( +from psycop_model_training.model_eval.plots.prob_over_time import plot_prob_over_time +from psycop_model_training.model_eval.plots.roc_auc import plot_auc_roc +from psycop_model_training.model_eval.plots.sens_over_time import ( create_sensitivity_by_time_to_outcome_df, plot_sensitivity_by_time_to_outcome_heatmap, ) +from 
psycop_model_training.utils.utils import PROJECT_ROOT, positive_rate_to_pred_probs @pytest.fixture(scope="function") diff --git a/tests/test_configs.py b/tests/test_configs.py index 8f85115a..c07c480e 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -1,24 +1,39 @@ """Testing of config schemas.""" -from pathlib import Path import pytest from hydra import compose, initialize -from psycop_model_training.config.schemas import convert_omegaconf_to_pydantic_object +from psycop_model_training.utils.config_schemas.conf_utils import ( + convert_omegaconf_to_pydantic_object, +) from psycop_model_training.utils.utils import PROJECT_ROOT -CONFIG_DIR_PATH_ABS = PROJECT_ROOT / "src" / "psycop_model_training" / "config" -CONFIG_DIR_PATH_REL = "../src/psycop_model_training/config" +CONFIG_DIR_PATH_ABS = PROJECT_ROOT / "tests" / "config" +CONFIG_DIR_PATH_REL = "../tests/config" + +CONFIG_DIR_PATH_APP_ABS = PROJECT_ROOT / "application" / "config" def get_config_file_names() -> list[str]: """Get all config file names.""" - config_file_paths: list[Path] = list(CONFIG_DIR_PATH_ABS.glob("*.yaml")) + config_file_paths = list(CONFIG_DIR_PATH_ABS.glob("*.yaml")) + return [f"{path.stem}.yaml" for path in config_file_paths] @pytest.mark.parametrize("config_file_name", get_config_file_names()) -def test_configs(config_file_name): +def test_test_configs(config_file_name): + """Test that all configs load correctly.""" + with initialize(version_base=None, config_path=CONFIG_DIR_PATH_REL): + cfg = compose( + config_name=config_file_name, + ) + + cfg = convert_omegaconf_to_pydantic_object(conf=cfg) + + +@pytest.mark.parametrize("config_file_name", get_config_file_names()) +def test_app_configs(config_file_name): """Test that all configs load correctly.""" with initialize(version_base=None, config_path=CONFIG_DIR_PATH_REL): cfg = compose( diff --git a/tests/test_load.py b/tests/test_load.py index 3adba1e1..4f7b2cec 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -1,7 
+1,7 @@ """Testing of loader functions.""" -from psycop_model_training.config.schemas import FullConfigSchema -from psycop_model_training.load import load_train_from_cfg +from psycop_model_training.data_loader.utils import load_and_filter_train_from_cfg +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema def test_load_lookbehind_exceeds_lookbehind_threshold( @@ -11,11 +11,11 @@ def test_load_lookbehind_exceeds_lookbehind_threshold( lookbehind threshold.""" cfg = muteable_test_config - n_cols_before_filtering = load_train_from_cfg(cfg=cfg).shape[1] + n_cols_before_filtering = load_and_filter_train_from_cfg(cfg=cfg).shape[1] - cfg.data.lookbehind_combination = [30, 60] + cfg.preprocessing.pre_split.lookbehind_combination = [30, 60] - n_cols_after_filtering = load_train_from_cfg(cfg=cfg).shape[1] + n_cols_after_filtering = load_and_filter_train_from_cfg(cfg=cfg).shape[1] assert n_cols_before_filtering - n_cols_after_filtering == 2 @@ -27,10 +27,10 @@ def test_load_lookbehind_not_in_lookbehind_combination( specified lookbehind combination list.""" cfg = muteable_test_config - n_cols_before_filtering = load_train_from_cfg(cfg=cfg).shape[1] + n_cols_before_filtering = load_and_filter_train_from_cfg(cfg=cfg).shape[1] - cfg.data.lookbehind_combination = [60] + cfg.preprocessing.pre_split.lookbehind_combination = [60] - n_cols_after_filtering = load_train_from_cfg(cfg=cfg).shape[1] + n_cols_after_filtering = load_and_filter_train_from_cfg(cfg=cfg).shape[1] assert n_cols_before_filtering - n_cols_after_filtering == 3 diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index a5d042b8..86336a22 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -1,9 +1,6 @@ """Test custom preprocessing steps.""" -from psycop_model_training.config.schemas import FullConfigSchema -from psycop_model_training.load import load_train_from_cfg -from psycop_model_training.preprocessing.post_split.create_pipeline import ( - 
create_preprocessing_pipeline, -) +from psycop_model_training.data_loader.utils import load_and_filter_train_from_cfg +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema def test_drop_datetime_predictor_columns( @@ -13,14 +10,12 @@ def test_drop_datetime_predictor_columns( specified lookbehind combination list.""" cfg = muteable_test_config - cfg.preprocessing.drop_datetime_predictor_columns = True - cfg.preprocessing.imputation_method = None - cfg.preprocessing.feature_selection.name = None - cfg.preprocessing.scaling = None + cfg.preprocessing.pre_split.drop_datetime_predictor_columns = True + cfg.preprocessing.post_split.imputation_method = None + cfg.preprocessing.post_split.feature_selection.name = None + cfg.preprocessing.post_split.scaling = None cfg.data.pred_prefix = "timestamp" - pipe = create_preprocessing_pipeline(cfg=cfg) - train_df = load_train_from_cfg(cfg=cfg) - train_df = pipe.transform(X=train_df) + train_df = load_and_filter_train_from_cfg(cfg=cfg) assert len([x for x in train_df.columns if "timestamp" in x]) == 0 diff --git a/tests/test_train_model.py b/tests/test_train_model.py index 7e4152a5..80f20675 100644 --- a/tests/test_train_model.py +++ b/tests/test_train_model.py @@ -3,18 +3,21 @@ import pytest -from psycop_model_training.config.schemas import FullConfigSchema, load_cfg_as_omegaconf +from application.train_model import main from psycop_model_training.training.model_specs import MODELS -from psycop_model_training.training.train_model import main +from psycop_model_training.utils.config_schemas.conf_utils import ( + load_test_cfg_as_omegaconf, +) +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema -INTEGRATION_TEST_FILE_NAME = "integration_config.yaml" +INTEGRATION_TEST_FILE_NAME = "default_config.yaml" @pytest.mark.parametrize("model_name", MODELS.keys()) -def test_main(model_name): +def test_main(model_name: str): """Test main using a variety of model.""" - cfg: 
FullConfigSchema = load_cfg_as_omegaconf( + cfg: FullConfigSchema = load_test_cfg_as_omegaconf( config_file_name=INTEGRATION_TEST_FILE_NAME, overrides=[f"model={model_name}"], ) @@ -43,13 +46,13 @@ def test_crossvalidation(muteable_test_config: FullConfigSchema): def test_min_prediction_time_date(muteable_test_config: FullConfigSchema): """Test minimum prediction times correctly resolving the string.""" cfg = muteable_test_config - cfg.data.min_prediction_time_date = "1972-01-01" + cfg.preprocessing.pre_split.min_prediction_time_date = "1972-01-01" main(cfg) def test_feature_selection(muteable_test_config: FullConfigSchema): """Test feature selection.""" cfg = muteable_test_config - cfg.preprocessing.feature_selection.name = "mutual_info_classif" - cfg.preprocessing.feature_selection.params["percentile"] = 10 + cfg.preprocessing.post_split.feature_selection.name = "mutual_info_classif" + cfg.preprocessing.post_split.feature_selection.params["percentile"] = 10 main(cfg)