From e37e6f1b47b570ad801a6735ce52a73fc797dff5 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff
Date: Wed, 15 May 2024 12:31:21 +0200
Subject: [PATCH] feat: update cfg experiments (#909)

---
Review notes (placed below the three-dash line, so `git am` keeps them out of
the commit log):
 * The line structure of this patch was restored; indentation and blank
   context lines were reconstructed so that every hunk matches the line
   counts in its @@ header.
 * train_cvd_lookbehinds.py: the training paths now use the train + val
   splits, as cvd_baseline.cfg does, instead of train + test, so the
   held-out test split is not fed to the trainer.
 * train_cvd_layers.py: the aggregation regex is underscore-delimited
   (".+_(mean|min|max)_.+"), consistent with the aggregation_selector
   pattern in cvd_baseline.cfg, so "min"/"max" cannot match inside an
   unrelated part of a column name.
 * The blob ids on the "index" lines of the two edited files are the
   original ones; they are informational for a plain `git apply`.

 .../cvd/model_training/cvd_baseline.cfg       | 16 ++++----
 .../cvd/model_training/train_cvd_layers.py    | 39 +++++++++++++++++++
 .../model_training/train_cvd_lookbehinds.py   | 26 +++++++++++++
 .../cvd/model_training/train_cvd_model.py     | 10 -----
 psycop/timeseriesflattener                    |  1 -
 5 files changed, 73 insertions(+), 19 deletions(-)
 create mode 100644 psycop/projects/cvd/model_training/train_cvd_layers.py
 create mode 100644 psycop/projects/cvd/model_training/train_cvd_lookbehinds.py
 delete mode 100644 psycop/projects/cvd/model_training/train_cvd_model.py
 delete mode 160000 psycop/timeseriesflattener

diff --git a/psycop/projects/cvd/model_training/cvd_baseline.cfg b/psycop/projects/cvd/model_training/cvd_baseline.cfg
index de113deae..142c328a3 100644
--- a/psycop/projects/cvd/model_training/cvd_baseline.cfg
+++ b/psycop/projects/cvd/model_training/cvd_baseline.cfg
@@ -6,7 +6,7 @@
 
 [logger.*.mlflow]
 @loggers = "mlflow_logger"
-experiment_name = "cvd_hyperparam_tuning_joblib"
+experiment_name = "baseline_v2_cvd"
 postpone_run_creation_to_first_log = True
 
 [trainer]
@@ -21,7 +21,7 @@ group_col_name = "dw_ek_borger"
 
 [trainer.training_data]
 @data = "parquet_vertical_concatenator"
-paths = ["E:/shared_resources/cvd/feature_set/flattened_datasets/cvd_lookbehind_experiments/train.parquet", "E:/shared_resources/cvd/feature_set/flattened_datasets/cvd_lookbehind_experiments/val.parquet"]
+paths = ["E:/shared_resources/cvd/feature_set/flattened_datasets/train.parquet", "E:/shared_resources/cvd/feature_set/flattened_datasets/val.parquet"]
 validate_on_init = False
 
 #################
@@ -45,12 +45,12 @@ age_col_name = "pred_age_in_years"
 [trainer.preprocessing_pipeline.*.layer_selector]
 @preprocessing = "filter_columns_within_subset"
 subset_rule = "pred_.+layer.+"
-keep_matching = ".+_layer_(1|2|3).+"
+keep_matching = ".+_layer_(1).+"
 
-# [trainer.preprocessing_pipeline.*.aggregation_selector]
-# @preprocessing = "filter_columns_within_subset"
-# subset_rule = "pred_.+layer.+"
-# keep_matching = ".+_(mean|max|min)_.+"
+[trainer.preprocessing_pipeline.*.aggregation_selector]
+@preprocessing = "filter_columns_within_subset"
+subset_rule = "pred_.+layer.+"
+keep_matching = ".+_(mean)_.+"
 
 ## Outcomes
 [trainer.preprocessing_pipeline.*.outcome_selector]
@@ -75,7 +75,7 @@ column_expectations = [["outc_", 1], ["prediction_timestamp", 0]]
 # Estimator #
 #############
 [trainer.task.task_pipe.sklearn_pipe.*.model]
-@estimator_steps_suggesters = "lightgbm_suggester"
+@estimator_steps = "xgboost"
 
 ########
 # Task #
diff --git a/psycop/projects/cvd/model_training/train_cvd_layers.py b/psycop/projects/cvd/model_training/train_cvd_layers.py
new file mode 100644
index 000000000..90805a8c3
--- /dev/null
+++ b/psycop/projects/cvd/model_training/train_cvd_layers.py
@@ -0,0 +1,39 @@
+import logging
+from pathlib import Path
+
+import confection
+
+from psycop.common.model_training_v2.config.baseline_pipeline import train_baseline_model_from_cfg
+from psycop.common.model_training_v2.config.populate_registry import populate_baseline_registry
+from psycop.projects.cvd.model_training.populate_cvd_registry import populate_with_cvd_registry
+
+if __name__ == "__main__":
+    import coloredlogs
+
+    coloredlogs.install(  # type: ignore
+        level="INFO",
+        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+        datefmt="%Y/%m/%d %H:%M:%S",
+    )
+
+    populate_baseline_registry()
+    populate_with_cvd_registry()
+
+    for layer in range(1, 5):
+        cfg = confection.Config().from_disk(Path(__file__).parent / "cvd_baseline.cfg")
+        layers = [str(i) for i in range(1, layer + 1)]
+        cfg["trainer"]["preprocessing_pipeline"]["*"]["layer_selector"][
+            "keep_matching"
+        ] = f".+_layer_({'|'.join(layers)}).+"
+
+        logging.info(f"Training model with layers {layers}")
+        train_baseline_model_from_cfg(cfg=cfg)
+
+        if layer == 1:
+            aggs = ".+_(mean|min|max)_.+"
+            cfg["trainer"]["preprocessing_pipeline"]["*"]["aggregation_selector"][
+                "keep_matching"
+            ] = aggs
+
+            logging.info(f"Training model with {aggs}")
+            train_baseline_model_from_cfg(cfg=cfg)
diff --git a/psycop/projects/cvd/model_training/train_cvd_lookbehinds.py b/psycop/projects/cvd/model_training/train_cvd_lookbehinds.py
new file mode 100644
index 000000000..d586b2159
--- /dev/null
+++ b/psycop/projects/cvd/model_training/train_cvd_lookbehinds.py
@@ -0,0 +1,26 @@
+from pathlib import Path
+
+import confection
+
+from psycop.common.model_training_v2.config.baseline_pipeline import train_baseline_model_from_cfg
+from psycop.common.model_training_v2.config.populate_registry import populate_baseline_registry
+from psycop.projects.cvd.model_training.populate_cvd_registry import populate_with_cvd_registry
+
+if __name__ == "__main__":
+    import coloredlogs
+
+    coloredlogs.install(  # type: ignore
+        level="INFO",
+        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+        datefmt="%Y/%m/%d %H:%M:%S",
+    )
+
+    populate_baseline_registry()
+    populate_with_cvd_registry()
+
+    cfg = confection.Config().from_disk(Path(__file__).parent / "cvd_baseline.cfg")
+    cfg["trainer"]["training_data"]["paths"] = [
+        f"E:/shared_resources/cvd/feature_set/flattened_datasets/cvd_lookbehind_experiments/{split}.parquet"
+        for split in ["train", "val"]
+    ]
+    train_baseline_model_from_cfg(cfg)
diff --git a/psycop/projects/cvd/model_training/train_cvd_model.py b/psycop/projects/cvd/model_training/train_cvd_model.py
deleted file mode 100644
index 9ca186a4e..000000000
--- a/psycop/projects/cvd/model_training/train_cvd_model.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from pathlib import Path
-
-from psycop.common.model_training_v2.config.baseline_pipeline import train_baseline_model
-from psycop.common.model_training_v2.config.populate_registry import populate_baseline_registry
-from psycop.projects.cvd.model_training.populate_cvd_registry import populate_with_cvd_registry
-
-if __name__ == "__main__":
-    populate_baseline_registry()
-    populate_with_cvd_registry()
-    train_baseline_model(Path(__file__).parent / "cvd_baseline.cfg")
diff --git a/psycop/timeseriesflattener b/psycop/timeseriesflattener
deleted file mode 160000
index 4b89ece58..000000000
--- a/psycop/timeseriesflattener
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 4b89ece5849d798b283e67220602f637921c7d92