Sczbp feature importance and misc (#881)
HLasse authored Apr 16, 2024
2 parents 9457de6 + 333495b commit 82a7aad
Showing 44 changed files with 1,954 additions and 339 deletions.
@@ -0,0 +1,2 @@
[placeholder]
@estimator_steps = "miss_forest_imputation"
@@ -0,0 +1,2 @@
[placeholder]
@estimator_steps = "noop_imputation"
@@ -0,0 +1,3 @@
[placeholder]
@estimator_steps = "simple_imputation"
strategy = "mean"
@@ -0,0 +1,3 @@
[placeholder]
@estimator_steps_suggesters = "imputation_suggester"
strategies = ["most_frequent", "mean"]
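
These config fragments select steps by registry name: the @estimator_steps / @estimator_steps_suggesters key names a registered factory, and the remaining keys become its keyword arguments. A minimal sketch of that resolution, assuming a catalogue-style registry (the decorator usage later in this diff matches catalogue's API; names here are illustrative):

import catalogue

# Assumed analogue of BaselineRegistry.estimator_steps.
estimator_steps = catalogue.create("psycop", "estimator_steps")

@estimator_steps.register("simple_imputation")
def simple_imputation_step(strategy: str = "mean"):
    return ("imputer", strategy)  # stand-in for the real ModelStep

# Config resolution: look the factory up by name, call it with the other keys.
factory = estimator_steps.get("simple_imputation")
step = factory(strategy="mean")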
6 changes: 6 additions & 0 deletions psycop/common/model_training_v2/config/populate_registry.py
@@ -15,6 +15,12 @@ def populate_baseline_registry() -> None:
from ..trainer.task.estimator_steps.logistic_regression import logistic_regression_step
from ..trainer.task.estimator_steps.xgboost import xgboost_classifier_step
from ..trainer.task.estimator_steps.lightgbm import lightgbm_classifier_step
from ..trainer.task.estimator_steps.imputers import (
noop_imputation_step,
simple_imputation_step,
miss_forest_imputation_step,
ImputationSuggester,
)
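
(Registration happens as an import side effect: each @BaselineRegistry ... register(...) decorator runs when its module is imported, which is why populate_baseline_registry consists only of imports.)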

# Preprocessing
from ..trainer.preprocessing.pipeline import BaselinePreprocessingPipeline
@@ -1,5 +1,6 @@
import copy
import re
import traceback
from collections.abc import Sequence
from pathlib import Path
from typing import Any, Literal
@@ -83,7 +84,13 @@ def _optuna_objective(trial: Trial, cfg_with_resolved_suggesters: dict[str, Any]

concrete_config_schema.logger.log_config(Config(concrete_config))

-run_result = concrete_config_schema.trainer.train()
+try:
+    run_result = concrete_config_schema.trainer.train()
+except Exception as e:
+    if "Input X contains NaN" in str(e):
+        raise optuna.TrialPruned from e
+    concrete_config_schema.logger.fail(traceback.format_exc())
+    raise
return run_result.metric.value
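
Wrapping trainer.train() this way turns NaN-input failures into pruned trials, so a single incompatible configuration (e.g., an estimator that cannot handle missing values drawn together with noop imputation) no longer aborts the whole study; any other exception is logged and re-raised. A minimal, self-contained sketch of the same pruning pattern (hypothetical objective, not this repo's trainer):

import optuna

def objective(trial: optuna.Trial) -> float:
    x = trial.suggest_float("x", -10.0, 10.0)
    if x < 0:
        # Unevaluable configurations end as PRUNED rather than FAILED,
        # so the study keeps optimizing and best_trial ignores them.
        raise optuna.TrialPruned()
    return (x - 2.0) ** 2

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)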

@staticmethod
@@ -121,6 +121,16 @@ def suggest(self, trial: optuna.Trial, name: str) -> Any:
return trial.suggest_categorical(name=name, choices=self.choices)


@dataclass
class SingleValue:
"""If you don't want to search across all possible hparams"""

val: str | float

def suggest(self, trial: optuna.Trial, name: str) -> Any: # noqa: ARG002
return self.val
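
SingleValue.suggest ignores the trial entirely, which lets one hyperparameter be pinned while the rest of the space is searched. Illustration (any optuna trial works here):

pinned = SingleValue(val="median")
pinned.suggest(trial, name="imputation_strategy")  # always returns "median"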


@BaselineRegistry.suggesters.register("mock_suggester")
class MockSuggester(Suggester):
"""Suggester used only for tests. Ensures tests only break if the interface breaks, not because of implementation details in e.g. LogisticRegression."""
@@ -0,0 +1,76 @@
from typing import Literal

import optuna
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer # noqa
from sklearn.impute import IterativeImputer, SimpleImputer

from psycop.common.model_training_v2.config.baseline_registry import BaselineRegistry
from psycop.common.model_training_v2.hyperparameter_suggester.suggesters.base_suggester import (
Suggester,
)
from psycop.common.model_training_v2.hyperparameter_suggester.suggesters.suggester_spaces import (
CategoricalSpace,
CategoricalSpaceT,
)
from psycop.common.model_training_v2.trainer.task.model_step import ModelStep


class IdentityTransformer(BaseEstimator, TransformerMixin):
def __init__(self):
pass

def fit(self, input_array, y=None): # type: ignore # noqa
return self

def transform(self, input_array, y=None): # type: ignore # noqa
return input_array


@BaselineRegistry.estimator_steps.register("noop_imputation")
def noop_imputation_step() -> ModelStep:
return ("imputer", IdentityTransformer())


@BaselineRegistry.estimator_steps.register("simple_imputation")
def simple_imputation_step(
strategy: Literal["mean", "median", "most_frequent", "constant"] = "mean",
) -> ModelStep:
return ("imputer", SimpleImputer(strategy=strategy))


@BaselineRegistry.estimator_steps.register("miss_forest_imputation")
def miss_forest_imputation_step() -> ModelStep:
"""Naive implementation of missforest using sklearn's IterativeImputer"""

return ("imputer", IterativeImputer(estimator=RandomForestRegressor(), random_state=0))


IMPLEMENTED_STRATEGIES = ["mean", "median", "most_frequent", "miss_forest", "noop"]

STRATEGY2STEP = {
"mean": "simple_imputation",
"median": "simple_imputation",
"most_frequent": "simple_imputation",
"miss_forest": "miss_forest_imputation",
"noop": "noop_imputation",
}


@BaselineRegistry.estimator_steps_suggesters.register("imputation_suggester")
class ImputationSuggester(Suggester):
def __init__(self, strategies: CategoricalSpaceT):
for strategy in strategies:
if strategy not in IMPLEMENTED_STRATEGIES:
raise ValueError(f"Imputation strategy {strategy} is not implemented")

self.strategy = CategoricalSpace(choices=strategies)

def suggest_hyperparameters(self, trial: optuna.Trial) -> dict[str, str]:
strategy = self.strategy.suggest(trial, "imputation_strategy")
estimator_step_str = STRATEGY2STEP[strategy]

if strategy in ["miss_forest", "noop"]:
return {"@estimator_steps": estimator_step_str}
return {"@estimator_steps": estimator_step_str, "strategy": strategy}
@@ -101,7 +101,7 @@ def scz_bp_first_pred_to_event_stratified(eval_ds: EvalDataset, ppr: float) -> p


if __name__ == "__main__":
best_experiment = "sczbp/text_only"
best_experiment = "sczbp/structured_text"
best_pos_rate = 0.04
eval_ds = scz_bp_get_eval_ds_from_best_run_in_experiment(experiment_name=best_experiment)

@@ -1,3 +1,5 @@
import pandas as pd

from psycop.common.model_evaluation.patchwork.patchwork_grid import create_patchwork_grid
from psycop.projects.scz_bp.evaluation.figure2.auroc_by_data_type import (
plot_scz_bp_auroc_by_data_type,
@@ -12,14 +14,17 @@
from psycop.projects.scz_bp.evaluation.scz_bp_run_evaluation_suite import (
scz_bp_get_eval_ds_from_best_run_in_experiment,
)
from psycop.projects.scz_bp.model_training.populate_scz_bp_registry import populate_scz_bp_registry

if __name__ == "__main__":
populate_scz_bp_registry()
modality2experiment_mapping = modality2experiment = {
"Structured + text": "sczbp/structured_text",
"Structured only ": "sczbp/structured_only",
"Text only": "sczbp/text_only",
"Structured + text + synthetic": "sczbp/structured_text_xgboost_ddpm_3x_positives",
"Structured + text": "sczbp/structured_text_xgboost",
"Structured only ": "sczbp/structured_only-xgboost",
"Text only": "sczbp/tfidf_1000-xgboost",
}
best_experiment = "sczbp/text_only"
best_experiment = "sczbp/structured_text_xgboost_ddpm_3x_positives"
best_pos_rate = 0.04

best_eval_ds = scz_bp_get_eval_ds_from_best_run_in_experiment(experiment_name=best_experiment)
@@ -37,5 +42,6 @@

panels = [panel_a, panel_b, panel_c, panel_d]

-grid = create_patchwork_grid(plots=panels, single_plot_dimensions=(5, 5), n_in_row=2)
-grid.savefig("scz_bp_fig_2.png")
+with pd.option_context("mode.chained_assignment", None):
+    grid = create_patchwork_grid(plots=panels, single_plot_dimensions=(5, 5), n_in_row=2)
+grid.savefig(f"scz_bp_fig_2_{best_experiment.split('/')[1]}.png")
@@ -11,9 +11,14 @@
)


-def _plot_metric_by_time_to_event(df: pd.DataFrame, metric: str) -> pn.ggplot:
+def _plot_metric_by_time_to_event(
+    df: pd.DataFrame, metric: str, plot_combined: bool = False
+) -> pn.ggplot:
df["subset"] = df["subset"].replace({"bp": "BP", "scz": "SCZ", "both": "Combined"})
df["subset"] = pd.Categorical(df["subset"], ["BP", "SCZ", "Combined"])
+if not plot_combined:
+    df = df.query("subset != 'Combined'").copy()
+    df["subset"] = pd.Categorical(df["subset"], ["BP", "SCZ"])

p = (
pn.ggplot(
@@ -16,7 +16,7 @@
)

if __name__ == "__main__":
best_experiment = "sczbp/text_only"
best_experiment = "sczbp/structured_text"
best_pos_rate = 0.04

best_eval_ds = scz_bp_get_eval_ds_from_best_run_in_experiment(experiment_name=best_experiment)
Empty file.
@@ -0,0 +1,103 @@
# type: ignore
import pickle as pkl
import re
from pathlib import Path

import pandas as pd
import polars as pl
from sklearn.pipeline import Pipeline

from psycop.common.global_utils.mlflow.mlflow_data_extraction import MlflowClientWrapper


def scz_bp_parse_static_feature(full_string: str) -> str:
"""Takes a static feature name and returns a human readable version of it."""
feature_name = full_string.replace("pred_", "")

feature_capitalised = feature_name[0].upper() + feature_name[1:]

manual_overrides = {"Age_in_years": "Age (years)"}

if feature_capitalised in manual_overrides:
feature_capitalised = manual_overrides[feature_capitalised]
return feature_capitalised


def scz_bp_parse_temporal_feature(full_string: str) -> str:
feature_name = re.findall(r"pred_(.*)?_within", full_string)[0]
if "_disorders" in feature_name:
words = feature_name.split("_")
words[0] = words[0].capitalize()
feature_name = " ".join(word for word in words)

lookbehind = re.findall(r"within_(.*)?_days", full_string)[0]
resolve_multiple = re.findall(r"days_(.*)?_fallback", full_string)[0]

remove = ["all_relevant_", "aktuelt_psykisk_", r"_layer_\d_*"]
remove = "(%s)" % "|".join(remove)

feature_name = re.sub(remove, "", feature_name)
output_string = f"{feature_name} {lookbehind}-day {resolve_multiple} "
return output_string
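
A worked example of the parsing above (the feature name is illustrative, not taken from the data):

scz_bp_parse_temporal_feature("pred_f0_disorders_within_730_days_mean_fallback_0")
# feature_name: "f0_disorders" -> "F0 disorders" (disorder names are capitalised)
# lookbehind: "730"; resolve_multiple: "mean"
# returns "F0 disorders 730-day mean " (note the trailing space)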


def scz_bp_feature_name_to_readable(full_string: str) -> str:
if "within" not in full_string:
output_string = scz_bp_parse_static_feature(full_string)
else:
output_string = scz_bp_parse_temporal_feature(full_string=full_string)
return output_string


def scz_bp_generate_feature_importance_table(
pipeline: Pipeline, clf_model_name: str = "classifier"
) -> pd.DataFrame:
# Get feature importance scores
feature_importances = pipeline.named_steps[clf_model_name].feature_importances_

if hasattr(pipeline.named_steps[clf_model_name], "feature_names"):
selected_feature_names = pipeline.named_steps[clf_model_name].feature_names
elif hasattr(pipeline.named_steps[clf_model_name], "feature_name_"):
selected_feature_names = pipeline.named_steps[clf_model_name].feature_name_
elif hasattr(pipeline.named_steps[clf_model_name], "feature_names_in_"):
selected_feature_names = pipeline.named_steps[clf_model_name].feature_names_in_
else:
raise ValueError("The classifier does not implement .feature_names or .feature_name_")

# Create a DataFrame to store the feature names and their corresponding gain
feature_table = pl.DataFrame(
{"Feature Name": selected_feature_names, "Feature Importance": feature_importances}
)

# Sort the table by gain in descending order
feature_table = feature_table.sort("Feature Importance", descending=True)
# Get the top 100 features by gain
top_100_features = feature_table.head(100).with_columns(
# pl.col("Feature Importance").round(3), # noqa: ERA001
pl.col("Feature Name").apply(lambda x: scz_bp_feature_name_to_readable(x))
)

pd_df = top_100_features.to_pandas()
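# Shift to a 1-based index so the rendered table ranks features from 1 to 100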
pd_df = pd_df.reset_index()
pd_df["index"] = pd_df["index"] + 1
pd_df = pd_df.set_index("index")

return pd_df


if __name__ == "__main__":
best_experiment = "sczbp/structured_text_xgboost_ddpm_3x_positives"
best_run = MlflowClientWrapper().get_best_run_from_experiment(
experiment_name=best_experiment, metric="all_oof_BinaryAUROC"
)

with best_run.download_artifact("sklearn_pipe.pkl").open("rb") as pipe_pkl:
pipe = pkl.load(pipe_pkl)

feat_imp = scz_bp_generate_feature_importance_table(pipeline=pipe, clf_model_name="classifier")
pl.Config.set_tbl_rows(100)

with (Path(__file__).parent / f"feat_imp_100_{best_experiment.split('/')[1]}.html").open(
"w"
) as html_file:
html_file.write(feat_imp.to_html())

This file was deleted.

@@ -30,6 +30,9 @@
from psycop.common.model_training_v2.trainer.cross_validator_trainer import CrossValidatorTrainer
from psycop.common.model_training_v2.trainer.split_trainer import SplitTrainer
from psycop.projects.scz_bp.evaluation.minimal_eval_dataset import minimal_eval_dataset_from_path
from psycop.projects.scz_bp.model_training.synthetic_cv_trainer.synthetic_cv_trainer import (
SyntheticCrossValidatorTrainer,
)

populate_baseline_registry()

@@ -55,7 +58,7 @@ def scz_bp_df_to_eval_df(df: pl.DataFrame) -> EvalDataset:

def _load_validation_data_from_schema(schema: BaselineSchema) -> pl.DataFrame:
match schema.trainer:
-case CrossValidatorTrainer():
+case CrossValidatorTrainer() | SyntheticCrossValidatorTrainer():
return schema.trainer.training_data.load().collect()
case SplitTrainer():
return schema.trainer.validation_data.load().collect()
@@ -96,9 +99,7 @@ def scz_bp_get_eval_ds_from_best_run_in_experiment(experiment_name: str) -> Eval
)

# min_eval_ds = minimal_eval_dataset_from_mlflow_run(run=best_run) # noqa: ERA001
-min_eval_ds = minimal_eval_dataset_from_path(
-    Path(best_run.get_config()["project_info"]["experiment_path"]) / "eval_df.parquet"
-)
+min_eval_ds = minimal_eval_dataset_from_path(best_run.download_artifact("eval_df.parquet"))
cohort_data = cohort_metadata_from_run(
run=best_run,
cohort_metadata_cols=[