Sczbp feature importance and misc (#881)
HLasse authored Apr 16, 2024
2 parents 9457de6 + 333495b commit 82a7aad
Showing 44 changed files with 1,954 additions and 339 deletions.
@@ -0,0 +1,2 @@
[placeholder]
@estimator_steps = "miss_forest_imputation"
@@ -0,0 +1,2 @@
[placeholder]
@estimator_steps = "noop_imputation"
@@ -0,0 +1,3 @@
[placeholder]
@estimator_steps = "simple_imputation"
strategy = "mean"
@@ -0,0 +1,3 @@
[placeholder]
@estimator_steps_suggesters = "imputation_suggester"
strategies = ["most_frequent", "mean"]
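
These config fragments select steps by registry name: the @estimator_steps / @estimator_steps_suggesters key names a registered factory, and the remaining keys become its keyword arguments. A minimal sketch of that resolution, assuming a catalogue-style registry (the decorator usage later in this diff matches catalogue's API; names here are illustrative):

import catalogue

# Assumed analogue of BaselineRegistry.estimator_steps.
estimator_steps = catalogue.create("psycop", "estimator_steps")

@estimator_steps.register("simple_imputation")
def simple_imputation_step(strategy: str = "mean"):
    return ("imputer", strategy)  # stand-in for the real ModelStep

# Config resolution: look the factory up by name, call it with the other keys.
factory = estimator_steps.get("simple_imputation")
step = factory(strategy="mean")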
6 changes: 6 additions & 0 deletions psycop/common/model_training_v2/config/populate_registry.py
@@ -15,6 +15,12 @@ def populate_baseline_registry() -> None:
from ..trainer.task.estimator_steps.logistic_regression import logistic_regression_step
from ..trainer.task.estimator_steps.xgboost import xgboost_classifier_step
from ..trainer.task.estimator_steps.lightgbm import lightgbm_classifier_step
from ..trainer.task.estimator_steps.imputers import (
noop_imputation_step,
simple_imputation_step,
miss_forest_imputation_step,
ImputationSuggester,
)
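
(Registration happens as an import side effect: each @BaselineRegistry ... register(...) decorator runs when its module is imported, which is why populate_baseline_registry consists only of imports.)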

# Preprocessing
from ..trainer.preprocessing.pipeline import BaselinePreprocessingPipeline
@@ -1,5 +1,6 @@
import copy
import re
import traceback
from collections.abc import Sequence
from pathlib import Path
from typing import Any, Literal
@@ -83,7 +84,13 @@ def _optuna_objective(trial: Trial, cfg_with_resolved_suggesters: dict[str, Any]

concrete_config_schema.logger.log_config(Config(concrete_config))

-run_result = concrete_config_schema.trainer.train()
+try:
+    run_result = concrete_config_schema.trainer.train()
+except Exception as e:
+    if "Input X contains NaN" in str(e):
+        raise optuna.TrialPruned from e
+    concrete_config_schema.logger.fail(traceback.format_exc())
+    raise
return run_result.metric.value
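
Wrapping trainer.train() this way turns NaN-input failures into pruned trials, so a single incompatible configuration (e.g., an estimator that cannot handle missing values drawn together with noop imputation) no longer aborts the whole study; any other exception is logged and re-raised. A minimal, self-contained sketch of the same pruning pattern (hypothetical objective, not this repo's trainer):

import optuna

def objective(trial: optuna.Trial) -> float:
    x = trial.suggest_float("x", -10.0, 10.0)
    if x < 0:
        # Unevaluable configurations end as PRUNED rather than FAILED,
        # so the study keeps optimizing and best_trial ignores them.
        raise optuna.TrialPruned()
    return (x - 2.0) ** 2

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)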

@staticmethod
@@ -121,6 +121,16 @@ def suggest(self, trial: optuna.Trial, name: str) -> Any:
return trial.suggest_categorical(name=name, choices=self.choices)


@dataclass
class SingleValue:
"""If you don't want to search across all possible hparams"""

val: str | float

def suggest(self, trial: optuna.Trial, name: str) -> Any: # noqa: ARG002
return self.val
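
SingleValue.suggest ignores the trial entirely, which lets one hyperparameter be pinned while the rest of the space is searched. Illustration (any optuna trial works here):

pinned = SingleValue(val="median")
pinned.suggest(trial, name="imputation_strategy")  # always returns "median"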


@BaselineRegistry.suggesters.register("mock_suggester")
class MockSuggester(Suggester):
"""Suggester used only for tests. Ensures tests only break if the interface breaks, not because of implementation details in e.g. LogisticRegression."""
@@ -0,0 +1,76 @@
from typing import Literal

import optuna
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer # noqa
from sklearn.impute import IterativeImputer, SimpleImputer

from psycop.common.model_training_v2.config.baseline_registry import BaselineRegistry
from psycop.common.model_training_v2.hyperparameter_suggester.suggesters.base_suggester import (
Suggester,
)
from psycop.common.model_training_v2.hyperparameter_suggester.suggesters.suggester_spaces import (
CategoricalSpace,
CategoricalSpaceT,
)
from psycop.common.model_training_v2.trainer.task.model_step import ModelStep


class IdentityTransformer(BaseEstimator, TransformerMixin):
def __init__(self):
pass

def fit(self, input_array, y=None): # type: ignore # noqa
return self

def transform(self, input_array, y=None): # type: ignore # noqa
return input_array


@BaselineRegistry.estimator_steps.register("noop_imputation")
def noop_imputation_step() -> ModelStep:
return ("imputer", IdentityTransformer())


@BaselineRegistry.estimator_steps.register("simple_imputation")
def simple_imputation_step(
strategy: Literal["mean", "median", "most_frequent", "constant"] = "mean",
) -> ModelStep:
return ("imputer", SimpleImputer(strategy=strategy))


@BaselineRegistry.estimator_steps.register("miss_forest_imputation")
def miss_forest_imputation_step() -> ModelStep:
"""Naive implementation of missforest using sklearn's IterativeImputer"""

return ("imputer", IterativeImputer(estimator=RandomForestRegressor(), random_state=0))


IMPLEMENTED_STRATEGIES = ["mean", "median", "most_frequent", "miss_forest", "noop"]

STRATEGY2STEP = {
"mean": "simple_imputation",
"median": "simple_imputation",
"most_frequent": "simple_imputation",
"miss_forest": "miss_forest_imputation",
"noop": "noop_imputation",
}


@BaselineRegistry.estimator_steps_suggesters.register("imputation_suggester")
class ImputationSuggester(Suggester):
def __init__(self, strategies: CategoricalSpaceT):
for strategy in strategies:
if strategy not in IMPLEMENTED_STRATEGIES:
raise ValueError(f"Imputation strategy {strategy} is not implemented")

self.strategy = CategoricalSpace(choices=strategies)

def suggest_hyperparameters(self, trial: optuna.Trial) -> dict[str, str]:
strategy = self.strategy.suggest(trial, "imputation_strategy")
estimator_step_str = STRATEGY2STEP[strategy]

if strategy in ["miss_forest", "noop"]:
return {"@estimator_steps": estimator_step_str}
return {"@estimator_steps": estimator_step_str, "strategy": strategy}
@@ -101,7 +101,7 @@ def scz_bp_first_pred_to_event_stratified(eval_ds: EvalDataset, ppr: float) -> p


if __name__ == "__main__":
best_experiment = "sczbp/text_only"
best_experiment = "sczbp/structured_text"
best_pos_rate = 0.04
eval_ds = scz_bp_get_eval_ds_from_best_run_in_experiment(experiment_name=best_experiment)

@@ -1,3 +1,5 @@
import pandas as pd

from psycop.common.model_evaluation.patchwork.patchwork_grid import create_patchwork_grid
from psycop.projects.scz_bp.evaluation.figure2.auroc_by_data_type import (
plot_scz_bp_auroc_by_data_type,
@@ -12,14 +14,17 @@
from psycop.projects.scz_bp.evaluation.scz_bp_run_evaluation_suite import (
scz_bp_get_eval_ds_from_best_run_in_experiment,
)
from psycop.projects.scz_bp.model_training.populate_scz_bp_registry import populate_scz_bp_registry

if __name__ == "__main__":
populate_scz_bp_registry()
modality2experiment_mapping = modality2experiment = {
"Structured + text": "sczbp/structured_text",
"Structured only ": "sczbp/structured_only",
"Text only": "sczbp/text_only",
"Structured + text + synthetic": "sczbp/structured_text_xgboost_ddpm_3x_positives",
"Structured + text": "sczbp/structured_text_xgboost",
"Structured only ": "sczbp/structured_only-xgboost",
"Text only": "sczbp/tfidf_1000-xgboost",
}
best_experiment = "sczbp/text_only"
best_experiment = "sczbp/structured_text_xgboost_ddpm_3x_positives"
best_pos_rate = 0.04

best_eval_ds = scz_bp_get_eval_ds_from_best_run_in_experiment(experiment_name=best_experiment)
@@ -37,5 +42,6 @@

panels = [panel_a, panel_b, panel_c, panel_d]

-grid = create_patchwork_grid(plots=panels, single_plot_dimensions=(5, 5), n_in_row=2)
-grid.savefig("scz_bp_fig_2.png")
+with pd.option_context("mode.chained_assignment", None):
+    grid = create_patchwork_grid(plots=panels, single_plot_dimensions=(5, 5), n_in_row=2)
+grid.savefig(f"scz_bp_fig_2_{best_experiment.split('/')[1]}.png")
@@ -11,9 +11,14 @@
)


-def _plot_metric_by_time_to_event(df: pd.DataFrame, metric: str) -> pn.ggplot:
+def _plot_metric_by_time_to_event(
+    df: pd.DataFrame, metric: str, plot_combined: bool = False
+) -> pn.ggplot:
df["subset"] = df["subset"].replace({"bp": "BP", "scz": "SCZ", "both": "Combined"})
df["subset"] = pd.Categorical(df["subset"], ["BP", "SCZ", "Combined"])
+if not plot_combined:
+    df = df.query("subset != 'Combined'").copy()
+    df["subset"] = pd.Categorical(df["subset"], ["BP", "SCZ"])

p = (
pn.ggplot(
@@ -16,7 +16,7 @@
)

if __name__ == "__main__":
best_experiment = "sczbp/text_only"
best_experiment = "sczbp/structured_text"
best_pos_rate = 0.04

best_eval_ds = scz_bp_get_eval_ds_from_best_run_in_experiment(experiment_name=best_experiment)
Empty file.
@@ -0,0 +1,103 @@
# type: ignore
import pickle as pkl
import re
from pathlib import Path

import pandas as pd
import polars as pl
from sklearn.pipeline import Pipeline

from psycop.common.global_utils.mlflow.mlflow_data_extraction import MlflowClientWrapper


def scz_bp_parse_static_feature(full_string: str) -> str:
"""Takes a static feature name and returns a human readable version of it."""
feature_name = full_string.replace("pred_", "")

feature_capitalised = feature_name[0].upper() + feature_name[1:]

manual_overrides = {"Age_in_years": "Age (years)"}

if feature_capitalised in manual_overrides:
feature_capitalised = manual_overrides[feature_capitalised]
return feature_capitalised


def scz_bp_parse_temporal_feature(full_string: str) -> str:
feature_name = re.findall(r"pred_(.*)?_within", full_string)[0]
if "_disorders" in feature_name:
words = feature_name.split("_")
words[0] = words[0].capitalize()
feature_name = " ".join(word for word in words)

lookbehind = re.findall(r"within_(.*)?_days", full_string)[0]
resolve_multiple = re.findall(r"days_(.*)?_fallback", full_string)[0]

remove = ["all_relevant_", "aktuelt_psykisk_", r"_layer_\d_*"]
remove = "(%s)" % "|".join(remove)

feature_name = re.sub(remove, "", feature_name)
output_string = f"{feature_name} {lookbehind}-day {resolve_multiple} "
return output_string
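
A worked example of the parsing above (the feature name is illustrative, not taken from the data):

scz_bp_parse_temporal_feature("pred_f0_disorders_within_730_days_mean_fallback_0")
# feature_name: "f0_disorders" -> "F0 disorders" (disorder names are capitalised)
# lookbehind: "730"; resolve_multiple: "mean"
# returns "F0 disorders 730-day mean " (note the trailing space)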


def scz_bp_feature_name_to_readable(full_string: str) -> str:
if "within" not in full_string:
output_string = scz_bp_parse_static_feature(full_string)
else:
output_string = scz_bp_parse_temporal_feature(full_string=full_string)
return output_string


def scz_bp_generate_feature_importance_table(
pipeline: Pipeline, clf_model_name: str = "classifier"
) -> pd.DataFrame:
# Get feature importance scores
feature_importances = pipeline.named_steps[clf_model_name].feature_importances_

if hasattr(pipeline.named_steps[clf_model_name], "feature_names"):
selected_feature_names = pipeline.named_steps[clf_model_name].feature_names
elif hasattr(pipeline.named_steps[clf_model_name], "feature_name_"):
selected_feature_names = pipeline.named_steps[clf_model_name].feature_name_
elif hasattr(pipeline.named_steps[clf_model_name], "feature_names_in_"):
selected_feature_names = pipeline.named_steps[clf_model_name].feature_names_in_
else:
raise ValueError("The classifier does not implement .feature_names or .feature_name_")

# Create a DataFrame to store the feature names and their corresponding gain
feature_table = pl.DataFrame(
{"Feature Name": selected_feature_names, "Feature Importance": feature_importances}
)

# Sort the table by gain in descending order
feature_table = feature_table.sort("Feature Importance", descending=True)
# Get the top 100 features by gain
top_100_features = feature_table.head(100).with_columns(
# pl.col("Feature Importance").round(3), # noqa: ERA001
pl.col("Feature Name").apply(lambda x: scz_bp_feature_name_to_readable(x))
)

pd_df = top_100_features.to_pandas()
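# Shift to a 1-based index so the rendered table ranks features from 1 to 100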
pd_df = pd_df.reset_index()
pd_df["index"] = pd_df["index"] + 1
pd_df = pd_df.set_index("index")

return pd_df


if __name__ == "__main__":
best_experiment = "sczbp/structured_text_xgboost_ddpm_3x_positives"
best_run = MlflowClientWrapper().get_best_run_from_experiment(
experiment_name=best_experiment, metric="all_oof_BinaryAUROC"
)

with best_run.download_artifact("sklearn_pipe.pkl").open("rb") as pipe_pkl:
pipe = pkl.load(pipe_pkl)

feat_imp = scz_bp_generate_feature_importance_table(pipeline=pipe, clf_model_name="classifier")
pl.Config.set_tbl_rows(100)

with (Path(__file__).parent / f"feat_imp_100_{best_experiment.split('/')[1]}.html").open(
"w"
) as html_file:
html_file.write(feat_imp.to_html())

This file was deleted.

@@ -30,6 +30,9 @@
from psycop.common.model_training_v2.trainer.cross_validator_trainer import CrossValidatorTrainer
from psycop.common.model_training_v2.trainer.split_trainer import SplitTrainer
from psycop.projects.scz_bp.evaluation.minimal_eval_dataset import minimal_eval_dataset_from_path
from psycop.projects.scz_bp.model_training.synthetic_cv_trainer.synthetic_cv_trainer import (
SyntheticCrossValidatorTrainer,
)

populate_baseline_registry()

@@ -55,7 +58,7 @@ def scz_bp_df_to_eval_df(df: pl.DataFrame) -> EvalDataset:

def _load_validation_data_from_schema(schema: BaselineSchema) -> pl.DataFrame:
match schema.trainer:
-case CrossValidatorTrainer():
+case CrossValidatorTrainer() | SyntheticCrossValidatorTrainer():
return schema.trainer.training_data.load().collect()
case SplitTrainer():
return schema.trainer.validation_data.load().collect()
@@ -96,9 +99,7 @@ def scz_bp_get_eval_ds_from_best_run_in_experiment(experiment_name: str) -> Eval
)

# min_eval_ds = minimal_eval_dataset_from_mlflow_run(run=best_run) # noqa: ERA001
-min_eval_ds = minimal_eval_dataset_from_path(
-    Path(best_run.get_config()["project_info"]["experiment_path"]) / "eval_df.parquet"
-)
+min_eval_ds = minimal_eval_dataset_from_path(best_run.download_artifact("eval_df.parquet"))
cohort_data = cohort_metadata_from_run(
run=best_run,
cohort_metadata_cols=[