Skip to content
This repository has been archived by the owner on May 1, 2023. It is now read-only.

Commit

Permalink
Merge pull request #281 from Aarhus-Psychiatry-Research/frillecode/Au…
Browse files Browse the repository at this point in the history
…tomatic-feature-selection181

frillecode/Automatic-feature-selection181
  • Loading branch information
frillecode authored Oct 25, 2022
2 parents e58bf62 + b774066 commit d81ecf4
Show file tree
Hide file tree
Showing 12 changed files with 148 additions and 35 deletions.
26 changes: 15 additions & 11 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,12 @@ repos:
rev: v1.7.6
hooks:
- id: autoflake
args: ['--in-place', '--remove-all-unused-imports', "--ignore-init-module-imports"]
args:
[
"--in-place",
"--remove-all-unused-imports",
"--ignore-init-module-imports",
]

- repo: https://github.com/pycqa/isort
rev: 5.10.1
Expand All @@ -21,15 +26,15 @@ repos:
- id: add-trailing-comma

- repo: https://github.com/asottile/pyupgrade
rev: v3.1.0
rev: v3.1.0
hooks:
- id: pyupgrade
args: ["--py39-plus"]

- repo: https://github.com/bwhmather/ssort
rev: v0.11.6
hooks:
- id: ssort
- id: ssort

- repo: https://github.com/myint/docformatter
rev: v1.5.0
Expand All @@ -47,7 +52,7 @@ repos:
hooks:
- id: flake8
args: [--config, .flake8]

- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0 # Use the ref you want to point at
hooks:
Expand All @@ -63,20 +68,19 @@ repos:
hooks:
- id: pylint
types: [python]
args:
[
args: [
"-rn", # Only display messages
"-sn", # Don't display the score
"--disable=R,import-error" # Refactors are not important enough to block a commit.
# Unused-imports aren't testable by the github action without installing the whole project, so don't test that here.
"--disable=R,import-error", # Refactors are not important enough to block a commit.
]


# Unused-imports aren't testable by the github action without installing the whole project, so don't test that here.
- repo: local
hooks:
- id: pytest
- id: pytest
name: Run integration tests before push
entry: .venv/bin/pytest -m pre_push_test
language: script
stages: [push]
pass_filenames: false
always_run: true
always_run: true
7 changes: 5 additions & 2 deletions src/psycopt2d/config/preprocessing/default_preprocessing.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
convert_to_boolean: False # (Boolean): Convert all prediction values (except gender) to boolean. Defaults to False
convert_datetimes_to: False # (str): Options include ordinal or False
convert_datetimes_to: False # (str): Options include ordinal or False
imputation_method: "most_frequent" # (str): Options include "most_frequent"
transform: null # (str|null): Transformation applied to all predictors after imputation. Options include "z-score-normalization"
transform: null # (str|null): Transformation applied to all predictors after imputation. Options include "z-score-normalization"
feature_selection_method: null
feature_selection_params:
percentile: 10 # (int): Percent of features to keep. Defaults to 10.
3 changes: 2 additions & 1 deletion src/psycopt2d/config/project/default_project.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
name: psycop-t2d
seed: 42
wandb_mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
wandb_mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
wandb_entity: "psycop" # Optional[str]
3 changes: 2 additions & 1 deletion src/psycopt2d/config/project/integration_test_project.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
name: psycop-t2d-integration-testing
seed: 42
wandb_mode: "disabled" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
wandb_mode: "disabled" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
wandb_entity: "psycop" # Optional[str]
3 changes: 2 additions & 1 deletion src/psycopt2d/config/project/overtaci_test_project.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
name: psycop-t2d-testing
seed: 42
wandb_mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
wandb_mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
wandb_entity: "psycop" # Optional[str]
28 changes: 27 additions & 1 deletion src/psycopt2d/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@
import pandas as pd
from omegaconf.dictconfig import DictConfig
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.pipeline import Pipeline
from wandb.sdk.wandb_run import Run as wandb_run # pylint: disable=no-name-in-module
from wasabi import Printer

from psycopt2d.tables import generate_feature_importances_table
from psycopt2d.tables.performance_by_threshold import (
generate_performance_by_positive_rate_table,
)
from psycopt2d.tables.tables import feature_selection_table
from psycopt2d.utils import PROJECT_ROOT, positive_rate_to_pred_probs
from psycopt2d.visualization import (
plot_auc_by_time_from_first_visit,
Expand Down Expand Up @@ -55,7 +57,9 @@ def log_feature_importances(
def evaluate_model(
cfg,
eval_df: pd.DataFrame,
pipe: Pipeline,
y_col_name: str,
train_col_names: Iterable[str],
y_hat_prob_col_name: str,
run: wandb_run,
feature_importance_dict: Optional[dict[str, float]],
Expand All @@ -72,8 +76,10 @@ def evaluate_model(
Args:
cfg (OmegaConf): The hydra config from the run
pipe (Pipeline): Pipeline including the model
eval_df (pd.DataFrame): Evaluation split
y_col_name (str): Label column name
train_col_names (Iterable[str]): Column names for all predictors
y_hat_prob_col_name (str): Column name containing pred_proba output
run (wandb_run): WandB run to log to.
feature_importance_dict (Optional[dict[str, float]]): Dict of feature
Expand Down Expand Up @@ -101,6 +107,22 @@ def evaluate_model(
pred_timestamps = eval_df[cfg.data.pred_timestamp_col_name]
y_hat_int = np.round(y_hat_probs, 0)

if "feature_selection" in pipe["preprocessing"].named_steps:
selected_features = (
eval_df[train_col_names]
.columns[pipe["preprocessing"]["feature_selection"].get_support()]
.to_list()
)

run.log(
{
"feature_selection_table": feature_selection_table(
feature_names=train_col_names,
selected_feature_names=selected_features,
),
},
)

date_bins_ahead: Iterable[int] = cfg.evaluation.date_bins_ahead
date_bins_behind: Iterable[int] = cfg.evaluation.date_bins_behind

Expand Down Expand Up @@ -132,7 +154,11 @@ def evaluate_model(
)

msg.info(f"AUC: {auc}")
run.log({"1_minus_roc_auc_unweighted": 1 - auc})
run.log(
{
"roc_auc_unweighted": auc,
},
)

# Tables
# Performance by threshold
Expand Down
2 changes: 1 addition & 1 deletion src/psycopt2d/tables/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# pylint: disable=missing-module-docstring
from .performance_by_threshold import generate_performance_by_positive_rate_table
from .tables import auc_by_group_table, generate_feature_importances_table
from .tables import auc_by_group_df, generate_feature_importances_table
58 changes: 48 additions & 10 deletions src/psycopt2d/tables/tables.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Tables for evaluation of models."""
from collections.abc import Iterable
from collections.abc import Iterable, Sequence
from functools import partial
from typing import Union

Expand Down Expand Up @@ -28,7 +28,7 @@ def _calc_auc_and_n(
return pd.Series([auc, n], index=["AUC", "N"])


def auc_by_group_table(
def auc_by_group_df(
df: pd.DataFrame,
pred_probs_col_name: str,
outcome_col_name: str,
Expand Down Expand Up @@ -69,6 +69,21 @@ def auc_by_group_table(
return pd.concat(groups_df)


def output_table(
    output_format: str,
    df: pd.DataFrame,
) -> Union[pd.DataFrame, wandb.Table, str]:
    """Convert a dataframe to the requested output format.

    Args:
        output_format (str): The output format. Takes one of "html", "df",
            "wandb_table".
        df (pd.DataFrame): The table to output.

    Returns:
        Union[pd.DataFrame, wandb.Table, str]: An HTML string for "html", a
            dataframe with a fresh default index for "df", or a wandb.Table
            built from df for "wandb_table".

    Raises:
        ValueError: If output_format is not one of the supported formats.
    """
    if output_format == "html":
        # The original index is dropped so it is not rendered in the HTML.
        return df.reset_index(drop=True).to_html()
    if output_format == "df":
        return df.reset_index(drop=True)
    if output_format == "wandb_table":
        return wandb.Table(dataframe=df)

    # Name the rejected value and the accepted ones so a misconfigured caller
    # can be fixed without reading this function.
    raise ValueError(
        "Output format does not match anything that is allowed. "
        f"Got {output_format!r}; expected one of 'html', 'df', 'wandb_table'.",
    )


def generate_feature_importances_table(
feature_names: Iterable[str],
feature_importances: Iterable[str],
Expand All @@ -90,11 +105,34 @@ def generate_feature_importances_table(
)
df = df.sort_values("feature_importance", ascending=False)

if output_format == "html":
return df.reset_index(drop=True).to_html()
elif output_format == "df":
return df.reset_index(drop=True)
elif output_format == "wandb_table":
return wandb.Table(dataframe=df)
else:
raise ValueError("Output format does not match anything that is allowed")
return output_table(output_format=output_format, df=df)


def feature_selection_table(
    feature_names: Sequence[str],
    selected_feature_names: Sequence[str],
    output_format: str = "wandb_table",
    removed_first: bool = True,
) -> Union[pd.DataFrame, wandb.Table, str]:
    """Get table with feature selection results.

    Args:
        feature_names (Sequence[str]): The names of the features
        selected_feature_names (Sequence[str]): The names of the selected features
        output_format (str, optional): The output format. Takes one of "html", "df", "wandb_table". Defaults to "wandb_table".
        removed_first (bool, optional): Ordering of features in the table, whether the removed features are first. Defaults to True.

    Returns:
        Union[pd.DataFrame, wandb.Table, str]: Table with one row per feature
            and an "is_removed" column (1 if the feature was not selected,
            0 if it was), in the format given by output_format.
    """
    # Build the lookup once so each membership test is O(1).
    selected = set(selected_feature_names)

    df = pd.DataFrame(
        {
            "train_col_names": feature_names,
            "is_removed": [0 if name in selected else 1 for name in feature_names],
        },
    )

    # is_removed is 1 for removed features, so showing them first requires a
    # descending sort. A stable sort keeps the input feature order within the
    # removed and kept groups.
    df = df.sort_values("is_removed", ascending=not removed_first, kind="stable")

    return output_table(output_format=output_format, df=df)
34 changes: 29 additions & 5 deletions src/psycopt2d/train_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import pandas as pd
import wandb
from omegaconf.dictconfig import DictConfig
from sklearn.feature_selection import SelectPercentile, chi2, f_classif
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold
Expand All @@ -18,9 +19,12 @@
from wasabi import Printer

from psycopt2d.evaluation import evaluate_model
from psycopt2d.feature_transformers import ConvertToBoolean, DateTimeConverter
from psycopt2d.load import load_train_and_val_from_cfg
from psycopt2d.models import MODELS
from psycopt2d.preprocessing.feature_transformers import (
ConvertToBoolean,
DateTimeConverter,
)
from psycopt2d.utils import (
create_wandb_folders,
flatten_nested_dict,
Expand Down Expand Up @@ -60,6 +64,27 @@ def create_preprocessing_pipeline(cfg):
("z-score-normalization", StandardScaler()),
)

if cfg.preprocessing.feature_selection_method == "f_classif":
steps.append(
(
"feature_selection",
SelectPercentile(
f_classif,
percentile=cfg.preprocessing.feature_selection_params.percentile,
),
),
)
if cfg.preprocessing.feature_selection_method == "chi2":
steps.append(
(
"feature_selection",
SelectPercentile(
chi2,
percentile=cfg.preprocessing.feature_selection_params.percentile,
),
),
)

return Pipeline(steps)


Expand All @@ -73,8 +98,7 @@ def create_model(cfg):
training_arguments = getattr(cfg.model, "args")
model_args.update(training_arguments)

mdl = model_dict["model"](**model_args)
return mdl
return model_dict["model"](**model_args)


def stratified_cross_validation(
Expand Down Expand Up @@ -271,8 +295,7 @@ def create_pipeline(cfg):

mdl = create_model(cfg)
steps.append(("model", mdl))
pipe = Pipeline(steps)
return pipe
return Pipeline(steps)


def get_col_names(cfg: DictConfig, train: pd.DataFrame) -> tuple[str, list[str]]:
Expand Down Expand Up @@ -318,6 +341,7 @@ def main(cfg):
config=flatten_nested_dict(cfg, sep="."),
mode=cfg.project.wandb_mode,
group=today_str,
entity=cfg.project.wandb_entity,
)

dataset = load_train_and_val_from_cfg(cfg)
Expand Down
4 changes: 2 additions & 2 deletions tests/test_auc_by_group_table.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""table_test_auc_by_group_table."""
# pylint: disable=missing-function-docstring

from psycopt2d.tables import auc_by_group_table
from psycopt2d.tables import auc_by_group_df
from psycopt2d.utils import bin_continuous_data


Expand All @@ -11,7 +11,7 @@ def test_auc_by_group_table(synth_data):
bins=[0, 18, 30, 50, 120],
)

table = auc_by_group_table(
table = auc_by_group_df(
synth_data,
pred_probs_col_name="pred_prob",
outcome_col_name="label",
Expand Down
15 changes: 15 additions & 0 deletions tests/test_train_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,18 @@ def test_min_prediction_time_date():
],
)
main(cfg)


def test_feature_selection():
    """Run the training entry point with f_classif feature selection enabled."""
    with initialize(version_base=None, config_path=CONFIG_DIR_PATH):
        # Keep 10 percent of the predictors, selected with f_classif.
        overrides = [
            INTEGRATION_TESTING_MODEL_OVERRIDE,
            "preprocessing.feature_selection_method=f_classif",
            "preprocessing.feature_selection_params.percentile=10",
        ]
        cfg = compose(config_name=CONFIG_FILE_NAME, overrides=overrides)
        main(cfg)

0 comments on commit d81ecf4

Please sign in to comment.