Merge pull request #281 from Aarhus-Psychiatry-Research/frillecode/Au…

…tomatic-feature-selection181 frillecode/Automatic-feature-selection181
Aarhus-Psychiatry-Research · Oct 25, 2022 · d81ecf4 · d81ecf4
2 parents e58bf62 + b774066
commit d81ecf4
Show file tree

Hide file tree

Showing 12 changed files with 148 additions and 35 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -6,7 +6,12 @@ repos:
     rev: v1.7.6
     hooks:
       - id: autoflake
-        args: ['--in-place', '--remove-all-unused-imports', "--ignore-init-module-imports"]
+        args:
+          [
+            "--in-place",
+            "--remove-all-unused-imports",
+            "--ignore-init-module-imports",
+          ]
 
   - repo: https://github.com/pycqa/isort
     rev: 5.10.1
@@ -21,15 +26,15 @@ repos:
       - id: add-trailing-comma
 
   - repo: https://github.com/asottile/pyupgrade
-    rev: v3.1.0 
+    rev: v3.1.0
     hooks:
       - id: pyupgrade
         args: ["--py39-plus"]
 
   - repo: https://github.com/bwhmather/ssort
     rev: v0.11.6
     hooks:
-    - id: ssort
+      - id: ssort
 
   - repo: https://github.com/myint/docformatter
     rev: v1.5.0
@@ -47,7 +52,7 @@ repos:
     hooks:
       - id: flake8
         args: [--config, .flake8]
-        
+
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.3.0 # Use the ref you want to point at
     hooks:
@@ -63,20 +68,19 @@ repos:
     hooks:
       - id: pylint
         types: [python]
-        args:
-          [
+        args: [
             "-rn", # Only display messages
             "-sn", # Don't display the score
-            "--disable=R,import-error" # Refactors are not important enough to block a commit. 
-            # Unused-imports aren't testable by the github action without installing the whole project, so don't test that here.
+            "--disable=R,import-error", # Refactors are not important enough to block a commit.
           ]
-
+
+  # NOTE: refers to the pylint hook above (import-error is disabled there): unused-imports aren't testable by the github action without installing the whole project, so don't test that here.
   - repo: local
     hooks:
-    -   id: pytest
+      - id: pytest
         name: Run integration tests before push
         entry: .venv/bin/pytest -m pre_push_test
         language: script
         stages: [push]
         pass_filenames: false
-        always_run: true
+        always_run: true
diff --git a/src/psycopt2d/config/preprocessing/default_preprocessing.yaml b/src/psycopt2d/config/preprocessing/default_preprocessing.yaml
@@ -1,4 +1,7 @@
 convert_to_boolean: False # (Boolean): Convert all prediction values (except gender) to boolean. Defaults to False
-convert_datetimes_to: False # (str): Options include ordinal or False 
+convert_datetimes_to: False # (str): Options include ordinal or False
 imputation_method: "most_frequent" # (str): Options include "most_frequent"
-transform: null # (str|null): Transformation applied to all predictors after imputation. Options include "z-score-normalization"
+transform: null # (str|null): Transformation applied to all predictors after imputation. Options include "z-score-normalization"
+feature_selection_method: null
+feature_selection_params:
+  percentile: 10 # (int): Percent of features to keep. Defaults to 10.
diff --git a/src/psycopt2d/config/project/default_project.yaml b/src/psycopt2d/config/project/default_project.yaml
@@ -1,3 +1,4 @@
 name: psycop-t2d
 seed: 42
-wandb_mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
+wandb_mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
+wandb_entity: "psycop" # Optional[str]
diff --git a/src/psycopt2d/config/project/integration_test_project.yaml b/src/psycopt2d/config/project/integration_test_project.yaml
@@ -1,3 +1,4 @@
 name: psycop-t2d-integration-testing
 seed: 42
-wandb_mode: "disabled" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
+wandb_mode: "disabled" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
+wandb_entity: "psycop" # Optional[str]
diff --git a/src/psycopt2d/config/project/overtaci_test_project.yaml b/src/psycopt2d/config/project/overtaci_test_project.yaml
@@ -1,3 +1,4 @@
 name: psycop-t2d-testing
 seed: 42
-wandb_mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
+wandb_mode: "run" # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled"
+wandb_entity: "psycop" # Optional[str]
diff --git a/src/psycopt2d/evaluation.py b/src/psycopt2d/evaluation.py
@@ -7,13 +7,15 @@
 import pandas as pd
 from omegaconf.dictconfig import DictConfig
 from sklearn.metrics import recall_score, roc_auc_score
+from sklearn.pipeline import Pipeline
 from wandb.sdk.wandb_run import Run as wandb_run  # pylint: disable=no-name-in-module
 from wasabi import Printer
 
 from psycopt2d.tables import generate_feature_importances_table
 from psycopt2d.tables.performance_by_threshold import (
     generate_performance_by_positive_rate_table,
 )
+from psycopt2d.tables.tables import feature_selection_table
 from psycopt2d.utils import PROJECT_ROOT, positive_rate_to_pred_probs
 from psycopt2d.visualization import (
     plot_auc_by_time_from_first_visit,
@@ -55,7 +57,9 @@ def log_feature_importances(
 def evaluate_model(
     cfg,
     eval_df: pd.DataFrame,
+    pipe: Pipeline,
     y_col_name: str,
+    train_col_names: Iterable[str],
     y_hat_prob_col_name: str,
     run: wandb_run,
     feature_importance_dict: Optional[dict[str, float]],
@@ -72,8 +76,10 @@ def evaluate_model(
 
     Args:
         cfg (OmegaConf): The hydra config from the run
+        pipe (Pipeline): Pipeline including the model
         eval_df (pd.DataFrame): Evaluation split
         y_col_name (str): Label column name
+        train_col_names (Iterable[str]): Column names for all predictors
         y_hat_prob_col_name (str): Column name containing pred_proba output
         run (wandb_run): WandB run to log to.
         feature_importance_dict (Optional[dict[str, float]]): Dict of feature
@@ -101,6 +107,22 @@ def evaluate_model(
     pred_timestamps = eval_df[cfg.data.pred_timestamp_col_name]
     y_hat_int = np.round(y_hat_probs, 0)
 
+    if "feature_selection" in pipe["preprocessing"].named_steps:
+        selected_features = (
+            eval_df[train_col_names]
+            .columns[pipe["preprocessing"]["feature_selection"].get_support()]
+            .to_list()
+        )
+
+        run.log(
+            {
+                "feature_selection_table": feature_selection_table(
+                    feature_names=train_col_names,
+                    selected_feature_names=selected_features,
+                ),
+            },
+        )
+
     date_bins_ahead: Iterable[int] = cfg.evaluation.date_bins_ahead
     date_bins_behind: Iterable[int] = cfg.evaluation.date_bins_behind
 
@@ -132,7 +154,11 @@ def evaluate_model(
     )
 
     msg.info(f"AUC: {auc}")
-    run.log({"1_minus_roc_auc_unweighted": 1 - auc})
+    run.log(
+        {
+            "roc_auc_unweighted": auc,
+        },
+    )
 
     # Tables
     # Performance by threshold

diff --git a/src/psycopt2d/feature_transformers.py → ...t2d/preprocessing/feature_transformers.py b/src/psycopt2d/feature_transformers.py → ...t2d/preprocessing/feature_transformers.py
diff --git a/src/psycopt2d/tables/__init__.py b/src/psycopt2d/tables/__init__.py
@@ -1,3 +1,3 @@
 # pylint: disable=missing-module-docstring
 from .performance_by_threshold import generate_performance_by_positive_rate_table
-from .tables import auc_by_group_table, generate_feature_importances_table
+from .tables import auc_by_group_df, generate_feature_importances_table
diff --git a/src/psycopt2d/tables/tables.py b/src/psycopt2d/tables/tables.py
@@ -1,5 +1,5 @@
 """Tables for evaluation of models."""
-from collections.abc import Iterable
+from collections.abc import Iterable, Sequence
 from functools import partial
 from typing import Union
 
@@ -28,7 +28,7 @@ def _calc_auc_and_n(
     return pd.Series([auc, n], index=["AUC", "N"])
 
 
-def auc_by_group_table(
+def auc_by_group_df(
     df: pd.DataFrame,
     pred_probs_col_name: str,
     outcome_col_name: str,
@@ -69,6 +69,21 @@ def auc_by_group_table(
     return pd.concat(groups_df)
 
 
+def output_table(
+    output_format: str,
+    df: pd.DataFrame,
+) -> Union[pd.DataFrame, wandb.Table, str]:
+    """Output table in specified format.
+
+    Args:
+        output_format (str): The output format. Takes one of "html", "df",
+            "wandb_table".
+        df (pd.DataFrame): The table to convert.
+
+    Returns:
+        Union[pd.DataFrame, wandb.Table, str]: The result of ``to_html()`` for
+            "html", a dataframe with a reset index for "df", or a wandb.Table
+            built from df for "wandb_table".
+
+    Raises:
+        ValueError: If output_format is not one of the allowed options.
+    """
+    if output_format == "html":
+        return df.reset_index(drop=True).to_html()
+    if output_format == "df":
+        return df.reset_index(drop=True)
+    if output_format == "wandb_table":
+        return wandb.Table(dataframe=df)
+
+    raise ValueError("Output format does not match anything that is allowed")
+
+
 def generate_feature_importances_table(
     feature_names: Iterable[str],
     feature_importances: Iterable[str],
@@ -90,11 +105,34 @@ def generate_feature_importances_table(
     )
     df = df.sort_values("feature_importance", ascending=False)
 
-    if output_format == "html":
-        return df.reset_index(drop=True).to_html()
-    elif output_format == "df":
-        return df.reset_index(drop=True)
-    elif output_format == "wandb_table":
-        return wandb.Table(dataframe=df)
-    else:
-        raise ValueError("Output format does not match anything that is allowed")
+    return output_table(output_format=output_format, df=df)
+
+
+def feature_selection_table(
+    feature_names: Sequence[str],
+    selected_feature_names: Sequence[str],
+    output_format: str = "wandb_table",
+    removed_first: bool = True,
+) -> Union[pd.DataFrame, wandb.Table, str]:
+    """Get table with feature selection results.
+
+    Args:
+        feature_names (Sequence[str]): The names of the features
+        selected_feature_names (Sequence[str]): The names of the selected features
+        output_format (str, optional): The output format. Takes one of "html", "df", "wandb_table". Defaults to "wandb_table".
+        removed_first (bool, optional): Ordering of features in the table, whether the removed features are first. Defaults to True.
+
+    Returns:
+        Union[pd.DataFrame, wandb.Table, str]: A table with one row per feature
+            in feature_names and an "is_removed" column (1 if the feature is
+            not in selected_feature_names, otherwise 0), converted by
+            output_table to the requested format.
+    """
+    # Build the lookup once so each membership test is O(1) instead of a
+    # linear scan of the selected names per feature.
+    selected = set(selected_feature_names)
+
+    df = pd.DataFrame(
+        {
+            "train_col_names": feature_names,
+            "is_removed": [0 if name in selected else 1 for name in feature_names],
+        },
+    )
+
+    # is_removed is 1 for removed features, so they only appear first when
+    # sorting in descending order. The stable sort keeps the input order of
+    # the features within the removed and the kept group.
+    df = df.sort_values("is_removed", ascending=not removed_first, kind="stable")
+
+    return output_table(output_format=output_format, df=df)
diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py
@@ -10,6 +10,7 @@
 import pandas as pd
 import wandb
 from omegaconf.dictconfig import DictConfig
+from sklearn.feature_selection import SelectPercentile, chi2, f_classif
 from sklearn.impute import SimpleImputer
 from sklearn.metrics import roc_auc_score
 from sklearn.model_selection import StratifiedGroupKFold
@@ -18,9 +19,12 @@
 from wasabi import Printer
 
 from psycopt2d.evaluation import evaluate_model
-from psycopt2d.feature_transformers import ConvertToBoolean, DateTimeConverter
 from psycopt2d.load import load_train_and_val_from_cfg
 from psycopt2d.models import MODELS
+from psycopt2d.preprocessing.feature_transformers import (
+    ConvertToBoolean,
+    DateTimeConverter,
+)
 from psycopt2d.utils import (
     create_wandb_folders,
     flatten_nested_dict,
@@ -60,6 +64,27 @@ def create_preprocessing_pipeline(cfg):
             ("z-score-normalization", StandardScaler()),
         )
 
+    if cfg.preprocessing.feature_selection_method == "f_classif":
+        steps.append(
+            (
+                "feature_selection",
+                SelectPercentile(
+                    f_classif,
+                    percentile=cfg.preprocessing.feature_selection_params.percentile,
+                ),
+            ),
+        )
+    if cfg.preprocessing.feature_selection_method == "chi2":
+        steps.append(
+            (
+                "feature_selection",
+                SelectPercentile(
+                    chi2,
+                    percentile=cfg.preprocessing.feature_selection_params.percentile,
+                ),
+            ),
+        )
+
     return Pipeline(steps)
 
 
@@ -73,8 +98,7 @@ def create_model(cfg):
     training_arguments = getattr(cfg.model, "args")
     model_args.update(training_arguments)
 
-    mdl = model_dict["model"](**model_args)
-    return mdl
+    return model_dict["model"](**model_args)
 
 
 def stratified_cross_validation(
@@ -271,8 +295,7 @@ def create_pipeline(cfg):
 
     mdl = create_model(cfg)
     steps.append(("model", mdl))
-    pipe = Pipeline(steps)
-    return pipe
+    return Pipeline(steps)
 
 
 def get_col_names(cfg: DictConfig, train: pd.DataFrame) -> tuple[str, list[str]]:
@@ -318,6 +341,7 @@ def main(cfg):
         config=flatten_nested_dict(cfg, sep="."),
         mode=cfg.project.wandb_mode,
         group=today_str,
+        entity=cfg.project.wandb_entity,
     )
 
     dataset = load_train_and_val_from_cfg(cfg)

diff --git a/tests/test_auc_by_group_table.py b/tests/test_auc_by_group_table.py
@@ -1,7 +1,7 @@
 """table_test_auc_by_group_table."""
 # pylint: disable=missing-function-docstring
 
-from psycopt2d.tables import auc_by_group_table
+from psycopt2d.tables import auc_by_group_df
 from psycopt2d.utils import bin_continuous_data
 
 
@@ -11,7 +11,7 @@ def test_auc_by_group_table(synth_data):
         bins=[0, 18, 30, 50, 120],
     )
 
-    table = auc_by_group_table(
+    table = auc_by_group_df(
         synth_data,
         pred_probs_col_name="pred_prob",
         outcome_col_name="label",

diff --git a/tests/test_train_model.py b/tests/test_train_model.py
@@ -65,3 +65,18 @@ def test_min_prediction_time_date():
             ],
         )
         main(cfg)
+
+
+def test_feature_selection():
+    """Test that the training entry point runs with feature selection enabled.
+
+    Composes the config with the "f_classif" feature selection method and a
+    percentile of 10, then calls main. There are no explicit assertions, so
+    the test only fails if composing the config or running main raises.
+    """
+    with initialize(version_base=None, config_path=CONFIG_DIR_PATH):
+        cfg = compose(
+            config_name=CONFIG_FILE_NAME,
+            overrides=[
+                INTEGRATION_TESTING_MODEL_OVERRIDE,
+                "preprocessing.feature_selection_method=f_classif",
+                "preprocessing.feature_selection_params.percentile=10",
+                # NOTE(review): disabled override, presumably kept for manually
+                # checking the WandB logging - confirm or delete.
+                # "project.wandb_mode=run",
+            ],
+        )
+        main(cfg)