diff --git a/src/psycopt2d/config/preprocessing/default_preprocessing.yaml b/src/psycopt2d/config/preprocessing/default_preprocessing.yaml index 3abc70b5..10aa9d6a 100644 --- a/src/psycopt2d/config/preprocessing/default_preprocessing.yaml +++ b/src/psycopt2d/config/preprocessing/default_preprocessing.yaml @@ -1,8 +1,8 @@ convert_to_boolean: false convert_datetimes_to_ordinal: false -imputation_method: "most_frequent" # (str): Options include 2most_frequent" +imputation_method: "most_frequent" # (str): Options include "most_frequent" transform: null # (str|null): Transformation applied to all predictors after imputation. Options include "z-score-normalization" -feature_selection: - name: null +feature_selection: + name: null # (str|null): Options include "f_classif", "chi2" and "mutual_info_classif". The recommended choice is "mutual_info_classif". params: percentile: 10 # (int): Percent of features to keep. Defaults to 10. diff --git a/src/psycopt2d/train_model.py b/src/psycopt2d/train_model.py index b98a4dc2..d095b07e 100644 --- a/src/psycopt2d/train_model.py +++ b/src/psycopt2d/train_model.py @@ -9,7 +9,12 @@ import wandb from omegaconf import OmegaConf from omegaconf.dictconfig import DictConfig -from sklearn.feature_selection import SelectPercentile, chi2, f_classif +from sklearn.feature_selection import ( + SelectPercentile, + chi2, + f_classif, + mutual_info_classif, +) from sklearn.impute import SimpleImputer from sklearn.metrics import roc_auc_score from sklearn.model_selection import StratifiedGroupKFold @@ -90,6 +95,16 @@ def create_preprocessing_pipeline(cfg: FullConfigSchema): ), ), ) + if cfg.preprocessing.feature_selection.name == "mutual_info_classif": + steps.append( + ( + "feature_selection", + SelectPercentile( + mutual_info_classif, + percentile=cfg.preprocessing.feature_selection.params["percentile"], + ), + ), + ) return Pipeline(steps)