diff --git a/src/psycop_model_training/config_schemas/preprocessing.py b/src/psycop_model_training/config_schemas/preprocessing.py index 035f081b..4ceead47 100644 --- a/src/psycop_model_training/config_schemas/preprocessing.py +++ b/src/psycop_model_training/config_schemas/preprocessing.py @@ -30,6 +30,9 @@ class PreSplitPreprocessingConfigSchema(BaseModel): convert_booleans_to_int: bool = False # Whether to convert columns containing booleans to int + negative_values_to_nan: bool = True + # Whether to change negative values to NaN. Defaults to True since Chi2 cannot handle negative values. Can only be set to False if Chi2 is not used for feature selection. + drop_datetime_predictor_columns: bool = False # Whether to drop datetime columns prefixed with data.pred_prefix. # Typically, we don't want to use these as features, since they are unlikely to generalise into the future. diff --git a/src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py b/src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py index 932ea019..6ddfc1ad 100644 --- a/src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py +++ b/src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py @@ -65,7 +65,8 @@ def clean(self, dataset: pd.DataFrame) -> pd.DataFrame: # In the future, we want to: # 1a. See if there's a way of using feature selection that permits negative values, or # 1b. Always use z-score normalisation? - dataset = self._negative_values_to_nan(dataset=dataset) + if self.pre_split_cfg.negative_values_to_nan: + dataset = self._negative_values_to_nan(dataset=dataset) dataset = self.convert_timestamp_dtype_and_nat(dataset=dataset) return dataset