From 3ebd3fec7cc5c2da54b9de8ebb0dcf7be9fa54cb Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Mon, 31 Oct 2022 16:02:52 +0100 Subject: [PATCH] feat: add min_age --- src/psycopt2d/config/data/synth_data.yaml | 1 + src/psycopt2d/config/data/t2d_parquet.yaml | 1 + src/psycopt2d/load.py | 10 +++++- src/psycopt2d/utils/config_schemas.py | 42 +++++++++++----------- 4 files changed, 33 insertions(+), 21 deletions(-) diff --git a/src/psycopt2d/config/data/synth_data.yaml b/src/psycopt2d/config/data/synth_data.yaml index 6ebb395f..b74af1a6 100644 --- a/src/psycopt2d/config/data/synth_data.yaml +++ b/src/psycopt2d/config/data/synth_data.yaml @@ -7,6 +7,7 @@ data: min_lookbehind_days: 100 min_prediction_time_date: null pred_prefix: pred_ + min_age: 18 col_name: pred_timestamp: timestamp diff --git a/src/psycopt2d/config/data/t2d_parquet.yaml b/src/psycopt2d/config/data/t2d_parquet.yaml index fce870a1..c7c62ef7 100644 --- a/src/psycopt2d/config/data/t2d_parquet.yaml +++ b/src/psycopt2d/config/data/t2d_parquet.yaml @@ -4,6 +4,7 @@ data: n_training_samples: null dir: E:\shared_resources\feature_sets\t2d\feature_sets\psycop_t2d_adminmanber_416_features_2022_10_20_11_12 suffix: parquet + min_age: 18 # Patient exclusion criteria drop_patient_if_outcome_before_date: 2013-01-01 diff --git a/src/psycopt2d/load.py b/src/psycopt2d/load.py index 59725272..5f534c36 100644 --- a/src/psycopt2d/load.py +++ b/src/psycopt2d/load.py @@ -385,6 +385,11 @@ def _keep_unique_outcome_col_with_lookahead_days_matching_conf( return df + def _keep_only_if_older_than_min_age(self, dataset: pd.DataFrame) -> pd.DataFrame: + """Keep only rows whose age is at least the minimum age specified in the + config.""" + return dataset[dataset[self.cfg.data.col_name.age] >= self.cfg.data.min_age] + def n_outcome_col_names(self, df: pd.DataFrame): """How many outcome columns there are in a dataframe.""" return len(infer_outcome_col_name(df=df, allow_multiple=True)) @@ -400,7 +405,10 @@ def _process_dataset(self, 
dataset: pd.DataFrame) -> pd.DataFrame: Returns: pd.DataFrame: Processed dataset """ - dataset = self.convert_timestamp_dtype_and_nat(dataset) + dataset = self.convert_timestamp_dtype_and_nat(dataset=dataset) + + if self.cfg.data.min_age: + dataset = self._keep_only_if_older_than_min_age(dataset=dataset) if self.cfg.data.drop_patient_if_outcome_before_date: dataset = self.drop_patient_if_outcome_before_date(dataset=dataset) diff --git a/src/psycopt2d/utils/config_schemas.py b/src/psycopt2d/utils/config_schemas.py index 664dc9d5..f3c920c3 100644 --- a/src/psycopt2d/utils/config_schemas.py +++ b/src/psycopt2d/utils/config_schemas.py @@ -37,7 +37,7 @@ def __init__( self.Config.allow_mutation = allow_mutation -class WandbConf(BaseModel): +class WandbSchema(BaseModel): """Configuration for weights and biases.""" group: str @@ -45,7 +45,7 @@ class WandbConf(BaseModel): entity: str -class WatcherConf(BaseModel): +class WatcherSchema(BaseModel): """Configuration for watchers.""" archive_all: bool @@ -54,13 +54,13 @@ class WatcherConf(BaseModel): verbose: bool -class ProjectConf(BaseModel): +class ProjectSchema(BaseModel): """Project configuration.""" - wandb: WandbConf + wandb: WandbSchema name: str = "psycopt2d" seed: int - watcher: WatcherConf + watcher: WatcherSchema gpu: bool @@ -71,7 +71,7 @@ class CustomColNames(BaseModel): n_hba1c: str -class ColumnNames(BaseModel): +class ColumnNamesSchema(BaseModel): """Column names in the data.""" pred_timestamp: str # Column name for prediction times @@ -83,7 +83,7 @@ class ColumnNames(BaseModel): # Column names that are custom to the given prediction problem. -class DataConf(BaseModel): +class DataSchema(BaseModel): """Data configuration.""" n_training_samples: Optional[int] @@ -93,10 +93,12 @@ class DataConf(BaseModel): suffix: str # File suffix to load. 
# Feature specs - col_name: ColumnNames + col_name: ColumnNamesSchema pred_prefix: str # prefix of predictor columns + min_age: Union[int, float] # Minimum age to include in the dataset + # Looking ahead min_lookahead_days: int # Drop all prediction times where (max timestamp in the dataset) - (current timestamp) is less than min_lookahead_days @@ -114,7 +116,7 @@ class DataConf(BaseModel): # Which combination of features to use. Only uses features that have "within_X_days" in their column name, where X is any of the numbers in this list. -class FeatureSelectionConf(BaseModel): +class FeatureSelectionSchema(BaseModel): """Configuration for feature selection methods.""" name: Optional[str] @@ -124,7 +126,7 @@ class FeatureSelectionConf(BaseModel): # Parameters for the feature selection method. -class PreprocessingConf(BaseModel): +class PreprocessingConfigSchema(BaseModel): """Preprocessing config.""" convert_to_boolean: bool @@ -139,10 +141,10 @@ class PreprocessingConf(BaseModel): transform: Optional[str] # Transformation applied to all predictors after imputation. Options include "z-score-normalization" - feature_selection: FeatureSelectionConf + feature_selection: FeatureSelectionSchema -class ModelConf(BaseModel): +class ModelConfSchema(BaseModel): """Model configuration.""" name: str # Model, can currently take xgboost @@ -150,7 +152,7 @@ class ModelConf(BaseModel): args: dict -class TrainConf(BaseModel): +class TrainConfSchema(BaseModel): """Training configuration.""" n_splits: int # ? How do we handle whether to use crossvalidation or train/val splitting? 
@@ -159,7 +161,7 @@ class TrainConf(BaseModel): gpu: bool -class EvalConf(BaseModel): +class EvalConfSchema(BaseModel): """Evaluation config.""" force: bool = False @@ -183,12 +185,12 @@ class EvalConf(BaseModel): class FullConfigSchema(BaseModel): """A recipe for a full configuration object.""" - project: ProjectConf - data: DataConf - preprocessing: PreprocessingConf - model: ModelConf - train: TrainConf - eval: EvalConf + project: ProjectSchema + data: DataSchema + preprocessing: PreprocessingConfigSchema + model: ModelConfSchema + train: TrainConfSchema + eval: EvalConfSchema def convert_omegaconf_to_pydantic_object(