
Merge pull request #309 from Aarhus-Psychiatry-Research/martbern/min_age
Filter by minimum age
MartinBernstorff authored Nov 1, 2022
2 parents ff5dd16 + a08fa59 commit d04847f
Showing 4 changed files with 33 additions and 21 deletions.
1 change: 1 addition & 0 deletions src/psycopt2d/config/data/synth_data.yaml
@@ -7,6 +7,7 @@ data:
  min_lookbehind_days: 100
  min_prediction_time_date: null
  pred_prefix: pred_
+  min_age: 18

  col_name:
    pred_timestamp: timestamp

1 change: 1 addition & 0 deletions src/psycopt2d/config/data/t2d_parquet.yaml
@@ -4,6 +4,7 @@ data:
  n_training_samples: null
  dir: E:\shared_resources\feature_sets\t2d\feature_sets\psycop_t2d_adminmanber_416_features_2022_10_20_11_12
  suffix: parquet
+  min_age: 18

  # Patient exclusion criteria
  drop_patient_if_exclusion_before_date: 2013-01-01

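For reference, a minimal sketch of reading the new setting back with OmegaConf. This is not the project's own config-loading path; it only assumes that these YAML files keep data: as their top-level key, as the hunk headers above indicate, and it reuses the synth config path from this diff:

# Illustrative sketch only, not the project's loading code.
from omegaconf import OmegaConf

cfg = OmegaConf.load("src/psycopt2d/config/data/synth_data.yaml")
print(cfg.data.min_age)  # -> 18, the value added in this commit
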
10 changes: 9 additions & 1 deletion src/psycopt2d/load.py
@@ -390,6 +390,11 @@ def _keep_unique_outcome_col_with_lookahead_days_matching_conf(

        return df

+    def _keep_only_if_older_than_min_age(self, dataset: pd.DataFrame) -> pd.DataFrame:
+        """Keep only rows where age is at or above the minimum age specified
+        in the config."""
+        return dataset[dataset[self.cfg.data.col_name.age] >= self.cfg.data.min_age]
+
    def n_outcome_col_names(self, df: pd.DataFrame) -> int:
        """How many outcome columns there are in a dataframe."""
        return len(infer_outcome_col_name(df=df, allow_multiple=True))
@@ -413,7 +418,10 @@ def _process_dataset(self, dataset: pd.DataFrame) -> pd.DataFrame:
        Returns:
            pd.DataFrame: Processed dataset
        """
-        dataset = self.convert_timestamp_dtype_and_nat(dataset)
+        dataset = self.convert_timestamp_dtype_and_nat(dataset=dataset)
+
+        if self.cfg.data.min_age:
+            dataset = self._keep_only_if_older_than_min_age(dataset=dataset)

        dataset = self._drop_rows_after_event_time(dataset=dataset)

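The filter itself is plain pandas boolean indexing; a standalone sketch of the same comparison on made-up data (only min_age = 18 comes from the configs above; the column names and toy rows here are hypothetical):

# Toy illustration of the row filter added in _keep_only_if_older_than_min_age.
import pandas as pd

min_age = 18     # stands in for cfg.data.min_age
age_col = "age"  # stands in for cfg.data.col_name.age (hypothetical name)

dataset = pd.DataFrame({"patient_id": [1, 2, 3], age_col: [17, 18, 45]})

# Keep only rows at or above the configured minimum age.
filtered = dataset[dataset[age_col] >= min_age]
print(filtered)  # the 17-year-old row is dropped; ages 18 and 45 remain
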
42 changes: 22 additions & 20 deletions src/psycopt2d/utils/config_schemas.py
@@ -37,15 +37,15 @@ def __init__(
        self.Config.allow_mutation = allow_mutation


-class WandbConf(BaseModel):
+class WandbSchema(BaseModel):
    """Configuration for weights and biases."""

    group: str
    mode: str
    entity: str


-class WatcherConf(BaseModel):
+class WatcherSchema(BaseModel):
    """Configuration for watchers."""

    archive_all: bool
@@ -54,13 +54,13 @@ class WatcherConf(BaseModel):
    verbose: bool


-class ProjectConf(BaseModel):
+class ProjectSchema(BaseModel):
    """Project configuration."""

-    wandb: WandbConf
+    wandb: WandbSchema
    name: str = "psycopt2d"
    seed: int
-    watcher: WatcherConf
+    watcher: WatcherSchema
    gpu: bool


@@ -71,7 +71,7 @@ class CustomColNames(BaseModel):
    n_hba1c: str


-class ColumnNames(BaseModel):
+class ColumnNamesSchema(BaseModel):
    """Column names in the data."""

    pred_timestamp: str # Column name for prediction times
@@ -85,7 +85,7 @@ class ColumnNames(BaseModel):
    # Column names that are custom to the given prediction problem.


-class DataConf(BaseModel):
+class DataSchema(BaseModel):
    """Data configuration."""

    n_training_samples: Optional[int]
@@ -95,10 +95,12 @@ class DataConf(BaseModel):
    suffix: str # File suffix to load.

    # Feature specs
-    col_name: ColumnNames
+    col_name: ColumnNamesSchema

    pred_prefix: str # prefix of predictor columns

+    min_age: Union[int, float] # Minimum age to include in the dataset
+
    # Looking ahead
    min_lookahead_days: int
    # Drop all prediction times where (max timestamp in the dataset) - (current timestamp) is less than min_lookahead_days
@@ -116,7 +118,7 @@ class DataConf(BaseModel):
    # Which combination of features to use. Only uses features that have "within_X_days" in their column name, where X is any of the numbers in this list.


-class FeatureSelectionConf(BaseModel):
+class FeatureSelectionSchema(BaseModel):
    """Configuration for feature selection methods."""

    name: Optional[str]
@@ -126,7 +128,7 @@ class FeatureSelectionConf(BaseModel):
    # Parameters for the feature selection method.


-class PreprocessingConf(BaseModel):
+class PreprocessingConfigSchema(BaseModel):
    """Preprocessing config."""

    convert_to_boolean: bool
@@ -141,18 +143,18 @@ class PreprocessingConf(BaseModel):
    transform: Optional[str]
    # Transformation applied to all predictors after imputation. Options include "z-score-normalization"

-    feature_selection: FeatureSelectionConf
+    feature_selection: FeatureSelectionSchema


-class ModelConf(BaseModel):
+class ModelConfSchema(BaseModel):
    """Model configuration."""

    name: str # Model, can currently take xgboost
    require_imputation: bool # Whether the model requires imputation. (shouldn't this be false?)
    args: dict


-class TrainConf(BaseModel):
+class TrainConfSchema(BaseModel):
    """Training configuration."""

    n_splits: int # ? How do we handle whether to use crossvalidation or train/val splitting?
@@ -161,7 +163,7 @@ class TrainConf(BaseModel):
    gpu: bool


-class EvalConf(BaseModel):
+class EvalConfSchema(BaseModel):
    """Evaluation config."""

    force: bool = False
@@ -185,12 +187,12 @@ class EvalConf(BaseModel):
class FullConfigSchema(BaseModel):
    """A recipe for a full configuration object."""

-    project: ProjectConf
-    data: DataConf
-    preprocessing: PreprocessingConf
-    model: ModelConf
-    train: TrainConf
-    eval: EvalConf
+    project: ProjectSchema
+    data: DataSchema
+    preprocessing: PreprocessingConfigSchema
+    model: ModelConfSchema
+    train: TrainConfSchema
+    eval: EvalConfSchema


def convert_omegaconf_to_pydantic_object(
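Because min_age is declared without a default, pydantic rejects configs that omit it. A trimmed-down sketch with a hypothetical stand-in model (not the project's full DataSchema, which has many more required fields):

# MiniDataSchema is a made-up stand-in that keeps only the new field.
from typing import Union

from pydantic import BaseModel, ValidationError


class MiniDataSchema(BaseModel):
    min_age: Union[int, float]  # Minimum age to include in the dataset


print(MiniDataSchema(min_age=18).min_age)  # 18, matching the YAML configs above

try:
    MiniDataSchema()  # no default, so a config without min_age fails validation
except ValidationError as err:
    print(err)
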
