
Merge pull request #309 from Aarhus-Psychiatry-Research/martbern/min_age
Filter by minimum age
MartinBernstorff authored Nov 1, 2022
2 parents ff5dd16 + a08fa59 commit d04847f
Showing 4 changed files with 33 additions and 21 deletions.
1 change: 1 addition & 0 deletions src/psycopt2d/config/data/synth_data.yaml
@@ -7,6 +7,7 @@ data:
  min_lookbehind_days: 100
  min_prediction_time_date: null
  pred_prefix: pred_
+  min_age: 18

  col_name:
    pred_timestamp: timestamp

1 change: 1 addition & 0 deletions src/psycopt2d/config/data/t2d_parquet.yaml
@@ -4,6 +4,7 @@ data:
  n_training_samples: null
  dir: E:\shared_resources\feature_sets\t2d\feature_sets\psycop_t2d_adminmanber_416_features_2022_10_20_11_12
  suffix: parquet
+  min_age: 18

  # Patient exclusion criteria
  drop_patient_if_exclusion_before_date: 2013-01-01

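For reference, a minimal sketch of reading the new setting back with OmegaConf. This is not the project's own config-loading path; it only assumes that these YAML files keep data: as their top-level key, as the hunk headers above indicate, and it reuses the synth config path from this diff:

# Illustrative sketch only, not the project's loading code.
from omegaconf import OmegaConf

cfg = OmegaConf.load("src/psycopt2d/config/data/synth_data.yaml")
print(cfg.data.min_age)  # -> 18, the value added in this commit
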
10 changes: 9 additions & 1 deletion src/psycopt2d/load.py
@@ -390,6 +390,11 @@ def _keep_unique_outcome_col_with_lookahead_days_matching_conf(

        return df

+    def _keep_only_if_older_than_min_age(self, dataset: pd.DataFrame) -> pd.DataFrame:
+        """Keep only rows where age is at or above the minimum age specified
+        in the config."""
+        return dataset[dataset[self.cfg.data.col_name.age] >= self.cfg.data.min_age]
+
    def n_outcome_col_names(self, df: pd.DataFrame) -> int:
        """How many outcome columns there are in a dataframe."""
        return len(infer_outcome_col_name(df=df, allow_multiple=True))
@@ -413,7 +418,10 @@ def _process_dataset(self, dataset: pd.DataFrame) -> pd.DataFrame:
        Returns:
            pd.DataFrame: Processed dataset
        """
-        dataset = self.convert_timestamp_dtype_and_nat(dataset)
+        dataset = self.convert_timestamp_dtype_and_nat(dataset=dataset)
+
+        if self.cfg.data.min_age:
+            dataset = self._keep_only_if_older_than_min_age(dataset=dataset)

        dataset = self._drop_rows_after_event_time(dataset=dataset)

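The filter itself is plain pandas boolean indexing; a standalone sketch of the same comparison on made-up data (only min_age = 18 comes from the configs above; the column names and toy rows here are hypothetical):

# Toy illustration of the row filter added in _keep_only_if_older_than_min_age.
import pandas as pd

min_age = 18     # stands in for cfg.data.min_age
age_col = "age"  # stands in for cfg.data.col_name.age (hypothetical name)

dataset = pd.DataFrame({"patient_id": [1, 2, 3], age_col: [17, 18, 45]})

# Keep only rows at or above the configured minimum age.
filtered = dataset[dataset[age_col] >= min_age]
print(filtered)  # the 17-year-old row is dropped; ages 18 and 45 remain
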
42 changes: 22 additions & 20 deletions src/psycopt2d/utils/config_schemas.py
@@ -37,15 +37,15 @@ def __init__(
        self.Config.allow_mutation = allow_mutation


-class WandbConf(BaseModel):
+class WandbSchema(BaseModel):
    """Configuration for weights and biases."""

    group: str
    mode: str
    entity: str


-class WatcherConf(BaseModel):
+class WatcherSchema(BaseModel):
    """Configuration for watchers."""

    archive_all: bool
@@ -54,13 +54,13 @@ class WatcherConf(BaseModel):
    verbose: bool


-class ProjectConf(BaseModel):
+class ProjectSchema(BaseModel):
    """Project configuration."""

-    wandb: WandbConf
+    wandb: WandbSchema
    name: str = "psycopt2d"
    seed: int
-    watcher: WatcherConf
+    watcher: WatcherSchema
    gpu: bool


@@ -71,7 +71,7 @@ class CustomColNames(BaseModel):
    n_hba1c: str


-class ColumnNames(BaseModel):
+class ColumnNamesSchema(BaseModel):
    """Column names in the data."""

    pred_timestamp: str # Column name for prediction times
@@ -85,7 +85,7 @@ class ColumnNames(BaseModel):
    # Column names that are custom to the given prediction problem.


-class DataConf(BaseModel):
+class DataSchema(BaseModel):
    """Data configuration."""

    n_training_samples: Optional[int]
@@ -95,10 +95,12 @@ class DataConf(BaseModel):
    suffix: str # File suffix to load.

    # Feature specs
-    col_name: ColumnNames
+    col_name: ColumnNamesSchema

    pred_prefix: str # prefix of predictor columns

+    min_age: Union[int, float] # Minimum age to include in the dataset
+
    # Looking ahead
    min_lookahead_days: int
    # Drop all prediction times where (max timestamp in the dataset) - (current timestamp) is less than min_lookahead_days
@@ -116,7 +118,7 @@ class DataConf(BaseModel):
    # Which combination of features to use. Only uses features that have "within_X_days" in their column name, where X is any of the numbers in this list.


-class FeatureSelectionConf(BaseModel):
+class FeatureSelectionSchema(BaseModel):
    """Configuration for feature selection methods."""

    name: Optional[str]
@@ -126,7 +128,7 @@ class FeatureSelectionConf(BaseModel):
    # Parameters for the feature selection method.


-class PreprocessingConf(BaseModel):
+class PreprocessingConfigSchema(BaseModel):
    """Preprocessing config."""

    convert_to_boolean: bool
@@ -141,18 +143,18 @@ class PreprocessingConf(BaseModel):
    transform: Optional[str]
    # Transformation applied to all predictors after imputation. Options include "z-score-normalization"

-    feature_selection: FeatureSelectionConf
+    feature_selection: FeatureSelectionSchema


-class ModelConf(BaseModel):
+class ModelConfSchema(BaseModel):
    """Model configuration."""

    name: str # Model, can currently take xgboost
    require_imputation: bool # Whether the model requires imputation. (shouldn't this be false?)
    args: dict


-class TrainConf(BaseModel):
+class TrainConfSchema(BaseModel):
    """Training configuration."""

    n_splits: int # ? How do we handle whether to use crossvalidation or train/val splitting?
@@ -161,7 +163,7 @@ class TrainConf(BaseModel):
    gpu: bool


-class EvalConf(BaseModel):
+class EvalConfSchema(BaseModel):
    """Evaluation config."""

    force: bool = False
@@ -185,12 +187,12 @@ class EvalConf(BaseModel):
class FullConfigSchema(BaseModel):
    """A recipe for a full configuration object."""

-    project: ProjectConf
-    data: DataConf
-    preprocessing: PreprocessingConf
-    model: ModelConf
-    train: TrainConf
-    eval: EvalConf
+    project: ProjectSchema
+    data: DataSchema
+    preprocessing: PreprocessingConfigSchema
+    model: ModelConfSchema
+    train: TrainConfSchema
+    eval: EvalConfSchema


def convert_omegaconf_to_pydantic_object(
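Because min_age is declared without a default, pydantic rejects configs that omit it. A trimmed-down sketch with a hypothetical stand-in model (not the project's full DataSchema, which has many more required fields):

# MiniDataSchema is a made-up stand-in that keeps only the new field.
from typing import Union

from pydantic import BaseModel, ValidationError


class MiniDataSchema(BaseModel):
    min_age: Union[int, float]  # Minimum age to include in the dataset


print(MiniDataSchema(min_age=18).min_age)  # 18, matching the YAML configs above

try:
    MiniDataSchema()  # no default, so a config without min_age fails validation
except ValidationError as err:
    print(err)
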
