From db9b2061384bc0eab8d50406ba0ee70cfa4cf63a Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 16 Dec 2022 12:26:48 +0100 Subject: [PATCH 01/47] refactor: move methods around --- application/t2d/inspect_dataset.py | 2 +- application/t2d/train_and_log_models.py | 2 +- src/application/__init__.py | 0 src/application/t2d/__init__.py | 0 src/application/t2d/inspect_dataset.py | 2 +- src/application/t2d/loaders/__init__.py | 0 .../t2d/loaders/preprocessing_loaders.py | 36 + .../t2d/{train_and_log_models.py => main.py} | 2 +- .../data_loader/__init__.py | 0 .../data_loader/data_classes.py | 16 + .../data_loader/data_loader.py | 169 +++++ .../data_loader/utils.py | 56 ++ src/psycop_model_training/load.py | 616 ------------------ .../post_split/create_pipeline.py | 4 +- .../{ => post_split}/feature_selectors.py | 0 .../{ => post_split}/feature_transformers.py | 0 .../preprocessing/pre_split/__init__.py | 0 .../preprocessing/pre_split/col_filterer.py | 198 ++++++ .../pre_split/col_transformer.py | 54 ++ .../preprocessing/pre_split/row_filterer.py | 120 ++++ .../training/train_model.py | 2 +- tests/test_load.py | 2 +- tests/test_preprocessing.py | 2 +- 23 files changed, 658 insertions(+), 625 deletions(-) create mode 100644 src/application/__init__.py create mode 100644 src/application/t2d/__init__.py create mode 100644 src/application/t2d/loaders/__init__.py create mode 100644 src/application/t2d/loaders/preprocessing_loaders.py rename src/application/t2d/{train_and_log_models.py => main.py} (99%) create mode 100644 src/psycop_model_training/data_loader/__init__.py create mode 100644 src/psycop_model_training/data_loader/data_classes.py create mode 100644 src/psycop_model_training/data_loader/data_loader.py create mode 100644 src/psycop_model_training/data_loader/utils.py delete mode 100644 src/psycop_model_training/load.py rename src/psycop_model_training/preprocessing/{ => post_split}/feature_selectors.py (100%) rename src/psycop_model_training/preprocessing/{ => post_split}/feature_transformers.py (100%) create mode 100644 src/psycop_model_training/preprocessing/pre_split/__init__.py create mode 100644 src/psycop_model_training/preprocessing/pre_split/col_filterer.py create mode 100644 src/psycop_model_training/preprocessing/pre_split/col_transformer.py create mode 100644 src/psycop_model_training/preprocessing/pre_split/row_filterer.py diff --git a/application/t2d/inspect_dataset.py b/application/t2d/inspect_dataset.py index 53c21401..53cc6fe3 100644 --- a/application/t2d/inspect_dataset.py +++ b/application/t2d/inspect_dataset.py @@ -1,6 +1,6 @@ """Example of how to inspect a dataset using the configs.""" from psycop_model_training.config.schemas import load_cfg_as_pydantic -from psycop_model_training.load import load_train_from_cfg, load_train_raw +from psycop_model_training.data_loader.utils import load_train_from_cfg, load_train_raw def main(): diff --git a/application/t2d/train_and_log_models.py b/application/t2d/train_and_log_models.py index 23ded385..d7c9d99b 100644 --- a/application/t2d/train_and_log_models.py +++ b/application/t2d/train_and_log_models.py @@ -20,7 +20,7 @@ FullConfigSchema, load_cfg_as_pydantic, ) -from psycop_model_training.load import load_train_raw +from psycop_model_training.data_loader.utils import load_train_raw from psycop_model_training.model_eval.evaluate_model import ( infer_look_distance, infer_outcome_col_name, diff --git a/src/application/__init__.py b/src/application/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/src/application/t2d/__init__.py b/src/application/t2d/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/application/t2d/inspect_dataset.py b/src/application/t2d/inspect_dataset.py
index 53c21401..53cc6fe3 100644
--- a/src/application/t2d/inspect_dataset.py
+++ b/src/application/t2d/inspect_dataset.py
@@ -1,6 +1,6 @@
 """Example of how to inspect a dataset using the configs."""
 from psycop_model_training.config.schemas import load_cfg_as_pydantic
-from psycop_model_training.load import load_train_from_cfg, load_train_raw
+from psycop_model_training.data_loader.utils import load_train_from_cfg, load_train_raw
 
 
 def main():
diff --git a/src/application/t2d/loaders/__init__.py b/src/application/t2d/loaders/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/application/t2d/loaders/preprocessing_loaders.py b/src/application/t2d/loaders/preprocessing_loaders.py
new file mode 100644
index 00000000..d886c8d8
--- /dev/null
+++ b/src/application/t2d/loaders/preprocessing_loaders.py
@@ -0,0 +1,36 @@
+import pandas as pd
+from psycopmlutils.sql.loader import sql_load
+
+
+def load_timestamp_for_any_diabetes():
+    """Load timestamps for the broad definition of diabetes used for wash-in.
+
+    See R files for details.
+    """
+    timestamp_any_diabetes = sql_load(
+        query="SELECT * FROM [fct].[psycop_t2d_first_diabetes_any]",
+        format_timestamp_cols_to_datetime=False,
+    )[["dw_ek_borger", "datotid_first_diabetes_any"]]
+
+    timestamp_any_diabetes = timestamp_any_diabetes.rename(
+        columns={"datotid_first_diabetes_any": "timestamp_washin"},
+    )
+
+    return timestamp_any_diabetes
+
+
+def add_washin_timestamps(dataset: pd.DataFrame) -> pd.DataFrame:
+    """Add washin timestamps to dataset.
+
+    Washin is an exclusion criterion. E.g. if the patient has any visit
+    that looks like diabetes before the study starts (i.e. during
+    washin), they are excluded.
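+
+    Example (hypothetical dates): with a study start of 2013-01-01, a
+    patient whose timestamp_washin is 2012-06-01 is excluded downstream.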
+ """ + timestamp_washin = load_timestamp_for_any_diabetes() + + dataset = dataset.merge( + timestamp_washin, + on="dw_ek_borger", + how="left", + ) + + return dataset diff --git a/src/application/t2d/train_and_log_models.py b/src/application/t2d/main.py similarity index 99% rename from src/application/t2d/train_and_log_models.py rename to src/application/t2d/main.py index 23ded385..d7c9d99b 100644 --- a/src/application/t2d/train_and_log_models.py +++ b/src/application/t2d/main.py @@ -20,7 +20,7 @@ FullConfigSchema, load_cfg_as_pydantic, ) -from psycop_model_training.load import load_train_raw +from psycop_model_training.data_loader.utils import load_train_raw from psycop_model_training.model_eval.evaluate_model import ( infer_look_distance, infer_outcome_col_name, diff --git a/src/psycop_model_training/data_loader/__init__.py b/src/psycop_model_training/data_loader/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/psycop_model_training/data_loader/data_classes.py b/src/psycop_model_training/data_loader/data_classes.py new file mode 100644 index 00000000..8cf7769a --- /dev/null +++ b/src/psycop_model_training/data_loader/data_classes.py @@ -0,0 +1,16 @@ +from typing import Optional + +import pandas as pd + + +class SplitDataset(BaseModel): + """A dataset split into train, test and optionally validation.""" + + class Config: + """Configuration for the dataclass to allow pd.DataFrame as type.""" + + arbitrary_types_allowed = True + + train: pd.DataFrame + test: Optional[pd.DataFrame] = None + val: pd.DataFrame diff --git a/src/psycop_model_training/data_loader/data_loader.py b/src/psycop_model_training/data_loader/data_loader.py new file mode 100644 index 00000000..b0f9a928 --- /dev/null +++ b/src/psycop_model_training/data_loader/data_loader.py @@ -0,0 +1,169 @@ +"""Loader for the t2d dataset.""" +from collections.abc import Iterable +from pathlib import Path +from typing import Optional, Union + +import pandas as pd +from wasabi import Printer + +from psycop_model_training.config.schemas import FullConfigSchema + +msg = Printer(timestamp=True) + + + +class DataLoader: + """Class to handle loading of a datasplit.""" + + def __init__( + self, + cfg: FullConfigSchema, + ): + self.cfg: FullConfigSchema = cfg + + # File handling + self.dir_path = Path(cfg.data.dir) + self.file_suffix = cfg.data.suffix + + # Column specifications + self.pred_col_name_prefix = cfg.data.pred_prefix + + def load_dataset_from_dir( + self, + split_names: Union[Iterable[str], str], + nrows: Optional[int] = None, + ) -> pd.DataFrame: + """Load dataset for t2d. Can load multiple splits at once, e.g. + concatenate train and val for crossvalidation. + + Args: + split_names (Union[Iterable[str], str]): Name of split, allowed are ["train", "test", "val"] + nrows (Optional[int]): Number of rows to load from dataset. Defaults to None, in which case all rows are loaded. 
+
+        Returns:
+            pd.DataFrame: The filtered dataset
+        """
+        msg.info(f"Loading {split_names}")
+
+        # Concat splits if multiple are given
+        if isinstance(split_names, (list, tuple)):
+            split_names = tuple(split_names)
+
+            if nrows is not None:
+                nrows = int(
+                    nrows / len(split_names),
+                )
+
+            return pd.concat(
+                [
+                    self._load_dataset_file(split_name=split, nrows=nrows)
+                    for split in split_names
+                ],
+                ignore_index=True,
+            )
+        elif isinstance(split_names, str):
+            dataset = self._load_dataset_file(split_name=split_names, nrows=nrows)
+
+        dataset = self._process_dataset(dataset=dataset)
+
+        msg.good(f"{split_names}: Returning!")
+        return dataset
+
+    def _load_dataset_file(  # pylint: disable=inconsistent-return-statements
+        self,
+        split_name: str,
+        nrows: Optional[int] = None,
+    ) -> pd.DataFrame:  # pylint: disable=inconsistent-return-statements
+        """Load dataset from directory. Finds any file with the matching file
+        suffix (`self.file_suffix`, defaults to "parquet") and the split name
+        in its filename.
+
+        Args:
+            split_name (str): Name of split, allowed are ["train", "test", "val"]
+            nrows (Optional[int]): Number of rows to load. Defaults to None, in which case
+                all rows are loaded.
+
+        Returns:
+            pd.DataFrame: The dataset
+        """
+        msg.info(f"Loading {split_name}")
+
+        if self.file_suffix not in ("csv", "parquet"):
+            raise ValueError(f"File suffix {self.file_suffix} not supported.")
+
+        if split_name not in ("train", "test", "val"):
+            raise ValueError(f"Split name {split_name} not supported.")
+
+        path = list(self.dir_path.glob(f"*{split_name}*.{self.file_suffix}"))[0]
+
+        if "parquet" in self.file_suffix:
+            if nrows:
+                raise ValueError(
+                    "nrows is not supported for parquet files. Please use csv files.",
+                )
+            return pd.read_parquet(path)
+        elif "csv" in self.file_suffix:
+            return pd.read_csv(filepath_or_buffer=path, nrows=nrows)
+
+    def _process_dataset(self, dataset: pd.DataFrame) -> pd.DataFrame:
+        """Process dataset, namely:
+
+        - Drop patients with exclusion events before drop_patient_if_exclusion_before_date
+        - Process timestamp columns
+        - Drop visits where min_lookahead, min_lookbehind or min_prediction_time_date are not met
+        - Drop features with lookbehinds not in lookbehind_combination
+
+        Returns:
+            pd.DataFrame: Processed dataset
+        """
+        msg = Printer(timestamp=True)
+        msg.info("Processing dataset")
+
+        # Super hacky rename, needs to be removed before merging. Figure out how to add eval columns when creating the dataset.
+        dataset = dataset.rename(
+            {
+                "pred_hba1c_within_9999_days_count_fallback_nan": self.cfg.data.col_name.custom.n_hba1c,
+            },
+            axis=1,
+        )
+
+        # Super hacky transformation of negative weights (?!) for chi-square.
+        # In the future, we want to:
+        # 1. Fix this in the feature generation for t2d
+        # 2a. See if there's a way of using feature selection that permits negative values, or
+        # 2b. Always use z-score normalisation?
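+        # A rough sketch of option 2b (z-score normalisation of the numeric
+        # predictor columns) could look like the following; not used here:
+        #   preds = dataset[infer_predictor_col_name(df=dataset)]
+        #   dataset[preds.columns] = (preds - preds.mean()) / preds.std()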
+ dataset = self._negative_values_to_nan(dataset=dataset) + + dataset = self.convert_timestamp_dtype_and_nat(dataset=dataset) + + if self.cfg.preprocessing.convert_booleans_to_int: + dataset = self._convert_boolean_dtypes_to_int(dataset=dataset) + + if self.cfg.data.min_age: + dataset = self._keep_only_if_older_than_min_age(dataset=dataset) + + dataset = self._drop_rows_after_event_time(dataset=dataset) + + if self.cfg.data.drop_patient_if_exclusion_before_date: + dataset = self._drop_patient_if_excluded(dataset=dataset) + + # Drop if later than min prediction time date + if self.cfg.data.min_prediction_time_date: + dataset = dataset[ + dataset[self.cfg.data.col_name.pred_timestamp] + > self.cfg.data.min_prediction_time_date + ] + + dataset = self._drop_cols_and_rows_if_look_direction_not_met(dataset=dataset) + + if self.cfg.data.lookbehind_combination: + dataset = self._drop_cols_not_in_lookbehind_combination(dataset=dataset) + + dataset = self._keep_unique_outcome_col_with_lookahead_days_matching_conf( + dataset=dataset, + ) + + msg.info("Finished processing dataset") + + return dataset \ No newline at end of file diff --git a/src/psycop_model_training/data_loader/utils.py b/src/psycop_model_training/data_loader/utils.py new file mode 100644 index 00000000..158bae84 --- /dev/null +++ b/src/psycop_model_training/data_loader/utils.py @@ -0,0 +1,56 @@ +import os +from pathlib import Path + +import pandas as pd + +from psycop_model_training.config.schemas import FullConfigSchema +from psycop_model_training.data_loader.data_classes import SplitDataset +from psycop_model_training.data_loader.data_loader import DataLoader + + +def get_latest_dataset_dir(path: Path) -> Path: + """Get the latest dataset directory by time of creation.""" + return max(path.glob("*"), key=os.path.getctime) + + +def load_train_from_cfg(cfg: FullConfigSchema) -> pd.DataFrame: + """Load train dataset from config. 
+ + Args: + cfg (FullConfig): Config + + Returns: + pd.DataFrame: Train dataset + """ + return DataLoader(cfg=cfg).load_dataset_from_dir(split_names="train") + + +def load_train_and_val_from_cfg(cfg: FullConfigSchema): + """Load train and validation data from file.""" + + loader = DataLoader(cfg=cfg) + + return SplitDataset( + train=loader.load_dataset_from_dir(split_names="train"), + val=loader.load_dataset_from_dir(split_names="val"), + ) + + +def load_train_raw(cfg: FullConfigSchema): + """Load the data.""" + path = Path(cfg.data.dir) + file_names = list(path.glob(pattern=r"*train*")) + + if len(file_names) == 1: + file_name = file_names[0] + file_suffix = file_name.suffix + if file_suffix == ".parquet": + df = pd.read_parquet(file_name) + elif file_suffix == ".csv": + df = pd.read_csv(file_name) + + df = DataLoader.convert_timestamp_dtype_and_nat(dataset=df) + + return df + + raise ValueError(f"Returned {len(file_names)} files") diff --git a/src/psycop_model_training/load.py b/src/psycop_model_training/load.py deleted file mode 100644 index e4a7d2b7..00000000 --- a/src/psycop_model_training/load.py +++ /dev/null @@ -1,616 +0,0 @@ -"""Loader for the t2d dataset.""" -import os -import re -from collections.abc import Iterable -from datetime import timedelta -from pathlib import Path -from typing import Optional, Union - -import numpy as np -import pandas as pd -from psycopmlutils.sql.loader import sql_load -from pydantic import BaseModel -from wasabi import Printer - -from psycop_model_training.config.schemas import FullConfigSchema -from psycop_model_training.utils.col_name_inference import infer_look_distance -from psycop_model_training.utils.decorators import print_df_dimensions_diff -from psycop_model_training.utils.utils import ( - get_percent_lost, - infer_outcome_col_name, - infer_predictor_col_name, -) - -msg = Printer(timestamp=True) - - -def load_timestamp_for_any_diabetes(): - """Loads timestamps for the broad definition of diabetes used for wash-in. - - See R files for details. - """ - timestamp_any_diabetes = sql_load( - query="SELECT * FROM [fct].[psycop_t2d_first_diabetes_any]", - format_timestamp_cols_to_datetime=False, - )[["dw_ek_borger", "datotid_first_diabetes_any"]] - - timestamp_any_diabetes = timestamp_any_diabetes.rename( - columns={"datotid_first_diabetes_any": "timestamp_washin"}, - ) - - return timestamp_any_diabetes - - -def add_washin_timestamps(dataset: pd.DataFrame) -> pd.DataFrame: - """Add washin timestamps to dataset. - - Washin is an exclusion criterion. E.g. if the patient has any visit - that looks like diabetes before the study starts (i.e. during - washin), they are excluded. - """ - timestamp_washin = load_timestamp_for_any_diabetes() - - dataset = dataset.merge( - timestamp_washin, - on="dw_ek_borger", - how="left", - ) - - return dataset - - -class DataLoader: - """Class to handle loading of a datasplit.""" - - def __init__( - self, - cfg: FullConfigSchema, - ): - self.cfg: FullConfigSchema = cfg - - # File handling - self.dir_path = Path(cfg.data.dir) - self.file_suffix = cfg.data.suffix - - # Column specifications - self.pred_col_name_prefix = cfg.data.pred_prefix - - def _load_dataset_file( # pylint: disable=inconsistent-return-statements - self, - split_name: str, - nrows: Optional[int] = None, - ) -> pd.DataFrame: # pylint: disable=inconsistent-return-statements - """Load dataset from directory. Finds any file with the matching file - suffix with the split name in its filename. 
- - Args: - split_name (str): Name of split, allowed are ["train", "test", "val"] - nrows (Optional[int]): Number of rows to load. Defaults to None, in which case - all rows are loaded. - self.file_suffix (str, optional): File suffix of the dataset. Defaults to "parquet". - - Returns: - pd.DataFrame: The dataset - """ - msg.info(f"Loading {split_name}") - - if self.file_suffix not in ("csv", "parquet"): - raise ValueError(f"File suffix {self.file_suffix} not supported.") - - if split_name not in ("train", "test", "val"): - raise ValueError(f"Split name {split_name} not supported.") - - path = list(self.dir_path.glob(f"*{split_name}*.{self.file_suffix}"))[0] - - if "parquet" in self.file_suffix: - if nrows: - raise ValueError( - "nrows is not supported for parquet files. Please use csv files.", - ) - return pd.read_parquet(path) - elif "csv" in self.file_suffix: - return pd.read_csv(filepath_or_buffer=path, nrows=nrows) - - def _drop_rows_if_datasets_ends_within_days( - self, - n_days: Union[int, float], - dataset: pd.DataFrame, - direction: str, - ) -> pd.DataFrame: - """Drop visits that lie within certain amount of days from end of - dataset. - - Args: - n_days (Union[float, int]): Number of days. - dataset (pd.DataFrame): Dataset. - direction (str): Direction to look. Allowed are ["before", "after"]. - - Returns: - pd.DataFrame: Dataset with dropped rows. - """ - if not isinstance(n_days, timedelta): - n_days_timedelt: timedelta = timedelta(days=n_days) # type: ignore - - if direction not in ("ahead", "behind"): - raise ValueError(f"Direction {direction} not supported.") - - n_rows_before_modification = dataset.shape[0] - - if direction == "ahead": - max_datetime = ( - dataset[self.cfg.data.col_name.pred_timestamp].max() - n_days_timedelt - ) - before_max_dt = ( - dataset[self.cfg.data.col_name.pred_timestamp] < max_datetime - ) - dataset = dataset[before_max_dt] - elif direction == "behind": - min_datetime = ( - dataset[self.cfg.data.col_name.pred_timestamp].min() + n_days_timedelt - ) - after_min_dt = dataset[self.cfg.data.col_name.pred_timestamp] > min_datetime - dataset = dataset[after_min_dt] - - n_rows_after_modification = dataset.shape[0] - percent_dropped = get_percent_lost( - n_before=n_rows_before_modification, - n_after=n_rows_after_modification, - ) - - if n_rows_before_modification - n_rows_after_modification != 0: - msg.info( - f"Dropped {n_rows_before_modification - n_rows_after_modification} ({percent_dropped}%) rows because the end of the dataset was within {n_days} of their prediction time when looking {direction} from their prediction time", - ) - - return dataset - - @print_df_dimensions_diff - def _drop_patient_if_excluded( - self, - dataset: pd.DataFrame, - ) -> pd.DataFrame: - """Drop patients that have an exclusion event within the washin - period.""" - - n_rows_before_modification = dataset.shape[0] - - outcome_before_date = ( - dataset[self.cfg.data.col_name.exclusion_timestamp] - < self.cfg.data.drop_patient_if_exclusion_before_date - ) - - patients_to_drop = set( - dataset[self.cfg.data.col_name.id][outcome_before_date].unique(), - ) - dataset = dataset[~dataset[self.cfg.data.col_name.id].isin(patients_to_drop)] - - n_rows_after_modification = dataset.shape[0] - - percent_dropped = get_percent_lost( - n_before=n_rows_after_modification, - n_after=n_rows_after_modification, - ) - - if n_rows_before_modification - n_rows_after_modification != 0: - msg.info( - f"Dropped {n_rows_before_modification - n_rows_after_modification} ({percent_dropped}%) rows because 
they met exclusion criteria before {self.cfg.data.drop_patient_if_exclusion_before_date}.", - ) - else: - msg.info( - f"No rows met exclusion criteria before {self.cfg.data.drop_patient_if_exclusion_before_date}. Didn't drop any.", - ) - - return dataset - - @print_df_dimensions_diff - def _drop_cols_not_in_lookbehind_combination( - self, - dataset: pd.DataFrame, - ) -> pd.DataFrame: - """Drop predictor columns that are not in the specified combination of - lookbehind windows. - - Args: - dataset (pd.DataFrame): Dataset. - - Returns: - pd.DataFrame: Dataset with dropped columns. - """ - - if not self.cfg.data.lookbehind_combination: - raise ValueError("No lookbehind_combination provided.") - - # Extract all unique lookbhehinds in the dataset predictors - lookbehinds_in_dataset = { - int(infer_look_distance(col)[0]) - for col in infer_predictor_col_name(df=dataset) - if len(infer_look_distance(col)) > 0 - } - - # Convert list to set - lookbehinds_in_spec = set(self.cfg.data.lookbehind_combination) - - # Check that all loobehinds in lookbehind_combination are used in the predictors - if not lookbehinds_in_spec.issubset( - lookbehinds_in_dataset, - ): - msg.warn( - f"One or more of the provided lookbehinds in lookbehind_combination is/are not used in any predictors in the dataset: {lookbehinds_in_spec - lookbehinds_in_dataset}", - ) - - lookbehinds_to_keep = lookbehinds_in_spec.intersection( - lookbehinds_in_dataset, - ) - - if not lookbehinds_to_keep: - raise ValueError("No predictors left after dropping lookbehinds.") - - msg.warn(f"Training on {lookbehinds_to_keep}.") - else: - lookbehinds_to_keep = lookbehinds_in_spec - - # Create a list of all predictor columns who have a lookbehind window not in lookbehind_combination list - cols_to_drop = [ - c - for c in infer_predictor_col_name(df=dataset) - if all(str(l_beh) not in c for l_beh in lookbehinds_to_keep) - ] - - cols_to_drop = [c for c in cols_to_drop if "within" in c] - # ? Add some specification of within_x_days indicating how to parse columns to find lookbehinds. Or, alternatively, use the column spec. - - dataset = dataset.drop(columns=cols_to_drop) - return dataset - - @staticmethod - @print_df_dimensions_diff - def convert_timestamp_dtype_and_nat(dataset: pd.DataFrame) -> pd.DataFrame: - """Convert columns with `timestamp`in their name to datetime, and - convert 0's to NaT.""" - timestamp_colnames = [col for col in dataset.columns if "timestamp" in col] - - for colname in timestamp_colnames: - if dataset[colname].dtype != "datetime64[ns]": - # Convert all 0s in colname to NaT - dataset[colname] = dataset[colname].apply( - lambda x: pd.NaT if x == "0" else x, - ) - dataset[colname] = pd.to_datetime(dataset[colname]) - - return dataset - - def _drop_cols_if_exceeds_look_direction_threshold( - self, - dataset: pd.DataFrame, - look_direction_threshold: Union[int, float], - direction: str, - ) -> pd.DataFrame: - """Drop columns if they look behind or ahead longer than a specified - threshold. - - For example, if direction is "ahead", and n_days is 30, then the column - should be dropped if it's trying to look 60 days ahead. This is useful - to avoid some rows having more information than others. - - Args: - dataset (pd.DataFrame): Dataset to process. - look_direction_threshold (Union[int, float]): Number of days to look in the direction. - direction (str): Direction to look. Allowed are ["ahead", "behind"]. - - Returns: - pd.DataFrame: Dataset without the dropped columns. 
- """ - - cols_to_drop = [] - - n_cols_before_modification = dataset.shape[1] - - if direction == "behind": - cols_to_process = infer_predictor_col_name(df=dataset) - - for col in cols_to_process: - # Extract lookbehind days from column name use regex - # E.g. "column_name_within_90_days" == 90 - # E.g. "column_name_within_90_days_fallback_NaN" == 90 - lookbehind_days_strs = re.findall(r"within_(\d+)_days", col) - - if len(lookbehind_days_strs) > 0: - lookbehind_days = int(lookbehind_days_strs[0]) - else: - msg.warn(f"Could not extract lookbehind days from {col}") - continue - - if lookbehind_days > look_direction_threshold: - cols_to_drop.append(col) - - n_cols_after_modification = dataset.shape[1] - percent_dropped = get_percent_lost( - n_before=n_cols_before_modification, - n_after=n_cols_after_modification, - ) - - if n_cols_before_modification - n_cols_after_modification != 0: - msg.info( - f"Dropped {n_cols_before_modification - n_cols_after_modification} ({percent_dropped}%) columns because they were looking {direction} further out than {look_direction_threshold} days.", - ) - - return dataset[[c for c in dataset.columns if c not in cols_to_drop]] - - @print_df_dimensions_diff - def _drop_cols_and_rows_if_look_direction_not_met( - self, - dataset: pd.DataFrame, - ) -> pd.DataFrame: - """Drop columns if they are outside the specification. Specifically: - - - min_lookahead_days is insufficient for the column's lookahead - - The dataset doesn't stretch far enough for the prediction time's lookahead - - The dataset doesn't stretch far enough for the prediction time's lookbehind - - Args: - dataset (pd.DataFrame): Dataset to process. - """ - for direction in ("ahead", "behind"): - - if direction in ("ahead", "behind"): - if direction == "ahead": - n_days = self.cfg.data.min_lookahead_days - elif direction == "behind": - n_days = max(self.cfg.data.lookbehind_combination) - else: - continue - - dataset = self._drop_rows_if_datasets_ends_within_days( - n_days=n_days, - dataset=dataset, - direction=direction, - ) - - dataset = self._drop_cols_if_exceeds_look_direction_threshold( - dataset=dataset, - look_direction_threshold=n_days, - direction=direction, - ) - - return dataset - - @print_df_dimensions_diff - def _keep_unique_outcome_col_with_lookahead_days_matching_conf( - self, - dataset: pd.DataFrame, - ) -> pd.DataFrame: - """Keep only one outcome column with the same lookahead days as set in - the config.""" - outcome_cols = infer_outcome_col_name(df=dataset, allow_multiple=True) - - col_to_drop = [ - c for c in outcome_cols if str(self.cfg.data.min_lookahead_days) not in c - ] - - # If no columns to drop, return the dataset - if not col_to_drop: - return dataset - - df = dataset.drop(col_to_drop, axis=1) - - if not len(infer_outcome_col_name(df)) == 1: - raise ValueError( - "Returning more than one outcome column, will cause problems during eval.", - ) - - return df - - def _keep_only_if_older_than_min_age(self, dataset: pd.DataFrame) -> pd.DataFrame: - """Keep only rows that are older than the minimum age specified in the - config.""" - return dataset[dataset[self.cfg.data.col_name.age] >= self.cfg.data.min_age] - - @print_df_dimensions_diff - def n_outcome_col_names(self, df: pd.DataFrame) -> int: - """How many outcome columns there are in a dataframe.""" - return len(infer_outcome_col_name(df=df, allow_multiple=True)) - - @print_df_dimensions_diff - def _drop_rows_after_event_time(self, dataset: pd.DataFrame) -> pd.DataFrame: - """Drop all rows where prediction timestamp is after the 
outcome.""" - - rows_to_drop = ( - dataset[self.cfg.data.col_name.pred_timestamp] - > dataset[self.cfg.data.col_name.outcome_timestamp] - ) - - return dataset[~rows_to_drop] - - @print_df_dimensions_diff - def _convert_boolean_dtypes_to_int(self, dataset: pd.DataFrame) -> pd.DataFrame: - """Convert boolean dtypes to int.""" - for col in dataset.columns: - if dataset[col].dtype == bool: - dataset[col] = dataset[col].astype(int) - - return dataset - - @print_df_dimensions_diff - def _negative_values_to_nan(self, dataset: pd.DataFrame) -> pd.DataFrame: - """Convert negative values to NaN.""" - preds = dataset[infer_predictor_col_name(df=dataset)] - - # Get all columns with negative values - cols_with_numerical_values = preds.select_dtypes(include=["number"]).columns - - numerical_columns_with_negative_values = [ - c for c in cols_with_numerical_values if preds[c].min() < 0 - ] - - df_to_replace = dataset[numerical_columns_with_negative_values] - - # Convert to NaN - df_to_replace[df_to_replace < 0] = np.nan - dataset[numerical_columns_with_negative_values] = df_to_replace - - return dataset - - def _process_dataset(self, dataset: pd.DataFrame) -> pd.DataFrame: - """Process dataset, namely: - - - Drop patients with outcome before drop_patient_if_outcome_before_date - - Process timestamp columns - - Drop visits where min_lookahead, min_lookbehind or min_prediction_time_date are not met - - Drop features with lookbehinds not in lookbehind_combination - - Returns: - pd.DataFrame: Processed dataset - """ - msg = Printer(timestamp=True) - msg.info("Processing dataset") - - # Super hacky rename, needs to be removed before merging. Figure out how to add eval columns when creating the dataset. - dataset = dataset.rename( - { - "pred_hba1c_within_9999_days_count_fallback_nan": self.cfg.data.col_name.custom.n_hba1c, - }, - axis=1, - ) - - # Super hacky transformation of negative weights (?!) for chi-square. - # In the future, we want to: - # 1. Fix this in the feature generation for t2d - # 2a. See if there's a way of using feature selection that permits negative values, or - # 2b. Always use z-score normalisation? - dataset = self._negative_values_to_nan(dataset=dataset) - - dataset = self.convert_timestamp_dtype_and_nat(dataset=dataset) - - if self.cfg.preprocessing.convert_booleans_to_int: - dataset = self._convert_boolean_dtypes_to_int(dataset=dataset) - - if self.cfg.data.min_age: - dataset = self._keep_only_if_older_than_min_age(dataset=dataset) - - dataset = self._drop_rows_after_event_time(dataset=dataset) - - if self.cfg.data.drop_patient_if_exclusion_before_date: - dataset = self._drop_patient_if_excluded(dataset=dataset) - - # Drop if later than min prediction time date - if self.cfg.data.min_prediction_time_date: - dataset = dataset[ - dataset[self.cfg.data.col_name.pred_timestamp] - > self.cfg.data.min_prediction_time_date - ] - - dataset = self._drop_cols_and_rows_if_look_direction_not_met(dataset=dataset) - - if self.cfg.data.lookbehind_combination: - dataset = self._drop_cols_not_in_lookbehind_combination(dataset=dataset) - - dataset = self._keep_unique_outcome_col_with_lookahead_days_matching_conf( - dataset=dataset, - ) - - msg.info("Finished processing dataset") - - return dataset - - def load_dataset_from_dir( - self, - split_names: Union[Iterable[str], str], - nrows: Optional[int] = None, - ) -> pd.DataFrame: - """Load dataset for t2d. Can load multiple splits at once, e.g. - concatenate train and val for crossvalidation. 
- - Args: - split_names (Union[Iterable[str], str]): Name of split, allowed are ["train", "test", "val"] - nrows (Optional[int]): Number of rows to load from dataset. Defaults to None, in which case all rows are loaded. - - Returns: - pd.DataFrame: The filtered dataset - """ - msg.info(f"Loading {split_names}") - - # Concat splits if multiple are given - if isinstance(split_names, (list, tuple)): - if isinstance(split_names, Iterable): - split_names = tuple(split_names) - - if nrows is not None: - nrows = int( - nrows / len(split_names), - ) - - return pd.concat( - [ - self._load_dataset_file(split_name=split, nrows=nrows) - for split in split_names - ], - ignore_index=True, - ) - elif isinstance(split_names, str): - dataset = self._load_dataset_file(split_name=split_names, nrows=nrows) - - dataset = self._process_dataset(dataset=dataset) - - msg.good(f"{split_names}: Returning!") - return dataset - - -class SplitDataset(BaseModel): - """A dataset split into train, test and optionally validation.""" - - class Config: - """Configuration for the dataclass to allow pd.DataFrame as type.""" - - arbitrary_types_allowed = True - - train: pd.DataFrame - test: Optional[pd.DataFrame] = None - val: pd.DataFrame - - -def load_train_from_cfg(cfg: FullConfigSchema) -> pd.DataFrame: - """Load train dataset from config. - - Args: - cfg (FullConfig): Config - - Returns: - pd.DataFrame: Train dataset - """ - return DataLoader(cfg=cfg).load_dataset_from_dir(split_names="train") - - -def load_train_and_val_from_cfg(cfg: FullConfigSchema): - """Load train and validation data from file.""" - - loader = DataLoader(cfg=cfg) - - return SplitDataset( - train=loader.load_dataset_from_dir(split_names="train"), - val=loader.load_dataset_from_dir(split_names="val"), - ) - - -def get_latest_dataset_dir(path: Path) -> Path: - """Get the latest dataset directory by time of creation.""" - return max(path.glob("*"), key=os.path.getctime) - - -def load_train_raw(cfg: FullConfigSchema): - """Load the data.""" - path = Path(cfg.data.dir) - file_names = list(path.glob(pattern=r"*train*")) - - if len(file_names) == 1: - file_name = file_names[0] - file_suffix = file_name.suffix - if file_suffix == ".parquet": - df = pd.read_parquet(file_name) - elif file_suffix == ".csv": - df = pd.read_csv(file_name) - - df = DataLoader.convert_timestamp_dtype_and_nat(dataset=df) - - return df - - raise ValueError(f"Returned {len(file_names)} files") diff --git a/src/psycop_model_training/preprocessing/post_split/create_pipeline.py b/src/psycop_model_training/preprocessing/post_split/create_pipeline.py index 21db8355..d13a8cc4 100644 --- a/src/psycop_model_training/preprocessing/post_split/create_pipeline.py +++ b/src/psycop_model_training/preprocessing/post_split/create_pipeline.py @@ -11,8 +11,8 @@ from wasabi import Printer from psycop_model_training.config.schemas import FullConfigSchema -from psycop_model_training.preprocessing.feature_selectors import DropDateTimeColumns -from psycop_model_training.preprocessing.feature_transformers import ( +from psycop_model_training.preprocessing.post_split.feature_selectors import DropDateTimeColumns +from psycop_model_training.preprocessing.post_split.feature_transformers import ( ConvertToBoolean, DateTimeConverter, ) diff --git a/src/psycop_model_training/preprocessing/feature_selectors.py b/src/psycop_model_training/preprocessing/post_split/feature_selectors.py similarity index 100% rename from src/psycop_model_training/preprocessing/feature_selectors.py rename to 
src/psycop_model_training/preprocessing/post_split/feature_selectors.py
diff --git a/src/psycop_model_training/preprocessing/feature_transformers.py b/src/psycop_model_training/preprocessing/post_split/feature_transformers.py
similarity index 100%
rename from src/psycop_model_training/preprocessing/feature_transformers.py
rename to src/psycop_model_training/preprocessing/post_split/feature_transformers.py
diff --git a/src/psycop_model_training/preprocessing/pre_split/__init__.py b/src/psycop_model_training/preprocessing/pre_split/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/psycop_model_training/preprocessing/pre_split/col_filterer.py b/src/psycop_model_training/preprocessing/pre_split/col_filterer.py
new file mode 100644
index 00000000..0224d068
--- /dev/null
+++ b/src/psycop_model_training/preprocessing/pre_split/col_filterer.py
@@ -0,0 +1,198 @@
+import re
+from typing import Union
+
+import pandas as pd
+
+from psycop_model_training.data_loader.data_loader import msg
+from psycop_model_training.utils.col_name_inference import infer_look_distance
+from psycop_model_training.utils.decorators import print_df_dimensions_diff
+from psycop_model_training.utils.utils import infer_predictor_col_name, get_percent_lost, infer_outcome_col_name
+
+
+class PreSplitColFilterer:
+
+    @print_df_dimensions_diff
+    def _drop_cols_not_in_lookbehind_combination(
+        self,
+        dataset: pd.DataFrame,
+    ) -> pd.DataFrame:
+        """Drop predictor columns that are not in the specified combination of
+        lookbehind windows.
+
+        Args:
+            dataset (pd.DataFrame): Dataset.
+
+        Returns:
+            pd.DataFrame: Dataset with dropped columns.
+        """
+
+        if not self.cfg.data.lookbehind_combination:
+            raise ValueError("No lookbehind_combination provided.")
+
+        # Extract all unique lookbehinds in the dataset predictors
+        lookbehinds_in_dataset = {
+            int(infer_look_distance(col)[0])
+            for col in infer_predictor_col_name(df=dataset)
+            if len(infer_look_distance(col)) > 0
+        }
+
+        # Convert list to set
+        lookbehinds_in_spec = set(self.cfg.data.lookbehind_combination)
+
+        # Check that all lookbehinds in lookbehind_combination are used in the predictors
+        if not lookbehinds_in_spec.issubset(
+            lookbehinds_in_dataset,
+        ):
+            msg.warn(
+                f"One or more of the provided lookbehinds in lookbehind_combination is/are not used in any predictors in the dataset: {lookbehinds_in_spec - lookbehinds_in_dataset}",
+            )
+
+            lookbehinds_to_keep = lookbehinds_in_spec.intersection(
+                lookbehinds_in_dataset,
+            )
+
+            if not lookbehinds_to_keep:
+                raise ValueError("No predictors left after dropping lookbehinds.")
+
+            msg.warn(f"Training on {lookbehinds_to_keep}.")
+        else:
+            lookbehinds_to_keep = lookbehinds_in_spec
+
+        # Create a list of all predictor columns that have a lookbehind window not in the lookbehind_combination list
+        cols_to_drop = [
+            c
+            for c in infer_predictor_col_name(df=dataset)
+            if all(str(l_beh) not in c for l_beh in lookbehinds_to_keep)
+        ]
+
+        cols_to_drop = [c for c in cols_to_drop if "within" in c]
+        # ? Add some specification of within_x_days indicating how to parse columns to find lookbehinds. Or, alternatively, use the column spec.
+
+        dataset = dataset.drop(columns=cols_to_drop)
+        return dataset
+
+    def _drop_cols_if_exceeds_look_direction_threshold(
+        self,
+        dataset: pd.DataFrame,
+        look_direction_threshold: Union[int, float],
+        direction: str,
+    ) -> pd.DataFrame:
+        """Drop columns if they look behind or ahead longer than a specified
+        threshold.
+
+        For example, if direction is "ahead", and n_days is 30, then the column
+        should be dropped if it's trying to look 60 days ahead. This is useful
+        to avoid some rows having more information than others.
+
+        Args:
+            dataset (pd.DataFrame): Dataset to process.
+            look_direction_threshold (Union[int, float]): Number of days to look in the direction.
+            direction (str): Direction to look. Allowed are ["ahead", "behind"].
+
+        Returns:
+            pd.DataFrame: Dataset without the dropped columns.
+        """
+
+        cols_to_drop = []
+
+        n_cols_before_modification = dataset.shape[1]
+
+        if direction == "behind":
+            cols_to_process = infer_predictor_col_name(df=dataset)
+
+            for col in cols_to_process:
+                # Extract lookbehind days from column name using regex
+                # E.g. "column_name_within_90_days" == 90
+                # E.g. "column_name_within_90_days_fallback_NaN" == 90
+                lookbehind_days_strs = re.findall(r"within_(\d+)_days", col)
+
+                if len(lookbehind_days_strs) > 0:
+                    lookbehind_days = int(lookbehind_days_strs[0])
+                else:
+                    msg.warn(f"Could not extract lookbehind days from {col}")
+                    continue
+
+                if lookbehind_days > look_direction_threshold:
+                    cols_to_drop.append(col)
+
+        dataset = dataset[[c for c in dataset.columns if c not in cols_to_drop]]
+
+        n_cols_after_modification = dataset.shape[1]
+        percent_dropped = get_percent_lost(
+            n_before=n_cols_before_modification,
+            n_after=n_cols_after_modification,
+        )
+
+        if n_cols_before_modification - n_cols_after_modification != 0:
+            msg.info(
+                f"Dropped {n_cols_before_modification - n_cols_after_modification} ({percent_dropped}%) columns because they were looking {direction} further out than {look_direction_threshold} days.",
+            )
+
+        return dataset
+
+    @print_df_dimensions_diff
+    def _drop_cols_and_rows_if_look_direction_not_met(
+        self,
+        dataset: pd.DataFrame,
+    ) -> pd.DataFrame:
+        """Drop columns if they are outside the specification. Specifically:
+
+        - min_lookahead_days is insufficient for the column's lookahead
+        - The dataset doesn't stretch far enough for the prediction time's lookahead
+        - The dataset doesn't stretch far enough for the prediction time's lookbehind
+
+        Args:
+            dataset (pd.DataFrame): Dataset to process.
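+
+        Returns:
+            pd.DataFrame: Dataset with the offending columns and rows dropped.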
+        """
+        for direction in ("ahead", "behind"):
+            if direction == "ahead":
+                n_days = self.cfg.data.min_lookahead_days
+            else:
+                n_days = max(self.cfg.data.lookbehind_combination)
+
+            dataset = self._drop_rows_if_datasets_ends_within_days(
+                n_days=n_days,
+                dataset=dataset,
+                direction=direction,
+            )
+
+            dataset = self._drop_cols_if_exceeds_look_direction_threshold(
+                dataset=dataset,
+                look_direction_threshold=n_days,
+                direction=direction,
+            )
+
+        return dataset
+
+    @print_df_dimensions_diff
+    def _keep_unique_outcome_col_with_lookahead_days_matching_conf(
+        self,
+        dataset: pd.DataFrame,
+    ) -> pd.DataFrame:
+        """Keep only one outcome column with the same lookahead days as set in
+        the config."""
+        outcome_cols = infer_outcome_col_name(df=dataset, allow_multiple=True)
+
+        col_to_drop = [
+            c for c in outcome_cols if str(self.cfg.data.min_lookahead_days) not in c
+        ]
+
+        # If no columns to drop, return the dataset
+        if not col_to_drop:
+            return dataset
+
+        df = dataset.drop(col_to_drop, axis=1)
+
+        if not len(infer_outcome_col_name(df)) == 1:
+            raise ValueError(
+                "Returning more than one outcome column, will cause problems during eval.",
+            )
+
+        return df
+
+    @print_df_dimensions_diff
+    def n_outcome_col_names(self, df: pd.DataFrame) -> int:
+        """How many outcome columns there are in a dataframe."""
+        return len(infer_outcome_col_name(df=df, allow_multiple=True))
diff --git a/src/psycop_model_training/preprocessing/pre_split/col_transformer.py b/src/psycop_model_training/preprocessing/pre_split/col_transformer.py
new file mode 100644
index 00000000..44c164bc
--- /dev/null
+++ b/src/psycop_model_training/preprocessing/pre_split/col_transformer.py
@@ -0,0 +1,54 @@
+import numpy as np
+import pandas as pd
+
+from psycop_model_training.utils.decorators import print_df_dimensions_diff
+from psycop_model_training.utils.utils import infer_predictor_col_name
+
+
+class PreSplitColTransformer:
+
+    @staticmethod
+    @print_df_dimensions_diff
+    def convert_timestamp_dtype_and_nat(dataset: pd.DataFrame) -> pd.DataFrame:
+        """Convert columns with `timestamp` in their name to datetime, and
+        convert 0s to NaT."""
+        timestamp_colnames = [col for col in dataset.columns if "timestamp" in col]
+
+        for colname in timestamp_colnames:
+            if dataset[colname].dtype != "datetime64[ns]":
+                # Convert all 0s in colname to NaT
+                dataset[colname] = dataset[colname].apply(
+                    lambda x: pd.NaT if x == "0" else x,
+                )
+                dataset[colname] = pd.to_datetime(dataset[colname])
+
+        return dataset
+
+    @print_df_dimensions_diff
+    def _convert_boolean_dtypes_to_int(self, dataset: pd.DataFrame) -> pd.DataFrame:
+        """Convert boolean dtypes to int."""
+        for col in dataset.columns:
+            if dataset[col].dtype == bool:
+                dataset[col] = dataset[col].astype(int)
+
+        return dataset
+
+    @print_df_dimensions_diff
+    def _negative_values_to_nan(self, dataset: pd.DataFrame) -> pd.DataFrame:
+        """Convert negative values to NaN."""
+        preds = dataset[infer_predictor_col_name(df=dataset)]
+
+        # Get all columns with negative values
+        cols_with_numerical_values = preds.select_dtypes(include=["number"]).columns
+
+        numerical_columns_with_negative_values = [
+            c for c in cols_with_numerical_values if preds[c].min() < 0
+        ]
+
+        df_to_replace = dataset[numerical_columns_with_negative_values]
+
+        # Convert to NaN
+        df_to_replace[df_to_replace < 0] = np.nan
+        dataset[numerical_columns_with_negative_values] = df_to_replace
+
+        return dataset
diff --git a/src/psycop_model_training/preprocessing/pre_split/row_filterer.py b/src/psycop_model_training/preprocessing/pre_split/row_filterer.py
new file mode 100644
index 00000000..108220b8
--- /dev/null
+++ b/src/psycop_model_training/preprocessing/pre_split/row_filterer.py
@@ -0,0 +1,120 @@
+from datetime import timedelta
+from typing import Union
+
+import pandas as pd
+
+from psycop_model_training.data_loader.data_loader import msg
+from psycop_model_training.utils.decorators import print_df_dimensions_diff
+from psycop_model_training.utils.utils import get_percent_lost
+
+
+class PreSplitRowFilterer:
+    def __init__(self):
+        raise NotImplementedError
+
+    def _drop_rows_if_datasets_ends_within_days(
+        self,
+        n_days: Union[int, float],
+        dataset: pd.DataFrame,
+        direction: str,
+    ) -> pd.DataFrame:
+        """Drop visits that lie within a certain number of days of the end of
+        the dataset.
+
+        Args:
+            n_days (Union[float, int]): Number of days.
+            dataset (pd.DataFrame): Dataset.
+            direction (str): Direction to look. Allowed are ["ahead", "behind"].
+
+        Returns:
+            pd.DataFrame: Dataset with dropped rows.
+        """
+        if not isinstance(n_days, timedelta):
+            n_days_timedelt: timedelta = timedelta(days=n_days)  # type: ignore
+
+        if direction not in ("ahead", "behind"):
+            raise ValueError(f"Direction {direction} not supported.")
+
+        n_rows_before_modification = dataset.shape[0]
+
+        if direction == "ahead":
+            max_datetime = (
+                dataset[self.cfg.data.col_name.pred_timestamp].max() - n_days_timedelt
+            )
+            before_max_dt = (
+                dataset[self.cfg.data.col_name.pred_timestamp] < max_datetime
+            )
+            dataset = dataset[before_max_dt]
+        elif direction == "behind":
+            min_datetime = (
+                dataset[self.cfg.data.col_name.pred_timestamp].min() + n_days_timedelt
+            )
+            after_min_dt = dataset[self.cfg.data.col_name.pred_timestamp] > min_datetime
+            dataset = dataset[after_min_dt]
+
+        n_rows_after_modification = dataset.shape[0]
+        percent_dropped = get_percent_lost(
+            n_before=n_rows_before_modification,
+            n_after=n_rows_after_modification,
+        )
+
+        if n_rows_before_modification - n_rows_after_modification != 0:
+            msg.info(
+                f"Dropped {n_rows_before_modification - n_rows_after_modification} ({percent_dropped}%) rows because the end of the dataset was within {n_days} days of their prediction time when looking {direction} from their prediction time",
+            )
+
+        return dataset
+
+    @print_df_dimensions_diff
+    def _drop_patient_if_excluded(
+        self,
+        dataset: pd.DataFrame,
+    ) -> pd.DataFrame:
+        """Drop patients that have an exclusion event within the washin
+        period."""
+
+        n_rows_before_modification = dataset.shape[0]
+
+        outcome_before_date = (
+            dataset[self.cfg.data.col_name.exclusion_timestamp]
+            < self.cfg.data.drop_patient_if_exclusion_before_date
+        )
+
+        patients_to_drop = set(
+            dataset[self.cfg.data.col_name.id][outcome_before_date].unique(),
+        )
+        dataset = dataset[~dataset[self.cfg.data.col_name.id].isin(patients_to_drop)]
+
+        n_rows_after_modification = dataset.shape[0]
+
+        percent_dropped = get_percent_lost(
+            n_before=n_rows_before_modification,
+            n_after=n_rows_after_modification,
+        )
+
+        if n_rows_before_modification - n_rows_after_modification != 0:
+            msg.info(
+                f"Dropped {n_rows_before_modification - n_rows_after_modification} ({percent_dropped}%) rows because they met exclusion criteria before {self.cfg.data.drop_patient_if_exclusion_before_date}.",
+            )
+        else:
+            msg.info(
+                f"No rows met exclusion criteria before {self.cfg.data.drop_patient_if_exclusion_before_date}. 
Didn't drop any.", + ) + + return dataset + + def _keep_only_if_older_than_min_age(self, dataset: pd.DataFrame) -> pd.DataFrame: + """Keep only rows that are older than the minimum age specified in the + config.""" + return dataset[dataset[self.cfg.data.col_name.age] >= self.cfg.data.min_age] + + @print_df_dimensions_diff + def _drop_rows_after_event_time(self, dataset: pd.DataFrame) -> pd.DataFrame: + """Drop all rows where prediction timestamp is after the outcome.""" + + rows_to_drop = ( + dataset[self.cfg.data.col_name.pred_timestamp] + > dataset[self.cfg.data.col_name.outcome_timestamp] + ) + + return dataset[~rows_to_drop] diff --git a/src/psycop_model_training/training/train_model.py b/src/psycop_model_training/training/train_model.py index 822e58bf..aaffcb5d 100644 --- a/src/psycop_model_training/training/train_model.py +++ b/src/psycop_model_training/training/train_model.py @@ -21,7 +21,7 @@ ) # from psycop_model_training.evaluation import evaluate_model -from psycop_model_training.load import load_train_and_val_from_cfg +from psycop_model_training.data_loader.utils import load_train_and_val_from_cfg from psycop_model_training.model_eval.dataclasses import EvalDataset, PipeMetadata from psycop_model_training.model_eval.evaluate_model import run_full_evaluation from psycop_model_training.preprocessing.post_split.create_pipeline import ( diff --git a/tests/test_load.py b/tests/test_load.py index 3adba1e1..a03a6a34 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -1,7 +1,7 @@ """Testing of loader functions.""" from psycop_model_training.config.schemas import FullConfigSchema -from psycop_model_training.load import load_train_from_cfg +from psycop_model_training.data_loader.utils import load_train_from_cfg def test_load_lookbehind_exceeds_lookbehind_threshold( diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index a5d042b8..ef1767eb 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -1,6 +1,6 @@ """Test custom preprocessing steps.""" from psycop_model_training.config.schemas import FullConfigSchema -from psycop_model_training.load import load_train_from_cfg +from psycop_model_training.data_loader.utils import load_train_from_cfg from psycop_model_training.preprocessing.post_split.create_pipeline import ( create_preprocessing_pipeline, ) From 04dc061032806b87c80908bae5123b99af5f8c4c Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 16 Dec 2022 13:20:21 +0100 Subject: [PATCH 02/47] refactor: misc. 
moving files around
---
 src/application/t2d/train_model.py            | 147 ++++++++++++++
 .../model_eval/evaluate_model.py              |  14 +-
 .../examples/evaluate_model_from_file.py      |   2 +-
 .../model_performance/__init__.py             |   0
 .../model_performance/model_performance.py    |   2 +-
 .../model_performance/utils.py                |   0
 .../plots}/__init__.py                        |   0
 .../plots}/base_charts.py                     |   0
 .../plots}/feature_importance.py              |   2 +-
 .../plots}/performance_by_age.py              |   4 +-
 .../plots}/performance_by_n_hba1c.py          |   4 +-
 .../plots}/performance_over_time.py           |   4 +-
 .../plots}/prob_over_time.py                  |   0
 .../plots}/roc_auc.py                         |   0
 .../plots}/sens_over_time.py                  |   0
 .../plots}/utils.py                           |   0
 .../{train_model.py => train_and_eval.py}     | 179 +-----------------
 src/psycop_model_training/training/utils.py   |  26 +++
 src/psycop_model_training/utils/utils.py      |   2 +-
 .../test_model_performance.py                 |   2 +-
 tests/model_evaluation/test_visualizations.py |  16 +-
 tests/test_train_model.py                     |   2 +-
 22 files changed, 204 insertions(+), 202 deletions(-)
 create mode 100644 src/application/t2d/train_model.py
 rename src/psycop_model_training/{ => model_eval}/model_performance/__init__.py (100%)
 rename src/psycop_model_training/{ => model_eval}/model_performance/model_performance.py (99%)
 rename src/psycop_model_training/{ => model_eval}/model_performance/utils.py (100%)
 rename src/psycop_model_training/{visualization => model_eval/plots}/__init__.py (100%)
 rename src/psycop_model_training/{visualization => model_eval/plots}/base_charts.py (100%)
 rename src/psycop_model_training/{visualization => model_eval/plots}/feature_importance.py (96%)
 rename src/psycop_model_training/{visualization => model_eval/plots}/performance_by_age.py (91%)
 rename src/psycop_model_training/{visualization => model_eval/plots}/performance_by_n_hba1c.py (92%)
 rename src/psycop_model_training/{visualization => model_eval/plots}/performance_over_time.py (98%)
 rename src/psycop_model_training/{visualization => model_eval/plots}/prob_over_time.py (100%)
 rename src/psycop_model_training/{visualization => model_eval/plots}/roc_auc.py (100%)
 rename src/psycop_model_training/{visualization => model_eval/plots}/sens_over_time.py (100%)
 rename src/psycop_model_training/{visualization => model_eval/plots}/utils.py (100%)
 rename src/psycop_model_training/training/{train_model.py => train_and_eval.py} (54%)
 create mode 100644 src/psycop_model_training/training/utils.py

diff --git a/src/application/t2d/train_model.py b/src/application/t2d/train_model.py
new file mode 100644
index 00000000..f5962417
--- /dev/null
+++ b/src/application/t2d/train_model.py
@@ -0,0 +1,147 @@
+import time
+from typing import Any
+
+import hydra
+import numpy as np
+import wandb
+from omegaconf import OmegaConf
+from omegaconf.dictconfig import DictConfig
+from sklearn.metrics import roc_auc_score
+from sklearn.pipeline import Pipeline
+from wasabi import Printer
+
+from psycop_model_training.config.schemas import FullConfigSchema, convert_omegaconf_to_pydantic_object
+from psycop_model_training.data_loader.utils import load_train_and_val_from_cfg
+from psycop_model_training.model_eval.dataclasses import PipeMetadata
+from psycop_model_training.model_eval.evaluate_model import run_full_evaluation
+from psycop_model_training.preprocessing.post_split.create_pipeline import create_preprocessing_pipeline
+from psycop_model_training.training.train_and_eval import create_model, CONFIG_PATH, train_and_get_model_eval_df
+from psycop_model_training.utils.col_name_inference import get_col_names
+from psycop_model_training.utils.utils import flatten_nested_dict, create_wandb_folders, get_feature_importance_dict, \
+    get_selected_features_dict, eval_ds_cfg_pipe_to_disk, PROJECT_ROOT
+
+
+def create_pipeline(cfg):
+    """Create pipeline.
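+
+    Combines the preprocessing pipeline and the model into a single sklearn Pipeline.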
+ + Args: + cfg (DictConfig): Config object + + Returns: + Pipeline + """ + steps = [] + preprocessing_pipe = create_preprocessing_pipeline(cfg) + if len(preprocessing_pipe.steps) != 0: + steps.append(("preprocessing", preprocessing_pipe)) + + mdl = create_model(cfg) + steps.append(("model", mdl)) + return Pipeline(steps) + + +@hydra.main( + config_path=str(CONFIG_PATH), + config_name="default_config", + version_base="1.2", +) +def main(cfg: DictConfig): + """Main function for training a single model.""" + # Save dictconfig for easier logging + if isinstance(cfg, DictConfig): + # Create flattened dict for logging to wandb + # Wandb doesn't allow configs to be nested, so we + # flatten it. + dict_config_to_log: dict[str, Any] = flatten_nested_dict(OmegaConf.to_container(cfg), sep=".") # type: ignore + else: + # For testing, we can take a FullConfig object instead. Simplifies boilerplate. + dict_config_to_log = cfg.__dict__ + + if not isinstance(cfg, FullConfigSchema): + cfg = convert_omegaconf_to_pydantic_object(cfg) + + msg = Printer(timestamp=True) + + create_wandb_folders() + + run = wandb.init( + project=cfg.project.name, + reinit=True, + config=dict_config_to_log, + mode=cfg.project.wandb.mode, + group=cfg.project.wandb.group, + entity=cfg.project.wandb.entity, + ) + + if run is None: + raise ValueError("Failed to initialise Wandb") + + # Add random delay based on cfg.train.random_delay_per_job to avoid + # each job needing the same resources (GPU, disk, network) at the same time + if cfg.train.random_delay_per_job_seconds: + delay = np.random.randint(0, cfg.train.random_delay_per_job_seconds) + msg.info(f"Delaying job by {delay} seconds to avoid resource competition") + time.sleep(delay) + + dataset = load_train_and_val_from_cfg(cfg) + + msg.info("Creating pipeline") + pipe = create_pipeline(cfg) + + outcome_col_name, train_col_names = get_col_names(cfg, dataset.train) + + msg.info("Training model") + eval_ds = train_and_get_model_eval_df( + cfg=cfg, + train=dataset.train, + val=dataset.val, + pipe=pipe, + outcome_col_name=outcome_col_name, + train_col_names=train_col_names, + n_splits=cfg.train.n_splits, + ) + + pipe_metadata = PipeMetadata() + + if hasattr(pipe["model"], "feature_importances_"): + pipe_metadata.feature_importances = get_feature_importance_dict(pipe=pipe) + if hasattr(pipe["preprocessing"].named_steps, "feature_selection"): + pipe_metadata.selected_features = get_selected_features_dict( + pipe=pipe, + train_col_names=train_col_names, + ) + + # Save model predictions, feature importance, and config to disk + eval_ds_cfg_pipe_to_disk( + eval_dataset=eval_ds, + cfg=cfg, + pipe_metadata=pipe_metadata, + run=run, + ) + + if cfg.project.wandb.mode == "run" or cfg.eval.force: + msg.info("Evaluating model.") + + upload_to_wandb = cfg.project.wandb.mode == "run" + + run_full_evaluation( + cfg=cfg, + eval_dataset=eval_ds, + run=run, + pipe_metadata=pipe_metadata, + save_dir=PROJECT_ROOT / "wandb" / "plots" / run.name, + upload_to_wandb=upload_to_wandb, + ) + + roc_auc = roc_auc_score( + eval_ds.y, + eval_ds.y_hat_probs, + ) + + msg.info(f"ROC AUC: {roc_auc}") + run.log( + { + "roc_auc_unweighted": roc_auc, + "lookbehind": max(cfg.data.lookbehind_combination), + "lookahead": cfg.data.min_lookahead_days, + }, + ) + run.finish() + return roc_auc + +if __name__ == "__main__": + main() # pylint: disable=no-value-for-parameter diff --git a/src/psycop_model_training/model_eval/evaluate_model.py b/src/psycop_model_training/model_eval/evaluate_model.py index 765b57de..75483342 100644 
--- a/src/psycop_model_training/model_eval/evaluate_model.py +++ b/src/psycop_model_training/model_eval/evaluate_model.py @@ -22,26 +22,26 @@ generate_selected_features_table, ) from psycop_model_training.utils.utils import positive_rate_to_pred_probs -from psycop_model_training.visualization.feature_importance import ( +from psycop_model_training.model_eval.plots.feature_importance import ( plot_feature_importances, ) -from psycop_model_training.visualization.performance_by_age import ( +from psycop_model_training.model_eval.plots import ( plot_performance_by_age, ) -from psycop_model_training.visualization.performance_by_n_hba1c import ( +from psycop_model_training.model_eval.plots.performance_by_n_hba1c import ( plot_performance_by_n_hba1c, ) -from psycop_model_training.visualization.performance_over_time import ( +from psycop_model_training.model_eval.plots import ( plot_auc_by_time_from_first_visit, plot_metric_by_calendar_time, plot_metric_by_cyclic_time, plot_metric_by_time_until_diagnosis, ) -from psycop_model_training.visualization.roc_auc import plot_auc_roc -from psycop_model_training.visualization.sens_over_time import ( +from psycop_model_training.model_eval.plots import plot_auc_roc +from psycop_model_training.model_eval.plots import ( plot_sensitivity_by_time_to_outcome_heatmap, ) -from psycop_model_training.visualization.utils import log_image_to_wandb +from psycop_model_training.model_eval.plots import log_image_to_wandb def upload_artifacts_to_wandb( diff --git a/src/psycop_model_training/model_eval/examples/evaluate_model_from_file.py b/src/psycop_model_training/model_eval/examples/evaluate_model_from_file.py index cdb0bf72..ae3e432b 100644 --- a/src/psycop_model_training/model_eval/examples/evaluate_model_from_file.py +++ b/src/psycop_model_training/model_eval/examples/evaluate_model_from_file.py @@ -19,7 +19,7 @@ load_evaluation_data, read_pickle, ) -from psycop_model_training.visualization import plot_auc_by_time_from_first_visit +from psycop_model_training.model_eval.plots import plot_auc_by_time_from_first_visit def load_model_predictions_and_cfg(path: Path) -> tuple[pd.DataFrame, DictConfig]: diff --git a/src/psycop_model_training/model_performance/__init__.py b/src/psycop_model_training/model_eval/model_performance/__init__.py similarity index 100% rename from src/psycop_model_training/model_performance/__init__.py rename to src/psycop_model_training/model_eval/model_performance/__init__.py diff --git a/src/psycop_model_training/model_performance/model_performance.py b/src/psycop_model_training/model_eval/model_performance/model_performance.py similarity index 99% rename from src/psycop_model_training/model_performance/model_performance.py rename to src/psycop_model_training/model_eval/model_performance/model_performance.py index 89e922db..e330a0a7 100644 --- a/src/psycop_model_training/model_performance/model_performance.py +++ b/src/psycop_model_training/model_eval/model_performance/model_performance.py @@ -15,7 +15,7 @@ roc_auc_score, ) -from psycop_model_training.model_performance.utils import ( +from psycop_model_training.model_eval.model_performance.utils import ( add_metadata_cols, aggregate_predictions, idx_to_class, diff --git a/src/psycop_model_training/model_performance/utils.py b/src/psycop_model_training/model_eval/model_performance/utils.py similarity index 100% rename from src/psycop_model_training/model_performance/utils.py rename to src/psycop_model_training/model_eval/model_performance/utils.py diff --git 
a/src/psycop_model_training/visualization/__init__.py b/src/psycop_model_training/model_eval/plots/__init__.py similarity index 100% rename from src/psycop_model_training/visualization/__init__.py rename to src/psycop_model_training/model_eval/plots/__init__.py diff --git a/src/psycop_model_training/visualization/base_charts.py b/src/psycop_model_training/model_eval/plots/base_charts.py similarity index 100% rename from src/psycop_model_training/visualization/base_charts.py rename to src/psycop_model_training/model_eval/plots/base_charts.py diff --git a/src/psycop_model_training/visualization/feature_importance.py b/src/psycop_model_training/model_eval/plots/feature_importance.py similarity index 96% rename from src/psycop_model_training/visualization/feature_importance.py rename to src/psycop_model_training/model_eval/plots/feature_importance.py index 78d8b012..64228512 100644 --- a/src/psycop_model_training/visualization/feature_importance.py +++ b/src/psycop_model_training/model_eval/plots/feature_importance.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd -from psycop_model_training.visualization.base_charts import plot_basic_chart +from psycop_model_training.model_eval.plots.base_charts import plot_basic_chart def plot_feature_importances( diff --git a/src/psycop_model_training/visualization/performance_by_age.py b/src/psycop_model_training/model_eval/plots/performance_by_age.py similarity index 91% rename from src/psycop_model_training/visualization/performance_by_age.py rename to src/psycop_model_training/model_eval/plots/performance_by_age.py index a8a74b20..591b9751 100644 --- a/src/psycop_model_training/visualization/performance_by_age.py +++ b/src/psycop_model_training/model_eval/plots/performance_by_age.py @@ -7,8 +7,8 @@ from sklearn.metrics import roc_auc_score from psycop_model_training.model_eval.dataclasses import EvalDataset -from psycop_model_training.visualization.base_charts import plot_basic_chart -from psycop_model_training.visualization.utils import create_performance_by_input +from psycop_model_training.model_eval.plots.base_charts import plot_basic_chart +from psycop_model_training.model_eval.plots.utils import create_performance_by_input def plot_performance_by_age( diff --git a/src/psycop_model_training/visualization/performance_by_n_hba1c.py b/src/psycop_model_training/model_eval/plots/performance_by_n_hba1c.py similarity index 92% rename from src/psycop_model_training/visualization/performance_by_n_hba1c.py rename to src/psycop_model_training/model_eval/plots/performance_by_n_hba1c.py index f9b54cfd..9a29d775 100644 --- a/src/psycop_model_training/visualization/performance_by_n_hba1c.py +++ b/src/psycop_model_training/model_eval/plots/performance_by_n_hba1c.py @@ -7,8 +7,8 @@ from sklearn.metrics import roc_auc_score from psycop_model_training.model_eval.dataclasses import EvalDataset -from psycop_model_training.visualization.base_charts import plot_basic_chart -from psycop_model_training.visualization.utils import create_performance_by_input +from psycop_model_training.model_eval.plots.base_charts import plot_basic_chart +from psycop_model_training.model_eval.plots.utils import create_performance_by_input def plot_performance_by_n_hba1c( diff --git a/src/psycop_model_training/visualization/performance_over_time.py b/src/psycop_model_training/model_eval/plots/performance_over_time.py similarity index 98% rename from src/psycop_model_training/visualization/performance_over_time.py rename to 
src/psycop_model_training/model_eval/plots/performance_over_time.py index 7cdff65e..bd3c054c 100644 --- a/src/psycop_model_training/visualization/performance_over_time.py +++ b/src/psycop_model_training/model_eval/plots/performance_over_time.py @@ -14,8 +14,8 @@ from psycop_model_training.model_eval.dataclasses import EvalDataset from psycop_model_training.utils.utils import bin_continuous_data, round_floats_to_edge -from psycop_model_training.visualization.base_charts import plot_basic_chart -from psycop_model_training.visualization.utils import calc_performance +from psycop_model_training.model_eval.plots.base_charts import plot_basic_chart +from psycop_model_training.model_eval.plots.utils import calc_performance def create_performance_by_calendar_time_df( diff --git a/src/psycop_model_training/visualization/prob_over_time.py b/src/psycop_model_training/model_eval/plots/prob_over_time.py similarity index 100% rename from src/psycop_model_training/visualization/prob_over_time.py rename to src/psycop_model_training/model_eval/plots/prob_over_time.py diff --git a/src/psycop_model_training/visualization/roc_auc.py b/src/psycop_model_training/model_eval/plots/roc_auc.py similarity index 100% rename from src/psycop_model_training/visualization/roc_auc.py rename to src/psycop_model_training/model_eval/plots/roc_auc.py diff --git a/src/psycop_model_training/visualization/sens_over_time.py b/src/psycop_model_training/model_eval/plots/sens_over_time.py similarity index 100% rename from src/psycop_model_training/visualization/sens_over_time.py rename to src/psycop_model_training/model_eval/plots/sens_over_time.py diff --git a/src/psycop_model_training/visualization/utils.py b/src/psycop_model_training/model_eval/plots/utils.py similarity index 100% rename from src/psycop_model_training/visualization/utils.py rename to src/psycop_model_training/model_eval/plots/utils.py diff --git a/src/psycop_model_training/training/train_model.py b/src/psycop_model_training/training/train_and_eval.py similarity index 54% rename from src/psycop_model_training/training/train_model.py rename to src/psycop_model_training/training/train_and_eval.py index aaffcb5d..00ec5e26 100644 --- a/src/psycop_model_training/training/train_model.py +++ b/src/psycop_model_training/training/train_and_eval.py @@ -1,15 +1,10 @@ """Training script for training a single model for predicting t2d.""" import os -import time from collections.abc import Iterable -from typing import Any, Optional +from typing import Optional -import hydra import numpy as np import pandas as pd -import wandb -from omegaconf import OmegaConf -from omegaconf.dictconfig import DictConfig from sklearn.metrics import roc_auc_score from sklearn.model_selection import StratifiedGroupKFold from sklearn.pipeline import Pipeline @@ -17,25 +12,14 @@ from psycop_model_training.config.schemas import ( FullConfigSchema, - convert_omegaconf_to_pydantic_object, ) # from psycop_model_training.evaluation import evaluate_model -from psycop_model_training.data_loader.utils import load_train_and_val_from_cfg -from psycop_model_training.model_eval.dataclasses import EvalDataset, PipeMetadata -from psycop_model_training.model_eval.evaluate_model import run_full_evaluation -from psycop_model_training.preprocessing.post_split.create_pipeline import ( - create_preprocessing_pipeline, -) +from psycop_model_training.model_eval.dataclasses import EvalDataset from psycop_model_training.training.model_specs import MODELS -from psycop_model_training.utils.col_name_inference import 
get_col_names +from psycop_model_training.training.utils import create_eval_dataset from psycop_model_training.utils.utils import ( PROJECT_ROOT, - create_wandb_folders, - eval_ds_cfg_pipe_to_disk, - flatten_nested_dict, - get_feature_importance_dict, - get_selected_features_dict, ) CONFIG_PATH = PROJECT_ROOT / "src" / "psycop_model_training" / "config" @@ -107,28 +91,6 @@ def stratified_cross_validation( # pylint: disable=too-many-locals return train_df -def create_eval_dataset(cfg: FullConfigSchema, outcome_col_name: str, df: pd.DataFrame): - """Create an evaluation dataset object from a dataframe and - FullConfigSchema.""" - - eval_dataset = EvalDataset( - ids=df[cfg.data.col_name.id], - y=df[outcome_col_name], - y_hat_probs=df["y_hat_prob"], - y_hat_int=df["y_hat_prob"].round(), - pred_timestamps=df[cfg.data.col_name.pred_timestamp], - outcome_timestamps=df[cfg.data.col_name.outcome_timestamp], - age=df[cfg.data.col_name.age], - exclusion_timestamps=df[cfg.data.col_name.exclusion_timestamp], - ) - - if cfg.data.col_name.custom: - if cfg.data.col_name.custom.n_hba1c: - eval_dataset.custom.n_hba1c = df[cfg.data.col_name.custom.n_hba1c] - - return eval_dataset - - def train_and_eval_on_crossvalidation( cfg: FullConfigSchema, train: pd.DataFrame, @@ -261,137 +223,4 @@ def train_and_get_model_eval_df( n_splits=n_splits, ) - return eval_dataset - - -def create_pipeline(cfg): - """Create pipeline. - - Args: - cfg (DictConfig): Config object - - Returns: - Pipeline - """ - steps = [] - preprocessing_pipe = create_preprocessing_pipeline(cfg) - if len(preprocessing_pipe.steps) != 0: - steps.append(("preprocessing", preprocessing_pipe)) - - mdl = create_model(cfg) - steps.append(("model", mdl)) - return Pipeline(steps) - - -@hydra.main( - config_path=str(CONFIG_PATH), - config_name="default_config", - version_base="1.2", -) -def main(cfg: DictConfig): - """Main function for training a single model.""" - # Save dictconfig for easier logging - if isinstance(cfg, DictConfig): - # Create flattened dict for logging to wandb - # Wandb doesn't allow configs to be nested, so we - # flatten it. - dict_config_to_log: dict[str, Any] = flatten_nested_dict(OmegaConf.to_container(cfg), sep=".") # type: ignore - else: - # For testing, we can take a FullConfig object instead. Simplifies boilerplate. 
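`flatten_nested_dict`, imported in the `main` being deleted here, exists because wandb cannot log nested configs. A hedged sketch of what such a helper might look like (an illustration, not the repo's exact implementation):

```python
from typing import Any


def flatten_nested_dict(d: dict[str, Any], sep: str = ".", parent_key: str = "") -> dict[str, Any]:
    """Recursively flatten {'a': {'b': 1}} into {'a.b': 1} for wandb logging."""
    items: dict[str, Any] = {}
    for key, value in d.items():
        new_key = f"{parent_key}{sep}{key}" if parent_key else key
        if isinstance(value, dict):
            # Recurse into nested dicts, carrying the dotted prefix along.
            items.update(flatten_nested_dict(value, sep=sep, parent_key=new_key))
        else:
            items[new_key] = value
    return items


assert flatten_nested_dict({"project": {"wandb": {"mode": "run"}}}) == {"project.wandb.mode": "run"}
```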
- dict_config_to_log = cfg.__dict__ - - if not isinstance(cfg, FullConfigSchema): - cfg = convert_omegaconf_to_pydantic_object(cfg) - - msg = Printer(timestamp=True) - - create_wandb_folders() - - run = wandb.init( - project=cfg.project.name, - reinit=True, - config=dict_config_to_log, - mode=cfg.project.wandb.mode, - group=cfg.project.wandb.group, - entity=cfg.project.wandb.entity, - ) - - if run is None: - raise ValueError("Failed to initialise Wandb") - - # Add random delay based on cfg.train.random_delay_per_job to avoid - # each job needing the same resources (GPU, disk, network) at the same time - if cfg.train.random_delay_per_job_seconds: - delay = np.random.randint(0, cfg.train.random_delay_per_job_seconds) - msg.info(f"Delaying job by {delay} seconds to avoid resource competition") - time.sleep(delay) - - dataset = load_train_and_val_from_cfg(cfg) - - msg.info("Creating pipeline") - pipe = create_pipeline(cfg) - - outcome_col_name, train_col_names = get_col_names(cfg, dataset.train) - - msg.info("Training model") - eval_ds = train_and_get_model_eval_df( - cfg=cfg, - train=dataset.train, - val=dataset.val, - pipe=pipe, - outcome_col_name=outcome_col_name, - train_col_names=train_col_names, - n_splits=cfg.train.n_splits, - ) - - pipe_metadata = PipeMetadata() - - if hasattr(pipe["model"], "feature_importances_"): - pipe_metadata.feature_importances = get_feature_importance_dict(pipe=pipe) - if hasattr(pipe["preprocessing"].named_steps, "feature_selection"): - pipe_metadata.selected_features = get_selected_features_dict( - pipe=pipe, - train_col_names=train_col_names, - ) - - # Save model predictions, feature importance, and config to disk - eval_ds_cfg_pipe_to_disk( - eval_dataset=eval_ds, - cfg=cfg, - pipe_metadata=pipe_metadata, - run=run, - ) - - if cfg.project.wandb.mode == "run" or cfg.eval.force: - msg.info("Evaluating model.") - - upload_to_wandb = cfg.project.wandb.mode == "run" - - run_full_evaluation( - cfg=cfg, - eval_dataset=eval_ds, - run=run, - pipe_metadata=pipe_metadata, - save_dir=PROJECT_ROOT / "wandb" / "plots" / run.name, - upload_to_wandb=upload_to_wandb, - ) - - roc_auc = roc_auc_score( - eval_ds.y, - eval_ds.y_hat_probs, - ) - - msg.info(f"ROC AUC: {roc_auc}") - run.log( - { - "roc_auc_unweighted": roc_auc, - "lookbehind": max(cfg.data.lookbehind_combination), - "lookahead": cfg.data.min_lookahead_days, - }, - ) - run.finish() - return roc_auc - - -if __name__ == "__main__": - main() # pylint: disable=no-value-for-parameter + return eval_dataset \ No newline at end of file diff --git a/src/psycop_model_training/training/utils.py b/src/psycop_model_training/training/utils.py new file mode 100644 index 00000000..6c9b1aa4 --- /dev/null +++ b/src/psycop_model_training/training/utils.py @@ -0,0 +1,26 @@ +import pandas as pd + +from psycop_model_training.config.schemas import FullConfigSchema +from psycop_model_training.model_eval.dataclasses import EvalDataset + + +def create_eval_dataset(cfg: FullConfigSchema, outcome_col_name: str, df: pd.DataFrame): + """Create an evaluation dataset object from a dataframe and + FullConfigSchema.""" + + eval_dataset = EvalDataset( + ids=df[cfg.data.col_name.id], + y=df[outcome_col_name], + y_hat_probs=df["y_hat_prob"], + y_hat_int=df["y_hat_prob"].round(), + pred_timestamps=df[cfg.data.col_name.pred_timestamp], + outcome_timestamps=df[cfg.data.col_name.outcome_timestamp], + age=df[cfg.data.col_name.age], + exclusion_timestamps=df[cfg.data.col_name.exclusion_timestamp], + ) + + if cfg.data.col_name.custom: + if 
cfg.data.col_name.custom.n_hba1c: + eval_dataset.custom.n_hba1c = df[cfg.data.col_name.custom.n_hba1c] + + return eval_dataset diff --git a/src/psycop_model_training/utils/utils.py b/src/psycop_model_training/utils/utils.py index ab57d871..6c5ce72d 100644 --- a/src/psycop_model_training/utils/utils.py +++ b/src/psycop_model_training/utils/utils.py @@ -23,7 +23,7 @@ ModelEvalData, PipeMetadata, ) -from psycop_model_training.model_performance import ModelPerformance +from psycop_model_training.model_eval.model_performance import ModelPerformance SHARED_RESOURCES_PATH = Path(r"E:\shared_resources") FEATURE_SETS_PATH = SHARED_RESOURCES_PATH / "feature_sets" diff --git a/tests/model_evaluation/test_model_performance.py b/tests/model_evaluation/test_model_performance.py index f1cc00a4..b47d1272 100644 --- a/tests/model_evaluation/test_model_performance.py +++ b/tests/model_evaluation/test_model_performance.py @@ -6,7 +6,7 @@ import pandas as pd import pytest -from psycop_model_training.model_performance import ModelPerformance +from psycop_model_training.model_eval.model_performance import ModelPerformance # pylint: disable=missing-function-docstring diff --git a/tests/model_evaluation/test_visualizations.py b/tests/model_evaluation/test_visualizations.py index cd52e195..c761d391 100644 --- a/tests/model_evaluation/test_visualizations.py +++ b/tests/model_evaluation/test_visualizations.py @@ -12,25 +12,25 @@ from psycop_model_training.model_eval.dataclasses import EvalDataset from psycop_model_training.utils.utils import PROJECT_ROOT, positive_rate_to_pred_probs -from psycop_model_training.visualization import plot_prob_over_time -from psycop_model_training.visualization.base_charts import plot_basic_chart -from psycop_model_training.visualization.feature_importance import ( +from psycop_model_training.model_eval.plots import plot_prob_over_time +from psycop_model_training.model_eval.plots import plot_basic_chart +from psycop_model_training.model_eval.plots.feature_importance import ( plot_feature_importances, ) -from psycop_model_training.visualization.performance_by_age import ( +from psycop_model_training.model_eval.plots import ( plot_performance_by_age, ) -from psycop_model_training.visualization.performance_by_n_hba1c import ( +from psycop_model_training.model_eval.plots.performance_by_n_hba1c import ( plot_performance_by_n_hba1c, ) -from psycop_model_training.visualization.performance_over_time import ( +from psycop_model_training.model_eval.plots import ( plot_auc_by_time_from_first_visit, plot_metric_by_calendar_time, plot_metric_by_cyclic_time, plot_metric_by_time_until_diagnosis, ) -from psycop_model_training.visualization.roc_auc import plot_auc_roc -from psycop_model_training.visualization.sens_over_time import ( +from psycop_model_training.model_eval.plots import plot_auc_roc +from psycop_model_training.model_eval.plots import ( create_sensitivity_by_time_to_outcome_df, plot_sensitivity_by_time_to_outcome_heatmap, ) diff --git a/tests/test_train_model.py b/tests/test_train_model.py index 7e4152a5..fec6acac 100644 --- a/tests/test_train_model.py +++ b/tests/test_train_model.py @@ -5,7 +5,7 @@ from psycop_model_training.config.schemas import FullConfigSchema, load_cfg_as_omegaconf from psycop_model_training.training.model_specs import MODELS -from psycop_model_training.training.train_model import main +from application.t2d.train_model import main INTEGRATION_TEST_FILE_NAME = "integration_config.yaml" From fd5c351e308bdc9263f882793a1689fce27d283e Mon Sep 17 00:00:00 2001 From: Martin 
Bernstorff Date: Fri, 16 Dec 2022 13:26:14 +0100 Subject: [PATCH 03/47] refactor: moving files around --- {src/application => application}/__init__.py | 0 .../t2d => application/config}/__init__.py | 0 .../config/data/synth_data.yaml | 0 .../config/data/t2d_parquet.yaml | 0 .../config/default_config.yaml | 0 .../config/eval/default_evaluation.yaml | 0 .../config/eval/evaluation_synth.yaml | 0 .../config/integration_config.yaml | 0 .../config/model/ebm.yaml | 0 .../config/model/logistic-regression.yaml | 0 .../config/model/naive-bayes.yaml | 0 .../config/model/xgboost.yaml | 0 .../preprocessing/default_preprocessing.yaml | 0 .../config/project/default_project.yaml | 0 .../project/integration_test_project.yaml | 0 .../config/project/overtaci_test_project.yaml | 0 .../config/sweeper/optuna_multithread.yaml | 0 .../config/sweeper/optuna_singlethread.yaml | 0 .../config/train/default_training.yaml | 0 application/{t2d => }/inspect_dataset.py | 2 +- .../t2d => application}/loaders/__init__.py | 0 .../loaders/preprocessing_loaders.py | 0 {src/application/t2d => application}/main.py | 2 +- .../00_generate_dfs/10_medication.rmd | 0 .../00_generate_dfs/20_hba1c.rmd | 0 .../00_generate_dfs/30_diagnoses.rmd | 0 .../00_generate_dfs/41_find_first_p.rmd | 0 .../00_generate_dfs/49_combined.rmd | 0 .../10_descriptive_stats.rmd | 0 .../functions.r | 0 .../t2d-plots.r | 0 .../t2d.Rproj | 0 .../tests/tests.r | 0 application/t2d/train_and_log_models.py | 238 ------------------ .../t2d => application}/train_model.py | 28 ++- src/application/t2d/inspect_dataset.py | 17 -- .../00_generate_dfs/10_medication.rmd | 58 ----- .../00_generate_dfs/20_hba1c.rmd | 33 --- .../00_generate_dfs/30_diagnoses.rmd | 67 ----- .../00_generate_dfs/41_find_first_p.rmd | 52 ---- .../00_generate_dfs/49_combined.rmd | 53 ---- .../10_descriptive_stats.rmd | 139 ---------- .../functions.r | 49 ---- .../t2d-plots.r | 223 ---------------- .../t2d.Rproj | 13 - .../tests/tests.r | 49 ---- src/psycop_model_training/archive/main.py | 2 +- .../archive/model_training_watcher.py | 2 +- src/psycop_model_training/config/__init__.py | 0 .../data_loader/data_loader.py | 89 ++++--- .../data_loader/utils.py | 2 +- .../model_eval/dataclasses.py | 2 +- .../model_eval/evaluate_model.py | 36 ++- .../examples/evaluate_model_from_file.py | 2 +- .../model_eval/plots/performance_over_time.py | 2 +- .../post_split/create_pipeline.py | 6 +- .../preprocessing/pre_split/col_filterer.py | 9 +- .../pre_split/col_transformer.py | 3 +- .../preprocessing/pre_split/row_filterer.py | 2 +- .../training/train_and_eval.py | 10 +- src/psycop_model_training/training/utils.py | 2 +- .../schemas.py => utils/config_schemas.py} | 2 +- tests/conftest.py | 4 +- tests/model_evaluation/test_visualizations.py | 28 +-- tests/test_configs.py | 4 +- tests/test_load.py | 2 +- tests/test_preprocessing.py | 2 +- tests/test_train_model.py | 4 +- 68 files changed, 127 insertions(+), 1111 deletions(-) rename {src/application => application}/__init__.py (100%) rename {src/application/t2d => application/config}/__init__.py (100%) rename {src/psycop_model_training => application}/config/data/synth_data.yaml (100%) rename {src/psycop_model_training => application}/config/data/t2d_parquet.yaml (100%) rename {src/psycop_model_training => application}/config/default_config.yaml (100%) rename {src/psycop_model_training => application}/config/eval/default_evaluation.yaml (100%) rename {src/psycop_model_training => application}/config/eval/evaluation_synth.yaml (100%) rename {src/psycop_model_training 
=> application}/config/integration_config.yaml (100%) rename {src/psycop_model_training => application}/config/model/ebm.yaml (100%) rename {src/psycop_model_training => application}/config/model/logistic-regression.yaml (100%) rename {src/psycop_model_training => application}/config/model/naive-bayes.yaml (100%) rename {src/psycop_model_training => application}/config/model/xgboost.yaml (100%) rename {src/psycop_model_training => application}/config/preprocessing/default_preprocessing.yaml (100%) rename {src/psycop_model_training => application}/config/project/default_project.yaml (100%) rename {src/psycop_model_training => application}/config/project/integration_test_project.yaml (100%) rename {src/psycop_model_training => application}/config/project/overtaci_test_project.yaml (100%) rename {src/psycop_model_training => application}/config/sweeper/optuna_multithread.yaml (100%) rename {src/psycop_model_training => application}/config/sweeper/optuna_singlethread.yaml (100%) rename {src/psycop_model_training => application}/config/train/default_training.yaml (100%) rename application/{t2d => }/inspect_dataset.py (86%) rename {src/application/t2d => application}/loaders/__init__.py (100%) rename {src/application/t2d => application}/loaders/preprocessing_loaders.py (100%) rename {src/application/t2d => application}/main.py (99%) rename application/{t2d => }/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/10_medication.rmd (100%) rename application/{t2d => }/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/20_hba1c.rmd (100%) rename application/{t2d => }/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/30_diagnoses.rmd (100%) rename application/{t2d => }/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/41_find_first_p.rmd (100%) rename application/{t2d => }/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/49_combined.rmd (100%) rename application/{t2d => }/outcome_specification - move to t2d-feature-gen-repo/10_descriptive_stats.rmd (100%) rename application/{t2d => }/outcome_specification - move to t2d-feature-gen-repo/functions.r (100%) rename application/{t2d => }/outcome_specification - move to t2d-feature-gen-repo/t2d-plots.r (100%) rename application/{t2d => }/outcome_specification - move to t2d-feature-gen-repo/t2d.Rproj (100%) rename application/{t2d => }/outcome_specification - move to t2d-feature-gen-repo/tests/tests.r (100%) delete mode 100644 application/t2d/train_and_log_models.py rename {src/application/t2d => application}/train_model.py (89%) delete mode 100644 src/application/t2d/inspect_dataset.py delete mode 100644 src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/10_medication.rmd delete mode 100644 src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/20_hba1c.rmd delete mode 100644 src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/30_diagnoses.rmd delete mode 100644 src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/41_find_first_p.rmd delete mode 100644 src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/49_combined.rmd delete mode 100644 src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/10_descriptive_stats.rmd delete mode 100644 src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/functions.r delete mode 100644 
src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/t2d-plots.r delete mode 100644 src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/t2d.Rproj delete mode 100644 src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/tests/tests.r delete mode 100644 src/psycop_model_training/config/__init__.py rename src/psycop_model_training/{config/schemas.py => utils/config_schemas.py} (99%) diff --git a/src/application/__init__.py b/application/__init__.py similarity index 100% rename from src/application/__init__.py rename to application/__init__.py diff --git a/src/application/t2d/__init__.py b/application/config/__init__.py similarity index 100% rename from src/application/t2d/__init__.py rename to application/config/__init__.py diff --git a/src/psycop_model_training/config/data/synth_data.yaml b/application/config/data/synth_data.yaml similarity index 100% rename from src/psycop_model_training/config/data/synth_data.yaml rename to application/config/data/synth_data.yaml diff --git a/src/psycop_model_training/config/data/t2d_parquet.yaml b/application/config/data/t2d_parquet.yaml similarity index 100% rename from src/psycop_model_training/config/data/t2d_parquet.yaml rename to application/config/data/t2d_parquet.yaml diff --git a/src/psycop_model_training/config/default_config.yaml b/application/config/default_config.yaml similarity index 100% rename from src/psycop_model_training/config/default_config.yaml rename to application/config/default_config.yaml diff --git a/src/psycop_model_training/config/eval/default_evaluation.yaml b/application/config/eval/default_evaluation.yaml similarity index 100% rename from src/psycop_model_training/config/eval/default_evaluation.yaml rename to application/config/eval/default_evaluation.yaml diff --git a/src/psycop_model_training/config/eval/evaluation_synth.yaml b/application/config/eval/evaluation_synth.yaml similarity index 100% rename from src/psycop_model_training/config/eval/evaluation_synth.yaml rename to application/config/eval/evaluation_synth.yaml diff --git a/src/psycop_model_training/config/integration_config.yaml b/application/config/integration_config.yaml similarity index 100% rename from src/psycop_model_training/config/integration_config.yaml rename to application/config/integration_config.yaml diff --git a/src/psycop_model_training/config/model/ebm.yaml b/application/config/model/ebm.yaml similarity index 100% rename from src/psycop_model_training/config/model/ebm.yaml rename to application/config/model/ebm.yaml diff --git a/src/psycop_model_training/config/model/logistic-regression.yaml b/application/config/model/logistic-regression.yaml similarity index 100% rename from src/psycop_model_training/config/model/logistic-regression.yaml rename to application/config/model/logistic-regression.yaml diff --git a/src/psycop_model_training/config/model/naive-bayes.yaml b/application/config/model/naive-bayes.yaml similarity index 100% rename from src/psycop_model_training/config/model/naive-bayes.yaml rename to application/config/model/naive-bayes.yaml diff --git a/src/psycop_model_training/config/model/xgboost.yaml b/application/config/model/xgboost.yaml similarity index 100% rename from src/psycop_model_training/config/model/xgboost.yaml rename to application/config/model/xgboost.yaml diff --git a/src/psycop_model_training/config/preprocessing/default_preprocessing.yaml b/application/config/preprocessing/default_preprocessing.yaml similarity index 100% rename from 
src/psycop_model_training/config/preprocessing/default_preprocessing.yaml rename to application/config/preprocessing/default_preprocessing.yaml diff --git a/src/psycop_model_training/config/project/default_project.yaml b/application/config/project/default_project.yaml similarity index 100% rename from src/psycop_model_training/config/project/default_project.yaml rename to application/config/project/default_project.yaml diff --git a/src/psycop_model_training/config/project/integration_test_project.yaml b/application/config/project/integration_test_project.yaml similarity index 100% rename from src/psycop_model_training/config/project/integration_test_project.yaml rename to application/config/project/integration_test_project.yaml diff --git a/src/psycop_model_training/config/project/overtaci_test_project.yaml b/application/config/project/overtaci_test_project.yaml similarity index 100% rename from src/psycop_model_training/config/project/overtaci_test_project.yaml rename to application/config/project/overtaci_test_project.yaml diff --git a/src/psycop_model_training/config/sweeper/optuna_multithread.yaml b/application/config/sweeper/optuna_multithread.yaml similarity index 100% rename from src/psycop_model_training/config/sweeper/optuna_multithread.yaml rename to application/config/sweeper/optuna_multithread.yaml diff --git a/src/psycop_model_training/config/sweeper/optuna_singlethread.yaml b/application/config/sweeper/optuna_singlethread.yaml similarity index 100% rename from src/psycop_model_training/config/sweeper/optuna_singlethread.yaml rename to application/config/sweeper/optuna_singlethread.yaml diff --git a/src/psycop_model_training/config/train/default_training.yaml b/application/config/train/default_training.yaml similarity index 100% rename from src/psycop_model_training/config/train/default_training.yaml rename to application/config/train/default_training.yaml diff --git a/application/t2d/inspect_dataset.py b/application/inspect_dataset.py similarity index 86% rename from application/t2d/inspect_dataset.py rename to application/inspect_dataset.py index 53cc6fe3..6e504cdb 100644 --- a/application/t2d/inspect_dataset.py +++ b/application/inspect_dataset.py @@ -1,5 +1,5 @@ """Example of how to inspect a dataset using the configs.""" -from psycop_model_training.config.schemas import load_cfg_as_pydantic +from psycop_model_training.utils.config_schemas import load_cfg_as_pydantic from psycop_model_training.data_loader.utils import load_train_from_cfg, load_train_raw diff --git a/src/application/t2d/loaders/__init__.py b/application/loaders/__init__.py similarity index 100% rename from src/application/t2d/loaders/__init__.py rename to application/loaders/__init__.py diff --git a/src/application/t2d/loaders/preprocessing_loaders.py b/application/loaders/preprocessing_loaders.py similarity index 100% rename from src/application/t2d/loaders/preprocessing_loaders.py rename to application/loaders/preprocessing_loaders.py diff --git a/src/application/t2d/main.py b/application/main.py similarity index 99% rename from src/application/t2d/main.py rename to application/main.py index d7c9d99b..9657b156 100644 --- a/src/application/t2d/main.py +++ b/application/main.py @@ -15,7 +15,7 @@ from random_word import RandomWords from wasabi import Printer -from psycop_model_training.config.schemas import ( +from psycop_model_training.utils.config_schemas import ( BaseModel, FullConfigSchema, load_cfg_as_pydantic, diff --git a/application/t2d/outcome_specification - move to 
t2d-feature-gen-repo/00_generate_dfs/10_medication.rmd b/application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/10_medication.rmd similarity index 100% rename from application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/10_medication.rmd rename to application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/10_medication.rmd diff --git a/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/20_hba1c.rmd b/application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/20_hba1c.rmd similarity index 100% rename from application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/20_hba1c.rmd rename to application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/20_hba1c.rmd diff --git a/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/30_diagnoses.rmd b/application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/30_diagnoses.rmd similarity index 100% rename from application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/30_diagnoses.rmd rename to application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/30_diagnoses.rmd diff --git a/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/41_find_first_p.rmd b/application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/41_find_first_p.rmd similarity index 100% rename from application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/41_find_first_p.rmd rename to application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/41_find_first_p.rmd diff --git a/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/49_combined.rmd b/application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/49_combined.rmd similarity index 100% rename from application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/49_combined.rmd rename to application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/49_combined.rmd diff --git a/application/t2d/outcome_specification - move to t2d-feature-gen-repo/10_descriptive_stats.rmd b/application/outcome_specification - move to t2d-feature-gen-repo/10_descriptive_stats.rmd similarity index 100% rename from application/t2d/outcome_specification - move to t2d-feature-gen-repo/10_descriptive_stats.rmd rename to application/outcome_specification - move to t2d-feature-gen-repo/10_descriptive_stats.rmd diff --git a/application/t2d/outcome_specification - move to t2d-feature-gen-repo/functions.r b/application/outcome_specification - move to t2d-feature-gen-repo/functions.r similarity index 100% rename from application/t2d/outcome_specification - move to t2d-feature-gen-repo/functions.r rename to application/outcome_specification - move to t2d-feature-gen-repo/functions.r diff --git a/application/t2d/outcome_specification - move to t2d-feature-gen-repo/t2d-plots.r b/application/outcome_specification - move to t2d-feature-gen-repo/t2d-plots.r similarity index 100% rename from application/t2d/outcome_specification - move to t2d-feature-gen-repo/t2d-plots.r rename to application/outcome_specification - move to t2d-feature-gen-repo/t2d-plots.r diff --git a/application/t2d/outcome_specification - move to t2d-feature-gen-repo/t2d.Rproj b/application/outcome_specification - move to 
t2d-feature-gen-repo/t2d.Rproj similarity index 100% rename from application/t2d/outcome_specification - move to t2d-feature-gen-repo/t2d.Rproj rename to application/outcome_specification - move to t2d-feature-gen-repo/t2d.Rproj diff --git a/application/t2d/outcome_specification - move to t2d-feature-gen-repo/tests/tests.r b/application/outcome_specification - move to t2d-feature-gen-repo/tests/tests.r similarity index 100% rename from application/t2d/outcome_specification - move to t2d-feature-gen-repo/tests/tests.r rename to application/outcome_specification - move to t2d-feature-gen-repo/tests/tests.r diff --git a/application/t2d/train_and_log_models.py b/application/t2d/train_and_log_models.py deleted file mode 100644 index d7c9d99b..00000000 --- a/application/t2d/train_and_log_models.py +++ /dev/null @@ -1,238 +0,0 @@ -"""Example script to train multiple models and subsequently log the results to -wandb. - -Usage: -- Replace the HYDRA_ARGS string with the desired arguments for `train_model.py` -- Run this script from project root with `python src/psycop_model_training/train_and_log_models.py -""" -import random -import subprocess -import time -from typing import Optional - -import pandas as pd -import wandb -from random_word import RandomWords -from wasabi import Printer - -from psycop_model_training.config.schemas import ( - BaseModel, - FullConfigSchema, - load_cfg_as_pydantic, -) -from psycop_model_training.data_loader.utils import load_train_raw -from psycop_model_training.model_eval.evaluate_model import ( - infer_look_distance, - infer_outcome_col_name, -) - - -def start_trainer( - cfg: FullConfigSchema, - config_file_name: str, - lookahead_days: int, - wandb_group_override: str, - model_name: str, -) -> subprocess.Popen: - """Start a trainer.""" - msg = Printer(timestamp=True) - - subprocess_args: list[str] = [ - "python", - "src/psycop_model_training/train_model.py", - f"project.wandb.group='{wandb_group_override}'", - f"project.wandb.mode={cfg.project.wandb.mode}", - f"hydra.sweeper.n_trials={cfg.train.n_trials_per_lookahead}", - f"hydra.sweeper.n_jobs={cfg.train.n_jobs_per_trainer}", - f"model={model_name}", - f"data.min_lookahead_days={lookahead_days}", - "--config-name", - f"{config_file_name}", - ] - - if cfg.train.n_trials_per_lookahead > 1: - subprocess_args.insert(2, "--multirun") - - if model_name == "xgboost": - subprocess_args.insert(3, "++model.args.tree_method='gpu_hist'") - - msg.info(f'{" ".join(subprocess_args)}') - - return subprocess.Popen( # pylint: disable=consider-using-with - args=subprocess_args, - ) - - -class TrainerSpec(BaseModel): - """Specification for starting a trainer. - - Provides overrides for the config file. 
- """ - - lookahead_days: int - model_name: str - - -def combine_lookaheads_and_model_names_to_trainer_specs( - cfg: FullConfigSchema, - possible_lookahead_days: list[int], - model_names: Optional[list[str]] = None, -): - """Generate trainer specs for all combinations of lookaheads and model - names.""" - msg = Printer(timestamp=True) - - random.shuffle(possible_lookahead_days) - - if model_names: - msg.warn( - "model_names was specified in train_models_for_each_cell_in_grid, overriding cfg.model.name", - ) - - model_name_queue = model_names if model_names else cfg.model.name - - # Create all combinations of lookahead_days and models - trainer_combinations_queue = [ - TrainerSpec(lookahead_days=lookahead_days, model_name=model_name) - for lookahead_days in possible_lookahead_days.copy() - for model_name in model_name_queue - ] - - return trainer_combinations_queue - - -def train_models_for_each_cell_in_grid( - cfg: FullConfigSchema, - possible_lookahead_days: list[int], - config_file_name: str, - wandb_prefix: str, - model_names: Optional[list[str]] = None, -): - """Train a model for each cell in the grid of possible look distances.""" - active_trainers: list[subprocess.Popen] = [] - - trainer_combinations_queue = combine_lookaheads_and_model_names_to_trainer_specs( - cfg=cfg, - possible_lookahead_days=possible_lookahead_days, - model_names=model_names, - ) - - while trainer_combinations_queue or active_trainers: - # Wait until there is a free slot in the trainers group - if ( - len(active_trainers) >= cfg.train.n_active_trainers - or len(trainer_combinations_queue) == 0 - ): - # Drop trainers if they have finished - # If finished, t.poll() is not None - active_trainers = [t for t in active_trainers if t.poll() is None] - time.sleep(1) - continue - - # Start a new trainer - trainer_spec = trainer_combinations_queue.pop() - - msg = Printer(timestamp=True) - msg.info( - f"Spawning a new trainer with lookahead={trainer_spec.lookahead_days} days", - ) - wandb_group = f"{wandb_prefix}" - - active_trainers.append( - start_trainer( - cfg=cfg, - config_file_name=config_file_name, - lookahead_days=trainer_spec.lookahead_days, - wandb_group_override=wandb_group, - model_name=trainer_spec.model_name, - ), - ) - - -def get_possible_lookaheads( - msg: Printer, - cfg: FullConfigSchema, - train_df: pd.DataFrame, -) -> list[int]: - """Some look_ahead and look_behind distances will result in 0 valid - prediction times. Only return combinations which will allow some prediction - times. - - E.g. if we only have 4 years of data: - - min_lookahead = 2 years - - min_lookbehind = 3 years - - Will mean that no rows satisfy the criteria. 
- """ - - outcome_col_names = infer_outcome_col_name(df=train_df, allow_multiple=True) - - possible_lookahead_days: list[int] = [ - int(dist) for dist in infer_look_distance(col_name=outcome_col_names) - ] - - # Don't try look distance combinations which will result in 0 rows - max_distance_in_dataset_days = ( - max(train_df[cfg.data.col_name.pred_timestamp]) - - min( - train_df[cfg.data.col_name.pred_timestamp], - ) - ).days - - lookaheads_without_rows: list[int] = [ - dist for dist in possible_lookahead_days if dist > max_distance_in_dataset_days - ] - - if lookaheads_without_rows: - msg.info( - f"Not fitting model to {lookaheads_without_rows}, since no rows satisfy the criteria.", - ) - - return list(set(possible_lookahead_days) - set(lookaheads_without_rows)) - - -def main(): - """Main.""" - msg = Printer(timestamp=True) - - debug = False - - if debug: - config_file_name = "integration_config.yaml" - else: - config_file_name = "default_config.yaml" - - cfg = load_cfg_as_pydantic(config_file_name=config_file_name) - - random_word = RandomWords() - wandb_group = f"{random_word.get_random_word()}-{random_word.get_random_word()}" - - wandb.init( - project=cfg.project.name, - mode=cfg.project.wandb.mode, - group=wandb_group, - entity=cfg.project.wandb.entity, - name="process_manager", - ) - - # Load dataset without dropping any rows for inferring - # which look distances to grid search over - train = load_train_raw(cfg=cfg) - - possible_lookaheads = get_possible_lookaheads( - msg=msg, - cfg=cfg, - train_df=train, - ) - - train_models_for_each_cell_in_grid( - cfg=cfg, - possible_lookahead_days=possible_lookaheads, - config_file_name=config_file_name, - wandb_prefix=wandb_group, - model_names=["xgboost", "logistic-regression"], - ) - - -if __name__ == "__main__": - main() diff --git a/src/application/t2d/train_model.py b/application/train_model.py similarity index 89% rename from src/application/t2d/train_model.py rename to application/train_model.py index f5962417..8c240bc7 100644 --- a/src/application/t2d/train_model.py +++ b/application/train_model.py @@ -2,17 +2,32 @@ from typing import Any import numpy as np - import wandb -from psycop_model_training.config.schemas import FullConfigSchema, convert_omegaconf_to_pydantic_object + +from psycop_model_training.utils.config_schemas import ( + FullConfigSchema, + convert_omegaconf_to_pydantic_object, +) from psycop_model_training.data_loader.utils import load_train_and_val_from_cfg from psycop_model_training.model_eval.dataclasses import PipeMetadata from psycop_model_training.model_eval.evaluate_model import run_full_evaluation -from psycop_model_training.preprocessing.post_split.create_pipeline import create_preprocessing_pipeline -from psycop_model_training.training.train_and_eval import create_model, CONFIG_PATH, train_and_get_model_eval_df +from psycop_model_training.preprocessing.post_split.create_pipeline import ( + create_preprocessing_pipeline, +) +from psycop_model_training.training.train_and_eval import ( + CONFIG_PATH, + create_model, + train_and_get_model_eval_df, +) from psycop_model_training.utils.col_name_inference import get_col_names -from psycop_model_training.utils.utils import flatten_nested_dict, create_wandb_folders, get_feature_importance_dict, \ - get_selected_features_dict, eval_ds_cfg_pipe_to_disk, PROJECT_ROOT +from psycop_model_training.utils.utils import ( + PROJECT_ROOT, + create_wandb_folders, + eval_ds_cfg_pipe_to_disk, + flatten_nested_dict, + get_feature_importance_dict, + get_selected_features_dict, +) def 
create_pipeline(cfg): @@ -143,5 +158,6 @@ def main(cfg: DictConfig): run.finish() return roc_auc + if __name__ == "__main__": main() # pylint: disable=no-value-for-parameter diff --git a/src/application/t2d/inspect_dataset.py b/src/application/t2d/inspect_dataset.py deleted file mode 100644 index 53cc6fe3..00000000 --- a/src/application/t2d/inspect_dataset.py +++ /dev/null @@ -1,17 +0,0 @@ -"""Example of how to inspect a dataset using the configs.""" -from psycop_model_training.config.schemas import load_cfg_as_pydantic -from psycop_model_training.data_loader.utils import load_train_from_cfg, load_train_raw - - -def main(): - """Main.""" - config_file_name = "default_config.yaml" - - cfg = load_cfg_as_pydantic(config_file_name=config_file_name) - df = load_train_raw(cfg=cfg) # noqa pylint: disable=unused-variable - - df_filtered = load_train_from_cfg(cfg=cfg) # noqa pylint: disable=unused-variable - - -if __name__ == "__main__": - main() diff --git a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/10_medication.rmd b/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/10_medication.rmd deleted file mode 100644 index 88ef6c59..00000000 --- a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/10_medication.rmd +++ /dev/null @@ -1,58 +0,0 @@ -Find first occurrence of hospital prescription or hospital redemption of diabetic medication. - -```{r} -library("pacman") - -p_load(tidyverse, here, future) -source(here("psycop-r-utilities", "import_from_sql.r")) -source(here("functions.r")) - -``` - -```{r} -con <- DBI::dbConnect( - odbc::odbc(), - Driver = "SQL Server", - Server = "BI-DPA-PROD", - database = "USR_PS_Forsk", - Trusted_Connection = "TRUE" -) -``` - -# Get first date of prescribed antidiabetic medication for each patient -## From only that administered -```{r} -df_first_administered_t2d_medication <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_Medicin_administreret_inkl_2021")) %>% - select(dw_ek_borger, datotid_ordination_start, atc) %>% - filter(substr(atc, 1, 3) == "A10") %>% # A10 is all antidiabetic medication - group_by(dw_ek_borger) %>% - filter(datotid_ordination_start == min(datotid_ordination_start)) %>% - collect %>% - format_sql_import() %>% - distinct(dw_ek_borger, datotid_ordination_start) -``` - -## From only that prescribed -```{r} -df_first_prescribed_t2d_medication <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_Medicin_ordineret_inkl_2021")) %>% - filter(substr(atc, 1, 3) == "A10") %>% - select(dw_ek_borger, datotid_ordinationstart, atc) %>% - rename(datotid_ordination_start = datotid_ordinationstart) %>% - group_by(dw_ek_borger) %>% - filter(datotid_ordination_start == min(datotid_ordination_start)) %>% - collect %>% - format_sql_import() %>% - distinct(dw_ek_borger, datotid_ordination_start) -``` - -## Combined -```{r} -df_first_date_of_t2d_medication_prescription <- df_first_administered_t2d_medication %>% - bind_rows(df_first_prescribed_t2d_medication) %>% - group_by(dw_ek_borger) %>% - filter(datotid_ordination_start == min(datotid_ordination_start)) %>% - filter(row_number() == 1) %>% - rename(datotid_first_t2d_medication=datotid_ordination_start) -``` \ No newline at end of file diff --git a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/20_hba1c.rmd b/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/20_hba1c.rmd deleted file mode 100644 index 9e6382b8..00000000 --- 
a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/20_hba1c.rmd +++ /dev/null @@ -1,33 +0,0 @@ -Find the first date where a patient gets a diabetic hba1c-blood-sample. - -```{r} -library("pacman") - -p_load(tidyverse, here, future) - -``` - -```{r} -con <- DBI::dbConnect( - odbc::odbc(), - Driver = "SQL Server", - Server = "BI-DPA-PROD", - database = "USR_PS_Forsk", - Trusted_Connection = "TRUE" -) -``` - -# Get first date of hba1c above threshold -## From only that administered -```{r} -df_first_t2d_blood_sample <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_LABKA_NPU27300_HbA1c_inkl_2021")) %>% - select(dw_ek_borger, datotid_proevemodtagelse, numerisksvar, analysenavn) %>% - filter(numerisksvar >= 48) %>% - group_by(dw_ek_borger) %>% - filter(datotid_proevemodtagelse == min(datotid_proevemodtagelse)) %>% - rename(datotid_start = datotid_proevemodtagelse) %>% - collect %>% - distinct(dw_ek_borger, datotid_start) %>% - format_sql_import() -``` \ No newline at end of file diff --git a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/30_diagnoses.rmd b/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/30_diagnoses.rmd deleted file mode 100644 index 65e9e32c..00000000 --- a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/30_diagnoses.rmd +++ /dev/null @@ -1,67 +0,0 @@ -Find the first date where a patient gets a t2d-diagnosis in the hospital system. - -```{r} -library("pacman") -``` - -```{r} -con <- DBI::dbConnect( - odbc::odbc(), - Driver = "SQL Server", - Server = "BI-DPA-PROD", - database = "USR_PS_Forsk", - Trusted_Connection = "TRUE" -) -``` - -# A-diagnoses -## LPR3 -```{r} -df_lpr3_diagnoses_roughly_selected <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_LPR3kontakter_psyk_somatik_inkl_2021")) %>% - rename(datotid_start = datotid_lpr3kontaktstart) %>% - select(dw_ek_borger, datotid_start, diagnosegruppestreng) %>% - collect %>% - format_sql_import() -``` - -## LPR2 -### Inpatient visits -```{r} -df_lpr2_diagnoses_inpatient_roughly_selected <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_indlaeggelser_psyk_somatik_LPR2_inkl_2021")) %>% - rename(datotid_start = datotid_indlaeggelse) %>% - select(dw_ek_borger, datotid_start, diagnosegruppestreng) %>% - collect %>% - format_sql_import() -``` - -### Outpatient visits -```{r} -df_lpr2_diagnoses_outpatient_roughly_selected <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_besoeg_psyk_somatik_LPR2_inkl_2021")) %>% - select(dw_ek_borger, datotid_start, diagnosegruppestreng) %>% - collect %>% - format_sql_import() -``` - -## Combined -```{r} -df_all_visits_combined <- df_lpr3_diagnoses_roughly_selected %>% - bind_rows(df_lpr2_diagnoses_inpatient_roughly_selected) %>% - bind_rows(df_lpr2_diagnoses_outpatient_roughly_selected) -``` - -### T2D -```{r} -df_first_t2d_diagnoses_combined <- df_all_visits_combined %>% - keep_only_first_t2d_by_diag(date_col_string="datotid_start") %>% - rename(datotid_first_t2d_diagnosis = datotid_start) -``` - -### T1D -```{r} -df_first_t1d_diagnoses_combined <- df_all_visits_combined %>% - keep_only_first_t1d_by_diag(date_col_string="datotid_start") %>% - select(dw_ek_borger, datotid_start) -``` \ No newline at end of file diff --git a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/41_find_first_p.rmd b/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/41_find_first_p.rmd deleted file mode 
100644 index 4f6a293a..00000000 --- a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/41_find_first_p.rmd +++ /dev/null @@ -1,52 +0,0 @@ -```{r} -source(here("psycop-r-utilities", "import_from_sql.r")) -p_load(tidyverse) -``` - -# Remove patients with incidence before first psych-contact -## LPR3, both in and outpatient -```{r} -pt_types = c("Ambulant", "Indlagt") - -df_lpr3_preproc <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_LPR3kontakter_psyk_somatik_inkl_2021")) %>% - filter(pt_type %in% pt_types) %>% - filter(substr(shakkode_lpr3kontaktophold, 1, 4) == "6600") %>% # Only psychiatry in RM - select(dw_ek_borger, datotid_lpr3kontaktstart) %>% - rename(datotid_start = datotid_lpr3kontaktstart) %>% - collect %>% - format_sql_import() -``` - -## LPR2 -### LPR2 inpatient -```{r} -df_lpr2_inp_preproc <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_indlaeggelser_psyk_somatik_LPR2_inkl_2021")) %>% - filter(substr(shakkode_kontaktansvarlig, 1, 4) == "6600") %>% # Only psychiatry in RM - rename(datotid_start = datotid_indlaeggelse) %>% - select(dw_ek_borger, datotid_start) %>% - collect %>% - format_sql_import() -``` - -### LPR2 outpatient -```{r} -df_lpr2_outp_preproc <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_besoeg_psyk_somatik_LPR2_inkl_2021")) %>% - filter(substr(shakafskode, 1, 4) == "6600") %>% # Only psychiatry in RM - select(dw_ek_borger, datotid_start) %>% - collect %>% - format_sql_import() -``` - -# Combine all -```{r} -df_first_psych_visit <- df_lpr2_inp_preproc %>% - bind_rows(df_lpr2_outp_preproc) %>% - bind_rows(df_lpr3_preproc) %>% - group_by(dw_ek_borger) %>% - filter(datotid_start == min(datotid_start)) %>% - rename(datotid_first_psych_visit = datotid_start) %>% - select(dw_ek_borger, datotid_first_psych_visit) -``` \ No newline at end of file diff --git a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/49_combined.rmd b/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/49_combined.rmd deleted file mode 100644 index ebb1976d..00000000 --- a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/49_combined.rmd +++ /dev/null @@ -1,53 +0,0 @@ -Combine medication, hba1c and diagnoses to find first date where the patient has t2d. - -```{r} -source(here("functions.r")) -p_load(odbc, dbplyr, DBI) -``` - -## Find "any" diabetes incidence (maximise sensitivity). For use in wash-in (i.e. exclusion). 
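The R chunk below takes, per patient, the earliest of several candidate diabetes timestamps with `pmin`. For reference, a hedged pandas equivalent of that "earliest event across sources" step (column names are illustrative; note that R's `pmin` propagates NA by default, whereas this sketch skips `NaT`):

```python
import pandas as pd

meds = pd.DataFrame({"dw_ek_borger": [1, 2], "first_medication": pd.to_datetime(["2015-01-01", None])})
hba1c = pd.DataFrame({"dw_ek_borger": [1, 2], "first_hba1c": pd.to_datetime(["2014-05-01", "2017-01-01"])})

combined = meds.merge(hba1c, on="dw_ek_borger", how="outer")
# Row-wise minimum across the event columns, ignoring NaT: the earliest
# recorded diabetes marker per patient, whatever its source.
combined["first_diabetes_any"] = combined[["first_medication", "first_hba1c"]].min(axis=1)
print(combined)
```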
-```{r} -df_first_diabetes_any <- df_first_t2d_blood_sample %>% - rename(datotid_first_t2d_bs = datotid_start) %>% # Add BS - left_join(df_first_date_of_t2d_medication_prescription) %>% - left_join(df_first_t2d_diagnoses_combined) %>% - mutate(datotid_first_diabetes_any = pmin(datotid_first_t2d_medication, datotid_first_t2d_diagnosis, datotid_first_t2d_bs)) %>% - group_by(dw_ek_borger) %>% - filter(datotid_first_diabetes_any == min(datotid_first_diabetes_any)) %>% # Make sure to only have one record per patient - select(dw_ek_borger, datotid_first_diabetes_any) %>% - distinct(dw_ek_borger, datotid_first_diabetes_any) %>% - left_join(df_first_t1d_diagnoses_combined, by = "dw_ek_borger") %>% - rename(datotid_first_t1d_diagnosis = datotid_start) %>% - mutate(datotid_first_diabetes_any = if_else(is.na(datotid_first_t1d_diagnosis), datotid_first_diabetes_any, min(datotid_first_t1d_diagnosis, datotid_first_diabetes_any))) %>% - select(dw_ek_borger, datotid_first_diabetes_any) # Keep only if no t1d diagnosis before t2d: 601 - -copy_to(con, df_first_diabetes_any, name = in_schema("fct", "psycop_t2d_first_diabetes_any"), overwrite = TRUE, temporary = FALSE) - -dim(df_first_diabetes_any) -``` - -## Find "true" incidences (maximise specificity.). For use when training and evaluating model. Try to exclude anyone that is incident due to other causes. See issue #12 regarding reasoning. -```{r} -df_first_t2d_bs_only <- df_first_t2d_blood_sample %>% - rename(datotid_first_t2d_bs = datotid_start) %>% # Add BS - left_join(df_first_date_of_t2d_medication_prescription) %>% - filter(datotid_first_t2d_bs < datotid_first_t2d_medication) %>% - left_join(df_first_t2d_diagnoses_combined) %>% - filter(datotid_first_t2d_bs < datotid_first_t2d_diagnosis) %>% - group_by(dw_ek_borger) %>% - filter(datotid_first_t2d_bs == min(datotid_first_t2d_bs)) %>% # Make sure to only have one record per patient - select(dw_ek_borger, datotid_first_t2d_bs) %>% - distinct(dw_ek_borger, datotid_first_t2d_bs) %>% - left_join(df_first_psych_visit) %>% # 3010 - filter(datotid_first_psych_visit < datotid_first_t2d_bs) %>% # Keep only if diabetes is diagnosed after first psych visit: 810 - left_join(rename(df_first_t1d_diagnoses_combined, datotid_first_t1d_diagnosis = datotid_start), by = "dw_ek_borger") %>% - mutate(!(datotid_first_t1d_diagnosis < datotid_first_t2d_bs)) %>% - filter(is.na(datotid_first_t1d_diagnosis) | !(datotid_first_t1d_diagnosis < datotid_first_t2d_bs)) %>% # Keep only if no t1d diagnosis before t2d: 601 - select(dw_ek_borger, datotid_first_t2d_bs) %>% - rename(timestamp = datotid_first_t2d_bs) - -copy_to(con, df_first_t2d_bs_only, name = in_schema("fct", "psycop_t2d_first_diabetes_t2d"), overwrite = TRUE, temporary = FALSE) - -dim(df_first_t2d_bs_only) -``` - diff --git a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/10_descriptive_stats.rmd b/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/10_descriptive_stats.rmd deleted file mode 100644 index 2994a54d..00000000 --- a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/10_descriptive_stats.rmd +++ /dev/null @@ -1,139 +0,0 @@ -```{r} -library(pacman) -p_load(ggplot2, ggbeeswarm, tidyverse, here, ggbeeswarm) - -source(here("psycop_r_utils", "import_from_sql.r")) -``` - - -```{r} -df_demographics <- get_fct("FOR_kohorte_demografi_inkl_2021") %>% - format_sql_import() %>% - mutate(foedselsdato = ymd(foedselsdato)) -``` - -```{r} -df_first_t2d_processed <- read_csv(here("csv", 
"df_first_t2d_bs_only.csv")) - -df_first_psych_visit <- read_csv(here("csv", "df_first_psych_visit.csv")) -``` - -# Age at first t2d for patients with "true"" positives in cohort time -```{r} -df_age_at_first_t2d <- df_first_t2d_processed %>% - left_join(df_demographics) %>% - mutate(age_at_first_t2d = time_length(difftime(datotid_first_t2d, foedselsdato), "years")) -``` - -## Raincloud -```{r} -ggplot(df_age_at_first_t2d %>% mutate(group=1), aes(x = age_at_first_t2d, y = group)) + - ggdist::stat_halfeye( - adjust = .5, - width = .6, - .width = 0, - justification = -.3, - point_colour = NA) + - geom_boxplot( - width = .1, - outlier.shape = NA - ) + - geom_quasirandom( - size = 1, - alpha = .3, - position = position_jitter( - seed = 1, width = .05 - ), - groupOnX = FALSE - ) + - coord_cartesian(xlim = c(1.2, NA), clip = "off") + - scale_x_continuous(breaks = seq(5, 100, by = 5)) + - ggtitle("Age at first t2d for patients with 'true' positives in cohort time") -``` - -## Cumulative distribution -```{r} -ggplot(df_age_at_first_t2d, aes(x = age_at_first_t2d)) + - stat_ecdf(geom = "step") + - scale_x_continuous(breaks = seq(5, 100, by = 5)) + - scale_y_continuous(breaks = seq(0, 1, by = 0.05)) + - ggtitle("Age at first t2d for patients with 'true' positives in cohort time") -``` - -```{r} -df_without_children <- df_age_at_first_t2d %>% # 3284 - filter(age_at_first_t2d > 30) %>% # 2883 - filter(age_at_first_t2d < 90) # 2804 -``` - -# Number of potentially true-positives that can generate predictions for increasing ∆t -```{r} -df_all_visits_combined <- read_csv(here("csv", "all_visits_combined.csv")) -``` - -```{r} -df_visits_for_size_of_prediction_window <- df_all_visits_combined %>% - rename(datotid_besoeg = datotid_start) %>% - inner_join(df_first_t2d_processed, by="dw_ek_borger") %>% - select(datotid_besoeg, datotid_first_t2d, dw_ek_borger) %>% - mutate(years_from_visit_to_t2d = time_length(difftime(datotid_first_t2d, datotid_besoeg), "years")) %>% - mutate(years_to_end_of_follow_up = time_length(difftime(max(datotid_besoeg), datotid_besoeg), "years")) %>% - filter(years_from_visit_to_t2d > 0) # Drop all visits that are before event %>% -``` - -```{r} -df_size_of_prediction_window_with_selected_cols <- df_visits_for_size_of_prediction_window - -for (i in 1:100) { - colname = paste0("window_", i) - - df_size_of_prediction_window_with_selected_cols <- df_size_of_prediction_window_with_selected_cols %>% - mutate({{colname}} := if_else(((years_from_visit_to_t2di/12)), 1, 0)) -} -``` - -## For each visit -```{r} -df_size_of_prediction_window_summarised <- df_size_of_prediction_window_with_selected_cols %>% - summarise(across(starts_with("window"), mean, .names = "mean_{.col}")) %>% - pivot_longer(cols = starts_with("mean_"), - names_to = "window_size_months", - values_to = "percent_included") %>% - mutate(window_size_months = as.numeric(gsub("mean_window_", "", window_size_months))) -``` - -```{r} -filepath <- here("figures", "window_size_visits.png") - -plot <- ggplot(df_size_of_prediction_window_summarised, aes(x = window_size_months, y = percent_included)) + - geom_point() + - scale_x_continuous(breaks = seq(0, 100, by = 2)) + - ggtitle("Proportion of potentially true-positive visits that are positive as a function of window size") - -ggsave(filepath, plot, dpi = 300, width = 5, height = 3) -``` - -## For each patient -```{r} -df_predict_window_size_patients <- df_size_of_prediction_window_with_selected_cols %>% - group_by(dw_ek_borger) %>% - summarise(across(starts_with("window"), max, .names 
= "{.col}")) %>% - ungroup() %>% - summarise(across(starts_with("window"), mean, .names = "mean_{.col}")) %>% - pivot_longer(cols = starts_with("mean_"), - names_to = "window_size_months", - values_to = "percent_included") %>% - mutate(window_size_months = as.numeric(gsub("mean_window_", "", window_size_months))) -``` - -```{r} -filepath <- here("figures", "window_size_patients.png") - -plot <- ggplot(df_predict_window_size_patients, aes(x = window_size_months, y = percent_included)) + - geom_point() + - scale_x_continuous(breaks = seq(0, 100, by = 2)) + - scale_y_continuous(breaks = seq(0, 1, by = 0.05)) + - ggtitle("Proportion of potentially true-positive patients that are positive as a function of window size") - -ggsave(filepath, plot, dpi = 300, width = 5, height = 3) -``` diff --git a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/functions.r b/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/functions.r deleted file mode 100644 index 583f9e58..00000000 --- a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/functions.r +++ /dev/null @@ -1,49 +0,0 @@ -event_start_date <- ymd("2014-01-01") - -str_contains_t2d_diag <- function(str) { - t2d_regex_pattern <- "(:DE1[1-5].*)|(:DE16[0-2].*)|(:DO24.*)|(:DT383A.*)|(:DM142.*)|(:DG590.*)|(:DG632*)|(:DH280.*)|(:DH334.*)|(:DH360.*)|(:DH450.*)|(:DN083.*)" - - if (isTRUE(str_detect(str, t2d_regex_pattern))) { - return(TRUE) - } - - return(FALSE) -} - -keep_only_first_t2d_by_diag <- function(df, date_col_string) { - str_contains_t2d_diag_vecced <- Vectorize(str_contains_t2d_diag) - - df %>% - filter(str_contains_t2d_diag_vecced((diagnosegruppestreng))) %>% - group_by(dw_ek_borger) %>% - filter(date_col_string == min(date_col_string)) %>% - filter(row_number() == 1) %>% - ungroup() -} - -str_contains_t1d_diag <- function(str) { - t1d_regex_pattern <- "(:DE10.*)|(:DO240.*)" - - if (isTRUE(str_detect(str, t1d_regex_pattern))) { - return(TRUE) - } - - return(FALSE) -} - -keep_only_first_t1d_by_diag <- function(df, date_col_string) { - str_contains_t1d_diag_vecced <- Vectorize(str_contains_t1d_diag) - - df %>% - filter(str_contains_t1d_diag_vecced((diagnosegruppestreng))) %>% - group_by(dw_ek_borger) %>% - filter(date_col_string == min(date_col_string)) %>% - filter(row_number() == 1) %>% - ungroup() -} - -visit_can_generate_prediction <- function(col1, col2, window_width_years) { - if_else(({{col1}}% - rename_with(tolower) - -df_planned_psych_visits <- df_planned_visits_raw %>% - filter(substr(shakafskode_besoeg, 1, 4) == "6600") %>% - filter(psykambbesoeg == 1) %>% - select(dw_ek_borger, datotid_start) %>% - arrange(datotid_start) - -df_p_samp <- df_planned_psych_visits %>% - filter(dw_ek_borger == 31) %>% - arrange(datotid_start) - -# Iterate over planned visits to only keep those, that are not within 3 months from last prediction -drop_within_3_months_from_prediction <- function(df) { - # Only takes as an input a dataframe that is already sorted by date (!!!) - current_CPR <- 0 - patient_i <- 0 - last_selected_date <- 0 - indeces_to_drop <- c() - - for (i in 1:nrow(df)) { - # print(str_c("Row_CPR, Current CPR: ", df$dw_ek_borger[i], ", ", current_CPR)) - - if (df$dw_ek_borger[i] != current_CPR) { # Handle switching to new person - current_CPR = df$dw_ek_borger[i] - last_selected_date = ymd_hms(df$datotid_start[i]) - - if (patient_i %% 100 == 0 ) { - print(str_c("Processing patient nr. 
", patient_i)) - } - - patient_i <- patient_i + 1 - - next() - } - - if (df$dw_ek_borger[i] == current_CPR) { # Handle comparison of current visit to previous selected date - if (ymd_hms(df$datotid_start[i]) < (as.Date(last_selected_date) + 90)) { - indeces_to_drop <- c(indeces_to_drop, i) - } else { - last_selected_date <- df$datotid_start[i] - } - } - } - - return(df %>% slice(-indeces_to_drop)) -} - -df_planned_with_3m_spacing <- drop_within_3_months_from_prediction(df_planned_psych_visits) - - - -####### -# Age # -####### -df_demo_raw <- dbGetQuery(conn, "SELECT * FROM [fct].FOR_kohorte_demografi") %>% - rename_with(tolower) - -df_demo <- df_demo_raw %>% - select(foedselsdato, dw_ek_borger) %>% - mutate(foedselsdato = ymd(foedselsdato)) - -############### -# First psych # -############### -df_psyk_raw <- dbGetQuery(conn, "SELECT * FROM [fct].FOR_besoeg_fysiske_fremmoeder") %>% - rename_with(tolower) - -df_first_p <- df_psyk_raw %>% - select(dw_ek_borger, datotid_start) %>% - group_by(dw_ek_borger) %>% - arrange(datotid_start, .by_group=TRUE) %>% - filter(row_number() == 1) %>% - rename(datotid_f_psych = datotid_start) - - -############### -# T2D samples # -############### -# Raw -df_hba1c_raw <- dbGetQuery(conn, "SELECT * FROM [fct].FOR_LABKA_NPU27300_HbA1c") %>% - rename_with(tolower) - -df_maybe_t2d <- df_hba1c_raw %>% - select(datotid_godkendtsvar, svar, dw_ek_borger) %>% - mutate(svar = as.numeric(svar)) %>% - filter(svar > 47) %>% - select(-svar) %>% #Remove incidences that are before first psych contact - left_join(df_first_p) %>% - filter(datotid_f_psych < datotid_godkendtsvar) %>% - group_by(dw_ek_borger) %>% - arrange(datotid_godkendtsvar, .by_group = TRUE) %>% # Keep only first row - filter(row_number() == 1) %>% - rename(datotid_maybe_t2d = datotid_godkendtsvar) %>% - mutate(datotid_maybe_t2d = ymd_hms(datotid_maybe_t2d)) - - -df_probably_t2d <- df_hba1c_raw %>% - select(datotid_godkendtsvar, svar, dw_ek_borger) %>% - mutate(svar = as.numeric(svar)) %>% - filter(is.na(svar) == FALSE) %>% #Remove incidences that are before first psych contact - left_join(df_first_p) %>% - filter(datotid_f_psych < datotid_godkendtsvar) %>% - group_by(dw_ek_borger) %>% # Check if first HbA1c was normal - arrange(datotid_godkendtsvar, .by_group=TRUE) %>% - mutate(first_hba1c_normal = svar[1] < 48) %>% - filter(svar > 47 & first_hba1c_normal == TRUE) %>% - group_by(dw_ek_borger) %>% - arrange(datotid_godkendtsvar, .by_group = TRUE) %>% - filter(row_number() == 1) %>% # Keep only first match - select(datotid_godkendtsvar) %>% - rename(datotid_probably_t2d = datotid_godkendtsvar) %>% - mutate(datotid_probably_t2d = ymd_hms(datotid_probably_t2d)) - -######## -# Plot # -######## -setwd("E:/Users/adminmanber/Desktop/T2D") - -############## -# Age at T2D # -############## -gen_plot_age_df <- function(df, outcome) { - df_out <- df %>% - left_join(df_demo) %>% - mutate(age_at_t2d = interval(foedselsdato, {{outcome}}) / years(1)) - - return(df_out) -} - -df_plot_probably_t2d <- gen_plot_age_df(df_probably_t2d, datotid_probably_t2d) - -df_plot_maybe_t2d <- gen_plot_age_df(df_maybe_t2d, datotid_maybe_t2d) - -save_histogram <- function(df, x_var, filename) { - gg <- ggplot(df, aes(x={{x_var}})) + - geom_histogram(binwidth=1) + - labs( - title = filename, - x = "Age at incident T2D (years)", - y = "Count" - ) + - scale_x_continuous( - breaks = seq(15, 100, by=5), - limits = c(15, 100) - ) - - ggsave(str_c("figures/", filename, ".png"), width = 20, height = 10, dpi = 100, units = "in") - - gg -} - 
-save_histogram(df_plot_probably_t2d, age_at_t2d, "age_at_first_t2d_hba1c_after_normal_hba1c") -save_histogram(df_plot_maybe_t2d, age_at_t2d, "age_at_first_t2d_hba1c") - - -################################## -# Time from planned visit to T2D # -################################## -gen_planned_to_event_df <- function(df_event, event_col, df_planned_visits) { - df_out <- df_event %>% - inner_join(df_planned_visits) %>% - rename(visit_start = datotid_start) %>% - mutate(years_since_visit = interval(visit_start, {{event_col}}) / years(1)) %>% - filter(years_since_visit > 0.25) - -} - -df_planned_to_probable_t2d <- gen_planned_to_event_df(df_event=df_probably_t2d, - event_col=datotid_probably_t2d, - df_planned_visits=df_planned_psych_visits) - -df_planned_to_maybe_t2d <- gen_planned_to_event_df(df_event=df_maybe_t2d, - event_col=datotid_maybe_t2d, - df_planned_visits=df_planned_psych_visits) - -save_time_from_visit <- function(df, x_var, filename) { - gg <- ggplot(df, aes(x={{x_var}})) + - scale_x_continuous( - breaks = seq(0, 10, by=0.25), - limits = c(0, 10) - ) - - hist <- gg + - geom_histogram( - binwidth = 0.25 - ) - - box <- gg + - geom_boxplot() - - combined <- hist + box + plot_layout(nrow = 2, height = c(2, 1)) - - ggsave(str_c("figures/", filename, "_histogram.png"), width = 20, height = 20, dpi = 100, units = "in") - - combined -} - -save_time_from_visit(df_planned_to_maybe_t2d, years_since_visit, "years_until_maybe_t2d_for_visit_histogram") -save_time_from_visit(df_planned_to_probable_t2d, years_since_visit, "years_until_probable_t2d_for_visit_histogram") diff --git a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/t2d.Rproj b/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/t2d.Rproj deleted file mode 100644 index 8e3c2ebc..00000000 --- a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/t2d.Rproj +++ /dev/null @@ -1,13 +0,0 @@ -Version: 1.0 - -RestoreWorkspace: Default -SaveWorkspace: Default -AlwaysSaveHistory: Default - -EnableCodeIndexing: Yes -UseSpacesForTab: Yes -NumSpacesForTab: 2 -Encoding: UTF-8 - -RnwWeave: Sweave -LaTeX: pdfLaTeX diff --git a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/tests/tests.r b/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/tests/tests.r deleted file mode 100644 index f6d1b22d..00000000 --- a/src/application/t2d/outcome_specification - move to t2d-feature-gen-repo/tests/tests.r +++ /dev/null @@ -1,49 +0,0 @@ - -library("pacman") - -p_load(testthat, here, xpectr) - -source(here("src", "functions.r")) - -test_df <- tribble( - ~diagnosegruppestreng, ~datotid_lpr3kontaktstart, ~dw_ek_borger, - "A:DE14#+:ALFC3", "2021-06-30 09:00:00.0000000", 1, - "A:DE14#+:ALFC3", "2021-05-30 09:00:00.0000000", 1, - "A:DE14#+:ALFC3", "2021-04-30 09:00:00.0000000", 1 -) - -source(here("src", "functions.r")) -output_df <- keep_only_first_t2d_by_diag(test_df, "datotid_lpr3kontaktstart") - -test_that("Correct diagnosegruppe-matching",{ - # Testing column values - expect_equal( - output_df[["diagnosegruppestreng"]], - "A:DE14#+:ALFC3", - fixed = TRUE) - expect_equal( - output_df[["dw_ek_borger"]], - 1, - tolerance = 1e-4) -}) - -test_window_gen_df <- tribble( - ~years_from_visit_to_t2d, ~years_to_end_of_follow_up, ~dw_ek_borger, - 1, 1, 1, - 1, 2, 2 -) - -output_df_window <- mutate(test_window_gen_df, window_1 = visit_can_generate_prediction_vecced(years_from_visit_to_t2d, years_from_visit_to_t2d, 1)) - -test_window_grouped_df <- tribble( - 
~dw_ek_borger, ~window_1, ~window_2, - 1, 1, 0, - 1, 0, 0, - 1, 0, 1, - 2, 0, 0, - 2, 1, 0 -) - -df_out_window_group <- test_window_grouped_df %>% - group_by(dw_ek_borger) %>% - summarise(across(starts_with("window"), max, .names = "{.col}")) \ No newline at end of file diff --git a/src/psycop_model_training/archive/main.py b/src/psycop_model_training/archive/main.py index e92f7690..0ac2cf23 100644 --- a/src/psycop_model_training/archive/main.py +++ b/src/psycop_model_training/archive/main.py @@ -37,7 +37,7 @@ if __name__ == "__main__": msg = Printer(timestamp=True) - with initialize(version_base=None, config_path="../config/"): + with initialize(version_base=None, config_path="../../../application/config/"): cfg = compose( config_name=CONFIG_NAME, ) diff --git a/src/psycop_model_training/archive/model_training_watcher.py b/src/psycop_model_training/archive/model_training_watcher.py index 2ac933fe..352042fd 100644 --- a/src/psycop_model_training/archive/model_training_watcher.py +++ b/src/psycop_model_training/archive/model_training_watcher.py @@ -13,7 +13,7 @@ from wandb.sdk.wandb_run import Run # pylint: disable=no-name-in-module from wasabi import msg -from psycop_model_training.config.schemas import FullConfigSchema +from psycop_model_training.utils.config_schemas import FullConfigSchema from psycop_model_training.model_eval.dataclasses import ModelEvalData from psycop_model_training.model_eval.evaluate_model import run_full_evaluation from psycop_model_training.utils.utils import ( diff --git a/src/psycop_model_training/config/__init__.py b/src/psycop_model_training/config/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/psycop_model_training/data_loader/data_loader.py b/src/psycop_model_training/data_loader/data_loader.py index b0f9a928..352ebe05 100644 --- a/src/psycop_model_training/data_loader/data_loader.py +++ b/src/psycop_model_training/data_loader/data_loader.py @@ -6,12 +6,11 @@ import pandas as pd from wasabi import Printer -from psycop_model_training.config.schemas import FullConfigSchema +from psycop_model_training.utils.config_schemas import FullConfigSchema msg = Printer(timestamp=True) - class DataLoader: """Class to handle loading of a datasplit.""" @@ -28,48 +27,6 @@ def __init__( # Column specifications self.pred_col_name_prefix = cfg.data.pred_prefix - def load_dataset_from_dir( - self, - split_names: Union[Iterable[str], str], - nrows: Optional[int] = None, - ) -> pd.DataFrame: - """Load dataset for t2d. Can load multiple splits at once, e.g. - concatenate train and val for crossvalidation. - - Args: - split_names (Union[Iterable[str], str]): Name of split, allowed are ["train", "test", "val"] - nrows (Optional[int]): Number of rows to load from dataset. Defaults to None, in which case all rows are loaded. 
- - Returns: - pd.DataFrame: The filtered dataset - """ - msg.info(f"Loading {split_names}") - - # Concat splits if multiple are given - if isinstance(split_names, (list, tuple)): - if isinstance(split_names, Iterable): - split_names = tuple(split_names) - - if nrows is not None: - nrows = int( - nrows / len(split_names), - ) - - return pd.concat( - [ - self._load_dataset_file(split_name=split, nrows=nrows) - for split in split_names - ], - ignore_index=True, - ) - elif isinstance(split_names, str): - dataset = self._load_dataset_file(split_name=split_names, nrows=nrows) - - dataset = self._process_dataset(dataset=dataset) - - msg.good(f"{split_names}: Returning!") - return dataset - def _load_dataset_file( # pylint: disable=inconsistent-return-statements self, split_name: str, @@ -166,4 +123,46 @@ def _process_dataset(self, dataset: pd.DataFrame) -> pd.DataFrame: msg.info("Finished processing dataset") - return dataset \ No newline at end of file + return dataset + + def load_dataset_from_dir( + self, + split_names: Union[Iterable[str], str], + nrows: Optional[int] = None, + ) -> pd.DataFrame: + """Load dataset for t2d. Can load multiple splits at once, e.g. + concatenate train and val for crossvalidation. + + Args: + split_names (Union[Iterable[str], str]): Name of split, allowed are ["train", "test", "val"] + nrows (Optional[int]): Number of rows to load from dataset. Defaults to None, in which case all rows are loaded. + + Returns: + pd.DataFrame: The filtered dataset + """ + msg.info(f"Loading {split_names}") + + # Concat splits if multiple are given + if isinstance(split_names, (list, tuple)): + if isinstance(split_names, Iterable): + split_names = tuple(split_names) + + if nrows is not None: + nrows = int( + nrows / len(split_names), + ) + + return pd.concat( + [ + self._load_dataset_file(split_name=split, nrows=nrows) + for split in split_names + ], + ignore_index=True, + ) + elif isinstance(split_names, str): + dataset = self._load_dataset_file(split_name=split_names, nrows=nrows) + + dataset = self._process_dataset(dataset=dataset) + + msg.good(f"{split_names}: Returning!") + return dataset diff --git a/src/psycop_model_training/data_loader/utils.py b/src/psycop_model_training/data_loader/utils.py index 158bae84..3f16162a 100644 --- a/src/psycop_model_training/data_loader/utils.py +++ b/src/psycop_model_training/data_loader/utils.py @@ -3,7 +3,7 @@ import pandas as pd -from psycop_model_training.config.schemas import FullConfigSchema +from psycop_model_training.utils.config_schemas import FullConfigSchema from psycop_model_training.data_loader.data_classes import SplitDataset from psycop_model_training.data_loader.data_loader import DataLoader diff --git a/src/psycop_model_training/model_eval/dataclasses.py b/src/psycop_model_training/model_eval/dataclasses.py index 611175f5..11a41832 100644 --- a/src/psycop_model_training/model_eval/dataclasses.py +++ b/src/psycop_model_training/model_eval/dataclasses.py @@ -4,7 +4,7 @@ import pandas as pd -from psycop_model_training.config.schemas import BaseModel, FullConfigSchema +from psycop_model_training.utils.config_schemas import BaseModel, FullConfigSchema class CustomColumns(BaseModel): diff --git a/src/psycop_model_training/model_eval/evaluate_model.py b/src/psycop_model_training/model_eval/evaluate_model.py index 75483342..8a4eb2a4 100644 --- a/src/psycop_model_training/model_eval/evaluate_model.py +++ b/src/psycop_model_training/model_eval/evaluate_model.py @@ -8,40 +8,36 @@ from sklearn.metrics import recall_score from 
wandb.sdk.wandb_run import Run as wandb_run # pylint: disable=no-name-in-module -from psycop_model_training.config.schemas import FullConfigSchema +from psycop_model_training.utils.config_schemas import FullConfigSchema from psycop_model_training.model_eval.dataclasses import ( ArtifactContainer, EvalDataset, PipeMetadata, ) -from psycop_model_training.model_eval.tables.performance_by_threshold import ( - generate_performance_by_positive_rate_table, -) -from psycop_model_training.model_eval.tables.tables import ( - generate_feature_importances_table, - generate_selected_features_table, +from psycop_model_training.model_eval.plots import ( + log_image_to_wandb, + plot_auc_by_time_from_first_visit, + plot_auc_roc, + plot_metric_by_calendar_time, + plot_metric_by_cyclic_time, + plot_metric_by_time_until_diagnosis, + plot_performance_by_age, + plot_sensitivity_by_time_to_outcome_heatmap, ) -from psycop_model_training.utils.utils import positive_rate_to_pred_probs from psycop_model_training.model_eval.plots.feature_importance import ( plot_feature_importances, ) -from psycop_model_training.model_eval.plots import ( - plot_performance_by_age, -) from psycop_model_training.model_eval.plots.performance_by_n_hba1c import ( plot_performance_by_n_hba1c, ) -from psycop_model_training.model_eval.plots import ( - plot_auc_by_time_from_first_visit, - plot_metric_by_calendar_time, - plot_metric_by_cyclic_time, - plot_metric_by_time_until_diagnosis, +from psycop_model_training.model_eval.tables.performance_by_threshold import ( + generate_performance_by_positive_rate_table, ) -from psycop_model_training.model_eval.plots import plot_auc_roc -from psycop_model_training.model_eval.plots import ( - plot_sensitivity_by_time_to_outcome_heatmap, +from psycop_model_training.model_eval.tables.tables import ( + generate_feature_importances_table, + generate_selected_features_table, ) -from psycop_model_training.model_eval.plots import log_image_to_wandb +from psycop_model_training.utils.utils import positive_rate_to_pred_probs def upload_artifacts_to_wandb( diff --git a/src/psycop_model_training/model_eval/examples/evaluate_model_from_file.py b/src/psycop_model_training/model_eval/examples/evaluate_model_from_file.py index ae3e432b..190ad0fb 100644 --- a/src/psycop_model_training/model_eval/examples/evaluate_model_from_file.py +++ b/src/psycop_model_training/model_eval/examples/evaluate_model_from_file.py @@ -11,6 +11,7 @@ import pandas as pd from omegaconf import DictConfig +from psycop_model_training.model_eval.plots import plot_auc_by_time_from_first_visit from psycop_model_training.utils.utils import ( PROJECT_ROOT, infer_outcome_col_name, @@ -19,7 +20,6 @@ load_evaluation_data, read_pickle, ) -from psycop_model_training.model_eval.plots import plot_auc_by_time_from_first_visit def load_model_predictions_and_cfg(path: Path) -> tuple[pd.DataFrame, DictConfig]: diff --git a/src/psycop_model_training/model_eval/plots/performance_over_time.py b/src/psycop_model_training/model_eval/plots/performance_over_time.py index bd3c054c..60880694 100644 --- a/src/psycop_model_training/model_eval/plots/performance_over_time.py +++ b/src/psycop_model_training/model_eval/plots/performance_over_time.py @@ -13,9 +13,9 @@ from sklearn.metrics import f1_score, roc_auc_score from psycop_model_training.model_eval.dataclasses import EvalDataset -from psycop_model_training.utils.utils import bin_continuous_data, round_floats_to_edge from psycop_model_training.model_eval.plots.base_charts import plot_basic_chart from 
psycop_model_training.model_eval.plots.utils import calc_performance +from psycop_model_training.utils.utils import bin_continuous_data, round_floats_to_edge def create_performance_by_calendar_time_df( diff --git a/src/psycop_model_training/preprocessing/post_split/create_pipeline.py b/src/psycop_model_training/preprocessing/post_split/create_pipeline.py index d13a8cc4..12764cf6 100644 --- a/src/psycop_model_training/preprocessing/post_split/create_pipeline.py +++ b/src/psycop_model_training/preprocessing/post_split/create_pipeline.py @@ -10,8 +10,10 @@ from sklearn.preprocessing import StandardScaler from wasabi import Printer -from psycop_model_training.config.schemas import FullConfigSchema -from psycop_model_training.preprocessing.post_split.feature_selectors import DropDateTimeColumns +from psycop_model_training.utils.config_schemas import FullConfigSchema +from psycop_model_training.preprocessing.post_split.feature_selectors import ( + DropDateTimeColumns, +) from psycop_model_training.preprocessing.post_split.feature_transformers import ( ConvertToBoolean, DateTimeConverter, diff --git a/src/psycop_model_training/preprocessing/pre_split/col_filterer.py b/src/psycop_model_training/preprocessing/pre_split/col_filterer.py index 0224d068..207cc03c 100644 --- a/src/psycop_model_training/preprocessing/pre_split/col_filterer.py +++ b/src/psycop_model_training/preprocessing/pre_split/col_filterer.py @@ -6,11 +6,14 @@ from psycop_model_training.data_loader.data_loader import msg from psycop_model_training.utils.col_name_inference import infer_look_distance from psycop_model_training.utils.decorators import print_df_dimensions_diff -from psycop_model_training.utils.utils import infer_predictor_col_name, get_percent_lost, infer_outcome_col_name +from psycop_model_training.utils.utils import ( + get_percent_lost, + infer_outcome_col_name, + infer_predictor_col_name, +) -class PresSplitColFilterer(): - +class PresSplitColFilterer: @print_df_dimensions_diff def _drop_cols_not_in_lookbehind_combination( self, diff --git a/src/psycop_model_training/preprocessing/pre_split/col_transformer.py b/src/psycop_model_training/preprocessing/pre_split/col_transformer.py index 44c164bc..1583a592 100644 --- a/src/psycop_model_training/preprocessing/pre_split/col_transformer.py +++ b/src/psycop_model_training/preprocessing/pre_split/col_transformer.py @@ -5,8 +5,7 @@ from psycop_model_training.utils.utils import infer_predictor_col_name -class PresSplitColTransformer(): - +class PresSplitColTransformer: @staticmethod @print_df_dimensions_diff def convert_timestamp_dtype_and_nat(dataset: pd.DataFrame) -> pd.DataFrame: diff --git a/src/psycop_model_training/preprocessing/pre_split/row_filterer.py b/src/psycop_model_training/preprocessing/pre_split/row_filterer.py index 108220b8..101752a6 100644 --- a/src/psycop_model_training/preprocessing/pre_split/row_filterer.py +++ b/src/psycop_model_training/preprocessing/pre_split/row_filterer.py @@ -8,7 +8,7 @@ from psycop_model_training.utils.utils import get_percent_lost -class PreSplitRowFilterer(): +class PreSplitRowFilterer: def __init__(self): raise NotImplementedError diff --git a/src/psycop_model_training/training/train_and_eval.py b/src/psycop_model_training/training/train_and_eval.py index 00ec5e26..9a5c6c2b 100644 --- a/src/psycop_model_training/training/train_and_eval.py +++ b/src/psycop_model_training/training/train_and_eval.py @@ -10,17 +10,13 @@ from sklearn.pipeline import Pipeline from wasabi import Printer -from psycop_model_training.config.schemas import 
( - FullConfigSchema, -) +from psycop_model_training.utils.config_schemas import FullConfigSchema # from psycop_model_training.evaluation import evaluate_model from psycop_model_training.model_eval.dataclasses import EvalDataset from psycop_model_training.training.model_specs import MODELS from psycop_model_training.training.utils import create_eval_dataset -from psycop_model_training.utils.utils import ( - PROJECT_ROOT, -) +from psycop_model_training.utils.utils import PROJECT_ROOT CONFIG_PATH = PROJECT_ROOT / "src" / "psycop_model_training" / "config" @@ -223,4 +219,4 @@ def train_and_get_model_eval_df( n_splits=n_splits, ) - return eval_dataset \ No newline at end of file + return eval_dataset diff --git a/src/psycop_model_training/training/utils.py b/src/psycop_model_training/training/utils.py index 6c9b1aa4..99e6ae43 100644 --- a/src/psycop_model_training/training/utils.py +++ b/src/psycop_model_training/training/utils.py @@ -1,6 +1,6 @@ import pandas as pd -from psycop_model_training.config.schemas import FullConfigSchema +from psycop_model_training.utils.config_schemas import FullConfigSchema from psycop_model_training.model_eval.dataclasses import EvalDataset diff --git a/src/psycop_model_training/config/schemas.py b/src/psycop_model_training/utils/config_schemas.py similarity index 99% rename from src/psycop_model_training/config/schemas.py rename to src/psycop_model_training/utils/config_schemas.py index f16582eb..e3f5fc22 100644 --- a/src/psycop_model_training/config/schemas.py +++ b/src/psycop_model_training/utils/config_schemas.py @@ -248,7 +248,7 @@ def load_cfg_as_omegaconf( overrides: Optional[list[str]] = None, ) -> DictConfig: """Load config as omegaconf object.""" - with initialize(version_base=None, config_path="./"): + with initialize(version_base=None, config_path="../../../application/config/"): if overrides: cfg = compose( config_name=config_file_name, diff --git a/tests/conftest.py b/tests/conftest.py index adc5a600..7ffcb6ab 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,10 +5,10 @@ import pandas as pd import pytest -from psycop_model_training.config.schemas import FullConfigSchema, load_cfg_as_pydantic +from psycop_model_training.utils.config_schemas import FullConfigSchema, load_cfg_as_pydantic from psycop_model_training.model_eval.dataclasses import EvalDataset -CONFIG_DIR_PATH_REL = "../src/psycop_model_training/config" +CONFIG_DIR_PATH_REL = "../application/config" def add_age_gender(df): diff --git a/tests/model_evaluation/test_visualizations.py b/tests/model_evaluation/test_visualizations.py index c761d391..ee17e9d9 100644 --- a/tests/model_evaluation/test_visualizations.py +++ b/tests/model_evaluation/test_visualizations.py @@ -11,29 +11,25 @@ from sklearn.metrics import f1_score, roc_auc_score from psycop_model_training.model_eval.dataclasses import EvalDataset -from psycop_model_training.utils.utils import PROJECT_ROOT, positive_rate_to_pred_probs -from psycop_model_training.model_eval.plots import plot_prob_over_time -from psycop_model_training.model_eval.plots import plot_basic_chart -from psycop_model_training.model_eval.plots.feature_importance import ( - plot_feature_importances, -) -from psycop_model_training.model_eval.plots import ( - plot_performance_by_age, -) -from psycop_model_training.model_eval.plots.performance_by_n_hba1c import ( - plot_performance_by_n_hba1c, -) from psycop_model_training.model_eval.plots import ( + create_sensitivity_by_time_to_outcome_df, plot_auc_by_time_from_first_visit, + plot_auc_roc, + 
plot_basic_chart, plot_metric_by_calendar_time, plot_metric_by_cyclic_time, plot_metric_by_time_until_diagnosis, -) -from psycop_model_training.model_eval.plots import plot_auc_roc -from psycop_model_training.model_eval.plots import ( - create_sensitivity_by_time_to_outcome_df, + plot_performance_by_age, + plot_prob_over_time, plot_sensitivity_by_time_to_outcome_heatmap, ) +from psycop_model_training.model_eval.plots.feature_importance import ( + plot_feature_importances, +) +from psycop_model_training.model_eval.plots.performance_by_n_hba1c import ( + plot_performance_by_n_hba1c, +) +from psycop_model_training.utils.utils import PROJECT_ROOT, positive_rate_to_pred_probs @pytest.fixture(scope="function") diff --git a/tests/test_configs.py b/tests/test_configs.py index 8f85115a..b9e2a389 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -4,11 +4,11 @@ import pytest from hydra import compose, initialize -from psycop_model_training.config.schemas import convert_omegaconf_to_pydantic_object +from psycop_model_training.utils.config_schemas import convert_omegaconf_to_pydantic_object from psycop_model_training.utils.utils import PROJECT_ROOT CONFIG_DIR_PATH_ABS = PROJECT_ROOT / "src" / "psycop_model_training" / "config" -CONFIG_DIR_PATH_REL = "../src/psycop_model_training/config" +CONFIG_DIR_PATH_REL = "../application/config" def get_config_file_names() -> list[str]: diff --git a/tests/test_load.py b/tests/test_load.py index a03a6a34..1bf597d0 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -1,6 +1,6 @@ """Testing of loader functions.""" -from psycop_model_training.config.schemas import FullConfigSchema +from psycop_model_training.utils.config_schemas import FullConfigSchema from psycop_model_training.data_loader.utils import load_train_from_cfg diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index ef1767eb..ce9b7d76 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -1,5 +1,5 @@ """Test custom preprocessing steps.""" -from psycop_model_training.config.schemas import FullConfigSchema +from psycop_model_training.utils.config_schemas import FullConfigSchema from psycop_model_training.data_loader.utils import load_train_from_cfg from psycop_model_training.preprocessing.post_split.create_pipeline import ( create_preprocessing_pipeline, diff --git a/tests/test_train_model.py b/tests/test_train_model.py index fec6acac..99e5ddf6 100644 --- a/tests/test_train_model.py +++ b/tests/test_train_model.py @@ -3,9 +3,9 @@ import pytest -from psycop_model_training.config.schemas import FullConfigSchema, load_cfg_as_omegaconf +from application.train_model import main +from psycop_model_training.utils.config_schemas import FullConfigSchema, load_cfg_as_omegaconf from psycop_model_training.training.model_specs import MODELS -from application.t2d.train_model import main INTEGRATION_TEST_FILE_NAME = "integration_config.yaml" From 5f7abcf6aa875938288b3ad4ef35a9b915659898 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 16 Dec 2022 13:27:16 +0100 Subject: [PATCH 04/47] refactor: remove t2d specific outcome specification --- application/inspect_dataset.py | 2 +- application/main.py | 10 +- .../00_generate_dfs/10_medication.rmd | 58 ----- .../00_generate_dfs/20_hba1c.rmd | 33 --- .../00_generate_dfs/30_diagnoses.rmd | 67 ------ .../00_generate_dfs/41_find_first_p.rmd | 52 ---- .../00_generate_dfs/49_combined.rmd | 53 ----- .../10_descriptive_stats.rmd | 139 ----------- .../functions.r | 49 ---- .../t2d-plots.r | 223
------------------ .../t2d.Rproj | 13 - .../tests/tests.r | 49 ---- application/train_model.py | 8 +- .../archive/model_training_watcher.py | 2 +- .../data_loader/utils.py | 2 +- .../model_eval/evaluate_model.py | 2 +- .../post_split/create_pipeline.py | 2 +- .../training/train_and_eval.py | 3 +- src/psycop_model_training/training/utils.py | 2 +- tests/conftest.py | 5 +- tests/test_configs.py | 4 +- tests/test_load.py | 2 +- tests/test_preprocessing.py | 2 +- tests/test_train_model.py | 5 +- 24 files changed, 29 insertions(+), 758 deletions(-) delete mode 100644 application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/10_medication.rmd delete mode 100644 application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/20_hba1c.rmd delete mode 100644 application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/30_diagnoses.rmd delete mode 100644 application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/41_find_first_p.rmd delete mode 100644 application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/49_combined.rmd delete mode 100644 application/outcome_specification - move to t2d-feature-gen-repo/10_descriptive_stats.rmd delete mode 100644 application/outcome_specification - move to t2d-feature-gen-repo/functions.r delete mode 100644 application/outcome_specification - move to t2d-feature-gen-repo/t2d-plots.r delete mode 100644 application/outcome_specification - move to t2d-feature-gen-repo/t2d.Rproj delete mode 100644 application/outcome_specification - move to t2d-feature-gen-repo/tests/tests.r diff --git a/application/inspect_dataset.py b/application/inspect_dataset.py index 6e504cdb..d946f02b 100644 --- a/application/inspect_dataset.py +++ b/application/inspect_dataset.py @@ -1,6 +1,6 @@ """Example of how to inspect a dataset using the configs.""" -from psycop_model_training.utils.config_schemas import load_cfg_as_pydantic from psycop_model_training.data_loader.utils import load_train_from_cfg, load_train_raw +from psycop_model_training.utils.config_schemas import load_cfg_as_pydantic def main(): diff --git a/application/main.py b/application/main.py index 9657b156..099aabc1 100644 --- a/application/main.py +++ b/application/main.py @@ -15,16 +15,16 @@ from random_word import RandomWords from wasabi import Printer -from psycop_model_training.utils.config_schemas import ( - BaseModel, - FullConfigSchema, - load_cfg_as_pydantic, -) from psycop_model_training.data_loader.utils import load_train_raw from psycop_model_training.model_eval.evaluate_model import ( infer_look_distance, infer_outcome_col_name, ) +from psycop_model_training.utils.config_schemas import ( + BaseModel, + FullConfigSchema, + load_cfg_as_pydantic, +) def start_trainer( diff --git a/application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/10_medication.rmd b/application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/10_medication.rmd deleted file mode 100644 index 88ef6c59..00000000 --- a/application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/10_medication.rmd +++ /dev/null @@ -1,58 +0,0 @@ -Find first occurrence of hospital prescription or hospital redemption of diabetic medication. 
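The core pattern of the notebook being deleted here is: restrict each medication source to ATC codes starting with "A10" (antidiabetic medication), take the earliest prescription date per patient within each source, stack the sources, and keep the overall first date. Below is a minimal dplyr sketch of that reduction; the toy tables, dates and ATC values are invented stand-ins for the SQL extracts in the chunks that follow, and `summarise(min(...))` stands in for the notebook's `filter(x == min(x))` plus `distinct()` idiom.

```{r}
library(dplyr)
library(tibble)

# Invented stand-ins for the administered/prescribed medication extracts
administered <- tribble(
  ~dw_ek_borger, ~datotid_ordination_start, ~atc,
  1, as.Date("2016-03-01"), "A10BA02",
  2, as.Date("2017-08-01"), "N05AH03" # not antidiabetic, filtered out
)
prescribed <- tribble(
  ~dw_ek_borger, ~datotid_ordination_start, ~atc,
  1, as.Date("2015-01-01"), "A10BB12",
  2, as.Date("2018-05-01"), "A10AE04"
)

df_first_t2d_medication <- bind_rows(administered, prescribed) %>%
  filter(substr(atc, 1, 3) == "A10") %>% # A10 = all antidiabetic medication
  group_by(dw_ek_borger) %>%
  summarise(datotid_first_t2d_medication = min(datotid_ordination_start))
# Patient 1 -> 2015-01-01 (prescribed before administered); patient 2 -> 2018-05-01
```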
- -```{r} -library("pacman") - -p_load(tidyverse, here, future) -source(here("psycop-r-utilities", "import_from_sql.r")) -source(here("functions.r")) - -``` - -```{r} -con <- DBI::dbConnect( - odbc::odbc(), - Driver = "SQL Server", - Server = "BI-DPA-PROD", - database = "USR_PS_Forsk", - Trusted_Connection = "TRUE" -) -``` - -# Get first date of prescribed antidiabetic medication for each patient -## From only that administered -```{r} -df_first_administered_t2d_medication <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_Medicin_administreret_inkl_2021")) %>% - select(dw_ek_borger, datotid_ordination_start, atc) %>% - filter(substr(atc, 1, 3) == "A10") %>% # A10 is all antidiabetic medication - group_by(dw_ek_borger) %>% - filter(datotid_ordination_start == min(datotid_ordination_start)) %>% - collect %>% - format_sql_import() %>% - distinct(dw_ek_borger, datotid_ordination_start) -``` - -## From only that prescribed -```{r} -df_first_prescribed_t2d_medication <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_Medicin_ordineret_inkl_2021")) %>% - filter(substr(atc, 1, 3) == "A10") %>% - select(dw_ek_borger, datotid_ordinationstart, atc) %>% - rename(datotid_ordination_start = datotid_ordinationstart) %>% - group_by(dw_ek_borger) %>% - filter(datotid_ordination_start == min(datotid_ordination_start)) %>% - collect %>% - format_sql_import() %>% - distinct(dw_ek_borger, datotid_ordination_start) -``` - -## Combined -```{r} -df_first_date_of_t2d_medication_prescription <- df_first_administered_t2d_medication %>% - bind_rows(df_first_prescribed_t2d_medication) %>% - group_by(dw_ek_borger) %>% - filter(datotid_ordination_start == min(datotid_ordination_start)) %>% - filter(row_number() == 1) %>% - rename(datotid_first_t2d_medication=datotid_ordination_start) -``` \ No newline at end of file diff --git a/application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/20_hba1c.rmd b/application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/20_hba1c.rmd deleted file mode 100644 index 9e6382b8..00000000 --- a/application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/20_hba1c.rmd +++ /dev/null @@ -1,33 +0,0 @@ -Find the first date where a patient gets a diabetic hba1c-blood-sample. 
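For context, 48 mmol/mol is the standard diagnostic HbA1c cut-off for diabetes, which is what the `numerisksvar >= 48` filter in the chunk below encodes. A minimal sketch of the same filter-then-keep-first-sample pattern on invented data follows; only the column names from the query below are real, and `slice_min()` stands in for the notebook's `filter(x == min(x))` idiom.

```{r}
library(dplyr)
library(tibble)

# Invented stand-in for the HbA1c lab extract
hba1c <- tribble(
  ~dw_ek_borger, ~datotid_proevemodtagelse, ~numerisksvar,
  1, as.Date("2015-02-01"), 44, # below the diabetic threshold, dropped
  1, as.Date("2016-09-01"), 51,
  1, as.Date("2017-01-01"), 55,
  2, as.Date("2018-03-01"), 49
)

df_first_t2d_blood_sample <- hba1c %>%
  filter(numerisksvar >= 48) %>% # 48 mmol/mol = diabetic range
  group_by(dw_ek_borger) %>%
  slice_min(datotid_proevemodtagelse, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  rename(datotid_start = datotid_proevemodtagelse)
# Patient 1 keeps the 2016-09-01 sample; the earlier normal result is ignored
```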
- -```{r} -library("pacman") - -p_load(tidyverse, here, future) - -``` - -```{r} -con <- DBI::dbConnect( - odbc::odbc(), - Driver = "SQL Server", - Server = "BI-DPA-PROD", - database = "USR_PS_Forsk", - Trusted_Connection = "TRUE" -) -``` - -# Get first date of hba1c above threshold -## From only that administered -```{r} -df_first_t2d_blood_sample <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_LABKA_NPU27300_HbA1c_inkl_2021")) %>% - select(dw_ek_borger, datotid_proevemodtagelse, numerisksvar, analysenavn) %>% - filter(numerisksvar >= 48) %>% - group_by(dw_ek_borger) %>% - filter(datotid_proevemodtagelse == min(datotid_proevemodtagelse)) %>% - rename(datotid_start = datotid_proevemodtagelse) %>% - collect %>% - distinct(dw_ek_borger, datotid_start) %>% - format_sql_import() -``` \ No newline at end of file diff --git a/application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/30_diagnoses.rmd b/application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/30_diagnoses.rmd deleted file mode 100644 index 65e9e32c..00000000 --- a/application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/30_diagnoses.rmd +++ /dev/null @@ -1,67 +0,0 @@ -Find the first date where a patient gets a t2d-diagnosis in the hospital system. - -```{r} -library("pacman") -``` - -```{r} -con <- DBI::dbConnect( - odbc::odbc(), - Driver = "SQL Server", - Server = "BI-DPA-PROD", - database = "USR_PS_Forsk", - Trusted_Connection = "TRUE" -) -``` - -# A-diagnoses -## LPR3 -```{r} -df_lpr3_diagnoses_roughly_selected <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_LPR3kontakter_psyk_somatik_inkl_2021")) %>% - rename(datotid_start = datotid_lpr3kontaktstart) %>% - select(dw_ek_borger, datotid_start, diagnosegruppestreng) %>% - collect %>% - format_sql_import() -``` - -## LPR2 -### Inpatient visits -```{r} -df_lpr2_diagnoses_inpatient_roughly_selected <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_indlaeggelser_psyk_somatik_LPR2_inkl_2021")) %>% - rename(datotid_start = datotid_indlaeggelse) %>% - select(dw_ek_borger, datotid_start, diagnosegruppestreng) %>% - collect %>% - format_sql_import() -``` - -### Outpatient visits -```{r} -df_lpr2_diagnoses_outpatient_roughly_selected <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_besoeg_psyk_somatik_LPR2_inkl_2021")) %>% - select(dw_ek_borger, datotid_start, diagnosegruppestreng) %>% - collect %>% - format_sql_import() -``` - -## Combined -```{r} -df_all_visits_combined <- df_lpr3_diagnoses_roughly_selected %>% - bind_rows(df_lpr2_diagnoses_inpatient_roughly_selected) %>% - bind_rows(df_lpr2_diagnoses_outpatient_roughly_selected) -``` - -### T2D -```{r} -df_first_t2d_diagnoses_combined <- df_all_visits_combined %>% - keep_only_first_t2d_by_diag(date_col_string="datotid_start") %>% - rename(datotid_first_t2d_diagnosis = datotid_start) -``` - -### T1D -```{r} -df_first_t1d_diagnoses_combined <- df_all_visits_combined %>% - keep_only_first_t1d_by_diag(date_col_string="datotid_start") %>% - select(dw_ek_borger, datotid_start) -``` \ No newline at end of file diff --git a/application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/41_find_first_p.rmd b/application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/41_find_first_p.rmd deleted file mode 100644 index 4f6a293a..00000000 --- a/application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/41_find_first_p.rmd +++ /dev/null @@ -1,52 +0,0 @@ -```{r} -source(here("psycop-r-utilities", "import_from_sql.r")) 
-p_load(tidyverse) -``` - -# Remove patients with incidence before first psych-contact -## LPR3, both in and outpatient -```{r} -pt_types = c("Ambulant", "Indlagt") - -df_lpr3_preproc <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_LPR3kontakter_psyk_somatik_inkl_2021")) %>% - filter(pt_type %in% pt_types) %>% - filter(substr(shakkode_lpr3kontaktophold, 1, 4) == "6600") %>% # Only psychiatry in RM - select(dw_ek_borger, datotid_lpr3kontaktstart) %>% - rename(datotid_start = datotid_lpr3kontaktstart) %>% - collect %>% - format_sql_import() -``` - -## LPR2 -### LPR2 inpatient -```{r} -df_lpr2_inp_preproc <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_indlaeggelser_psyk_somatik_LPR2_inkl_2021")) %>% - filter(substr(shakkode_kontaktansvarlig, 1, 4) == "6600") %>% # Only psychiatry in RM - rename(datotid_start = datotid_indlaeggelse) %>% - select(dw_ek_borger, datotid_start) %>% - collect %>% - format_sql_import() -``` - -### LPR2 outpatient -```{r} -df_lpr2_outp_preproc <- con %>% - tbl(sql("SELECT * FROM [fct].FOR_besoeg_psyk_somatik_LPR2_inkl_2021")) %>% - filter(substr(shakafskode, 1, 4) == "6600") %>% # Only psychiatry in RM - select(dw_ek_borger, datotid_start) %>% - collect %>% - format_sql_import() -``` - -# Combine all -```{r} -df_first_psych_visit <- df_lpr2_inp_preproc %>% - bind_rows(df_lpr2_outp_preproc) %>% - bind_rows(df_lpr3_preproc) %>% - group_by(dw_ek_borger) %>% - filter(datotid_start == min(datotid_start)) %>% - rename(datotid_first_psych_visit = datotid_start) %>% - select(dw_ek_borger, datotid_first_psych_visit) -``` \ No newline at end of file diff --git a/application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/49_combined.rmd b/application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/49_combined.rmd deleted file mode 100644 index ebb1976d..00000000 --- a/application/outcome_specification - move to t2d-feature-gen-repo/00_generate_dfs/49_combined.rmd +++ /dev/null @@ -1,53 +0,0 @@ -Combine medication, hba1c and diagnoses to find first date where the patient has t2d. - -```{r} -source(here("functions.r")) -p_load(odbc, dbplyr, DBI) -``` - -## Find "any" diabetes incidence (maximise sensitivity). For use in wash-in (i.e. exclusion). -```{r} -df_first_diabetes_any <- df_first_t2d_blood_sample %>% - rename(datotid_first_t2d_bs = datotid_start) %>% # Add BS - left_join(df_first_date_of_t2d_medication_prescription) %>% - left_join(df_first_t2d_diagnoses_combined) %>% - mutate(datotid_first_diabetes_any = pmin(datotid_first_t2d_medication, datotid_first_t2d_diagnosis, datotid_first_t2d_bs)) %>% - group_by(dw_ek_borger) %>% - filter(datotid_first_diabetes_any == min(datotid_first_diabetes_any)) %>% # Make sure to only have one record per patient - select(dw_ek_borger, datotid_first_diabetes_any) %>% - distinct(dw_ek_borger, datotid_first_diabetes_any) %>% - left_join(df_first_t1d_diagnoses_combined, by = "dw_ek_borger") %>% - rename(datotid_first_t1d_diagnosis = datotid_start) %>% - mutate(datotid_first_diabetes_any = if_else(is.na(datotid_first_t1d_diagnosis), datotid_first_diabetes_any, min(datotid_first_t1d_diagnosis, datotid_first_diabetes_any))) %>% - select(dw_ek_borger, datotid_first_diabetes_any) # Keep only if no t1d diagnosis before t2d: 601 - -copy_to(con, df_first_diabetes_any, name = in_schema("fct", "psycop_t2d_first_diabetes_any"), overwrite = TRUE, temporary = FALSE) - -dim(df_first_diabetes_any) -``` - -## Find "true" incidences (maximise specificity.). For use when training and evaluating model. 
Try to exclude anyone that is incident due to other causes. See issue #12 regarding reasoning. -```{r} -df_first_t2d_bs_only <- df_first_t2d_blood_sample %>% - rename(datotid_first_t2d_bs = datotid_start) %>% # Add BS - left_join(df_first_date_of_t2d_medication_prescription) %>% - filter(datotid_first_t2d_bs < datotid_first_t2d_medication) %>% - left_join(df_first_t2d_diagnoses_combined) %>% - filter(datotid_first_t2d_bs < datotid_first_t2d_diagnosis) %>% - group_by(dw_ek_borger) %>% - filter(datotid_first_t2d_bs == min(datotid_first_t2d_bs)) %>% # Make sure to only have one record per patient - select(dw_ek_borger, datotid_first_t2d_bs) %>% - distinct(dw_ek_borger, datotid_first_t2d_bs) %>% - left_join(df_first_psych_visit) %>% # 3010 - filter(datotid_first_psych_visit < datotid_first_t2d_bs) %>% # Keep only if diabetes is diagnosed after first psych visit: 810 - left_join(rename(df_first_t1d_diagnoses_combined, datotid_first_t1d_diagnosis = datotid_start), by = "dw_ek_borger") %>% - mutate(!(datotid_first_t1d_diagnosis < datotid_first_t2d_bs)) %>% - filter(is.na(datotid_first_t1d_diagnosis) | !(datotid_first_t1d_diagnosis < datotid_first_t2d_bs)) %>% # Keep only if no t1d diagnosis before t2d: 601 - select(dw_ek_borger, datotid_first_t2d_bs) %>% - rename(timestamp = datotid_first_t2d_bs) - -copy_to(con, df_first_t2d_bs_only, name = in_schema("fct", "psycop_t2d_first_diabetes_t2d"), overwrite = TRUE, temporary = FALSE) - -dim(df_first_t2d_bs_only) -``` - diff --git a/application/outcome_specification - move to t2d-feature-gen-repo/10_descriptive_stats.rmd b/application/outcome_specification - move to t2d-feature-gen-repo/10_descriptive_stats.rmd deleted file mode 100644 index 2994a54d..00000000 --- a/application/outcome_specification - move to t2d-feature-gen-repo/10_descriptive_stats.rmd +++ /dev/null @@ -1,139 +0,0 @@ -```{r} -library(pacman) -p_load(ggplot2, ggbeeswarm, tidyverse, here, ggbeeswarm) - -source(here("psycop_r_utils", "import_from_sql.r")) -``` - - -```{r} -df_demographics <- get_fct("FOR_kohorte_demografi_inkl_2021") %>% - format_sql_import() %>% - mutate(foedselsdato = ymd(foedselsdato)) -``` - -```{r} -df_first_t2d_processed <- read_csv(here("csv", "df_first_t2d_bs_only.csv")) - -df_first_psych_visit <- read_csv(here("csv", "df_first_psych_visit.csv")) -``` - -# Age at first t2d for patients with "true"" positives in cohort time -```{r} -df_age_at_first_t2d <- df_first_t2d_processed %>% - left_join(df_demographics) %>% - mutate(age_at_first_t2d = time_length(difftime(datotid_first_t2d, foedselsdato), "years")) -``` - -## Raincloud -```{r} -ggplot(df_age_at_first_t2d %>% mutate(group=1), aes(x = age_at_first_t2d, y = group)) + - ggdist::stat_halfeye( - adjust = .5, - width = .6, - .width = 0, - justification = -.3, - point_colour = NA) + - geom_boxplot( - width = .1, - outlier.shape = NA - ) + - geom_quasirandom( - size = 1, - alpha = .3, - position = position_jitter( - seed = 1, width = .05 - ), - groupOnX = FALSE - ) + - coord_cartesian(xlim = c(1.2, NA), clip = "off") + - scale_x_continuous(breaks = seq(5, 100, by = 5)) + - ggtitle("Age at first t2d for patients with 'true' positives in cohort time") -``` - -## Cumulative distribution -```{r} -ggplot(df_age_at_first_t2d, aes(x = age_at_first_t2d)) + - stat_ecdf(geom = "step") + - scale_x_continuous(breaks = seq(5, 100, by = 5)) + - scale_y_continuous(breaks = seq(0, 1, by = 0.05)) + - ggtitle("Age at first t2d for patients with 'true' positives in cohort time") -``` - -```{r} -df_without_children <- 
df_age_at_first_t2d %>% # 3284 - filter(age_at_first_t2d > 30) %>% # 2883 - filter(age_at_first_t2d < 90) # 2804 -``` - -# Number of potentially true-positives that can generate predictions for increasing ∆t -```{r} -df_all_visits_combined <- read_csv(here("csv", "all_visits_combined.csv")) -``` - -```{r} -df_visits_for_size_of_prediction_window <- df_all_visits_combined %>% - rename(datotid_besoeg = datotid_start) %>% - inner_join(df_first_t2d_processed, by="dw_ek_borger") %>% - select(datotid_besoeg, datotid_first_t2d, dw_ek_borger) %>% - mutate(years_from_visit_to_t2d = time_length(difftime(datotid_first_t2d, datotid_besoeg), "years")) %>% - mutate(years_to_end_of_follow_up = time_length(difftime(max(datotid_besoeg), datotid_besoeg), "years")) %>% - filter(years_from_visit_to_t2d > 0) # Drop all visits that are before event %>% -``` - -```{r} -df_size_of_prediction_window_with_selected_cols <- df_visits_for_size_of_prediction_window - -for (i in 1:100) { - colname = paste0("window_", i) - - df_size_of_prediction_window_with_selected_cols <- df_size_of_prediction_window_with_selected_cols %>% - mutate({{colname}} := if_else(((years_from_visit_to_t2d < i/12) & (years_to_end_of_follow_up > i/12)), 1, 0)) -} -``` - -## For each visit -```{r} -df_size_of_prediction_window_summarised <- df_size_of_prediction_window_with_selected_cols %>% - summarise(across(starts_with("window"), mean, .names = "mean_{.col}")) %>% - pivot_longer(cols = starts_with("mean_"), - names_to = "window_size_months", - values_to = "percent_included") %>% - mutate(window_size_months = as.numeric(gsub("mean_window_", "", window_size_months))) -``` - -```{r} -filepath <- here("figures", "window_size_visits.png") - -plot <- ggplot(df_size_of_prediction_window_summarised, aes(x = window_size_months, y = percent_included)) + - geom_point() + - scale_x_continuous(breaks = seq(0, 100, by = 2)) + - ggtitle("Proportion of potentially true-positive visits that are positive as a function of window size") - -ggsave(filepath, plot, dpi = 300, width = 5, height = 3) -``` - -## For each patient -```{r} -df_predict_window_size_patients <- df_size_of_prediction_window_with_selected_cols %>% - group_by(dw_ek_borger) %>% - summarise(across(starts_with("window"), max, .names = "{.col}")) %>% - ungroup() %>% - summarise(across(starts_with("window"), mean, .names = "mean_{.col}")) %>% - pivot_longer(cols = starts_with("mean_"), - names_to = "window_size_months", - values_to = "percent_included") %>% - mutate(window_size_months = as.numeric(gsub("mean_window_", "", window_size_months))) -``` - -```{r} -filepath <- here("figures", "window_size_patients.png") - -plot <- ggplot(df_predict_window_size_patients, aes(x = window_size_months, y = percent_included)) + - geom_point() + - scale_x_continuous(breaks = seq(0, 100, by = 2)) + - scale_y_continuous(breaks = seq(0, 1, by = 0.05)) + - ggtitle("Proportion of potentially true-positive patients that are positive as a function of window size") - -ggsave(filepath, plot, dpi = 300, width = 5, height = 3) -``` diff --git a/application/outcome_specification - move to t2d-feature-gen-repo/functions.r b/application/outcome_specification - move to t2d-feature-gen-repo/functions.r deleted file mode 100644 index 583f9e58..00000000 --- a/application/outcome_specification - move to t2d-feature-gen-repo/functions.r +++ /dev/null @@ -1,49 +0,0 @@ -event_start_date <- ymd("2014-01-01") - -str_contains_t2d_diag <- function(str) { - t2d_regex_pattern <- 
"(:DE1[1-5].*)|(:DE16[0-2].*)|(:DO24.*)|(:DT383A.*)|(:DM142.*)|(:DG590.*)|(:DG632*)|(:DH280.*)|(:DH334.*)|(:DH360.*)|(:DH450.*)|(:DN083.*)" - - if (isTRUE(str_detect(str, t2d_regex_pattern))) { - return(TRUE) - } - - return(FALSE) -} - -keep_only_first_t2d_by_diag <- function(df, date_col_string) { - str_contains_t2d_diag_vecced <- Vectorize(str_contains_t2d_diag) - - df %>% - filter(str_contains_t2d_diag_vecced((diagnosegruppestreng))) %>% - group_by(dw_ek_borger) %>% - filter(date_col_string == min(date_col_string)) %>% - filter(row_number() == 1) %>% - ungroup() -} - -str_contains_t1d_diag <- function(str) { - t1d_regex_pattern <- "(:DE10.*)|(:DO240.*)" - - if (isTRUE(str_detect(str, t1d_regex_pattern))) { - return(TRUE) - } - - return(FALSE) -} - -keep_only_first_t1d_by_diag <- function(df, date_col_string) { - str_contains_t1d_diag_vecced <- Vectorize(str_contains_t1d_diag) - - df %>% - filter(str_contains_t1d_diag_vecced((diagnosegruppestreng))) %>% - group_by(dw_ek_borger) %>% - filter(date_col_string == min(date_col_string)) %>% - filter(row_number() == 1) %>% - ungroup() -} - -visit_can_generate_prediction <- function(col1, col2, window_width_years) { - if_else(({{col1}}% - rename_with(tolower) - -df_planned_psych_visits <- df_planned_visits_raw %>% - filter(substr(shakafskode_besoeg, 1, 4) == "6600") %>% - filter(psykambbesoeg == 1) %>% - select(dw_ek_borger, datotid_start) %>% - arrange(datotid_start) - -df_p_samp <- df_planned_psych_visits %>% - filter(dw_ek_borger == 31) %>% - arrange(datotid_start) - -# Iterate over planned visits to only keep those, that are not within 3 months from last prediction -drop_within_3_months_from_prediction <- function(df) { - # Only takes as an input a dataframe that is already sorted by date (!!!) - current_CPR <- 0 - patient_i <- 0 - last_selected_date <- 0 - indeces_to_drop <- c() - - for (i in 1:nrow(df)) { - # print(str_c("Row_CPR, Current CPR: ", df$dw_ek_borger[i], ", ", current_CPR)) - - if (df$dw_ek_borger[i] != current_CPR) { # Handle switching to new person - current_CPR = df$dw_ek_borger[i] - last_selected_date = ymd_hms(df$datotid_start[i]) - - if (patient_i %% 100 == 0 ) { - print(str_c("Processing patient nr. 
", patient_i)) - } - - patient_i <- patient_i + 1 - - next() - } - - if (df$dw_ek_borger[i] == current_CPR) { # Handle comparison of current visit to previous selected date - if (ymd_hms(df$datotid_start[i]) < (as.Date(last_selected_date) + 90)) { - indeces_to_drop <- c(indeces_to_drop, i) - } else { - last_selected_date <- df$datotid_start[i] - } - } - } - - return(df %>% slice(-indeces_to_drop)) -} - -df_planned_with_3m_spacing <- drop_within_3_months_from_prediction(df_planned_psych_visits) - - - -####### -# Age # -####### -df_demo_raw <- dbGetQuery(conn, "SELECT * FROM [fct].FOR_kohorte_demografi") %>% - rename_with(tolower) - -df_demo <- df_demo_raw %>% - select(foedselsdato, dw_ek_borger) %>% - mutate(foedselsdato = ymd(foedselsdato)) - -############### -# First psych # -############### -df_psyk_raw <- dbGetQuery(conn, "SELECT * FROM [fct].FOR_besoeg_fysiske_fremmoeder") %>% - rename_with(tolower) - -df_first_p <- df_psyk_raw %>% - select(dw_ek_borger, datotid_start) %>% - group_by(dw_ek_borger) %>% - arrange(datotid_start, .by_group=TRUE) %>% - filter(row_number() == 1) %>% - rename(datotid_f_psych = datotid_start) - - -############### -# T2D samples # -############### -# Raw -df_hba1c_raw <- dbGetQuery(conn, "SELECT * FROM [fct].FOR_LABKA_NPU27300_HbA1c") %>% - rename_with(tolower) - -df_maybe_t2d <- df_hba1c_raw %>% - select(datotid_godkendtsvar, svar, dw_ek_borger) %>% - mutate(svar = as.numeric(svar)) %>% - filter(svar > 47) %>% - select(-svar) %>% #Remove incidences that are before first psych contact - left_join(df_first_p) %>% - filter(datotid_f_psych < datotid_godkendtsvar) %>% - group_by(dw_ek_borger) %>% - arrange(datotid_godkendtsvar, .by_group = TRUE) %>% # Keep only first row - filter(row_number() == 1) %>% - rename(datotid_maybe_t2d = datotid_godkendtsvar) %>% - mutate(datotid_maybe_t2d = ymd_hms(datotid_maybe_t2d)) - - -df_probably_t2d <- df_hba1c_raw %>% - select(datotid_godkendtsvar, svar, dw_ek_borger) %>% - mutate(svar = as.numeric(svar)) %>% - filter(is.na(svar) == FALSE) %>% #Remove incidences that are before first psych contact - left_join(df_first_p) %>% - filter(datotid_f_psych < datotid_godkendtsvar) %>% - group_by(dw_ek_borger) %>% # Check if first HbA1c was normal - arrange(datotid_godkendtsvar, .by_group=TRUE) %>% - mutate(first_hba1c_normal = svar[1] < 48) %>% - filter(svar > 47 & first_hba1c_normal == TRUE) %>% - group_by(dw_ek_borger) %>% - arrange(datotid_godkendtsvar, .by_group = TRUE) %>% - filter(row_number() == 1) %>% # Keep only first match - select(datotid_godkendtsvar) %>% - rename(datotid_probably_t2d = datotid_godkendtsvar) %>% - mutate(datotid_probably_t2d = ymd_hms(datotid_probably_t2d)) - -######## -# Plot # -######## -setwd("E:/Users/adminmanber/Desktop/T2D") - -############## -# Age at T2D # -############## -gen_plot_age_df <- function(df, outcome) { - df_out <- df %>% - left_join(df_demo) %>% - mutate(age_at_t2d = interval(foedselsdato, {{outcome}}) / years(1)) - - return(df_out) -} - -df_plot_probably_t2d <- gen_plot_age_df(df_probably_t2d, datotid_probably_t2d) - -df_plot_maybe_t2d <- gen_plot_age_df(df_maybe_t2d, datotid_maybe_t2d) - -save_histogram <- function(df, x_var, filename) { - gg <- ggplot(df, aes(x={{x_var}})) + - geom_histogram(binwidth=1) + - labs( - title = filename, - x = "Age at incident T2D (years)", - y = "Count" - ) + - scale_x_continuous( - breaks = seq(15, 100, by=5), - limits = c(15, 100) - ) - - ggsave(str_c("figures/", filename, ".png"), width = 20, height = 10, dpi = 100, units = "in") - - gg -} - 
-save_histogram(df_plot_probably_t2d, age_at_t2d, "age_at_first_t2d_hba1c_after_normal_hba1c") -save_histogram(df_plot_maybe_t2d, age_at_t2d, "age_at_first_t2d_hba1c") - - -################################## -# Time from planned visit to T2D # -################################## -gen_planned_to_event_df <- function(df_event, event_col, df_planned_visits) { - df_out <- df_event %>% - inner_join(df_planned_visits) %>% - rename(visit_start = datotid_start) %>% - mutate(years_since_visit = interval(visit_start, {{event_col}}) / years(1)) %>% - filter(years_since_visit > 0.25) - -} - -df_planned_to_probable_t2d <- gen_planned_to_event_df(df_event=df_probably_t2d, - event_col=datotid_probably_t2d, - df_planned_visits=df_planned_psych_visits) - -df_planned_to_maybe_t2d <- gen_planned_to_event_df(df_event=df_maybe_t2d, - event_col=datotid_maybe_t2d, - df_planned_visits=df_planned_psych_visits) - -save_time_from_visit <- function(df, x_var, filename) { - gg <- ggplot(df, aes(x={{x_var}})) + - scale_x_continuous( - breaks = seq(0, 10, by=0.25), - limits = c(0, 10) - ) - - hist <- gg + - geom_histogram( - binwidth = 0.25 - ) - - box <- gg + - geom_boxplot() - - combined <- hist + box + plot_layout(nrow = 2, height = c(2, 1)) - - ggsave(str_c("figures/", filename, "_histogram.png"), width = 20, height = 20, dpi = 100, units = "in") - - combined -} - -save_time_from_visit(df_planned_to_maybe_t2d, years_since_visit, "years_until_maybe_t2d_for_visit_histogram") -save_time_from_visit(df_planned_to_probable_t2d, years_since_visit, "years_until_probable_t2d_for_visit_histogram") diff --git a/application/outcome_specification - move to t2d-feature-gen-repo/t2d.Rproj b/application/outcome_specification - move to t2d-feature-gen-repo/t2d.Rproj deleted file mode 100644 index 8e3c2ebc..00000000 --- a/application/outcome_specification - move to t2d-feature-gen-repo/t2d.Rproj +++ /dev/null @@ -1,13 +0,0 @@ -Version: 1.0 - -RestoreWorkspace: Default -SaveWorkspace: Default -AlwaysSaveHistory: Default - -EnableCodeIndexing: Yes -UseSpacesForTab: Yes -NumSpacesForTab: 2 -Encoding: UTF-8 - -RnwWeave: Sweave -LaTeX: pdfLaTeX diff --git a/application/outcome_specification - move to t2d-feature-gen-repo/tests/tests.r b/application/outcome_specification - move to t2d-feature-gen-repo/tests/tests.r deleted file mode 100644 index f6d1b22d..00000000 --- a/application/outcome_specification - move to t2d-feature-gen-repo/tests/tests.r +++ /dev/null @@ -1,49 +0,0 @@ - -library("pacman") - -p_load(testthat, here, xpectr) - -source(here("src", "functions.r")) - -test_df <- tribble( - ~diagnosegruppestreng, ~datotid_lpr3kontaktstart, ~dw_ek_borger, - "A:DE14#+:ALFC3", "2021-06-30 09:00:00.0000000", 1, - "A:DE14#+:ALFC3", "2021-05-30 09:00:00.0000000", 1, - "A:DE14#+:ALFC3", "2021-04-30 09:00:00.0000000", 1 -) - -source(here("src", "functions.r")) -output_df <- keep_only_first_t2d_by_diag(test_df, "datotid_lpr3kontaktstart") - -test_that("Correct diagnosegruppe-matching",{ - # Testing column values - expect_equal( - output_df[["diagnosegruppestreng"]], - "A:DE14#+:ALFC3", - fixed = TRUE) - expect_equal( - output_df[["dw_ek_borger"]], - 1, - tolerance = 1e-4) -}) - -test_window_gen_df <- tribble( - ~years_from_visit_to_t2d, ~years_to_end_of_follow_up, ~dw_ek_borger, - 1, 1, 1, - 1, 2, 2 -) - -output_df_window <- mutate(test_window_gen_df, window_1 = visit_can_generate_prediction_vecced(years_from_visit_to_t2d, years_from_visit_to_t2d, 1)) - -test_window_grouped_df <- tribble( - ~dw_ek_borger, ~window_1, ~window_2, - 1, 1, 0, - 1, 0, 
0, - 1, 0, 1, - 2, 0, 0, - 2, 1, 0 -) - -df_out_window_group <- test_window_grouped_df %>% - group_by(dw_ek_borger) %>% - summarise(across(starts_with("window"), max, .names = "{.col}")) \ No newline at end of file diff --git a/application/train_model.py b/application/train_model.py index 8c240bc7..58b0dbba 100644 --- a/application/train_model.py +++ b/application/train_model.py @@ -4,10 +4,6 @@ import numpy as np import wandb -from psycop_model_training.utils.config_schemas import ( - FullConfigSchema, - convert_omegaconf_to_pydantic_object, -) from psycop_model_training.data_loader.utils import load_train_and_val_from_cfg from psycop_model_training.model_eval.dataclasses import PipeMetadata from psycop_model_training.model_eval.evaluate_model import run_full_evaluation @@ -20,6 +16,10 @@ train_and_get_model_eval_df, ) from psycop_model_training.utils.col_name_inference import get_col_names +from psycop_model_training.utils.config_schemas import ( + FullConfigSchema, + convert_omegaconf_to_pydantic_object, +) from psycop_model_training.utils.utils import ( PROJECT_ROOT, create_wandb_folders, diff --git a/src/psycop_model_training/archive/model_training_watcher.py b/src/psycop_model_training/archive/model_training_watcher.py index 352042fd..92821039 100644 --- a/src/psycop_model_training/archive/model_training_watcher.py +++ b/src/psycop_model_training/archive/model_training_watcher.py @@ -13,9 +13,9 @@ from wandb.sdk.wandb_run import Run # pylint: disable=no-name-in-module from wasabi import msg -from psycop_model_training.utils.config_schemas import FullConfigSchema from psycop_model_training.model_eval.dataclasses import ModelEvalData from psycop_model_training.model_eval.evaluate_model import run_full_evaluation +from psycop_model_training.utils.config_schemas import FullConfigSchema from psycop_model_training.utils.utils import ( MODEL_PREDICTIONS_PATH, PROJECT_ROOT, diff --git a/src/psycop_model_training/data_loader/utils.py b/src/psycop_model_training/data_loader/utils.py index 3f16162a..e73fe1ad 100644 --- a/src/psycop_model_training/data_loader/utils.py +++ b/src/psycop_model_training/data_loader/utils.py @@ -3,9 +3,9 @@ import pandas as pd -from psycop_model_training.utils.config_schemas import FullConfigSchema from psycop_model_training.data_loader.data_classes import SplitDataset from psycop_model_training.data_loader.data_loader import DataLoader +from psycop_model_training.utils.config_schemas import FullConfigSchema def get_latest_dataset_dir(path: Path) -> Path: diff --git a/src/psycop_model_training/model_eval/evaluate_model.py b/src/psycop_model_training/model_eval/evaluate_model.py index 8a4eb2a4..cc16cfb3 100644 --- a/src/psycop_model_training/model_eval/evaluate_model.py +++ b/src/psycop_model_training/model_eval/evaluate_model.py @@ -8,7 +8,6 @@ from sklearn.metrics import recall_score from wandb.sdk.wandb_run import Run as wandb_run # pylint: disable=no-name-in-module -from psycop_model_training.utils.config_schemas import FullConfigSchema from psycop_model_training.model_eval.dataclasses import ( ArtifactContainer, EvalDataset, @@ -37,6 +36,7 @@ generate_feature_importances_table, generate_selected_features_table, ) +from psycop_model_training.utils.config_schemas import FullConfigSchema from psycop_model_training.utils.utils import positive_rate_to_pred_probs diff --git a/src/psycop_model_training/preprocessing/post_split/create_pipeline.py b/src/psycop_model_training/preprocessing/post_split/create_pipeline.py index 12764cf6..5af38c69 100644 --- 
a/src/psycop_model_training/preprocessing/post_split/create_pipeline.py +++ b/src/psycop_model_training/preprocessing/post_split/create_pipeline.py @@ -10,7 +10,6 @@ from sklearn.preprocessing import StandardScaler from wasabi import Printer -from psycop_model_training.utils.config_schemas import FullConfigSchema from psycop_model_training.preprocessing.post_split.feature_selectors import ( DropDateTimeColumns, ) @@ -18,6 +17,7 @@ ConvertToBoolean, DateTimeConverter, ) +from psycop_model_training.utils.config_schemas import FullConfigSchema def get_feature_selection_steps(cfg): diff --git a/src/psycop_model_training/training/train_and_eval.py b/src/psycop_model_training/training/train_and_eval.py index 9a5c6c2b..f6c0d7c1 100644 --- a/src/psycop_model_training/training/train_and_eval.py +++ b/src/psycop_model_training/training/train_and_eval.py @@ -10,12 +10,11 @@ from sklearn.pipeline import Pipeline from wasabi import Printer -from psycop_model_training.utils.config_schemas import FullConfigSchema - # from psycop_model_training.evaluation import evaluate_model from psycop_model_training.model_eval.dataclasses import EvalDataset from psycop_model_training.training.model_specs import MODELS from psycop_model_training.training.utils import create_eval_dataset +from psycop_model_training.utils.config_schemas import FullConfigSchema from psycop_model_training.utils.utils import PROJECT_ROOT CONFIG_PATH = PROJECT_ROOT / "src" / "psycop_model_training" / "config" diff --git a/src/psycop_model_training/training/utils.py b/src/psycop_model_training/training/utils.py index 99e6ae43..6509f472 100644 --- a/src/psycop_model_training/training/utils.py +++ b/src/psycop_model_training/training/utils.py @@ -1,7 +1,7 @@ import pandas as pd -from psycop_model_training.utils.config_schemas import FullConfigSchema from psycop_model_training.model_eval.dataclasses import EvalDataset +from psycop_model_training.utils.config_schemas import FullConfigSchema def create_eval_dataset(cfg: FullConfigSchema, outcome_col_name: str, df: pd.DataFrame): diff --git a/tests/conftest.py b/tests/conftest.py index 7ffcb6ab..4e020f5d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,8 +5,11 @@ import pandas as pd import pytest -from psycop_model_training.utils.config_schemas import FullConfigSchema, load_cfg_as_pydantic from psycop_model_training.model_eval.dataclasses import EvalDataset +from psycop_model_training.utils.config_schemas import ( + FullConfigSchema, + load_cfg_as_pydantic, +) CONFIG_DIR_PATH_REL = "../application/config" diff --git a/tests/test_configs.py b/tests/test_configs.py index b9e2a389..c6ed58f7 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -4,7 +4,9 @@ import pytest from hydra import compose, initialize -from psycop_model_training.utils.config_schemas import convert_omegaconf_to_pydantic_object +from psycop_model_training.utils.config_schemas import ( + convert_omegaconf_to_pydantic_object, +) from psycop_model_training.utils.utils import PROJECT_ROOT CONFIG_DIR_PATH_ABS = PROJECT_ROOT / "src" / "psycop_model_training" / "config" diff --git a/tests/test_load.py b/tests/test_load.py index 1bf597d0..d0ccad51 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -1,7 +1,7 @@ """Testing of loader functions.""" -from psycop_model_training.utils.config_schemas import FullConfigSchema from psycop_model_training.data_loader.utils import load_train_from_cfg +from psycop_model_training.utils.config_schemas import FullConfigSchema def 
test_load_lookbehind_exceeds_lookbehind_threshold( diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index ce9b7d76..03dd98e3 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -1,9 +1,9 @@ """Test custom preprocessing steps.""" -from psycop_model_training.utils.config_schemas import FullConfigSchema from psycop_model_training.data_loader.utils import load_train_from_cfg from psycop_model_training.preprocessing.post_split.create_pipeline import ( create_preprocessing_pipeline, ) +from psycop_model_training.utils.config_schemas import FullConfigSchema def test_drop_datetime_predictor_columns( diff --git a/tests/test_train_model.py b/tests/test_train_model.py index 99e5ddf6..d34ebb3c 100644 --- a/tests/test_train_model.py +++ b/tests/test_train_model.py @@ -4,8 +4,11 @@ import pytest from application.train_model import main -from psycop_model_training.utils.config_schemas import FullConfigSchema, load_cfg_as_omegaconf from psycop_model_training.training.model_specs import MODELS +from psycop_model_training.utils.config_schemas import ( + FullConfigSchema, + load_cfg_as_omegaconf, +) INTEGRATION_TEST_FILE_NAME = "integration_config.yaml" From 900b29a4a2e0a1c549943b2ca6b983bfc1fd375a Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 16 Dec 2022 13:32:27 +0100 Subject: [PATCH 05/47] refactor: move config files around --- .../config/project/overtaci_test_project.yaml | 5 --- tests/test_configs/__init__.py | 0 .../test_configs}/data/synth_data.yaml | 0 .../test_configs}/eval/evaluation_synth.yaml | 0 .../test_configs}/integration_config.yaml | 0 tests/test_configs/model/ebm.yaml | 31 +++++++++++++++++++ .../model/logistic-regression.yaml | 25 +++++++++++++++ tests/test_configs/model/naive-bayes.yaml | 13 ++++++++ tests/test_configs/model/xgboost.yaml | 20 ++++++++++++ .../preprocessing/default_preprocessing.yaml | 20 ++++++++++++ .../project/integration_test_project.yaml | 0 .../sweeper/optuna_multithread.yaml | 12 +++++++ .../sweeper/optuna_singlethread.yaml | 10 ++++++ .../test_configs/train/default_training.yaml | 5 +++ 14 files changed, 136 insertions(+), 5 deletions(-) delete mode 100644 application/config/project/overtaci_test_project.yaml create mode 100644 tests/test_configs/__init__.py rename {application/config => tests/test_configs}/data/synth_data.yaml (100%) rename {application/config => tests/test_configs}/eval/evaluation_synth.yaml (100%) rename {application/config => tests/test_configs}/integration_config.yaml (100%) create mode 100644 tests/test_configs/model/ebm.yaml create mode 100644 tests/test_configs/model/logistic-regression.yaml create mode 100644 tests/test_configs/model/naive-bayes.yaml create mode 100644 tests/test_configs/model/xgboost.yaml create mode 100644 tests/test_configs/preprocessing/default_preprocessing.yaml rename {application/config => tests/test_configs}/project/integration_test_project.yaml (100%) create mode 100644 tests/test_configs/sweeper/optuna_multithread.yaml create mode 100644 tests/test_configs/sweeper/optuna_singlethread.yaml create mode 100644 tests/test_configs/train/default_training.yaml diff --git a/application/config/project/overtaci_test_project.yaml b/application/config/project/overtaci_test_project.yaml deleted file mode 100644 index 716c9943..00000000 --- a/application/config/project/overtaci_test_project.yaml +++ /dev/null @@ -1,5 +0,0 @@ -name: psycop-t2d-testing -seed: 42 -wandb_mode: "offline" # Which mode to run WanDB in. 
Takes "run", "dryrun", "offline" and "disabled" -wandb_group: "psycop-t2d" -wandb_entity: "psycop" # Optional[str] diff --git a/tests/test_configs/__init__.py b/tests/test_configs/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/application/config/data/synth_data.yaml b/tests/test_configs/data/synth_data.yaml similarity index 100% rename from application/config/data/synth_data.yaml rename to tests/test_configs/data/synth_data.yaml diff --git a/application/config/eval/evaluation_synth.yaml b/tests/test_configs/eval/evaluation_synth.yaml similarity index 100% rename from application/config/eval/evaluation_synth.yaml rename to tests/test_configs/eval/evaluation_synth.yaml diff --git a/application/config/integration_config.yaml b/tests/test_configs/integration_config.yaml similarity index 100% rename from application/config/integration_config.yaml rename to tests/test_configs/integration_config.yaml diff --git a/tests/test_configs/model/ebm.yaml b/tests/test_configs/model/ebm.yaml new file mode 100644 index 00000000..3d833821 --- /dev/null +++ b/tests/test_configs/model/ebm.yaml @@ -0,0 +1,31 @@ +# @package _global_ +model: + name: ebm # (str): Model name, explainable boosting machine + require_imputation: true # (bool): Whether the model requires imputation. + args: # Documentiation: https://interpret.ml/docs/ebm.html#api + max_bins: 256 + max_interaction_bins: 32 + binning: quantile + mains: all + interactions: 10 + outer_bags: 8 + inner_bags: 0 + learning_rate: 0.01 + validation_size: 0.15 + early_stopping_rounds: 50 + early_stopping_tolerance: 0.0001 + max_rounds: 5000 + min_samples_leaf: 2 + max_leaves: 3 + n_jobs: 1 + random_state: ${project.seed} + +# Parameters that will only take effect if running with --multirun +hydra: + sweeper: + params: + ++model.args.interactions: choice(0, 5, 10, 50) + ++model.args.learning_rate: interval(0.001, 0.1) + ++model.args.validation_size: interval(0.20, 0.05) + ++model.args.min_samples_leaf: choice(1, 2) + ++model.args.max_leaves: choice(2, 3, 4, 8, 16) diff --git a/tests/test_configs/model/logistic-regression.yaml b/tests/test_configs/model/logistic-regression.yaml new file mode 100644 index 00000000..30ab21c4 --- /dev/null +++ b/tests/test_configs/model/logistic-regression.yaml @@ -0,0 +1,25 @@ +# @package _global_ +model: + name: logistic-regression # (str): Model name + require_imputation: True # (bool): Whether the model requires imputation. + args: # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html + dual: False + tol: 0.0001 + C: 1.0 + fit_intercept: True + intercept_scaling: True + class_weight: Null + random_state: ${project.seed} + penalty_solver: "l2_lbfgs" # custom argument is split into penalty and solver + max_iter: 100 + l1_ratio: 0.5 + +# Parameters that will only take effect if running with --multirun +hydra: + sweeper: + params: + ++model.args.penalty_solver: choice("elasticnet_saga") + ++model.args.C: interval(1e-5, 1.0) + ++model.args.l1_ratio: interval(1e-5, 1.0) + # preprocessing + ++preprocessing.scaling: choice("null", "z-score-normalization") diff --git a/tests/test_configs/model/naive-bayes.yaml b/tests/test_configs/model/naive-bayes.yaml new file mode 100644 index 00000000..cd605228 --- /dev/null +++ b/tests/test_configs/model/naive-bayes.yaml @@ -0,0 +1,13 @@ +# @package _global_ +model: + name: naive-bayes # (str): Model name + require_imputation: True # (bool): Whether the model requires imputation. 
+  args: # https://scikit-learn.org/stable/modules/naive_bayes.html#gaussian-naive-bayes
+    var_smoothing: 0.000000001
+
+# Parameters that will only take effect if running with --multirun
+hydra:
+  sweeper:
+    params:
+      # preprocessing
+      ++preprocessing.scaling: choice(null, "z-score-normalization")
diff --git a/tests/test_configs/model/xgboost.yaml b/tests/test_configs/model/xgboost.yaml
new file mode 100644
index 00000000..b96925a6
--- /dev/null
+++ b/tests/test_configs/model/xgboost.yaml
@@ -0,0 +1,20 @@
+# @package _global_
+model:
+  name: xgboost
+  require_imputation: false
+  args:
+    n_estimators: 100
+    tree_method: gpu_hist # set to gpu_hist to enable GPU training (default auto)
+    booster: gbtree
+
+# Parameters that will only take effect if running with --multirun
+hydra:
+  sweeper:
+    params:
+      ++model.args.n_estimators: int(tag(log, interval(100, 1200)))
+      ++model.args.alpha: tag(log, interval(1e-8, 0.1))
+      ++model.args.lambda: tag(log, interval(1e-8, 1.0))
+      ++model.args.max_depth: int(interval(1, 10))
+      ++model.args.learning_rate: tag(log, interval(1e-8, 1)) # Multiplier during boosting, [0,1]. Lower numbers mean more conservative boosting. Default is 0.3.
+      ++model.args.gamma: tag(log, interval(1e-8, 0.001)) # Threshold for loss reduction per node split. If lower than the threshold, stops adding nodes to the branch.
+      ++model.args.grow_policy: choice("depthwise", "lossguide")
diff --git a/tests/test_configs/preprocessing/default_preprocessing.yaml b/tests/test_configs/preprocessing/default_preprocessing.yaml
new file mode 100644
index 00000000..ad95e66e
--- /dev/null
+++ b/tests/test_configs/preprocessing/default_preprocessing.yaml
@@ -0,0 +1,20 @@
+# @package _global_
+preprocessing:
+  convert_to_boolean: false
+  convert_booleans_to_int: true
+  drop_datetime_predictor_columns: true
+  convert_datetimes_to_ordinal: false
+  imputation_method: most_frequent
+  scaling: z-score-normalization
+  feature_selection:
+    name: chi2
+    params:
+      percentile: 20 # (int): Percent of features to keep. Defaults to 10.
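+      # A minimal sketch of what this feature_selection block amounts to,
+      # assuming it maps onto sklearn's chi2-based percentile selector:
+      #
+      #   from sklearn.feature_selection import SelectPercentile, chi2
+      #   selector = SelectPercentile(chi2, percentile=20)
+      #   X_selected = selector.fit_transform(X_train, y_train)
+      #
+      # i.e. keep the 20% of predictors with the highest chi2 statistic
+      # against the outcome. Note that chi2 requires non-negative features.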
+
+hydra:
+  sweeper:
+    params:
+      ++preprocessing.imputation_method: choice("most_frequent", "mean", "median", "null")
+      ++preprocessing.scaling: choice("z-score-normalization", "null")
+      ++preprocessing.feature_selection.name: choice("chi2", "null")
+      ++preprocessing.feature_selection.params.percentile: int(tag(log, interval(1, 90)))
diff --git a/application/config/project/integration_test_project.yaml b/tests/test_configs/project/integration_test_project.yaml
similarity index 100%
rename from application/config/project/integration_test_project.yaml
rename to tests/test_configs/project/integration_test_project.yaml
diff --git a/tests/test_configs/sweeper/optuna_multithread.yaml b/tests/test_configs/sweeper/optuna_multithread.yaml
new file mode 100644
index 00000000..e22c8c52
--- /dev/null
+++ b/tests/test_configs/sweeper/optuna_multithread.yaml
@@ -0,0 +1,12 @@
+# @package _global_
+defaults:
+  - override /hydra/sweeper: optuna
+  - override /hydra/sweeper/sampler: tpe
+  - override /hydra/launcher: joblib
+
+hydra:
+  sweeper:
+    sampler:
+      seed: 123
+    n_jobs: 2
+    direction: maximize
\ No newline at end of file
diff --git a/tests/test_configs/sweeper/optuna_singlethread.yaml b/tests/test_configs/sweeper/optuna_singlethread.yaml
new file mode 100644
index 00000000..f40bb5bd
--- /dev/null
+++ b/tests/test_configs/sweeper/optuna_singlethread.yaml
@@ -0,0 +1,10 @@
+# @package _global_
+defaults:
+  - override /hydra/sweeper: optuna
+  - override /hydra/sweeper/sampler: tpe
+
+hydra:
+  sweeper:
+    sampler:
+      seed: 123
+    direction: maximize
diff --git a/tests/test_configs/train/default_training.yaml b/tests/test_configs/train/default_training.yaml
new file mode 100644
index 00000000..074bbb8f
--- /dev/null
+++ b/tests/test_configs/train/default_training.yaml
@@ -0,0 +1,5 @@
+n_splits: 3 # (int, Null): Number of k-folds during CV. If Null, loads pre-defined dataset.
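+# A minimal sketch of the cross-validation that n_splits configures, assuming
+# standard sklearn k-folding over the training set:
+#
+#   from sklearn.model_selection import StratifiedKFold
+#   folds = StratifiedKFold(n_splits=3).split(X_train, y_train)
+#
+# With n_splits set to Null, a pre-defined train/val split is loaded instead.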
+n_trials_per_lookahead: 300 +n_jobs_per_trainer: 1 +n_active_trainers: 10 +random_delay_per_job_seconds: 0 From 7852599f125575a8191b77a06d7f6be51e2501bc Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 16 Dec 2022 13:34:31 +0100 Subject: [PATCH 06/47] fix: broken imports in train_model --- application/train_model.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/application/train_model.py b/application/train_model.py index 58b0dbba..ba7692da 100644 --- a/application/train_model.py +++ b/application/train_model.py @@ -1,8 +1,13 @@ import time from typing import Any +import hydra import numpy as np import wandb +from omegaconf import DictConfig, OmegaConf +from sklearn.metrics import roc_auc_score +from sklearn.pipeline import Pipeline +from wasabi import Printer from psycop_model_training.data_loader.utils import load_train_and_val_from_cfg from psycop_model_training.model_eval.dataclasses import PipeMetadata From 9a1531ce67a69b0e6537e63cb834f554e45f0346 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 16 Dec 2022 13:42:27 +0100 Subject: [PATCH 07/47] fix: broken imports --- .../data_loader/data_classes.py | 2 ++ .../model_eval/evaluate_model.py | 24 +++++++++------- .../model_eval/plots/__init__.py | 8 ------ .../utils/config_schemas.py | 2 +- tests/{test_configs => configs}/__init__.py | 0 .../data/synth_data.yaml | 0 .../eval/evaluation_synth.yaml | 0 .../integration_config.yaml | 0 .../{test_configs => configs}/model/ebm.yaml | 0 .../model/logistic-regression.yaml | 0 .../model/naive-bayes.yaml | 0 .../model/xgboost.yaml | 0 .../preprocessing/default_preprocessing.yaml | 0 .../project/integration_test_project.yaml | 0 .../sweeper/optuna_multithread.yaml | 0 .../sweeper/optuna_singlethread.yaml | 0 .../train/default_training.yaml | 0 tests/model_evaluation/test_visualizations.py | 28 +++++++++++-------- 18 files changed, 33 insertions(+), 31 deletions(-) rename tests/{test_configs => configs}/__init__.py (100%) rename tests/{test_configs => configs}/data/synth_data.yaml (100%) rename tests/{test_configs => configs}/eval/evaluation_synth.yaml (100%) rename tests/{test_configs => configs}/integration_config.yaml (100%) rename tests/{test_configs => configs}/model/ebm.yaml (100%) rename tests/{test_configs => configs}/model/logistic-regression.yaml (100%) rename tests/{test_configs => configs}/model/naive-bayes.yaml (100%) rename tests/{test_configs => configs}/model/xgboost.yaml (100%) rename tests/{test_configs => configs}/preprocessing/default_preprocessing.yaml (100%) rename tests/{test_configs => configs}/project/integration_test_project.yaml (100%) rename tests/{test_configs => configs}/sweeper/optuna_multithread.yaml (100%) rename tests/{test_configs => configs}/sweeper/optuna_singlethread.yaml (100%) rename tests/{test_configs => configs}/train/default_training.yaml (100%) diff --git a/src/psycop_model_training/data_loader/data_classes.py b/src/psycop_model_training/data_loader/data_classes.py index 8cf7769a..71c749cb 100644 --- a/src/psycop_model_training/data_loader/data_classes.py +++ b/src/psycop_model_training/data_loader/data_classes.py @@ -2,6 +2,8 @@ import pandas as pd +from psycop_model_training.utils.config_schemas import BaseModel + class SplitDataset(BaseModel): """A dataset split into train, test and optionally validation.""" diff --git a/src/psycop_model_training/model_eval/evaluate_model.py b/src/psycop_model_training/model_eval/evaluate_model.py index cc16cfb3..8def472b 100644 --- 
a/src/psycop_model_training/model_eval/evaluate_model.py +++ b/src/psycop_model_training/model_eval/evaluate_model.py @@ -13,22 +13,26 @@ EvalDataset, PipeMetadata, ) -from psycop_model_training.model_eval.plots import ( - log_image_to_wandb, - plot_auc_by_time_from_first_visit, - plot_auc_roc, - plot_metric_by_calendar_time, - plot_metric_by_cyclic_time, - plot_metric_by_time_until_diagnosis, - plot_performance_by_age, - plot_sensitivity_by_time_to_outcome_heatmap, -) from psycop_model_training.model_eval.plots.feature_importance import ( plot_feature_importances, ) +from psycop_model_training.model_eval.plots.performance_by_age import ( + plot_performance_by_age, +) from psycop_model_training.model_eval.plots.performance_by_n_hba1c import ( plot_performance_by_n_hba1c, ) +from psycop_model_training.model_eval.plots.performance_over_time import ( + plot_auc_by_time_from_first_visit, + plot_metric_by_calendar_time, + plot_metric_by_cyclic_time, + plot_metric_by_time_until_diagnosis, +) +from psycop_model_training.model_eval.plots.roc_auc import plot_auc_roc +from psycop_model_training.model_eval.plots.sens_over_time import ( + plot_sensitivity_by_time_to_outcome_heatmap, +) +from psycop_model_training.model_eval.plots.utils import log_image_to_wandb from psycop_model_training.model_eval.tables.performance_by_threshold import ( generate_performance_by_positive_rate_table, ) diff --git a/src/psycop_model_training/model_eval/plots/__init__.py b/src/psycop_model_training/model_eval/plots/__init__.py index 421566f8..e69de29b 100644 --- a/src/psycop_model_training/model_eval/plots/__init__.py +++ b/src/psycop_model_training/model_eval/plots/__init__.py @@ -1,8 +0,0 @@ -"""Visualisations.""" -from .feature_importance import plot_feature_importances # noqa -from .performance_over_time import ( - plot_auc_by_time_from_first_visit, - plot_metric_by_calendar_time, - plot_metric_by_time_until_diagnosis, -) -from .prob_over_time import plot_prob_over_time # noqa diff --git a/src/psycop_model_training/utils/config_schemas.py b/src/psycop_model_training/utils/config_schemas.py index e3f5fc22..b5f8b6d6 100644 --- a/src/psycop_model_training/utils/config_schemas.py +++ b/src/psycop_model_training/utils/config_schemas.py @@ -248,7 +248,7 @@ def load_cfg_as_omegaconf( overrides: Optional[list[str]] = None, ) -> DictConfig: """Load config as omegaconf object.""" - with initialize(version_base=None, config_path="../../../application/config/"): + with initialize(version_base=None, config_path="../../../tests/test_configs/"): if overrides: cfg = compose( config_name=config_file_name, diff --git a/tests/test_configs/__init__.py b/tests/configs/__init__.py similarity index 100% rename from tests/test_configs/__init__.py rename to tests/configs/__init__.py diff --git a/tests/test_configs/data/synth_data.yaml b/tests/configs/data/synth_data.yaml similarity index 100% rename from tests/test_configs/data/synth_data.yaml rename to tests/configs/data/synth_data.yaml diff --git a/tests/test_configs/eval/evaluation_synth.yaml b/tests/configs/eval/evaluation_synth.yaml similarity index 100% rename from tests/test_configs/eval/evaluation_synth.yaml rename to tests/configs/eval/evaluation_synth.yaml diff --git a/tests/test_configs/integration_config.yaml b/tests/configs/integration_config.yaml similarity index 100% rename from tests/test_configs/integration_config.yaml rename to tests/configs/integration_config.yaml diff --git a/tests/test_configs/model/ebm.yaml b/tests/configs/model/ebm.yaml similarity index 100% rename 
from tests/test_configs/model/ebm.yaml rename to tests/configs/model/ebm.yaml diff --git a/tests/test_configs/model/logistic-regression.yaml b/tests/configs/model/logistic-regression.yaml similarity index 100% rename from tests/test_configs/model/logistic-regression.yaml rename to tests/configs/model/logistic-regression.yaml diff --git a/tests/test_configs/model/naive-bayes.yaml b/tests/configs/model/naive-bayes.yaml similarity index 100% rename from tests/test_configs/model/naive-bayes.yaml rename to tests/configs/model/naive-bayes.yaml diff --git a/tests/test_configs/model/xgboost.yaml b/tests/configs/model/xgboost.yaml similarity index 100% rename from tests/test_configs/model/xgboost.yaml rename to tests/configs/model/xgboost.yaml diff --git a/tests/test_configs/preprocessing/default_preprocessing.yaml b/tests/configs/preprocessing/default_preprocessing.yaml similarity index 100% rename from tests/test_configs/preprocessing/default_preprocessing.yaml rename to tests/configs/preprocessing/default_preprocessing.yaml diff --git a/tests/test_configs/project/integration_test_project.yaml b/tests/configs/project/integration_test_project.yaml similarity index 100% rename from tests/test_configs/project/integration_test_project.yaml rename to tests/configs/project/integration_test_project.yaml diff --git a/tests/test_configs/sweeper/optuna_multithread.yaml b/tests/configs/sweeper/optuna_multithread.yaml similarity index 100% rename from tests/test_configs/sweeper/optuna_multithread.yaml rename to tests/configs/sweeper/optuna_multithread.yaml diff --git a/tests/test_configs/sweeper/optuna_singlethread.yaml b/tests/configs/sweeper/optuna_singlethread.yaml similarity index 100% rename from tests/test_configs/sweeper/optuna_singlethread.yaml rename to tests/configs/sweeper/optuna_singlethread.yaml diff --git a/tests/test_configs/train/default_training.yaml b/tests/configs/train/default_training.yaml similarity index 100% rename from tests/test_configs/train/default_training.yaml rename to tests/configs/train/default_training.yaml diff --git a/tests/model_evaluation/test_visualizations.py b/tests/model_evaluation/test_visualizations.py index ee17e9d9..4df88ee0 100644 --- a/tests/model_evaluation/test_visualizations.py +++ b/tests/model_evaluation/test_visualizations.py @@ -11,24 +11,28 @@ from sklearn.metrics import f1_score, roc_auc_score from psycop_model_training.model_eval.dataclasses import EvalDataset -from psycop_model_training.model_eval.plots import ( - create_sensitivity_by_time_to_outcome_df, - plot_auc_by_time_from_first_visit, - plot_auc_roc, - plot_basic_chart, - plot_metric_by_calendar_time, - plot_metric_by_cyclic_time, - plot_metric_by_time_until_diagnosis, - plot_performance_by_age, - plot_prob_over_time, - plot_sensitivity_by_time_to_outcome_heatmap, -) +from psycop_model_training.model_eval.plots.base_charts import plot_basic_chart from psycop_model_training.model_eval.plots.feature_importance import ( plot_feature_importances, ) +from psycop_model_training.model_eval.plots.performance_by_age import ( + plot_performance_by_age, +) from psycop_model_training.model_eval.plots.performance_by_n_hba1c import ( plot_performance_by_n_hba1c, ) +from psycop_model_training.model_eval.plots.performance_over_time import ( + plot_auc_by_time_from_first_visit, + plot_metric_by_calendar_time, + plot_metric_by_cyclic_time, + plot_metric_by_time_until_diagnosis, +) +from psycop_model_training.model_eval.plots.prob_over_time import plot_prob_over_time +from 
psycop_model_training.model_eval.plots.roc_auc import plot_auc_roc +from psycop_model_training.model_eval.plots.sens_over_time import ( + create_sensitivity_by_time_to_outcome_df, + plot_sensitivity_by_time_to_outcome_heatmap, +) from psycop_model_training.utils.utils import PROJECT_ROOT, positive_rate_to_pred_probs From c00a7d1c5f450281f5cb0331cf921a9f63a7f940 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 16 Dec 2022 14:31:00 +0100 Subject: [PATCH 08/47] refactor: split up schemas --- application/inspect_dataset.py | 13 +- application/main.py | 4 +- application/train_model.py | 6 +- .../data_loader/utils.py | 6 +- .../model_eval/evaluate_model.py | 2 +- .../preprocessing/pre_split/full_processor.py | 29 ++ .../preprocessing/pre_split/row_filterer.py | 9 +- .../utils/config_schemas.py | 282 ------------------ .../utils/config_schemas}/__init__.py | 0 .../utils/config_schemas/conf_utils.py | 0 .../utils/config_schemas/data.py | 0 .../utils/config_schemas/eval.py | 0 .../utils/config_schemas/full_config.py | 0 .../utils/config_schemas/model.py | 0 .../utils/config_schemas/preprocessing.py | 0 .../utils/config_schemas/project.py | 0 .../utils/config_schemas/train.py | 0 tests/config/__init__.py | 0 .../{configs => config}/data/synth_data.yaml | 0 .../eval/evaluation_synth.yaml | 0 .../integration_config.yaml | 0 tests/{configs => config}/model/ebm.yaml | 0 .../model/logistic-regression.yaml | 0 .../model/naive-bayes.yaml | 0 tests/{configs => config}/model/xgboost.yaml | 0 .../preprocessing/default_preprocessing.yaml | 0 .../project/integration_test_project.yaml | 0 .../sweeper/optuna_multithread.yaml | 0 .../sweeper/optuna_singlethread.yaml | 0 .../train/default_training.yaml | 0 tests/conftest.py | 6 +- tests/test_configs.py | 4 +- tests/test_load.py | 10 +- tests/test_preprocessing.py | 4 +- tests/test_train_model.py | 4 +- 35 files changed, 69 insertions(+), 310 deletions(-) create mode 100644 src/psycop_model_training/preprocessing/pre_split/full_processor.py delete mode 100644 src/psycop_model_training/utils/config_schemas.py rename {tests/configs => src/psycop_model_training/utils/config_schemas}/__init__.py (100%) create mode 100644 src/psycop_model_training/utils/config_schemas/conf_utils.py create mode 100644 src/psycop_model_training/utils/config_schemas/data.py create mode 100644 src/psycop_model_training/utils/config_schemas/eval.py create mode 100644 src/psycop_model_training/utils/config_schemas/full_config.py create mode 100644 src/psycop_model_training/utils/config_schemas/model.py create mode 100644 src/psycop_model_training/utils/config_schemas/preprocessing.py create mode 100644 src/psycop_model_training/utils/config_schemas/project.py create mode 100644 src/psycop_model_training/utils/config_schemas/train.py create mode 100644 tests/config/__init__.py rename tests/{configs => config}/data/synth_data.yaml (100%) rename tests/{configs => config}/eval/evaluation_synth.yaml (100%) rename tests/{configs => config}/integration_config.yaml (100%) rename tests/{configs => config}/model/ebm.yaml (100%) rename tests/{configs => config}/model/logistic-regression.yaml (100%) rename tests/{configs => config}/model/naive-bayes.yaml (100%) rename tests/{configs => config}/model/xgboost.yaml (100%) rename tests/{configs => config}/preprocessing/default_preprocessing.yaml (100%) rename tests/{configs => config}/project/integration_test_project.yaml (100%) rename tests/{configs => config}/sweeper/optuna_multithread.yaml (100%) rename tests/{configs => 
config}/sweeper/optuna_singlethread.yaml (100%) rename tests/{configs => config}/train/default_training.yaml (100%) diff --git a/application/inspect_dataset.py b/application/inspect_dataset.py index d946f02b..8ab34b45 100644 --- a/application/inspect_dataset.py +++ b/application/inspect_dataset.py @@ -1,16 +1,21 @@ """Example of how to inspect a dataset using the configs.""" -from psycop_model_training.data_loader.utils import load_train_from_cfg, load_train_raw -from psycop_model_training.utils.config_schemas import load_cfg_as_pydantic +from psycop_model_training.data_loader.utils import ( + load_and_filter_train_from_cfg, + load_train_raw, +) +from psycop_model_training.utils.config_schemas import load_test_cfg_as_pydantic def main(): """Main.""" config_file_name = "default_config.yaml" - cfg = load_cfg_as_pydantic(config_file_name=config_file_name) + cfg = load_test_cfg_as_pydantic(config_file_name=config_file_name) df = load_train_raw(cfg=cfg) # noqa pylint: disable=unused-variable - df_filtered = load_train_from_cfg(cfg=cfg) # noqa pylint: disable=unused-variable + df_filtered = load_and_filter_train_from_cfg( + cfg=cfg + ) # noqa pylint: disable=unused-variable if __name__ == "__main__": diff --git a/application/main.py b/application/main.py index 099aabc1..5223106e 100644 --- a/application/main.py +++ b/application/main.py @@ -23,7 +23,7 @@ from psycop_model_training.utils.config_schemas import ( BaseModel, FullConfigSchema, - load_cfg_as_pydantic, + load_test_cfg_as_pydantic, ) @@ -202,7 +202,7 @@ def main(): else: config_file_name = "default_config.yaml" - cfg = load_cfg_as_pydantic(config_file_name=config_file_name) + cfg = load_test_cfg_as_pydantic(config_file_name=config_file_name) random_word = RandomWords() wandb_group = f"{random_word.get_random_word()}-{random_word.get_random_word()}" diff --git a/application/train_model.py b/application/train_model.py index ba7692da..545953cb 100644 --- a/application/train_model.py +++ b/application/train_model.py @@ -9,7 +9,9 @@ from sklearn.pipeline import Pipeline from wasabi import Printer -from psycop_model_training.data_loader.utils import load_train_and_val_from_cfg +from psycop_model_training.data_loader.utils import ( + load_and_filter_train_and_val_from_cfg, +) from psycop_model_training.model_eval.dataclasses import PipeMetadata from psycop_model_training.model_eval.evaluate_model import run_full_evaluation from psycop_model_training.preprocessing.post_split.create_pipeline import ( @@ -97,7 +99,7 @@ def main(cfg: DictConfig): msg.info(f"Delaying job by {delay} seconds to avoid resource competition") time.sleep(delay) - dataset = load_train_and_val_from_cfg(cfg) + dataset = load_and_filter_train_and_val_from_cfg(cfg) msg.info("Creating pipeline") pipe = create_pipeline(cfg) diff --git a/src/psycop_model_training/data_loader/utils.py b/src/psycop_model_training/data_loader/utils.py index e73fe1ad..7bd92d95 100644 --- a/src/psycop_model_training/data_loader/utils.py +++ b/src/psycop_model_training/data_loader/utils.py @@ -13,7 +13,7 @@ def get_latest_dataset_dir(path: Path) -> Path: return max(path.glob("*"), key=os.path.getctime) -def load_train_from_cfg(cfg: FullConfigSchema) -> pd.DataFrame: +def load_and_filter_train_from_cfg(cfg: FullConfigSchema) -> pd.DataFrame: """Load train dataset from config. 
    Args:
@@ -22,10 +22,11 @@ def load_train_from_cfg(cfg: FullConfigSchema) -> pd.DataFrame:
     Returns:
         pd.DataFrame: Train dataset
     """
-    return DataLoader(cfg=cfg).load_dataset_from_dir(split_names="train")
+    data = DataLoader(cfg=cfg).load_dataset_from_dir(split_names="train")
+    return data
 
 
-def load_train_and_val_from_cfg(cfg: FullConfigSchema):
+def load_and_filter_train_and_val_from_cfg(cfg: FullConfigSchema):
     """Load train and validation data from file."""
     loader = DataLoader(cfg=cfg)
diff --git a/src/psycop_model_training/model_eval/evaluate_model.py b/src/psycop_model_training/model_eval/evaluate_model.py
index 8def472b..c9cb7d05 100644
--- a/src/psycop_model_training/model_eval/evaluate_model.py
+++ b/src/psycop_model_training/model_eval/evaluate_model.py
@@ -170,7 +170,7 @@ def create_custom_plot_artifacts(
     eval_dataset: EvalDataset,
     save_dir: Path,
 ) -> list[ArtifactContainer]:
-    """A collection of plots that are always generated."""
+    """A collection of plots that are only generated for your specific use case."""
     return [
         ArtifactContainer(
             label="performance_by_n_hba1c",
diff --git a/src/psycop_model_training/preprocessing/pre_split/full_processor.py b/src/psycop_model_training/preprocessing/pre_split/full_processor.py
new file mode 100644
index 00000000..2be00596
--- /dev/null
+++ b/src/psycop_model_training/preprocessing/pre_split/full_processor.py
@@ -0,0 +1,29 @@
+import pandas as pd
+
+from psycop_model_training.preprocessing.pre_split.col_filterer import (
+    PresSplitColFilterer,
+)
+from psycop_model_training.preprocessing.pre_split.col_transformer import (
+    PresSplitColTransformer,
+)
+from psycop_model_training.preprocessing.pre_split.row_filterer import (
+    PreSplitRowFilterer,
+)
+from psycop_model_training.utils.config_schemas import FullConfigSchema
+
+
+class FullProcessor:
+    """Uses all pre-split preprocessors."""
+
+    def __init__(self, cfg):
+        self.cfg = cfg
+        self.row_filterer = PreSplitRowFilterer(cfg=cfg)
+        self.col_filterer = PresSplitColFilterer(cfg=cfg)
+        self.col_transformer = PresSplitColTransformer(cfg=cfg)
+
+    def process_from_cfg(self, cfg: FullConfigSchema, df: pd.DataFrame):
+        """Process a dataframe using the configuration."""
+        df = self.row_filterer.filter_from_cfg(df=df)
+        df = self.col_filterer.filter_from_cfg(df=df)
+        df = self.col_transformer.transform_from_cfg(df=df)
+        return df
diff --git a/src/psycop_model_training/preprocessing/pre_split/row_filterer.py b/src/psycop_model_training/preprocessing/pre_split/row_filterer.py
index 101752a6..2df1af68 100644
--- a/src/psycop_model_training/preprocessing/pre_split/row_filterer.py
+++ b/src/psycop_model_training/preprocessing/pre_split/row_filterer.py
@@ -4,13 +4,14 @@
 import pandas as pd
 
 from psycop_model_training.data_loader.data_loader import msg
+from psycop_model_training.utils.config_schemas import FullConfigSchema
 from psycop_model_training.utils.decorators import print_df_dimensions_diff
 from psycop_model_training.utils.utils import get_percent_lost
 
 
 class PreSplitRowFilterer:
-    def __init__(self):
-        raise NotImplementedError
+    def __init__(self, cfg: FullConfigSchema):
+        self.cfg = cfg
 
     def _drop_rows_if_datasets_ends_within_days(
         self,
@@ -118,3 +119,7 @@ def _drop_rows_after_event_time(self, dataset: pd.DataFrame) -> pd.DataFrame:
         )
 
         return dataset[~rows_to_drop]
+
+    def filter_from_cfg(self, df: pd.DataFrame):
+        if self.cfg.preprocessing.pre_split.drop_patient_if_exclusion_before_date:
+            df = self._drop_patient_if_excluded(df)
diff --git a/src/psycop_model_training/utils/config_schemas.py
b/src/psycop_model_training/utils/config_schemas.py deleted file mode 100644 index b5f8b6d6..00000000 --- a/src/psycop_model_training/utils/config_schemas.py +++ /dev/null @@ -1,282 +0,0 @@ -"""Utilities for converting config yamls to pydantic objects. Helpful because -it makes them: - -- Addressable with intellisense, -- Refactorable with IDEs, -- Easier to document with docstrings and -- Type checkable -""" -from datetime import datetime -from pathlib import Path -from typing import Any, Literal, Optional, Union - -from hydra import compose, initialize -from omegaconf import DictConfig, OmegaConf -from pydantic import BaseModel as PydanticBaseModel -from pydantic import Extra - - -class BaseModel(PydanticBaseModel): - """.""" - - class Config: - """An pydantic basemodel, which doesn't allow attributes that are not - defined in the class.""" - - allow_mutation = False - arbitrary_types_allowed = True - extra = Extra.forbid - - def __transform_attributes_with_str_to_object( - self, - output_object: Any, - input_string: str = "str", - ): - for key, value in self.__dict__.items(): - if isinstance(value, str): - if value.lower() == input_string.lower(): - self.__dict__[key] = output_object - - def __init__( - self, - allow_mutation: bool = False, - **kwargs, - ): - super().__init__(**kwargs) - self.Config.allow_mutation = allow_mutation - - self.__transform_attributes_with_str_to_object( - input_string="null", - output_object=None, - ) - self.__transform_attributes_with_str_to_object( - input_string="false", - output_object=False, - ) - self.__transform_attributes_with_str_to_object( - input_string="true", - output_object=True, - ) - - -class WandbSchema(BaseModel): - """Configuration for weights and biases.""" - - group: str - mode: str - entity: str - - -class WatcherSchema(BaseModel): - """Configuration for watchers.""" - - archive_all: bool - keep_alive_after_training_minutes: Union[int, float] - n_runs_before_eval: int - verbose: bool - - -class ProjectSchema(BaseModel): - """Project configuration.""" - - wandb: WandbSchema - name: str = "psycop_model_training" - seed: int - watcher: WatcherSchema - gpu: bool - - -class CustomColNames(BaseModel): - """All custom column names, i.e. columns that won't generalise across - projects.""" - - n_hba1c: str - - -class ColumnNamesSchema(BaseModel): - """Column names in the data.""" - - pred_timestamp: str # Column name for prediction times - outcome_timestamp: str # Column name for outcome timestamps - id: str # Citizen colnames - age: str # Name of the age column - exclusion_timestamp: str # Name of the exclusion timestamps column. - # Drops all visits whose pred_timestamp <= exclusion_timestamp. - - custom: Optional[CustomColNames] = None - # Column names that are custom to the given prediction problem. - - -class DataSchema(BaseModel): - """Data configuration.""" - - n_training_samples: Optional[int] - # Number of training samples to use, defaults to null in which cases it uses all samples. - - dir: Union[Path, str] # Location of the dataset - suffix: str # File suffix to load. 
- - # Feature specs - col_name: ColumnNamesSchema - - pred_prefix: str # prefix of predictor columns - outc_prefix: str # prefix of outcome columns - - min_age: Union[int, float] # Minimum age to include in the dataset - - # Looking ahead - min_lookahead_days: int - # Drop all prediction times where (max timestamp in the dataset) - (current timestamp) is less than min_lookahead_days - - drop_patient_if_exclusion_before_date: Optional[Union[str, datetime]] - # Drop all visits from a patient if the outcome is before this date. If None, no patients are dropped. - - min_prediction_time_date: Optional[Union[str, datetime]] - # Drop all prediction times before this date. - - lookbehind_combination: Optional[list[int]] - # Which combination of features to use. Only uses features that have "within_X_days" in their column name, where X is any of the numbers in this list. - - -class FeatureSelectionSchema(BaseModel): - """Configuration for feature selection methods.""" - - name: Optional[str] = None - # Which feature selection method to use. - - params: Optional[dict] = None - # Parameters for the feature selection method. - - -class PreprocessingConfigSchema(BaseModel): - """Preprocessing config.""" - - convert_to_boolean: bool - # Convert all prediction values (except gender) to boolean. Defaults to False. Useful as a sensitivty test, i.e. "is model performance based on whether blood samples are taken, or their values". If based purely on whether blood samples are taken, might indicate that it's just predicting whatever the doctor suspected. - - convert_booleans_to_int: bool - # Whether to convert columns containing booleans to int - - drop_datetime_predictor_columns: bool - # Whether to drop datetime columns prefixed with data.pred_prefix. - # Typically, we don't want to use these as features, since they are unlikely to generalise into the future. - - convert_datetimes_to_ordinal: bool - # Whether to convert datetimes to ordinal. - - imputation_method: Literal["most_frequent", "mean", "median", "null"] - # How to replace missing values. Takes all values from the sklearn.impute.SimpleImputer class. - # https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html - - scaling: Optional[str] - # Scaling applied to all predictors after imputation. Options include "z-score-normalization". - - feature_selection: FeatureSelectionSchema - - -class ModelConfSchema(BaseModel): - """Model configuration.""" - - name: str # Model, can currently take xgboost - require_imputation: bool # Whether the model requires imputation. (shouldn't this be false?) - args: dict - - -class TrainConfSchema(BaseModel): - """Training configuration.""" - - n_splits: int # ? How do we handle whether to use crossvalidation or train/val splitting? - n_trials_per_lookahead: int - n_active_trainers: int # Number of lookahead windows to train for at once - n_jobs_per_trainer: int # Number of jobs to run in parallel for each lookahead window - random_delay_per_job_seconds: Optional[ - int - ] = None # Add random delay based on cfg.train.random_delay_per_job to avoid - # each job needing the same resources (GPU, disk, network) at the same time - - -class EvalConfSchema(BaseModel): - """Evaluation config.""" - - force: bool = False - # Whether to force evaluation even if wandb is not "run". Used for testing. - - top_n_feature_importances: int - # How many feature_importances to plot. Plots the most important n features. A table with all features is also logged. 
- - positive_rate_thresholds: list[int] - # The threshold mapping a model's predicted probability to a binary outcome can be computed if we know, which positive rate we're targeting. We can't know beforehand which positive rate is best, beause it's a trade-off between false-positives and false-negatives. Therefore, we compute performacne for a range of positive rates. - - save_model_predictions_on_overtaci: bool - - lookahead_bins: list[int] - # List of lookahead distances for plotting. Will create bins in between each distances. E.g. if specifying 1, 5, 10, will bin evaluation as follows: [0, 1], [1, 5], [5, 10], [10, inf]. - - lookbehind_bins: list[int] - # List of lookbehidn distances for plotting. Will create bins in between each distances. E.g. if specifying 1, 5, 10, will bin evaluation as follows: [0, 1], [1, 5], [5, 10], [10, inf]. - - -class FullConfigSchema(BaseModel): - """A recipe for a full configuration object.""" - - project: ProjectSchema - data: DataSchema - preprocessing: PreprocessingConfigSchema - model: ModelConfSchema - train: TrainConfSchema - eval: EvalConfSchema - - -def convert_omegaconf_to_pydantic_object( - conf: DictConfig, - allow_mutation: bool = False, -) -> FullConfigSchema: - """Converts an omegaconf DictConfig to a pydantic object. - - Args: - conf (DictConfig): Omegaconf DictConfig - allow_mutation (bool, optional): Whether to make the pydantic object mutable. Defaults to False. - Returns: - FullConfig: Pydantic object - """ - conf = OmegaConf.to_container(conf, resolve=True) # type: ignore - return FullConfigSchema(**conf, allow_mutation=allow_mutation) - - -def load_cfg_as_omegaconf( - config_file_name: str, - overrides: Optional[list[str]] = None, -) -> DictConfig: - """Load config as omegaconf object.""" - with initialize(version_base=None, config_path="../../../tests/test_configs/"): - if overrides: - cfg = compose( - config_name=config_file_name, - overrides=overrides, - ) - else: - cfg = compose( - config_name=config_file_name, - ) - - # Override the type so we can get autocomplete and renaming - # correctly working - cfg: FullConfigSchema = cfg # type: ignore - - gpu = cfg.project.gpu - - if not gpu and cfg.model.name == "xgboost": - cfg.model.args["tree_method"] = "auto" - - return cfg - - -def load_cfg_as_pydantic( - config_file_name, - allow_mutation: bool = False, - overrides: Optional[list[str]] = None, -) -> FullConfigSchema: - """Load config as pydantic object.""" - cfg = load_cfg_as_omegaconf(config_file_name=config_file_name, overrides=overrides) - - return convert_omegaconf_to_pydantic_object(conf=cfg, allow_mutation=allow_mutation) diff --git a/tests/configs/__init__.py b/src/psycop_model_training/utils/config_schemas/__init__.py similarity index 100% rename from tests/configs/__init__.py rename to src/psycop_model_training/utils/config_schemas/__init__.py diff --git a/src/psycop_model_training/utils/config_schemas/conf_utils.py b/src/psycop_model_training/utils/config_schemas/conf_utils.py new file mode 100644 index 00000000..e69de29b diff --git a/src/psycop_model_training/utils/config_schemas/data.py b/src/psycop_model_training/utils/config_schemas/data.py new file mode 100644 index 00000000..e69de29b diff --git a/src/psycop_model_training/utils/config_schemas/eval.py b/src/psycop_model_training/utils/config_schemas/eval.py new file mode 100644 index 00000000..e69de29b diff --git a/src/psycop_model_training/utils/config_schemas/full_config.py b/src/psycop_model_training/utils/config_schemas/full_config.py new file mode 100644 index 
00000000..e69de29b diff --git a/src/psycop_model_training/utils/config_schemas/model.py b/src/psycop_model_training/utils/config_schemas/model.py new file mode 100644 index 00000000..e69de29b diff --git a/src/psycop_model_training/utils/config_schemas/preprocessing.py b/src/psycop_model_training/utils/config_schemas/preprocessing.py new file mode 100644 index 00000000..e69de29b diff --git a/src/psycop_model_training/utils/config_schemas/project.py b/src/psycop_model_training/utils/config_schemas/project.py new file mode 100644 index 00000000..e69de29b diff --git a/src/psycop_model_training/utils/config_schemas/train.py b/src/psycop_model_training/utils/config_schemas/train.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/config/__init__.py b/tests/config/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/configs/data/synth_data.yaml b/tests/config/data/synth_data.yaml similarity index 100% rename from tests/configs/data/synth_data.yaml rename to tests/config/data/synth_data.yaml diff --git a/tests/configs/eval/evaluation_synth.yaml b/tests/config/eval/evaluation_synth.yaml similarity index 100% rename from tests/configs/eval/evaluation_synth.yaml rename to tests/config/eval/evaluation_synth.yaml diff --git a/tests/configs/integration_config.yaml b/tests/config/integration_config.yaml similarity index 100% rename from tests/configs/integration_config.yaml rename to tests/config/integration_config.yaml diff --git a/tests/configs/model/ebm.yaml b/tests/config/model/ebm.yaml similarity index 100% rename from tests/configs/model/ebm.yaml rename to tests/config/model/ebm.yaml diff --git a/tests/configs/model/logistic-regression.yaml b/tests/config/model/logistic-regression.yaml similarity index 100% rename from tests/configs/model/logistic-regression.yaml rename to tests/config/model/logistic-regression.yaml diff --git a/tests/configs/model/naive-bayes.yaml b/tests/config/model/naive-bayes.yaml similarity index 100% rename from tests/configs/model/naive-bayes.yaml rename to tests/config/model/naive-bayes.yaml diff --git a/tests/configs/model/xgboost.yaml b/tests/config/model/xgboost.yaml similarity index 100% rename from tests/configs/model/xgboost.yaml rename to tests/config/model/xgboost.yaml diff --git a/tests/configs/preprocessing/default_preprocessing.yaml b/tests/config/preprocessing/default_preprocessing.yaml similarity index 100% rename from tests/configs/preprocessing/default_preprocessing.yaml rename to tests/config/preprocessing/default_preprocessing.yaml diff --git a/tests/configs/project/integration_test_project.yaml b/tests/config/project/integration_test_project.yaml similarity index 100% rename from tests/configs/project/integration_test_project.yaml rename to tests/config/project/integration_test_project.yaml diff --git a/tests/configs/sweeper/optuna_multithread.yaml b/tests/config/sweeper/optuna_multithread.yaml similarity index 100% rename from tests/configs/sweeper/optuna_multithread.yaml rename to tests/config/sweeper/optuna_multithread.yaml diff --git a/tests/configs/sweeper/optuna_singlethread.yaml b/tests/config/sweeper/optuna_singlethread.yaml similarity index 100% rename from tests/configs/sweeper/optuna_singlethread.yaml rename to tests/config/sweeper/optuna_singlethread.yaml diff --git a/tests/configs/train/default_training.yaml b/tests/config/train/default_training.yaml similarity index 100% rename from tests/configs/train/default_training.yaml rename to tests/config/train/default_training.yaml diff --git 
a/tests/conftest.py b/tests/conftest.py index 4e020f5d..b10c9367 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -8,7 +8,7 @@ from psycop_model_training.model_eval.dataclasses import EvalDataset from psycop_model_training.utils.config_schemas import ( FullConfigSchema, - load_cfg_as_pydantic, + load_test_cfg_as_pydantic, ) CONFIG_DIR_PATH_REL = "../application/config" @@ -53,7 +53,7 @@ def synth_eval_dataset() -> EvalDataset: @pytest.fixture(scope="function") def immuteable_test_config() -> FullConfigSchema: """Get an immutable config for testing.""" - return load_cfg_as_pydantic( + return load_test_cfg_as_pydantic( config_file_name="integration_config.yaml", allow_mutation=False, ) @@ -62,7 +62,7 @@ def immuteable_test_config() -> FullConfigSchema: @pytest.fixture(scope="function") def muteable_test_config() -> FullConfigSchema: """Get a mutable config for testing.""" - return load_cfg_as_pydantic( + return load_test_cfg_as_pydantic( config_file_name="integration_config.yaml", allow_mutation=True, ) diff --git a/tests/test_configs.py b/tests/test_configs.py index c6ed58f7..31bc472d 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -9,8 +9,8 @@ ) from psycop_model_training.utils.utils import PROJECT_ROOT -CONFIG_DIR_PATH_ABS = PROJECT_ROOT / "src" / "psycop_model_training" / "config" -CONFIG_DIR_PATH_REL = "../application/config" +CONFIG_DIR_PATH_ABS = PROJECT_ROOT / "tests" / "config" +CONFIG_DIR_PATH_REL = "../tests/config" def get_config_file_names() -> list[str]: diff --git a/tests/test_load.py b/tests/test_load.py index d0ccad51..c660760b 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -1,6 +1,6 @@ """Testing of loader functions.""" -from psycop_model_training.data_loader.utils import load_train_from_cfg +from psycop_model_training.data_loader.utils import load_and_filter_train_from_cfg from psycop_model_training.utils.config_schemas import FullConfigSchema @@ -11,11 +11,11 @@ def test_load_lookbehind_exceeds_lookbehind_threshold( lookbehind threshold.""" cfg = muteable_test_config - n_cols_before_filtering = load_train_from_cfg(cfg=cfg).shape[1] + n_cols_before_filtering = load_and_filter_train_from_cfg(cfg=cfg).shape[1] cfg.data.lookbehind_combination = [30, 60] - n_cols_after_filtering = load_train_from_cfg(cfg=cfg).shape[1] + n_cols_after_filtering = load_and_filter_train_from_cfg(cfg=cfg).shape[1] assert n_cols_before_filtering - n_cols_after_filtering == 2 @@ -27,10 +27,10 @@ def test_load_lookbehind_not_in_lookbehind_combination( specified lookbehind combination list.""" cfg = muteable_test_config - n_cols_before_filtering = load_train_from_cfg(cfg=cfg).shape[1] + n_cols_before_filtering = load_and_filter_train_from_cfg(cfg=cfg).shape[1] cfg.data.lookbehind_combination = [60] - n_cols_after_filtering = load_train_from_cfg(cfg=cfg).shape[1] + n_cols_after_filtering = load_and_filter_train_from_cfg(cfg=cfg).shape[1] assert n_cols_before_filtering - n_cols_after_filtering == 3 diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 03dd98e3..2874d11e 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -1,5 +1,5 @@ """Test custom preprocessing steps.""" -from psycop_model_training.data_loader.utils import load_train_from_cfg +from psycop_model_training.data_loader.utils import load_and_filter_train_from_cfg from psycop_model_training.preprocessing.post_split.create_pipeline import ( create_preprocessing_pipeline, ) @@ -20,7 +20,7 @@ def test_drop_datetime_predictor_columns( cfg.data.pred_prefix = 
"timestamp" pipe = create_preprocessing_pipeline(cfg=cfg) - train_df = load_train_from_cfg(cfg=cfg) + train_df = load_and_filter_train_from_cfg(cfg=cfg) train_df = pipe.transform(X=train_df) assert len([x for x in train_df.columns if "timestamp" in x]) == 0 diff --git a/tests/test_train_model.py b/tests/test_train_model.py index d34ebb3c..c5d8cabe 100644 --- a/tests/test_train_model.py +++ b/tests/test_train_model.py @@ -7,7 +7,7 @@ from psycop_model_training.training.model_specs import MODELS from psycop_model_training.utils.config_schemas import ( FullConfigSchema, - load_cfg_as_omegaconf, + load_test_cfg_as_omegaconf, ) INTEGRATION_TEST_FILE_NAME = "integration_config.yaml" @@ -17,7 +17,7 @@ def test_main(model_name): """Test main using a variety of model.""" - cfg: FullConfigSchema = load_cfg_as_omegaconf( + cfg: FullConfigSchema = load_test_cfg_as_omegaconf( config_file_name=INTEGRATION_TEST_FILE_NAME, overrides=[f"model={model_name}"], ) From 36776c60e95706acba3abb0fe829e3adadfa593a Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 16 Dec 2022 14:40:29 +0100 Subject: [PATCH 09/47] refactor: finish row filterer --- .../archive/model_training_watcher.py | 2 +- .../data_loader/data_loader.py | 2 +- .../data_loader/utils.py | 2 +- .../model_eval/evaluate_model.py | 2 +- .../post_split/create_pipeline.py | 2 +- .../preprocessing/pre_split/full_processor.py | 2 +- .../preprocessing/pre_split/row_filterer.py | 32 ++++- .../training/train_and_eval.py | 2 +- src/psycop_model_training/training/utils.py | 2 +- .../utils/config_schemas/__init__.py | 0 .../utils/config_schemas/conf_utils.py | 112 ++++++++++++++++++ .../utils/config_schemas/data.py | 42 +++++++ .../utils/config_schemas/eval.py | 22 ++++ .../utils/config_schemas/full_config.py | 20 ++++ .../utils/config_schemas/model.py | 9 ++ .../utils/config_schemas/preprocessing.py | 62 ++++++++++ .../utils/config_schemas/project.py | 19 +++ .../utils/config_schemas/train.py | 16 +++ tests/test_load.py | 2 +- tests/test_preprocessing.py | 2 +- 20 files changed, 340 insertions(+), 14 deletions(-) delete mode 100644 src/psycop_model_training/utils/config_schemas/__init__.py diff --git a/src/psycop_model_training/archive/model_training_watcher.py b/src/psycop_model_training/archive/model_training_watcher.py index 92821039..1c1f164e 100644 --- a/src/psycop_model_training/archive/model_training_watcher.py +++ b/src/psycop_model_training/archive/model_training_watcher.py @@ -15,7 +15,7 @@ from psycop_model_training.model_eval.dataclasses import ModelEvalData from psycop_model_training.model_eval.evaluate_model import run_full_evaluation -from psycop_model_training.utils.config_schemas import FullConfigSchema +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema from psycop_model_training.utils.utils import ( MODEL_PREDICTIONS_PATH, PROJECT_ROOT, diff --git a/src/psycop_model_training/data_loader/data_loader.py b/src/psycop_model_training/data_loader/data_loader.py index 352ebe05..ce1bc199 100644 --- a/src/psycop_model_training/data_loader/data_loader.py +++ b/src/psycop_model_training/data_loader/data_loader.py @@ -6,7 +6,7 @@ import pandas as pd from wasabi import Printer -from psycop_model_training.utils.config_schemas import FullConfigSchema +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema msg = Printer(timestamp=True) diff --git a/src/psycop_model_training/data_loader/utils.py b/src/psycop_model_training/data_loader/utils.py index 7bd92d95..b6854643 100644 --- 
a/src/psycop_model_training/data_loader/utils.py +++ b/src/psycop_model_training/data_loader/utils.py @@ -5,7 +5,7 @@ from psycop_model_training.data_loader.data_classes import SplitDataset from psycop_model_training.data_loader.data_loader import DataLoader -from psycop_model_training.utils.config_schemas import FullConfigSchema +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema def get_latest_dataset_dir(path: Path) -> Path: diff --git a/src/psycop_model_training/model_eval/evaluate_model.py b/src/psycop_model_training/model_eval/evaluate_model.py index c9cb7d05..113c3717 100644 --- a/src/psycop_model_training/model_eval/evaluate_model.py +++ b/src/psycop_model_training/model_eval/evaluate_model.py @@ -40,7 +40,7 @@ generate_feature_importances_table, generate_selected_features_table, ) -from psycop_model_training.utils.config_schemas import FullConfigSchema +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema from psycop_model_training.utils.utils import positive_rate_to_pred_probs diff --git a/src/psycop_model_training/preprocessing/post_split/create_pipeline.py b/src/psycop_model_training/preprocessing/post_split/create_pipeline.py index 5af38c69..18f4999d 100644 --- a/src/psycop_model_training/preprocessing/post_split/create_pipeline.py +++ b/src/psycop_model_training/preprocessing/post_split/create_pipeline.py @@ -17,7 +17,7 @@ ConvertToBoolean, DateTimeConverter, ) -from psycop_model_training.utils.config_schemas import FullConfigSchema +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema def get_feature_selection_steps(cfg): diff --git a/src/psycop_model_training/preprocessing/pre_split/full_processor.py b/src/psycop_model_training/preprocessing/pre_split/full_processor.py index 2be00596..d50bb625 100644 --- a/src/psycop_model_training/preprocessing/pre_split/full_processor.py +++ b/src/psycop_model_training/preprocessing/pre_split/full_processor.py @@ -9,7 +9,7 @@ from psycop_model_training.preprocessing.pre_split.row_filterer import ( PreSplitRowFilterer, ) -from psycop_model_training.utils.config_schemas import FullConfigSchema +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema class FullProcessor: diff --git a/src/psycop_model_training/preprocessing/pre_split/row_filterer.py b/src/psycop_model_training/preprocessing/pre_split/row_filterer.py index 2df1af68..4c83474e 100644 --- a/src/psycop_model_training/preprocessing/pre_split/row_filterer.py +++ b/src/psycop_model_training/preprocessing/pre_split/row_filterer.py @@ -4,7 +4,7 @@ import pandas as pd from psycop_model_training.data_loader.data_loader import msg -from psycop_model_training.utils.config_schemas import FullConfigSchema +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema from psycop_model_training.utils.decorators import print_df_dimensions_diff from psycop_model_training.utils.utils import get_percent_lost @@ -13,6 +13,7 @@ class PreSplitRowFilterer: def __init__(self, cfg: FullConfigSchema): self.cfg = cfg + @print_df_dimensions_diff def _drop_rows_if_datasets_ends_within_days( self, n_days: Union[int, float], @@ -67,7 +68,7 @@ def _drop_rows_if_datasets_ends_within_days( return dataset @print_df_dimensions_diff - def _drop_patient_if_excluded( + def _drop_patient_if_excluded_by_date( self, dataset: pd.DataFrame, ) -> pd.DataFrame: @@ -104,6 +105,7 @@ def _drop_patient_if_excluded( return dataset + @print_df_dimensions_diff def 
_keep_only_if_older_than_min_age(self, dataset: pd.DataFrame) -> pd.DataFrame: """Keep only rows that are older than the minimum age specified in the config.""" @@ -120,6 +122,28 @@ def _drop_rows_after_event_time(self, dataset: pd.DataFrame) -> pd.DataFrame: return dataset[~rows_to_drop] - def filter_from_cfg(self, df: pd.DataFrame): + def filter_from_cfg(self, dataset: pd.DataFrame): + for direction in ("ahead", "behind"): + if direction in ("ahead", "behind"): + if direction == "ahead": + n_days = self.cfg.preprocessing.pre_split.min_lookahead_days + elif direction == "behind": + n_days = max( + self.cfg.preprocessing.pre_split.lookbehind_combination + ) + else: + continue + + dataset = self._drop_rows_if_datasets_ends_within_days( + n_days=n_days, + dataset=dataset, + direction=direction, + ) + if self.cfg.preprocessing.pre_split.drop_patient_if_exclusion_before_date: - df = self._drop_patient_if_excluded(df) + dataset = self._drop_patient_if_excluded_by_date(dataset) + + if self.cfg.preprocessing.pre_split.min_age: + dataset = self._keep_only_if_older_than_min_age(dataset) + + dataset = self._drop_rows_after_event_time(dataset=dataset) diff --git a/src/psycop_model_training/training/train_and_eval.py b/src/psycop_model_training/training/train_and_eval.py index f6c0d7c1..62f4ae2d 100644 --- a/src/psycop_model_training/training/train_and_eval.py +++ b/src/psycop_model_training/training/train_and_eval.py @@ -14,7 +14,7 @@ from psycop_model_training.model_eval.dataclasses import EvalDataset from psycop_model_training.training.model_specs import MODELS from psycop_model_training.training.utils import create_eval_dataset -from psycop_model_training.utils.config_schemas import FullConfigSchema +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema from psycop_model_training.utils.utils import PROJECT_ROOT CONFIG_PATH = PROJECT_ROOT / "src" / "psycop_model_training" / "config" diff --git a/src/psycop_model_training/training/utils.py b/src/psycop_model_training/training/utils.py index 6509f472..962472f2 100644 --- a/src/psycop_model_training/training/utils.py +++ b/src/psycop_model_training/training/utils.py @@ -1,7 +1,7 @@ import pandas as pd from psycop_model_training.model_eval.dataclasses import EvalDataset -from psycop_model_training.utils.config_schemas import FullConfigSchema +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema def create_eval_dataset(cfg: FullConfigSchema, outcome_col_name: str, df: pd.DataFrame): diff --git a/src/psycop_model_training/utils/config_schemas/__init__.py b/src/psycop_model_training/utils/config_schemas/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/psycop_model_training/utils/config_schemas/conf_utils.py b/src/psycop_model_training/utils/config_schemas/conf_utils.py index e69de29b..4b5e8a77 100644 --- a/src/psycop_model_training/utils/config_schemas/conf_utils.py +++ b/src/psycop_model_training/utils/config_schemas/conf_utils.py @@ -0,0 +1,112 @@ +from typing import Optional, Any, Union + +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema + + +def convert_omegaconf_to_pydantic_object( + conf: DictConfig, + allow_mutation: bool = False, +) -> FullConfigSchema: + """Converts an omegaconf DictConfig to a pydantic object. + + Args: + conf (DictConfig): Omegaconf DictConfig + allow_mutation (bool, optional): Whether to make the pydantic object mutable. Defaults to False. 
+ Returns: + FullConfig: Pydantic object + """ + conf = OmegaConf.to_container(conf, resolve=True) # type: ignore + return FullConfigSchema(**conf, allow_mutation=allow_mutation) + + +def load_test_cfg_as_omegaconf( + config_file_name: str, + overrides: Optional[list[str]] = None, +) -> DictConfig: + """Load config as omegaconf object.""" + with initialize(version_base=None, config_path="../../../tests/config/"): + if overrides: + cfg = compose( + config_name=config_file_name, + overrides=overrides, + ) + else: + cfg = compose( + config_name=config_file_name, + ) + + # Override the type so we can get autocomplete and renaming + # correctly working + cfg: FullConfigSchema = cfg # type: ignore + + gpu = cfg.project.gpu + + if not gpu and cfg.model.name == "xgboost": + cfg.model.args["tree_method"] = "auto" + + return cfg + + +def load_test_cfg_as_pydantic( + config_file_name, + allow_mutation: bool = False, + overrides: Optional[list[str]] = None, +) -> FullConfigSchema: + """Load config as pydantic object.""" + cfg = load_test_cfg_as_omegaconf( + config_file_name=config_file_name, overrides=overrides + ) + + return convert_omegaconf_to_pydantic_object(conf=cfg, allow_mutation=allow_mutation) + + +class BaseModel(PydanticBaseModel): + """.""" + + class Config: + """A pydantic basemodel, which doesn't allow attributes that are not + defined in the class.""" + + allow_mutation = False + arbitrary_types_allowed = True + extra = Extra.forbid + + def __transform_attributes_with_str_to_object( + self, + output_object: Any, + input_string: str = "str", + ): + for key, value in self.__dict__.items(): + if isinstance(value, str): + if value.lower() == input_string.lower(): + self.__dict__[key] = output_object + + def __init__( + self, + allow_mutation: bool = False, + **kwargs, + ): + super().__init__(**kwargs) + self.Config.allow_mutation = allow_mutation + + self.__transform_attributes_with_str_to_object( + input_string="null", + output_object=None, + ) + self.__transform_attributes_with_str_to_object( + input_string="false", + output_object=False, + ) + self.__transform_attributes_with_str_to_object( + input_string="true", + output_object=True, + ) + + +class WatcherSchema(BaseModel): + """Configuration for watchers.""" + + archive_all: bool + keep_alive_after_training_minutes: Union[int, float] + n_runs_before_eval: int + verbose: bool diff --git a/src/psycop_model_training/utils/config_schemas/data.py b/src/psycop_model_training/utils/config_schemas/data.py index e69de29b..53a6ed0d 100644 --- a/src/psycop_model_training/utils/config_schemas/data.py +++ b/src/psycop_model_training/utils/config_schemas/data.py @@ -0,0 +1,42 @@ +from datetime import datetime +from pathlib import Path +from typing import Optional, Union + +from psycop_model_training.utils.config_schemas import BaseModel + + +class CustomColNames(BaseModel): + """All custom column names, i.e. columns that won't generalise across + projects.""" + + n_hba1c: str + + +class ColumnNamesSchema(BaseModel): + """Column names in the data.""" + + pred_timestamp: str # Column name for prediction times + outcome_timestamp: str # Column name for outcome timestamps + id: str # Citizen ID column name + age: str # Name of the age column + exclusion_timestamp: str # Name of the exclusion timestamps column. + # Drops all visits whose pred_timestamp <= exclusion_timestamp. + + custom: Optional[CustomColNames] = None + # Column names that are custom to the given prediction problem.
+ + +class DataSchema(BaseModel): + """Data configuration.""" + + n_training_samples: Optional[int] + # Number of training samples to use, defaults to null, in which case all samples are used. + + dir: Union[Path, str] # Location of the dataset + suffix: str # File suffix to load. + + # Feature specs + col_name: ColumnNamesSchema + + pred_prefix: str # Prefix of predictor columns + outc_prefix: str # Prefix of outcome columns diff --git a/src/psycop_model_training/utils/config_schemas/eval.py b/src/psycop_model_training/utils/config_schemas/eval.py index e69de29b..06e08489 100644 --- a/src/psycop_model_training/utils/config_schemas/eval.py +++ b/src/psycop_model_training/utils/config_schemas/eval.py @@ -0,0 +1,22 @@ +from psycop_model_training.utils.config_schemas import BaseModel + + +class EvalConfSchema(BaseModel): + """Evaluation config.""" + + force: bool = False + # Whether to force evaluation even if wandb is not "run". Used for testing. + + top_n_feature_importances: int + # How many feature_importances to plot. Plots the most important n features. A table with all features is also logged. + + positive_rate_thresholds: list[int] + # The threshold mapping a model's predicted probability to a binary outcome can be computed if we know which positive rate we're targeting. We can't know beforehand which positive rate is best, because it's a trade-off between false positives and false negatives. Therefore, we compute performance for a range of positive rates. + + save_model_predictions_on_overtaci: bool + + lookahead_bins: list[int] + # List of lookahead distances for plotting. Will create bins between each pair of distances. E.g. if specifying 1, 5, 10, will bin evaluation as follows: [0, 1], [1, 5], [5, 10], [10, inf]. + + lookbehind_bins: list[int] + # List of lookbehind distances for plotting. Will create bins between each pair of distances. E.g. if specifying 1, 5, 10, will bin evaluation as follows: [0, 1], [1, 5], [5, 10], [10, inf].
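For illustration, the binning described in the lookahead_bins and lookbehind_bins comments can be sketched in a few lines of Python; the helper name bin_edges_from_distances and the use of float("inf") for the open-ended final bin are assumptions for this sketch, not code from the patches above:

    def bin_edges_from_distances(distances: list[int]) -> list[tuple[float, float]]:
        """Build bins [0, d1], [d1, d2], ..., [dn, inf] from the configured distances."""
        edges = [0.0, *sorted(float(d) for d in distances), float("inf")]
        # Pair consecutive edges into (lower, upper) bins.
        return list(zip(edges, edges[1:]))

    # bin_edges_from_distances([1, 5, 10])
    # -> [(0.0, 1.0), (1.0, 5.0), (5.0, 10.0), (10.0, inf)]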
diff --git a/src/psycop_model_training/utils/config_schemas/full_config.py b/src/psycop_model_training/utils/config_schemas/full_config.py index e69de29b..99bad476 100644 --- a/src/psycop_model_training/utils/config_schemas/full_config.py +++ b/src/psycop_model_training/utils/config_schemas/full_config.py @@ -0,0 +1,20 @@ +from psycop_model_training.utils.config_schemas.conf_utils import BaseModel +from psycop_model_training.utils.config_schemas.data import DataSchema +from psycop_model_training.utils.config_schemas.eval import EvalConfSchema +from psycop_model_training.utils.config_schemas.model import ModelConfSchema +from psycop_model_training.utils.config_schemas.preprocessing import ( + PreprocessingConfigSchema, +) +from psycop_model_training.utils.config_schemas.project import ProjectSchema +from psycop_model_training.utils.config_schemas.train import TrainConfSchema + + +class FullConfigSchema(BaseModel): + """A recipe for a full configuration object.""" + + project: ProjectSchema + data: DataSchema + preprocessing: PreprocessingConfigSchema + model: ModelConfSchema + train: TrainConfSchema + eval: EvalConfSchema diff --git a/src/psycop_model_training/utils/config_schemas/model.py b/src/psycop_model_training/utils/config_schemas/model.py index e69de29b..36bbc604 100644 --- a/src/psycop_model_training/utils/config_schemas/model.py +++ b/src/psycop_model_training/utils/config_schemas/model.py @@ -0,0 +1,9 @@ +from psycop_model_training.utils.config_schemas import BaseModel + + +class ModelConfSchema(BaseModel): + """Model configuration.""" + + name: str # Model, can currently take xgboost + require_imputation: bool # Whether the model requires imputation. (shouldn't this be false?) + args: dict diff --git a/src/psycop_model_training/utils/config_schemas/preprocessing.py b/src/psycop_model_training/utils/config_schemas/preprocessing.py index e69de29b..ae927c39 100644 --- a/src/psycop_model_training/utils/config_schemas/preprocessing.py +++ b/src/psycop_model_training/utils/config_schemas/preprocessing.py @@ -0,0 +1,62 @@ +from datetime import datetime +from typing import Literal, Optional, Union + +from psycop_model_training.utils.config_schemas import BaseModel + + +class FeatureSelectionSchema(BaseModel): + """Configuration for feature selection methods.""" + + name: Optional[str] = None + # Which feature selection method to use. + + params: Optional[dict] = None + # Parameters for the feature selection method. + + +class PreSplitPreprocessingConfigSchema(BaseModel): + drop_patient_if_exclusion_before_date: Optional[Union[str, datetime]] + # Drop all visits from a patient if their exclusion timestamp is before this date. If None, no patients are dropped. + + convert_to_boolean: bool + # Convert all prediction values (except gender) to boolean. Defaults to False. Useful as a sensitivity test, i.e. "is model performance based on whether blood samples are taken, or their values". If based purely on whether blood samples are taken, might indicate that it's just predicting whatever the doctor suspected. + + convert_booleans_to_int: bool + # Whether to convert columns containing booleans to int + + drop_datetime_predictor_columns: bool + # Whether to drop datetime columns prefixed with data.pred_prefix. + # Typically, we don't want to use these as features, since they are unlikely to generalise into the future. + + convert_datetimes_to_ordinal: bool + # Whether to convert datetimes to ordinal.
+ + min_age: Union[int, float] # Minimum age to include in the dataset + + # Looking ahead + min_lookahead_days: int + # Drop all prediction times where (max timestamp in the dataset) - (current timestamp) is less than min_lookahead_days + + min_prediction_time_date: Optional[Union[str, datetime]] + # Drop all prediction times before this date. + + lookbehind_combination: Optional[list[int]] + # Which combination of features to use. Only uses features that have "within_X_days" in their column name, where X is any of the numbers in this list. + + +class PostSplitPreprocessingConfigSchema(BaseModel): + imputation_method: Literal["most_frequent", "mean", "median", "null"] + # How to replace missing values. Takes all values from the sklearn.impute.SimpleImputer class. + # https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html + + scaling: Optional[str] + # Scaling applied to all predictors after imputation. Options include "z-score-normalization". + + feature_selection: FeatureSelectionSchema + + +class PreprocessingConfigSchema(BaseModel): + """Preprocessing config.""" + + pre_split: PreSplitPreprocessingConfigSchema + post_split: PostSplitPreprocessingConfigSchema diff --git a/src/psycop_model_training/utils/config_schemas/project.py b/src/psycop_model_training/utils/config_schemas/project.py index e69de29b..9e524341 100644 --- a/src/psycop_model_training/utils/config_schemas/project.py +++ b/src/psycop_model_training/utils/config_schemas/project.py @@ -0,0 +1,19 @@ +from psycop_model_training.utils.config_schemas import BaseModel, WandbSchema, WatcherSchema + + +class ProjectSchema(BaseModel): + """Project configuration.""" + + wandb: WandbSchema + name: str = "psycop_model_training" + seed: int + watcher: WatcherSchema + gpu: bool + + +class WandbSchema(BaseModel): + """Configuration for weights and biases.""" + + group: str + mode: str + entity: str diff --git a/src/psycop_model_training/utils/config_schemas/train.py b/src/psycop_model_training/utils/config_schemas/train.py index e69de29b..111f2b73 100644 --- a/src/psycop_model_training/utils/config_schemas/train.py +++ b/src/psycop_model_training/utils/config_schemas/train.py @@ -0,0 +1,16 @@ +from typing import Optional + +from psycop_model_training.utils.config_schemas import BaseModel + + +class TrainConfSchema(BaseModel): + """Training configuration.""" + + n_splits: int # ? How do we handle whether to use cross-validation or train/val splitting?
+ n_trials_per_lookahead: int + n_active_trainers: int # Number of lookahead windows to train for at once + n_jobs_per_trainer: int # Number of jobs to run in parallel for each lookahead window + random_delay_per_job_seconds: Optional[ + int + ] = None # Add random delay based on cfg.train.random_delay_per_job to avoid + # each job needing the same resources (GPU, disk, network) at the same time diff --git a/tests/test_load.py b/tests/test_load.py index c660760b..c79b77b3 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -1,7 +1,7 @@ """Testing of loader functions.""" from psycop_model_training.data_loader.utils import load_and_filter_train_from_cfg -from psycop_model_training.utils.config_schemas import FullConfigSchema +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema def test_load_lookbehind_exceeds_lookbehind_threshold( diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 2874d11e..8f94dca8 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -3,7 +3,7 @@ from psycop_model_training.preprocessing.post_split.create_pipeline import ( create_preprocessing_pipeline, ) -from psycop_model_training.utils.config_schemas import FullConfigSchema +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema def test_drop_datetime_predictor_columns( From 3e59293992de98a40189ca3dd182dc3247d38bc9 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 16 Dec 2022 15:29:30 +0100 Subject: [PATCH 10/47] refactor: refactor pre-split processing --- .../data_loader/data_loader.py | 62 ----------------- .../data_loader/utils.py | 2 +- .../preprocessing/pre_split/full_processor.py | 25 ++++--- .../col_filter.py} | 68 ++++++++----------- .../{ => processors}/col_transformer.py | 23 +++++++ .../row_filter.py} | 25 ++++--- 6 files changed, 79 insertions(+), 126 deletions(-) rename src/psycop_model_training/preprocessing/pre_split/{col_filterer.py => processors/col_filter.py} (85%) rename src/psycop_model_training/preprocessing/pre_split/{ => processors}/col_transformer.py (69%) rename src/psycop_model_training/preprocessing/pre_split/{row_filterer.py => processors/row_filter.py} (90%) diff --git a/src/psycop_model_training/data_loader/data_loader.py b/src/psycop_model_training/data_loader/data_loader.py index ce1bc199..1f7c408a 100644 --- a/src/psycop_model_training/data_loader/data_loader.py +++ b/src/psycop_model_training/data_loader/data_loader.py @@ -63,68 +63,6 @@ def _load_dataset_file( # pylint: disable=inconsistent-return-statements elif "csv" in self.file_suffix: return pd.read_csv(filepath_or_buffer=path, nrows=nrows) - def _process_dataset(self, dataset: pd.DataFrame) -> pd.DataFrame: - """Process dataset, namely: - - - Drop patients with outcome before drop_patient_if_outcome_before_date - - Process timestamp columns - - Drop visits where min_lookahead, min_lookbehind or min_prediction_time_date are not met - - Drop features with lookbehinds not in lookbehind_combination - - Returns: - pd.DataFrame: Processed dataset - """ - msg = Printer(timestamp=True) - msg.info("Processing dataset") - - # Super hacky rename, needs to be removed before merging. Figure out how to add eval columns when creating the dataset. - dataset = dataset.rename( - { - "pred_hba1c_within_9999_days_count_fallback_nan": self.cfg.data.col_name.custom.n_hba1c, - }, - axis=1, - ) - - # Super hacky transformation of negative weights (?!) for chi-square. - # In the future, we want to: - # 1. 
Fix this in the feature generation for t2d - # 2a. See if there's a way of using feature selection that permits negative values, or - # 2b. Always use z-score normalisation? - dataset = self._negative_values_to_nan(dataset=dataset) - - dataset = self.convert_timestamp_dtype_and_nat(dataset=dataset) - - if self.cfg.preprocessing.convert_booleans_to_int: - dataset = self._convert_boolean_dtypes_to_int(dataset=dataset) - - if self.cfg.data.min_age: - dataset = self._keep_only_if_older_than_min_age(dataset=dataset) - - dataset = self._drop_rows_after_event_time(dataset=dataset) - - if self.cfg.data.drop_patient_if_exclusion_before_date: - dataset = self._drop_patient_if_excluded(dataset=dataset) - - # Drop if later than min prediction time date - if self.cfg.data.min_prediction_time_date: - dataset = dataset[ - dataset[self.cfg.data.col_name.pred_timestamp] - > self.cfg.data.min_prediction_time_date - ] - - dataset = self._drop_cols_and_rows_if_look_direction_not_met(dataset=dataset) - - if self.cfg.data.lookbehind_combination: - dataset = self._drop_cols_not_in_lookbehind_combination(dataset=dataset) - - dataset = self._keep_unique_outcome_col_with_lookahead_days_matching_conf( - dataset=dataset, - ) - - msg.info("Finished processing dataset") - - return dataset - def load_dataset_from_dir( self, split_names: Union[Iterable[str], str], diff --git a/src/psycop_model_training/data_loader/utils.py b/src/psycop_model_training/data_loader/utils.py index b6854643..21f52643 100644 --- a/src/psycop_model_training/data_loader/utils.py +++ b/src/psycop_model_training/data_loader/utils.py @@ -22,7 +22,7 @@ def load_and_filter_train_from_cfg(cfg: FullConfigSchema) -> pd.DataFrame: Returns: pd.DataFrame: Train dataset """ - data = DataLoader(cfg=cfg).load_dataset_from_dir(split_names="train") + return DataLoader(cfg=cfg).load_dataset_from_dir(split_names="train") def load_and_filter_train_and_val_from_cfg(cfg: FullConfigSchema): diff --git a/src/psycop_model_training/preprocessing/pre_split/full_processor.py b/src/psycop_model_training/preprocessing/pre_split/full_processor.py index d50bb625..032f2403 100644 --- a/src/psycop_model_training/preprocessing/pre_split/full_processor.py +++ b/src/psycop_model_training/preprocessing/pre_split/full_processor.py @@ -1,15 +1,14 @@ import pandas as pd -from psycop_model_training.preprocessing.pre_split.col_filterer import ( - PresSplitColFilterer, +from psycop_model_training.preprocessing.pre_split.processors.col_filter import ( + PresSplitColFilter, ) -from psycop_model_training.preprocessing.pre_split.col_transformer import ( +from psycop_model_training.preprocessing.pre_split.processors.col_transformer import ( PresSplitColTransformer, ) -from psycop_model_training.preprocessing.pre_split.row_filterer import ( - PreSplitRowFilterer, +from psycop_model_training.preprocessing.pre_split.processors.row_filter import ( + PreSplitRowFilter, ) -from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema class FullProcessor: @@ -17,13 +16,13 @@ class FullProcessor: def __init__(self, cfg): self.cfg = cfg - self.row_filterer = PreSplitRowFilterer(cfg=cfg) - self.col_filterer = PresSplitColFilterer(cfg=cfg) + self.row_filterer = PreSplitRowFilter(cfg=cfg) + self.col_filterer = PresSplitColFilter(cfg=cfg) self.col_transformer = PresSplitColTransformer(cfg=cfg) - def process_from_cfg(self, cfg: FullConfigSchema, df: pd.DataFrame): + def process_from_cfg(self, dataset: pd.DataFrame): """Process a dataframe using the configuration.""" - df = 
self.row_filterer.filter_from_cfg(df=df) - df = self.col_filterer.filter_from_cfg(df=df) - df = self.col_transformer.transform_from_cfg(df=df) - return df + dataset = self.row_filterer.filter(dataset=dataset) + dataset = self.col_filterer.filter(dataset=dataset) + dataset = self.col_transformer.transform_from_cfg(dataset=dataset) + return dataset diff --git a/src/psycop_model_training/preprocessing/pre_split/col_filterer.py b/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py similarity index 85% rename from src/psycop_model_training/preprocessing/pre_split/col_filterer.py rename to src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py index 207cc03c..f1d845c1 100644 --- a/src/psycop_model_training/preprocessing/pre_split/col_filterer.py +++ b/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py @@ -5,6 +5,7 @@ from psycop_model_training.data_loader.data_loader import msg from psycop_model_training.utils.col_name_inference import infer_look_distance +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema from psycop_model_training.utils.decorators import print_df_dimensions_diff from psycop_model_training.utils.utils import ( get_percent_lost, @@ -13,7 +14,10 @@ ) -class PresSplitColFilterer: +class PresSplitColFilter: + def __init__(self, cfg: FullConfigSchema) -> None: + self.cfg = cfg + @print_df_dimensions_diff def _drop_cols_not_in_lookbehind_combination( self, @@ -74,6 +78,7 @@ def _drop_cols_not_in_lookbehind_combination( dataset = dataset.drop(columns=cols_to_drop) return dataset + @print_df_dimensions_diff def _drop_cols_if_exceeds_look_direction_threshold( self, dataset: pd.DataFrame, @@ -131,44 +136,6 @@ def _drop_cols_if_exceeds_look_direction_threshold( return dataset[[c for c in dataset.columns if c not in cols_to_drop]] - @print_df_dimensions_diff - def _drop_cols_and_rows_if_look_direction_not_met( - self, - dataset: pd.DataFrame, - ) -> pd.DataFrame: - """Drop columns if they are outside the specification. Specifically: - - - min_lookahead_days is insufficient for the column's lookahead - - The dataset doesn't stretch far enough for the prediction time's lookahead - - The dataset doesn't stretch far enough for the prediction time's lookbehind - - Args: - dataset (pd.DataFrame): Dataset to process. 
- """ - for direction in ("ahead", "behind"): - - if direction in ("ahead", "behind"): - if direction == "ahead": - n_days = self.cfg.data.min_lookahead_days - elif direction == "behind": - n_days = max(self.cfg.data.lookbehind_combination) - else: - continue - - dataset = self._drop_rows_if_datasets_ends_within_days( - n_days=n_days, - dataset=dataset, - direction=direction, - ) - - dataset = self._drop_cols_if_exceeds_look_direction_threshold( - dataset=dataset, - look_direction_threshold=n_days, - direction=direction, - ) - - return dataset - @print_df_dimensions_diff def _keep_unique_outcome_col_with_lookahead_days_matching_conf( self, @@ -199,3 +166,26 @@ def _keep_unique_outcome_col_with_lookahead_days_matching_conf( def n_outcome_col_names(self, df: pd.DataFrame) -> int: """How many outcome columns there are in a dataframe.""" return len(infer_outcome_col_name(df=df, allow_multiple=True)) + + def filter(self, dataset: pd.DataFrame) -> pd.DataFrame: + """Filter a dataframe based on the config.""" + for direction in ("ahead", "behind"): + if direction == "ahead": + n_days = self.cfg.preprocessing.pre_split.min_lookahead_days + elif direction == "behind": + n_days = max(self.cfg.preprocessing.pre_split.lookbehind_combination) + + dataset = self._drop_cols_if_exceeds_look_direction_threshold( + dataset=dataset, + look_direction_threshold=n_days, + direction=direction, + ) + + if self.cfg.preprocessing.pre_split.lookbehind_combination: + dataset = self._drop_cols_not_in_lookbehind_combination(dataset=dataset) + + dataset = self._keep_unique_outcome_col_with_lookahead_days_matching_conf( + dataset=dataset, + ) + + return dataset diff --git a/src/psycop_model_training/preprocessing/pre_split/col_transformer.py b/src/psycop_model_training/preprocessing/pre_split/processors/col_transformer.py similarity index 69% rename from src/psycop_model_training/preprocessing/pre_split/col_transformer.py rename to src/psycop_model_training/preprocessing/pre_split/processors/col_transformer.py index 1583a592..7f5b2029 100644 --- a/src/psycop_model_training/preprocessing/pre_split/col_transformer.py +++ b/src/psycop_model_training/preprocessing/pre_split/processors/col_transformer.py @@ -1,11 +1,17 @@ import numpy as np import pandas as pd +from wasabi import Printer from psycop_model_training.utils.decorators import print_df_dimensions_diff from psycop_model_training.utils.utils import infer_predictor_col_name +msg = Printer(timestamp=True) + class PresSplitColTransformer: + def __init__(self, cfg: FullConfigSchema) -> None: + self.cfg = cfg + @staticmethod @print_df_dimensions_diff def convert_timestamp_dtype_and_nat(dataset: pd.DataFrame) -> pd.DataFrame: @@ -51,3 +57,20 @@ def _negative_values_to_nan(self, dataset: pd.DataFrame) -> pd.DataFrame: dataset[numerical_columns_with_negative_values] = df_to_replace return dataset + + def transform(self, dataset: pd.DataFrame) -> pd.DataFrame: + # Super hacky transformation of negative weights (?!) for chi-square. + # In the future, we want to: + # 1. Fix this in the feature generation for t2d + # 2a. See if there's a way of using feature selection that permits negative values, or + # 2b. Always use z-score normalisation? 
+ dataset = self._negative_values_to_nan(dataset=dataset) + + dataset = self.convert_timestamp_dtype_and_nat(dataset=dataset) + + if self.cfg.preprocessing.convert_booleans_to_int: + dataset = self._convert_boolean_dtypes_to_int(dataset=dataset) + + msg.info("Finished processing dataset") + + return dataset diff --git a/src/psycop_model_training/preprocessing/pre_split/row_filterer.py b/src/psycop_model_training/preprocessing/pre_split/processors/row_filter.py similarity index 90% rename from src/psycop_model_training/preprocessing/pre_split/row_filterer.py rename to src/psycop_model_training/preprocessing/pre_split/processors/row_filter.py index 4c83474e..a14c105c 100644 --- a/src/psycop_model_training/preprocessing/pre_split/row_filterer.py +++ b/src/psycop_model_training/preprocessing/pre_split/processors/row_filter.py @@ -9,7 +9,7 @@ from psycop_model_training.utils.utils import get_percent_lost -class PreSplitRowFilterer: +class PreSplitRowFilter: def __init__(self, cfg: FullConfigSchema): self.cfg = cfg @@ -122,17 +122,12 @@ def _drop_rows_after_event_time(self, dataset: pd.DataFrame) -> pd.DataFrame: return dataset[~rows_to_drop] - def filter_from_cfg(self, dataset: pd.DataFrame): + def filter(self, dataset: pd.DataFrame): for direction in ("ahead", "behind"): - if direction in ("ahead", "behind"): - if direction == "ahead": - n_days = self.cfg.preprocessing.pre_split.min_lookahead_days - elif direction == "behind": - n_days = max( - self.cfg.preprocessing.pre_split.lookbehind_combination - ) - else: - continue + if direction == "ahead": + n_days = self.cfg.preprocessing.pre_split.min_lookahead_days + elif direction == "behind": + n_days = max(self.cfg.preprocessing.pre_split.lookbehind_combination) dataset = self._drop_rows_if_datasets_ends_within_days( n_days=n_days, @@ -140,6 +135,12 @@ def filter_from_cfg(self, dataset: pd.DataFrame): direction=direction, ) + if self.cfg.preprocessing.pre_split.min_prediction_time_date: + dataset = dataset[ + dataset[self.cfg.data.col_name.pred_timestamp] + > self.cfg.data.min_prediction_time_date + ] + if self.cfg.preprocessing.pre_split.drop_patient_if_exclusion_before_date: dataset = self._drop_patient_if_excluded_by_date(dataset) @@ -147,3 +148,5 @@ def filter_from_cfg(self, dataset: pd.DataFrame): dataset = self._keep_only_if_older_than_min_age(dataset) dataset = self._drop_rows_after_event_time(dataset=dataset) + + return dataset From 50b4e25ea333ad7b9bb190c1d91a8ebc8b40fb0b Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 16 Dec 2022 15:47:29 +0100 Subject: [PATCH 11/47] tests: init tests with new pre_split processing --- .../data_loader/data_classes.py | 2 +- .../data_loader/data_loader.py | 2 - .../data_loader/utils.py | 28 ++++++++-- .../model_eval/dataclasses.py | 5 +- .../preprocessing/pre_split/full_processor.py | 4 +- .../pre_split/processors/col_filter.py | 10 ++-- .../pre_split/processors/col_transformer.py | 3 +- .../pre_split/processors/row_filter.py | 13 +++-- src/psycop_model_training/utils/basemodel.py | 47 +++++++++++++++++ .../utils/config_schemas/__init__.py | 0 .../utils/config_schemas/conf_utils.py | 51 +++---------------- .../utils/config_schemas/data.py | 3 +- .../utils/config_schemas/eval.py | 2 +- .../utils/config_schemas/full_config.py | 2 +- .../utils/config_schemas/model.py | 2 +- .../utils/config_schemas/preprocessing.py | 2 +- .../utils/config_schemas/project.py | 19 ++++--- .../utils/config_schemas/train.py | 2 +- tests/config/data/synth_data.yaml | 15 ------ 
.../preprocessing/default_preprocessing.yaml | 37 +++++++++----- .../project/integration_test_project.yaml | 6 --- tests/conftest.py | 2 +- tests/test_load.py | 2 +- 23 files changed, 140 insertions(+), 119 deletions(-) create mode 100644 src/psycop_model_training/utils/basemodel.py create mode 100644 src/psycop_model_training/utils/config_schemas/__init__.py diff --git a/src/psycop_model_training/data_loader/data_classes.py b/src/psycop_model_training/data_loader/data_classes.py index 71c749cb..2132064a 100644 --- a/src/psycop_model_training/data_loader/data_classes.py +++ b/src/psycop_model_training/data_loader/data_classes.py @@ -2,7 +2,7 @@ import pandas as pd -from psycop_model_training.utils.config_schemas import BaseModel +from psycop_model_training.utils.basemodel import BaseModel class SplitDataset(BaseModel): diff --git a/src/psycop_model_training/data_loader/data_loader.py b/src/psycop_model_training/data_loader/data_loader.py index 1f7c408a..a701bd8d 100644 --- a/src/psycop_model_training/data_loader/data_loader.py +++ b/src/psycop_model_training/data_loader/data_loader.py @@ -100,7 +100,5 @@ def load_dataset_from_dir( elif isinstance(split_names, str): dataset = self._load_dataset_file(split_name=split_names, nrows=nrows) - dataset = self._process_dataset(dataset=dataset) - msg.good(f"{split_names}: Returning!") return dataset diff --git a/src/psycop_model_training/data_loader/utils.py b/src/psycop_model_training/data_loader/utils.py index 21f52643..18db5630 100644 --- a/src/psycop_model_training/data_loader/utils.py +++ b/src/psycop_model_training/data_loader/utils.py @@ -1,10 +1,12 @@ import os from pathlib import Path +from typing import Literal import pandas as pd from psycop_model_training.data_loader.data_classes import SplitDataset from psycop_model_training.data_loader.data_loader import DataLoader +from psycop_model_training.preprocessing.pre_split.full_processor import FullProcessor from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema @@ -13,6 +15,24 @@ def get_latest_dataset_dir(path: Path) -> Path: return max(path.glob("*"), key=os.path.getctime) +def load_and_filter_split_from_cfg( + cfg: FullConfigSchema, split: Literal["train", "test", "val"] +) -> pd.DataFrame: + """Load and filter a dataset split from config. + + Args: + cfg (FullConfig): Config + split (Literal["train", "test", "val"]): Split to load + + Returns: + pd.DataFrame: The loaded and filtered dataset split + """ + dataset = DataLoader(cfg=cfg).load_dataset_from_dir(split_names=split) + filtered_data = FullProcessor(cfg=cfg).process(dataset=dataset) + + return filtered_data + + def load_and_filter_train_from_cfg(cfg: FullConfigSchema) -> pd.DataFrame: """Load train dataset from config.
@@ -22,17 +42,15 @@ def load_and_filter_train_from_cfg(cfg: FullConfigSchema) -> pd.DataFrame: Returns: pd.DataFrame: Train dataset """ - return DataLoader(cfg=cfg).load_dataset_from_dir(split_names="train") + return load_and_filter_split_from_cfg(cfg=cfg, split="train") def load_and_filter_train_and_val_from_cfg(cfg: FullConfigSchema): """Load train and validation data from file.""" - loader = DataLoader(cfg=cfg) - return SplitDataset( - train=loader.load_dataset_from_dir(split_names="train"), - val=loader.load_dataset_from_dir(split_names="val"), + train=load_and_filter_split_from_cfg(cfg=cfg, split="train"), + val=load_and_filter_split_from_cfg(cfg=cfg, split="val"), ) diff --git a/src/psycop_model_training/model_eval/dataclasses.py b/src/psycop_model_training/model_eval/dataclasses.py index 11a41832..df30876c 100644 --- a/src/psycop_model_training/model_eval/dataclasses.py +++ b/src/psycop_model_training/model_eval/dataclasses.py @@ -4,7 +4,10 @@ import pandas as pd -from psycop_model_training.utils.config_schemas import BaseModel, FullConfigSchema +from psycop_model_training.utils.config_schemas.conf_utils import ( + FullConfigSchema, +) +from psycop_model_training.utils.basemodel import BaseModel class CustomColumns(BaseModel): diff --git a/src/psycop_model_training/preprocessing/pre_split/full_processor.py b/src/psycop_model_training/preprocessing/pre_split/full_processor.py index 032f2403..c18ee5e3 100644 --- a/src/psycop_model_training/preprocessing/pre_split/full_processor.py +++ b/src/psycop_model_training/preprocessing/pre_split/full_processor.py @@ -20,9 +20,9 @@ def __init__(self, cfg): self.col_filterer = PresSplitColFilter(cfg=cfg) self.col_transformer = PresSplitColTransformer(cfg=cfg) - def process_from_cfg(self, dataset: pd.DataFrame): + def process(self, dataset: pd.DataFrame): """Process a dataframe using the configuration.""" + dataset = self.col_transformer.transform(dataset=dataset) dataset = self.row_filterer.filter(dataset=dataset) dataset = self.col_filterer.filter(dataset=dataset) - dataset = self.col_transformer.transform_from_cfg(dataset=dataset) return dataset diff --git a/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py b/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py index f1d845c1..c028706e 100644 --- a/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py +++ b/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py @@ -33,7 +33,7 @@ def _drop_cols_not_in_lookbehind_combination( pd.DataFrame: Dataset with dropped columns. 
""" - if not self.cfg.data.lookbehind_combination: + if not self.cfg.preprocessing.pre_split.lookbehind_combination: raise ValueError("No lookbehind_combination provided.") # Extract all unique lookbhehinds in the dataset predictors @@ -44,7 +44,9 @@ def _drop_cols_not_in_lookbehind_combination( } # Convert list to set - lookbehinds_in_spec = set(self.cfg.data.lookbehind_combination) + lookbehinds_in_spec = set( + self.cfg.preprocessing.pre_split.lookbehind_combination + ) # Check that all loobehinds in lookbehind_combination are used in the predictors if not lookbehinds_in_spec.issubset( @@ -146,7 +148,9 @@ def _keep_unique_outcome_col_with_lookahead_days_matching_conf( outcome_cols = infer_outcome_col_name(df=dataset, allow_multiple=True) col_to_drop = [ - c for c in outcome_cols if str(self.cfg.data.min_lookahead_days) not in c + c + for c in outcome_cols + if str(self.cfg.preprocessing.pre_split.min_lookahead_days) not in c ] # If no columns to drop, return the dataset diff --git a/src/psycop_model_training/preprocessing/pre_split/processors/col_transformer.py b/src/psycop_model_training/preprocessing/pre_split/processors/col_transformer.py index 7f5b2029..45299438 100644 --- a/src/psycop_model_training/preprocessing/pre_split/processors/col_transformer.py +++ b/src/psycop_model_training/preprocessing/pre_split/processors/col_transformer.py @@ -2,6 +2,7 @@ import pandas as pd from wasabi import Printer +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema from psycop_model_training.utils.decorators import print_df_dimensions_diff from psycop_model_training.utils.utils import infer_predictor_col_name @@ -68,7 +69,7 @@ def transform(self, dataset: pd.DataFrame) -> pd.DataFrame: dataset = self.convert_timestamp_dtype_and_nat(dataset=dataset) - if self.cfg.preprocessing.convert_booleans_to_int: + if self.cfg.preprocessing.pre_split.convert_booleans_to_int: dataset = self._convert_boolean_dtypes_to_int(dataset=dataset) msg.info("Finished processing dataset") diff --git a/src/psycop_model_training/preprocessing/pre_split/processors/row_filter.py b/src/psycop_model_training/preprocessing/pre_split/processors/row_filter.py index a14c105c..45832835 100644 --- a/src/psycop_model_training/preprocessing/pre_split/processors/row_filter.py +++ b/src/psycop_model_training/preprocessing/pre_split/processors/row_filter.py @@ -79,7 +79,7 @@ def _drop_patient_if_excluded_by_date( outcome_before_date = ( dataset[self.cfg.data.col_name.exclusion_timestamp] - < self.cfg.data.drop_patient_if_exclusion_before_date + < self.cfg.preprocessing.pre_split.drop_patient_if_exclusion_before_date ) patients_to_drop = set( @@ -96,11 +96,11 @@ def _drop_patient_if_excluded_by_date( if n_rows_before_modification - n_rows_after_modification != 0: msg.info( - f"Dropped {n_rows_before_modification - n_rows_after_modification} ({percent_dropped}%) rows because they met exclusion criteria before {self.cfg.data.drop_patient_if_exclusion_before_date}.", + f"Dropped {n_rows_before_modification - n_rows_after_modification} ({percent_dropped}%) rows because they met exclusion criteria before {self.cfg.preprocessing.pre_split.drop_patient_if_exclusion_before_date}.", ) else: msg.info( - f"No rows met exclusion criteria before {self.cfg.data.drop_patient_if_exclusion_before_date}. Didn't drop any.", + f"No rows met exclusion criteria before {self.cfg.preprocessing.pre_split.drop_patient_if_exclusion_before_date}. 
Didn't drop any.", ) return dataset @@ -109,7 +109,10 @@ def _drop_patient_if_excluded_by_date( def _keep_only_if_older_than_min_age(self, dataset: pd.DataFrame) -> pd.DataFrame: """Keep only rows that are older than the minimum age specified in the config.""" - return dataset[dataset[self.cfg.data.col_name.age] >= self.cfg.data.min_age] + return dataset[ + dataset[self.cfg.data.col_name.age] + >= self.cfg.preprocessing.pre_split.min_age + ] @print_df_dimensions_diff def _drop_rows_after_event_time(self, dataset: pd.DataFrame) -> pd.DataFrame: @@ -138,7 +141,7 @@ def filter(self, dataset: pd.DataFrame): if self.cfg.preprocessing.pre_split.min_prediction_time_date: dataset = dataset[ dataset[self.cfg.data.col_name.pred_timestamp] - > self.cfg.data.min_prediction_time_date + > self.cfg.preprocessing.pre_split.min_prediction_time_date ] if self.cfg.preprocessing.pre_split.drop_patient_if_exclusion_before_date: diff --git a/src/psycop_model_training/utils/basemodel.py b/src/psycop_model_training/utils/basemodel.py new file mode 100644 index 00000000..058edff3 --- /dev/null +++ b/src/psycop_model_training/utils/basemodel.py @@ -0,0 +1,47 @@ +from typing import Any + +from pydantic import BaseModel as PydanticBaseModel +from pydantic import Extra + + +class BaseModel(PydanticBaseModel): + """.""" + + class Config: + """A pydantic basemodel, which doesn't allow attributes that are not + defined in the class.""" + + allow_mutation = False + arbitrary_types_allowed = True + extra = Extra.forbid + + def __transform_attributes_with_str_to_object( + self, + output_object: Any, + input_string: str = "str", + ): + for key, value in self.__dict__.items(): + if isinstance(value, str): + if value.lower() == input_string.lower(): + self.__dict__[key] = output_object + + def __init__( + self, + allow_mutation: bool = False, + **kwargs, + ): + super().__init__(**kwargs) + self.Config.allow_mutation = allow_mutation + + self.__transform_attributes_with_str_to_object( + input_string="null", + output_object=None, + ) + self.__transform_attributes_with_str_to_object( + input_string="false", + output_object=False, + ) + self.__transform_attributes_with_str_to_object( + input_string="true", + output_object=True, + ) diff --git a/src/psycop_model_training/utils/config_schemas/__init__.py b/src/psycop_model_training/utils/config_schemas/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/psycop_model_training/utils/config_schemas/conf_utils.py b/src/psycop_model_training/utils/config_schemas/conf_utils.py index 4b5e8a77..09830b11 100644 --- a/src/psycop_model_training/utils/config_schemas/conf_utils.py +++ b/src/psycop_model_training/utils/config_schemas/conf_utils.py @@ -1,5 +1,9 @@ -from typing import Optional, Any, Union +from typing import Optional, Union +from hydra import compose, initialize +from omegaconf import DictConfig, OmegaConf +from psycop_model_training.utils.basemodel import BaseModel from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema @@ -24,7 +28,7 @@ def load_test_cfg_as_omegaconf( overrides: Optional[list[str]] = None, ) -> DictConfig: """Load config as omegaconf object.""" - with initialize(version_base=None, config_path="../../../tests/config/"): + with initialize(version_base=None, config_path="../../../../tests/config/"): if overrides: cfg = compose( config_name=config_file_name, @@ -60,49 +64,6 @@ def load_test_cfg_as_pydantic( return convert_omegaconf_to_pydantic_object(conf=cfg, allow_mutation=allow_mutation) -class
BaseModel(PydanticBaseModel): - """.""" - - class Config: - """A pydantic basemodel, which doesn't allow attributes that are not - defined in the class.""" - - allow_mutation = False - arbitrary_types_allowed = True - extra = Extra.forbid - - def __transform_attributes_with_str_to_object( - self, - output_object: Any, - input_string: str = "str", - ): - for key, value in self.__dict__.items(): - if isinstance(value, str): - if value.lower() == input_string.lower(): - self.__dict__[key] = output_object - - def __init__( - self, - allow_mutation: bool = False, - **kwargs, - ): - super().__init__(**kwargs) - self.Config.allow_mutation = allow_mutation - - self.__transform_attributes_with_str_to_object( - input_string="null", - output_object=None, - ) - self.__transform_attributes_with_str_to_object( - input_string="false", - output_object=False, - ) - self.__transform_attributes_with_str_to_object( - input_string="true", - output_object=True, - ) - - class WatcherSchema(BaseModel): """Configuration for watchers.""" diff --git a/src/psycop_model_training/utils/config_schemas/data.py b/src/psycop_model_training/utils/config_schemas/data.py index 53a6ed0d..e7229a25 100644 --- a/src/psycop_model_training/utils/config_schemas/data.py +++ b/src/psycop_model_training/utils/config_schemas/data.py @@ -1,8 +1,7 @@ -from datetime import datetime from pathlib import Path from typing import Optional, Union -from psycop_model_training.utils.config_schemas import BaseModel +from psycop_model_training.utils.basemodel import BaseModel class CustomColNames(BaseModel): diff --git a/src/psycop_model_training/utils/config_schemas/eval.py b/src/psycop_model_training/utils/config_schemas/eval.py index 06e08489..61d9a2e2 100644 --- a/src/psycop_model_training/utils/config_schemas/eval.py +++ b/src/psycop_model_training/utils/config_schemas/eval.py @@ -1,4 +1,4 @@ -from psycop_model_training.utils.config_schemas import BaseModel +from psycop_model_training.utils.basemodel import BaseModel class EvalConfSchema(BaseModel): diff --git a/src/psycop_model_training/utils/config_schemas/full_config.py b/src/psycop_model_training/utils/config_schemas/full_config.py index 99bad476..8ff8d8ca 100644 --- a/src/psycop_model_training/utils/config_schemas/full_config.py +++ b/src/psycop_model_training/utils/config_schemas/full_config.py @@ -1,4 +1,4 @@ -from psycop_model_training.utils.config_schemas.conf_utils import BaseModel +from psycop_model_training.utils.basemodel import BaseModel from psycop_model_training.utils.config_schemas.data import DataSchema from psycop_model_training.utils.config_schemas.eval import EvalConfSchema from psycop_model_training.utils.config_schemas.model import ModelConfSchema diff --git a/src/psycop_model_training/utils/config_schemas/model.py b/src/psycop_model_training/utils/config_schemas/model.py index 36bbc604..2148afbc 100644 --- a/src/psycop_model_training/utils/config_schemas/model.py +++ b/src/psycop_model_training/utils/config_schemas/model.py @@ -1,4 +1,4 @@ -from psycop_model_training.utils.config_schemas import BaseModel +from psycop_model_training.utils.basemodel import BaseModel class ModelConfSchema(BaseModel): diff --git a/src/psycop_model_training/utils/config_schemas/preprocessing.py b/src/psycop_model_training/utils/config_schemas/preprocessing.py index ae927c39..b082d877 100644 --- a/src/psycop_model_training/utils/config_schemas/preprocessing.py +++ b/src/psycop_model_training/utils/config_schemas/preprocessing.py @@ -1,7 +1,7 @@ from datetime import datetime from typing import
Literal, Optional, Union -from psycop_model_training.utils.config_schemas import BaseModel +from psycop_model_training.utils.basemodel import BaseModel class FeatureSelectionSchema(BaseModel): diff --git a/src/psycop_model_training/utils/config_schemas/project.py b/src/psycop_model_training/utils/config_schemas/project.py index 9e524341..562dbe4c 100644 --- a/src/psycop_model_training/utils/config_schemas/project.py +++ b/src/psycop_model_training/utils/config_schemas/project.py @@ -1,4 +1,12 @@ -from psycop_model_training.utils.config_schemas import BaseModel, WandbSchema, WatcherSchema +from psycop_model_training.utils.basemodel import BaseModel + + +class WandbSchema(BaseModel): + """Configuration for weights and biases.""" + + group: str + mode: str + entity: str class ProjectSchema(BaseModel): @@ -7,13 +15,4 @@ class ProjectSchema(BaseModel): wandb: WandbSchema name: str = "psycop_model_training" seed: int - watcher: WatcherSchema gpu: bool - - -class WandbSchema(BaseModel): - """Configuration for weights and biases.""" - - group: str - mode: str - entity: str diff --git a/src/psycop_model_training/utils/config_schemas/train.py b/src/psycop_model_training/utils/config_schemas/train.py index 111f2b73..e91a0fc1 100644 --- a/src/psycop_model_training/utils/config_schemas/train.py +++ b/src/psycop_model_training/utils/config_schemas/train.py @@ -1,6 +1,6 @@ from typing import Optional -from psycop_model_training.utils.config_schemas import BaseModel +from psycop_model_training.utils.basemodel import BaseModel class TrainConfSchema(BaseModel): diff --git a/tests/config/data/synth_data.yaml b/tests/config/data/synth_data.yaml index 3b891332..7d72c24e 100644 --- a/tests/config/data/synth_data.yaml +++ b/tests/config/data/synth_data.yaml @@ -3,11 +3,8 @@ data: dir: tests/test_data/synth_splits suffix: csv n_training_samples: null - min_lookahead_days: 30 - min_prediction_time_date: null pred_prefix: pred_ outc_prefix: outc_ - min_age: 18 col_name: pred_timestamp: timestamp @@ -17,15 +14,3 @@ data: exclusion_timestamp: timestamp_exclusion custom: n_hba1c: hba1c_within_9999_days_count_nan - - # Looking ahead - drop_patient_if_exclusion_before_date: 1971-01-01 - - # Looking behind - lookbehind_combination: [30, 60, 100] - -# Parameters that will only take effect if running with --multirun -hydra: - sweeper: - params: - data.lookbehind_combination: choice([30, 90], [30]) diff --git a/tests/config/preprocessing/default_preprocessing.yaml b/tests/config/preprocessing/default_preprocessing.yaml index ad95e66e..f47e331e 100644 --- a/tests/config/preprocessing/default_preprocessing.yaml +++ b/tests/config/preprocessing/default_preprocessing.yaml @@ -1,20 +1,29 @@ # @package _global_ preprocessing: - convert_to_boolean: false - convert_booleans_to_int: true - drop_datetime_predictor_columns: true - convert_datetimes_to_ordinal: false - imputation_method: most_frequent - scaling: z-score-normalisation - feature_selection: - name: chi2 - params: - percentile: 20 # (int): Percent of features to keep. Defaults to 10. + pre_split: + convert_to_boolean: false + convert_booleans_to_int: true + drop_datetime_predictor_columns: true + convert_datetimes_to_ordinal: false + drop_patient_if_exclusion_before_date: 1971-01-01 + min_prediction_time_date: null + min_lookahead_days: 30 + lookbehind_combination: [30, 60, 100] + min_age: 18 + post_split: + imputation_method: most_frequent + scaling: z-score-normalisation + feature_selection: + name: chi2 + params: + percentile: 20 # (int): Percent of features to keep. 
Defaults to 10. +# Parameters that will only take effect if running with --multirun hydra: sweeper: params: - ++preprocessing.imputation_method: choice("most_frequent", "mean", "median", "null") - ++preprocessing.scaling: choice("z-score-normalization", "null") - ++preprocessing.feature_selection.name: choice("chi2", "null") - ++preprocessing.feature_selection.params.percentile: int(tag(log, interval(1, 90))) + ++preprocessing.post_split.imputation_method: choice("most_frequent", "mean", "median", "null") + ++preprocessing.post_split.scaling: choice("z-score-normalization", "null") + ++preprocessing.post_split.feature_selection.name: choice("chi2", "null") + ++preprocessing.post_split.feature_selection.params.percentile: int(tag(log, interval(1, 90))) + preprocessing.pre_split.lookbehind_combination: choice([30, 90], [30]) diff --git a/tests/config/project/integration_test_project.yaml b/tests/config/project/integration_test_project.yaml index 382a974e..98a8fdc4 100644 --- a/tests/config/project/integration_test_project.yaml +++ b/tests/config/project/integration_test_project.yaml @@ -6,10 +6,4 @@ wandb: group: integration_testing entity: psycop # Which entity to run WanDB in. -watcher: - archive_all: true - keep_alive_after_training_minutes: 5 - n_runs_before_eval: 1 - verbose: true - gpu: false diff --git a/tests/conftest.py b/tests/conftest.py index b10c9367..6ca2361b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,7 +6,7 @@ import pytest from psycop_model_training.model_eval.dataclasses import EvalDataset -from psycop_model_training.utils.config_schemas import ( +from psycop_model_training.utils.config_schemas.conf_utils import ( FullConfigSchema, load_test_cfg_as_pydantic, ) diff --git a/tests/test_load.py b/tests/test_load.py index c79b77b3..d60f2d12 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -13,7 +13,7 @@ def test_load_lookbehind_exceeds_lookbehind_threshold( n_cols_before_filtering = load_and_filter_train_from_cfg(cfg=cfg).shape[1] - cfg.data.lookbehind_combination = [30, 60] + cfg.preprocessing.pre_split.lookbehind_combination = [30, 60] n_cols_after_filtering = load_and_filter_train_from_cfg(cfg=cfg).shape[1] From f76dd5a73dd70f9a85bb097111ab05acb7264774 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 16 Dec 2022 15:49:57 +0100 Subject: [PATCH 12/47] fix: post_split config addressing --- .../post_split/create_pipeline.py | 25 +++++++++++-------- tests/test_preprocessing.py | 8 +++--- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/psycop_model_training/preprocessing/post_split/create_pipeline.py b/src/psycop_model_training/preprocessing/post_split/create_pipeline.py index 18f4999d..8f56a338 100644 --- a/src/psycop_model_training/preprocessing/post_split/create_pipeline.py +++ b/src/psycop_model_training/preprocessing/post_split/create_pipeline.py @@ -24,7 +24,7 @@ def get_feature_selection_steps(cfg): """Add feature selection steps to the preprocessing pipeline.""" new_steps = [] - if cfg.preprocessing.feature_selection.name: + if cfg.preprocessing.post_split.feature_selection.name: if cfg.preprocessing.feature_selection.name == "f_classif": new_steps.append( ( "feature_selection", SelectPercentile( f_classif, percentile=cfg.preprocessing.feature_selection.params[ "percentile" ], ), ), ) @@ -75,7 +75,7 @@ def create_preprocessing_pipeline(cfg: FullConfigSchema): steps = [] # Conversion - if cfg.preprocessing.drop_datetime_predictor_columns: + if cfg.preprocessing.pre_split.drop_datetime_predictor_columns: steps.append( ( "DropDateTimeColumns", DropDateTimeColumns(pred_prefix=cfg.data.pred_prefix), ), ) - if
cfg.preprocessing.convert_datetimes_to_ordinal: + if cfg.preprocessing.pre_split.convert_datetimes_to_ordinal: dtconverter = DateTimeConverter() steps.append(("DateTimeConverter", dtconverter)) - if cfg.preprocessing.convert_to_boolean: + if cfg.preprocessing.pre_split.convert_to_boolean: steps.append(("ConvertToBoolean", ConvertToBoolean())) # Imputation - if cfg.model.require_imputation and not cfg.preprocessing.imputation_method: + if ( + cfg.model.require_imputation + and not cfg.preprocessing.post_split.imputation_method + ): msg.warn( f"{cfg.model.name} requires imputation, but no imputation method was specified in the config file. Overriding to 'mean'.", ) - cfg.preprocessing.imputation_method = "mean" + cfg.preprocessing.post_split.imputation_method = "mean" # Not a great solution, but preferable to the script breaking and stopping a hyperparameter search. raise ValueError( f"{cfg.model.name} requires imputation, but no imputation method was specified in the config file.", ) - if cfg.preprocessing.imputation_method: + if cfg.preprocessing.post_split.imputation_method: steps.append( ( "Imputation", - SimpleImputer(strategy=cfg.preprocessing.imputation_method), + SimpleImputer(strategy=cfg.preprocessing.post_split.imputation_method), ), ) @@ -120,8 +123,8 @@ def create_preprocessing_pipeline(cfg: FullConfigSchema): # Important to do this after feature selection, since # half of the values in z-score normalisation will be negative, # which is not allowed for chi2 - if cfg.preprocessing.scaling: - if cfg.preprocessing.scaling in { + if cfg.preprocessing.post_split.scaling: + if cfg.preprocessing.post_split.scaling in { "z-score-normalization", "z-score-normalisation", }: @@ -130,7 +133,7 @@ def create_preprocessing_pipeline(cfg: FullConfigSchema): ) else: raise ValueError( - f"{cfg.preprocessing.scaling} is not implemented. See above", + f"{cfg.preprocessing.post_split.scaling} is not implemented. 
See above", ) return Pipeline(steps) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 8f94dca8..faa6d882 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -13,10 +13,10 @@ def test_drop_datetime_predictor_columns( specified lookbehind combination list.""" cfg = muteable_test_config - cfg.preprocessing.drop_datetime_predictor_columns = True - cfg.preprocessing.imputation_method = None - cfg.preprocessing.feature_selection.name = None - cfg.preprocessing.scaling = None + cfg.preprocessing.pre_split.drop_datetime_predictor_columns = True + cfg.preprocessing.post_split.imputation_method = None + cfg.preprocessing.post_split.feature_selection.name = None + cfg.preprocessing.post_split.scaling = None cfg.data.pred_prefix = "timestamp" pipe = create_preprocessing_pipeline(cfg=cfg) From c896e38a52fbd25c8b2510940d8c243558bd3228 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 16 Dec 2022 15:54:56 +0100 Subject: [PATCH 13/47] tests: more errors from new config --- application/train_model.py | 8 ++++---- .../preprocessing/post_split/create_pipeline.py | 16 +++++++++------- .../utils/col_name_inference.py | 3 ++- tests/test_configs.py | 2 +- tests/test_load.py | 2 +- tests/test_train_model.py | 10 +++++----- 6 files changed, 22 insertions(+), 19 deletions(-) diff --git a/application/train_model.py b/application/train_model.py index 545953cb..36bf952d 100644 --- a/application/train_model.py +++ b/application/train_model.py @@ -23,10 +23,10 @@ train_and_get_model_eval_df, ) from psycop_model_training.utils.col_name_inference import get_col_names -from psycop_model_training.utils.config_schemas import ( - FullConfigSchema, +from psycop_model_training.utils.config_schemas.conf_utils import ( convert_omegaconf_to_pydantic_object, ) +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema from psycop_model_training.utils.utils import ( PROJECT_ROOT, create_wandb_folders, @@ -158,8 +158,8 @@ def main(cfg: DictConfig): run.log( { "roc_auc_unweighted": roc_auc, - "lookbehind": max(cfg.data.lookbehind_combination), - "lookahead": cfg.data.min_lookahead_days, + "lookbehind": max(cfg.preprocessing.pre_split.lookbehind_combination), + "lookahead": cfg.preprocessing.pre_split.min_lookahead_days, }, ) run.finish() diff --git a/src/psycop_model_training/preprocessing/post_split/create_pipeline.py b/src/psycop_model_training/preprocessing/post_split/create_pipeline.py index 8f56a338..9b50f8c4 100644 --- a/src/psycop_model_training/preprocessing/post_split/create_pipeline.py +++ b/src/psycop_model_training/preprocessing/post_split/create_pipeline.py @@ -25,37 +25,39 @@ def get_feature_selection_steps(cfg): new_steps = [] if cfg.preprocessing.post_split.feature_selection.name: - if cfg.preprocessing.feature_selection.name == "f_classif": + if cfg.preprocessing.post_split.feature_selection.name == "f_classif": new_steps.append( ( "feature_selection", SelectPercentile( f_classif, - percentile=cfg.preprocessing.feature_selection.params[ + percentile=cfg.preprocessing.post_split.feature_selection.params[ "percentile" ], ), ), ) - elif cfg.preprocessing.feature_selection.name == "chi2": + elif cfg.preprocessing.post_split.feature_selection.name == "chi2": new_steps.append( ( "feature_selection", SelectPercentile( chi2, - percentile=cfg.preprocessing.feature_selection.params[ + percentile=cfg.preprocessing.post_split.feature_selection.params[ "percentile" ], ), ), ) - elif cfg.preprocessing.feature_selection.name == 
"mutual_info_classif": + elif ( + cfg.preprocessing.post_split.feature_selection.name == "mutual_info_classif" + ): new_steps.append( ( "feature_selection", SelectPercentile( mutual_info_classif, - percentile=cfg.preprocessing.feature_selection.params[ + percentile=cfg.preprocessing.post_split.feature_selection.params[ "percentile" ], ), @@ -63,7 +65,7 @@ def get_feature_selection_steps(cfg): ) else: raise ValueError( - f"Unknown feature selection method {cfg.preprocessing.feature_selection.name}", + f"Unknown feature selection method {cfg.preprocessing.post_split.feature_selection.name}", ) return new_steps diff --git a/src/psycop_model_training/utils/col_name_inference.py b/src/psycop_model_training/utils/col_name_inference.py index b09f5781..22348653 100644 --- a/src/psycop_model_training/utils/col_name_inference.py +++ b/src/psycop_model_training/utils/col_name_inference.py @@ -22,7 +22,8 @@ def get_col_names(cfg: DictConfig, train: pd.DataFrame) -> tuple[str, list[str]] potential_outcome_col_names = [ c for c in train.columns - if cfg.data.outc_prefix in c and str(cfg.data.min_lookahead_days) in c + if cfg.data.outc_prefix in c + and str(cfg.preprocessing.pre_split.min_lookahead_days) in c ] if len(potential_outcome_col_names) != 1: diff --git a/tests/test_configs.py b/tests/test_configs.py index 31bc472d..f54369bf 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -4,7 +4,7 @@ import pytest from hydra import compose, initialize -from psycop_model_training.utils.config_schemas import ( +from psycop_model_training.utils.config_schemas.conf_utils import ( convert_omegaconf_to_pydantic_object, ) from psycop_model_training.utils.utils import PROJECT_ROOT diff --git a/tests/test_load.py b/tests/test_load.py index d60f2d12..4f7b2cec 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -29,7 +29,7 @@ def test_load_lookbehind_not_in_lookbehind_combination( n_cols_before_filtering = load_and_filter_train_from_cfg(cfg=cfg).shape[1] - cfg.data.lookbehind_combination = [60] + cfg.preprocessing.pre_split.lookbehind_combination = [60] n_cols_after_filtering = load_and_filter_train_from_cfg(cfg=cfg).shape[1] diff --git a/tests/test_train_model.py b/tests/test_train_model.py index c5d8cabe..8be1e351 100644 --- a/tests/test_train_model.py +++ b/tests/test_train_model.py @@ -5,10 +5,10 @@ from application.train_model import main from psycop_model_training.training.model_specs import MODELS -from psycop_model_training.utils.config_schemas import ( - FullConfigSchema, +from psycop_model_training.utils.config_schemas.conf_utils import ( load_test_cfg_as_omegaconf, ) +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema INTEGRATION_TEST_FILE_NAME = "integration_config.yaml" @@ -46,13 +46,13 @@ def test_crossvalidation(muteable_test_config: FullConfigSchema): def test_min_prediction_time_date(muteable_test_config: FullConfigSchema): """Test minimum prediction times correctly resolving the string.""" cfg = muteable_test_config - cfg.data.min_prediction_time_date = "1972-01-01" + cfg.preprocessing.pre_split.min_prediction_time_date = "1972-01-01" main(cfg) def test_feature_selection(muteable_test_config: FullConfigSchema): """Test feature selection.""" cfg = muteable_test_config - cfg.preprocessing.feature_selection.name = "mutual_info_classif" - cfg.preprocessing.feature_selection.params["percentile"] = 10 + cfg.preprocessing.post_split.feature_selection.name = "mutual_info_classif" + cfg.preprocessing.post_split.feature_selection.params["percentile"] = 10 
main(cfg) From ea98c38e65d0390ba27fae64286adf7a54b121cf Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Fri, 16 Dec 2022 15:55:15 +0100 Subject: [PATCH 14/47] style: linting --- application/inspect_dataset.py | 2 +- src/psycop_model_training/data_loader/utils.py | 3 ++- src/psycop_model_training/model_eval/dataclasses.py | 4 +--- src/psycop_model_training/model_eval/evaluate_model.py | 3 ++- .../preprocessing/pre_split/processors/col_filter.py | 2 +- src/psycop_model_training/utils/config_schemas/conf_utils.py | 3 ++- 6 files changed, 9 insertions(+), 8 deletions(-) diff --git a/application/inspect_dataset.py b/application/inspect_dataset.py index 8ab34b45..5c90b68e 100644 --- a/application/inspect_dataset.py +++ b/application/inspect_dataset.py @@ -14,7 +14,7 @@ def main(): df = load_train_raw(cfg=cfg) # noqa pylint: disable=unused-variable df_filtered = load_and_filter_train_from_cfg( - cfg=cfg + cfg=cfg, ) # noqa pylint: disable=unused-variable diff --git a/src/psycop_model_training/data_loader/utils.py b/src/psycop_model_training/data_loader/utils.py index 18db5630..885e16a9 100644 --- a/src/psycop_model_training/data_loader/utils.py +++ b/src/psycop_model_training/data_loader/utils.py @@ -16,7 +16,8 @@ def get_latest_dataset_dir(path: Path) -> Path: def load_and_filter_split_from_cfg( - cfg: FullConfigSchema, split: Literal["train", "test", "val"] + cfg: FullConfigSchema, + split: Literal["train", "test", "val"], ) -> pd.DataFrame: """Load train dataset from config. diff --git a/src/psycop_model_training/model_eval/dataclasses.py b/src/psycop_model_training/model_eval/dataclasses.py index df30876c..ecabe234 100644 --- a/src/psycop_model_training/model_eval/dataclasses.py +++ b/src/psycop_model_training/model_eval/dataclasses.py @@ -4,10 +4,8 @@ import pandas as pd -from psycop_model_training.utils.config_schemas.conf_utils import ( - FullConfigSchema, -) from psycop_model_training.utils.basemodel import BaseModel +from psycop_model_training.utils.config_schemas.conf_utils import FullConfigSchema class CustomColumns(BaseModel): diff --git a/src/psycop_model_training/model_eval/evaluate_model.py b/src/psycop_model_training/model_eval/evaluate_model.py index 113c3717..f1cc701d 100644 --- a/src/psycop_model_training/model_eval/evaluate_model.py +++ b/src/psycop_model_training/model_eval/evaluate_model.py @@ -170,7 +170,8 @@ def create_custom_plot_artifacts( eval_dataset: EvalDataset, save_dir: Path, ) -> list[ArtifactContainer]: - """A collection of plots that are only generated for your specific use case.""" + """A collection of plots that are only generated for your specific use + case.""" return [ ArtifactContainer( label="performance_by_n_hba1c", diff --git a/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py b/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py index c028706e..35166edd 100644 --- a/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py +++ b/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py @@ -45,7 +45,7 @@ def _drop_cols_not_in_lookbehind_combination( # Convert list to set lookbehinds_in_spec = set( - self.cfg.preprocessing.pre_split.lookbehind_combination + self.cfg.preprocessing.pre_split.lookbehind_combination, ) # Check that all loobehinds in lookbehind_combination are used in the predictors diff --git a/src/psycop_model_training/utils/config_schemas/conf_utils.py b/src/psycop_model_training/utils/config_schemas/conf_utils.py index 09830b11..f9d0c556 100644 --- 
a/src/psycop_model_training/utils/config_schemas/conf_utils.py +++ b/src/psycop_model_training/utils/config_schemas/conf_utils.py @@ -58,7 +58,8 @@ def load_test_cfg_as_pydantic( ) -> FullConfigSchema: """Load config as pydantic object.""" cfg = load_test_cfg_as_omegaconf( - config_file_name=config_file_name, overrides=overrides + config_file_name=config_file_name, + overrides=overrides, ) return convert_omegaconf_to_pydantic_object(conf=cfg, allow_mutation=allow_mutation) From 1339bc5254dc432bc3294675422b5c80d617c2f3 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Mon, 19 Dec 2022 09:49:46 +0100 Subject: [PATCH 15/47] refactor: move pre_split pipeline objects to pre_split dir --- application/train_model.py | 27 +------------ .../post_split/create_pipeline.py | 23 ----------- .../post_split/feature_selectors.py | 4 +- .../preprocessing/post_split/pipeline.py | 25 ++++++++++++ .../preprocessing/pre_split/full_processor.py | 4 ++ .../processors/pre_split_pipeline.py | 38 +++++++++++++++++++ tests/test_preprocessing.py | 5 --- 7 files changed, 71 insertions(+), 55 deletions(-) create mode 100644 src/psycop_model_training/preprocessing/post_split/pipeline.py create mode 100644 src/psycop_model_training/preprocessing/pre_split/processors/pre_split_pipeline.py diff --git a/application/train_model.py b/application/train_model.py index 36bf952d..6de7b070 100644 --- a/application/train_model.py +++ b/application/train_model.py @@ -6,7 +6,6 @@ import wandb from omegaconf import DictConfig, OmegaConf from sklearn.metrics import roc_auc_score -from sklearn.pipeline import Pipeline from wasabi import Printer from psycop_model_training.data_loader.utils import ( @@ -14,12 +13,8 @@ ) from psycop_model_training.model_eval.dataclasses import PipeMetadata from psycop_model_training.model_eval.evaluate_model import run_full_evaluation -from psycop_model_training.preprocessing.post_split.create_pipeline import ( - create_preprocessing_pipeline, -) from psycop_model_training.training.train_and_eval import ( CONFIG_PATH, - create_model, train_and_get_model_eval_df, ) from psycop_model_training.utils.col_name_inference import get_col_names @@ -35,25 +30,7 @@ get_feature_importance_dict, get_selected_features_dict, ) - - -def create_pipeline(cfg): - """Create pipeline. 
- - Args: - cfg (DictConfig): Config object - - Returns: - Pipeline - """ - steps = [] - preprocessing_pipe = create_preprocessing_pipeline(cfg) - if len(preprocessing_pipe.steps) != 0: - steps.append(("preprocessing", preprocessing_pipe)) - - mdl = create_model(cfg) - steps.append(("model", mdl)) - return Pipeline(steps) +from psycop_model_training.preprocessing.post_split.pipeline import create_post_split_pipeline @hydra.main( @@ -102,7 +79,7 @@ def main(cfg: DictConfig): dataset = load_and_filter_train_and_val_from_cfg(cfg) msg.info("Creating pipeline") - pipe = create_pipeline(cfg) + pipe = create_post_split_pipeline(cfg) outcome_col_name, train_col_names = get_col_names(cfg, dataset.train) diff --git a/src/psycop_model_training/preprocessing/post_split/create_pipeline.py b/src/psycop_model_training/preprocessing/post_split/create_pipeline.py index 9b50f8c4..9726c131 100644 --- a/src/psycop_model_training/preprocessing/post_split/create_pipeline.py +++ b/src/psycop_model_training/preprocessing/post_split/create_pipeline.py @@ -10,13 +10,6 @@ from sklearn.preprocessing import StandardScaler from wasabi import Printer -from psycop_model_training.preprocessing.post_split.feature_selectors import ( - DropDateTimeColumns, -) -from psycop_model_training.preprocessing.post_split.feature_transformers import ( - ConvertToBoolean, - DateTimeConverter, -) from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema @@ -76,22 +69,6 @@ def create_preprocessing_pipeline(cfg: FullConfigSchema): msg = Printer(timestamp=True) steps = [] - # Conversion - if cfg.preprocessing.pre_split.drop_datetime_predictor_columns: - steps.append( - ( - "DropDateTimeColumns", - DropDateTimeColumns(pred_prefix=cfg.data.pred_prefix), - ), - ) - - if cfg.preprocessing.pre_split.convert_datetimes_to_ordinal: - dtconverter = DateTimeConverter() - steps.append(("DateTimeConverter", dtconverter)) - - if cfg.preprocessing.pre_split.convert_to_boolean: - steps.append(("ConvertToBoolean", ConvertToBoolean())) - # Imputation if ( cfg.model.require_imputation diff --git a/src/psycop_model_training/preprocessing/post_split/feature_selectors.py b/src/psycop_model_training/preprocessing/post_split/feature_selectors.py index 339a0d3c..02a0a848 100644 --- a/src/psycop_model_training/preprocessing/post_split/feature_selectors.py +++ b/src/psycop_model_training/preprocessing/post_split/feature_selectors.py @@ -14,7 +14,7 @@ def __init__( Args: drop_dtypes (set, optional): Drop columns with these data types. 
""" - self.drop_dypes = drop_dtypes + self.drop_dtypes = drop_dtypes self.pred_prefix = pred_prefix def fit(self, _, y=None): # pylint: disable=unused-argument @@ -23,7 +23,7 @@ def fit(self, _, y=None): # pylint: disable=unused-argument def transform(self, X, y=None): # pylint: disable=unused-argument """Transform the data.""" - columns_to_drop = [c for c in X.columns if X[c].dtype in self.drop_dypes] + columns_to_drop = [c for c in X.columns if X[c].dtype in self.drop_dtypes] columns_to_drop = [c for c in columns_to_drop if c.startswith(self.pred_prefix)] return X[[c for c in X.columns if c not in columns_to_drop]] diff --git a/src/psycop_model_training/preprocessing/post_split/pipeline.py b/src/psycop_model_training/preprocessing/post_split/pipeline.py new file mode 100644 index 00000000..f3c664ed --- /dev/null +++ b/src/psycop_model_training/preprocessing/post_split/pipeline.py @@ -0,0 +1,25 @@ +from sklearn.pipeline import Pipeline + +from psycop_model_training.preprocessing.post_split.create_pipeline import ( + create_preprocessing_pipeline, +) +from psycop_model_training.training.train_and_eval import create_model + + +def create_post_split_pipeline(cfg): + """Create pipeline. + + Args: + cfg (DictConfig): Config object + + Returns: + Pipeline + """ + steps = [] + preprocessing_pipe = create_preprocessing_pipeline(cfg) + if len(preprocessing_pipe.steps) != 0: + steps.append(("preprocessing", preprocessing_pipe)) + + mdl = create_model(cfg) + steps.append(("model", mdl)) + return Pipeline(steps) diff --git a/src/psycop_model_training/preprocessing/pre_split/full_processor.py b/src/psycop_model_training/preprocessing/pre_split/full_processor.py index c18ee5e3..eab0a2a2 100644 --- a/src/psycop_model_training/preprocessing/pre_split/full_processor.py +++ b/src/psycop_model_training/preprocessing/pre_split/full_processor.py @@ -6,6 +6,9 @@ from psycop_model_training.preprocessing.pre_split.processors.col_transformer import ( PresSplitColTransformer, ) +from psycop_model_training.preprocessing.pre_split.processors.pre_split_pipeline import ( + apply_pre_split_pipeline, +) from psycop_model_training.preprocessing.pre_split.processors.row_filter import ( PreSplitRowFilter, ) @@ -25,4 +28,5 @@ def process(self, dataset: pd.DataFrame): dataset = self.col_transformer.transform(dataset=dataset) dataset = self.row_filterer.filter(dataset=dataset) dataset = self.col_filterer.filter(dataset=dataset) + dataset = apply_pre_split_pipeline(cfg=self.cfg, data=dataset) return dataset diff --git a/src/psycop_model_training/preprocessing/pre_split/processors/pre_split_pipeline.py b/src/psycop_model_training/preprocessing/pre_split/processors/pre_split_pipeline.py new file mode 100644 index 00000000..47d286fa --- /dev/null +++ b/src/psycop_model_training/preprocessing/pre_split/processors/pre_split_pipeline.py @@ -0,0 +1,38 @@ +import pandas as pd +from sklearn.pipeline import Pipeline + +from psycop_model_training.preprocessing.post_split.feature_selectors import ( + DropDateTimeColumns, +) +from psycop_model_training.preprocessing.post_split.feature_transformers import ( + ConvertToBoolean, + DateTimeConverter, +) +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema + + +def apply_pre_split_pipeline(cfg: FullConfigSchema, data: pd.DataFrame): + pipe = create_pre_split_pipeline(cfg=cfg) + + return pipe.fit_transform(X=data) + + +def create_pre_split_pipeline(cfg: FullConfigSchema): + steps = [] + # Conversion + if cfg.preprocessing.pre_split.drop_datetime_predictor_columns: 
+ steps.append( + ( + "DropDateTimeColumns", + DropDateTimeColumns(pred_prefix=cfg.data.pred_prefix), + ), + ) + + if cfg.preprocessing.pre_split.convert_datetimes_to_ordinal: + dtconverter = DateTimeConverter() + steps.append(("DateTimeConverter", dtconverter)) + + if cfg.preprocessing.pre_split.convert_to_boolean: + steps.append(("ConvertToBoolean", ConvertToBoolean())) + + return Pipeline(steps) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index faa6d882..86336a22 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -1,8 +1,5 @@ """Test custom preprocessing steps.""" from psycop_model_training.data_loader.utils import load_and_filter_train_from_cfg -from psycop_model_training.preprocessing.post_split.create_pipeline import ( - create_preprocessing_pipeline, -) from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema @@ -19,8 +16,6 @@ def test_drop_datetime_predictor_columns( cfg.preprocessing.post_split.scaling = None cfg.data.pred_prefix = "timestamp" - pipe = create_preprocessing_pipeline(cfg=cfg) train_df = load_and_filter_train_from_cfg(cfg=cfg) - train_df = pipe.transform(X=train_df) assert len([x for x in train_df.columns if "timestamp" in x]) == 0 From 70611810dcdb9c9767d4ed0322e74ce49ec0bf13 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Mon, 19 Dec 2022 09:52:03 +0100 Subject: [PATCH 16/47] style: linting --- application/train_model.py | 5 ++++- .../preprocessing/post_split/pipeline.py | 1 + .../preprocessing/pre_split/full_processor.py | 1 + .../processors/pre_split_pipeline.py | 19 +++++++++++------ 4 files changed, 19 insertions(+), 7 deletions(-) diff --git a/application/train_model.py b/application/train_model.py index 6de7b070..79488bd4 100644 --- a/application/train_model.py +++ b/application/train_model.py @@ -1,3 +1,4 @@ +"""Train a single model and evaluate it.""" import time from typing import Any @@ -13,6 +14,9 @@ ) from psycop_model_training.model_eval.dataclasses import PipeMetadata from psycop_model_training.model_eval.evaluate_model import run_full_evaluation +from psycop_model_training.preprocessing.post_split.pipeline import ( + create_post_split_pipeline, +) from psycop_model_training.training.train_and_eval import ( CONFIG_PATH, train_and_get_model_eval_df, ) @@ -30,7 +34,6 @@ get_feature_importance_dict, get_selected_features_dict, ) -from psycop_model_training.preprocessing.post_split.pipeline import create_post_split_pipeline @hydra.main( diff --git a/src/psycop_model_training/preprocessing/post_split/pipeline.py b/src/psycop_model_training/preprocessing/post_split/pipeline.py index f3c664ed..67a55eab 100644 --- a/src/psycop_model_training/preprocessing/post_split/pipeline.py +++ b/src/psycop_model_training/preprocessing/post_split/pipeline.py @@ -1,3 +1,4 @@ +"""Create post split pipeline.""" from sklearn.pipeline import Pipeline from psycop_model_training.preprocessing.post_split.create_pipeline import ( diff --git a/src/psycop_model_training/preprocessing/pre_split/full_processor.py b/src/psycop_model_training/preprocessing/pre_split/full_processor.py index eab0a2a2..1cf7eaa6 100644 --- a/src/psycop_model_training/preprocessing/pre_split/full_processor.py +++ b/src/psycop_model_training/preprocessing/pre_split/full_processor.py @@ -1,3 +1,4 @@ +"""Full processor for pre-split preprocessing.""" import pandas as pd from psycop_model_training.preprocessing.pre_split.processors.col_filter import ( diff --git
a/src/psycop_model_training/preprocessing/pre_split/processors/pre_split_pipeline.py b/src/psycop_model_training/preprocessing/pre_split/processors/pre_split_pipeline.py index 47d286fa..c5befe89 100644 --- a/src/psycop_model_training/preprocessing/pre_split/processors/pre_split_pipeline.py +++ b/src/psycop_model_training/preprocessing/pre_split/processors/pre_split_pipeline.py @@ -1,3 +1,8 @@ +"""Pipeline for pre_split misc. + +Legacy from when we used pipelines, will be refactored into the row and +col filters and transformers. +""" import pandas as pd from sklearn.pipeline import Pipeline @@ -11,13 +16,8 @@ from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema -def apply_pre_split_pipeline(cfg: FullConfigSchema, data: pd.DataFrame): - pipe = create_pre_split_pipeline(cfg=cfg) - - return pipe.fit_transform(X=data) - - def create_pre_split_pipeline(cfg: FullConfigSchema): + """Create pipeline.""" steps = [] # Conversion if cfg.preprocessing.pre_split.drop_datetime_predictor_columns: @@ -36,3 +36,10 @@ steps.append(("ConvertToBoolean", ConvertToBoolean())) return Pipeline(steps) + + +def apply_pre_split_pipeline(cfg: FullConfigSchema, data: pd.DataFrame): + """Apply pipeline to data.""" + pipe = create_pre_split_pipeline(cfg=cfg) + + return pipe.fit_transform(X=data) From afc6f88e2075ef264df2280f62d3c44559d8bf43 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Mon, 19 Dec 2022 10:38:43 +0100 Subject: [PATCH 17/47] refactor: remove pre_split pipeline --- .../post_split/feature_transformers.py | 90 ------------------- .../preprocessing/pre_split/full_processor.py | 19 ++-- .../pre_split/processors/col_filter.py | 19 ++++ .../processors/pre_split_pipeline.py | 45 ---------- ...{col_transformer.py => value_formatter.py} | 37 ++++---- .../pre_split/processors/value_transformer.py | 74 +++++++++++++++ 6 files changed, 123 insertions(+), 161 deletions(-) delete mode 100644 src/psycop_model_training/preprocessing/post_split/feature_transformers.py delete mode 100644 src/psycop_model_training/preprocessing/pre_split/processors/pre_split_pipeline.py rename src/psycop_model_training/preprocessing/pre_split/processors/{col_transformer.py => value_formatter.py} (70%) create mode 100644 src/psycop_model_training/preprocessing/pre_split/processors/value_transformer.py diff --git a/src/psycop_model_training/preprocessing/post_split/feature_transformers.py b/src/psycop_model_training/preprocessing/post_split/feature_transformers.py deleted file mode 100644 index 5886d6dd..00000000 --- a/src/psycop_model_training/preprocessing/post_split/feature_transformers.py +++ /dev/null @@ -1,90 +0,0 @@ -"""Custom transformers for data preprocessing.""" -from datetime import datetime -from typing import Optional - -from sklearn.base import BaseEstimator, TransformerMixin - -# pylint: disable=missing-function-docstring - - -class ConvertToBoolean(BaseEstimator, TransformerMixin): - """Convert all cells with a value to True, otherwise false.""" - - def __init__( - self, - columns_to_include: Optional[tuple[str]] = None, - columns_to_skip: Optional[tuple[str, str]] = ("age_in_years", "sex_female"), - ignore_dtypes: Optional[tuple] = ("datetime64[ns]", "<M8[ns]"), - ) -> None: - """ - Args: - columns_to_include (list[str], optional): Columns to convert to boolean. - Acts as a whitelist, skipping all columns not in the list. - columns_to_skip (Union(tuple[str], None) : Columns to not convert to boolean. - Acts as a blacklist.
- Defaults to ["age_in_years", "male"]. - Default to None in which case all columns are included. - ignore_dtypes (set, optional): Skip columns with these data types. Defaults - to {"datetime64[ns]"}. - """ - self.columns_to_skip = columns_to_skip - self.columns_to_include = columns_to_include - self.ignore_dtypes = set(ignore_dtypes) if ignore_dtypes else None - - def fit(self, _, y=None): # pylint: disable=unused-argument - return self - - def transform(self, X, y=None): # pylint: disable=unused-argument - columns = X.columns - - if self.columns_to_include: - columns = [c for c in columns if c in self.columns_to_include] - - cols_to_round = [ - c - for c in columns - if (X[c].dtype not in self.ignore_dtypes) or c in self.columns_to_skip - ] - - for col in cols_to_round: - X[col] = X[col].notnull() - - return X - - -class DateTimeConverter(BaseEstimator, TransformerMixin): - """Convert datetime columns to integers.""" - - valid_types = {"ordinal"} - datetime_dtypes = {"datetime64[ns]", "<M8[ns]"} [...] + ) -> pd.DataFrame: + """Drop all datetime columns from the dataset.""" + columns_to_drop = [ + c for c in dataset.columns if dataset[c].dtype in drop_dtypes + ] + columns_to_drop = [c for c in columns_to_drop if c.startswith(pred_prefix)] + + return dataset[[c for c in dataset.columns if c not in columns_to_drop]] + @print_df_dimensions_diff def n_outcome_col_names(self, df: pd.DataFrame) -> int: """How many outcome columns there are in a dataframe.""" @@ -192,4 +206,9 @@ def filter(self, dataset: pd.DataFrame) -> pd.DataFrame: dataset=dataset, ) + if self.cfg.preprocessing.pre_split.drop_datetime_predictor_columns: + dataset = self._drop_datetime_columns( + pred_prefix=self.cfg.data.pred_prefix, dataset=dataset + ) + return dataset diff --git a/src/psycop_model_training/preprocessing/pre_split/processors/pre_split_pipeline.py b/src/psycop_model_training/preprocessing/pre_split/processors/pre_split_pipeline.py deleted file mode 100644 index c5befe89..00000000 --- a/src/psycop_model_training/preprocessing/pre_split/processors/pre_split_pipeline.py +++ /dev/null @@ -1,45 +0,0 @@ -"""Pipeline for pre_split misc. - -Legacy from when we used pipelines, will be refactored into the row and -col filters and transformers.
-""" -import pandas as pd -from sklearn.pipeline import Pipeline - -from psycop_model_training.preprocessing.post_split.feature_selectors import ( - DropDateTimeColumns, -) -from psycop_model_training.preprocessing.post_split.feature_transformers import ( - ConvertToBoolean, - DateTimeConverter, -) -from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema - - -def create_pre_split_pipeline(cfg: FullConfigSchema): - """Create pipeline.""" - steps = [] - # Conversion - if cfg.preprocessing.pre_split.drop_datetime_predictor_columns: - steps.append( - ( - "DropDateTimeColumns", - DropDateTimeColumns(pred_prefix=cfg.data.pred_prefix), - ), - ) - - if cfg.preprocessing.pre_split.convert_datetimes_to_ordinal: - dtconverter = DateTimeConverter() - steps.append(("DateTimeConverter", dtconverter)) - - if cfg.preprocessing.pre_split.convert_to_boolean: - steps.append(("ConvertToBoolean", ConvertToBoolean())) - - return Pipeline(steps) - - -def apply_pre_split_pipeline(cfg: FullConfigSchema, data: pd.DataFrame): - """Apply pipeline to data.""" - pipe = create_pre_split_pipeline(cfg=cfg) - - return pipe.fit_transform(X=data) diff --git a/src/psycop_model_training/preprocessing/pre_split/processors/col_transformer.py b/src/psycop_model_training/preprocessing/pre_split/processors/value_formatter.py similarity index 70% rename from src/psycop_model_training/preprocessing/pre_split/processors/col_transformer.py rename to src/psycop_model_training/preprocessing/pre_split/processors/value_formatter.py index 45299438..50182b47 100644 --- a/src/psycop_model_training/preprocessing/pre_split/processors/col_transformer.py +++ b/src/psycop_model_training/preprocessing/pre_split/processors/value_formatter.py @@ -1,21 +1,18 @@ import numpy as np import pandas as pd -from wasabi import Printer from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema from psycop_model_training.utils.decorators import print_df_dimensions_diff from psycop_model_training.utils.utils import infer_predictor_col_name -msg = Printer(timestamp=True) - -class PresSplitColTransformer: +class PreSplitValueFormatter: def __init__(self, cfg: FullConfigSchema) -> None: self.cfg = cfg @staticmethod @print_df_dimensions_diff - def convert_timestamp_dtype_and_nat(dataset: pd.DataFrame) -> pd.DataFrame: + def _convert_timestamp_dtype_and_nat(dataset: pd.DataFrame) -> pd.DataFrame: """Convert columns with `timestamp`in their name to datetime, and convert 0's to NaT.""" timestamp_colnames = [col for col in dataset.columns if "timestamp" in col] @@ -31,11 +28,22 @@ def convert_timestamp_dtype_and_nat(dataset: pd.DataFrame) -> pd.DataFrame: return dataset @print_df_dimensions_diff - def _convert_boolean_dtypes_to_int(self, dataset: pd.DataFrame) -> pd.DataFrame: - """Convert boolean dtypes to int.""" - for col in dataset.columns: - if dataset[col].dtype == bool: - dataset[col] = dataset[col].astype(int) + def _negative_values_to_nan(self, dataset: pd.DataFrame) -> pd.DataFrame: + """Convert negative values to NaN.""" + preds = dataset[infer_predictor_col_name(df=dataset)] + + # Get all columns with negative values + cols_with_numerical_values = preds.select_dtypes(include=["number"]).columns + + numerical_columns_with_negative_values = [ + c for c in cols_with_numerical_values if preds[c].min() < 0 + ] + + df_to_replace = dataset[numerical_columns_with_negative_values] + + # Convert to NaN + df_to_replace[df_to_replace < 0] = np.nan + dataset[numerical_columns_with_negative_values] = df_to_replace 
return dataset @@ -59,7 +67,7 @@ def _negative_values_to_nan(self, dataset: pd.DataFrame) -> pd.DataFrame: return dataset - def transform(self, dataset: pd.DataFrame) -> pd.DataFrame: + def format(self, dataset: pd.DataFrame) -> pd.DataFrame: # Super hacky transformation of negative weights (?!) for chi-square. # In the future, we want to: # 1. Fix this in the feature generation for t2d # 2b. Always use z-score normalisation? dataset = self._negative_values_to_nan(dataset=dataset) - dataset = self.convert_timestamp_dtype_and_nat(dataset=dataset) - - if self.cfg.preprocessing.pre_split.convert_booleans_to_int: - dataset = self._convert_boolean_dtypes_to_int(dataset=dataset) - - msg.info("Finished processing dataset") + dataset = self._convert_timestamp_dtype_and_nat(dataset=dataset) return dataset diff --git a/src/psycop_model_training/preprocessing/pre_split/processors/value_transformer.py b/src/psycop_model_training/preprocessing/pre_split/processors/value_transformer.py new file mode 100644 index 00000000..4351866e --- /dev/null +++ b/src/psycop_model_training/preprocessing/pre_split/processors/value_transformer.py @@ -0,0 +1,74 @@ +import datetime +from typing import Optional + +import pandas as pd +from wasabi import Printer + +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema +from psycop_model_training.utils.decorators import print_df_dimensions_diff +from psycop_model_training.utils.utils import infer_predictor_col_name + +msg = Printer(timestamp=True) + + ++class PreSplitValueTransformer: + def __init__(self, cfg: FullConfigSchema) -> None: + self.cfg = cfg + + @print_df_dimensions_diff + def _convert_boolean_dtypes_to_int(self, dataset: pd.DataFrame) -> pd.DataFrame: + """Convert boolean dtypes to int.""" + for col in dataset.columns: + if dataset[col].dtype == bool: + dataset[col] = dataset[col].astype(int) + + return dataset + + def _convert_datetimes_to_ordinal(self, dataset: pd.DataFrame) -> pd.DataFrame: + """Convert datetime columns to integers.""" + + datetime_dtypes = {"datetime64[ns]", "<M8[ns]"} [...] + ) -> pd.DataFrame: + """Convert predictors to boolean.""" + columns = infer_predictor_col_name(df=dataset, prefix=self.cfg.data.pred_prefix) + + cols_to_round = [ + c + for c in columns + if (dataset[c].dtype not in ignore_dtypes) or c in columns_to_skip + ] + + for col in cols_to_round: + dataset[col] = dataset[col].notnull() + + return dataset + + def transform(self, dataset: pd.DataFrame) -> pd.DataFrame: + if self.cfg.preprocessing.pre_split.convert_booleans_to_int: + dataset = self._convert_boolean_dtypes_to_int(dataset=dataset) + + if self.cfg.preprocessing.pre_split.convert_datetimes_to_ordinal: + dataset = self._convert_datetimes_to_ordinal(dataset=dataset) + + if self.cfg.preprocessing.pre_split.convert_to_boolean: + dataset = self._convert_predictors_to_boolean(dataset=dataset) + + msg.info("Finished processing dataset") + + return dataset From 0d163889ae1385fbfd37fe740c4f00c5b4e08488 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Mon, 19 Dec 2022 10:43:23 +0100 Subject: [PATCH 18/47] style: linting --- .../preprocessing/pre_split/full_processor.py | 8 ++--- .../pre_split/processors/col_filter.py | 6 +++- .../{value_formatter.py => value_cleaner.py} | 30 +++++-------------- .../pre_split/processors/value_transformer.py | 11 ++++++- 4 files changed, 27 insertions(+), 28 deletions(-) rename
src/psycop_model_training/preprocessing/pre_split/processors/{value_formatter.py => value_cleaner.py} (73%) diff --git a/src/psycop_model_training/preprocessing/pre_split/full_processor.py b/src/psycop_model_training/preprocessing/pre_split/full_processor.py index d92480a6..0799570b 100644 --- a/src/psycop_model_training/preprocessing/pre_split/full_processor.py +++ b/src/psycop_model_training/preprocessing/pre_split/full_processor.py @@ -7,8 +7,8 @@ from psycop_model_training.preprocessing.pre_split.processors.row_filter import ( PreSplitRowFilter, ) -from psycop_model_training.preprocessing.pre_split.processors.value_formatter import ( - PreSplitValueFormatter, +from psycop_model_training.preprocessing.pre_split.processors.value_cleaner import ( + PreSplitValueCleaner, ) from psycop_model_training.preprocessing.pre_split.processors.value_transformer import ( PreSplitValueTransformer, @@ -23,11 +23,11 @@ def __init__(self, cfg): self.row_filterer = PreSplitRowFilter(cfg=cfg) self.col_filterer = PresSplitColFilter(cfg=cfg) self.value_transformer = PreSplitValueTransformer(cfg=cfg) - self.value_formatter = PreSplitValueFormatter(cfg=cfg) + self.value_cleaner = PreSplitValueCleaner(cfg=cfg) def process(self, dataset: pd.DataFrame): """Process a dataframe using the configuration.""" - dataset = self.value_formatter.format(dataset=dataset) + dataset = self.value_cleaner.clean(dataset=dataset) dataset = self.value_transformer.transform(dataset=dataset) dataset = self.row_filterer.filter(dataset=dataset) dataset = self.col_filterer.filter(dataset=dataset) diff --git a/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py b/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py index 0061656d..0dd95a28 100644 --- a/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py +++ b/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py @@ -1,3 +1,4 @@ +"""Module for filtering columns before split.""" import re from typing import Union @@ -15,6 +16,8 @@ class PresSplitColFilter: + """Class for filtering columns before split.""" + def __init__(self, cfg: FullConfigSchema) -> None: self.cfg = cfg @@ -208,7 +211,8 @@ def filter(self, dataset: pd.DataFrame) -> pd.DataFrame: if self.cfg.preprocessing.pre_split.drop_datetime_predictor_columns: dataset = self._drop_datetime_columns( - pred_prefix=self.cfg.data.pred_prefix, dataset=dataset + pred_prefix=self.cfg.data.pred_prefix, + dataset=dataset, ) return dataset diff --git a/src/psycop_model_training/preprocessing/pre_split/processors/value_formatter.py b/src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py similarity index 73% rename from src/psycop_model_training/preprocessing/pre_split/processors/value_formatter.py rename to src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py index 50182b47..8cf802d4 100644 --- a/src/psycop_model_training/preprocessing/pre_split/processors/value_formatter.py +++ b/src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py @@ -1,3 +1,5 @@ +"""Class for formatting values before split, e.g. assigning datetime, removing +negative values etc.""" import numpy as np import pandas as pd @@ -6,7 +8,10 @@ from psycop_model_training.utils.utils import infer_predictor_col_name -class PreSplitValueFormatter: +class PreSplitValueCleaner: + """Class for cleaning values before split, e.g. 
assigning datetime, + removing negative values etc.""" + def __init__(self, cfg: FullConfigSchema) -> None: self.cfg = cfg @@ -47,27 +52,8 @@ def _negative_values_to_nan(self, dataset: pd.DataFrame) -> pd.DataFrame: return dataset - @print_df_dimensions_diff - def _negative_values_to_nan(self, dataset: pd.DataFrame) -> pd.DataFrame: - """Convert negative values to NaN.""" - preds = dataset[infer_predictor_col_name(df=dataset)] - - # Get all columns with negative values - cols_with_numerical_values = preds.select_dtypes(include=["number"]).columns - - numerical_columns_with_negative_values = [ - c for c in cols_with_numerical_values if preds[c].min() < 0 - ] - - df_to_replace = dataset[numerical_columns_with_negative_values] - - # Convert to NaN - df_to_replace[df_to_replace < 0] = np.nan - dataset[numerical_columns_with_negative_values] = df_to_replace - - return dataset - - def format(self, dataset: pd.DataFrame) -> pd.DataFrame: + def clean(self, dataset: pd.DataFrame) -> pd.DataFrame: + """Apply the cleaning functions to the dataset.""" # Super hacky transformation of negative weights (?!) for chi-square. # In the future, we want to: # 1. Fix this in the feature generation for t2d diff --git a/src/psycop_model_training/preprocessing/pre_split/processors/value_transformer.py b/src/psycop_model_training/preprocessing/pre_split/processors/value_transformer.py index 4351866e..8be69681 100644 --- a/src/psycop_model_training/preprocessing/pre_split/processors/value_transformer.py +++ b/src/psycop_model_training/preprocessing/pre_split/processors/value_transformer.py @@ -1,4 +1,10 @@ -import datetime +"""Pre-split value transformer. These transformations are applied before the +split. + +To avoid test/train leakage, the transformations must not use any +information about the values in the dataset. +""" +from datetime import datetime from typing import Optional import pandas as pd @@ -12,6 +18,8 @@ class PreSplitValueTransformer: + """Pre-split value transformer.""" + def __init__(self, cfg: FullConfigSchema) -> None: self.cfg = cfg @@ -60,6 +68,7 @@ def _convert_predictors_to_boolean( return dataset def transform(self, dataset: pd.DataFrame) -> pd.DataFrame: + """Transform the dataset.""" if self.cfg.preprocessing.pre_split.convert_booleans_to_int: dataset = self._convert_boolean_dtypes_to_int(dataset=dataset) From ad0b8423e07e8a5adba8402f3385024d52040a6a Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Mon, 19 Dec 2022 11:06:16 +0100 Subject: [PATCH 19/47] docs: improve docs --- .../preprocessing/pre_split/full_processor.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/psycop_model_training/preprocessing/pre_split/full_processor.py b/src/psycop_model_training/preprocessing/pre_split/full_processor.py index 0799570b..985de335 100644 --- a/src/psycop_model_training/preprocessing/pre_split/full_processor.py +++ b/src/psycop_model_training/preprocessing/pre_split/full_processor.py @@ -13,12 +13,18 @@ from psycop_model_training.preprocessing.pre_split.processors.value_transformer import ( PreSplitValueTransformer, ) +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema class FullProcessor: - """Uses all PresSplit preprocessors.""" + """Uses all PresSplit preprocessors. Acts as an adapter in case we want to change the interfaces of its components. - def __init__(self, cfg): + I.e. 
if we want to make PresSplitValueTransformer a class that takes a set of arguments instead of a FullConfig, we can do that without changing FullProcessor's interface. + + This means we can refactor without breaking the package for our users. + """ + + def __init__(self, cfg: FullConfigSchema): self.cfg = cfg self.row_filterer = PreSplitRowFilter(cfg=cfg) self.col_filterer = PresSplitColFilter(cfg=cfg) From eccbd66e873a56a968774141989289b546aa22e7 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Mon, 19 Dec 2022 11:06:32 +0100 Subject: [PATCH 20/47] style: linting --- .../preprocessing/pre_split/full_processor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/psycop_model_training/preprocessing/pre_split/full_processor.py b/src/psycop_model_training/preprocessing/pre_split/full_processor.py index 985de335..90433804 100644 --- a/src/psycop_model_training/preprocessing/pre_split/full_processor.py +++ b/src/psycop_model_training/preprocessing/pre_split/full_processor.py @@ -17,7 +17,8 @@ class FullProcessor: - """Uses all PresSplit preprocessors. Acts as an adapter in case we want to change the interfaces of its components. + """Uses all PresSplit preprocessors. Acts as an adapter in case we want to + change the interfaces of its components. I.e. if we want to make PresSplitValueTransformer a class that takes a set of arguments instead of a FullConfig, we can do that without changing FullProcessor's interface. From b5e4e1a5c9ad6b41c7da605fda733e52242c8190 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Mon, 19 Dec 2022 12:37:38 +0100 Subject: [PATCH 21/47] refactor: move col name inference to col name inference folder --- .../examples/evaluate_model_from_file.py | 5 +- .../pre_split/processors/col_filter.py | 5 +- .../pre_split/processors/value_cleaner.py | 2 +- .../pre_split/processors/value_transformer.py | 2 +- .../utils/col_name_inference.py | 49 +++++++++++++++++++ src/psycop_model_training/utils/utils.py | 49 ------------------- 6 files changed, 55 insertions(+), 57 deletions(-) diff --git a/src/psycop_model_training/model_eval/examples/evaluate_model_from_file.py b/src/psycop_model_training/model_eval/examples/evaluate_model_from_file.py index 190ad0fb..57c9e8c1 100644 --- a/src/psycop_model_training/model_eval/examples/evaluate_model_from_file.py +++ b/src/psycop_model_training/model_eval/examples/evaluate_model_from_file.py @@ -14,12 +14,11 @@ from psycop_model_training.model_eval.plots import plot_auc_by_time_from_first_visit from psycop_model_training.utils.utils import ( PROJECT_ROOT, - infer_outcome_col_name, - infer_predictor_col_name, - infer_y_hat_prob_col_name, load_evaluation_data, read_pickle, ) +from psycop_model_training.utils.col_name_inference import infer_outcome_col_name, infer_predictor_col_name, \ + infer_y_hat_prob_col_name def load_model_predictions_and_cfg(path: Path) -> tuple[pd.DataFrame, DictConfig]: diff --git a/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py b/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py index 0dd95a28..2b4bf973 100644 --- a/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py +++ b/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py @@ -5,13 +5,12 @@ import pandas as pd from psycop_model_training.data_loader.data_loader import msg -from psycop_model_training.utils.col_name_inference import infer_look_distance +from psycop_model_training.utils.col_name_inference import infer_look_distance, 
infer_outcome_col_name, \ + infer_predictor_col_name from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema from psycop_model_training.utils.decorators import print_df_dimensions_diff from psycop_model_training.utils.utils import ( get_percent_lost, - infer_outcome_col_name, - infer_predictor_col_name, ) diff --git a/src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py b/src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py index 8cf802d4..a9678554 100644 --- a/src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py +++ b/src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py @@ -5,7 +5,7 @@ from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema from psycop_model_training.utils.decorators import print_df_dimensions_diff -from psycop_model_training.utils.utils import infer_predictor_col_name +from psycop_model_training.utils.col_name_inference import infer_predictor_col_name class PreSplitValueCleaner: diff --git a/src/psycop_model_training/preprocessing/pre_split/processors/value_transformer.py b/src/psycop_model_training/preprocessing/pre_split/processors/value_transformer.py index 8be69681..e8a88807 100644 --- a/src/psycop_model_training/preprocessing/pre_split/processors/value_transformer.py +++ b/src/psycop_model_training/preprocessing/pre_split/processors/value_transformer.py @@ -12,7 +12,7 @@ from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema from psycop_model_training.utils.decorators import print_df_dimensions_diff -from psycop_model_training.utils.utils import infer_predictor_col_name +from psycop_model_training.utils.col_name_inference import infer_predictor_col_name msg = Printer(timestamp=True) diff --git a/src/psycop_model_training/utils/col_name_inference.py b/src/psycop_model_training/utils/col_name_inference.py index 22348653..334d5dec 100644 --- a/src/psycop_model_training/utils/col_name_inference.py +++ b/src/psycop_model_training/utils/col_name_inference.py @@ -68,3 +68,52 @@ def infer_look_distance( ) return look_distances + + +def infer_col_names( + df: pd.DataFrame, + prefix: str, + allow_multiple: bool = True, +) -> list[str]: + """Infer col names based on prefix.""" + col_name = [c for c in df.columns if c.startswith(prefix)] + + if len(col_name) == 1: + return col_name + elif len(col_name) > 1: + if allow_multiple: + return col_name + raise ValueError( + f"Multiple columns found and allow_multiple is {allow_multiple}.", + ) + elif not col_name: + raise ValueError("No outcome col name inferred") + else: + raise ValueError("No outcomes inferred") + + +def infer_outcome_col_name( + df: pd.DataFrame, + prefix: str = "outc_", + allow_multiple: bool = True, +) -> list[str]: + """Infer the outcome column name from the dataframe.""" + return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple) + + +def infer_predictor_col_name( + df: pd.DataFrame, + prefix: str = "pred_", + allow_multiple: bool = True, +) -> list[str]: + """Get the predictors that are used in the model.""" + return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple) + + +def infer_y_hat_prob_col_name( + df: pd.DataFrame, + prefix="y_hat_prob", + allow_multiple: bool = False, +) -> list[str]: + """Infer the y_hat_prob column name from the dataframe.""" + return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple) diff --git a/src/psycop_model_training/utils/utils.py 
b/src/psycop_model_training/utils/utils.py index 6c5ce72d..70931891 100644 --- a/src/psycop_model_training/utils/utils.py +++ b/src/psycop_model_training/utils/utils.py @@ -449,55 +449,6 @@ def load_evaluation_data(model_data_dir: Path) -> ModelEvalData: ) -def infer_col_names( - df: pd.DataFrame, - prefix: str, - allow_multiple: bool = True, -) -> list[str]: - """Infer col names based on prefix.""" - col_name = [c for c in df.columns if c.startswith(prefix)] - - if len(col_name) == 1: - return col_name - elif len(col_name) > 1: - if allow_multiple: - return col_name - raise ValueError( - f"Multiple columns found and allow_multiple is {allow_multiple}.", - ) - elif not col_name: - raise ValueError("No outcome col name inferred") - else: - raise ValueError("No outcomes inferred") - - -def infer_outcome_col_name( - df: pd.DataFrame, - prefix: str = "outc_", - allow_multiple: bool = True, -) -> list[str]: - """Infer the outcome column name from the dataframe.""" - return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple) - - -def infer_predictor_col_name( - df: pd.DataFrame, - prefix: str = "pred_", - allow_multiple: bool = True, -) -> list[str]: - """Get the predictors that are used in the model.""" - return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple) - - -def infer_y_hat_prob_col_name( - df: pd.DataFrame, - prefix="y_hat_prob", - allow_multiple: bool = False, -) -> list[str]: - """Infer the y_hat_prob column name from the dataframe.""" - return infer_col_names(df=df, prefix=prefix, allow_multiple=allow_multiple) - - def get_percent_lost(n_before: Union[int, float], n_after: Union[int, float]) -> float: """Get the percent lost.""" return round((100 * (1 - n_after / n_before)), 2) From 668cddaa5a02923c3791b84782143f8c7ec0b507 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Mon, 19 Dec 2022 12:38:06 +0100 Subject: [PATCH 22/47] style: linting --- .../model_eval/examples/evaluate_model_from_file.py | 7 +++++-- .../preprocessing/pre_split/processors/col_filter.py | 11 ++++++----- .../pre_split/processors/value_cleaner.py | 2 +- .../pre_split/processors/value_transformer.py | 2 +- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/psycop_model_training/model_eval/examples/evaluate_model_from_file.py b/src/psycop_model_training/model_eval/examples/evaluate_model_from_file.py index 57c9e8c1..0fc601d8 100644 --- a/src/psycop_model_training/model_eval/examples/evaluate_model_from_file.py +++ b/src/psycop_model_training/model_eval/examples/evaluate_model_from_file.py @@ -12,13 +12,16 @@ from omegaconf import DictConfig from psycop_model_training.model_eval.plots import plot_auc_by_time_from_first_visit +from psycop_model_training.utils.col_name_inference import ( + infer_outcome_col_name, + infer_predictor_col_name, + infer_y_hat_prob_col_name, +) from psycop_model_training.utils.utils import ( PROJECT_ROOT, load_evaluation_data, read_pickle, ) -from psycop_model_training.utils.col_name_inference import infer_outcome_col_name, infer_predictor_col_name, \ - infer_y_hat_prob_col_name def load_model_predictions_and_cfg(path: Path) -> tuple[pd.DataFrame, DictConfig]: diff --git a/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py b/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py index 2b4bf973..602e175d 100644 --- a/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py +++ b/src/psycop_model_training/preprocessing/pre_split/processors/col_filter.py @@ -5,13 +5,14 @@ 
import pandas as pd from psycop_model_training.data_loader.data_loader import msg -from psycop_model_training.utils.col_name_inference import infer_look_distance, infer_outcome_col_name, \ - infer_predictor_col_name +from psycop_model_training.utils.col_name_inference import ( + infer_look_distance, + infer_outcome_col_name, + infer_predictor_col_name, +) from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema from psycop_model_training.utils.decorators import print_df_dimensions_diff -from psycop_model_training.utils.utils import ( - get_percent_lost, -) +from psycop_model_training.utils.utils import get_percent_lost class PresSplitColFilter: diff --git a/src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py b/src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py index a9678554..b25e3c5f 100644 --- a/src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py +++ b/src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py @@ -3,9 +3,9 @@ import numpy as np import pandas as pd +from psycop_model_training.utils.col_name_inference import infer_predictor_col_name from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema from psycop_model_training.utils.decorators import print_df_dimensions_diff -from psycop_model_training.utils.col_name_inference import infer_predictor_col_name class PreSplitValueCleaner: diff --git a/src/psycop_model_training/preprocessing/pre_split/processors/value_transformer.py b/src/psycop_model_training/preprocessing/pre_split/processors/value_transformer.py index e8a88807..92161290 100644 --- a/src/psycop_model_training/preprocessing/pre_split/processors/value_transformer.py +++ b/src/psycop_model_training/preprocessing/pre_split/processors/value_transformer.py @@ -10,9 +10,9 @@ import pandas as pd from wasabi import Printer +from psycop_model_training.utils.col_name_inference import infer_predictor_col_name from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema from psycop_model_training.utils.decorators import print_df_dimensions_diff -from psycop_model_training.utils.col_name_inference import infer_predictor_col_name msg = Printer(timestamp=True) From cb361e5fcb9f20ae37eedabdaefaec22787fad21 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Mon, 19 Dec 2022 12:39:26 +0100 Subject: [PATCH 23/47] fix: broken imports --- application/main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/application/main.py b/application/main.py index 5223106e..9e54abe9 100644 --- a/application/main.py +++ b/application/main.py @@ -16,15 +16,15 @@ from wasabi import Printer from psycop_model_training.data_loader.utils import load_train_raw -from psycop_model_training.model_eval.evaluate_model import ( +from psycop_model_training.utils.col_name_inference import ( infer_look_distance, infer_outcome_col_name, ) -from psycop_model_training.utils.config_schemas import ( +from psycop_model_training.utils.config_schemas.conf_utils import ( BaseModel, - FullConfigSchema, load_test_cfg_as_pydantic, ) +from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema def start_trainer( From cef9e6bdd558db1aa0f1d8b60fe406268a15cc3d Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Mon, 19 Dec 2022 12:39:33 +0100 Subject: [PATCH 24/47] build: update dataset dir --- application/config/data/t2d_parquet.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/application/config/data/t2d_parquet.yaml b/application/config/data/t2d_parquet.yaml
index 8eac4d70..f794f734 100644
--- a/application/config/data/t2d_parquet.yaml
+++ b/application/config/data/t2d_parquet.yaml
@@ -2,7 +2,7 @@ data:
   # General config
   n_training_samples: null
-  dir: E:\shared_resources\feature_sets\t2d\feature_sets\psycop_t2d_adminmanber_1090_features_2022_11_18_14_10
+  dir: E:\shared_resources\t2d\feature_sets\psycop_t2d_adminmanber_features_2022_12_14_13_17
   suffix: parquet

   min_age: 18

From 17e45f00dabae1954b9d4712ab28c489f0efeb9e Mon Sep 17 00:00:00 2001
From: Martin Bernstorff
Date: Mon, 19 Dec 2022 12:52:41 +0100
Subject: [PATCH 25/47] refactor: split preprocessing config into pre_split and post_split sections

---
 application/config/data/t2d_parquet.yaml | 22 +----------
 .../preprocessing/default_preprocessing.yaml | 38 ++++++++++++-------
 .../config/project/default_project.yaml | 8 +---
 application/main.py | 3 +-
 .../data_loader/utils.py | 11 +++++-
 .../utils/config_schemas/conf_utils.py | 18 ++++++++-
 6 files changed, 55 insertions(+), 45 deletions(-)

diff --git a/application/config/data/t2d_parquet.yaml b/application/config/data/t2d_parquet.yaml
index f794f734..258854ec 100644
--- a/application/config/data/t2d_parquet.yaml
+++ b/application/config/data/t2d_parquet.yaml
@@ -2,16 +2,8 @@ data:
   # General config
   n_training_samples: null
-  dir: E:\shared_resources\t2d\feature_sets\psycop_t2d_adminmanber_features_2022_12_14_13_17
+  dir: E:\shared_resources\t2d\feature_sets\psycop_t2d_adminmanber_features_2022_12_14_12_05\
   suffix: parquet
-  min_age: 18
-
-  # Patient exclusion criteria
-  drop_patient_if_exclusion_before_date: 2013-01-01
-
-  # Prediction time exclusion criteria
-  min_prediction_time_date: 2013-01-01
-  min_lookahead_days: 1825

   # Feature specs
   pred_prefix: pred_
@@ -24,14 +16,4 @@ data:
     age: pred_age_in_years
     exclusion_timestamp: _timestamp_exclusion
     custom:
-      n_hba1c: eval_hba1c_within_9999_days_count_fallback_nan
-
-  lookbehind_combination: [30, 90, 180, 365, 730]
-
-
-
-# Parameters that will only take effect if running with --multirun
-hydra:
-  sweeper:
-    params:
-      ++data.lookbehind_combination: choice([30, 90, 180, 365, 730], [30, 180, 730], [730], [365], [90], [30])
+      n_hba1c: eval_hba1c_within_9999_days_count_fallback_nan
\ No newline at end of file
diff --git a/application/config/preprocessing/default_preprocessing.yaml b/application/config/preprocessing/default_preprocessing.yaml
index ad95e66e..36cc5270 100644
--- a/application/config/preprocessing/default_preprocessing.yaml
+++ b/application/config/preprocessing/default_preprocessing.yaml
@@ -1,20 +1,30 @@
 # @package _global_
 preprocessing:
-  convert_to_boolean: false
-  convert_booleans_to_int: true
-  drop_datetime_predictor_columns: true
-  convert_datetimes_to_ordinal: false
-  imputation_method: most_frequent
-  scaling: z-score-normalisation
-  feature_selection:
-    name: chi2
-    params:
-      percentile: 20 # (int): Percent of features to keep. Defaults to 10.
+  pre_split:
+    convert_to_boolean: false
+    convert_booleans_to_int: true
+    drop_datetime_predictor_columns: true
+    convert_datetimes_to_ordinal: false
+    drop_patient_if_exclusion_before_date: 2013-01-01
+    min_prediction_time_date: 2013-01-01
+    min_lookahead_days: 1825
+    lookbehind_combination: [30, 90, 180, 365, 730]
+    min_age: 18
+  post_split:
+    imputation_method: most_frequent
+    scaling: z-score-normalisation
+    feature_selection:
+      name: chi2
+      params:
+        percentile: 20 # (int): Percent of features to keep. Defaults to 10.
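# A sketch of how the nested keys above (and the sweeper params just below)
# are driven from the command line; the invocation is assumed for illustration
# and is not part of this patch:
#
#   python train_model.py --multirun \
#     '++preprocessing.post_split.imputation_method=most_frequent' \
#     '++preprocessing.pre_split.lookbehind_combination=[30,90,180,365,730]'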
+# Parameters that will only take effect if running with --multirun hydra: sweeper: params: - ++preprocessing.imputation_method: choice("most_frequent", "mean", "median", "null") - ++preprocessing.scaling: choice("z-score-normalization", "null") - ++preprocessing.feature_selection.name: choice("chi2", "null") - ++preprocessing.feature_selection.params.percentile: int(tag(log, interval(1, 90))) + ++preprocessing.post_split.imputation_method: choice("most_frequent", "mean", "median", "null") + ++preprocessing.post_split.scaling: choice("z-score-normalization", "null") + ++preprocessing.post_split.feature_selection.name: choice("chi2", "null") + ++preprocessing.post_splitfeature_selection.params.percentile: int(tag(log, interval(1, 90))) + ++preprocessing.pre_split.lookbehind_combination: choice([30, 90, 180, 365, 730], [30, 180, 730], [730], [365], [90], [30]) + diff --git a/application/config/project/default_project.yaml b/application/config/project/default_project.yaml index cb8d6eb6..3cb495e1 100644 --- a/application/config/project/default_project.yaml +++ b/application/config/project/default_project.yaml @@ -5,11 +5,5 @@ wandb: entity: psycop # Which entity to run WanDB in. mode: run # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" group: psycop-t2d # Which group to run WanDB in. - -watcher: - archive_all: false - keep_alive_after_training_minutes: 5 - n_runs_before_eval: 1 - verbose: true - + gpu: true diff --git a/application/main.py b/application/main.py index 9e54abe9..fe30ff6e 100644 --- a/application/main.py +++ b/application/main.py @@ -22,6 +22,7 @@ ) from psycop_model_training.utils.config_schemas.conf_utils import ( BaseModel, + load_app_cfg_as_pydantic, load_test_cfg_as_pydantic, ) from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema @@ -202,7 +203,7 @@ def main(): else: config_file_name = "default_config.yaml" - cfg = load_test_cfg_as_pydantic(config_file_name=config_file_name) + cfg = load_app_cfg_as_pydantic(config_file_name=config_file_name) random_word = RandomWords() wandb_group = f"{random_word.get_random_word()}-{random_word.get_random_word()}" diff --git a/src/psycop_model_training/data_loader/utils.py b/src/psycop_model_training/data_loader/utils.py index 885e16a9..4c6f078d 100644 --- a/src/psycop_model_training/data_loader/utils.py +++ b/src/psycop_model_training/data_loader/utils.py @@ -7,6 +7,9 @@ from psycop_model_training.data_loader.data_classes import SplitDataset from psycop_model_training.data_loader.data_loader import DataLoader from psycop_model_training.preprocessing.pre_split.full_processor import FullProcessor +from psycop_model_training.preprocessing.pre_split.processors.value_cleaner import ( + PreSplitValueCleaner, +) from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema @@ -55,7 +58,9 @@ def load_and_filter_train_and_val_from_cfg(cfg: FullConfigSchema): ) -def load_train_raw(cfg: FullConfigSchema): +def load_train_raw( + cfg: FullConfigSchema, convert_timestamp_types_and_nans: bool = True +) -> pd.DataFrame: """Load the data.""" path = Path(cfg.data.dir) file_names = list(path.glob(pattern=r"*train*")) @@ -68,7 +73,9 @@ def load_train_raw(cfg: FullConfigSchema): elif file_suffix == ".csv": df = pd.read_csv(file_name) - df = DataLoader.convert_timestamp_dtype_and_nat(dataset=df) + # Helpful during tests to convert columns with matching names to datetime + if convert_timestamp_types_and_nans: + df = 
PreSplitValueCleaner._convert_timestamp_dtype_and_nat(dataset=df) return df diff --git a/src/psycop_model_training/utils/config_schemas/conf_utils.py b/src/psycop_model_training/utils/config_schemas/conf_utils.py index f9d0c556..fd35f53a 100644 --- a/src/psycop_model_training/utils/config_schemas/conf_utils.py +++ b/src/psycop_model_training/utils/config_schemas/conf_utils.py @@ -23,12 +23,27 @@ def convert_omegaconf_to_pydantic_object( return FullConfigSchema(**conf, allow_mutation=allow_mutation) +def load_app_cfg_as_pydantic( + config_file_name, + allow_mutation: bool = False, + overrides: Optional[list[str]] = None, +): + cfg = load_test_cfg_as_omegaconf( + config_file_name=config_file_name, + overrides=overrides, + config_dir_path_rel="../../../../application/config/", + ) + + return convert_omegaconf_to_pydantic_object(conf=cfg, allow_mutation=allow_mutation) + + def load_test_cfg_as_omegaconf( config_file_name: str, + config_dir_path_rel: str, overrides: Optional[list[str]] = None, ) -> DictConfig: """Load config as omegaconf object.""" - with initialize(version_base=None, config_path="../../../../tests/config/"): + with initialize(version_base=None, config_path=config_dir_path_rel): if overrides: cfg = compose( config_name=config_file_name, @@ -60,6 +75,7 @@ def load_test_cfg_as_pydantic( cfg = load_test_cfg_as_omegaconf( config_file_name=config_file_name, overrides=overrides, + config_dir_path_rel="../../../../tests/config/", ) return convert_omegaconf_to_pydantic_object(conf=cfg, allow_mutation=allow_mutation) From ef4e9254f6cbbf8d510eaf61605b86a9cec7bbb5 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Mon, 19 Dec 2022 13:54:37 +0100 Subject: [PATCH 26/47] fix: incorrect paths --- application/main.py | 5 ++--- src/psycop_model_training/training/train_and_eval.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/application/main.py b/application/main.py index fe30ff6e..788eaed7 100644 --- a/application/main.py +++ b/application/main.py @@ -23,7 +23,6 @@ from psycop_model_training.utils.config_schemas.conf_utils import ( BaseModel, load_app_cfg_as_pydantic, - load_test_cfg_as_pydantic, ) from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema @@ -40,13 +39,13 @@ def start_trainer( subprocess_args: list[str] = [ "python", - "src/psycop_model_training/train_model.py", + "application/train_model.py", f"project.wandb.group='{wandb_group_override}'", f"project.wandb.mode={cfg.project.wandb.mode}", f"hydra.sweeper.n_trials={cfg.train.n_trials_per_lookahead}", f"hydra.sweeper.n_jobs={cfg.train.n_jobs_per_trainer}", f"model={model_name}", - f"data.min_lookahead_days={lookahead_days}", + f"preprocessing.pre_split.min_lookahead_days={lookahead_days}", "--config-name", f"{config_file_name}", ] diff --git a/src/psycop_model_training/training/train_and_eval.py b/src/psycop_model_training/training/train_and_eval.py index 62f4ae2d..2061d9e7 100644 --- a/src/psycop_model_training/training/train_and_eval.py +++ b/src/psycop_model_training/training/train_and_eval.py @@ -17,7 +17,7 @@ from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema from psycop_model_training.utils.utils import PROJECT_ROOT -CONFIG_PATH = PROJECT_ROOT / "src" / "psycop_model_training" / "config" +CONFIG_PATH = PROJECT_ROOT / "application" / "config" # Handle wandb not playing nice with joblib os.environ["WANDB_START_METHOD"] = "thread" From eee7768ca74fce9b933c00ee778dba938c1680b5 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Mon, 19 Dec 
2022 14:15:07 +0100 Subject: [PATCH 27/47] fix: rename preprocessing keys --- .../config/model/logistic-regression.yaml | 2 +- application/config/model/naive-bayes.yaml | 2 +- .../preprocessing/default_preprocessing.yaml | 2 +- ...gration_config.yaml => default_config.yaml} | 0 tests/config/model/logistic-regression.yaml | 2 +- tests/config/model/naive-bayes.yaml | 2 +- .../preprocessing/default_preprocessing.yaml | 2 +- tests/test_configs.py | 18 ++++++++++++++++-- 8 files changed, 22 insertions(+), 8 deletions(-) rename tests/config/{integration_config.yaml => default_config.yaml} (100%) diff --git a/application/config/model/logistic-regression.yaml b/application/config/model/logistic-regression.yaml index 30ab21c4..e4b7817c 100644 --- a/application/config/model/logistic-regression.yaml +++ b/application/config/model/logistic-regression.yaml @@ -22,4 +22,4 @@ hydra: ++model.args.C: interval(1e-5, 1.0) ++model.args.l1_ratio: interval(1e-5, 1.0) # preprocessing - ++preprocessing.scaling: choice("null", "z-score-normalization") + ++preprocessing.post_split.scaling: choice("null", "z-score-normalization") diff --git a/application/config/model/naive-bayes.yaml b/application/config/model/naive-bayes.yaml index cd605228..23899ce4 100644 --- a/application/config/model/naive-bayes.yaml +++ b/application/config/model/naive-bayes.yaml @@ -10,4 +10,4 @@ hydra: sweeper: params: # preprocessing - ++preprocessing.scaling: choice(null, "z-score-normalization") + ++preprocessing.post_split.scaling: choice(null, "z-score-normalization") diff --git a/application/config/preprocessing/default_preprocessing.yaml b/application/config/preprocessing/default_preprocessing.yaml index 36cc5270..dcda18c1 100644 --- a/application/config/preprocessing/default_preprocessing.yaml +++ b/application/config/preprocessing/default_preprocessing.yaml @@ -25,6 +25,6 @@ hydra: ++preprocessing.post_split.imputation_method: choice("most_frequent", "mean", "median", "null") ++preprocessing.post_split.scaling: choice("z-score-normalization", "null") ++preprocessing.post_split.feature_selection.name: choice("chi2", "null") - ++preprocessing.post_splitfeature_selection.params.percentile: int(tag(log, interval(1, 90))) + ++preprocessing.post_split.feature_selection.params.percentile: int(tag(log, interval(1, 90))) ++preprocessing.pre_split.lookbehind_combination: choice([30, 90, 180, 365, 730], [30, 180, 730], [730], [365], [90], [30]) diff --git a/tests/config/integration_config.yaml b/tests/config/default_config.yaml similarity index 100% rename from tests/config/integration_config.yaml rename to tests/config/default_config.yaml diff --git a/tests/config/model/logistic-regression.yaml b/tests/config/model/logistic-regression.yaml index 30ab21c4..e4b7817c 100644 --- a/tests/config/model/logistic-regression.yaml +++ b/tests/config/model/logistic-regression.yaml @@ -22,4 +22,4 @@ hydra: ++model.args.C: interval(1e-5, 1.0) ++model.args.l1_ratio: interval(1e-5, 1.0) # preprocessing - ++preprocessing.scaling: choice("null", "z-score-normalization") + ++preprocessing.post_split.scaling: choice("null", "z-score-normalization") diff --git a/tests/config/model/naive-bayes.yaml b/tests/config/model/naive-bayes.yaml index cd605228..23899ce4 100644 --- a/tests/config/model/naive-bayes.yaml +++ b/tests/config/model/naive-bayes.yaml @@ -10,4 +10,4 @@ hydra: sweeper: params: # preprocessing - ++preprocessing.scaling: choice(null, "z-score-normalization") + ++preprocessing.post_split.scaling: choice(null, "z-score-normalization") diff --git 
a/tests/config/preprocessing/default_preprocessing.yaml b/tests/config/preprocessing/default_preprocessing.yaml
index f47e331e..ee23e6b5 100644
--- a/tests/config/preprocessing/default_preprocessing.yaml
+++ b/tests/config/preprocessing/default_preprocessing.yaml
@@ -25,5 +25,5 @@ hydra:
       ++preprocessing.post_split.imputation_method: choice("most_frequent", "mean", "median", "null")
       ++preprocessing.post_split.scaling: choice("z-score-normalization", "null")
       ++preprocessing.post_split.feature_selection.name: choice("chi2", "null")
-      ++preprocessing.post_splitfeature_selection.params.percentile: int(tag(log, interval(1, 90)))
+      ++preprocessing.post_split.feature_selection.params.percentile: int(tag(log, interval(1, 90)))
       preprocessing.pre_split.lookbehind_combination: choice([30, 90], [30])
diff --git a/tests/test_configs.py b/tests/test_configs.py
index f54369bf..f1b0a539 100644
--- a/tests/test_configs.py
+++ b/tests/test_configs.py
@@ -12,15 +12,29 @@
 CONFIG_DIR_PATH_ABS = PROJECT_ROOT / "tests" / "config"
 CONFIG_DIR_PATH_REL = "../tests/config"

+CONFIG_DIR_PATH_APP_ABS = PROJECT_ROOT / "application" / "config"
+

 def get_config_file_names() -> list[str]:
     """Get all config file names."""
-    config_file_paths: list[Path] = list(CONFIG_DIR_PATH_ABS.glob("*.yaml"))
+    config_file_paths = list(CONFIG_DIR_PATH_ABS.glob("*.yaml"))
+
     return [f"{path.stem}.yaml" for path in config_file_paths]


 @pytest.mark.parametrize("config_file_name", get_config_file_names())
-def test_configs(config_file_name):
+def test_test_configs(config_file_name):
+    """Test that all configs load correctly."""
+    with initialize(version_base=None, config_path=CONFIG_DIR_PATH_REL):
+        cfg = compose(
+            config_name=config_file_name,
+        )
+
+    cfg = convert_omegaconf_to_pydantic_object(conf=cfg)
+
+
+@pytest.mark.parametrize("config_file_name", get_config_file_names())
+def test_app_configs(config_file_name):
     """Test that all configs load correctly."""
-    with initialize(version_base=None, config_path=CONFIG_DIR_PATH_REL):
+    with initialize(version_base=None, config_path="../application/config"):
         cfg = compose(
             config_name=config_file_name,
From fcd7cd22069e2c68d6dff3b1a60d808fb1b7455c Mon Sep 17 00:00:00 2001
From: Martin Bernstorff
Date: Mon, 19 Dec 2022 14:21:09 +0100
Subject: [PATCH 28/47] fix: sleep between workers to avoid segfaults

---
 application/main.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/application/main.py b/application/main.py
index 788eaed7..dc4a629a 100644
--- a/application/main.py
+++ b/application/main.py
@@ -147,6 +147,9 @@ def train_models_for_each_cell_in_grid(
                 model_name=trainer_spec.model_name,
             ),
         )
+
+        # Sleep a bit to avoid segfaults
+        time.sleep(10)


 def get_possible_lookaheads(
@@ -194,12 +197,7 @@ def main():
     """Main."""
     msg = Printer(timestamp=True)

-    debug = False
-
-    if debug:
-        config_file_name = "integration_config.yaml"
-    else:
-        config_file_name = "default_config.yaml"
+    config_file_name = "default_config.yaml"

     cfg = load_app_cfg_as_pydantic(config_file_name=config_file_name)
From f2ac4dc9cf760c3687065adfa2cc44bf9f7779a4 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff
Date: Mon, 19 Dec 2022 14:21:47 +0100
Subject: [PATCH 29/47] feat: add model-training suffix to trainer

---
 application/train_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/application/train_model.py b/application/train_model.py
index 79488bd4..f57fa4b9 100644
--- a/application/train_model.py
+++ b/application/train_model.py
@@ -61,7 +61,7 @@ def main(cfg: DictConfig):
     create_wandb_folders()

     run = wandb.init(
-        project=cfg.project.name,
+
project=f"{cfg.project.name}-baseline-model-training", reinit=True, config=dict_config_to_log, mode=cfg.project.wandb.mode, From aaaf52f16d0d07701a106439f57e229105718619 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Mon, 19 Dec 2022 14:22:46 +0100 Subject: [PATCH 30/47] feat: update project name --- application/config/project/default_project.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/application/config/project/default_project.yaml b/application/config/project/default_project.yaml index 3cb495e1..0447cf73 100644 --- a/application/config/project/default_project.yaml +++ b/application/config/project/default_project.yaml @@ -1,9 +1,9 @@ -name: psycop-t2d +name: t2d seed: 42 wandb: entity: psycop # Which entity to run WanDB in. mode: run # Which mode to run WanDB in. Takes "run", "dryrun", "offline" and "disabled" - group: psycop-t2d # Which group to run WanDB in. + group: t2d # Which group to run WanDB in. gpu: true From ef3df84b974c1f2a27cfbd486aa681d1f4c94286 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Mon, 19 Dec 2022 15:42:56 +0100 Subject: [PATCH 31/47] feat: add wandb alert on exception --- application/main.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/application/main.py b/application/main.py index dc4a629a..98550797 100644 --- a/application/main.py +++ b/application/main.py @@ -12,6 +12,7 @@ import pandas as pd import wandb +from psycopmlutils.wandb.wandb_try_except_decorator import wandb_alert_on_exception from random_word import RandomWords from wasabi import Printer @@ -24,6 +25,7 @@ BaseModel, load_app_cfg_as_pydantic, ) +from psycop_model_training.utils.config_schemas.data import ColumnNamesSchema from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema @@ -147,7 +149,7 @@ def train_models_for_each_cell_in_grid( model_name=trainer_spec.model_name, ), ) - + # Sleep a bit to avoid segfaults time.sleep(10) @@ -194,6 +196,26 @@ def get_possible_lookaheads( return list(set(possible_lookahead_days) - set(lookaheads_without_rows)) +def check_columns_exist_in_dataset(cfg: ColumnNamesSchema, df: pd.DataFrame): + # Iterate over attributes in the config + missing_columns = [] + for attr in dir(cfg): + # Skip private attributes + if attr.startswith("_"): + continue + + # Check that the attribute is a string + if not isinstance(getattr(cfg, attr), str): + continue + + # Check that the column exists in the dataset + if not getattr(cfg, attr) in df: + missing_columns.append(getattr(cfg, attr)) + + if missing_columns: + raise ValueError(f"Columns in config but not in dataset: {missing_columns}") + +@wandb_alert_on_exception def main(): """Main.""" msg = Printer(timestamp=True) @@ -216,6 +238,8 @@ def main(): # Load dataset without dropping any rows for inferring # which look distances to grid search over train = load_train_raw(cfg=cfg) + + check_columns_exist_in_dataset(cfg=cfg.data.col_name, df=train) possible_lookaheads = get_possible_lookaheads( msg=msg, From 7208e1372c0d975420611fd6f98b27c6879d12f9 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 22 Dec 2022 09:43:07 +0100 Subject: [PATCH 32/47] feat: check that specified col names exist in dataset --- application/main.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/application/main.py b/application/main.py index 98550797..70537d38 100644 --- a/application/main.py +++ b/application/main.py @@ -197,13 +197,11 @@ def get_possible_lookaheads( def 
check_columns_exist_in_dataset(cfg: ColumnNamesSchema, df: pd.DataFrame): + """Check that all columns in the config exist in the dataset.""" # Iterate over attributes in the config missing_columns = [] - for attr in dir(cfg): - # Skip private attributes - if attr.startswith("_"): - continue + for attr in dir(cfg): # Check that the attribute is a string if not isinstance(getattr(cfg, attr), str): continue @@ -213,7 +211,10 @@ def check_columns_exist_in_dataset(cfg: ColumnNamesSchema, df: pd.DataFrame): missing_columns.append(getattr(cfg, attr)) if missing_columns: - raise ValueError(f"Columns in config but not in dataset: {missing_columns}") + raise ValueError( + f"Columns in config but not in dataset: {missing_columns}. Columns in dataset: {df.columns}" + ) + @wandb_alert_on_exception def main(): @@ -238,7 +239,7 @@ def main(): # Load dataset without dropping any rows for inferring # which look distances to grid search over train = load_train_raw(cfg=cfg) - + check_columns_exist_in_dataset(cfg=cfg.data.col_name, df=train) possible_lookaheads = get_possible_lookaheads( From 7eac89211356c4d25a3d66ccc09068c1ee36a7f2 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 22 Dec 2022 09:43:18 +0100 Subject: [PATCH 33/47] refactor: make data config file name generic --- application/config/data/{t2d_parquet.yaml => default_data.yaml} | 2 +- application/config/default_config.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename application/config/data/{t2d_parquet.yaml => default_data.yaml} (93%) diff --git a/application/config/data/t2d_parquet.yaml b/application/config/data/default_data.yaml similarity index 93% rename from application/config/data/t2d_parquet.yaml rename to application/config/data/default_data.yaml index 258854ec..27d94b82 100644 --- a/application/config/data/t2d_parquet.yaml +++ b/application/config/data/default_data.yaml @@ -2,7 +2,7 @@ data: # General config n_training_samples: null - dir: E:\shared_resources\t2d\feature_sets\psycop_t2d_adminmanber_features_2022_12_14_12_05\ + dir: E:\shared_resources\t2d\feature_sets\psycop_t2d_adminmanber_features_2022_12_19_15_36 suffix: parquet # Feature specs diff --git a/application/config/default_config.yaml b/application/config/default_config.yaml index 46b91517..46192816 100644 --- a/application/config/default_config.yaml +++ b/application/config/default_config.yaml @@ -1,7 +1,7 @@ # @package _global_ defaults: - project: default_project - - data: t2d_parquet + - data: default_data - preprocessing: default_preprocessing - model: xgboost - train: default_training From ee61991bbe4a884c2780ad96e1df8b0458d09b1a Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 22 Dec 2022 09:44:02 +0100 Subject: [PATCH 34/47] fix: only check col names in col schema --- application/config/data/default_data.yaml | 4 ++-- application/main.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/application/config/data/default_data.yaml b/application/config/data/default_data.yaml index 27d94b82..192d0f3f 100644 --- a/application/config/data/default_data.yaml +++ b/application/config/data/default_data.yaml @@ -11,9 +11,9 @@ data: col_name: pred_timestamp: timestamp - outcome_timestamp: _t2d + outcome_timestamp: timestamp_first_t2d_hba1c id: dw_ek_borger age: pred_age_in_years - exclusion_timestamp: _timestamp_exclusion + exclusion_timestamp: timestamp_exclusion custom: n_hba1c: eval_hba1c_within_9999_days_count_fallback_nan \ No newline at end of file diff --git a/application/main.py b/application/main.py index 
70537d38..bfcaea7f 100644 --- a/application/main.py +++ b/application/main.py @@ -202,7 +202,11 @@ def check_columns_exist_in_dataset(cfg: ColumnNamesSchema, df: pd.DataFrame): missing_columns = [] for attr in dir(cfg): - # Check that the attribute is a string + # Skip private attributes + if attr.startswith("_"): + continue + + # Skip col names that are not string if not isinstance(getattr(cfg, attr), str): continue From 7ae1bb0658fe1039ca497f88acb1b29c4bba185a Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 22 Dec 2022 09:50:05 +0100 Subject: [PATCH 35/47] feat: unify wandb group naming --- application/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/main.py b/application/main.py index bfcaea7f..52fa076e 100644 --- a/application/main.py +++ b/application/main.py @@ -233,7 +233,7 @@ def main(): wandb_group = f"{random_word.get_random_word()}-{random_word.get_random_word()}" wandb.init( - project=cfg.project.name, + project=f"{cfg.project.name}-baseline-model-training", mode=cfg.project.wandb.mode, group=wandb_group, entity=cfg.project.wandb.entity, From f47d8b10cd6a200e42577769058156d6b17f55af Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 22 Dec 2022 10:17:41 +0100 Subject: [PATCH 36/47] fix: imputation method is not required --- src/psycop_model_training/utils/config_schemas/preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/psycop_model_training/utils/config_schemas/preprocessing.py b/src/psycop_model_training/utils/config_schemas/preprocessing.py index b082d877..e0008503 100644 --- a/src/psycop_model_training/utils/config_schemas/preprocessing.py +++ b/src/psycop_model_training/utils/config_schemas/preprocessing.py @@ -45,7 +45,7 @@ class PreSplitPreprocessingConfigSchema(BaseModel): class PostSplitPreprocessingConfigSchema(BaseModel): - imputation_method: Literal["most_frequent", "mean", "median", "null"] + imputation_method: Optional[Literal["most_frequent", "mean", "median", "null"]] # How to replace missing values. Takes all values from the sklearn.impute.SimpleImputer class. 
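# (A sketch of the intended mapping, with the pipeline wiring assumed rather
# than shown in this patch: None, i.e. the hydra "null" choice, is taken to
# mean "skip the imputation step", while e.g. "most_frequent" would be passed
# through as SimpleImputer(strategy="most_frequent").)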
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html From c89d67dbd31183443ee4901bf75054d13c27f488 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 22 Dec 2022 10:42:45 +0100 Subject: [PATCH 37/47] refactor: move column exists validation to DataLoader --- application/main.py | 30 ++--------------- .../data_loader/col_name_checker.py | 30 +++++++++++++++++ .../data_loader/data_loader.py | 32 +++++++++++++++---- 3 files changed, 57 insertions(+), 35 deletions(-) create mode 100644 src/psycop_model_training/data_loader/col_name_checker.py diff --git a/application/main.py b/application/main.py index 52fa076e..08f823c2 100644 --- a/application/main.py +++ b/application/main.py @@ -16,6 +16,7 @@ from random_word import RandomWords from wasabi import Printer +from psycop_model_training.data_loader.data_loader import DataLoader from psycop_model_training.data_loader.utils import load_train_raw from psycop_model_training.utils.col_name_inference import ( infer_look_distance, @@ -25,7 +26,6 @@ BaseModel, load_app_cfg_as_pydantic, ) -from psycop_model_training.utils.config_schemas.data import ColumnNamesSchema from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema @@ -196,30 +196,6 @@ def get_possible_lookaheads( return list(set(possible_lookahead_days) - set(lookaheads_without_rows)) -def check_columns_exist_in_dataset(cfg: ColumnNamesSchema, df: pd.DataFrame): - """Check that all columns in the config exist in the dataset.""" - # Iterate over attributes in the config - missing_columns = [] - - for attr in dir(cfg): - # Skip private attributes - if attr.startswith("_"): - continue - - # Skip col names that are not string - if not isinstance(getattr(cfg, attr), str): - continue - - # Check that the column exists in the dataset - if not getattr(cfg, attr) in df: - missing_columns.append(getattr(cfg, attr)) - - if missing_columns: - raise ValueError( - f"Columns in config but not in dataset: {missing_columns}. 
Columns in dataset: {df.columns}" - ) - - @wandb_alert_on_exception def main(): """Main.""" @@ -242,9 +218,7 @@ def main(): # Load dataset without dropping any rows for inferring # which look distances to grid search over - train = load_train_raw(cfg=cfg) - - check_columns_exist_in_dataset(cfg=cfg.data.col_name, df=train) + train = DataLoader(cfg=cfg).load_dataset_from_dir(split_names="train") possible_lookaheads = get_possible_lookaheads( msg=msg, diff --git a/src/psycop_model_training/data_loader/col_name_checker.py b/src/psycop_model_training/data_loader/col_name_checker.py new file mode 100644 index 00000000..bfc8a3a8 --- /dev/null +++ b/src/psycop_model_training/data_loader/col_name_checker.py @@ -0,0 +1,30 @@ +"""Check that all columns in the config exist in the dataset.""" +import pandas as pd + +from psycop_model_training.utils.config_schemas.data import ColumnNamesSchema + + +def check_columns_exist_in_dataset( + col_name_schema: ColumnNamesSchema, df: pd.DataFrame +): + """Check that all columns in the config exist in the dataset.""" + # Iterate over attributes in the config + missing_columns = [] + + for attr in dir(col_name_schema): + # Skip private attributes + if attr.startswith("_"): + continue + + # Skip col names that are not string + if not isinstance(getattr(col_name_schema, attr), str): + continue + + # Check that the column exists in the dataset + if not getattr(col_name_schema, attr) in df: + missing_columns.append(getattr(col_name_schema, attr)) + + if missing_columns: + raise ValueError( + f"Columns in config but not in dataset: {missing_columns}. Columns in dataset: {df.columns}" + ) diff --git a/src/psycop_model_training/data_loader/data_loader.py b/src/psycop_model_training/data_loader/data_loader.py index a701bd8d..4f1baa27 100644 --- a/src/psycop_model_training/data_loader/data_loader.py +++ b/src/psycop_model_training/data_loader/data_loader.py @@ -1,7 +1,7 @@ """Loader for the t2d dataset.""" from collections.abc import Iterable from pathlib import Path -from typing import Optional, Union +from typing import Callable, Optional, Union import pandas as pd from wasabi import Printer @@ -9,6 +9,13 @@ from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema msg = Printer(timestamp=True) +import logging + +from psycop_model_training.data_loader.col_name_checker import ( + check_columns_exist_in_dataset, +) + +log = logging.getLogger(__name__) class DataLoader: @@ -17,16 +24,25 @@ class DataLoader: def __init__( self, cfg: FullConfigSchema, + column_name_checker: Optional[Callable] = check_columns_exist_in_dataset, ): self.cfg: FullConfigSchema = cfg # File handling self.dir_path = Path(cfg.data.dir) self.file_suffix = cfg.data.suffix + self.column_name_checker = column_name_checker # Column specifications self.pred_col_name_prefix = cfg.data.pred_prefix + def _check_column_names(self, df: pd.DataFrame): + """Check that all columns in the config exist in the dataset.""" + if self.column_name_checker: + self.column_name_checker(col_name_schema=self.cfg.data.col_name, df=df) + else: + log.debug("No column name checker specified. Skipping column name check.") + def _load_dataset_file( # pylint: disable=inconsistent-return-statements self, split_name: str, @@ -59,9 +75,15 @@ def _load_dataset_file( # pylint: disable=inconsistent-return-statements raise ValueError( "nrows is not supported for parquet files. 
Please use csv files.", ) - return pd.read_parquet(path) + + df = pd.read_parquet(path) elif "csv" in self.file_suffix: - return pd.read_csv(filepath_or_buffer=path, nrows=nrows) + df = pd.read_csv(filepath_or_buffer=path, nrows=nrows) + + if self.column_name_checker: + self._check_column_names(df=df) + + return df def load_dataset_from_dir( self, @@ -78,8 +100,6 @@ def load_dataset_from_dir( Returns: pd.DataFrame: The filtered dataset """ - msg.info(f"Loading {split_names}") - # Concat splits if multiple are given if isinstance(split_names, (list, tuple)): if isinstance(split_names, Iterable): @@ -99,6 +119,4 @@ def load_dataset_from_dir( ) elif isinstance(split_names, str): dataset = self._load_dataset_file(split_name=split_names, nrows=nrows) - - msg.good(f"{split_names}: Returning!") return dataset From ae6a6c2ee04fe8596555648cf8aee537ea4bd3c5 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 22 Dec 2022 10:55:06 +0100 Subject: [PATCH 38/47] feat: add suggested cols if missing using Levenshtein edit distance --- pyproject.toml | 1 + .../data_loader/col_name_checker.py | 60 ++++++++++++++++--- 2 files changed, 53 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2e0cce15..40244e6c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ pyarrow = ">=9.0.0,<10.1.0" Random-Word = ">=1.0.11, <=1.0.12" torch = ">=1.12.1,<1.13.1" diskcache = "^5.4.0" +python-Levenshtein = ">=0.10.2,<0.20.9" [tool.poetry.dev-dependencies] diff --git a/src/psycop_model_training/data_loader/col_name_checker.py b/src/psycop_model_training/data_loader/col_name_checker.py index bfc8a3a8..fd4c171b 100644 --- a/src/psycop_model_training/data_loader/col_name_checker.py +++ b/src/psycop_model_training/data_loader/col_name_checker.py @@ -1,4 +1,7 @@ """Check that all columns in the config exist in the dataset.""" +from typing import List + +import Levenshtein import pandas as pd from psycop_model_training.utils.config_schemas.data import ColumnNamesSchema @@ -9,22 +12,63 @@ def check_columns_exist_in_dataset( ): """Check that all columns in the config exist in the dataset.""" # Iterate over attributes in the config - missing_columns = [] + error_strs = [] for attr in dir(col_name_schema): # Skip private attributes if attr.startswith("_"): continue + col_name = getattr(col_name_schema, attr) + # Skip col names that are not string - if not isinstance(getattr(col_name_schema, attr), str): + if not isinstance(col_name, str): continue # Check that the column exists in the dataset - if not getattr(col_name_schema, attr) in df: - missing_columns.append(getattr(col_name_schema, attr)) + if not col_name in df: + most_likely_alternatives = get_most_likely_str_from_edit_distance( + candidate_strs=df.columns, + input_str=col_name, + n_str_to_return=3, + ) + + error_str = f"Column '{col_name}' in config but not in dataset.\n" + error_str += f" Did you mean {most_likely_alternatives}? \n" + error_strs.append(error_str) + + if error_strs: + raise ValueError("\n".join(error_strs)) + + +def get_most_likely_str_from_edit_distance( + candidate_strs: list[str], + input_str: str, + n_str_to_return: int, + edit_distance_threshold: int = 15, +) -> List[str]: + """Get most likely string from edit distance. + + Args: + candidate_strs (list[str]): List of candidate strings. + input_str (str): The incorrect string. + n_str_to_return (int): Number of strings to return. + edit_distance_threshold (int, optional): Maximum edit distance to consider. Defaults to 5. 
+
+    Returns:
+        list[str]: The up to n_str_to_return strings from candidate_strs most similar to input_str by edit distance.
+    """
+    # Compute the Levenshtein distance between the input string and each candidate string.
+    distances = [
+        Levenshtein.distance(input_str, candidate) for candidate in candidate_strs
+    ]
+
+    # Sort the candidate strings by their Levenshtein distance from the input string.
+    sorted_candidates = [
+        x
+        for distance, x in sorted(zip(distances, candidate_strs))
+        if distance <= edit_distance_threshold
+    ]

-    if missing_columns:
-        raise ValueError(
-            f"Columns in config but not in dataset: {missing_columns}. Columns in dataset: {df.columns}"
-        )
+    # Return the first `n_str_to_return` elements of the sorted list of candidate strings.
+    return sorted_candidates[:n_str_to_return]
From afff55ae1ed7093f10aa37526f6e1a422086ea5e Mon Sep 17 00:00:00 2001
From: Martin Bernstorff
Date: Thu, 22 Dec 2022 10:58:19 +0100
Subject: [PATCH 39/47] style: linting

---
 application/loaders/preprocessing_loaders.py | 1 +
 application/main.py | 1 -
 .../data_loader/col_name_checker.py | 69 ++++++++++---------
 .../data_loader/utils.py | 3 +-
 .../utils/config_schemas/conf_utils.py | 28 ++++----
 tests/test_configs.py | 1 -
 6 files changed, 52 insertions(+), 51 deletions(-)

diff --git a/application/loaders/preprocessing_loaders.py b/application/loaders/preprocessing_loaders.py
index d886c8d8..e321b777 100644
--- a/application/loaders/preprocessing_loaders.py
+++ b/application/loaders/preprocessing_loaders.py
@@ -1,4 +1,5 @@
 import pandas as pd
+from psycopmlutils.sql.loader import sql_load


 def load_timestamp_for_any_diabetes():
diff --git a/application/main.py b/application/main.py
index 08f823c2..0cac19e4 100644
--- a/application/main.py
+++ b/application/main.py
@@ -17,7 +17,6 @@ from wasabi import Printer

 from psycop_model_training.data_loader.data_loader import DataLoader
-from psycop_model_training.data_loader.utils import load_train_raw
 from psycop_model_training.utils.col_name_inference import (
     infer_look_distance,
     infer_outcome_col_name,
diff --git a/src/psycop_model_training/data_loader/col_name_checker.py b/src/psycop_model_training/data_loader/col_name_checker.py
index fd4c171b..f4218b61 100644
--- a/src/psycop_model_training/data_loader/col_name_checker.py
+++ b/src/psycop_model_training/data_loader/col_name_checker.py
@@ -7,8 +7,42 @@
 from psycop_model_training.utils.config_schemas.data import ColumnNamesSchema


+def get_most_likely_str_from_edit_distance(
+    candidate_strs: list[str],
+    input_str: str,
+    n_str_to_return: int,
+    edit_distance_threshold: int = 15,
+) -> list[str]:
+    """Get most likely string from edit distance.
+
+    Args:
+        candidate_strs (list[str]): List of candidate strings.
+        input_str (str): The incorrect string.
+        n_str_to_return (int): Number of strings to return.
+        edit_distance_threshold (int, optional): Maximum edit distance to consider. Defaults to 15.
+
+    Returns:
+        list[str]: The up to n_str_to_return strings from candidate_strs most similar to input_str by edit distance.
+    """
+    # Compute the Levenshtein distance between the input string and each candidate string.
+    distances = [
+        Levenshtein.distance(input_str, candidate) for candidate in candidate_strs
+    ]
+
+    # Sort the candidate strings by their Levenshtein distance from the input string.
+    sorted_candidates = [
+        x
+        for distance, x in sorted(zip(distances, candidate_strs))
+        if distance <= edit_distance_threshold
+    ]
+
+    # Return the first `n_str_to_return` elements of the sorted list of candidate strings.
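# (Worked example, column names assumed for illustration: with
#  input_str="timestmap" and candidate_strs=["timestamp", "pred_age_in_years"],
#  "timestamp" is two edits away (the transposed "am"), so it sorts first and
#  is suggested ahead of the far more distant "pred_age_in_years".)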
+    return sorted_candidates[:n_str_to_return]
+
+
 def check_columns_exist_in_dataset(
-    col_name_schema: ColumnNamesSchema, df: pd.DataFrame
+    col_name_schema: ColumnNamesSchema,
+    df: pd.DataFrame,
 ):
     """Check that all columns in the config exist in the dataset."""
     # Iterate over attributes in the config
@@ -39,36 +73,3 @@ def check_columns_exist_in_dataset(

     if error_strs:
         raise ValueError("\n".join(error_strs))
-
-
-def get_most_likely_str_from_edit_distance(
-    candidate_strs: list[str],
-    input_str: str,
-    n_str_to_return: int,
-    edit_distance_threshold: int = 15,
-) -> List[str]:
-    """Get most likely string from edit distance.
-
-    Args:
-        candidate_strs (list[str]): List of candidate strings.
-        input_str (str): The incorrect string.
-        n_str_to_return (int): Number of strings to return.
-        edit_distance_threshold (int, optional): Maximum edit distance to consider. Defaults to 15.
-
-    Returns:
-        list[str]: The up to n_str_to_return strings from candidate_strs most similar to input_str by edit distance.
-    """
-    # Compute the Levenshtein distance between the input string and each candidate string.
-    distances = [
-        Levenshtein.distance(input_str, candidate) for candidate in candidate_strs
-    ]
-
-    # Sort the candidate strings by their Levenshtein distance from the input string.
-    sorted_candidates = [
-        x
-        for distance, x in sorted(zip(distances, candidate_strs))
-        if distance <= edit_distance_threshold
-    ]
-
-    # Return the first `n_str_to_return` elements of the sorted list of candidate strings.
-    return sorted_candidates[:n_str_to_return]
diff --git a/src/psycop_model_training/data_loader/utils.py b/src/psycop_model_training/data_loader/utils.py
index 4c6f078d..c5917d7b 100644
--- a/src/psycop_model_training/data_loader/utils.py
+++ b/src/psycop_model_training/data_loader/utils.py
@@ -59,7 +59,8 @@ def load_and_filter_train_and_val_from_cfg(cfg: FullConfigSchema):

 def load_train_raw(
-    cfg: FullConfigSchema, convert_timestamp_types_and_nans: bool = True
+    cfg: FullConfigSchema,
+    convert_timestamp_types_and_nans: bool = True,
 ) -> pd.DataFrame:
     """Load the data."""
     path = Path(cfg.data.dir)
diff --git a/src/psycop_model_training/utils/config_schemas/conf_utils.py b/src/psycop_model_training/utils/config_schemas/conf_utils.py
index fd35f53a..63e4df35 100644
--- a/src/psycop_model_training/utils/config_schemas/conf_utils.py
+++ b/src/psycop_model_training/utils/config_schemas/conf_utils.py
@@ -23,20 +23,6 @@ def convert_omegaconf_to_pydantic_object(
     return FullConfigSchema(**conf, allow_mutation=allow_mutation)


-def load_app_cfg_as_pydantic(
-    config_file_name,
-    allow_mutation: bool = False,
-    overrides: Optional[list[str]] = None,
-):
-    cfg = load_test_cfg_as_omegaconf(
-        config_file_name=config_file_name,
-        overrides=overrides,
-        config_dir_path_rel="../../../../application/config/",
-    )
-
-    return convert_omegaconf_to_pydantic_object(conf=cfg, allow_mutation=allow_mutation)
-
-
 def load_test_cfg_as_omegaconf(
     config_file_name: str,
     config_dir_path_rel: str,
@@ -66,6 +52,20 @@ def load_test_cfg_as_omegaconf(
     return cfg


+def load_app_cfg_as_pydantic(
+    config_file_name,
+    allow_mutation: bool = False,
+    overrides: Optional[list[str]] = None,
+):
+    cfg = load_test_cfg_as_omegaconf(
+        config_file_name=config_file_name,
+        overrides=overrides,
+        config_dir_path_rel="../../../../application/config/",
+    )
+
+    return convert_omegaconf_to_pydantic_object(conf=cfg, allow_mutation=allow_mutation)
+
+
 def load_test_cfg_as_pydantic(
     config_file_name,
     allow_mutation: bool = False,
diff --git a/tests/test_configs.py
b/tests/test_configs.py index f1b0a539..c07c480e 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -1,5 +1,4 @@ """Testing of config schemas.""" -from pathlib import Path import pytest from hydra import compose, initialize From 2c3bb8677d98bd58fca92b40a6848a77ff5a8ac8 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 22 Dec 2022 10:59:00 +0100 Subject: [PATCH 40/47] ci: remove flake for speedups --- .pre-commit-config.yaml | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9e05136a..78bf3dca 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -13,17 +13,6 @@ repos: pass_filenames: false always_run: true - - repo: https://github.com/PyCQA/autoflake - rev: v1.7.6 - hooks: - - id: autoflake - args: - [ - "--in-place", - "--remove-all-unused-imports", - "--ignore-init-module-imports", - ] - - repo: https://github.com/pycqa/isort rev: 5.10.1 hooks: @@ -58,12 +47,6 @@ repos: hooks: - id: black - - repo: https://github.com/PyCQA/flake8 - rev: 5.0.4 - hooks: - - id: flake8 - args: [--config, .flake8] - - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.3.0 # Use the ref you want to point at hooks: From 5e8dd218bc97abe9bcb5132d8526545523a9f48b Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 22 Dec 2022 10:59:29 +0100 Subject: [PATCH 41/47] style: linting --- src/psycop_model_training/data_loader/col_name_checker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/psycop_model_training/data_loader/col_name_checker.py b/src/psycop_model_training/data_loader/col_name_checker.py index f4218b61..8a4fed4c 100644 --- a/src/psycop_model_training/data_loader/col_name_checker.py +++ b/src/psycop_model_training/data_loader/col_name_checker.py @@ -1,5 +1,4 @@ """Check that all columns in the config exist in the dataset.""" -from typing import List import Levenshtein import pandas as pd From c134e0b7d9d36b2934d3bcd3d9c592ac0e79558b Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 22 Dec 2022 10:59:29 +0100 Subject: [PATCH 42/47] style: linting --- src/psycop_model_training/data_loader/col_name_checker.py | 1 - src/psycop_model_training/data_loader/data_loader.py | 2 +- src/psycop_model_training/utils/config_schemas/eval.py | 1 + .../utils/config_schemas/full_config.py | 1 + src/psycop_model_training/utils/config_schemas/model.py | 1 + .../utils/config_schemas/preprocessing.py | 5 +++++ src/psycop_model_training/utils/config_schemas/project.py | 1 + 7 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/psycop_model_training/data_loader/col_name_checker.py b/src/psycop_model_training/data_loader/col_name_checker.py index f4218b61..8a4fed4c 100644 --- a/src/psycop_model_training/data_loader/col_name_checker.py +++ b/src/psycop_model_training/data_loader/col_name_checker.py @@ -1,5 +1,4 @@ """Check that all columns in the config exist in the dataset.""" -from typing import List import Levenshtein import pandas as pd diff --git a/src/psycop_model_training/data_loader/data_loader.py b/src/psycop_model_training/data_loader/data_loader.py index 4f1baa27..9725ca62 100644 --- a/src/psycop_model_training/data_loader/data_loader.py +++ b/src/psycop_model_training/data_loader/data_loader.py @@ -1,4 +1,5 @@ """Loader for the t2d dataset.""" +import logging from collections.abc import Iterable from pathlib import Path from typing import Callable, Optional, Union @@ -9,7 +10,6 @@ from psycop_model_training.utils.config_schemas.full_config import 
FullConfigSchema msg = Printer(timestamp=True) -import logging from psycop_model_training.data_loader.col_name_checker import ( check_columns_exist_in_dataset, diff --git a/src/psycop_model_training/utils/config_schemas/eval.py b/src/psycop_model_training/utils/config_schemas/eval.py index 61d9a2e2..ef9dc0ed 100644 --- a/src/psycop_model_training/utils/config_schemas/eval.py +++ b/src/psycop_model_training/utils/config_schemas/eval.py @@ -1,3 +1,4 @@ +"""Eval config schema.""" from psycop_model_training.utils.basemodel import BaseModel diff --git a/src/psycop_model_training/utils/config_schemas/full_config.py b/src/psycop_model_training/utils/config_schemas/full_config.py index 8ff8d8ca..8f8aa834 100644 --- a/src/psycop_model_training/utils/config_schemas/full_config.py +++ b/src/psycop_model_training/utils/config_schemas/full_config.py @@ -1,3 +1,4 @@ +"""Full configuration schema.""" from psycop_model_training.utils.basemodel import BaseModel from psycop_model_training.utils.config_schemas.data import DataSchema from psycop_model_training.utils.config_schemas.eval import EvalConfSchema diff --git a/src/psycop_model_training/utils/config_schemas/model.py b/src/psycop_model_training/utils/config_schemas/model.py index 2148afbc..e750afc2 100644 --- a/src/psycop_model_training/utils/config_schemas/model.py +++ b/src/psycop_model_training/utils/config_schemas/model.py @@ -1,3 +1,4 @@ +"""Model configuration schemas.""" from psycop_model_training.utils.basemodel import BaseModel diff --git a/src/psycop_model_training/utils/config_schemas/preprocessing.py b/src/psycop_model_training/utils/config_schemas/preprocessing.py index e0008503..c5a7aea1 100644 --- a/src/psycop_model_training/utils/config_schemas/preprocessing.py +++ b/src/psycop_model_training/utils/config_schemas/preprocessing.py @@ -1,3 +1,4 @@ +"""Preprocessing config schemas.""" from datetime import datetime from typing import Literal, Optional, Union @@ -15,6 +16,8 @@ class FeatureSelectionSchema(BaseModel): class PreSplitPreprocessingConfigSchema(BaseModel): + """Pre split preprocessing config.""" + drop_patient_if_exclusion_before_date: Optional[Union[str, datetime]] # Drop all visits from a patient if the outcome is before this date. If None, no patients are dropped. @@ -45,6 +48,8 @@ class PreSplitPreprocessingConfigSchema(BaseModel): class PostSplitPreprocessingConfigSchema(BaseModel): + """Post split preprocessing config.""" + imputation_method: Optional[Literal["most_frequent", "mean", "median", "null"]] # How to replace missing values. Takes all values from the sklearn.impute.SimpleImputer class. 
# https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html diff --git a/src/psycop_model_training/utils/config_schemas/project.py b/src/psycop_model_training/utils/config_schemas/project.py index 562dbe4c..7f71788b 100644 --- a/src/psycop_model_training/utils/config_schemas/project.py +++ b/src/psycop_model_training/utils/config_schemas/project.py @@ -1,3 +1,4 @@ +"""Project configuration schemas.""" from psycop_model_training.utils.basemodel import BaseModel From eab8dc3e2eb1543383cdae2e4c9a503c9146274e Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 22 Dec 2022 11:03:36 +0100 Subject: [PATCH 43/47] style: decrease strictness of pylint --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 40244e6c..2725524b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,7 +73,7 @@ ignore_missing_imports = true [tool.pylint] load-plugins = "pylint.extensions.docparams,pylint.extensions.code_style,pylint.extensions.for_any_all,pylint.extensions.typing" good-names = "df,p,f,d,e,n,k,i,v,y_,X,y" -disable = "too-many-lines,line-too-long,missing-raises-doc,no-self-argument,unused-wildcard-import,wildcard-import,no-else-return,too-many-arguments,redefined-outer-name,c-extension-no-member,wrong-import-order,import-outside-toplevel,unused-import" +disable = "too-many-lines,line-too-long,missing-raises-doc,no-self-argument,unused-wildcard-import,wildcard-import,no-else-return,too-many-arguments,redefined-outer-name,c-extension-no-member,wrong-import-order,import-outside-toplevel,unused-import,wrong-import-position,deprecated-typing-alias,missing-module-docstring" extension-pkg-allow-list = "wandb,pydantic" [tool.pylint.'MESSAGES CONTROL'] From 5eb6b01bf71cd1c8796a10cc4d12051d5685c082 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 22 Dec 2022 11:08:34 +0100 Subject: [PATCH 44/47] style: linting --- application/inspect_dataset.py | 6 +++--- src/psycop_model_training/data_loader/utils.py | 2 +- .../pre_split/processors/row_filter.py | 10 ++++++++-- .../pre_split/processors/value_cleaner.py | 13 ++++++++----- .../utils/config_schemas/conf_utils.py | 4 +++- 5 files changed, 23 insertions(+), 12 deletions(-) diff --git a/application/inspect_dataset.py b/application/inspect_dataset.py index 5c90b68e..511cab4e 100644 --- a/application/inspect_dataset.py +++ b/application/inspect_dataset.py @@ -11,11 +11,11 @@ def main(): config_file_name = "default_config.yaml" cfg = load_test_cfg_as_pydantic(config_file_name=config_file_name) - df = load_train_raw(cfg=cfg) # noqa pylint: disable=unused-variable + df = load_train_raw(cfg=cfg) # pylint: disable=unused-variable - df_filtered = load_and_filter_train_from_cfg( + df_filtered = load_and_filter_train_from_cfg( # pylint: disable=unused-variable cfg=cfg, - ) # noqa pylint: disable=unused-variable + ) if __name__ == "__main__": diff --git a/src/psycop_model_training/data_loader/utils.py b/src/psycop_model_training/data_loader/utils.py index c5917d7b..23a4f64c 100644 --- a/src/psycop_model_training/data_loader/utils.py +++ b/src/psycop_model_training/data_loader/utils.py @@ -76,7 +76,7 @@ def load_train_raw( # Helpful during tests to convert columns with matching names to datetime if convert_timestamp_types_and_nans: - df = PreSplitValueCleaner._convert_timestamp_dtype_and_nat(dataset=df) + df = PreSplitValueCleaner.convert_timestamp_dtype_and_nat(dataset=df) return df diff --git a/src/psycop_model_training/preprocessing/pre_split/processors/row_filter.py 
b/src/psycop_model_training/preprocessing/pre_split/processors/row_filter.py index 45832835..0302025f 100644 --- a/src/psycop_model_training/preprocessing/pre_split/processors/row_filter.py +++ b/src/psycop_model_training/preprocessing/pre_split/processors/row_filter.py @@ -1,3 +1,4 @@ +"""Row filter for pre-split data.""" from datetime import timedelta from typing import Union @@ -10,6 +11,8 @@ class PreSplitRowFilter: + """Row filter for pre-split data.""" + def __init__(self, cfg: FullConfigSchema): self.cfg = cfg @@ -73,7 +76,8 @@ def _drop_patient_if_excluded_by_date( dataset: pd.DataFrame, ) -> pd.DataFrame: """Drop patients that have an exclusion event within the washin - period.""" + period. + """ n_rows_before_modification = dataset.shape[0] @@ -108,7 +112,8 @@ def _drop_patient_if_excluded_by_date( @print_df_dimensions_diff def _keep_only_if_older_than_min_age(self, dataset: pd.DataFrame) -> pd.DataFrame: """Keep only rows that are older than the minimum age specified in the - config.""" + config. + """ return dataset[ dataset[self.cfg.data.col_name.age] >= self.cfg.preprocessing.pre_split.min_age @@ -126,6 +131,7 @@ def _drop_rows_after_event_time(self, dataset: pd.DataFrame) -> pd.DataFrame: return dataset[~rows_to_drop] def filter(self, dataset: pd.DataFrame): + """Run filters based on config.""" for direction in ("ahead", "behind"): if direction == "ahead": n_days = self.cfg.preprocessing.pre_split.min_lookahead_days diff --git a/src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py b/src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py index b25e3c5f..b3113eac 100644 --- a/src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py +++ b/src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py @@ -1,5 +1,6 @@ """Class for formatting values before split, e.g. assigning datetime, removing -negative values etc.""" +negative values etc. +""" import numpy as np import pandas as pd @@ -10,16 +11,18 @@ class PreSplitValueCleaner: """Class for cleaning values before split, e.g. assigning datetime, - removing negative values etc.""" + removing negative values etc. + """ def __init__(self, cfg: FullConfigSchema) -> None: self.cfg = cfg @staticmethod @print_df_dimensions_diff - def _convert_timestamp_dtype_and_nat(dataset: pd.DataFrame) -> pd.DataFrame: + def convert_timestamp_dtype_and_nat(dataset: pd.DataFrame) -> pd.DataFrame: """Convert columns with `timestamp`in their name to datetime, and - convert 0's to NaT.""" + convert 0's to NaT. + """ timestamp_colnames = [col for col in dataset.columns if "timestamp" in col] for colname in timestamp_colnames: @@ -61,6 +64,6 @@ def clean(self, dataset: pd.DataFrame) -> pd.DataFrame: # 2b. Always use z-score normalisation? dataset = self._negative_values_to_nan(dataset=dataset) - dataset = self._convert_timestamp_dtype_and_nat(dataset=dataset) + dataset = self.convert_timestamp_dtype_and_nat(dataset=dataset) return dataset diff --git a/src/psycop_model_training/utils/config_schemas/conf_utils.py b/src/psycop_model_training/utils/config_schemas/conf_utils.py index 63e4df35..b1f927ee 100644 --- a/src/psycop_model_training/utils/config_schemas/conf_utils.py +++ b/src/psycop_model_training/utils/config_schemas/conf_utils.py @@ -16,6 +16,7 @@ def convert_omegaconf_to_pydantic_object( Args: conf (DictConfig): Omegaconf DictConfig allow_mutation (bool, optional): Whether to make the pydantic object mutable. Defaults to False. 
+ Returns: FullConfig: Pydantic object """ @@ -53,10 +54,11 @@ def load_test_cfg_as_omegaconf( def load_app_cfg_as_pydantic( - config_file_name, + config_file_name: str, allow_mutation: bool = False, overrides: Optional[list[str]] = None, ): + """Load application cfg as pydantic object.""" cfg = load_test_cfg_as_omegaconf( config_file_name=config_file_name, overrides=overrides, From 404de4169c103c15ccfad8eefdbf360405217dc2 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 22 Dec 2022 11:08:54 +0100 Subject: [PATCH 45/47] style: linting --- .../preprocessing/pre_split/processors/row_filter.py | 6 ++---- .../preprocessing/pre_split/processors/value_cleaner.py | 9 +++------ .../utils/config_schemas/conf_utils.py | 2 +- 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/src/psycop_model_training/preprocessing/pre_split/processors/row_filter.py b/src/psycop_model_training/preprocessing/pre_split/processors/row_filter.py index 0302025f..0fc38bd1 100644 --- a/src/psycop_model_training/preprocessing/pre_split/processors/row_filter.py +++ b/src/psycop_model_training/preprocessing/pre_split/processors/row_filter.py @@ -76,8 +76,7 @@ def _drop_patient_if_excluded_by_date( dataset: pd.DataFrame, ) -> pd.DataFrame: """Drop patients that have an exclusion event within the washin - period. - """ + period.""" n_rows_before_modification = dataset.shape[0] @@ -112,8 +111,7 @@ def _drop_patient_if_excluded_by_date( @print_df_dimensions_diff def _keep_only_if_older_than_min_age(self, dataset: pd.DataFrame) -> pd.DataFrame: """Keep only rows that are older than the minimum age specified in the - config. - """ + config.""" return dataset[ dataset[self.cfg.data.col_name.age] >= self.cfg.preprocessing.pre_split.min_age diff --git a/src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py b/src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py index b3113eac..5b2c608c 100644 --- a/src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py +++ b/src/psycop_model_training/preprocessing/pre_split/processors/value_cleaner.py @@ -1,6 +1,5 @@ """Class for formatting values before split, e.g. assigning datetime, removing -negative values etc. -""" +negative values etc.""" import numpy as np import pandas as pd @@ -11,8 +10,7 @@ class PreSplitValueCleaner: """Class for cleaning values before split, e.g. assigning datetime, - removing negative values etc. - """ + removing negative values etc.""" def __init__(self, cfg: FullConfigSchema) -> None: self.cfg = cfg @@ -21,8 +19,7 @@ def __init__(self, cfg: FullConfigSchema) -> None: @print_df_dimensions_diff def convert_timestamp_dtype_and_nat(dataset: pd.DataFrame) -> pd.DataFrame: """Convert columns with `timestamp`in their name to datetime, and - convert 0's to NaT. 
- """ + convert 0's to NaT.""" timestamp_colnames = [col for col in dataset.columns if "timestamp" in col] for colname in timestamp_colnames: diff --git a/src/psycop_model_training/utils/config_schemas/conf_utils.py b/src/psycop_model_training/utils/config_schemas/conf_utils.py index b1f927ee..ef1b87ef 100644 --- a/src/psycop_model_training/utils/config_schemas/conf_utils.py +++ b/src/psycop_model_training/utils/config_schemas/conf_utils.py @@ -69,7 +69,7 @@ def load_app_cfg_as_pydantic( def load_test_cfg_as_pydantic( - config_file_name, + config_file_name: str, allow_mutation: bool = False, overrides: Optional[list[str]] = None, ) -> FullConfigSchema: From 3b00e104c422f2782eef7bbdc5efcc3ab58bd7cf Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 22 Dec 2022 11:09:31 +0100 Subject: [PATCH 46/47] style: linting --- src/psycop_model_training/utils/config_schemas/conf_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/psycop_model_training/utils/config_schemas/conf_utils.py b/src/psycop_model_training/utils/config_schemas/conf_utils.py index ef1b87ef..0fda940f 100644 --- a/src/psycop_model_training/utils/config_schemas/conf_utils.py +++ b/src/psycop_model_training/utils/config_schemas/conf_utils.py @@ -1,3 +1,7 @@ +"""Utilities for handling config objects, e.g. load, change format. + +Very useful when testing. +""" from typing import Optional, Union from hydra import compose, initialize From ba058b1b3d43d0d2bc23c52a5883ebce26a6eae0 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 22 Dec 2022 11:12:46 +0100 Subject: [PATCH 47/47] tests: fix path after rename of test config --- .../utils/config_schemas/conf_utils.py | 4 ++-- tests/conftest.py | 6 +++--- tests/test_train_model.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/psycop_model_training/utils/config_schemas/conf_utils.py b/src/psycop_model_training/utils/config_schemas/conf_utils.py index 0fda940f..83184dc2 100644 --- a/src/psycop_model_training/utils/config_schemas/conf_utils.py +++ b/src/psycop_model_training/utils/config_schemas/conf_utils.py @@ -29,8 +29,8 @@ def convert_omegaconf_to_pydantic_object( def load_test_cfg_as_omegaconf( - config_file_name: str, - config_dir_path_rel: str, + config_file_name: str = "default_config", + config_dir_path_rel: str = "../../../../tests/config/", overrides: Optional[list[str]] = None, ) -> DictConfig: """Load config as omegaconf object.""" diff --git a/tests/conftest.py b/tests/conftest.py index 6ca2361b..0e52e585 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -14,7 +14,7 @@ CONFIG_DIR_PATH_REL = "../application/config" -def add_age_gender(df): +def add_age_gender(df: pd.DataFrame): """Add age and gender columns to dataframe. 
diff --git a/tests/conftest.py b/tests/conftest.py
index 6ca2361b..0e52e585 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -14,7 +14,7 @@
 CONFIG_DIR_PATH_REL = "../application/config"
 
 
-def add_age_gender(df):
+def add_age_gender(df: pd.DataFrame):
     """Add age and gender columns to dataframe.
 
     Args:
@@ -54,7 +54,7 @@ def synth_eval_dataset() -> EvalDataset:
 def immuteable_test_config() -> FullConfigSchema:
     """Get an immutable config for testing."""
     return load_test_cfg_as_pydantic(
-        config_file_name="integration_config.yaml",
+        config_file_name="default_config.yaml",
         allow_mutation=False,
     )
 
@@ -63,6 +63,6 @@ def immuteable_test_config() -> FullConfigSchema:
 def muteable_test_config() -> FullConfigSchema:
     """Get a mutable config for testing."""
     return load_test_cfg_as_pydantic(
-        config_file_name="integration_config.yaml",
+        config_file_name="default_config.yaml",
         allow_mutation=True,
     )
diff --git a/tests/test_train_model.py b/tests/test_train_model.py
index 8be1e351..80f20675 100644
--- a/tests/test_train_model.py
+++ b/tests/test_train_model.py
@@ -10,11 +10,11 @@
 )
 from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema
 
-INTEGRATION_TEST_FILE_NAME = "integration_config.yaml"
+INTEGRATION_TEST_FILE_NAME = "default_config.yaml"
 
 
 @pytest.mark.parametrize("model_name", MODELS.keys())
-def test_main(model_name):
+def test_main(model_name: str):
     """Test main using a variety of models."""
     cfg: FullConfigSchema = load_test_cfg_as_omegaconf(