This repository has been archived by the owner on May 1, 2023. It is now read-only.

Merge pull request #293 from Aarhus-Psychiatry-Research/bokajgd/issue188
Plot performance by age and n_hba1c
MartinBernstorff authored Oct 28, 2022
2 parents 1b5abb2 + 982bf0d commit adea6bd
Showing 22 changed files with 1,420 additions and 1,111 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
@@ -38,6 +38,7 @@ flake8 = ">=4.0.1,<4.1.0"
pytest-xdist = "^2.5.0"
mypy = "^0.982"
setuptools = ">=65.3.0,<65.6.0"
pylint = "^2.15.5"

[build-system]
requires = ["poetry-core>=1.0.0", "pip"]
6 changes: 5 additions & 1 deletion src/psycopt2d/config/data/synth_data.yaml
@@ -6,11 +6,15 @@ data:
min_lookahead_days: 30
min_lookbehind_days: 100
min_prediction_time_date: null
pred_prefix: pred_

col_name:
pred_prefix: pred_
pred_timestamp: timestamp
outcome_timestamp: timestamp_outcome
id: citizen_ids
age: pred_age
custom:
n_hba1c: hba1c_within_9999_days_count_nan

# Looking ahead
drop_patient_if_outcome_before_date: null
5 changes: 4 additions & 1 deletion src/psycopt2d/config/data/t2d_parquet.yaml
@@ -14,12 +14,15 @@ data:
min_lookahead_days: 1825

# Feature specs
pred_prefix: pred_

col_name:
pred_prefix: pred_
pred_timestamp: timestamp
outcome_timestamp: _timestamp_first_t2d
id: dw_ek_borger
age: pred_age_in_years
custom:
n_hba1c: hba1c_within_9999_days_count_fallback_0

max_lookbehind_days: 3650
lookbehind_combination: [30, 90, 180, 365, 730]
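Note: taken together, the two config diffs above move pred_prefix from data.col_name up to the data level and add a custom block for columns that won't generalise across projects. A minimal sketch of the new access paths, assuming a FullConfigSchema instance has already been loaded (only names visible in this commit are used):

    from psycopt2d.utils.config_schemas import FullConfigSchema

    def resolve_custom_columns(cfg: FullConfigSchema) -> dict[str, str]:
        """Illustrate where the restructured keys now live."""
        return {
            # Moved up from cfg.data.col_name.pred_prefix
            "pred_prefix": cfg.data.pred_prefix,
            # New project-specific column, e.g. "hba1c_within_9999_days_count_fallback_0"
            "n_hba1c_col": cfg.data.col_name.custom.n_hba1c,
        }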
34 changes: 32 additions & 2 deletions src/psycopt2d/evaluate_model.py
@@ -20,6 +20,8 @@
from psycopt2d.utils.config_schemas import FullConfigSchema
from psycopt2d.utils.utils import positive_rate_to_pred_probs
from psycopt2d.visualization.feature_importance import plot_feature_importances
from psycopt2d.visualization.performance_by_age import plot_performance_by_age
from psycopt2d.visualization.performance_by_n_hba1c import plot_performance_by_n_hba1c
from psycopt2d.visualization.performance_over_time import (
plot_auc_by_time_from_first_visit,
plot_metric_by_calendar_time,
@@ -87,7 +89,7 @@ def filter_plot_bins(
return lookahead_bins, lookbehind_bins


def create_default_plot_artifacts(
def create_base_plot_artifacts(
cfg: FullConfigSchema,
eval_dataset: EvalDataset,
save_dir: Path,
@@ -145,6 +147,29 @@ def create_default_plot_artifacts(
output_format="df",
),
),
ArtifactContainer(
label="performance_by_age",
artifact=plot_performance_by_age(
eval_dataset=eval_dataset,
save_path=save_dir / "performance_by_age.png",
),
),
]


def create_custom_plot_artifacts(
eval_dataset: EvalDataset,
save_dir: Path,
) -> list[ArtifactContainer]:
"""A collection of project-specific plots based on custom columns."""
return [
ArtifactContainer(
label="performance_by_n_hba1c",
artifact=plot_performance_by_n_hba1c(
eval_dataset=eval_dataset,
save_path=save_dir / "performance_by_n_hba1c.png",
),
),
]


@@ -171,14 +196,19 @@ def run_full_evaluation(
# Create the directory if it doesn't exist
save_dir.mkdir(parents=True, exist_ok=True)

artifact_containers = create_default_plot_artifacts(
artifact_containers = create_base_plot_artifacts(
cfg=cfg,
eval_dataset=eval_dataset,
lookahead_bins=lookahead_bins,
lookbehind_bins=lookbehind_bins,
save_dir=save_dir,
)

artifact_containers += create_custom_plot_artifacts(
eval_dataset=eval_dataset,
save_dir=save_dir,
)

if pipe_metadata and pipe_metadata.feature_importances:
artifact_containers += [
ArtifactContainer(
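The new plots can also be generated outside run_full_evaluation. A standalone usage sketch, using only the signatures visible in this diff; the helper function and the output directory are assumptions for illustration:

    from pathlib import Path

    from psycopt2d.evaluation_dataclasses import EvalDataset
    from psycopt2d.visualization.performance_by_age import plot_performance_by_age
    from psycopt2d.visualization.performance_by_n_hba1c import plot_performance_by_n_hba1c

    def save_new_plots(eval_dataset: EvalDataset, save_dir: Path) -> None:
        """Generate both new performance plots for a populated EvalDataset."""
        save_dir.mkdir(parents=True, exist_ok=True)
        plot_performance_by_age(
            eval_dataset=eval_dataset,
            save_path=save_dir / "performance_by_age.png",
        )
        plot_performance_by_n_hba1c(
            eval_dataset=eval_dataset,
            save_path=save_dir / "performance_by_n_hba1c.png",
        )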
16 changes: 11 additions & 5 deletions src/psycopt2d/evaluation_dataclasses.py
@@ -7,25 +7,31 @@
from psycopt2d.utils.config_schemas import BaseModel, FullConfigSchema


class CustomColumns(BaseModel):
"""Custom columns to use in evaluation."""

n_hba1c: Optional[pd.Series]


class EvalDataset(BaseModel):
"""Evaluation dataset.
Makes the interfaces of our evaluation functions simpler and more
consistent.
"""

class Config:
"""Configuration of Pydantic model."""

allow_mutation = True

ids: pd.Series
pred_timestamps: pd.Series
outcome_timestamps: pd.Series
y: pd.Series
y_hat_probs: pd.Series
y_hat_int: pd.Series
age: Optional[pd.Series]
custom: Optional[CustomColumns] = CustomColumns(n_hba1c=None)

def __init__(self, **kwargs):
super().__init__(**kwargs)
self.Config.allow_mutation = True


class ArtifactContainer(BaseModel):
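A toy construction of the extended dataclass; all values below are made up for illustration:

    import pandas as pd

    from psycopt2d.evaluation_dataclasses import CustomColumns, EvalDataset

    eval_dataset = EvalDataset(
        ids=pd.Series([1, 2, 3, 4]),
        pred_timestamps=pd.Series(pd.to_datetime(["2021-01-01"] * 4)),
        outcome_timestamps=pd.Series(pd.to_datetime(["2021-06-01"] * 4)),
        y=pd.Series([0, 1, 0, 1]),
        y_hat_probs=pd.Series([0.2, 0.8, 0.4, 0.9]),
        y_hat_int=pd.Series([0, 1, 0, 1]),
        age=pd.Series([34, 51, 67, 45]),
        custom=CustomColumns(n_hba1c=pd.Series([0, 2, 5, 1])),
    )

    # Since allow_mutation is enabled, fields can also be assigned after init,
    # as train_model.py does for the custom columns:
    eval_dataset.custom.n_hba1c = pd.Series([1, 3, 2, 0])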
2 changes: 1 addition & 1 deletion src/psycopt2d/load.py
@@ -71,7 +71,7 @@ def __init__(
self.file_suffix = cfg.data.suffix

# Column specifications
self.pred_col_name_prefix = cfg.data.col_name.pred_prefix
self.pred_col_name_prefix = cfg.data.pred_prefix

def _load_dataset_file( # pylint: disable=inconsistent-return-statements
self,
13 changes: 7 additions & 6 deletions src/psycopt2d/model_performance/model_performance.py
@@ -67,7 +67,7 @@ def performance_metrics_from_df(
aggregate_by_id=False,
prediction_col_name=prediction_col_name,
label_col_name=label_col_name,
id_col_name=id_col_name,
id_col_name=id_col_name, # type: ignore
to_wide=to_wide,
id2label=id2label,
binary_threshold=binary_threshold,
@@ -166,6 +166,7 @@ def performance_metrics_from_file(
binary_threshold=binary_threshold,
)

@staticmethod
def performance_metrics_from_folder(
folder: Union[str, Path],
pattern: str,
@@ -419,18 +420,18 @@ def compute_metrics(
performance = pd.melt(performance)
# split score and class into two columns
if add_level_prefix:
performance[["level", "score_type", "class"]] = performance[
performance[["level", "score_type", "class"]] = performance[ # type: ignore
"variable"
].str.split("-", 2, expand=True)
# drop unused columns and re-arrange
return performance[["level", "class", "score_type", "value"]]
return performance[["level", "class", "score_type", "value"]] # type: ignore
else:
performance[["score_type", "class"]] = performance["variable"].str.split(
performance[["score_type", "class"]] = performance["variable"].str.split( # type: ignore
"-",
1,
expand=True,
)
return performance[["class", "score_type", "value"]]
return performance[["class", "score_type", "value"]] # type: ignore


if __name__ == "__main__":
@@ -464,7 +465,7 @@ def compute_metrics(
prediction_col_name="scores",
id_col_name="id",
id2label=id2label,
metadata_col_names="all",
metadata_col_names="all", # type: ignore
to_wide=False,
)

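With the @staticmethod decorator added above, the folder-level helper can be called on the class without instantiating it. A sketch of such a call — the class name ModelPerformance and the argument values are assumptions, since only the folder and pattern parameters are visible in this diff:

    # Assumed class name; only performance_metrics_from_folder(folder, pattern, ...)
    # is confirmed by the diff above.
    from psycopt2d.model_performance.model_performance import ModelPerformance

    # Aggregate metrics across all prediction files in a folder (hypothetical paths).
    performance_df = ModelPerformance.performance_metrics_from_folder(
        folder="evaluation_outputs/",
        pattern="*.jsonl",
    )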
2 changes: 1 addition & 1 deletion src/psycopt2d/model_training_watcher.py
@@ -185,7 +185,7 @@ def _get_run_wandb_dir(self, run_id: str) -> Path:

def _get_run_attribute(self, run: Run, attribute: str) -> Any:
"""Get an attribute from a wandb run."""
if attribute in run.summary:
if attribute in run.summary: # type: ignore
return run.summary[attribute]
if self.verbose:
msg.info(
11 changes: 5 additions & 6 deletions src/psycopt2d/train_model.py
@@ -201,10 +201,11 @@ def train_and_eval_on_crossvalidation(
y_hat_int=df["y_hat_prob"].round(),
pred_timestamps=df[cfg.data.col_name.pred_timestamp],
outcome_timestamps=df[cfg.data.col_name.outcome_timestamp],
age=df[cfg.data.col_name.age],
)

if cfg.data.col_name.age:
eval_dataset.age = df[cfg.data.col_name.age]
if cfg.data.col_name.custom.n_hba1c:
eval_dataset.custom.n_hba1c = df[cfg.data.col_name.custom.n_hba1c]

return eval_dataset

@@ -255,11 +256,9 @@ def train_and_eval_on_val_split(
y_hat_int=df["y_hat_prob"].round(),
pred_timestamps=df[cfg.data.col_name.pred_timestamp],
outcome_timestamps=df[cfg.data.col_name.outcome_timestamp],
age=df[cfg.data.col_name.age],
)

if cfg.data.col_name.age:
eval_dataset.age = df[cfg.data.col_name.age]

return eval_dataset


@@ -350,7 +349,7 @@ def get_col_names(cfg: DictConfig, train: pd.DataFrame) -> tuple[str, list[str]]
)

train_col_names = [ # pylint: disable=invalid-name
c for c in train.columns if c.startswith(cfg.data.col_name.pred_prefix)
c for c in train.columns if c.startswith(cfg.data.pred_prefix)
]

return outcome_col_name, train_col_names
12 changes: 10 additions & 2 deletions src/psycopt2d/utils/config_schemas.py
@@ -64,14 +64,21 @@ class ProjectConf(BaseModel):
gpu: bool


class CustomColNames(BaseModel):
"""All custom column names, i.e. columns that won't generalise across
projects."""

n_hba1c: str


class ColumnNames(BaseModel):
"""Column names in the data."""

pred_prefix: str # prefix of predictor columns
pred_timestamp: str # (str): Column name for prediction times
outcome_timestamp: str # (str): Column name for outcome timestamps
id: str # (str): Citizen colnames
age: Optional[str] # Name of the age column
age: str # Name of the age column
custom: CustomColNames


class DataConf(BaseModel):
@@ -85,6 +92,7 @@ class DataConf(BaseModel):

# Feature specs
col_name: ColumnNames
pred_prefix: str # prefix of predictor columns

# Looking ahead
min_lookahead_days: int # (int): Drop all prediction times where (max timestamp in the dataset) - (current timestamp) is less than min_lookahead_days
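Direct instantiation of the new schema classes, mirroring t2d_parquet.yaml above. This sketch assumes pred_prefix has moved off ColumnNames onto DataConf, as the change to load.py (cfg.data.pred_prefix) implies:

    from psycopt2d.utils.config_schemas import ColumnNames, CustomColNames

    col_name = ColumnNames(
        pred_timestamp="timestamp",
        outcome_timestamp="_timestamp_first_t2d",
        id="dw_ek_borger",
        age="pred_age_in_years",
        custom=CustomColNames(n_hba1c="hba1c_within_9999_days_count_fallback_0"),
    )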
36 changes: 30 additions & 6 deletions src/psycopt2d/utils/utils.py
@@ -183,7 +183,7 @@ def bin_continuous_data(series: pd.Series, bins: list[int]) -> pd.Series:
Args:
series (pd.Series): Series with continuous data such as age
bins (list[int]): Desired bins
bins (list[int]): Desired bins. Last value creates a bin from the last value to infinity.
Returns:
pd.Series: Binned data
@@ -203,11 +203,25 @@ def bin_continuous_data(series: pd.Series, bins: list[int]) -> pd.Series:
8 51+
"""
labels = []

if isinstance(bins, tuple):
bins = list(bins)
# Append the series maximum to bins to set the upper cut-off, if it is larger than the largest bin value
if series.max() > max(bins):
bins.append(series.max())

# Create bin labels
for i, bin_v in enumerate(bins):
if i == 0:
labels.append(f"{bin_v}-{bins[i+1]}")
elif i < len(bins) - 2:
labels.append(f"{bin_v+1}-{bins[i+1]}")
# If not the final bin
if i < len(bins) - 2:
# If the difference between the current bin and the next bin is 1, the bin label is a single value and not an interval
if (bins[i + 1] - bin_v) == 1:
labels.append(f"{bin_v}")
# Else generate bin labels as intervals
elif i == 0:
labels.append(f"{bin_v}-{bins[i+1]}")
else:
labels.append(f"{bin_v+1}-{bins[i+1]}")
elif i == len(bins) - 2:
labels.append(f"{bin_v+1}+")
else:
@@ -311,13 +325,23 @@ def eval_dataset_to_disk(eval_dataset: EvalDataset, file_path: Path) -> None:
Handles csv and parquet files based on suffix.
"""
# Add base columns and custom columns
df_template = {
col_name: series
for col_name, series in eval_dataset.__dict__.items()
if series is not None
} | {
col_name: series
for col_name, series in eval_dataset.custom.__dict__.items()
if series is not None
}

# Remove items that aren't series, e.g. the top level CustomColumns object
template_filtered = {
k: v for k, v in df_template.items() if isinstance(v, pd.Series)
}

df = pd.DataFrame(df_template)
df = pd.DataFrame(template_filtered)

write_df_to_file(df=df, file_path=file_path)

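A quick usage sketch of the updated bin_continuous_data, with toy values chosen to exercise the new upper cut-off; the expected labels follow the docstring above:

    import pandas as pd

    from psycopt2d.utils.utils import bin_continuous_data

    ages = pd.Series([4, 19, 27, 33, 78])
    binned = bin_continuous_data(ages, bins=[0, 18, 30, 50])
    # 78 exceeds the largest bin edge, so it is appended as the upper cut-off
    # and the final label becomes open-ended: "0-18", "19-30", "31-50", "51+".
    print(binned)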
(Diffs for the remaining 11 changed files are not shown.)
