This repository has been archived by the owner on May 1, 2023. It is now read-only.

Merge pull request #293 from Aarhus-Psychiatry-Research/bokajgd/issue188
Plot performance by age and n_hba1c
MartinBernstorff authored Oct 28, 2022
2 parents 1b5abb2 + 982bf0d commit adea6bd
Showing 22 changed files with 1,420 additions and 1,111 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
@@ -38,6 +38,7 @@ flake8 = ">=4.0.1,<4.1.0"
pytest-xdist = "^2.5.0"
mypy = "^0.982"
setuptools = ">=65.3.0,<65.6.0"
pylint = "^2.15.5"

[build-system]
requires = ["poetry-core>=1.0.0", "pip"]
6 changes: 5 additions & 1 deletion src/psycopt2d/config/data/synth_data.yaml
@@ -6,11 +6,15 @@ data:
min_lookahead_days: 30
min_lookbehind_days: 100
min_prediction_time_date: null
pred_prefix: pred_

col_name:
pred_prefix: pred_
pred_timestamp: timestamp
outcome_timestamp: timestamp_outcome
id: citizen_ids
age: pred_age
custom:
n_hba1c: hba1c_within_9999_days_count_nan

# Looking ahead
drop_patient_if_outcome_before_date: null
5 changes: 4 additions & 1 deletion src/psycopt2d/config/data/t2d_parquet.yaml
@@ -14,12 +14,15 @@ data:
min_lookahead_days: 1825

# Feature specs
pred_prefix: pred_

col_name:
pred_prefix: pred_
pred_timestamp: timestamp
outcome_timestamp: _timestamp_first_t2d
id: dw_ek_borger
age: pred_age_in_years
custom:
n_hba1c: hba1c_within_9999_days_count_fallback_0

max_lookbehind_days: 3650
lookbehind_combination: [30, 90, 180, 365, 730]
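Note: taken together, the two config diffs above move pred_prefix from data.col_name up to the data level and add a custom block for columns that won't generalise across projects. A minimal sketch of the new access paths, assuming a FullConfigSchema instance has already been loaded (only names visible in this commit are used):

    from psycopt2d.utils.config_schemas import FullConfigSchema

    def resolve_custom_columns(cfg: FullConfigSchema) -> dict[str, str]:
        """Illustrate where the restructured keys now live."""
        return {
            # Moved up from cfg.data.col_name.pred_prefix
            "pred_prefix": cfg.data.pred_prefix,
            # New project-specific column, e.g. "hba1c_within_9999_days_count_fallback_0"
            "n_hba1c_col": cfg.data.col_name.custom.n_hba1c,
        }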
34 changes: 32 additions & 2 deletions src/psycopt2d/evaluate_model.py
@@ -20,6 +20,8 @@
from psycopt2d.utils.config_schemas import FullConfigSchema
from psycopt2d.utils.utils import positive_rate_to_pred_probs
from psycopt2d.visualization.feature_importance import plot_feature_importances
from psycopt2d.visualization.performance_by_age import plot_performance_by_age
from psycopt2d.visualization.performance_by_n_hba1c import plot_performance_by_n_hba1c
from psycopt2d.visualization.performance_over_time import (
plot_auc_by_time_from_first_visit,
plot_metric_by_calendar_time,
@@ -87,7 +89,7 @@ def filter_plot_bins(
return lookahead_bins, lookbehind_bins


def create_default_plot_artifacts(
def create_base_plot_artifacts(
cfg: FullConfigSchema,
eval_dataset: EvalDataset,
save_dir: Path,
@@ -145,6 +147,29 @@ def create_default_plot_artifacts(
output_format="df",
),
),
ArtifactContainer(
label="performance_by_age",
artifact=plot_performance_by_age(
eval_dataset=eval_dataset,
save_path=save_dir / "performance_by_age.png",
),
),
]


def create_custom_plot_artifacts(
eval_dataset: EvalDataset,
save_dir: Path,
) -> list[ArtifactContainer]:
"""A collection of project-specific plots based on custom columns."""
return [
ArtifactContainer(
label="performance_by_n_hba1c",
artifact=plot_performance_by_n_hba1c(
eval_dataset=eval_dataset,
save_path=save_dir / "performance_by_n_hba1c.png",
),
),
]


@@ -171,14 +196,19 @@ def run_full_evaluation(
# Create the directory if it doesn't exist
save_dir.mkdir(parents=True, exist_ok=True)

artifact_containers = create_default_plot_artifacts(
artifact_containers = create_base_plot_artifacts(
cfg=cfg,
eval_dataset=eval_dataset,
lookahead_bins=lookahead_bins,
lookbehind_bins=lookbehind_bins,
save_dir=save_dir,
)

artifact_containers += create_custom_plot_artifacts(
eval_dataset=eval_dataset,
save_dir=save_dir,
)

if pipe_metadata and pipe_metadata.feature_importances:
artifact_containers += [
ArtifactContainer(
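The new plots can also be generated outside run_full_evaluation. A standalone usage sketch, using only the signatures visible in this diff; the helper function and the output directory are assumptions for illustration:

    from pathlib import Path

    from psycopt2d.evaluation_dataclasses import EvalDataset
    from psycopt2d.visualization.performance_by_age import plot_performance_by_age
    from psycopt2d.visualization.performance_by_n_hba1c import plot_performance_by_n_hba1c

    def save_new_plots(eval_dataset: EvalDataset, save_dir: Path) -> None:
        """Generate both new performance plots for a populated EvalDataset."""
        save_dir.mkdir(parents=True, exist_ok=True)
        plot_performance_by_age(
            eval_dataset=eval_dataset,
            save_path=save_dir / "performance_by_age.png",
        )
        plot_performance_by_n_hba1c(
            eval_dataset=eval_dataset,
            save_path=save_dir / "performance_by_n_hba1c.png",
        )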
16 changes: 11 additions & 5 deletions src/psycopt2d/evaluation_dataclasses.py
@@ -7,25 +7,31 @@
from psycopt2d.utils.config_schemas import BaseModel, FullConfigSchema


class CustomColumns(BaseModel):
"""Custom columns to use in evaluation."""

n_hba1c: Optional[pd.Series]


class EvalDataset(BaseModel):
"""Evaluation dataset.
Makes the interfaces of our evaluation functions simpler and more
consistent.
"""

class Config:
"""Configuration of Pydantic model."""

allow_mutation = True

ids: pd.Series
pred_timestamps: pd.Series
outcome_timestamps: pd.Series
y: pd.Series
y_hat_probs: pd.Series
y_hat_int: pd.Series
age: Optional[pd.Series]
custom: Optional[CustomColumns] = CustomColumns(n_hba1c=None)

def __init__(self, **kwargs):
super().__init__(**kwargs)
self.Config.allow_mutation = True


class ArtifactContainer(BaseModel):
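A toy construction of the extended dataclass; all values below are made up for illustration:

    import pandas as pd

    from psycopt2d.evaluation_dataclasses import CustomColumns, EvalDataset

    eval_dataset = EvalDataset(
        ids=pd.Series([1, 2, 3, 4]),
        pred_timestamps=pd.Series(pd.to_datetime(["2021-01-01"] * 4)),
        outcome_timestamps=pd.Series(pd.to_datetime(["2021-06-01"] * 4)),
        y=pd.Series([0, 1, 0, 1]),
        y_hat_probs=pd.Series([0.2, 0.8, 0.4, 0.9]),
        y_hat_int=pd.Series([0, 1, 0, 1]),
        age=pd.Series([34, 51, 67, 45]),
        custom=CustomColumns(n_hba1c=pd.Series([0, 2, 5, 1])),
    )

    # Since allow_mutation is enabled, fields can also be assigned after init,
    # as train_model.py does for the custom columns:
    eval_dataset.custom.n_hba1c = pd.Series([1, 3, 2, 0])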
2 changes: 1 addition & 1 deletion src/psycopt2d/load.py
@@ -71,7 +71,7 @@ def __init__(
self.file_suffix = cfg.data.suffix

# Column specifications
self.pred_col_name_prefix = cfg.data.col_name.pred_prefix
self.pred_col_name_prefix = cfg.data.pred_prefix

def _load_dataset_file( # pylint: disable=inconsistent-return-statements
self,
13 changes: 7 additions & 6 deletions src/psycopt2d/model_performance/model_performance.py
@@ -67,7 +67,7 @@ def performance_metrics_from_df(
aggregate_by_id=False,
prediction_col_name=prediction_col_name,
label_col_name=label_col_name,
id_col_name=id_col_name,
id_col_name=id_col_name, # type: ignore
to_wide=to_wide,
id2label=id2label,
binary_threshold=binary_threshold,
@@ -166,6 +166,7 @@ def performance_metrics_from_file(
binary_threshold=binary_threshold,
)

@staticmethod
def performance_metrics_from_folder(
folder: Union[str, Path],
pattern: str,
@@ -419,18 +420,18 @@ def compute_metrics(
performance = pd.melt(performance)
# split score and class into two columns
if add_level_prefix:
performance[["level", "score_type", "class"]] = performance[
performance[["level", "score_type", "class"]] = performance[ # type: ignore
"variable"
].str.split("-", 2, expand=True)
# drop unused columns and re-arrange
return performance[["level", "class", "score_type", "value"]]
return performance[["level", "class", "score_type", "value"]] # type: ignore
else:
performance[["score_type", "class"]] = performance["variable"].str.split(
performance[["score_type", "class"]] = performance["variable"].str.split( # type: ignore
"-",
1,
expand=True,
)
return performance[["class", "score_type", "value"]]
return performance[["class", "score_type", "value"]] # type: ignore


if __name__ == "__main__":
@@ -464,7 +465,7 @@ def compute_metrics(
prediction_col_name="scores",
id_col_name="id",
id2label=id2label,
metadata_col_names="all",
metadata_col_names="all", # type: ignore
to_wide=False,
)

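With the @staticmethod decorator added above, the folder-level helper can be called on the class without instantiating it. A sketch of such a call — the class name ModelPerformance and the argument values are assumptions, since only the folder and pattern parameters are visible in this diff:

    # Assumed class name; only performance_metrics_from_folder(folder, pattern, ...)
    # is confirmed by the diff above.
    from psycopt2d.model_performance.model_performance import ModelPerformance

    # Aggregate metrics across all prediction files in a folder (hypothetical paths).
    performance_df = ModelPerformance.performance_metrics_from_folder(
        folder="evaluation_outputs/",
        pattern="*.jsonl",
    )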
2 changes: 1 addition & 1 deletion src/psycopt2d/model_training_watcher.py
@@ -185,7 +185,7 @@ def _get_run_wandb_dir(self, run_id: str) -> Path:

def _get_run_attribute(self, run: Run, attribute: str) -> Any:
"""Get an attribute from a wandb run."""
if attribute in run.summary:
if attribute in run.summary: # type: ignore
return run.summary[attribute]
if self.verbose:
msg.info(
11 changes: 5 additions & 6 deletions src/psycopt2d/train_model.py
@@ -201,10 +201,11 @@ def train_and_eval_on_crossvalidation(
y_hat_int=df["y_hat_prob"].round(),
pred_timestamps=df[cfg.data.col_name.pred_timestamp],
outcome_timestamps=df[cfg.data.col_name.outcome_timestamp],
age=df[cfg.data.col_name.age],
)

if cfg.data.col_name.age:
eval_dataset.age = df[cfg.data.col_name.age]
if cfg.data.col_name.custom.n_hba1c:
eval_dataset.custom.n_hba1c = df[cfg.data.col_name.custom.n_hba1c]

return eval_dataset

@@ -255,11 +256,9 @@ def train_and_eval_on_val_split(
y_hat_int=df["y_hat_prob"].round(),
pred_timestamps=df[cfg.data.col_name.pred_timestamp],
outcome_timestamps=df[cfg.data.col_name.outcome_timestamp],
age=df[cfg.data.col_name.age],
)

if cfg.data.col_name.age:
eval_dataset.age = df[cfg.data.col_name.age]

return eval_dataset


@@ -350,7 +349,7 @@ def get_col_names(cfg: DictConfig, train: pd.DataFrame) -> tuple[str, list[str]]
)

train_col_names = [ # pylint: disable=invalid-name
c for c in train.columns if c.startswith(cfg.data.col_name.pred_prefix)
c for c in train.columns if c.startswith(cfg.data.pred_prefix)
]

return outcome_col_name, train_col_names
12 changes: 10 additions & 2 deletions src/psycopt2d/utils/config_schemas.py
@@ -64,14 +64,21 @@ class ProjectConf(BaseModel):
gpu: bool


class CustomColNames(BaseModel):
"""All custom column names, i.e. columns that won't generalise across
projects."""

n_hba1c: str


class ColumnNames(BaseModel):
"""Column names in the data."""

pred_prefix: str # prefix of predictor columns
pred_timestamp: str # (str): Column name for prediction times
outcome_timestamp: str # (str): Column name for outcome timestamps
id: str # (str): Citizen colnames
age: Optional[str] # Name of the age column
age: str # Name of the age column
custom: CustomColNames


class DataConf(BaseModel):
@@ -85,6 +92,7 @@ class DataConf(BaseModel):

# Feature specs
col_name: ColumnNames
pred_prefix: str # prefix of predictor columns

# Looking ahead
min_lookahead_days: int # (int): Drop all prediction times where (max timestamp in the dataset) - (current timestamp) is less than min_lookahead_days
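Direct instantiation of the new schema classes, mirroring t2d_parquet.yaml above. This sketch assumes pred_prefix has moved off ColumnNames onto DataConf, as the change to load.py (cfg.data.pred_prefix) implies:

    from psycopt2d.utils.config_schemas import ColumnNames, CustomColNames

    col_name = ColumnNames(
        pred_timestamp="timestamp",
        outcome_timestamp="_timestamp_first_t2d",
        id="dw_ek_borger",
        age="pred_age_in_years",
        custom=CustomColNames(n_hba1c="hba1c_within_9999_days_count_fallback_0"),
    )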
36 changes: 30 additions & 6 deletions src/psycopt2d/utils/utils.py
@@ -183,7 +183,7 @@ def bin_continuous_data(series: pd.Series, bins: list[int]) -> pd.Series:
Args:
series (pd.Series): Series with continuous data such as age
bins (list[int]): Desired bins
bins (list[int]): Desired bins. Last value creates a bin from the last value to infinity.
Returns:
pd.Series: Binned data
@@ -203,11 +203,25 @@ def bin_continuous_data(series: pd.Series, bins: list[int]) -> pd.Series:
8 51+
"""
labels = []

if isinstance(bins, tuple):
bins = list(bins)
# Append the series maximum to bins to set the upper cut-off, if it is larger than the largest bin value
if series.max() > max(bins):
bins.append(series.max())

# Create bin labels
for i, bin_v in enumerate(bins):
if i == 0:
labels.append(f"{bin_v}-{bins[i+1]}")
elif i < len(bins) - 2:
labels.append(f"{bin_v+1}-{bins[i+1]}")
# If not the final bin
if i < len(bins) - 2:
# If the difference between the current bin and the next bin is 1, the bin label is a single value and not an interval
if (bins[i + 1] - bin_v) == 1:
labels.append(f"{bin_v}")
# Else generate bin labels as intervals
elif i == 0:
labels.append(f"{bin_v}-{bins[i+1]}")
else:
labels.append(f"{bin_v+1}-{bins[i+1]}")
elif i == len(bins) - 2:
labels.append(f"{bin_v+1}+")
else:
@@ -311,13 +325,23 @@ def eval_dataset_to_disk(eval_dataset: EvalDataset, file_path: Path) -> None:
Handles csv and parquet files based on suffix.
"""
# Add base columns and custom columns
df_template = {
col_name: series
for col_name, series in eval_dataset.__dict__.items()
if series is not None
} | {
col_name: series
for col_name, series in eval_dataset.custom.__dict__.items()
if series is not None
}

# Remove items that aren't series, e.g. the top level CustomColumns object
template_filtered = {
k: v for k, v in df_template.items() if isinstance(v, pd.Series)
}

df = pd.DataFrame(df_template)
df = pd.DataFrame(template_filtered)

write_df_to_file(df=df, file_path=file_path)

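A quick usage sketch of the updated bin_continuous_data, with toy values chosen to exercise the new upper cut-off; the expected labels follow the docstring above:

    import pandas as pd

    from psycopt2d.utils.utils import bin_continuous_data

    ages = pd.Series([4, 19, 27, 33, 78])
    binned = bin_continuous_data(ages, bins=[0, 18, 30, 50])
    # 78 exceeds the largest bin edge, so it is appended as the upper cut-off
    # and the final label becomes open-ended: "0-18", "19-30", "31-50", "51+".
    print(binned)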
(Diffs for the remaining 11 changed files are not shown.)
