This repository has been archived by the owner on May 1, 2023. It is now read-only.

Merge pull request #348 from Aarhus-Psychiatry-Research/marbern/refactor_load

Marbern/refactor load
MartinBernstorff authored Dec 22, 2022
2 parents 0a6f407 + 0b09b88 commit 3047536
Showing 118 changed files with 1,875 additions and 3,232 deletions.
17 changes: 0 additions & 17 deletions .pre-commit-config.yaml
@@ -13,17 +13,6 @@ repos:
        pass_filenames: false
        always_run: true

-  - repo: https://github.com/PyCQA/autoflake
-    rev: v1.7.6
-    hooks:
-      - id: autoflake
-        args:
-          [
-            "--in-place",
-            "--remove-all-unused-imports",
-            "--ignore-init-module-imports",
-          ]
-
  - repo: https://github.com/pycqa/isort
    rev: 5.10.1
    hooks:
@@ -58,12 +47,6 @@ repos:
    hooks:
      - id: black

-  - repo: https://github.com/PyCQA/flake8
-    rev: 5.0.4
-    hooks:
-      - id: flake8
-        args: [--config, .flake8]
-
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.3.0 # Use the ref you want to point at
    hooks:
File renamed without changes.
Empty file added application/config/__init__.py
Empty file.
19 changes: 19 additions & 0 deletions application/config/data/default_data.yaml
@@ -0,0 +1,19 @@
# @package _global_
data:
  # General config
  n_training_samples: null
  dir: E:\shared_resources\t2d\feature_sets\psycop_t2d_adminmanber_features_2022_12_19_15_36
  suffix: parquet

  # Feature specs
  pred_prefix: pred_
  outc_prefix: outc_

  col_name:
    pred_timestamp: timestamp
    outcome_timestamp: timestamp_first_t2d_hba1c
    id: dw_ek_borger
    age: pred_age_in_years
    exclusion_timestamp: timestamp_exclusion
    custom:
      n_hba1c: eval_hba1c_within_9999_days_count_fallback_nan
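
Because the file is tagged # @package _global_, these keys land at the top level of the composed config, so cfg.data.* is addressable directly. A minimal sketch of composing and reading it — assuming Hydra >= 1.2 and a config root of application/config; the repo itself goes through its own load_*_cfg_as_pydantic helpers, shown further down:

# Minimal sketch (not project code): compose the config tree with Hydra and
# read the data block defined above. The paths and version_base are assumptions.
from pathlib import Path

from hydra import compose, initialize_config_dir

config_dir = str(Path("application/config").resolve())  # Hydra wants an absolute path
with initialize_config_dir(config_dir=config_dir, version_base=None):
    cfg = compose(config_name="default_config")

print(cfg.data.suffix)  # -> "parquet"
print(cfg.data.col_name.id)  # -> "dw_ek_borger"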
@@ -1,7 +1,7 @@
# @package _global_
defaults:
  - project: default_project
-  - data: t2d_parquet
+  - data: default_data
  - preprocessing: default_preprocessing
  - model: xgboost
  - train: default_training
@@ -22,4 +22,4 @@ hydra:
      ++model.args.C: interval(1e-5, 1.0)
      ++model.args.l1_ratio: interval(1e-5, 1.0)
      # preprocessing
-      ++preprocessing.scaling: choice("null", "z-score-normalization")
+      ++preprocessing.post_split.scaling: choice("null", "z-score-normalization")
@@ -10,4 +10,4 @@ hydra:
  sweeper:
    params:
      # preprocessing
-      ++preprocessing.scaling: choice(null, "z-score-normalization")
+      ++preprocessing.post_split.scaling: choice(null, "z-score-normalization")
30 changes: 30 additions & 0 deletions application/config/preprocessing/default_preprocessing.yaml
@@ -0,0 +1,30 @@
# @package _global_
preprocessing:
  pre_split:
    convert_to_boolean: false
    convert_booleans_to_int: true
    drop_datetime_predictor_columns: true
    convert_datetimes_to_ordinal: false
    drop_patient_if_exclusion_before_date: 2013-01-01
    min_prediction_time_date: 2013-01-01
    min_lookahead_days: 1825
    lookbehind_combination: [30, 90, 180, 365, 730]
    min_age: 18
  post_split:
    imputation_method: most_frequent
    scaling: z-score-normalization
    feature_selection:
      name: chi2
      params:
        percentile: 20 # (int): Percent of features to keep. Defaults to 10.

# Parameters that will only take effect if running with --multirun
hydra:
  sweeper:
    params:
      ++preprocessing.post_split.imputation_method: choice("most_frequent", "mean", "median", "null")
      ++preprocessing.post_split.scaling: choice("z-score-normalization", "null")
      ++preprocessing.post_split.feature_selection.name: choice("chi2", "null")
      ++preprocessing.post_split.feature_selection.params.percentile: int(tag(log, interval(1, 90)))
      ++preprocessing.pre_split.lookbehind_combination: choice([30, 90, 180, 365, 730], [30, 180, 730], [730], [365], [90], [30])
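
For intuition, the post_split block maps onto standard scikit-learn components. A hedged sketch of an equivalent pipeline — illustrative only, not the project's actual preprocessing code; chi2 requires non-negative inputs, so feature selection is placed before the z-scoring step here:

# Illustrative sketch only: a scikit-learn pipeline mirroring the post_split
# settings above (imputation_method, feature_selection, scaling).
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

post_split_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),  # imputation_method
        ("select", SelectPercentile(chi2, percentile=20)),  # feature_selection: chi2
        ("scale", StandardScaler()),  # scaling: z-score-normalization
    ],
)
# post_split_pipeline.fit(X_train, y_train) would impute, select, then scale.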

9 changes: 9 additions & 0 deletions application/config/project/default_project.yaml
@@ -0,0 +1,9 @@
name: t2d
seed: 42

wandb:
  entity: psycop # Which WandB entity to log to.
  mode: run # Which mode to run WandB in. Takes "run", "dryrun", "offline" and "disabled".
  group: t2d # Which WandB group to log to.

gpu: true
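
These keys feed wandb.init; a sketch of the assumed mapping, with the values written as literals so it runs standalone (the project's actual call appears in application/main.py below, with a modified project name):

# Values taken from default_project.yaml above; in the project they come from
# the composed config rather than literals.
import wandb

wandb.init(
    project="t2d",  # name
    entity="psycop",  # wandb.entity
    mode="run",  # wandb.mode; requires a W&B login, use "offline" for local testing
    group="t2d",  # wandb.group
)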
22 changes: 22 additions & 0 deletions application/inspect_dataset.py
@@ -0,0 +1,22 @@
"""Example of how to inspect a dataset using the configs."""
from psycop_model_training.data_loader.utils import (
load_and_filter_train_from_cfg,
load_train_raw,
)
from psycop_model_training.utils.config_schemas import load_test_cfg_as_pydantic


def main():
"""Main."""
config_file_name = "default_config.yaml"

cfg = load_test_cfg_as_pydantic(config_file_name=config_file_name)
df = load_train_raw(cfg=cfg) # pylint: disable=unused-variable

df_filtered = load_and_filter_train_from_cfg( # pylint: disable=unused-variable
cfg=cfg,
)


if __name__ == "__main__":
main()
Empty file added application/loaders/__init__.py
Empty file.
37 changes: 37 additions & 0 deletions application/loaders/preprocessing_loaders.py
@@ -0,0 +1,37 @@
import pandas as pd
from psycopmlutils.sql.loader import sql_load


def load_timestamp_for_any_diabetes() -> pd.DataFrame:
    """Load timestamps for the broad definition of diabetes used for wash-in.

    See the R files for details.
    """
    timestamp_any_diabetes = sql_load(
        query="SELECT * FROM [fct].[psycop_t2d_first_diabetes_any]",
        format_timestamp_cols_to_datetime=False,
    )[["dw_ek_borger", "datotid_first_diabetes_any"]]

    timestamp_any_diabetes = timestamp_any_diabetes.rename(
        columns={"datotid_first_diabetes_any": "timestamp_washin"},
    )

    return timestamp_any_diabetes


def add_washin_timestamps(dataset: pd.DataFrame) -> pd.DataFrame:
    """Add wash-in timestamps to dataset.

    Wash-in is an exclusion criterion: if the patient has any visit
    that looks like diabetes before the study start (i.e. during
    wash-in), they are excluded.
    """
    timestamp_washin = load_timestamp_for_any_diabetes()

    dataset = dataset.merge(
        timestamp_washin,
        on="dw_ek_borger",
        how="left",
    )

    return dataset
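
A toy illustration of the merge semantics (the frames below are made up, since the real loader needs the production SQL database):

# Made-up data: demonstrates that add_washin_timestamps' left merge keeps
# patients without a wash-in timestamp (they get NaT).
import pandas as pd

dataset = pd.DataFrame(
    {
        "dw_ek_borger": [1, 2],
        "timestamp": pd.to_datetime(["2014-01-01", "2014-03-01"]),
    },
)
timestamp_washin = pd.DataFrame(
    {
        "dw_ek_borger": [1],
        "timestamp_washin": pd.to_datetime(["2012-05-01"]),
    },
)

merged = dataset.merge(timestamp_washin, on="dw_ek_borger", how="left")
print(merged)  # patient 2 has NaT in timestamp_washin, so is not excluded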
36 changes: 18 additions & 18 deletions application/t2d/train_and_log_models.py → application/main.py
@@ -12,19 +12,20 @@

import pandas as pd
import wandb
+from psycopmlutils.wandb.wandb_try_except_decorator import wandb_alert_on_exception
from random_word import RandomWords
from wasabi import Printer

-from psycop_model_training.config.schemas import (
-    BaseModel,
-    FullConfigSchema,
-    load_cfg_as_pydantic,
-)
-from psycop_model_training.load import load_train_raw
-from psycop_model_training.model_eval.evaluate_model import (
+from psycop_model_training.data_loader.data_loader import DataLoader
+from psycop_model_training.utils.col_name_inference import (
    infer_look_distance,
    infer_outcome_col_name,
)
+from psycop_model_training.utils.config_schemas.conf_utils import (
+    BaseModel,
+    load_app_cfg_as_pydantic,
+)
+from psycop_model_training.utils.config_schemas.full_config import FullConfigSchema


def start_trainer(
@@ -39,13 +40,13 @@ def start_trainer(

    subprocess_args: list[str] = [
        "python",
-        "src/psycop_model_training/train_model.py",
+        "application/train_model.py",
        f"project.wandb.group='{wandb_group_override}'",
        f"project.wandb.mode={cfg.project.wandb.mode}",
        f"hydra.sweeper.n_trials={cfg.train.n_trials_per_lookahead}",
        f"hydra.sweeper.n_jobs={cfg.train.n_jobs_per_trainer}",
        f"model={model_name}",
-        f"data.min_lookahead_days={lookahead_days}",
+        f"preprocessing.pre_split.min_lookahead_days={lookahead_days}",
        "--config-name",
        f"{config_file_name}",
    ]
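
The hunk above only assembles the argument list; the launching code sits outside this excerpt. A hedged sketch of how such a trainer subprocess might be started:

# Assumption: the surrounding code launches the trainer roughly like this;
# the actual launch logic is not part of the excerpt above.
import subprocess


def launch_trainer(subprocess_args: list[str]) -> subprocess.Popen:
    # Hydra parses the key=value overrides; --config-name selects the entrypoint YAML.
    return subprocess.Popen(args=subprocess_args)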
@@ -148,6 +149,9 @@ def train_models_for_each_cell_in_grid(
            ),
        )

+        # Sleep a bit to avoid segfaults
+        time.sleep(10)
+

def get_possible_lookaheads(
    msg: Printer,
@@ -191,24 +195,20 @@ def get_possible_lookaheads(
    return list(set(possible_lookahead_days) - set(lookaheads_without_rows))


+@wandb_alert_on_exception
def main():
    """Main."""
    msg = Printer(timestamp=True)

-    debug = False
-
-    if debug:
-        config_file_name = "integration_config.yaml"
-    else:
-        config_file_name = "default_config.yaml"
+    config_file_name = "default_config.yaml"

-    cfg = load_cfg_as_pydantic(config_file_name=config_file_name)
+    cfg = load_app_cfg_as_pydantic(config_file_name=config_file_name)

    random_word = RandomWords()
    wandb_group = f"{random_word.get_random_word()}-{random_word.get_random_word()}"

    wandb.init(
-        project=cfg.project.name,
+        project=f"{cfg.project.name}-baseline-model-training",
        mode=cfg.project.wandb.mode,
        group=wandb_group,
        entity=cfg.project.wandb.entity,
@@ -217,7 +217,7 @@ def main():

    # Load dataset without dropping any rows for inferring
    # which look distances to grid search over
-    train = load_train_raw(cfg=cfg)
+    train = DataLoader(cfg=cfg).load_dataset_from_dir(split_names="train")

    possible_lookaheads = get_possible_lookaheads(
        msg=msg,
17 changes: 0 additions & 17 deletions application/t2d/inspect_dataset.py

This file was deleted.

This file was deleted.

This file was deleted.

