Merge pull request #134 from Aarhus-Psychiatry-Research/martbern/keep_only_last_visit_in_sequence_for_diagnoses

Keep only the last visit in a sequence for diagnoses, to avoid data leakage from the future
sarakolding authored Jan 31, 2023
2 parents 4df9054 + 57f8107 commit c87a95b
Showing 12 changed files with 102 additions and 179 deletions.
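
The diagnosis-loading change named in the PR title does not appear in the hunks shown here (only 8 of the 12 changed files are included below, and those are mostly import and formatting changes). As a rough illustration of the idea in the description, here is a minimal pandas sketch of keeping only the last visit in a sequence of closely spaced visits. The column names (dw_ek_borger, timestamp), the one-day gap threshold, and the function name are assumptions for illustration, not the PR's actual implementation.

# Illustrative sketch only: column names and the one-day gap threshold are
# assumptions, not the PR's actual implementation.
import pandas as pd


def keep_only_last_visit_in_sequence(
    df: pd.DataFrame,
    entity_id_col: str = "dw_ek_borger",
    timestamp_col: str = "timestamp",
    max_gap_days: int = 1,
) -> pd.DataFrame:
    """Keep only the last visit in each sequence of closely spaced visits.

    Visits by the same entity that are at most max_gap_days apart are treated
    as one sequence, and only the final visit of each sequence is kept, so a
    prediction time cannot use information recorded later in the same sequence.
    """
    df = df.sort_values([entity_id_col, timestamp_col])

    # A new sequence starts at an entity's first visit, or when the gap to the
    # previous visit exceeds the threshold.
    gap = df.groupby(entity_id_col)[timestamp_col].diff()
    new_sequence = gap.isna() | (gap > pd.Timedelta(days=max_gap_days))

    # The cumulative sum gives a globally unique id per (entity, sequence).
    df = df.assign(sequence_id=new_sequence.cumsum())

    # Rows are sorted by timestamp, so "keep last" keeps the latest visit per sequence.
    return df.drop_duplicates(subset="sequence_id", keep="last").drop(
        columns="sequence_id",
    )


if __name__ == "__main__":
    visits = pd.DataFrame(
        {
            "dw_ek_borger": [1, 1, 1, 2],
            "timestamp": pd.to_datetime(
                ["2021-01-01 08:00", "2021-01-01 14:00", "2021-03-01", "2021-01-05"],
            ),
        },
    )
    # Entity 1's first two visits are less than a day apart, so only the
    # 14:00 visit from that sequence is kept.
    print(keep_only_last_visit_in_sequence(visits))
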
@@ -35,7 +35,9 @@ def __init__(
         self.prediction_times_df = prediction_times_df
         self.quarantine_df = quarantine_timestamps_df
         self.quarantine_days = quarantine_interval_days
-        self.quarantine_df = self.quarantine_df.rename(columns={"timestamp": "timestamp_quarantine"})
+        self.quarantine_df = self.quarantine_df.rename(
+            columns={"timestamp": "timestamp_quarantine"},
+        )
         self.entity_id_col_name = entity_id_col_name
         self.timestamp_col_name = timestamp_col_name

@@ -50,7 +52,7 @@ def __init__(
         self.prediction_times_df[
             self.pred_time_uuid_col_name
         ] = self.prediction_times_df[self.entity_id_col_name].astype(
-            str
+            str,
         ) + self.prediction_times_df[
             timestamp_col_name
         ].dt.strftime(
@@ -106,10 +108,9 @@ def _filter_prediction_times_by_quarantine_period(self):
             ],
         )
 
-
         n_after = len(df)
         log.info(
-            f"Filtered {n_before - n_after} prediction times by quarantine period."
+            f"Filtered {n_before - n_after} prediction times by quarantine period.",
         )
 
         return df
@@ -4,20 +4,19 @@
 
 import pandas as pd
 import psutil
+from timeseriesflattener.feature_cache.cache_to_disk import DiskCache
+from timeseriesflattener.feature_spec_objects import _AnySpec
+from timeseriesflattener.flattened_dataset import TimeseriesFlattener
 
 from psycop_feature_generation.application_modules.filter_prediction_times import (
     PredictionTimeFilterer,
 )
 from psycop_feature_generation.application_modules.project_setup import ProjectInfo
 from psycop_feature_generation.application_modules.utils import print_df_dimensions_diff
 from psycop_feature_generation.application_modules.wandb_utils import (
     wandb_alert_on_exception,
 )
 from psycop_feature_generation.loaders.raw.load_demographic import birthdays
 
-from timeseriesflattener.feature_cache.cache_to_disk import DiskCache
-from timeseriesflattener.feature_spec_objects import _AnySpec
-from timeseriesflattener.flattened_dataset import TimeseriesFlattener
-
 log = logging.getLogger(__name__)


@@ -3,6 +3,7 @@
 from datetime import datetime
 
 import coloredlogs
+
 from psycop_feature_generation.application_modules.project_setup import ProjectInfo
 
 
@@ -6,13 +6,13 @@
 from pathlib import Path
 from typing import Literal
 
-from psycop_feature_generation.utils import RELATIVE_PROJECT_ROOT, SHARED_RESOURCES_PATH
-
 import wandb
 from timeseriesflattener.feature_spec_objects import ( # pylint: disable=no-name-in-module
     BaseModel,
 )
 
+from psycop_feature_generation.utils import RELATIVE_PROJECT_ROOT, SHARED_RESOURCES_PATH
+
 log = logging.getLogger(__name__)
 
 
@@ -6,18 +6,18 @@
 
 import numpy as np
 import pandas as pd
-from wasabi import Printer
-
-from psycop_feature_generation.data_checks.utils import save_df_to_pretty_html_table
-from psycop_feature_generation.loaders.flattened.local_feature_loaders import (
-    load_split_predictors,
-)
 from timeseriesflattener.feature_spec_objects import (
     PredictorSpec,
     StaticSpec,
     TemporalSpec,
     _AnySpec,
 )
+from wasabi import Printer
+
+from psycop_feature_generation.data_checks.utils import save_df_to_pretty_html_table
+from psycop_feature_generation.loaders.flattened.local_feature_loaders import (
+    load_split_predictors,
+)
 
 UNICODE_HIST = {
     0: " ",
@@ -3,4 +3,4 @@
 import psycop_feature_generation.loaders.raw.load_diagnoses as d
 
 if __name__ == "__main__":
-    df = d.gerd(n_rows=200)
+    df = d.gerd(n_rows=1_000)
@@ -53,7 +53,7 @@ def admissions(
     sql = f"SELECT {cols} FROM [fct].{meta['view']} WHERE {meta['datetime_col']} IS NOT NULL AND {meta['value_col']} IS NOT NULL {meta['where']}"
 
     if shak_code is not None:
-        sql += f" AND left({meta['location_col']}, {len(str(shak_code))}) {shak_sql_operator} {str(shak_code)}"
+        sql += f" AND left({meta['location_col']}, {len(str(shak_code))}) {shak_sql_operator} {str(shak_code)}"
 
     df = sql_load(sql, database="USR_PS_FORSK", chunksize=None, n_rows=n_rows)
     df.rename(
10 changes: 6 additions & 4 deletions src/psycop_feature_generation/loaders/raw/load_coercion.py
@@ -49,10 +49,12 @@ def coercion_duration(
     df = sql_load(sql, database="USR_PS_FORSK", chunksize=None, n_rows=n_rows)
 
     # add end time as start time for acute sedation
-    df.loc[df.typetekst_sei == 'Beroligende medicin', 'datotid_slut_sei'] = df['datotid_start_sei']
-
+    df.loc[df.typetekst_sei == "Beroligende medicin", "datotid_slut_sei"] = df[
+        "datotid_start_sei"
+    ]
+
     # drop nas for coercion end times
-    df = df.dropna(subset='datotid_slut_sei')
+    df = df.dropna(subset="datotid_slut_sei")
 
     # Drop duplicate rows
     df = df.drop_duplicates(keep="first")
@@ -65,7 +67,7 @@ def coercion_duration(
     # Change NaNs to 0
     df["value"].fillna(0, inplace=True)
 
-    return df[['dw_ek_borger', 'timestamp', 'value']].reset_index(drop=True)
+    return df[["dw_ek_borger", "timestamp", "value"]].reset_index(drop=True)
 
 
 def _concatenate_coercion(