Skip to content

Commit

Permalink
bp_project: setup bipolar patient representations project (#882)
Browse files Browse the repository at this point in the history
Initial setup of dynamic patient representations project for
investigating subtypes in patients with bipolar disorders.

Initial implementation includes: 

- [x] Cohort definition
- [x] Script for generating prediction timestamps
  • Loading branch information
bokajgd authored Apr 16, 2024
2 parents 82a7aad + f4c0838 commit cb93383
Show file tree
Hide file tree
Showing 8 changed files with 254 additions and 3 deletions.
43 changes: 42 additions & 1 deletion psycop/common/feature_generation/loaders/raw/load_diagnoses.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
def from_contacts(
icd_code: list[str] | str,
output_col_name: str = "value",
code_col_name: str = "diagnosegruppestreng",
n_rows: int | None = None,
wildcard_icd_code: bool = False,
shak_location_col: str | None = None,
Expand All @@ -40,6 +41,7 @@ def from_contacts(
Args:
icd_code (str): Substring to match diagnoses for. Matches any diagnoses, whether a-diagnosis, b-diagnosis etc. # noqa: DAR102
output_col_name (str, optional): Name of new column string. Defaults to "value".
code_col_name (str, optional): Name of the column in the loaded data frame from which to extract the diagnosis codes. Defaults to "diagnosegruppestreng".
n_rows: Number of rows to return. Defaults to None.
wildcard_icd_code (bool, optional): Whether to match on icd_code*. Defaults to False.
shak_location_col (str, optional): Name of column containing shak code. Defaults to None. For diagnosis loaders, this column is "shakkode_ansvarlig". Combine with shak_code and shak_sql_operator.
Expand Down Expand Up @@ -75,7 +77,7 @@ def from_contacts(

df = load_from_codes(
codes_to_match=icd_code,
code_col_name="diagnosegruppestreng",
code_col_name=code_col_name,
source_timestamp_col_name=source_timestamp_col_name,
view="FOR_kohorte_indhold_pt_journal_psyk_somatik_inkl_2021_feb2022",
output_col_name=output_col_name,
Expand Down Expand Up @@ -771,6 +773,25 @@ def manic_and_bipolar(
)


@data_loaders.register("bipolar")
def bipolar(
    n_rows: int | None = None,
    shak_location_col: str | None = None,
    shak_code: int | None = None,
    shak_sql_operator: str | None = None,
    timestamp_purpose: Literal["predictor", "outcome"] | None = "predictor",
) -> pd.DataFrame:
    """Load contacts with a diagnosis code matching ``f31``.

    Thin wrapper around ``from_contacts``: the code ``f31`` is matched with
    ``wildcard_icd_code=True`` and every argument of this function is
    forwarded unchanged.

    Args:
        n_rows: Forwarded to ``from_contacts`` (number of rows to return).
        shak_location_col: Forwarded to ``from_contacts``.
        shak_code: Forwarded to ``from_contacts``.
        shak_sql_operator: Forwarded to ``from_contacts``.
        timestamp_purpose: Forwarded to ``from_contacts``.

    Returns:
        The data frame produced by ``from_contacts``.
    """
    # The three shak_* arguments belong together, so they are forwarded as a group.
    shak_filter = {
        "shak_location_col": shak_location_col,
        "shak_code": shak_code,
        "shak_sql_operator": shak_sql_operator,
    }
    return from_contacts(
        icd_code=["f31"],
        wildcard_icd_code=True,
        n_rows=n_rows,
        timestamp_purpose=timestamp_purpose,
        **shak_filter,
    )


@data_loaders.register("depressive_disorders")
def depressive_disorders(
n_rows: int | None = None,
Expand Down Expand Up @@ -1333,3 +1354,23 @@ def gerd(
shak_sql_operator=shak_sql_operator,
timestamp_purpose=timestamp_purpose,
)


@data_loaders.register("bipolar_a_diagnosis")
def bipolar_a_diagnosis(
    n_rows: int | None = None,
    shak_location_col: str | None = None,
    shak_code: int | None = None,
    shak_sql_operator: str | None = None,
    timestamp_purpose: Literal["predictor", "outcome"] | None = "predictor",
) -> pd.DataFrame:
    """Load contacts whose ``adiagnose4kar`` column matches ``f31``.

    Same as a wildcard ``f31`` lookup through ``from_contacts``, except that
    the code is matched against the ``adiagnose4kar`` column instead of the
    loader's default code column.
    NOTE(review): presumably ``adiagnose4kar`` holds only the A-diagnosis of a
    contact - confirm against the source view.

    Args:
        n_rows: Forwarded to ``from_contacts`` (number of rows to return).
        shak_location_col: Forwarded to ``from_contacts``.
        shak_code: Forwarded to ``from_contacts``.
        shak_sql_operator: Forwarded to ``from_contacts``.
        timestamp_purpose: Forwarded to ``from_contacts``.

    Returns:
        The data frame produced by ``from_contacts``.
    """
    # The three shak_* arguments belong together, so they are forwarded as a group.
    shak_filter = {
        "shak_location_col": shak_location_col,
        "shak_code": shak_code,
        "shak_sql_operator": shak_sql_operator,
    }
    return from_contacts(
        icd_code=["f31"],
        code_col_name="adiagnose4kar",
        wildcard_icd_code=True,
        n_rows=n_rows,
        timestamp_purpose=timestamp_purpose,
        **shak_filter,
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import pandas as pd
import polars as pl
from tqdm import tqdm

from psycop.common.cohort_definition import (
CohortDefiner,
FilteredPredictionTimeBundle,
filter_prediction_times,
)
from psycop.common.feature_generation.loaders.raw.load_visits import (
get_time_of_first_visit_to_psychiatry,
)
from psycop.projects.bipolar.cohort_definition.diagnosis_specification.first_bipolar_diagnosis import (
get_first_bipolar_diagnosis,
)
from psycop.projects.bipolar.cohort_definition.eligible_data.single_filters import (
BipolarMinAgeFilter,
BipolarMinDateFilter,
BipolarPatientsWithF20F25Filter,
BipolarWashoutMove,
)


def generate_timestamps(
    first_visit_date: pd.Timestamp, diagnosis_date: pd.Timestamp, interval_days: int = 30
) -> list[pd.Timestamp]:
    """Step backwards from the diagnosis date in fixed intervals.

    Starting at ``diagnosis_date``, timestamps are generated every
    ``interval_days`` days going back in time for as long as the current
    timestamp is more than one interval after ``first_visit_date``. Every
    timestamp generated by stepping back is therefore strictly later than
    ``first_visit_date``.

    Args:
        first_visit_date: Lower bound; stepping back never reaches or passes it.
        diagnosis_date: Last timestamp of the series; always included.
        interval_days: Spacing between consecutive timestamps, in days.
            Must be positive.

    Returns:
        The timestamps in ascending order, ending with ``diagnosis_date``.
        If ``diagnosis_date`` is not more than one interval after
        ``first_visit_date``, the list contains only ``diagnosis_date``.

    Raises:
        ValueError: If ``interval_days`` is zero or negative.
    """
    # A non-positive step never moves towards the lower bound, so the loop
    # below would not terminate.
    if interval_days <= 0:
        raise ValueError(f"interval_days must be positive, got {interval_days}")

    step = pd.Timedelta(days=interval_days)
    earliest_allowed = first_visit_date + step

    timestamps = [diagnosis_date]
    current_date = diagnosis_date
    while current_date > earliest_allowed:
        current_date -= step
        timestamps.append(current_date)

    # Collected newest-first; callers get them oldest-first.
    return timestamps[::-1]


class BipolarCohortDefiner(CohortDefiner):
    """Builds the prediction-time cohort for the bipolar project."""

    @staticmethod
    def get_bipolar_cohort(interval_days: int = 30) -> FilteredPredictionTimeBundle:
        """Create evenly spaced prediction timestamps leading up to each first bipolar diagnosis.

        Steps:
            1. Load the first bipolar diagnosis per patient and run it through
               the date, age, washout and F20/F25 filters.
            2. Attach each patient's first visit to psychiatry, dropping
               patients without one and patients whose diagnosis precedes it.
            3. For every remaining patient, generate timestamps going back
               from the diagnosis towards the first visit with
               ``generate_timestamps``.

        Args:
            interval_days: Spacing in days between generated timestamps.

        Returns:
            The bundle returned by ``filter_prediction_times``, with
            ``prediction_times.frame`` replaced by a pandas data frame with the
            columns ``patient_id`` and ``timestamp`` (one row per generated
            timestamp).
        """
        first_diagnoses = pl.from_pandas(get_first_bipolar_diagnosis())

        bundle = filter_prediction_times(
            prediction_times=first_diagnoses.lazy(),
            filtering_steps=(
                BipolarMinDateFilter(),
                BipolarMinAgeFilter(),
                BipolarWashoutMove(),
                BipolarPatientsWithF20F25Filter(),
            ),
            entity_id_col_name="dw_ek_borger",
        )

        diagnoses_df = pd.DataFrame(bundle.prediction_times.frame.to_pandas())
        first_visits_df = pd.DataFrame(get_time_of_first_visit_to_psychiatry().to_pandas())

        # Only the right-hand frame's overlapping columns are suffixed, so the
        # diagnosis timestamp keeps the name "timestamp".
        diagnoses_df = diagnoses_df.merge(
            first_visits_df, on="dw_ek_borger", how="left", suffixes=(None, "_first_visit")
        )

        # Patients without a registered first visit cannot anchor a series.
        diagnoses_df = diagnoses_df.dropna(subset=["timestamp_first_visit"])

        # A diagnosis dated before the first visit leaves no look-back window.
        diagnoses_df = diagnoses_df[
            diagnoses_df["timestamp"] >= diagnoses_df["timestamp_first_visit"]
        ]

        # Iterate the three needed columns directly instead of building a
        # Series per row with iterrows().
        patient_rows = zip(
            diagnoses_df["dw_ek_borger"],
            diagnoses_df["timestamp_first_visit"],
            diagnoses_df["timestamp"],
        )

        timestamps_per_patient = []
        for patient_id, first_visit, diagnosis in tqdm(patient_rows, total=len(diagnoses_df)):
            patient_timestamps = generate_timestamps(
                first_visit, diagnosis, interval_days=interval_days
            )
            timestamps_per_patient.extend(
                (patient_id, timestamp) for timestamp in patient_timestamps
            )

        # NOTE(review): the id column is renamed from "dw_ek_borger" to
        # "patient_id" here - confirm downstream consumers expect that name.
        result_df = pd.DataFrame(timestamps_per_patient, columns=["patient_id", "timestamp"])

        # NOTE(review): the frame read above supports .to_pandas(), but a
        # pandas frame is written back - confirm consumers of the bundle
        # accept pandas here.
        bundle.prediction_times.frame = result_df  # type: ignore

        return bundle


# Manual entry point: build the cohort when the module is run as a script.
if __name__ == "__main__":
    df = BipolarCohortDefiner.get_bipolar_cohort()
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import pandas as pd
import polars as pl

from psycop.common.feature_generation.loaders.raw.load_diagnoses import bipolar_a_diagnosis


def get_first_bipolar_diagnosis() -> pd.DataFrame:
    """Return the earliest bipolar A-diagnosis timestamp for each patient.

    The rows loaded by ``bipolar_a_diagnosis`` are sorted by ``timestamp`` and
    the first row of each ``dw_ek_borger`` group is kept.

    Returns:
        A pandas data frame with the columns ``dw_ek_borger`` and
        ``timestamp``, one row per patient.
    """
    all_diagnoses = pl.DataFrame(bipolar_a_diagnosis())

    # Sort first so that "first row per group" means "earliest diagnosis".
    chronological = all_diagnoses.sort("timestamp")
    earliest_per_patient = chronological.groupby("dw_ek_borger").first()

    as_pandas = earliest_per_patient.to_pandas()
    return as_pandas[["dw_ek_borger", "timestamp"]]


# Manual entry point: load the first diagnoses when the module is run as a script.
if __name__ == "__main__":
    df = get_first_bipolar_diagnosis()
Empty file.
18 changes: 18 additions & 0 deletions psycop/projects/bipolar/cohort_definition/eligible_data/add_age.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import polars as pl

from psycop.common.feature_generation.loaders.raw.load_demographic import birthdays
from psycop.projects.forced_admission_inpatient.cohort.prediction_timestamp_filters.eligible_config import (
AGE_COL_NAME,
)


def add_age(df: pl.DataFrame) -> pl.DataFrame:
    """Add each row's age at ``timestamp``, in years, as column ``AGE_COL_NAME``.

    Birthdays are inner-joined on ``dw_ek_borger``, so rows without a
    birthday are dropped. Age is the number of whole days between
    ``date_of_birth`` and ``timestamp`` divided by 365.25.

    NOTE(review): ``AGE_COL_NAME`` is imported from the
    forced_admission_inpatient project, while the bipolar filters read the
    constant from the bipolar ``eligible_config`` - confirm both hold the
    same value.

    Args:
        df: Frame with at least ``dw_ek_borger`` and ``timestamp`` columns.

    Returns:
        The joined frame with the additional age column.
    """
    birthday_df = pl.from_pandas(birthdays())

    age_in_days = (pl.col("timestamp") - pl.col("date_of_birth")).dt.days()

    with_birthdays = df.join(birthday_df, on="dw_ek_borger", how="inner")
    return with_birthdays.with_columns((age_in_days / 365.25).alias(AGE_COL_NAME))
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from datetime import datetime

# Name of the column that holds a patient's age.
AGE_COL_NAME = "age"
# Minimum age threshold for cohort eligibility.
# NOTE(review): presumably in years - confirm against how the age column is computed.
MIN_AGE = 18
# Date threshold for cohort eligibility (midnight, 1 January 2013).
MIN_DATE = datetime(year=2013, month=1, day=1)
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import pandas as pd
import polars as pl

from psycop.common.cohort_definition import PredictionTimeFilter
from psycop.common.feature_generation.loaders.raw.load_diagnoses import (
schizoaffective,
schizophrenia,
)
from psycop.common.feature_generation.loaders.raw.load_moves import MoveIntoRMBaselineLoader
from psycop.common.model_training_v2.trainer.preprocessing.steps.row_filter_other import (
QuarantineFilter,
)
from psycop.projects.bipolar.cohort_definition.eligible_data.add_age import add_age
from psycop.projects.bipolar.cohort_definition.eligible_data.eligible_config import (
AGE_COL_NAME,
MIN_AGE,
MIN_DATE,
)


class BipolarMinDateFilter(PredictionTimeFilter):
    """Keep only rows whose ``timestamp`` is strictly later than ``MIN_DATE``."""

    def apply(self, df: pl.LazyFrame) -> pl.LazyFrame:
        """Return ``df`` restricted to rows with ``timestamp > MIN_DATE``."""
        is_after_min_date = pl.col("timestamp") > MIN_DATE
        return df.filter(is_after_min_date)


class BipolarMinAgeFilter(PredictionTimeFilter):
    """Keep only rows where the age column is at least ``MIN_AGE``."""

    def apply(self, df: pl.LazyFrame) -> pl.LazyFrame:
        """Add the age column via ``add_age`` and filter on it.

        ``add_age`` works on an eager frame, so the lazy frame is collected
        here and turned back into a lazy frame before filtering.
        """
        with_age = add_age(df.collect())
        is_old_enough = pl.col(AGE_COL_NAME) >= MIN_AGE
        return with_age.lazy().filter(is_old_enough)


class BipolarWashoutMove(PredictionTimeFilter):
    """Apply a 730-day quarantine based on ``MoveIntoRMBaselineLoader`` timestamps.

    NOTE(review): presumably this drops prediction times within two years
    after a move into the region - confirm against ``QuarantineFilter``.
    """

    def apply(self, df: pl.LazyFrame) -> pl.LazyFrame:
        """Delegate to ``QuarantineFilter`` and return its result."""
        move_quarantine = QuarantineFilter(
            entity_id_col_name="dw_ek_borger",
            quarantine_timestamps_loader=MoveIntoRMBaselineLoader(),
            quarantine_interval_days=730,
            timestamp_col_name="timestamp",
        )
        return move_quarantine.apply(df)


class BipolarPatientsWithF20F25Filter(PredictionTimeFilter):
    """Drop patients with a schizophrenia or schizoaffective diagnosis on or after their timestamp.

    A patient is removed entirely when any row from the ``schizophrenia`` or
    ``schizoaffective`` loader has a timestamp greater than or equal to one
    of the patient's timestamps in the incoming frame.
    """

    def apply(self, df: pl.LazyFrame) -> pl.LazyFrame:
        """Return ``df`` without the patients matched by the rule above."""
        # Keyed by the merge suffix used for the loader's overlapping columns.
        diagnoses_by_suffix = {"_f20": schizophrenia(), "_f25": schizoaffective()}
        prediction_times = pd.DataFrame(df.collect().to_pandas())

        excluded_patients = set()
        for suffix, diagnosis_df in diagnoses_by_suffix.items():
            merged = prediction_times.merge(
                diagnosis_df, on="dw_ek_borger", how="left", suffixes=("_df", suffix)
            )
            # Patients without a match have a missing right-hand timestamp,
            # which compares as False and keeps them in the cohort.
            diagnosed_on_or_after = merged["timestamp_df"] <= merged[f"timestamp{suffix}"]
            excluded_patients.update(merged[diagnosed_on_or_after]["dw_ek_borger"].unique())

        is_excluded = prediction_times["dw_ek_borger"].isin(excluded_patients)
        remaining = prediction_times[~is_excluded]

        return pl.DataFrame(remaining).lazy()
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,8 @@ def pipe(self) -> Pipeline:

@dataclass
class ForcedAdmissionInpatientArtifactNames:
main_performance_figure: str = "fa_inpatient_main_performance_figure.png"
main_robustness_figure: str = "fa_inpatient_main_robustness.png"
main_performance_figure: str = "fa_inpatient_main_performance_figure.pdf"
main_robustness_figure: str = "fa_inpatient_main_robustness.pdf"
performance_by_ppr: str = "fa_inpatient_performance_by_ppr.xlsx"


Expand Down

0 comments on commit cb93383

Please sign in to comment.