diff --git a/psycop/common/feature_generation/loaders/raw/load_diagnoses.py b/psycop/common/feature_generation/loaders/raw/load_diagnoses.py index f82fdfd3f..85bb6f9a2 100644 --- a/psycop/common/feature_generation/loaders/raw/load_diagnoses.py +++ b/psycop/common/feature_generation/loaders/raw/load_diagnoses.py @@ -25,6 +25,7 @@ def from_contacts( icd_code: list[str] | str, output_col_name: str = "value", + code_col_name: str = "diagnosegruppestreng", n_rows: int | None = None, wildcard_icd_code: bool = False, shak_location_col: str | None = None, @@ -40,6 +41,7 @@ def from_contacts( Args: icd_code (str): Substring to match diagnoses for. Matches any diagnoses, whether a-diagnosis, b-diagnosis etc. # noqa: DAR102 output_col_name (str, optional): Name of new column string. Defaults to "value". + code_col_name (str, optional): Name of column in loaded data frame from which to extract the diagnosis codes. Defaults to "diagnosegruppestreng". n_rows: Number of rows to return. Defaults to None. wildcard_icd_code (bool, optional): Whether to match on icd_code*. Defaults to False. shak_location_col (str, optional): Name of column containing shak code. Defaults to None. For diagnosis loaders, this column is "shakkode_ansvarlig". Combine with shak_code and shak_sql_operator. 
@@ -75,7 +77,7 @@ def from_contacts(
 
     df = load_from_codes(
         codes_to_match=icd_code,
-        code_col_name="diagnosegruppestreng",
+        code_col_name=code_col_name,
         source_timestamp_col_name=source_timestamp_col_name,
         view="FOR_kohorte_indhold_pt_journal_psyk_somatik_inkl_2021_feb2022",
         output_col_name=output_col_name,
@@ -771,6 +773,26 @@ def manic_and_bipolar(
     )
 
 
+@data_loaders.register("bipolar")
+def bipolar(
+    n_rows: int | None = None,
+    shak_location_col: str | None = None,
+    shak_code: int | None = None,
+    shak_sql_operator: str | None = None,
+    timestamp_purpose: Literal["predictor", "outcome"] | None = "predictor",
+) -> pd.DataFrame:
+    """Load contacts with any diagnosis code starting with "f31"."""
+    return from_contacts(
+        icd_code=["f31"],
+        wildcard_icd_code=True,
+        n_rows=n_rows,
+        shak_location_col=shak_location_col,
+        shak_code=shak_code,
+        shak_sql_operator=shak_sql_operator,
+        timestamp_purpose=timestamp_purpose,
+    )
+
+
 @data_loaders.register("depressive_disorders")
 def depressive_disorders(
     n_rows: int | None = None,
@@ -1333,3 +1355,24 @@ def gerd(
         shak_sql_operator=shak_sql_operator,
         timestamp_purpose=timestamp_purpose,
     )
+
+
+@data_loaders.register("bipolar_a_diagnosis")
+def bipolar_a_diagnosis(
+    n_rows: int | None = None,
+    shak_location_col: str | None = None,
+    shak_code: int | None = None,
+    shak_sql_operator: str | None = None,
+    timestamp_purpose: Literal["predictor", "outcome"] | None = "predictor",
+) -> pd.DataFrame:
+    """Load contacts whose "adiagnose4kar" code starts with "f31"."""
+    return from_contacts(
+        icd_code=["f31"],
+        code_col_name="adiagnose4kar",
+        wildcard_icd_code=True,
+        n_rows=n_rows,
+        shak_location_col=shak_location_col,
+        shak_code=shak_code,
+        shak_sql_operator=shak_sql_operator,
+        timestamp_purpose=timestamp_purpose,
+    )
diff --git a/psycop/projects/bipolar/cohort_definition/bipolar_cohort_definition.py b/psycop/projects/bipolar/cohort_definition/bipolar_cohort_definition.py
new file mode 100644
index 000000000..54f0f776e
--- /dev/null
+++ b/psycop/projects/bipolar/cohort_definition/bipolar_cohort_definition.py
@@ -0,0 +1,130 @@
+import pandas as pd
+import polars as pl
+from tqdm import tqdm
+
+from psycop.common.cohort_definition import (
+    CohortDefiner,
+    FilteredPredictionTimeBundle,
+    filter_prediction_times,
+)
+from psycop.common.feature_generation.loaders.raw.load_visits import (
+    get_time_of_first_visit_to_psychiatry,
+)
+from psycop.projects.bipolar.cohort_definition.diagnosis_specification.first_bipolar_diagnosis import (
+    get_first_bipolar_diagnosis,
+)
+from psycop.projects.bipolar.cohort_definition.eligible_data.single_filters import (
+    BipolarMinAgeFilter,
+    BipolarMinDateFilter,
+    BipolarPatientsWithF20F25Filter,
+    BipolarWashoutMove,
+)
+
+
+def generate_timestamps(
+    first_visit_date: pd.Timestamp, diagnosis_date: pd.Timestamp, interval_days: int = 30
+) -> list[pd.Timestamp]:
+    """Step backwards from the diagnosis date in fixed intervals.
+
+    Args:
+        first_visit_date: Stepping stops once the current timestamp is no more than
+            one interval after this date.
+        diagnosis_date: Starting point; always the last element of the result.
+        interval_days: Step size in days. Must be positive. Defaults to 30.
+
+    Returns:
+        Timestamps in ascending order, ending with diagnosis_date.
+
+    Raises:
+        ValueError: If interval_days is not positive, since stepping would never end.
+    """
+    if interval_days <= 0:
+        raise ValueError(f"interval_days must be positive, got {interval_days}")
+
+    step = pd.Timedelta(days=interval_days)
+    earliest_start = first_visit_date + step
+
+    timestamps = [diagnosis_date]
+    current_date = diagnosis_date
+    while current_date > earliest_start:
+        current_date -= step
+        timestamps.append(current_date)
+    return timestamps[::-1]
+
+
+class BipolarCohortDefiner(CohortDefiner):
+    """Build prediction times leading up to each patient's first bipolar diagnosis."""
+
+    @staticmethod
+    def get_bipolar_cohort(interval_days: int = 30) -> FilteredPredictionTimeBundle:
+        """Filter first bipolar diagnoses and expand each into spaced prediction times.
+
+        Args:
+            interval_days: Days between the generated prediction times. Defaults to 30.
+
+        Returns:
+            The bundle from filter_prediction_times, with its prediction-time frame
+            replaced by one row per generated (patient, timestamp) pair.
+        """
+        bipolar_diagnosis_timestamps = pl.from_pandas(get_first_bipolar_diagnosis())
+
+        filtered_bipolar_diagnosis_timestamps = filter_prediction_times(
+            prediction_times=bipolar_diagnosis_timestamps.lazy(),
+            filtering_steps=(
+                BipolarMinDateFilter(),
+                BipolarMinAgeFilter(),
+                BipolarWashoutMove(),
+                BipolarPatientsWithF20F25Filter(),
+            ),
+            entity_id_col_name="dw_ek_borger",
+        )
+
+        diagnosis_df = pd.DataFrame(
+            filtered_bipolar_diagnosis_timestamps.prediction_times.frame.to_pandas()
+        )
+        first_visits_to_psychiatry = pd.DataFrame(
+            get_time_of_first_visit_to_psychiatry().to_pandas()
+        )
+
+        diagnosis_df = diagnosis_df.merge(
+            first_visits_to_psychiatry,
+            on="dw_ek_borger",
+            how="left",
+            suffixes=(None, "_first_visit"),
+        )
+
+        # Drop patients without a first visit, and diagnoses dated before that visit.
+        diagnosis_df = diagnosis_df.dropna(subset=["timestamp_first_visit"])
+        diagnosis_df = diagnosis_df[
+            diagnosis_df["timestamp"] >= diagnosis_df["timestamp_first_visit"]
+        ]
+
+        # Iterate the three needed columns directly; cheaper than iterrows() and gives
+        # tqdm a total to report progress against.
+        rows = zip(
+            diagnosis_df["dw_ek_borger"],
+            diagnosis_df["timestamp_first_visit"],
+            diagnosis_df["timestamp"],
+        )
+        timestamps_per_patient = []
+        for patient_id, first_visit, diagnosis in tqdm(rows, total=len(diagnosis_df)):
+            timestamps_per_patient.extend(
+                (patient_id, timestamp)
+                for timestamp in generate_timestamps(
+                    first_visit, diagnosis, interval_days=interval_days
+                )
+            )
+
+        # NOTE(review): the id column is named "patient_id" although the bundle was built
+        # with entity_id_col_name="dw_ek_borger" - confirm which name consumers expect.
+        result_df = pd.DataFrame(timestamps_per_patient, columns=["patient_id", "timestamp"])
+
+        # The frame is read via .to_pandas() above, so it is a polars frame; convert back
+        # rather than storing a pandas frame in its place.
+        filtered_bipolar_diagnosis_timestamps.prediction_times.frame = pl.from_pandas(result_df)
+
+        return filtered_bipolar_diagnosis_timestamps
+
+
+if __name__ == "__main__":
+    df = BipolarCohortDefiner.get_bipolar_cohort()
diff --git a/psycop/projects/bipolar/cohort_definition/diagnosis_specification/first_bipolar_diagnosis.py b/psycop/projects/bipolar/cohort_definition/diagnosis_specification/first_bipolar_diagnosis.py
new file mode 100644
index 000000000..3e2523970
--- /dev/null
+++ b/psycop/projects/bipolar/cohort_definition/diagnosis_specification/first_bipolar_diagnosis.py
@@ -0,0 +1,21 @@
+import pandas as pd
+import polars as pl
+
+from psycop.common.feature_generation.loaders.raw.load_diagnoses import bipolar_a_diagnosis
+
+
+def get_first_bipolar_diagnosis() -> pd.DataFrame:
+    """Return the earliest bipolar A-diagnosis timestamp for each patient.
+
+    Returns:
+        A pandas data frame with the columns "dw_ek_borger" and "timestamp".
+    """
+    diagnoses = pl.from_pandas(bipolar_a_diagnosis())
+
+    first_bipolar = diagnoses.sort("timestamp").groupby("dw_ek_borger").first()
+
+    return first_bipolar.to_pandas()[["dw_ek_borger", "timestamp"]]
+
+
+if __name__ == "__main__":
+    df = get_first_bipolar_diagnosis()
diff --git a/psycop/projects/bipolar/cohort_definition/eligible_data/__init__.py b/psycop/projects/bipolar/cohort_definition/eligible_data/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/psycop/projects/bipolar/cohort_definition/eligible_data/add_age.py b/psycop/projects/bipolar/cohort_definition/eligible_data/add_age.py
new file mode 100644
index 000000000..45a67d0f0
--- /dev/null
+++ b/psycop/projects/bipolar/cohort_definition/eligible_data/add_age.py
@@ -0,0 +1,22 @@
+import polars as pl
+
+from psycop.common.feature_generation.loaders.raw.load_demographic import birthdays
+from psycop.projects.bipolar.cohort_definition.eligible_data.eligible_config import AGE_COL_NAME
+
+
+def add_age(df: pl.DataFrame) -> pl.DataFrame:
+    """Add each row's age in years at its timestamp.
+
+    Args:
+        df: Frame with "dw_ek_borger" and "timestamp" columns.
+
+    Returns:
+        df inner-joined with the birthday data on "dw_ek_borger", plus an AGE_COL_NAME
+        column holding the days since "date_of_birth" divided by 365.25.
+    """
+    birthday_df = pl.from_pandas(birthdays())
+
+    df = df.join(birthday_df, on="dw_ek_borger", how="inner")
+    age_in_days = (pl.col("timestamp") - pl.col("date_of_birth")).dt.days()
+
+    return df.with_columns((age_in_days / 365.25).alias(AGE_COL_NAME))
diff --git a/psycop/projects/bipolar/cohort_definition/eligible_data/eligible_config.py b/psycop/projects/bipolar/cohort_definition/eligible_data/eligible_config.py
new file mode 100644
index 000000000..fe314c133
--- /dev/null
+++ b/psycop/projects/bipolar/cohort_definition/eligible_data/eligible_config.py
@@ -0,0 +1,5 @@
+from datetime import datetime
+
+AGE_COL_NAME = "age"
+MIN_AGE = 18
+MIN_DATE = datetime(year=2013, month=1, day=1)
diff --git a/psycop/projects/bipolar/cohort_definition/eligible_data/single_filters.py b/psycop/projects/bipolar/cohort_definition/eligible_data/single_filters.py
new file mode 100644
index 000000000..19753396b
--- /dev/null
+++ b/psycop/projects/bipolar/cohort_definition/eligible_data/single_filters.py
@@ -0,0 +1,76 @@
+import pandas as pd
+import polars as pl
+
+from psycop.common.cohort_definition import PredictionTimeFilter
+from psycop.common.feature_generation.loaders.raw.load_diagnoses import (
+    schizoaffective,
+    schizophrenia,
+)
+from psycop.common.feature_generation.loaders.raw.load_moves import MoveIntoRMBaselineLoader
+from psycop.common.model_training_v2.trainer.preprocessing.steps.row_filter_other import (
+    QuarantineFilter,
+)
+from psycop.projects.bipolar.cohort_definition.eligible_data.add_age import add_age
+from psycop.projects.bipolar.cohort_definition.eligible_data.eligible_config import (
+    AGE_COL_NAME,
+    MIN_AGE,
+    MIN_DATE,
+)
+
+
+class BipolarMinDateFilter(PredictionTimeFilter):
+    """Keep prediction times strictly after MIN_DATE."""
+
+    def apply(self, df: pl.LazyFrame) -> pl.LazyFrame:
+        return df.filter(pl.col("timestamp") > MIN_DATE)
+
+
+class BipolarMinAgeFilter(PredictionTimeFilter):
+    """Keep prediction times where the patient's age is at least MIN_AGE."""
+
+    def apply(self, df: pl.LazyFrame) -> pl.LazyFrame:
+        # add_age takes an eager frame, so the lazy query is collected here.
+        with_age = add_age(df.collect()).lazy()
+        return with_age.filter(pl.col(AGE_COL_NAME) >= MIN_AGE)
+
+
+class BipolarWashoutMove(PredictionTimeFilter):
+    """Apply a 730-day QuarantineFilter fed by MoveIntoRMBaselineLoader."""
+
+    def apply(self, df: pl.LazyFrame) -> pl.LazyFrame:
+        # NOTE(review): presumably drops prediction times within two years after a move
+        # into the region - confirm against QuarantineFilter.
+        return QuarantineFilter(
+            entity_id_col_name="dw_ek_borger",
+            quarantine_timestamps_loader=MoveIntoRMBaselineLoader(),
+            quarantine_interval_days=730,
+            timestamp_col_name="timestamp",
+        ).apply(df)
+
+
+def _patients_with_later_diagnosis(
+    prediction_times: pd.DataFrame, diagnoses: pd.DataFrame
+) -> set:
+    """Return ids of patients with a diagnosis at or after one of their prediction times."""
+    merged = prediction_times.merge(
+        diagnoses, on="dw_ek_borger", how="left", suffixes=("_pred", "_diag")
+    )
+    is_later = merged["timestamp_pred"] <= merged["timestamp_diag"]
+    return set(merged.loc[is_later, "dw_ek_borger"])
+
+
+class BipolarPatientsWithF20F25Filter(PredictionTimeFilter):
+    """Drop patients diagnosed with F20 or F25 at or after a prediction time."""
+
+    def apply(self, df: pl.LazyFrame) -> pl.LazyFrame:
+        f20_df = schizophrenia()
+        f25_df = schizoaffective()
+        pd_df = pd.DataFrame(df.collect().to_pandas())
+
+        excluded_patients = set()
+        for diagnoses in (f20_df, f25_df):
+            excluded_patients |= _patients_with_later_diagnosis(pd_df, diagnoses)
+
+        filtered_df = pd_df[~pd_df["dw_ek_borger"].isin(excluded_patients)]
+
+        return pl.from_pandas(filtered_df).lazy()
diff --git a/psycop/projects/forced_admission_inpatient/utils/pipeline_objects.py b/psycop/projects/forced_admission_inpatient/utils/pipeline_objects.py
index 4056eb981..feeba4478 100644
--- a/psycop/projects/forced_admission_inpatient/utils/pipeline_objects.py
+++ b/psycop/projects/forced_admission_inpatient/utils/pipeline_objects.py
@@ -137,8 +137,8 @@ def pipe(self) -> Pipeline:
 
 @dataclass
 class ForcedAdmissionInpatientArtifactNames:
-    main_performance_figure: str = "fa_inpatient_main_performance_figure.png"
-    main_robustness_figure: str = "fa_inpatient_main_robustness.png"
+    main_performance_figure: str = "fa_inpatient_main_performance_figure.pdf"
+    main_robustness_figure: str = "fa_inpatient_main_robustness.pdf"
     performance_by_ppr: str = "fa_inpatient_performance_by_ppr.xlsx"