diff --git a/src/psycop_feature_generation/loaders/raw/load_coercion.py b/src/psycop_feature_generation/loaders/raw/load_coercion.py index c1d76622..72f25d05 100644 --- a/src/psycop_feature_generation/loaders/raw/load_coercion.py +++ b/src/psycop_feature_generation/loaders/raw/load_coercion.py @@ -7,6 +7,7 @@ import pandas as pd from psycop_feature_generation.loaders.raw.sql_load import sql_load +from psycop_feature_generation.loaders.raw.utils import unpack_intervals from psycop_feature_generation.utils import data_loaders @@ -15,6 +16,8 @@ def coercion_duration( coercion_type: Optional[str] = None, reason_for_coercion: Optional[str] = None, n_rows: Optional[int] = None, + unpack_to_intervals: Optional[bool] = False, + unpack_freq: Optional[str] = "D", ) -> pd.DataFrame: """Load coercion data. By default returns entire coercion data view with duration in hours as the value column. @@ -23,16 +26,17 @@ def coercion_duration( coercion_type (str): Type of coercion, e.g. 'tvangsindlæggelse', 'bæltefiksering'. Defaults to None. # noqa: DAR102 reason_for_coercion (str): Reason for coercion, e.g. 'farlighed'. Defaults to None. n_rows: Number of rows to return. Defaults to None which returns entire coercion data view. + unpack_to_intervals: Unpack time interval to rows with set frequency (see below). Defaults to False. + unpack_freq: unpack_freq: Frequency string by which the interval will be unpacked. Default to "D" (day). For e.g., 5 hours, write "5H". 
Returns: pd.DataFrame """ + coercion_discard = """('Døraflåsning', 'Personlig afskærmning over 24 timer', 'Koordinationsplan', 'Udskrivningsaftale', 'Særlige dørlåse', 'Personlige alarm- og pejlesystemer', 'Andet' )""" - view = "[FOR_tvang_alt_hele_kohorten_inkl_2021_feb2022]" - - sql = f"SELECT dw_ek_borger, datotid_start_sei, datotid_slut_sei, varighed_timer_sei, typetekst_sei FROM [fct].{view} WHERE datotid_start_sei IS NOT NULL AND typetekst_sei NOT IN {coercion_discard}" + sql = f"SELECT dw_ek_borger, datotid_start_sei, datotid_slut_sei, varighed_timer_sei, typetekst_sei FROM [fct].[FOR_tvang_alt_hele_kohorten_inkl_2021_feb2022] WHERE datotid_start_sei IS NOT NULL AND typetekst_sei NOT IN {coercion_discard}" if coercion_type and reason_for_coercion is None: @@ -67,12 +71,22 @@ def coercion_duration( # Change NaNs to 0 df["value"].fillna(0, inplace=True) + if unpack_to_intervals: + df = unpack_intervals( + df, + starttime_col="datotid_start_sei", + endtime_col="timestamp", + unpack_freq=unpack_freq, + ) + return df[["dw_ek_borger", "timestamp", "value"]].reset_index(drop=True) def _concatenate_coercion( coercion_types_list: list[dict[str, str]], n_rows: Optional[int] = None, + unpack_to_intervals: Optional[bool] = False, + unpack_freq: Optional[str] = "D", ) -> pd.DataFrame: """Aggregate multiple types of coercion with multiple reasons into one column. @@ -80,6 +94,8 @@ def _concatenate_coercion( Args: coercion_types_list (list): list of dictionaries containing a 'coercion_type' key and a 'reason_for_coercion' key. If keys not in dicts, they are set to None # noqa: DAR102 n (int, optional): Number of rows to return. Defaults to None. + unpack_to_intervals: Unpack time interval to rows with set frequency (see below). Defaults to False. + unpack_freq: unpack_freq: Frequency string by which the interval will be unpacked. Default to "D" (day). For e.g., 5 hours, write "5H". 
# "Red papers" for involuntary admission / involuntary detention
@data_loaders.register("paa_grund_af_farlighed")
def paa_grund_af_farlighed(
    n_rows: Optional[int] = None,
    unpack_to_intervals: Optional[bool] = False,
    unpack_freq: Optional[str] = "D",
) -> pd.DataFrame:
    """Load coercion data via ``coercion_duration`` with ``reason_for_coercion="På grund af farlighed"``.

    Args:
        n_rows: Number of rows to return; passed through to ``coercion_duration``.
        unpack_to_intervals: Passed through; enables unpacking of intervals to rows.
        unpack_freq: Passed through; frequency string used when unpacking.

    Returns:
        pd.DataFrame: The frame returned by ``coercion_duration``.
    """
    return coercion_duration(
        n_rows=n_rows,
        reason_for_coercion="På grund af farlighed",
        unpack_freq=unpack_freq,
        unpack_to_intervals=unpack_to_intervals,
    )
@data_loaders.register("urolig_tilstand")
def urolig_tilstand(
    n_rows: Optional[int] = None,
    unpack_to_intervals: Optional[bool] = False,
    unpack_freq: Optional[str] = "D",
) -> pd.DataFrame:
    """Load coercion data via ``coercion_duration`` with ``reason_for_coercion="Urolig tilstand"``.

    Args:
        n_rows: Number of rows to return; passed through to ``coercion_duration``.
        unpack_to_intervals: Passed through; enables unpacking of intervals to rows.
        unpack_freq: Passed through; frequency string used when unpacking.

    Returns:
        pd.DataFrame: The frame returned by ``coercion_duration``.
    """
    return coercion_duration(
        n_rows=n_rows,
        reason_for_coercion="Urolig tilstand",
        unpack_freq=unpack_freq,
        unpack_to_intervals=unpack_to_intervals,
    )


@data_loaders.register("anden_begrundelse")
def anden_begrundelse(
    n_rows: Optional[int] = None,
    unpack_to_intervals: Optional[bool] = False,
    unpack_freq: Optional[str] = "D",
) -> pd.DataFrame:
    """Load coercion data via ``coercion_duration`` with ``reason_for_coercion="Anden begrundelse"``.

    Args:
        n_rows: Number of rows to return; passed through to ``coercion_duration``.
        unpack_to_intervals: Passed through; enables unpacking of intervals to rows.
        unpack_freq: Passed through; frequency string used when unpacking.

    Returns:
        pd.DataFrame: The frame returned by ``coercion_duration``.
    """
    return coercion_duration(
        n_rows=n_rows,
        reason_for_coercion="Anden begrundelse",
        unpack_freq=unpack_freq,
        unpack_to_intervals=unpack_to_intervals,
    )


@data_loaders.register("naerliggende_eller_vaesentlig_fare_for_patienten_eller_andre")
def naerliggende_fare(
    n_rows: Optional[int] = None,
    unpack_to_intervals: Optional[bool] = False,
    unpack_freq: Optional[str] = "D",
) -> pd.DataFrame:
    """Load coercion data via ``coercion_duration`` for the "imminent or substantial danger" reason.

    The reason string is passed verbatim, underscores included.

    Args:
        n_rows: Number of rows to return; passed through to ``coercion_duration``.
        unpack_to_intervals: Passed through; enables unpacking of intervals to rows.
        unpack_freq: Passed through; frequency string used when unpacking.

    Returns:
        pd.DataFrame: The frame returned by ``coercion_duration``.
    """
    return coercion_duration(
        n_rows=n_rows,
        reason_for_coercion="Nærliggende_eller_væsentlig_fare_for_patienten_eller_andre",
        unpack_freq=unpack_freq,
        unpack_to_intervals=unpack_to_intervals,
    )
@data_loaders.register("baelte")
def baelte(
    n_rows: Optional[int] = None,
    unpack_to_intervals: Optional[bool] = False,
    unpack_freq: Optional[str] = "D",
) -> pd.DataFrame:
    """Load coercion data via ``coercion_duration`` with ``coercion_type="Bælte"``.

    Args:
        n_rows: Number of rows to return; passed through to ``coercion_duration``.
        unpack_to_intervals: Passed through; enables unpacking of intervals to rows.
        unpack_freq: Passed through; frequency string used when unpacking.

    Returns:
        pd.DataFrame: The frame returned by ``coercion_duration``.
    """
    return coercion_duration(
        n_rows=n_rows,
        coercion_type="Bælte",
        unpack_freq=unpack_freq,
        unpack_to_intervals=unpack_to_intervals,
    )
@data_loaders.register("fastholden")
def fastholden(
    n_rows: Optional[int] = None,
    unpack_to_intervals: Optional[bool] = False,
    unpack_freq: Optional[str] = "D",
) -> pd.DataFrame:
    """Load coercion data via ``coercion_duration`` with ``coercion_type="Fastholden"``.

    Args:
        n_rows: Number of rows to return; passed through to ``coercion_duration``.
        unpack_to_intervals: Passed through; enables unpacking of intervals to rows.
        unpack_freq: Passed through; frequency string used when unpacking.

    Returns:
        pd.DataFrame: The frame returned by ``coercion_duration``.
    """
    return coercion_duration(
        n_rows=n_rows,
        coercion_type="Fastholden",
        unpack_freq=unpack_freq,
        unpack_to_intervals=unpack_to_intervals,
    )


@data_loaders.register("beroligende_medicin")
def beroligende_medicin(
    n_rows: Optional[int] = None,
    unpack_to_intervals: Optional[bool] = False,
    unpack_freq: Optional[str] = "D",
) -> pd.DataFrame:
    """Load coercion data via ``coercion_duration`` with ``coercion_type="Beroligende medicin"``.

    Args:
        n_rows: Number of rows to return; passed through to ``coercion_duration``.
        unpack_to_intervals: Passed through; enables unpacking of intervals to rows.
        unpack_freq: Passed through; frequency string used when unpacking.

    Returns:
        pd.DataFrame: The frame returned by ``coercion_duration``.
    """
    return coercion_duration(
        n_rows=n_rows,
        coercion_type="Beroligende medicin",
        unpack_freq=unpack_freq,
        unpack_to_intervals=unpack_to_intervals,
    )


@data_loaders.register("handsker")
def handsker(
    n_rows: Optional[int] = None,
    unpack_to_intervals: Optional[bool] = False,
    unpack_freq: Optional[str] = "D",
) -> pd.DataFrame:
    """Load coercion data via ``coercion_duration`` with ``coercion_type="Handsker"``.

    Args:
        n_rows: Number of rows to return; passed through to ``coercion_duration``.
        unpack_to_intervals: Passed through; enables unpacking of intervals to rows.
        unpack_freq: Passed through; frequency string used when unpacking.

    Returns:
        pd.DataFrame: The frame returned by ``coercion_duration``.
    """
    return coercion_duration(
        n_rows=n_rows,
        coercion_type="Handsker",
        unpack_freq=unpack_freq,
        unpack_to_intervals=unpack_to_intervals,
    )


@data_loaders.register("tvangsindlaeggelse")
def tvangsindlaeggelse(
    n_rows: Optional[int] = None,
    unpack_to_intervals: Optional[bool] = False,
    unpack_freq: Optional[str] = "D",
) -> pd.DataFrame:
    """Load coercion data via ``coercion_duration`` with ``coercion_type="Tvangsindlæggelse"``.

    Args:
        n_rows: Number of rows to return; passed through to ``coercion_duration``.
        unpack_to_intervals: Passed through; enables unpacking of intervals to rows.
        unpack_freq: Passed through; frequency string used when unpacking.

    Returns:
        pd.DataFrame: The frame returned by ``coercion_duration``.
    """
    return coercion_duration(
        n_rows=n_rows,
        coercion_type="Tvangsindlæggelse",
        unpack_freq=unpack_freq,
        unpack_to_intervals=unpack_to_intervals,
    )
@data_loaders.register("tvangstilbageholdelse")
def tvangstilbageholdelse(
    n_rows: Optional[int] = None,
    unpack_to_intervals: Optional[bool] = False,
    unpack_freq: Optional[str] = "D",
) -> pd.DataFrame:
    """Load coercion data via ``coercion_duration`` with ``coercion_type="Tvangstilbageholdelse"``.

    Args:
        n_rows: Number of rows to return; passed through to ``coercion_duration``.
        unpack_to_intervals: Passed through; enables unpacking of intervals to rows.
        unpack_freq: Passed through; frequency string used when unpacking.

    Returns:
        pd.DataFrame: The frame returned by ``coercion_duration``.
    """
    return coercion_duration(
        n_rows=n_rows,
        coercion_type="Tvangstilbageholdelse",
        unpack_freq=unpack_freq,
        unpack_to_intervals=unpack_to_intervals,
    )


@data_loaders.register("medicinering")
def medicinering(
    n_rows: Optional[int] = None,
    unpack_to_intervals: Optional[bool] = False,
    unpack_freq: Optional[str] = "D",
) -> pd.DataFrame:
    """Load coercion data via ``coercion_duration`` with ``coercion_type="Medicinering"``.

    Args:
        n_rows: Number of rows to return; passed through to ``coercion_duration``.
        unpack_to_intervals: Passed through; enables unpacking of intervals to rows.
        unpack_freq: Passed through; frequency string used when unpacking.

    Returns:
        pd.DataFrame: The frame returned by ``coercion_duration``.
    """
    return coercion_duration(
        n_rows=n_rows,
        coercion_type="Medicinering",
        unpack_freq=unpack_freq,
        unpack_to_intervals=unpack_to_intervals,
    )


@data_loaders.register("ect")
def ect(
    n_rows: Optional[int] = None,
    unpack_to_intervals: Optional[bool] = False,
    unpack_freq: Optional[str] = "D",
) -> pd.DataFrame:
    """Load coercion data via ``coercion_duration`` with ``coercion_type="ECT"``.

    Args:
        n_rows: Number of rows to return; passed through to ``coercion_duration``.
        unpack_to_intervals: Passed through; enables unpacking of intervals to rows.
        unpack_freq: Passed through; frequency string used when unpacking.

    Returns:
        pd.DataFrame: The frame returned by ``coercion_duration``.
    """
    return coercion_duration(
        n_rows=n_rows,
        coercion_type="ECT",
        unpack_freq=unpack_freq,
        unpack_to_intervals=unpack_to_intervals,
    )


@data_loaders.register("ernaering")
def ernaering(
    n_rows: Optional[int] = None,
    unpack_to_intervals: Optional[bool] = False,
    unpack_freq: Optional[str] = "D",
) -> pd.DataFrame:
    """Load coercion data via ``coercion_duration`` with ``coercion_type="Ernæring"``.

    Args:
        n_rows: Number of rows to return; passed through to ``coercion_duration``.
        unpack_to_intervals: Passed through; enables unpacking of intervals to rows.
        unpack_freq: Passed through; frequency string used when unpacking.

    Returns:
        pd.DataFrame: The frame returned by ``coercion_duration``.
    """
    return coercion_duration(
        n_rows=n_rows,
        coercion_type="Ernæring",
        unpack_freq=unpack_freq,
        unpack_to_intervals=unpack_to_intervals,
    )
def unpack_intervals(
    df: pd.DataFrame,
    starttime_col: str = "datotid_start_sei",
    endtime_col: str = "timestamp",
    entity_id: str = "dw_ek_borger",
    unpack_freq: str = "D",
) -> pd.DataFrame:
    """Unpack each time interval into one row per ``unpack_freq`` step.

    For every input row, a row is emitted at the start time and at every
    ``unpack_freq`` step after it that does not exceed the end time, plus one
    row at the exact end time. In-between rows therefore keep the start's
    time of day (e.g. start 15:00 with "D" gives 15:00 on each following day).
    When the last step coincides with the end time, the duplicate row is
    dropped.

    The input dataframe is not modified.

    Args:
        df (pd.DataFrame): Dataframe with the interval start and end in
            separate columns.
        starttime_col (str, optional): Name of column with start time.
            Defaults to "datotid_start_sei".
        endtime_col (str, optional): Name of column with end time.
            Defaults to "timestamp".
        entity_id (str, optional): Name of the entity id column. It is used
            as the first sort key and kept in the output.
            Defaults to "dw_ek_borger".
        unpack_freq (str, optional): Frequency string by which the interval
            is unpacked. Defaults to "D" (day). For e.g. 5 hours, write "5H".

    Returns:
        pd.DataFrame: Columns ``[entity_id, "timestamp", "value"]`` sorted by
            entity, interval start and timestamp, with a fresh 0..n-1 index.
            ``value`` is always 1, since the original duration has no meaning
            once it would be repeated on several rows per interval.
    """
    # An empty frame has nothing to unpack; return the output schema directly
    # instead of pushing empty data through explode/concat.
    if df.empty:
        empty_result = df[[entity_id]].copy()
        empty_result["timestamp"] = pd.to_datetime(df[endtime_col])
        empty_result["value"] = 1
        return empty_result.reset_index(drop=True)

    # Work on a copy so the caller's dataframe never gains a "date_range" column.
    intervals = df.copy()

    # One extra row per interval carrying the exact end time.
    end_rows = intervals.copy()
    end_rows["date_range"] = end_rows[endtime_col]

    # One DatetimeIndex per interval. dtype=object keeps each index as a single
    # cell so that explode() can fan it out into rows.
    intervals["date_range"] = pd.Series(
        [
            pd.date_range(start=start, end=end, freq=unpack_freq)
            for start, end in zip(intervals[starttime_col], intervals[endtime_col])
        ],
        index=intervals.index,
        dtype=object,
    )

    # One row per timestamp in each range.
    unpacked = intervals.explode("date_range")

    unpacked = pd.concat([unpacked, end_rows], ignore_index=True)

    # explode() may leave an object column; normalise so sorting and
    # de-duplication compare real datetimes.
    unpacked["date_range"] = pd.to_datetime(unpacked["date_range"])

    unpacked = (
        unpacked.sort_values([entity_id, starttime_col, "date_range"])
        # Removes the extra end row when the last step already equals the end time.
        .drop_duplicates(keep="first")
        .reset_index(drop=True)
    )

    # Duration is meaningless per unpacked row, so value becomes a plain marker.
    unpacked["value"] = 1

    return unpacked[[entity_id, "date_range", "value"]].rename(
        columns={"date_range": "timestamp"},
    )
def test_unpack_intervals_to_5Hfreq():
    """Check unpacking of three intervals at a 5-hour frequency.

    Cases, by ``dw_ek_borger``:
        1: interval < 5 hours
        2: interval = 5 hours
        3: interval > 5 hours
    """
    intervals_csv = """dw_ek_borger,timestamp_start,timestamp_end,value
    1,2021-01-01 12:00:00,2021-01-01 13:00:00,1.0
    2,2021-02-02 15:00:00,2021-02-02 20:00:00,5.0
    3,2021-03-04 16:00:00,2021-03-05 4:00:00,12.0
    """

    expected_csv = """dw_ek_borger,timestamp,value
    1,2021-01-01 12:00:00,1
    1,2021-01-01 13:00:00,1
    2,2021-02-02 15:00:00,1
    2,2021-02-02 20:00:00,1
    3,2021-03-04 16:00:00,1
    3,2021-03-04 21:00:00,1
    3,2021-03-05 02:00:00,1
    3,2021-03-05 4:00:00,1
    """

    intervals = str_to_df(intervals_csv, convert_str_to_float=False)
    expected = str_to_df(expected_csv, convert_str_to_float=False)

    unpacked = unpack_intervals(
        intervals,
        starttime_col="timestamp_start",
        endtime_col="timestamp_end",
        unpack_freq="5H",
    )

    # Compare column by column, driven by the columns of the unpacked result.
    for column_name in unpacked.columns:
        pd.testing.assert_series_equal(
            unpacked[column_name],
            expected[column_name],
        )