From 2527fbe1561698a17f78e4b602527c55257233f9 Mon Sep 17 00:00:00 2001 From: signekb Date: Fri, 10 Mar 2023 14:22:44 +0100 Subject: [PATCH 1/8] unpack to days function and implement in coercion loader --- .../loaders/raw/load_coercion.py | 120 ++++++++++++++---- .../loaders/raw/utils.py | 60 +++++++++ 2 files changed, 155 insertions(+), 25 deletions(-) diff --git a/src/psycop_feature_generation/loaders/raw/load_coercion.py b/src/psycop_feature_generation/loaders/raw/load_coercion.py index 49f74ae9..5442d0e8 100644 --- a/src/psycop_feature_generation/loaders/raw/load_coercion.py +++ b/src/psycop_feature_generation/loaders/raw/load_coercion.py @@ -7,6 +7,7 @@ import pandas as pd from psycop_feature_generation.loaders.raw.sql_load import sql_load +from psycop_feature_generation.loaders.raw.utils import unpack_intervals_to_days from psycop_feature_generation.utils import data_loaders @@ -15,6 +16,7 @@ def coercion_duration( coercion_type: Optional[str] = None, reason_for_coercion: Optional[str] = None, n_rows: Optional[int] = None, + unpack_to_days: Optional[bool] = False, ) -> pd.DataFrame: """Load coercion data. By default returns entire coercion data view with duration in hours as the value column. @@ -23,16 +25,16 @@ def coercion_duration( coercion_type (str): Type of coercion, e.g. 'tvangsindlæggelse', 'bæltefiksering'. Defaults to None. # noqa: DAR102 reason_for_coercion (str): Reason for coercion, e.g. 'farlighed'. Defaults to None. n_rows: Number of rows to return. Defaults to None which returns entire coercion data view. + unpack_to_days: if we want to create features about the "current state" of the coercion instance, we need to unpack the start and end times to one row per day in the interval. Defaults to False. 
Returns: pd.DataFrame """ + coercion_discard = """('Døraflåsning', 'Personlig afskærmning over 24 timer', 'Koordinationsplan', 'Udskrivningsaftale', 'Særlige dørlåse', 'Personlige alarm- og pejlesystemer', 'Andet' )""" - view = "[FOR_tvang_alt_hele_kohorten_inkl_2021]" - - sql = f"SELECT dw_ek_borger, datotid_start_sei, datotid_slut_sei, varighed_timer_sei, typetekst_sei FROM [fct].{view} WHERE datotid_start_sei IS NOT NULL AND typetekst_sei NOT IN {coercion_discard}" + sql = f"SELECT dw_ek_borger, datotid_start_sei, datotid_slut_sei, varighed_timer_sei, typetekst_sei FROM [fct].[FOR_tvang_alt_hele_kohorten_inkl_2021_feb2022] WHERE datotid_start_sei IS NOT NULL AND typetekst_sei NOT IN {coercion_discard}" if coercion_type and reason_for_coercion is None: @@ -67,12 +69,18 @@ def coercion_duration( # Change NaNs to 0 df["value"].fillna(0, inplace=True) + if unpack_to_days: + df = unpack_intervals_to_days( + df, starttime_col="datotid_start_sei", endtime_col="timestamp" + ) + return df[["dw_ek_borger", "timestamp", "value"]].reset_index(drop=True) def _concatenate_coercion( coercion_types_list: list[dict[str, str]], n_rows: Optional[int] = None, + unpack_to_days: Optional[bool] = False, ) -> pd.DataFrame: """Aggregate multiple types of coercion with multiple reasons into one column. @@ -80,6 +88,7 @@ def _concatenate_coercion( Args: coercion_types_list (list): list of dictionaries containing a 'coercion_type' key and a 'reason_for_coercion' key. If keys not in dicts, they are set to None # noqa: DAR102 n (int, optional): Number of rows to return. Defaults to None. + unpack_to_days: if we want to create features about the "current state" of the coercion instance, we need to unpack the start and end times to one row per day in the interval. Defaults to False. 
Returns: pd.DataFrame @@ -100,6 +109,7 @@ def _concatenate_coercion( coercion_type=d["coercion_type"], reason_for_coercion=d["reason_for_coercion"], n_rows=n_rows, + unpack_to_days=unpack_to_days, ) for d in coercion_types_list ] @@ -111,7 +121,9 @@ def _concatenate_coercion( @data_loaders.register("farlighed") -def farlighed(n_rows: Optional[int] = None) -> pd.DataFrame: +def farlighed( + n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False +) -> pd.DataFrame: coercion_types_list = [ { "reason_for_coercion": "Farlighed", @@ -124,55 +136,73 @@ def farlighed(n_rows: Optional[int] = None) -> pd.DataFrame: return _concatenate_coercion( coercion_types_list=coercion_types_list, n_rows=n_rows, + unpack_to_days=unpack_to_days, ) -# Røde papir ved tvangsindlæggelse/tvangstilbageholdelse +# Røde papirer ved tvangsindlæggelse/tvangstilbageholdelse @data_loaders.register("paa_grund_af_farlighed") -def paa_grund_af_farlighed(n_rows: Optional[int] = None) -> pd.DataFrame: +def paa_grund_af_farlighed( + n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False +) -> pd.DataFrame: return coercion_duration( reason_for_coercion="På grund af farlighed", n_rows=n_rows, + unpack_to_days=unpack_to_days, ) -# Gule papir ved tvangsindlæggelse/tvangstilbageholdelse +# Gule papirer ved tvangsindlæggelse/tvangstilbageholdelse @data_loaders.register("af_helbredsmaessige_grunde") -def af_helbredsmaessige_grunde(n_rows: Optional[int] = None) -> pd.DataFrame: +def af_helbredsmaessige_grunde( + n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False +) -> pd.DataFrame: return coercion_duration( reason_for_coercion=" Af helbredsmæssige grunde", n_rows=n_rows, + unpack_to_days=unpack_to_days, ) @data_loaders.register("urolig_tilstand") -def urolig_tilstand(n_rows: Optional[int] = None) -> pd.DataFrame: +def urolig_tilstand( + n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False +) -> pd.DataFrame: return coercion_duration( 
reason_for_coercion="Urolig tilstand", n_rows=n_rows, + unpack_to_days=unpack_to_days, ) @data_loaders.register("anden_begrundelse") -def anden_begrundelse(n_rows: Optional[int] = None) -> pd.DataFrame: +def anden_begrundelse( + n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False +) -> pd.DataFrame: return coercion_duration( reason_for_coercion="Anden begrundelse", n_rows=n_rows, + unpack_to_days=unpack_to_days, ) @data_loaders.register("naerliggende_eller_vaesentlig_fare_for_patienten_eller_andre") -def naerliggende_fare(n_rows: Optional[int] = None) -> pd.DataFrame: +def naerliggende_fare( + n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False +) -> pd.DataFrame: return coercion_duration( reason_for_coercion="Nærliggende_eller_væsentlig_fare_for_patienten_eller_andre", n_rows=n_rows, + unpack_to_days=unpack_to_days, ) # GENERAL TYPE (tabeltekst) ### # frihedsberøvelser @data_loaders.register("skema_1") -def skema_1(n_rows: Optional[int] = None) -> pd.DataFrame: +def skema_1( + n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False +) -> pd.DataFrame: coercion_types_list = [ { "coercion_type": "Tvangsindlæggelse", @@ -185,12 +215,15 @@ def skema_1(n_rows: Optional[int] = None) -> pd.DataFrame: return _concatenate_coercion( coercion_types_list=coercion_types_list, n_rows=n_rows, + unpack_to_days=unpack_to_days, ) # tvangsbehandlinger @data_loaders.register("skema_2") -def skema_2(n_rows: Optional[int] = None) -> pd.DataFrame: +def skema_2( + n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False +) -> pd.DataFrame: coercion_types_list = [ { "coercion_type": "Af legemlig lidelse", @@ -209,12 +242,15 @@ def skema_2(n_rows: Optional[int] = None) -> pd.DataFrame: return _concatenate_coercion( coercion_types_list=coercion_types_list, n_rows=n_rows, + unpack_to_days=unpack_to_days, ) # magtanvendelse @data_loaders.register("skema_3") -def skema_3(n_rows: Optional[int] = None) -> pd.DataFrame: +def skema_3( + 
n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False +) -> pd.DataFrame: coercion_types_list = [ { "coercion_type": "Bælte", @@ -238,6 +274,7 @@ def skema_3(n_rows: Optional[int] = None) -> pd.DataFrame: return _concatenate_coercion( coercion_types_list=coercion_types_list, n_rows=n_rows, + unpack_to_days=unpack_to_days, ) @@ -246,88 +283,121 @@ def skema_3(n_rows: Optional[int] = None) -> pd.DataFrame: @data_loaders.register("baelte") -def baelte(n_rows: Optional[int] = None) -> pd.DataFrame: +def baelte( + n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False +) -> pd.DataFrame: return coercion_duration( coercion_type="Bælte", n_rows=n_rows, + unpack_to_days=unpack_to_days, ) @data_loaders.register("remme") -def remme(n_rows: Optional[int] = None) -> pd.DataFrame: +def remme( + n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False +) -> pd.DataFrame: return coercion_duration( coercion_type="Remme", n_rows=n_rows, + unpack_to_days=unpack_to_days, ) @data_loaders.register("fastholden") -def fastholden(n_rows: Optional[int] = None) -> pd.DataFrame: +def fastholden( + n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False +) -> pd.DataFrame: return coercion_duration( coercion_type="Fastholden", n_rows=n_rows, + unpack_to_days=unpack_to_days, ) @data_loaders.register("beroligende_medicin") -def beroligende_medicin(n_rows: Optional[int] = None) -> pd.DataFrame: +def beroligende_medicin( + n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False +) -> pd.DataFrame: return coercion_duration( coercion_type="Beroligende medicin", n_rows=n_rows, + unpack_to_days=unpack_to_days, ) @data_loaders.register("handsker") -def handsker(n_rows: Optional[int] = None) -> pd.DataFrame: +def handsker( + n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False +) -> pd.DataFrame: return coercion_duration( coercion_type="Handsker", n_rows=n_rows, + unpack_to_days=unpack_to_days, ) 
@data_loaders.register("tvangsindlaeggelse") -def tvangsindlaeggelse(n_rows: Optional[int] = None) -> pd.DataFrame: +def tvangsindlaeggelse( + n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False +) -> pd.DataFrame: return coercion_duration( coercion_type="Tvangsindlæggelse", n_rows=n_rows, + unpack_to_days=unpack_to_days, ) @data_loaders.register("tvangstilbageholdelse") -def tvangstilbageholdelse(n_rows: Optional[int] = None) -> pd.DataFrame: +def tvangstilbageholdelse( + n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False +) -> pd.DataFrame: return coercion_duration( coercion_type="Tvangstilbageholdelse", n_rows=n_rows, + unpack_to_days=unpack_to_days, ) @data_loaders.register("medicinering") -def medicinering(n_rows: Optional[int] = None) -> pd.DataFrame: +def medicinering( + n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False +) -> pd.DataFrame: return coercion_duration( coercion_type="Medicinering", n_rows=n_rows, + unpack_to_days=unpack_to_days, ) @data_loaders.register("ect") -def ect(n_rows: Optional[int] = None) -> pd.DataFrame: +def ect( + n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False +) -> pd.DataFrame: return coercion_duration( coercion_type="ECT", n_rows=n_rows, + unpack_to_days=unpack_to_days, ) @data_loaders.register("ernaering") -def ernaering(n_rows: Optional[int] = None) -> pd.DataFrame: +def ernaering( + n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False +) -> pd.DataFrame: return coercion_duration( coercion_type="Ernæring", n_rows=n_rows, + unpack_to_days=unpack_to_days, ) @data_loaders.register("af_legemlig_lidelse") -def af_legemlig_lidelse(n_rows: Optional[int] = None) -> pd.DataFrame: +def af_legemlig_lidelse( + n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False +) -> pd.DataFrame: return coercion_duration( coercion_type="Af legemlig lidelse", n_rows=n_rows, + unpack_to_days=unpack_to_days, ) diff --git 
a/src/psycop_feature_generation/loaders/raw/utils.py b/src/psycop_feature_generation/loaders/raw/utils.py index a17e80ab..e33f12b3 100644 --- a/src/psycop_feature_generation/loaders/raw/utils.py +++ b/src/psycop_feature_generation/loaders/raw/utils.py @@ -271,3 +271,63 @@ def load_from_codes( source_timestamp_col_name: "timestamp", }, ) + + +def unpack_intervals_to_days( + df: pd.DataFrame, + starttime_col: str = "datotid_start_sei", + endtime_col: str = "timestamp", +) -> pd.DataFrame: + """Transform df with starttime_col and endtime_col to day grain (one row per day in the interval starttime_col-endtime_col). + First and last day will have the specific start and end time, while days in between will be 00:00:00. + + Args: + df (pd.DataFrame): dataframe with time interval in separate columns. + starttime_col (str, optional): Name of column with start time. Defaults to "datotid_start_sei". + endtime_col (str, optional): Name of column with end time. Defaults to "timestamp". + + Returns: + pd.DataFrame: Dataframe with time interval unpacked to day grain. 
+ + """ + + # create rows with start and end time + df_start_rows, df_end_rows = df.copy(), df.copy() + df_start_rows["date_range"] = df_start_rows[f"{starttime_col}"] + df_end_rows["date_range"] = df_end_rows[f"{endtime_col}"] + + # create a date range column between start date and end date for each visit/admission/coercion instance + df["date_range"] = df.apply( + lambda x: pd.date_range( + start=x[f"{starttime_col}"].date() + pd.DateOffset(1), + end=x[f"{endtime_col}"].date(), + ), + axis=1, + ) + + # explode the date range column to create a new row for each date in the range + df = df.explode("date_range") + + # remove na's (produced when start date = end date) + df = df[df["date_range"].notnull()] + + # concat df with start and end time rows + df = pd.concat([df, df_start_rows, df_end_rows], ignore_index=True).sort_values( + ["dw_ek_borger", f"{starttime_col}", "date_range"] + ) + + # drop duplicates (when start or end time = 00:00:00) + df = df.drop_duplicates(keep="first") + + # reset index + df = df.reset_index(drop=True) + + # set value to 1 (duration has lost meaning now, since duration are repeated multiple times per visit/admission/coercion instance now) + df["value"] = 1 + + # only keep relevant columns and rename date_range to timestamp + df = df[["dw_ek_borger", "date_range", "value"]].rename( + columns={"date_range": "timestamp"} + ) + + return df From e4c22c95a0ca173cecdc6f7bffa7e24ab0ce0bff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= <40836345+signekb@users.noreply.github.com> Date: Fri, 10 Mar 2023 16:29:43 +0100 Subject: [PATCH 2/8] Create test_unpack_interval_to_days.py --- .../test_unpack_interval_to_days.py | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 tests/test_loaders/test_unpack_interval_to_days.py diff --git a/tests/test_loaders/test_unpack_interval_to_days.py b/tests/test_loaders/test_unpack_interval_to_days.py new file mode 100644 index 00000000..8aaf411c --- /dev/null +++ 
b/tests/test_loaders/test_unpack_interval_to_days.py @@ -0,0 +1,66 @@ +"""Test of unpack intervals to days""" + +import pandas as pd + +from psycop_feature_generation.loaders.raw.utils import ( + unpack_intervals_to_days, +) +from psycop_feature_generation.utils_for_testing import ( + str_to_df, +) + + +def test_unpack_intervals_to_days(): + df_str = """dw_ek_borger,datotid_start,datotid_slut,value + 1,2021-01-01 12:00:00,2021-01-01 13:00:00,1.0 + 2,2021-02-02 00:00:00,2021-02-02 14:00:00,10.0 + 3,2021-03-03 15:00:00,2021-03-05 15:30:00,48.5 + 4,2021-04-04 00:00:00,2021-04-06 16:00:00,16.0 + 5,2021-05-05 17:30:00,2021-05-07 00:00:00,30.5 + 6,2021-06-06 00:00:00,2021-06-09 00:00:00,72.0 + """ + + expected_df_str = """dw_ek_borger,timestamp,value + 1,2021-01-01 12:00:00,1 + 1,2021-01-01 13:00:00,1 + 2,2021-02-02 00:00:00,1 + 2,2021-02-02 14:00:00,1 + 3,2021-03-03 15:00:00,1 + 3,2021-03-04 00:00:00,1 + 3,2021-03-05 00:00:00,1 + 3,2021-03-05 15:30:00,1 + 4,2021-04-04 00:00:00,1 + 4,2021-04-05 00:00:00,1 + 4,2021-04-06 00:00:00,1 + 4,2021-04-06 16:00:00,1 + 5,2021-05-05 17:30:00,1 + 5,2021-05-06 00:00:00,1 + 5,2021-05-07 00:00:00,1 + 6,2021-06-06 00:00:00,1 + 6,2021-06-07 00:00:00,1 + 6,2021-06-08 00:00:00,1 + 6,2021-06-09 00:00:00,1 + """ + + # 1: interval < 1 day and times are not 00:00:00 (= two rows with start and end time) + # 2: interval < 1 day and start is 00:00:00 (= two rows with start and end time) + # 3: interval > 1 day and times are not 00:00:00 (= one row with start time, rows with time 00:00:00 in-between, and one row with end time) + # 4: interval > 1 day and start is 00:00:00 (= one row with start time 00:00:00, rows with time 00:00:00 in-between, and one row with end time) + # 5: interval > 1 day and end is 00:00:00 (= one row with start time, rows with time 00:00:00 in-between, and one row with end time 00:00:00) + # 6: interval > 1 day and both times are 00:00:00 (= one row per day, all times 00:00:00) + + df = str_to_df(df_str,
convert_str_to_float=False) + df["datotid_start"] = pd.to_datetime(df["datotid_start"]) + df["datotid_slut"] = pd.to_datetime(df["datotid_slut"]) + + expected_df = str_to_df(expected_df_str, convert_str_to_float=False) + expected_df["timestamp"] = pd.to_datetime(expected_df["timestamp"]) + + df = unpack_intervals_to_days( + df, + starttime_col="datotid_start", + endtime_col="datotid_slut", + ) + + for col in df.columns: + pd.testing.assert_series_equal(df[col], expected_df[col]) From 69b3e4df161829d5de23ec4b38db8451b0ab24ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= <40836345+signekb@users.noreply.github.com> Date: Fri, 10 Mar 2023 21:08:55 +0100 Subject: [PATCH 3/8] Update test_unpack_interval_to_days.py Change column names so they get transformed to datetime via str_to_df. --- tests/test_loaders/test_unpack_interval_to_days.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tests/test_loaders/test_unpack_interval_to_days.py b/tests/test_loaders/test_unpack_interval_to_days.py index 8aaf411c..81daf9e7 100644 --- a/tests/test_loaders/test_unpack_interval_to_days.py +++ b/tests/test_loaders/test_unpack_interval_to_days.py @@ -11,7 +11,7 @@ def test_unpack_intervals_to_days(): - df_str = """dw_ek_borger,datotid_start,datotid_slut,value + df_str = """dw_ek_borger,timestamp_start,timestamp_end,value 1,2021-01-01 12:00:00,2021-01-01 13:00:00,1.0 2,2021-02-02 00:00:00,2021-02-02 14:00:00,10.0 3,2021-03-03 15:00:00,2021-03-05 15:30:00,48.5 @@ -50,16 +50,12 @@ def test_unpack_intervals_to_days(): # 6: interval > 1 day and both times are 00:00:00 (= one row per day, all times 00:00:00) df = str_to_df(df_str, convert_str_to_float=False) - df["datotid_start"] = pd.to_datetime(df["datotid_start"]) - df["datotid_slut"] = pd.to_datetime(df["datotid_slut"]) - expected_df = str_to_df(expected_df_str, convert_str_to_float=False) - expected_df["timestamp"] = pd.to_datetime(expected_df["timestamp"]) df = 
unpack_intervals_to_days( df, - starttime_col="datotid_start", - endtime_col="datotid_slut", + starttime_col="timestamp_start", + endtime_col="timestamp_end", ) for col in df.columns: From 44f5bbfa0633fc96ea94f5d62112fa9a1ccacdf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= <40836345+signekb@users.noreply.github.com> Date: Mon, 13 Mar 2023 15:29:49 +0100 Subject: [PATCH 4/8] Update load_coercion.py with unpack_freq arg --- .../loaders/raw/load_coercion.py | 191 +++++++++++------- 1 file changed, 117 insertions(+), 74 deletions(-) diff --git a/src/psycop_feature_generation/loaders/raw/load_coercion.py b/src/psycop_feature_generation/loaders/raw/load_coercion.py index ed360e39..ac131abe 100644 --- a/src/psycop_feature_generation/loaders/raw/load_coercion.py +++ b/src/psycop_feature_generation/loaders/raw/load_coercion.py @@ -7,7 +7,7 @@ import pandas as pd from psycop_feature_generation.loaders.raw.sql_load import sql_load -from psycop_feature_generation.loaders.raw.utils import unpack_intervals_to_days +from psycop_feature_generation.loaders.raw.utils import unpack_intervals from psycop_feature_generation.utils import data_loaders @@ -16,7 +16,8 @@ def coercion_duration( coercion_type: Optional[str] = None, reason_for_coercion: Optional[str] = None, n_rows: Optional[int] = None, - unpack_to_days: Optional[bool] = False, + unpack_to_intervals: Optional[bool] = False, + unpack_freq: Optional[str] = "D", ) -> pd.DataFrame: """Load coercion data. By default returns entire coercion data view with duration in hours as the value column. @@ -25,7 +26,8 @@ def coercion_duration( coercion_type (str): Type of coercion, e.g. 'tvangsindlæggelse', 'bæltefiksering'. Defaults to None. # noqa: DAR102 reason_for_coercion (str): Reason for coercion, e.g. 'farlighed'. Defaults to None. n_rows: Number of rows to return. Defaults to None which returns entire coercion data view. 
- unpack_to_days: if we want to create features about the "current state" of the coercion instance, we need to unpack the start and end times to one row per day in the interval. Defaults to False. + unpack_to_intervals: if we want to create features about the "current state" of the coercion instance, we need to unpack the start and end times to one row per day in the interval. Defaults to False. + unpack_freq: unpack_freq: Frequency string by which the interval will be unpacked. Default to "D" (day). For e.g., 5 hours, write "5H". Returns: pd.DataFrame @@ -34,11 +36,7 @@ def coercion_duration( coercion_discard = """('Døraflåsning', 'Personlig afskærmning over 24 timer', 'Koordinationsplan', 'Udskrivningsaftale', 'Særlige dørlåse', 'Personlige alarm- og pejlesystemer', 'Andet' )""" - - view = "[FOR_tvang_alt_hele_kohorten_inkl_2021_feb2022]" - - sql = f"SELECT dw_ek_borger, datotid_start_sei, datotid_slut_sei, varighed_timer_sei, typetekst_sei FROM [fct].{view} WHERE datotid_start_sei IS NOT NULL AND typetekst_sei NOT IN {coercion_discard}" - + sql = f"SELECT dw_ek_borger, datotid_start_sei, datotid_slut_sei, varighed_timer_sei, typetekst_sei FROM [fct].[FOR_tvang_alt_hele_kohorten_inkl_2021_feb2022] WHERE datotid_start_sei IS NOT NULL AND typetekst_sei NOT IN {coercion_discard}" if coercion_type and reason_for_coercion is None: @@ -73,9 +71,12 @@ def coercion_duration( # Change NaNs to 0 df["value"].fillna(0, inplace=True) - if unpack_to_days: - df = unpack_intervals_to_days( - df, starttime_col="datotid_start_sei", endtime_col="timestamp" + if unpack_to_intervals: + df = unpack_intervals( + df, + starttime_col="datotid_start_sei", + endtime_col="timestamp", + unpack_freq=unpack_freq, ) return df[["dw_ek_borger", "timestamp", "value"]].reset_index(drop=True) @@ -84,7 +85,8 @@ def coercion_duration( def _concatenate_coercion( coercion_types_list: list[dict[str, str]], n_rows: Optional[int] = None, - unpack_to_days: Optional[bool] = False, + unpack_to_intervals: 
Optional[bool] = False, + unpack_freq: Optional[str] = "D", ) -> pd.DataFrame: """Aggregate multiple types of coercion with multiple reasons into one column. @@ -92,7 +94,7 @@ def _concatenate_coercion( Args: coercion_types_list (list): list of dictionaries containing a 'coercion_type' key and a 'reason_for_coercion' key. If keys not in dicts, they are set to None # noqa: DAR102 n (int, optional): Number of rows to return. Defaults to None. - unpack_to_days: if we want to create features about the "current state" of the coercion instance, we need to unpack the start and end times to one row per day in the interval. Defaults to False. + unpack_interval: if we want to create features about the "current state" of the coercion instance, we need to unpack the start and end times to one row per day in the interval. Defaults to False. Returns: pd.DataFrame @@ -113,7 +115,8 @@ def _concatenate_coercion( coercion_type=d["coercion_type"], reason_for_coercion=d["reason_for_coercion"], n_rows=n_rows, - unpack_to_days=unpack_to_days, + unpack_to_intervals=unpack_to_intervals, + unpack_freq=unpack_freq, ) for d in coercion_types_list ] @@ -126,7 +129,9 @@ def _concatenate_coercion( @data_loaders.register("farlighed") def farlighed( - n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False + n_rows: Optional[int] = None, + unpack_to_intervals: Optional[bool] = False, + unpack_freq: Optional[str] = "D", ) -> pd.DataFrame: coercion_types_list = [ { @@ -140,64 +145,80 @@ def farlighed( return _concatenate_coercion( coercion_types_list=coercion_types_list, n_rows=n_rows, - unpack_to_days=unpack_to_days, + unpack_to_intervals=unpack_to_intervals, + unpack_freq=unpack_freq, ) # Røde papirer ved tvangsindlæggelse/tvangstilbageholdelse @data_loaders.register("paa_grund_af_farlighed") def paa_grund_af_farlighed( - n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False + n_rows: Optional[int] = None, + unpack_to_intervals: Optional[bool] = False, + unpack_freq: 
Optional[str] = "D", ) -> pd.DataFrame: return coercion_duration( reason_for_coercion="På grund af farlighed", n_rows=n_rows, - unpack_to_days=unpack_to_days, + unpack_to_intervals=unpack_to_intervals, + unpack_freq=unpack_freq, ) # Gule papirer ved tvangsindlæggelse/tvangstilbageholdelse @data_loaders.register("af_helbredsmaessige_grunde") def af_helbredsmaessige_grunde( - n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False + n_rows: Optional[int] = None, + unpack_to_intervals: Optional[bool] = False, + unpack_freq: Optional[str] = "D", ) -> pd.DataFrame: return coercion_duration( reason_for_coercion=" Af helbredsmæssige grunde", n_rows=n_rows, - unpack_to_days=unpack_to_days, + unpack_to_intervals=unpack_to_intervals, + unpack_freq=unpack_freq, ) @data_loaders.register("urolig_tilstand") def urolig_tilstand( - n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False + n_rows: Optional[int] = None, + unpack_to_intervals: Optional[bool] = False, + unpack_freq: Optional[str] = "D", ) -> pd.DataFrame: return coercion_duration( reason_for_coercion="Urolig tilstand", n_rows=n_rows, - unpack_to_days=unpack_to_days, + unpack_to_intervals=unpack_to_intervals, + unpack_freq=unpack_freq, ) @data_loaders.register("anden_begrundelse") def anden_begrundelse( - n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False + n_rows: Optional[int] = None, + unpack_to_intervals: Optional[bool] = False, + unpack_freq: Optional[str] = "D", ) -> pd.DataFrame: return coercion_duration( reason_for_coercion="Anden begrundelse", n_rows=n_rows, - unpack_to_days=unpack_to_days, + unpack_to_intervals=unpack_to_intervals, + unpack_freq=unpack_freq, ) @data_loaders.register("naerliggende_eller_vaesentlig_fare_for_patienten_eller_andre") def naerliggende_fare( - n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False + n_rows: Optional[int] = None, + unpack_to_intervals: Optional[bool] = False, + unpack_freq: Optional[str] = "D", ) -> 
pd.DataFrame: return coercion_duration( reason_for_coercion="Nærliggende_eller_væsentlig_fare_for_patienten_eller_andre", n_rows=n_rows, - unpack_to_days=unpack_to_days, + unpack_to_intervals=unpack_to_intervals, + unpack_freq=unpack_freq, ) @@ -205,7 +226,9 @@ def naerliggende_fare( # frihedsberøvelser @data_loaders.register("skema_1") def skema_1( - n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False + n_rows: Optional[int] = None, + unpack_to_intervals: Optional[bool] = False, + unpack_freq: Optional[str] = "D", ) -> pd.DataFrame: coercion_types_list = [ { @@ -219,14 +242,17 @@ def skema_1( return _concatenate_coercion( coercion_types_list=coercion_types_list, n_rows=n_rows, - unpack_to_days=unpack_to_days, + unpack_to_intervals=unpack_to_intervals, + unpack_freq=unpack_freq, ) # tvangsbehandlinger @data_loaders.register("skema_2") def skema_2( - n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False + n_rows: Optional[int] = None, + unpack_to_intervals: Optional[bool] = False, + unpack_freq: Optional[str] = "D", ) -> pd.DataFrame: coercion_types_list = [ { @@ -246,34 +272,17 @@ def skema_2( return _concatenate_coercion( coercion_types_list=coercion_types_list, n_rows=n_rows, - unpack_to_days=unpack_to_days, - ) - - -@data_loaders.register("skema_2_without_nutrition") -def skema_2_without_nutrition(n_rows: Optional[int] = None) -> pd.DataFrame: - coercion_types_list = [ - { - "coercion_type": "Af legemlig lidelse", - }, - { - "coercion_type": "Medicinering", - }, - { - "coercion_type": "ECT", - }, - ] - - return _concatenate_coercion( - coercion_types_list=coercion_types_list, - n_rows=n_rows, + unpack_to_intervals=unpack_to_intervals, + unpack_freq=unpack_freq, ) # magtanvendelse @data_loaders.register("skema_3") def skema_3( - n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False + n_rows: Optional[int] = None, + unpack_to_intervals: Optional[bool] = False, + unpack_freq: Optional[str] = "D", ) -> pd.DataFrame: 
coercion_types_list = [ { @@ -298,7 +307,8 @@ def skema_3( return _concatenate_coercion( coercion_types_list=coercion_types_list, n_rows=n_rows, - unpack_to_days=unpack_to_days, + unpack_to_intervals=unpack_to_intervals, + unpack_freq=unpack_freq, ) @@ -308,120 +318,153 @@ def skema_3( @data_loaders.register("baelte") def baelte( - n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False + n_rows: Optional[int] = None, + unpack_to_intervals: Optional[bool] = False, + unpack_freq: Optional[str] = "D", ) -> pd.DataFrame: return coercion_duration( coercion_type="Bælte", n_rows=n_rows, - unpack_to_days=unpack_to_days, + unpack_to_intervals=unpack_to_intervals, + unpack_freq=unpack_freq, ) @data_loaders.register("remme") def remme( - n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False + n_rows: Optional[int] = None, + unpack_to_intervals: Optional[bool] = False, + unpack_freq: Optional[str] = "D", ) -> pd.DataFrame: return coercion_duration( coercion_type="Remme", n_rows=n_rows, - unpack_to_days=unpack_to_days, + unpack_to_intervals=unpack_to_intervals, + unpack_freq=unpack_freq, ) @data_loaders.register("fastholden") def fastholden( - n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False + n_rows: Optional[int] = None, + unpack_to_intervals: Optional[bool] = False, + unpack_freq: Optional[str] = "D", ) -> pd.DataFrame: return coercion_duration( coercion_type="Fastholden", n_rows=n_rows, - unpack_to_days=unpack_to_days, + unpack_to_intervals=unpack_to_intervals, + unpack_freq=unpack_freq, ) @data_loaders.register("beroligende_medicin") def beroligende_medicin( - n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False + n_rows: Optional[int] = None, + unpack_to_intervals: Optional[bool] = False, + unpack_freq: Optional[str] = "D", ) -> pd.DataFrame: return coercion_duration( coercion_type="Beroligende medicin", n_rows=n_rows, - unpack_to_days=unpack_to_days, + unpack_to_intervals=unpack_to_intervals, + 
unpack_freq=unpack_freq, ) @data_loaders.register("handsker") def handsker( - n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False + n_rows: Optional[int] = None, + unpack_to_intervals: Optional[bool] = False, + unpack_freq: Optional[str] = "D", ) -> pd.DataFrame: return coercion_duration( coercion_type="Handsker", n_rows=n_rows, - unpack_to_days=unpack_to_days, + unpack_to_intervals=unpack_to_intervals, + unpack_freq=unpack_freq, ) @data_loaders.register("tvangsindlaeggelse") def tvangsindlaeggelse( - n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False + n_rows: Optional[int] = None, + unpack_to_intervals: Optional[bool] = False, + unpack_freq: Optional[str] = "D", ) -> pd.DataFrame: return coercion_duration( coercion_type="Tvangsindlæggelse", n_rows=n_rows, - unpack_to_days=unpack_to_days, + unpack_to_intervals=unpack_to_intervals, + unpack_freq=unpack_freq, ) @data_loaders.register("tvangstilbageholdelse") def tvangstilbageholdelse( - n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False + n_rows: Optional[int] = None, + unpack_to_intervals: Optional[bool] = False, + unpack_freq: Optional[str] = "D", ) -> pd.DataFrame: return coercion_duration( coercion_type="Tvangstilbageholdelse", n_rows=n_rows, - unpack_to_days=unpack_to_days, + unpack_to_intervals=unpack_to_intervals, + unpack_freq=unpack_freq, ) @data_loaders.register("medicinering") def medicinering( - n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False + n_rows: Optional[int] = None, + unpack_to_intervals: Optional[bool] = False, + unpack_freq: Optional[str] = "D", ) -> pd.DataFrame: return coercion_duration( coercion_type="Medicinering", n_rows=n_rows, - unpack_to_days=unpack_to_days, + unpack_to_intervals=unpack_to_intervals, + unpack_freq=unpack_freq, ) @data_loaders.register("ect") def ect( - n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False + n_rows: Optional[int] = None, + unpack_to_intervals: Optional[bool] = False, 
+ unpack_freq: Optional[str] = "D", ) -> pd.DataFrame: return coercion_duration( coercion_type="ECT", n_rows=n_rows, - unpack_to_days=unpack_to_days, + unpack_to_intervals=unpack_to_intervals, + unpack_freq=unpack_freq, ) @data_loaders.register("ernaering") def ernaering( - n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False + n_rows: Optional[int] = None, + unpack_to_intervals: Optional[bool] = False, + unpack_freq: Optional[str] = "D", ) -> pd.DataFrame: return coercion_duration( coercion_type="Ernæring", n_rows=n_rows, - unpack_to_days=unpack_to_days, + unpack_to_intervals=unpack_to_intervals, + unpack_freq=unpack_freq, ) @data_loaders.register("af_legemlig_lidelse") def af_legemlig_lidelse( - n_rows: Optional[int] = None, unpack_to_days: Optional[bool] = False + n_rows: Optional[int] = None, + unpack_to_intervals: Optional[bool] = False, + unpack_freq: Optional[str] = "D", ) -> pd.DataFrame: return coercion_duration( coercion_type="Af legemlig lidelse", n_rows=n_rows, - unpack_to_days=unpack_to_days, + unpack_to_intervals=unpack_to_intervals, + unpack_freq=unpack_freq, ) From b6bcce93f97cd9269eab29b0a942696cc1dc4a89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= <40836345+signekb@users.noreply.github.com> Date: Mon, 13 Mar 2023 15:33:10 +0100 Subject: [PATCH 5/8] Update args doc strings in load_coercion.py --- src/psycop_feature_generation/loaders/raw/load_coercion.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/psycop_feature_generation/loaders/raw/load_coercion.py b/src/psycop_feature_generation/loaders/raw/load_coercion.py index ac131abe..72f25d05 100644 --- a/src/psycop_feature_generation/loaders/raw/load_coercion.py +++ b/src/psycop_feature_generation/loaders/raw/load_coercion.py @@ -26,7 +26,7 @@ def coercion_duration( coercion_type (str): Type of coercion, e.g. 'tvangsindlæggelse', 'bæltefiksering'. Defaults to None. 
# noqa: DAR102 reason_for_coercion (str): Reason for coercion, e.g. 'farlighed'. Defaults to None. n_rows: Number of rows to return. Defaults to None which returns entire coercion data view. - unpack_to_intervals: if we want to create features about the "current state" of the coercion instance, we need to unpack the start and end times to one row per day in the interval. Defaults to False. + unpack_to_intervals: Unpack time interval to rows with set frequency (see below). Defaults to False. unpack_freq: unpack_freq: Frequency string by which the interval will be unpacked. Default to "D" (day). For e.g., 5 hours, write "5H". Returns: @@ -94,7 +94,8 @@ def _concatenate_coercion( Args: coercion_types_list (list): list of dictionaries containing a 'coercion_type' key and a 'reason_for_coercion' key. If keys not in dicts, they are set to None # noqa: DAR102 n (int, optional): Number of rows to return. Defaults to None. - unpack_interval: if we want to create features about the "current state" of the coercion instance, we need to unpack the start and end times to one row per day in the interval. Defaults to False. + unpack_to_intervals: Unpack time interval to rows with set frequency (see below). Defaults to False. + unpack_freq: Frequency string by which the interval will be unpacked. Defaults to "D" (day). E.g., for 5 hours, write "5H".
Returns: pd.DataFrame From dc14ddf833e29cf8619b37930c0946a3d6279a16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= <40836345+signekb@users.noreply.github.com> Date: Mon, 13 Mar 2023 15:35:22 +0100 Subject: [PATCH 6/8] Update unpack_intervals function with freq arg --- .../loaders/raw/utils.py | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/src/psycop_feature_generation/loaders/raw/utils.py b/src/psycop_feature_generation/loaders/raw/utils.py index 47b4c264..6209edd9 100644 --- a/src/psycop_feature_generation/loaders/raw/utils.py +++ b/src/psycop_feature_generation/loaders/raw/utils.py @@ -282,10 +282,11 @@ def load_from_codes( ) -def unpack_intervals_to_days( +def unpack_intervals( df: pd.DataFrame, starttime_col: str = "datotid_start_sei", endtime_col: str = "timestamp", + unpack_freq: str = "D", ) -> pd.DataFrame: """Transform df with starttime_col and endtime_col to day grain (one row per day in the interval starttime_col-endtime_col). First and last day will have the specific start and end time, while days inbetween will be 00:00:00. @@ -293,23 +294,24 @@ def unpack_intervals_to_days( Args: df (pd.DataFrame): dataframe with time interval in separate columns. starttime_col (str, optional): Name of column with start time. Defaults to "datotid_start_sei". - endtime_col (str, optional): Name of column with end time. Defaults to "datotid_slut_slut". + endtime_col (str, optional): Name of column with end time. Defaults to "timestamp". + unpack_freq: Frequency string by which the interval will be unpacked. Defaults to "D" (day). E.g., for 5 hours, write "5H". Returns: pd.DataFrame: Dataframe with time interval unpacked to day grain.
""" - # create rows with start and end time - df_start_rows, df_end_rows = df.copy(), df.copy() - df_start_rows["date_range"] = df_start_rows[f"{starttime_col}"] + # create rows with end time + df_end_rows = df.copy() df_end_rows["date_range"] = df_end_rows[f"{endtime_col}"] # create a date range column between start date and end date for each visit/admission/coercion instance df["date_range"] = df.apply( lambda x: pd.date_range( - start=x[f"{starttime_col}"].date() + pd.DateOffset(1), - end=x[f"{endtime_col}"].date(), + start=x[f"{starttime_col}"], + end=x[f"{endtime_col}"], + freq=unpack_freq, ), axis=1, ) @@ -317,21 +319,18 @@ def unpack_intervals_to_days( # explode the date range column to create a new row for each date in the range df = df.explode("date_range") - # remove na's (produced when start date = end date) - df = df[df["date_range"].notnull()] - # concat df with start and end time rows - df = pd.concat([df, df_start_rows, df_end_rows], ignore_index=True).sort_values( + df = pd.concat([df, df_end_rows], ignore_index=True).sort_values( ["dw_ek_borger", f"{starttime_col}", "date_range"] ) - # drop duplicates (when start or end time = 00:00:00) + # drop duplicates (when start and/or end time = 00:00:00) df = df.drop_duplicates(keep="first") # reset index df = df.reset_index(drop=True) - # set value to 1 (duration has lost meaning now, since duration are repeated multiple times per visit/admission/coercion instance now) + # set value to 1 (duration has lost meaning now, since durations are repeated on multiple rows per coercion instance now) df["value"] = 1 # only keep relevant columns and rename date_range to timestamp From 6b09aa7119bfc22cd6ea98fbfbc443e215824351 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= <40836345+signekb@users.noreply.github.com> Date: Mon, 13 Mar 2023 15:36:48 +0100 Subject: [PATCH 7/8] Update and rename test_unpack_interval_to_days.py to test_unpack_intervals.py --- .../test_unpack_interval_to_days.py 
| 62 -------------- tests/test_loaders/test_unpack_intervals.py | 84 +++++++++++++++++++ 2 files changed, 84 insertions(+), 62 deletions(-) delete mode 100644 tests/test_loaders/test_unpack_interval_to_days.py create mode 100644 tests/test_loaders/test_unpack_intervals.py diff --git a/tests/test_loaders/test_unpack_interval_to_days.py b/tests/test_loaders/test_unpack_interval_to_days.py deleted file mode 100644 index 81daf9e7..00000000 --- a/tests/test_loaders/test_unpack_interval_to_days.py +++ /dev/null @@ -1,62 +0,0 @@ -"""Test of unpack intervals to days""" - -import pandas as pd - -from psycop_feature_generation.loaders.raw.utils import ( - unpack_intervals_to_days, -) -from psycop_feature_generation.utils_for_testing import ( - str_to_df, -) - - -def test_unpack_intervals_to_days(): - df_str = """dw_ek_borger,timestamp_start,timestamp_end,value - 1,2021-01-01 12:00:00,2021-01-01 13:00:00,1.0 - 2,2021-02-02 00:00:00,2021-02-02 14:00:00,10.0 - 3,2021-03-03 15:00:00,2021-03-05 15:30:00,48.5 - 4,2021-04-04 00:00:00,2021-04-06 16:00:00,16.0 - 5,2021-05-05 17:30:00,2021-05-07 00:00:00,30.5 - 6,2021-06-06 00:00:00,2021-06-09 00:00:00,72.0 - """ - - expected_df_str = """dw_ek_borger,timestamp,value - 1,2021-01-01 12:00:00,1 - 1,2021-01-01 13:00:00,1 - 2,2021-02-02 00:00:00,1 - 2,2021-02-02 14:00:00,1 - 3,2021-03-03 15:00:00,1 - 3,2021-03-04 00:00:00,1 - 3,2021-03-05 00:00:00,1 - 3,2021-03-05 15:30:00,1 - 4,2021-04-04 00:00:00,1 - 4,2021-04-05 00:00:00,1 - 4,2021-04-06 00:00:00,1 - 4,2021-04-06 16:00:00,1 - 5,2021-05-05 17:30:00,1 - 5,2021-05-06 00:00:00,1 - 5,2021-05-07 00:00:00,1 - 6,2021-06-06 00:00:00,1 - 6,2021-06-07 00:00:00,1 - 6,2021-06-08 00:00:00,1 - 6,2021-06-09 00:00:00,1 - """ - - # 1: interval < 1 day and times are not 00:00:00 (= two rows with start and end time) - # 2: interval < 1 day and start is 00:00:00 (= two rows with start and end time) - # 3: interval > 1 day and times are not 00:00:00 (= one row with start time, rows with time 00:00:00 
in-between, and one row with end time) - # 4: interval > 1 day and start is 00:00:00 (= one row with start time 00:00:00, rows with time 00:00:00 in-between, and one row with end time) - # 5: interval > 1 day and end is 00:00:00 (= one row with start time 00:00:00, rows with time 00:00:00 in-between, and one row with end time 00:00:00) - # 6: interval > 1 day and both times are 00:00:00 (= one row per day, all times 00:00:00) - - df = str_to_df(df_str, convert_str_to_float=False) - expected_df = str_to_df(expected_df_str, convert_str_to_float=False) - - df = unpack_intervals_to_days( - df, - starttime_col="timestamp_start", - endtime_col="timestamp_end", - ) - - for col in df.columns: - pd.testing.assert_series_equal(df[col], expected_df[col]) diff --git a/tests/test_loaders/test_unpack_intervals.py b/tests/test_loaders/test_unpack_intervals.py new file mode 100644 index 00000000..802d41f2 --- /dev/null +++ b/tests/test_loaders/test_unpack_intervals.py @@ -0,0 +1,84 @@ +"""Tests of unpack_intervals""" + +import pandas as pd + +from psycop_feature_generation.loaders.raw.utils import ( + unpack_intervals, +) +from psycop_feature_generation.utils_for_testing import ( + str_to_df, +) + + +def test_unpack_intervals_to_days(): + df_str = """dw_ek_borger,timestamp_start,timestamp_end,value + 1,2021-01-01 12:00:00,2021-01-01 13:00:00,1.0 + 2,2021-03-03 15:00:00,2021-03-05 15:30:00,48.5 + 3,2021-06-06 00:00:00,2021-06-09 00:00:00,72.0 + """ + + expected_df_str = """dw_ek_borger,timestamp,value + 1,2021-01-01 12:00:00,1 + 1,2021-01-01 13:00:00,1 + 2,2021-03-03 15:00:00,1 + 2,2021-03-04 15:00:00,1 + 2,2021-03-05 15:00:00,1 + 2,2021-03-05 15:30:00,1 + 3,2021-06-06 00:00:00,1 + 3,2021-06-07 00:00:00,1 + 3,2021-06-08 00:00:00,1 + 3,2021-06-09 00:00:00,1 + """ + + # 1: interval < 1 day (= two rows, one with start time and one with end time) + # 2: interval > 1 day and times are not 00:00:00 (= one row with start time, one row per day in-between with timestamp same as start time, 
and one row with end time) + # 3: interval > 1 day and both times are 00:00:00 (= one row per day, including start and end day, all times 00:00:00) + + df = str_to_df(df_str, convert_str_to_float=False) + expected_df = str_to_df(expected_df_str, convert_str_to_float=False) + + df = unpack_intervals( + df, + starttime_col="timestamp_start", + endtime_col="timestamp_end", + unpack_freq="D", + ) + + for col in df.columns: + pd.testing.assert_series_equal(df[col], expected_df[col]) + + +def test_unpack_intervals_to_5Hfreq(): + df_str = """dw_ek_borger,timestamp_start,timestamp_end,value + 1,2021-01-01 12:00:00,2021-01-01 13:00:00,1.0 + 2,2021-02-02 15:00:00,2021-02-02 20:00:00,5.0 + 3,2021-03-04 16:00:00,2021-03-05 4:00:00,12.0 + """ + + expected_df_str = """dw_ek_borger,timestamp,value + 1,2021-01-01 12:00:00,1 + 1,2021-01-01 13:00:00,1 + 2,2021-02-02 15:00:00,1 + 2,2021-02-02 20:00:00,1 + 3,2021-03-04 16:00:00,1 + 3,2021-03-04 21:00:00,1 + 3,2021-03-05 02:00:00,1 + 3,2021-03-05 4:00:00,1 + """ + + # 1: interval < 5 hours + # 2: interval = 5 hours + # 3: interval > 5 hours + + df = str_to_df(df_str, convert_str_to_float=False) + expected_df = str_to_df(expected_df_str, convert_str_to_float=False) + + df = unpack_intervals( + df, + starttime_col="timestamp_start", + endtime_col="timestamp_end", + unpack_freq="5H", + ) + + for col in df.columns: + pd.testing.assert_series_equal(df[col], expected_df[col]) From 1cd3b7f9d27165287e52c4262846391b2bf14935 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Signe=20Kirk=20Br=C3=B8db=C3=A6k?= <40836345+signekb@users.noreply.github.com> Date: Mon, 13 Mar 2023 15:49:58 +0100 Subject: [PATCH 8/8] update unpack_intervals to include entity_id arg --- src/psycop_feature_generation/loaders/raw/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/psycop_feature_generation/loaders/raw/utils.py b/src/psycop_feature_generation/loaders/raw/utils.py index 6209edd9..1c99e4a1 100644 ---
a/src/psycop_feature_generation/loaders/raw/utils.py +++ b/src/psycop_feature_generation/loaders/raw/utils.py @@ -286,6 +286,7 @@ def unpack_intervals( df: pd.DataFrame, starttime_col: str = "datotid_start_sei", endtime_col: str = "timestamp", + entity_id: str = "dw_ek_borger", unpack_freq: str = "D", ) -> pd.DataFrame: """Transform df with starttime_col and endtime_col to day grain (one row per day in the interval starttime_col-endtime_col). @@ -321,7 +322,7 @@ def unpack_intervals( # concat df with start and end time rows df = pd.concat([df, df_end_rows], ignore_index=True).sort_values( - ["dw_ek_borger", f"{starttime_col}", "date_range"] + [f"{entity_id}", f"{starttime_col}", "date_range"] ) # drop duplicates (when start and/or end time = 00:00:00) @@ -334,7 +335,7 @@ def unpack_intervals( df["value"] = 1 # only keep relevant columns and rename date_range to timestamp - df = df[["dw_ek_borger", "date_range", "value"]].rename( + df = df[[f"{entity_id}", "date_range", "value"]].rename( columns={"date_range": "timestamp"} )