From 3deb052d7373ba8a4ebef265f8364101555bfd72 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 15 Dec 2022 09:51:11 +0100 Subject: [PATCH 1/8] feat: allow filtering based on quarantine dates --- .../filter_prediction_times.py | 75 +++++++++++++++++++ .../utils_for_testing.py | 9 +++ .../test_filter_prediction_times.py | 55 ++++++++++++++ 3 files changed, 139 insertions(+) create mode 100644 src/psycop_feature_generation/application_modules/filter_prediction_times.py create mode 100644 tests/test_application_modules/test_filter_prediction_times.py diff --git a/src/psycop_feature_generation/application_modules/filter_prediction_times.py b/src/psycop_feature_generation/application_modules/filter_prediction_times.py new file mode 100644 index 00000000..a8c28626 --- /dev/null +++ b/src/psycop_feature_generation/application_modules/filter_prediction_times.py @@ -0,0 +1,75 @@ +"""Class for filtering prediction times before they are used for feature generation.""" +from typing import Optional + +import pandas as pd + + +class PredictionTimeFilterer: + def __init__( + self, + prediction_time_df: pd.DataFrame, + id_col_name: str, + quarantine_df: Optional[pd.DataFrame] = None, + quarantine_days: Optional[int] = None, + ): + self.prediction_time_df = prediction_time_df + self.quarantine_df = quarantine_df + self.quarantine_days = quarantine_days + self.id_col_name = id_col_name + + def _filter_prediction_times_by_quarantine_period(self): + # We need to check if ANY quarantine date hits each prediction time. + # Create combinations + df = self.prediction_time_df.merge( + self.quarantine_df, + on=self.id_col_name, + how="left", + suffixes=("_pred", "_quarantine"), + ) + + df["days_since_quarantine"] = ( + df["timestamp_pred"] - df["timestamp_quarantine"] + ).dt.days + + # Check if the prediction time is hit by the quarantine date. + df.loc[ + (df["days_since_quarantine"] < self.quarantine_days) + & (df["days_since_quarantine"] > 0), + "hit_by_quarantine", + ] = True + + # If any of the combinations for a UUID is hit by a quarantine date, drop it. + df["hit_by_quarantine"] = df.groupby("pred_time_uuid")[ + "hit_by_quarantine" + ].transform("max") + + df = df.loc[df["hit_by_quarantine"] != True] + + df = df.drop_duplicates(subset="pred_time_uuid") + + # Drop the columns we added + df = df.drop( + columns=[ + "days_since_quarantine", + "hit_by_quarantine", + "timestamp_quarantine", + ] + ) + + # Rename the timestamp column + df = df.rename(columns={"timestamp_pred": "timestamp"}) + + return df + + def filter(self): + df = self.prediction_time_df + + if self.quarantine_df is not None or self.quarantine_days is not None: + if self.quarantine_days is None or self.quarantine_days is None: + raise ValueError( + "If either of quarantine_df and quarantine_days are provided, both must be provided." + ) + + df = self._filter_prediction_times_by_quarantine_period() + + return df diff --git a/src/psycop_feature_generation/utils_for_testing.py b/src/psycop_feature_generation/utils_for_testing.py index b596ad67..0d784fba 100644 --- a/src/psycop_feature_generation/utils_for_testing.py +++ b/src/psycop_feature_generation/utils_for_testing.py @@ -39,6 +39,8 @@ def str_to_df( convert_timestamp_to_datetime: bool = True, convert_np_nan_to_nan: bool = True, convert_str_to_float: bool = False, + add_pred_time_uuid: bool = False, + entity_id_colname: str = "entity_id", ) -> DataFrame: """Convert a string representation of a dataframe to a dataframe. @@ -51,6 +53,8 @@ def str_to_df( Returns: DataFrame: A dataframe. """ + # Drop anything after the last comma + string = string[: string.rfind(",") + 1] df = pd.read_table(StringIO(string), sep=",", index_col=False) @@ -65,6 +69,11 @@ def str_to_df( # Convert all str to float df = df.apply(pd.to_numeric, axis=0, errors="coerce") + if add_pred_time_uuid: + df["pred_time_uuid"] = ( + df[entity_id_colname].astype(str) + "_" + df["timestamp"].astype(str) + ) + # Drop "Unnamed" cols return df.loc[:, ~df.columns.str.contains("^Unnamed")] diff --git a/tests/test_application_modules/test_filter_prediction_times.py b/tests/test_application_modules/test_filter_prediction_times.py new file mode 100644 index 00000000..9932aed5 --- /dev/null +++ b/tests/test_application_modules/test_filter_prediction_times.py @@ -0,0 +1,55 @@ +import pandas as pd + +from psycop_feature_generation.application_modules.filter_prediction_times import ( + PredictionTimeFilterer, +) +from psycop_feature_generation.utils_for_testing import str_to_df + + +def test_filter_by_quarantine_date(): + """Test filtering by quarantine date. + + Should filter if the prediction times is within X days after the quarantine date. + + Uses dataframes as input. + """ + quarantine_df = str_to_df( + """entity_id,timestamp, + 1,2021-01-01 00:00:01, + 1,2022-01-01 00:00:01, + """, + ) + + prediction_time_df = str_to_df( + """entity_id,timestamp, + 1,2020-12-01 00:00:01, # keep: before quarantine date + 1,2022-12-01 00:00:01, # drop: after quarantine date + 1,2026-02-01 00:00:01, # keep: outside quarantine days + 2,2023-02-01 00:00:01, # keep: no quarantine date for this id + """, + add_pred_time_uuid=True, + ) + + expected_df = str_to_df( + """entity_id,timestamp, + 1,2020-12-01 00:00:01, + 1,2026-02-01 00:00:01, + 2,2023-02-01 00:00:01, + """, + add_pred_time_uuid=True, + ) + + filterer = PredictionTimeFilterer( + prediction_time_df=prediction_time_df, + quarantine_df=quarantine_df, + quarantine_days=730, + id_col_name="entity_id", + ) + + result_df = filterer.filter() + + # Check that the result is as expected using pandas.testing.assert_frame_equal + pd.testing.assert_frame_equal( + result_df.reset_index(drop=True), + expected_df.reset_index(drop=True), + ) From c3c2b6825f871e2c431352bc2c5f1413702b9f21 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 15 Dec 2022 09:52:59 +0100 Subject: [PATCH 2/8] style: linting --- .../filter_prediction_times.py | 15 +++++++++++---- .../utils_for_testing.py | 2 ++ .../test_filter_prediction_times.py | 3 ++- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/psycop_feature_generation/application_modules/filter_prediction_times.py b/src/psycop_feature_generation/application_modules/filter_prediction_times.py index a8c28626..4704f562 100644 --- a/src/psycop_feature_generation/application_modules/filter_prediction_times.py +++ b/src/psycop_feature_generation/application_modules/filter_prediction_times.py @@ -1,10 +1,14 @@ -"""Class for filtering prediction times before they are used for feature generation.""" +"""Class for filtering prediction times before they are used for feature +generation.""" from typing import Optional import pandas as pd class PredictionTimeFilterer: + """Class for filtering prediction times before they are used for + feature.""" + def __init__( self, prediction_time_df: pd.DataFrame, @@ -43,7 +47,9 @@ def _filter_prediction_times_by_quarantine_period(self): "hit_by_quarantine" ].transform("max") - df = df.loc[df["hit_by_quarantine"] != True] + df = df.loc[ + df["hit_by_quarantine"] != True # pylint: disable=singleton-comparison + ] df = df.drop_duplicates(subset="pred_time_uuid") @@ -53,7 +59,7 @@ def _filter_prediction_times_by_quarantine_period(self): "days_since_quarantine", "hit_by_quarantine", "timestamp_quarantine", - ] + ], ) # Rename the timestamp column @@ -62,12 +68,13 @@ def _filter_prediction_times_by_quarantine_period(self): return df def filter(self): + """Run filters based on the provided parameters.""" df = self.prediction_time_df if self.quarantine_df is not None or self.quarantine_days is not None: if self.quarantine_days is None or self.quarantine_days is None: raise ValueError( - "If either of quarantine_df and quarantine_days are provided, both must be provided." + "If either of quarantine_df and quarantine_days are provided, both must be provided.", ) df = self._filter_prediction_times_by_quarantine_period() diff --git a/src/psycop_feature_generation/utils_for_testing.py b/src/psycop_feature_generation/utils_for_testing.py index 0d784fba..604cd47a 100644 --- a/src/psycop_feature_generation/utils_for_testing.py +++ b/src/psycop_feature_generation/utils_for_testing.py @@ -49,6 +49,8 @@ def str_to_df( convert_timestamp_to_datetime (bool): Whether to convert the timestamp column to datetime. Defaults to True. convert_np_nan_to_nan (bool): Whether to convert np.nan to np.nan. Defaults to True. convert_str_to_float (bool): Whether to convert strings to floats. Defaults to False. + add_pred_time_uuid (bool): Whether to add a pred_time_uuid column. Defaults to False. + entity_id_colname (str): The name of the entity_id column. Defaults to "entity_id". Returns: DataFrame: A dataframe. diff --git a/tests/test_application_modules/test_filter_prediction_times.py b/tests/test_application_modules/test_filter_prediction_times.py index 9932aed5..5343525c 100644 --- a/tests/test_application_modules/test_filter_prediction_times.py +++ b/tests/test_application_modules/test_filter_prediction_times.py @@ -1,3 +1,4 @@ +"""Tests of the filter_prediction_times module.""" import pandas as pd from psycop_feature_generation.application_modules.filter_prediction_times import ( @@ -22,7 +23,7 @@ def test_filter_by_quarantine_date(): prediction_time_df = str_to_df( """entity_id,timestamp, - 1,2020-12-01 00:00:01, # keep: before quarantine date + 1,2020-12-01 00:00:01, # keep: before quarantine date 1,2022-12-01 00:00:01, # drop: after quarantine date 1,2026-02-01 00:00:01, # keep: outside quarantine days 2,2023-02-01 00:00:01, # keep: no quarantine date for this id From 62e8baeb7ab496ce28fc9a05bb75ed23bc43d1ad Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 15 Dec 2022 09:58:00 +0100 Subject: [PATCH 3/8] ci: bump cache version to rebuild virtualenv --- .github/workflows/main_test_and_release.yml | 40 ++++++++++----------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/workflows/main_test_and_release.yml b/.github/workflows/main_test_and_release.yml index 5edc5436..8c658bdc 100644 --- a/.github/workflows/main_test_and_release.yml +++ b/.github/workflows/main_test_and_release.yml @@ -9,10 +9,10 @@ on: pull_request: push: branches: - - main + - main env: - cache-version: 0.0.4 + cache-version: 0.0.5 poetry-version: 1.1.15 python-version: 3.9 # Change this number if you want to manually invalidate all caches @@ -26,7 +26,7 @@ jobs: # This allows a subsequently queued workflow run to interrupt previous runs concurrency: - group: '${{ github.workflow }} - ${{ matrix.os }} @ ${{ github.ref }}' + group: "${{ github.workflow }} - ${{ matrix.os }} @ ${{ github.ref }}" cancel-in-progress: true steps: @@ -46,21 +46,21 @@ jobs: needs: test if: ${{ github.ref == 'refs/heads/main' }} steps: - # Checkout action is required for token to persist - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - token: ${{ secrets.RELEASE_BOT }} + # Checkout action is required for token to persist + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + token: ${{ secrets.RELEASE_BOT }} - - name: Python Semantic Release - uses: relekang/python-semantic-release@v7.32.0 - with: - github_token: ${{ secrets.RELEASE_BOT }} - # Remember to copy the tool.semantic_release section from pyproject.toml - # as well - # To enable pypi, - # 1) Set upload_to_pypi to true in pyproject.toml and - # 2) Set the pypi_token in the repo - # 3) Uncomment the two lines below - repository_username: __token__ - repository_password: ${{ secrets.PYPI_TOKEN }} \ No newline at end of file + - name: Python Semantic Release + uses: relekang/python-semantic-release@v7.32.0 + with: + github_token: ${{ secrets.RELEASE_BOT }} + # Remember to copy the tool.semantic_release section from pyproject.toml + # as well + # To enable pypi, + # 1) Set upload_to_pypi to true in pyproject.toml and + # 2) Set the pypi_token in the repo + # 3) Uncomment the two lines below + repository_username: __token__ + repository_password: ${{ secrets.PYPI_TOKEN }} From 9f47e276f21378a4d19975b854798cf0949bbf4b Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 15 Dec 2022 11:08:17 +0100 Subject: [PATCH 4/8] init: first stab at adding to application module --- .../application_modules/flatten_dataset.py | 49 +++++++++++++++++-- 1 file changed, 45 insertions(+), 4 deletions(-) diff --git a/src/psycop_feature_generation/application_modules/flatten_dataset.py b/src/psycop_feature_generation/application_modules/flatten_dataset.py index 64012a5a..6d84911d 100644 --- a/src/psycop_feature_generation/application_modules/flatten_dataset.py +++ b/src/psycop_feature_generation/application_modules/flatten_dataset.py @@ -1,15 +1,46 @@ """Flatten the dataset.""" import pandas as pd import psutil +from timeseriesflattener.feature_cache.cache_to_disk import DiskCache +from timeseriesflattener.feature_spec_objects import _AnySpec +from timeseriesflattener.flattened_dataset import TimeseriesFlattener +from psycop_feature_generation.application_modules.filter_prediction_times import ( + PredictionTimeFilterer, +) from psycop_feature_generation.application_modules.project_setup import ProjectInfo from psycop_feature_generation.application_modules.wandb_utils import ( wandb_alert_on_exception, ) from psycop_feature_generation.loaders.raw.load_demographic import birthdays -from timeseriesflattener.feature_cache.cache_to_disk import DiskCache -from timeseriesflattener.feature_spec_objects import _AnySpec -from timeseriesflattener.flattened_dataset import TimeseriesFlattener + + +def filter_prediction_times( + prediction_times_df: pd.DataFrame, + project_info: ProjectInfo, + quarantine_df=None, + quarantine_days=None, +) -> pd.DataFrame: + """Filter prediction times. + + Args: + prediction_times_df (pd.DataFrame): Prediction times dataframe. + Should contain entity_id and timestamp columns with col_names matching those in project_info.col_names. + project_info (ProjectInfo): Project info. + quarantine_df (pd.DataFrame, optional): Quarantine dataframe with "timestamp" and "project_info.col_names.id" columns. + quarantine_days (int, optional): Number of days to quarantine. + + Returns: + pd.DataFrame: Filtered prediction times dataframe. + """ + filterer = PredictionTimeFilterer( + prediction_time_df=prediction_times_df, + id_col_name=project_info.col_names.id, + quarantine_df=quarantine_df, + quarantine_days=quarantine_days, + ) + + return filterer.filter() @wandb_alert_on_exception @@ -18,6 +49,8 @@ def create_flattened_dataset( prediction_times_df: pd.DataFrame, drop_pred_times_with_insufficient_look_distance: bool, project_info: ProjectInfo, + quarantine_df=None, + quarantine_days=None, ) -> pd.DataFrame: """Create flattened dataset. @@ -28,13 +61,21 @@ def create_flattened_dataset( Should contain entity_id and timestamp columns with col_names matching those in project_info.col_names. drop_pred_times_with_insufficient_look_distance (bool): Whether to drop prediction times with insufficient look distance. See timeseriesflattener tutorial for more info. + quarantine_df (pd.DataFrame, optional): Quarantine dataframe with "timestamp" and "project_info.col_names.id" columns. + quarantine_days (int, optional): Number of days to quarantine. Returns: FlattenedDataset: Flattened dataset. """ + filtered_prediction_times_df = filter_prediction_times( + prediction_times_df=prediction_times_df, + project_info=project_info, + quarantine_df=quarantine_df, + quarantine_days=quarantine_days, + ) flattened_dataset = TimeseriesFlattener( - prediction_times_df=prediction_times_df, + prediction_times_df=filtered_prediction_times_df, n_workers=min( len(feature_specs), psutil.cpu_count(logical=True), From d7f0c6da03ee8739418304d8b7fd2955ab12ac77 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 15 Dec 2022 11:19:49 +0100 Subject: [PATCH 5/8] misc: improvements from review --- .../filter_prediction_times.py | 36 ++++++++++++------- .../application_modules/flatten_dataset.py | 8 ++--- .../utils_for_testing.py | 9 ++--- .../test_filter_prediction_times.py | 19 ++++++---- 4 files changed, 45 insertions(+), 27 deletions(-) diff --git a/src/psycop_feature_generation/application_modules/filter_prediction_times.py b/src/psycop_feature_generation/application_modules/filter_prediction_times.py index 4704f562..9b95f161 100644 --- a/src/psycop_feature_generation/application_modules/filter_prediction_times.py +++ b/src/psycop_feature_generation/application_modules/filter_prediction_times.py @@ -11,22 +11,34 @@ class PredictionTimeFilterer: def __init__( self, - prediction_time_df: pd.DataFrame, - id_col_name: str, - quarantine_df: Optional[pd.DataFrame] = None, - quarantine_days: Optional[int] = None, + prediction_times_df: pd.DataFrame, + entity_id_col_name: str, + quarantine_timestamps_df: Optional[pd.DataFrame] = None, + quarantine_interval_days: Optional[int] = None, ): - self.prediction_time_df = prediction_time_df - self.quarantine_df = quarantine_df - self.quarantine_days = quarantine_days - self.id_col_name = id_col_name + """Initialize PredictionTimeFilterer. + + Args: + prediction_times_df (pd.DataFrame): Prediction times dataframe. + Should contain entity_id and timestamp columns with col_names matching those in project_info.col_names. + quarantine_df (pd.DataFrame, optional): A dataframe with "timestamp" column from which to start the quarantine. + Any prediction times within the quarantine_interval_days after this timestamp will be dropped. + quarantine_days (int, optional): Number of days to quarantine. + entity_id_col_name (str): Name of the entity_id_col_name column. + + """ + + self.prediction_times_df = prediction_times_df + self.quarantine_df = quarantine_timestamps_df + self.quarantine_days = quarantine_interval_days + self.entity_id_col_name = entity_id_col_name def _filter_prediction_times_by_quarantine_period(self): # We need to check if ANY quarantine date hits each prediction time. # Create combinations - df = self.prediction_time_df.merge( + df = self.prediction_times_df.merge( self.quarantine_df, - on=self.id_col_name, + on=self.entity_id_col_name, how="left", suffixes=("_pred", "_quarantine"), ) @@ -69,10 +81,10 @@ def _filter_prediction_times_by_quarantine_period(self): def filter(self): """Run filters based on the provided parameters.""" - df = self.prediction_time_df + df = self.prediction_times_df if self.quarantine_df is not None or self.quarantine_days is not None: - if self.quarantine_days is None or self.quarantine_days is None: + if all([v is None for v in (self.quarantine_days, self.quarantine_df)]): raise ValueError( "If either of quarantine_df and quarantine_days are provided, both must be provided.", ) diff --git a/src/psycop_feature_generation/application_modules/flatten_dataset.py b/src/psycop_feature_generation/application_modules/flatten_dataset.py index 6d84911d..153d5b08 100644 --- a/src/psycop_feature_generation/application_modules/flatten_dataset.py +++ b/src/psycop_feature_generation/application_modules/flatten_dataset.py @@ -34,10 +34,10 @@ def filter_prediction_times( pd.DataFrame: Filtered prediction times dataframe. """ filterer = PredictionTimeFilterer( - prediction_time_df=prediction_times_df, - id_col_name=project_info.col_names.id, - quarantine_df=quarantine_df, - quarantine_days=quarantine_days, + prediction_times_df=prediction_times_df, + entity_id_col_name=project_info.col_names.id, + quarantine_timestamps_df=quarantine_df, + quarantine_interval_days=quarantine_days, ) return filterer.filter() diff --git a/src/psycop_feature_generation/utils_for_testing.py b/src/psycop_feature_generation/utils_for_testing.py index 604cd47a..fd702526 100644 --- a/src/psycop_feature_generation/utils_for_testing.py +++ b/src/psycop_feature_generation/utils_for_testing.py @@ -41,6 +41,7 @@ def str_to_df( convert_str_to_float: bool = False, add_pred_time_uuid: bool = False, entity_id_colname: str = "entity_id", + timestamp_col_name: str = "timestamp", ) -> DataFrame: """Convert a string representation of a dataframe to a dataframe. @@ -49,14 +50,14 @@ def str_to_df( convert_timestamp_to_datetime (bool): Whether to convert the timestamp column to datetime. Defaults to True. convert_np_nan_to_nan (bool): Whether to convert np.nan to np.nan. Defaults to True. convert_str_to_float (bool): Whether to convert strings to floats. Defaults to False. - add_pred_time_uuid (bool): Whether to add a pred_time_uuid column. Defaults to False. + add_pred_time_uuid (bool): Whether to infer a pred_time_uuid column from entity_id and timestamp columns. Defaults to False. entity_id_colname (str): The name of the entity_id column. Defaults to "entity_id". Returns: DataFrame: A dataframe. """ - # Drop anything after the last comma - string = string[: string.rfind(",") + 1] + # Drop comments for each line if any exist inside the str + string = string[: string.rfind("#")] df = pd.read_table(StringIO(string), sep=",", index_col=False) @@ -73,7 +74,7 @@ def str_to_df( if add_pred_time_uuid: df["pred_time_uuid"] = ( - df[entity_id_colname].astype(str) + "_" + df["timestamp"].astype(str) + df[entity_id_colname].astype(str) + "_" + df[timestamp_col_name].astype(str) ) # Drop "Unnamed" cols diff --git a/tests/test_application_modules/test_filter_prediction_times.py b/tests/test_application_modules/test_filter_prediction_times.py index 5343525c..22e2715b 100644 --- a/tests/test_application_modules/test_filter_prediction_times.py +++ b/tests/test_application_modules/test_filter_prediction_times.py @@ -7,12 +7,17 @@ from psycop_feature_generation.utils_for_testing import str_to_df -def test_filter_by_quarantine_date(): +def test_filter_by_quarantine_period(): """Test filtering by quarantine date. - Should filter if the prediction times is within X days after the quarantine date. + Should filter if the prediction times lie within quarantine_interval_days after the prediction time. - Uses dataframes as input. + E.g. for type 2 diabetes, patients are quarantined for 730 days after they return to the Central Denmark Region. + If a patient has an event in that time, they were probably incident outside of the region. + Therefore, their following prediction times should be filtered out. + + Note that this function only filters the prediction times within the quarantine period. + Filtering after the outcome is done inside TimeseriesFlattener when the OutcomeSpec has incident = True. """ quarantine_df = str_to_df( """entity_id,timestamp, @@ -41,10 +46,10 @@ def test_filter_by_quarantine_date(): ) filterer = PredictionTimeFilterer( - prediction_time_df=prediction_time_df, - quarantine_df=quarantine_df, - quarantine_days=730, - id_col_name="entity_id", + prediction_times_df=prediction_time_df, + quarantine_timestamps_df=quarantine_df, + quarantine_interval_days=730, + entity_id_col_name="entity_id", ) result_df = filterer.filter() From 33ba525c1cd3231dce080b64eb57d571a824ba3b Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 15 Dec 2022 11:24:19 +0100 Subject: [PATCH 6/8] feat: add rows dropped logging --- .../application_modules/flatten_dataset.py | 12 ++-- .../application_modules/utils.py | 60 +++++++++++++++++++ 2 files changed, 68 insertions(+), 4 deletions(-) create mode 100644 src/psycop_feature_generation/application_modules/utils.py diff --git a/src/psycop_feature_generation/application_modules/flatten_dataset.py b/src/psycop_feature_generation/application_modules/flatten_dataset.py index 153d5b08..ed5393e2 100644 --- a/src/psycop_feature_generation/application_modules/flatten_dataset.py +++ b/src/psycop_feature_generation/application_modules/flatten_dataset.py @@ -1,4 +1,6 @@ """Flatten the dataset.""" +from typing import Optional + import pandas as pd import psutil from timeseriesflattener.feature_cache.cache_to_disk import DiskCache @@ -9,17 +11,19 @@ PredictionTimeFilterer, ) from psycop_feature_generation.application_modules.project_setup import ProjectInfo +from psycop_feature_generation.application_modules.utils import print_df_dimensions_diff from psycop_feature_generation.application_modules.wandb_utils import ( wandb_alert_on_exception, ) from psycop_feature_generation.loaders.raw.load_demographic import birthdays +@print_df_dimensions_diff def filter_prediction_times( prediction_times_df: pd.DataFrame, project_info: ProjectInfo, - quarantine_df=None, - quarantine_days=None, + quarantine_df: Optional[pd.DataFrame] = None, + quarantine_days: Optional[int] = None, ) -> pd.DataFrame: """Filter prediction times. @@ -49,8 +53,8 @@ def create_flattened_dataset( prediction_times_df: pd.DataFrame, drop_pred_times_with_insufficient_look_distance: bool, project_info: ProjectInfo, - quarantine_df=None, - quarantine_days=None, + quarantine_df: Optional[pd.DataFrame] = None, + quarantine_days: Optional[int] = None, ) -> pd.DataFrame: """Create flattened dataset. diff --git a/src/psycop_feature_generation/application_modules/utils.py b/src/psycop_feature_generation/application_modules/utils.py new file mode 100644 index 00000000..889d63d7 --- /dev/null +++ b/src/psycop_feature_generation/application_modules/utils.py @@ -0,0 +1,60 @@ +import functools +import logging +from typing import Callable + +import pandas as pd + + +def print_df_dimensions_diff( + func: Callable, + print_when_starting: bool = False, + print_when_no_diff=False, +): + """Print the difference in rows between the input and output dataframes.""" + + @functools.wraps(func) + def wrapper(*args, **kwargs): + log = logging.getLogger(__name__) + + if print_when_starting: + log.info(f"{func.__name__}: Starting") + + arg_dfs = [arg for arg in args if isinstance(arg, pd.DataFrame)] + kwargs_dfs = [arg for arg in kwargs.values() if isinstance(arg, pd.DataFrame)] + potential_dfs = arg_dfs + kwargs_dfs + + if len(potential_dfs) > 1: + raise ValueError("More than one DataFrame found in args or kwargs") + + df = potential_dfs[0] + + for dim in ("rows", "columns"): + dim_int = 0 if dim == "rows" else 1 + + n_in_dim_before_func = df.shape[dim_int] + + if print_when_no_diff: + log.info( + f"{func.__name__}: {n_in_dim_before_func} {dim} before function", + ) + + result = func(*args, **kwargs) + + diff = n_in_dim_before_func - result.shape[dim_int] + + if diff != 0: + percent_diff = round( + (n_in_dim_before_func - result.shape[dim_int]) + / n_in_dim_before_func + * 100, + 2, + ) + + log.info(f"{func.__name__}: Dropped {diff} ({percent_diff}%) {dim}") + else: + if print_when_no_diff: + log.info(f"{func.__name__}: No {dim} dropped") + + return result + + return wrapper From efda4bb55211479bae963e44979e14438b7c69bb Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 15 Dec 2022 11:24:34 +0100 Subject: [PATCH 7/8] style: linting --- .../t2d/modules/specify_features.py | 83 ++++++++++--------- .../filter_prediction_times.py | 1 - .../application_modules/project_setup.py | 4 +- .../application_modules/utils.py | 1 + .../flattened/feature_describer.py | 2 +- .../loaders/raw/load_medications.py | 1 - .../loaders/raw/load_visits.py | 4 +- .../utils_for_testing.py | 1 + src/timeseriesflattener | 1 - 9 files changed, 53 insertions(+), 45 deletions(-) delete mode 160000 src/timeseriesflattener diff --git a/src/application/t2d/modules/specify_features.py b/src/application/t2d/modules/specify_features.py index 1ca1c00b..12cc1f44 100644 --- a/src/application/t2d/modules/specify_features.py +++ b/src/application/t2d/modules/specify_features.py @@ -2,8 +2,6 @@ import logging import numpy as np - -from psycop_feature_generation.application_modules.project_setup import ProjectInfo from timeseriesflattener.feature_spec_objects import ( BaseModel, OutcomeGroupSpec, @@ -14,6 +12,8 @@ _AnySpec, ) +from psycop_feature_generation.application_modules.project_setup import ProjectInfo + from .loaders.t2d_loaders import ( # noqa pylint: disable=unused-import timestamp_exclusion, ) @@ -31,7 +31,7 @@ class SpecSet(BaseModel): def get_static_predictor_specs(project_info: ProjectInfo): - """Get static predictor specs""" + """Get static predictor specs.""" return [ StaticSpec( values_loader="sex_female", @@ -42,7 +42,7 @@ def get_static_predictor_specs(project_info: ProjectInfo): def get_metadata_specs(project_info: ProjectInfo) -> list[_AnySpec]: - """Get metadata specs""" + """Get metadata specs.""" log.info("–––––––– Generating metadata specs ––––––––") return [ @@ -70,7 +70,7 @@ def get_metadata_specs(project_info: ProjectInfo) -> list[_AnySpec]: def get_outcome_specs(project_info: ProjectInfo): - """Get outcome specs""" + """Get outcome specs.""" log.info("–––––––– Generating outcome specs ––––––––") return OutcomeGroupSpec( @@ -84,39 +84,8 @@ def get_outcome_specs(project_info: ProjectInfo): ).create_combinations() -def get_temporal_predictor_specs(project_info: ProjectInfo) -> list[PredictorSpec]: - """Generate predictor spec list.""" - log.info("–––––––– Generating temporal predictor specs ––––––––") - - resolve_multiple = ["max", "min", "mean", "latest", "count"] - interval_days = [30, 90, 180, 365, 730] - allowed_nan_value_prop = [0] - - lab_results = get_lab_result_specs( - resolve_multiple, interval_days, allowed_nan_value_prop - ) - - diagnoses = get_diagnoses_specs( - resolve_multiple, interval_days, allowed_nan_value_prop - ) - - medications = get_medication_specs( - resolve_multiple, interval_days, allowed_nan_value_prop - ) - - demographics = PredictorGroupSpec( - values_loader=["weight_in_kg", "height_in_cm", "bmi"], - lookbehind_days=interval_days, - resolve_multiple_fn=["latest"], - fallback=[np.nan], - allowed_nan_value_prop=allowed_nan_value_prop, - prefix=project_info.prefix.predictor, - ).create_combinations() - - return lab_results + medications + diagnoses + demographics - - def get_medication_specs(resolve_multiple, interval_days, allowed_nan_value_prop): + """Get medication specs.""" log.info("–––––––– Generating medication specs ––––––––") psychiatric_medications = PredictorGroupSpec( @@ -158,6 +127,7 @@ def get_medication_specs(resolve_multiple, interval_days, allowed_nan_value_prop def get_diagnoses_specs(resolve_multiple, interval_days, allowed_nan_value_prop): + """Get diagnoses specs.""" log.info("–––––––– Generating diagnoses specs ––––––––") lifestyle_diagnoses = PredictorGroupSpec( @@ -197,6 +167,7 @@ def get_diagnoses_specs(resolve_multiple, interval_days, allowed_nan_value_prop) def get_lab_result_specs(resolve_multiple, interval_days, allowed_nan_value_prop): + """Get lab result specs.""" log.info("–––––––– Generating lab result specs ––––––––") general_lab_results = PredictorGroupSpec( @@ -231,6 +202,44 @@ def get_lab_result_specs(resolve_multiple, interval_days, allowed_nan_value_prop return general_lab_results + diabetes_lab_results +def get_temporal_predictor_specs(project_info: ProjectInfo) -> list[PredictorSpec]: + """Generate predictor spec list.""" + log.info("–––––––– Generating temporal predictor specs ––––––––") + + resolve_multiple = ["max", "min", "mean", "latest", "count"] + interval_days = [30, 90, 180, 365, 730] + allowed_nan_value_prop = [0] + + lab_results = get_lab_result_specs( + resolve_multiple, + interval_days, + allowed_nan_value_prop, + ) + + diagnoses = get_diagnoses_specs( + resolve_multiple, + interval_days, + allowed_nan_value_prop, + ) + + medications = get_medication_specs( + resolve_multiple, + interval_days, + allowed_nan_value_prop, + ) + + demographics = PredictorGroupSpec( + values_loader=["weight_in_kg", "height_in_cm", "bmi"], + lookbehind_days=interval_days, + resolve_multiple_fn=["latest"], + fallback=[np.nan], + allowed_nan_value_prop=allowed_nan_value_prop, + prefix=project_info.prefix.predictor, + ).create_combinations() + + return lab_results + medications + diagnoses + demographics + + def get_feature_specs(project_info: ProjectInfo) -> list[_AnySpec]: """Get a spec set.""" diff --git a/src/psycop_feature_generation/application_modules/filter_prediction_times.py b/src/psycop_feature_generation/application_modules/filter_prediction_times.py index 9b95f161..a9091b65 100644 --- a/src/psycop_feature_generation/application_modules/filter_prediction_times.py +++ b/src/psycop_feature_generation/application_modules/filter_prediction_times.py @@ -25,7 +25,6 @@ def __init__( Any prediction times within the quarantine_interval_days after this timestamp will be dropped. quarantine_days (int, optional): Number of days to quarantine. entity_id_col_name (str): Name of the entity_id_col_name column. - """ self.prediction_times_df = prediction_times_df diff --git a/src/psycop_feature_generation/application_modules/project_setup.py b/src/psycop_feature_generation/application_modules/project_setup.py index 08a50b99..7ec03648 100644 --- a/src/psycop_feature_generation/application_modules/project_setup.py +++ b/src/psycop_feature_generation/application_modules/project_setup.py @@ -7,12 +7,12 @@ from typing import Literal import wandb - -from psycop_feature_generation.utils import RELATIVE_PROJECT_ROOT, SHARED_RESOURCES_PATH from timeseriesflattener.feature_spec_objects import ( # pylint: disable=no-name-in-module BaseModel, ) +from psycop_feature_generation.utils import RELATIVE_PROJECT_ROOT, SHARED_RESOURCES_PATH + log = logging.getLogger(__name__) diff --git a/src/psycop_feature_generation/application_modules/utils.py b/src/psycop_feature_generation/application_modules/utils.py index 889d63d7..e57a593f 100644 --- a/src/psycop_feature_generation/application_modules/utils.py +++ b/src/psycop_feature_generation/application_modules/utils.py @@ -1,3 +1,4 @@ +"""Utility functions for the application modules.""" import functools import logging from typing import Callable diff --git a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py index 725a5b63..fbe02568 100644 --- a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py +++ b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py @@ -7,10 +7,10 @@ import numpy as np import pandas as pd from timeseriesflattener.feature_spec_objects import ( - _AnySpec, PredictorSpec, StaticSpec, TemporalSpec, + _AnySpec, ) from wasabi import Printer diff --git a/src/psycop_feature_generation/loaders/raw/load_medications.py b/src/psycop_feature_generation/loaders/raw/load_medications.py index 0eedfdca..0962d205 100644 --- a/src/psycop_feature_generation/loaders/raw/load_medications.py +++ b/src/psycop_feature_generation/loaders/raw/load_medications.py @@ -4,7 +4,6 @@ from typing import Optional, Union import pandas as pd -from wasabi import msg from psycop_feature_generation.loaders.raw.utils import load_from_codes from psycop_feature_generation.utils import data_loaders diff --git a/src/psycop_feature_generation/loaders/raw/load_visits.py b/src/psycop_feature_generation/loaders/raw/load_visits.py index 2b56856b..dc698163 100644 --- a/src/psycop_feature_generation/loaders/raw/load_visits.py +++ b/src/psycop_feature_generation/loaders/raw/load_visits.py @@ -4,7 +4,6 @@ from typing import Optional import pandas as pd -from wasabi import msg from psycop_feature_generation.loaders.raw.sql_load import sql_load from psycop_feature_generation.utils import data_loaders @@ -98,7 +97,8 @@ def physical_visits( @data_loaders.register("physical_visits_to_psychiatry") def physical_visits_to_psychiatry( - n_rows: Optional[int] = None, timestamps_only: bool = True + n_rows: Optional[int] = None, + timestamps_only: bool = True, ) -> pd.DataFrame: """Load physical visits to psychiatry.""" df = physical_visits(shak_code=6600, shak_sql_operator="=", n_rows=n_rows) diff --git a/src/psycop_feature_generation/utils_for_testing.py b/src/psycop_feature_generation/utils_for_testing.py index fd702526..e4f4353e 100644 --- a/src/psycop_feature_generation/utils_for_testing.py +++ b/src/psycop_feature_generation/utils_for_testing.py @@ -52,6 +52,7 @@ def str_to_df( convert_str_to_float (bool): Whether to convert strings to floats. Defaults to False. add_pred_time_uuid (bool): Whether to infer a pred_time_uuid column from entity_id and timestamp columns. Defaults to False. entity_id_colname (str): The name of the entity_id column. Defaults to "entity_id". + timestamp_col_name (str): The name of the timestamp column. Defaults to "timestamp". Returns: DataFrame: A dataframe. diff --git a/src/timeseriesflattener b/src/timeseriesflattener deleted file mode 160000 index 087f843e..00000000 --- a/src/timeseriesflattener +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 087f843e48be74ddd35bd1b9afcb89b6c6bf1cc0 From 4c8538e8cee60a54efebcc037fe394bb67fb1744 Mon Sep 17 00:00:00 2001 From: Martin Bernstorff Date: Thu, 15 Dec 2022 11:47:29 +0100 Subject: [PATCH 8/8] deps: update timeseriesflattener dependency --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index fe3b87ed..4acac540 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ pyarrow = ">=9.0.0,<9.1.0" psycopmlutils = ">=0.2.4, <0.3.0" protobuf = "<=3.20.3" # Other versions give errors with pytest, super weird! frozendict = "^2.3.4" -timeseriesflattener = "0.20.2" +timeseriesflattener = "0.21.0" [tool.poetry.dev-dependencies] pytest = ">=7.1.3, <7.1.5"