From 3deb052d7373ba8a4ebef265f8364101555bfd72 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Thu, 15 Dec 2022 09:51:11 +0100
Subject: [PATCH 1/8] feat: allow filtering based on quarantine dates

---
 .../filter_prediction_times.py                | 75 +++++++++++++++++++
 .../utils_for_testing.py                      |  9 +++
 .../test_filter_prediction_times.py           | 55 ++++++++++++++
 3 files changed, 139 insertions(+)
 create mode 100644 src/psycop_feature_generation/application_modules/filter_prediction_times.py
 create mode 100644 tests/test_application_modules/test_filter_prediction_times.py

diff --git a/src/psycop_feature_generation/application_modules/filter_prediction_times.py b/src/psycop_feature_generation/application_modules/filter_prediction_times.py
new file mode 100644
index 00000000..a8c28626
--- /dev/null
+++ b/src/psycop_feature_generation/application_modules/filter_prediction_times.py
@@ -0,0 +1,75 @@
+"""Class for filtering prediction times before they are used for feature generation."""
+from typing import Optional
+
+import pandas as pd
+
+
+class PredictionTimeFilterer:
+    def __init__(
+        self,
+        prediction_time_df: pd.DataFrame,
+        id_col_name: str,
+        quarantine_df: Optional[pd.DataFrame] = None,
+        quarantine_days: Optional[int] = None,
+    ):
+        self.prediction_time_df = prediction_time_df
+        self.quarantine_df = quarantine_df
+        self.quarantine_days = quarantine_days
+        self.id_col_name = id_col_name
+
+    def _filter_prediction_times_by_quarantine_period(self):
+        # We need to check if ANY quarantine date hits each prediction time.
+        # Create combinations
+        df = self.prediction_time_df.merge(
+            self.quarantine_df,
+            on=self.id_col_name,
+            how="left",
+            suffixes=("_pred", "_quarantine"),
+        )
+
+        df["days_since_quarantine"] = (
+            df["timestamp_pred"] - df["timestamp_quarantine"]
+        ).dt.days
+
+        # Check if the prediction time is hit by the quarantine date.
+        df.loc[
+            (df["days_since_quarantine"] < self.quarantine_days)
+            & (df["days_since_quarantine"] > 0),
+            "hit_by_quarantine",
+        ] = True
+
+        # If any of the combinations for a UUID is hit by a quarantine date, drop it.
+        df["hit_by_quarantine"] = df.groupby("pred_time_uuid")[
+            "hit_by_quarantine"
+        ].transform("max")
+
+        df = df.loc[df["hit_by_quarantine"] != True]
+
+        df = df.drop_duplicates(subset="pred_time_uuid")
+
+        # Drop the columns we added
+        df = df.drop(
+            columns=[
+                "days_since_quarantine",
+                "hit_by_quarantine",
+                "timestamp_quarantine",
+            ]
+        )
+
+        # Rename the timestamp column
+        df = df.rename(columns={"timestamp_pred": "timestamp"})
+
+        return df
+
+    def filter(self):
+        df = self.prediction_time_df
+
+        if self.quarantine_df is not None or self.quarantine_days is not None:
+            if self.quarantine_days is None or self.quarantine_days is None:
+                raise ValueError(
+                    "If either of quarantine_df and quarantine_days are provided, both must be provided."
+                )
+
+            df = self._filter_prediction_times_by_quarantine_period()
+
+        return df
diff --git a/src/psycop_feature_generation/utils_for_testing.py b/src/psycop_feature_generation/utils_for_testing.py
index b596ad67..0d784fba 100644
--- a/src/psycop_feature_generation/utils_for_testing.py
+++ b/src/psycop_feature_generation/utils_for_testing.py
@@ -39,6 +39,8 @@ def str_to_df(
     convert_timestamp_to_datetime: bool = True,
     convert_np_nan_to_nan: bool = True,
     convert_str_to_float: bool = False,
+    add_pred_time_uuid: bool = False,
+    entity_id_colname: str = "entity_id",
 ) -> DataFrame:
     """Convert a string representation of a dataframe to a dataframe.
 
@@ -51,6 +53,8 @@ def str_to_df(
     Returns:
         DataFrame: A dataframe.
     """
+    # Drop anything after the last comma
+    string = string[: string.rfind(",") + 1]
 
     df = pd.read_table(StringIO(string), sep=",", index_col=False)
 
@@ -65,6 +69,11 @@ def str_to_df(
         # Convert all str to float
         df = df.apply(pd.to_numeric, axis=0, errors="coerce")
 
+    if add_pred_time_uuid:
+        df["pred_time_uuid"] = (
+            df[entity_id_colname].astype(str) + "_" + df["timestamp"].astype(str)
+        )
+
     # Drop "Unnamed" cols
     return df.loc[:, ~df.columns.str.contains("^Unnamed")]
 
diff --git a/tests/test_application_modules/test_filter_prediction_times.py b/tests/test_application_modules/test_filter_prediction_times.py
new file mode 100644
index 00000000..9932aed5
--- /dev/null
+++ b/tests/test_application_modules/test_filter_prediction_times.py
@@ -0,0 +1,55 @@
+import pandas as pd
+
+from psycop_feature_generation.application_modules.filter_prediction_times import (
+    PredictionTimeFilterer,
+)
+from psycop_feature_generation.utils_for_testing import str_to_df
+
+
+def test_filter_by_quarantine_date():
+    """Test filtering by quarantine date.
+
+    Should filter if the prediction times is within X days after the quarantine date.
+
+    Uses dataframes as input.
+    """
+    quarantine_df = str_to_df(
+        """entity_id,timestamp,
+        1,2021-01-01 00:00:01,
+        1,2022-01-01 00:00:01,
+        """,
+    )
+
+    prediction_time_df = str_to_df(
+        """entity_id,timestamp,
+        1,2020-12-01 00:00:01, # keep: before quarantine date 
+        1,2022-12-01 00:00:01, # drop: after quarantine date
+        1,2026-02-01 00:00:01, # keep: outside quarantine days
+        2,2023-02-01 00:00:01, # keep: no quarantine date for this id
+        """,
+        add_pred_time_uuid=True,
+    )
+
+    expected_df = str_to_df(
+        """entity_id,timestamp,
+        1,2020-12-01 00:00:01,
+        1,2026-02-01 00:00:01,
+        2,2023-02-01 00:00:01,
+        """,
+        add_pred_time_uuid=True,
+    )
+
+    filterer = PredictionTimeFilterer(
+        prediction_time_df=prediction_time_df,
+        quarantine_df=quarantine_df,
+        quarantine_days=730,
+        id_col_name="entity_id",
+    )
+
+    result_df = filterer.filter()
+
+    # Check that the result is as expected using pandas.testing.assert_frame_equal
+    pd.testing.assert_frame_equal(
+        result_df.reset_index(drop=True),
+        expected_df.reset_index(drop=True),
+    )

From c3c2b6825f871e2c431352bc2c5f1413702b9f21 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Thu, 15 Dec 2022 09:52:59 +0100
Subject: [PATCH 2/8] style: linting

---
 .../filter_prediction_times.py                    | 15 +++++++++++----
 .../utils_for_testing.py                          |  2 ++
 .../test_filter_prediction_times.py               |  3 ++-
 3 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/src/psycop_feature_generation/application_modules/filter_prediction_times.py b/src/psycop_feature_generation/application_modules/filter_prediction_times.py
index a8c28626..4704f562 100644
--- a/src/psycop_feature_generation/application_modules/filter_prediction_times.py
+++ b/src/psycop_feature_generation/application_modules/filter_prediction_times.py
@@ -1,10 +1,14 @@
-"""Class for filtering prediction times before they are used for feature generation."""
+"""Class for filtering prediction times before they are used for feature
+generation."""
 from typing import Optional
 
 import pandas as pd
 
 
 class PredictionTimeFilterer:
+    """Class for filtering prediction times before they are used for
+    feature."""
+
     def __init__(
         self,
         prediction_time_df: pd.DataFrame,
@@ -43,7 +47,9 @@ def _filter_prediction_times_by_quarantine_period(self):
             "hit_by_quarantine"
         ].transform("max")
 
-        df = df.loc[df["hit_by_quarantine"] != True]
+        df = df.loc[
+            df["hit_by_quarantine"] != True  # pylint: disable=singleton-comparison
+        ]
 
         df = df.drop_duplicates(subset="pred_time_uuid")
 
@@ -53,7 +59,7 @@ def _filter_prediction_times_by_quarantine_period(self):
                 "days_since_quarantine",
                 "hit_by_quarantine",
                 "timestamp_quarantine",
-            ]
+            ],
         )
 
         # Rename the timestamp column
@@ -62,12 +68,13 @@ def _filter_prediction_times_by_quarantine_period(self):
         return df
 
     def filter(self):
+        """Run filters based on the provided parameters."""
         df = self.prediction_time_df
 
         if self.quarantine_df is not None or self.quarantine_days is not None:
             if self.quarantine_days is None or self.quarantine_days is None:
                 raise ValueError(
-                    "If either of quarantine_df and quarantine_days are provided, both must be provided."
+                    "If either of quarantine_df and quarantine_days are provided, both must be provided.",
                 )
 
             df = self._filter_prediction_times_by_quarantine_period()
diff --git a/src/psycop_feature_generation/utils_for_testing.py b/src/psycop_feature_generation/utils_for_testing.py
index 0d784fba..604cd47a 100644
--- a/src/psycop_feature_generation/utils_for_testing.py
+++ b/src/psycop_feature_generation/utils_for_testing.py
@@ -49,6 +49,8 @@ def str_to_df(
         convert_timestamp_to_datetime (bool): Whether to convert the timestamp column to datetime. Defaults to True.
         convert_np_nan_to_nan (bool): Whether to convert np.nan to np.nan. Defaults to True.
         convert_str_to_float (bool): Whether to convert strings to floats. Defaults to False.
+        add_pred_time_uuid (bool): Whether to add a pred_time_uuid column. Defaults to False.
+        entity_id_colname (str): The name of the entity_id column. Defaults to "entity_id".
 
     Returns:
         DataFrame: A dataframe.
diff --git a/tests/test_application_modules/test_filter_prediction_times.py b/tests/test_application_modules/test_filter_prediction_times.py
index 9932aed5..5343525c 100644
--- a/tests/test_application_modules/test_filter_prediction_times.py
+++ b/tests/test_application_modules/test_filter_prediction_times.py
@@ -1,3 +1,4 @@
+"""Tests of the filter_prediction_times module."""
 import pandas as pd
 
 from psycop_feature_generation.application_modules.filter_prediction_times import (
@@ -22,7 +23,7 @@ def test_filter_by_quarantine_date():
 
     prediction_time_df = str_to_df(
         """entity_id,timestamp,
-        1,2020-12-01 00:00:01, # keep: before quarantine date 
+        1,2020-12-01 00:00:01, # keep: before quarantine date
         1,2022-12-01 00:00:01, # drop: after quarantine date
         1,2026-02-01 00:00:01, # keep: outside quarantine days
         2,2023-02-01 00:00:01, # keep: no quarantine date for this id

From 62e8baeb7ab496ce28fc9a05bb75ed23bc43d1ad Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Thu, 15 Dec 2022 09:58:00 +0100
Subject: [PATCH 3/8] ci: bump cache version to rebuild virtualenv

---
 .github/workflows/main_test_and_release.yml | 40 ++++++++++-----------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/main_test_and_release.yml b/.github/workflows/main_test_and_release.yml
index 5edc5436..8c658bdc 100644
--- a/.github/workflows/main_test_and_release.yml
+++ b/.github/workflows/main_test_and_release.yml
@@ -9,10 +9,10 @@ on:
   pull_request:
   push:
     branches:
-      - main 
+      - main
 
 env:
-  cache-version: 0.0.4
+  cache-version: 0.0.5
   poetry-version: 1.1.15
   python-version: 3.9 # Change this number if you want to manually invalidate all caches
 
@@ -26,7 +26,7 @@ jobs:
 
     # This allows a subsequently queued workflow run to interrupt previous runs
     concurrency:
-      group: '${{ github.workflow }} - ${{ matrix.os }} @ ${{ github.ref }}'
+      group: "${{ github.workflow }} - ${{ matrix.os }} @ ${{ github.ref }}"
       cancel-in-progress: true
 
     steps:
@@ -46,21 +46,21 @@ jobs:
     needs: test
     if: ${{ github.ref == 'refs/heads/main' }}
     steps:
-    # Checkout action is required for token to persist
-    - uses: actions/checkout@v2
-      with:
-        fetch-depth: 0
-        token: ${{ secrets.RELEASE_BOT }}
+      # Checkout action is required for token to persist
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+          token: ${{ secrets.RELEASE_BOT }}
 
-    - name: Python Semantic Release
-      uses: relekang/python-semantic-release@v7.32.0
-      with: 
-        github_token: ${{ secrets.RELEASE_BOT }}
-        # Remember to copy the tool.semantic_release section from pyproject.toml 
-        # as well
-        # To enable pypi, 
-        # 1) Set upload_to_pypi to true in pyproject.toml and
-        # 2) Set the pypi_token in the repo
-        # 3) Uncomment the two lines below
-        repository_username: __token__
-        repository_password: ${{ secrets.PYPI_TOKEN }}
\ No newline at end of file
+      - name: Python Semantic Release
+        uses: relekang/python-semantic-release@v7.32.0
+        with:
+          github_token: ${{ secrets.RELEASE_BOT }}
+          # Remember to copy the tool.semantic_release section from pyproject.toml
+          # as well
+          # To enable pypi,
+          # 1) Set upload_to_pypi to true in pyproject.toml and
+          # 2) Set the pypi_token in the repo
+          # 3) Uncomment the two lines below
+          repository_username: __token__
+          repository_password: ${{ secrets.PYPI_TOKEN }}

From 9f47e276f21378a4d19975b854798cf0949bbf4b Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Thu, 15 Dec 2022 11:08:17 +0100
Subject: [PATCH 4/8] init: first stab at adding to application module

---
 .../application_modules/flatten_dataset.py    | 49 +++++++++++++++++--
 1 file changed, 45 insertions(+), 4 deletions(-)

diff --git a/src/psycop_feature_generation/application_modules/flatten_dataset.py b/src/psycop_feature_generation/application_modules/flatten_dataset.py
index 64012a5a..6d84911d 100644
--- a/src/psycop_feature_generation/application_modules/flatten_dataset.py
+++ b/src/psycop_feature_generation/application_modules/flatten_dataset.py
@@ -1,15 +1,46 @@
 """Flatten the dataset."""
 import pandas as pd
 import psutil
+from timeseriesflattener.feature_cache.cache_to_disk import DiskCache
+from timeseriesflattener.feature_spec_objects import _AnySpec
+from timeseriesflattener.flattened_dataset import TimeseriesFlattener
 
+from psycop_feature_generation.application_modules.filter_prediction_times import (
+    PredictionTimeFilterer,
+)
 from psycop_feature_generation.application_modules.project_setup import ProjectInfo
 from psycop_feature_generation.application_modules.wandb_utils import (
     wandb_alert_on_exception,
 )
 from psycop_feature_generation.loaders.raw.load_demographic import birthdays
-from timeseriesflattener.feature_cache.cache_to_disk import DiskCache
-from timeseriesflattener.feature_spec_objects import _AnySpec
-from timeseriesflattener.flattened_dataset import TimeseriesFlattener
+
+
+def filter_prediction_times(
+    prediction_times_df: pd.DataFrame,
+    project_info: ProjectInfo,
+    quarantine_df=None,
+    quarantine_days=None,
+) -> pd.DataFrame:
+    """Filter prediction times.
+
+    Args:
+        prediction_times_df (pd.DataFrame): Prediction times dataframe.
+            Should contain entity_id and timestamp columns with col_names matching those in project_info.col_names.
+        project_info (ProjectInfo): Project info.
+        quarantine_df (pd.DataFrame, optional): Quarantine dataframe with "timestamp" and "project_info.col_names.id" columns.
+        quarantine_days (int, optional): Number of days to quarantine.
+
+    Returns:
+        pd.DataFrame: Filtered prediction times dataframe.
+    """
+    filterer = PredictionTimeFilterer(
+        prediction_time_df=prediction_times_df,
+        id_col_name=project_info.col_names.id,
+        quarantine_df=quarantine_df,
+        quarantine_days=quarantine_days,
+    )
+
+    return filterer.filter()
 
 
 @wandb_alert_on_exception
@@ -18,6 +49,8 @@ def create_flattened_dataset(
     prediction_times_df: pd.DataFrame,
     drop_pred_times_with_insufficient_look_distance: bool,
     project_info: ProjectInfo,
+    quarantine_df=None,
+    quarantine_days=None,
 ) -> pd.DataFrame:
     """Create flattened dataset.
 
@@ -28,13 +61,21 @@ def create_flattened_dataset(
             Should contain entity_id and timestamp columns with col_names matching those in project_info.col_names.
         drop_pred_times_with_insufficient_look_distance (bool): Whether to drop prediction times with insufficient look distance.
             See timeseriesflattener tutorial for more info.
+        quarantine_df (pd.DataFrame, optional): Quarantine dataframe with "timestamp" and "project_info.col_names.id" columns.
+        quarantine_days (int, optional): Number of days to quarantine.
 
     Returns:
         FlattenedDataset: Flattened dataset.
     """
+    filtered_prediction_times_df = filter_prediction_times(
+        prediction_times_df=prediction_times_df,
+        project_info=project_info,
+        quarantine_df=quarantine_df,
+        quarantine_days=quarantine_days,
+    )
 
     flattened_dataset = TimeseriesFlattener(
-        prediction_times_df=prediction_times_df,
+        prediction_times_df=filtered_prediction_times_df,
         n_workers=min(
             len(feature_specs),
             psutil.cpu_count(logical=True),

From d7f0c6da03ee8739418304d8b7fd2955ab12ac77 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Thu, 15 Dec 2022 11:19:49 +0100
Subject: [PATCH 5/8] misc: improvements from review

---
 .../filter_prediction_times.py                | 36 ++++++++++++-------
 .../application_modules/flatten_dataset.py    |  8 ++---
 .../utils_for_testing.py                      |  9 ++---
 .../test_filter_prediction_times.py           | 19 ++++++----
 4 files changed, 45 insertions(+), 27 deletions(-)

diff --git a/src/psycop_feature_generation/application_modules/filter_prediction_times.py b/src/psycop_feature_generation/application_modules/filter_prediction_times.py
index 4704f562..9b95f161 100644
--- a/src/psycop_feature_generation/application_modules/filter_prediction_times.py
+++ b/src/psycop_feature_generation/application_modules/filter_prediction_times.py
@@ -11,22 +11,34 @@ class PredictionTimeFilterer:
 
     def __init__(
         self,
-        prediction_time_df: pd.DataFrame,
-        id_col_name: str,
-        quarantine_df: Optional[pd.DataFrame] = None,
-        quarantine_days: Optional[int] = None,
+        prediction_times_df: pd.DataFrame,
+        entity_id_col_name: str,
+        quarantine_timestamps_df: Optional[pd.DataFrame] = None,
+        quarantine_interval_days: Optional[int] = None,
     ):
-        self.prediction_time_df = prediction_time_df
-        self.quarantine_df = quarantine_df
-        self.quarantine_days = quarantine_days
-        self.id_col_name = id_col_name
+        """Initialize PredictionTimeFilterer.
+
+        Args:
+            prediction_times_df (pd.DataFrame): Prediction times dataframe.
+                Should contain entity_id and timestamp columns with col_names matching those in project_info.col_names.
+            quarantine_df (pd.DataFrame, optional): A dataframe with "timestamp" column from which to start the quarantine.
+                Any prediction times within the quarantine_interval_days after this timestamp will be dropped.
+            quarantine_days (int, optional): Number of days to quarantine.
+            entity_id_col_name (str): Name of the entity_id_col_name column.
+
+        """
+
+        self.prediction_times_df = prediction_times_df
+        self.quarantine_df = quarantine_timestamps_df
+        self.quarantine_days = quarantine_interval_days
+        self.entity_id_col_name = entity_id_col_name
 
     def _filter_prediction_times_by_quarantine_period(self):
         # We need to check if ANY quarantine date hits each prediction time.
         # Create combinations
-        df = self.prediction_time_df.merge(
+        df = self.prediction_times_df.merge(
             self.quarantine_df,
-            on=self.id_col_name,
+            on=self.entity_id_col_name,
             how="left",
             suffixes=("_pred", "_quarantine"),
         )
@@ -69,10 +81,10 @@ def _filter_prediction_times_by_quarantine_period(self):
 
     def filter(self):
         """Run filters based on the provided parameters."""
-        df = self.prediction_time_df
+        df = self.prediction_times_df
 
         if self.quarantine_df is not None or self.quarantine_days is not None:
-            if self.quarantine_days is None or self.quarantine_days is None:
+            if all([v is None for v in (self.quarantine_days, self.quarantine_df)]):
                 raise ValueError(
                     "If either of quarantine_df and quarantine_days are provided, both must be provided.",
                 )
diff --git a/src/psycop_feature_generation/application_modules/flatten_dataset.py b/src/psycop_feature_generation/application_modules/flatten_dataset.py
index 6d84911d..153d5b08 100644
--- a/src/psycop_feature_generation/application_modules/flatten_dataset.py
+++ b/src/psycop_feature_generation/application_modules/flatten_dataset.py
@@ -34,10 +34,10 @@ def filter_prediction_times(
         pd.DataFrame: Filtered prediction times dataframe.
     """
     filterer = PredictionTimeFilterer(
-        prediction_time_df=prediction_times_df,
-        id_col_name=project_info.col_names.id,
-        quarantine_df=quarantine_df,
-        quarantine_days=quarantine_days,
+        prediction_times_df=prediction_times_df,
+        entity_id_col_name=project_info.col_names.id,
+        quarantine_timestamps_df=quarantine_df,
+        quarantine_interval_days=quarantine_days,
     )
 
     return filterer.filter()
diff --git a/src/psycop_feature_generation/utils_for_testing.py b/src/psycop_feature_generation/utils_for_testing.py
index 604cd47a..fd702526 100644
--- a/src/psycop_feature_generation/utils_for_testing.py
+++ b/src/psycop_feature_generation/utils_for_testing.py
@@ -41,6 +41,7 @@ def str_to_df(
     convert_str_to_float: bool = False,
     add_pred_time_uuid: bool = False,
     entity_id_colname: str = "entity_id",
+    timestamp_col_name: str = "timestamp",
 ) -> DataFrame:
     """Convert a string representation of a dataframe to a dataframe.
 
@@ -49,14 +50,14 @@ def str_to_df(
         convert_timestamp_to_datetime (bool): Whether to convert the timestamp column to datetime. Defaults to True.
         convert_np_nan_to_nan (bool): Whether to convert np.nan to np.nan. Defaults to True.
         convert_str_to_float (bool): Whether to convert strings to floats. Defaults to False.
-        add_pred_time_uuid (bool): Whether to add a pred_time_uuid column. Defaults to False.
+        add_pred_time_uuid (bool): Whether to infer a pred_time_uuid column from entity_id and timestamp columns. Defaults to False.
         entity_id_colname (str): The name of the entity_id column. Defaults to "entity_id".
 
     Returns:
         DataFrame: A dataframe.
     """
-    # Drop anything after the last comma
-    string = string[: string.rfind(",") + 1]
+    # Drop comments for each line if any exist inside the str
+    string = string[: string.rfind("#")]
 
     df = pd.read_table(StringIO(string), sep=",", index_col=False)
 
@@ -73,7 +74,7 @@ def str_to_df(
 
     if add_pred_time_uuid:
         df["pred_time_uuid"] = (
-            df[entity_id_colname].astype(str) + "_" + df["timestamp"].astype(str)
+            df[entity_id_colname].astype(str) + "_" + df[timestamp_col_name].astype(str)
         )
 
     # Drop "Unnamed" cols
diff --git a/tests/test_application_modules/test_filter_prediction_times.py b/tests/test_application_modules/test_filter_prediction_times.py
index 5343525c..22e2715b 100644
--- a/tests/test_application_modules/test_filter_prediction_times.py
+++ b/tests/test_application_modules/test_filter_prediction_times.py
@@ -7,12 +7,17 @@
 from psycop_feature_generation.utils_for_testing import str_to_df
 
 
-def test_filter_by_quarantine_date():
+def test_filter_by_quarantine_period():
     """Test filtering by quarantine date.
 
-    Should filter if the prediction times is within X days after the quarantine date.
+    Should filter if the prediction times lie within quarantine_interval_days after the prediction time.
 
-    Uses dataframes as input.
+    E.g. for type 2 diabetes, patients are quarantined for 730 days after they return to the Central Denmark Region.
+    If a patient has an event in that time, they were probably incident outside of the region.
+    Therefore, their following prediction times should be filtered out.
+
+    Note that this function only filters the prediction times within the quarantine period.
+    Filtering after the outcome is done inside TimeseriesFlattener when the OutcomeSpec has incident = True.
     """
     quarantine_df = str_to_df(
         """entity_id,timestamp,
@@ -41,10 +46,10 @@ def test_filter_by_quarantine_date():
     )
 
     filterer = PredictionTimeFilterer(
-        prediction_time_df=prediction_time_df,
-        quarantine_df=quarantine_df,
-        quarantine_days=730,
-        id_col_name="entity_id",
+        prediction_times_df=prediction_time_df,
+        quarantine_timestamps_df=quarantine_df,
+        quarantine_interval_days=730,
+        entity_id_col_name="entity_id",
     )
 
     result_df = filterer.filter()

From 33ba525c1cd3231dce080b64eb57d571a824ba3b Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Thu, 15 Dec 2022 11:24:19 +0100
Subject: [PATCH 6/8] feat: add rows dropped logging

---
 .../application_modules/flatten_dataset.py    | 12 ++--
 .../application_modules/utils.py              | 60 +++++++++++++++++++
 2 files changed, 68 insertions(+), 4 deletions(-)
 create mode 100644 src/psycop_feature_generation/application_modules/utils.py

diff --git a/src/psycop_feature_generation/application_modules/flatten_dataset.py b/src/psycop_feature_generation/application_modules/flatten_dataset.py
index 153d5b08..ed5393e2 100644
--- a/src/psycop_feature_generation/application_modules/flatten_dataset.py
+++ b/src/psycop_feature_generation/application_modules/flatten_dataset.py
@@ -1,4 +1,6 @@
 """Flatten the dataset."""
+from typing import Optional
+
 import pandas as pd
 import psutil
 from timeseriesflattener.feature_cache.cache_to_disk import DiskCache
@@ -9,17 +11,19 @@
     PredictionTimeFilterer,
 )
 from psycop_feature_generation.application_modules.project_setup import ProjectInfo
+from psycop_feature_generation.application_modules.utils import print_df_dimensions_diff
 from psycop_feature_generation.application_modules.wandb_utils import (
     wandb_alert_on_exception,
 )
 from psycop_feature_generation.loaders.raw.load_demographic import birthdays
 
 
+@print_df_dimensions_diff
 def filter_prediction_times(
     prediction_times_df: pd.DataFrame,
     project_info: ProjectInfo,
-    quarantine_df=None,
-    quarantine_days=None,
+    quarantine_df: Optional[pd.DataFrame] = None,
+    quarantine_days: Optional[int] = None,
 ) -> pd.DataFrame:
     """Filter prediction times.
 
@@ -49,8 +53,8 @@ def create_flattened_dataset(
     prediction_times_df: pd.DataFrame,
     drop_pred_times_with_insufficient_look_distance: bool,
     project_info: ProjectInfo,
-    quarantine_df=None,
-    quarantine_days=None,
+    quarantine_df: Optional[pd.DataFrame] = None,
+    quarantine_days: Optional[int] = None,
 ) -> pd.DataFrame:
     """Create flattened dataset.
 
diff --git a/src/psycop_feature_generation/application_modules/utils.py b/src/psycop_feature_generation/application_modules/utils.py
new file mode 100644
index 00000000..889d63d7
--- /dev/null
+++ b/src/psycop_feature_generation/application_modules/utils.py
@@ -0,0 +1,60 @@
+import functools
+import logging
+from typing import Callable
+
+import pandas as pd
+
+
+def print_df_dimensions_diff(
+    func: Callable,
+    print_when_starting: bool = False,
+    print_when_no_diff=False,
+):
+    """Print the difference in rows between the input and output dataframes."""
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        log = logging.getLogger(__name__)
+
+        if print_when_starting:
+            log.info(f"{func.__name__}: Starting")
+
+        arg_dfs = [arg for arg in args if isinstance(arg, pd.DataFrame)]
+        kwargs_dfs = [arg for arg in kwargs.values() if isinstance(arg, pd.DataFrame)]
+        potential_dfs = arg_dfs + kwargs_dfs
+
+        if len(potential_dfs) > 1:
+            raise ValueError("More than one DataFrame found in args or kwargs")
+
+        df = potential_dfs[0]
+
+        for dim in ("rows", "columns"):
+            dim_int = 0 if dim == "rows" else 1
+
+            n_in_dim_before_func = df.shape[dim_int]
+
+            if print_when_no_diff:
+                log.info(
+                    f"{func.__name__}: {n_in_dim_before_func} {dim} before function",
+                )
+
+            result = func(*args, **kwargs)
+
+            diff = n_in_dim_before_func - result.shape[dim_int]
+
+            if diff != 0:
+                percent_diff = round(
+                    (n_in_dim_before_func - result.shape[dim_int])
+                    / n_in_dim_before_func
+                    * 100,
+                    2,
+                )
+
+                log.info(f"{func.__name__}: Dropped {diff} ({percent_diff}%) {dim}")
+            else:
+                if print_when_no_diff:
+                    log.info(f"{func.__name__}: No {dim} dropped")
+
+        return result
+
+    return wrapper

From efda4bb55211479bae963e44979e14438b7c69bb Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Thu, 15 Dec 2022 11:24:34 +0100
Subject: [PATCH 7/8] style: linting

---
 .../t2d/modules/specify_features.py           | 83 ++++++++++---------
 .../filter_prediction_times.py                |  1 -
 .../application_modules/project_setup.py      |  4 +-
 .../application_modules/utils.py              |  1 +
 .../flattened/feature_describer.py            |  2 +-
 .../loaders/raw/load_medications.py           |  1 -
 .../loaders/raw/load_visits.py                |  4 +-
 .../utils_for_testing.py                      |  1 +
 src/timeseriesflattener                       |  1 -
 9 files changed, 53 insertions(+), 45 deletions(-)
 delete mode 160000 src/timeseriesflattener

diff --git a/src/application/t2d/modules/specify_features.py b/src/application/t2d/modules/specify_features.py
index 1ca1c00b..12cc1f44 100644
--- a/src/application/t2d/modules/specify_features.py
+++ b/src/application/t2d/modules/specify_features.py
@@ -2,8 +2,6 @@
 import logging
 
 import numpy as np
-
-from psycop_feature_generation.application_modules.project_setup import ProjectInfo
 from timeseriesflattener.feature_spec_objects import (
     BaseModel,
     OutcomeGroupSpec,
@@ -14,6 +12,8 @@
     _AnySpec,
 )
 
+from psycop_feature_generation.application_modules.project_setup import ProjectInfo
+
 from .loaders.t2d_loaders import (  # noqa pylint: disable=unused-import
     timestamp_exclusion,
 )
@@ -31,7 +31,7 @@ class SpecSet(BaseModel):
 
 
 def get_static_predictor_specs(project_info: ProjectInfo):
-    """Get static predictor specs"""
+    """Get static predictor specs."""
     return [
         StaticSpec(
             values_loader="sex_female",
@@ -42,7 +42,7 @@ def get_static_predictor_specs(project_info: ProjectInfo):
 
 
 def get_metadata_specs(project_info: ProjectInfo) -> list[_AnySpec]:
-    """Get metadata specs"""
+    """Get metadata specs."""
     log.info("–––––––– Generating metadata specs ––––––––")
 
     return [
@@ -70,7 +70,7 @@ def get_metadata_specs(project_info: ProjectInfo) -> list[_AnySpec]:
 
 
 def get_outcome_specs(project_info: ProjectInfo):
-    """Get outcome specs"""
+    """Get outcome specs."""
     log.info("–––––––– Generating outcome specs ––––––––")
 
     return OutcomeGroupSpec(
@@ -84,39 +84,8 @@ def get_outcome_specs(project_info: ProjectInfo):
     ).create_combinations()
 
 
-def get_temporal_predictor_specs(project_info: ProjectInfo) -> list[PredictorSpec]:
-    """Generate predictor spec list."""
-    log.info("–––––––– Generating temporal predictor specs ––––––––")
-
-    resolve_multiple = ["max", "min", "mean", "latest", "count"]
-    interval_days = [30, 90, 180, 365, 730]
-    allowed_nan_value_prop = [0]
-
-    lab_results = get_lab_result_specs(
-        resolve_multiple, interval_days, allowed_nan_value_prop
-    )
-
-    diagnoses = get_diagnoses_specs(
-        resolve_multiple, interval_days, allowed_nan_value_prop
-    )
-
-    medications = get_medication_specs(
-        resolve_multiple, interval_days, allowed_nan_value_prop
-    )
-
-    demographics = PredictorGroupSpec(
-        values_loader=["weight_in_kg", "height_in_cm", "bmi"],
-        lookbehind_days=interval_days,
-        resolve_multiple_fn=["latest"],
-        fallback=[np.nan],
-        allowed_nan_value_prop=allowed_nan_value_prop,
-        prefix=project_info.prefix.predictor,
-    ).create_combinations()
-
-    return lab_results + medications + diagnoses + demographics
-
-
 def get_medication_specs(resolve_multiple, interval_days, allowed_nan_value_prop):
+    """Get medication specs."""
     log.info("–––––––– Generating medication specs ––––––––")
 
     psychiatric_medications = PredictorGroupSpec(
@@ -158,6 +127,7 @@ def get_medication_specs(resolve_multiple, interval_days, allowed_nan_value_prop
 
 
 def get_diagnoses_specs(resolve_multiple, interval_days, allowed_nan_value_prop):
+    """Get diagnoses specs."""
     log.info("–––––––– Generating diagnoses specs ––––––––")
 
     lifestyle_diagnoses = PredictorGroupSpec(
@@ -197,6 +167,7 @@ def get_diagnoses_specs(resolve_multiple, interval_days, allowed_nan_value_prop)
 
 
 def get_lab_result_specs(resolve_multiple, interval_days, allowed_nan_value_prop):
+    """Get lab result specs."""
     log.info("–––––––– Generating lab result specs ––––––––")
 
     general_lab_results = PredictorGroupSpec(
@@ -231,6 +202,44 @@ def get_lab_result_specs(resolve_multiple, interval_days, allowed_nan_value_prop
     return general_lab_results + diabetes_lab_results
 
 
+def get_temporal_predictor_specs(project_info: ProjectInfo) -> list[PredictorSpec]:
+    """Generate predictor spec list."""
+    log.info("–––––––– Generating temporal predictor specs ––––––––")
+
+    resolve_multiple = ["max", "min", "mean", "latest", "count"]
+    interval_days = [30, 90, 180, 365, 730]
+    allowed_nan_value_prop = [0]
+
+    lab_results = get_lab_result_specs(
+        resolve_multiple,
+        interval_days,
+        allowed_nan_value_prop,
+    )
+
+    diagnoses = get_diagnoses_specs(
+        resolve_multiple,
+        interval_days,
+        allowed_nan_value_prop,
+    )
+
+    medications = get_medication_specs(
+        resolve_multiple,
+        interval_days,
+        allowed_nan_value_prop,
+    )
+
+    demographics = PredictorGroupSpec(
+        values_loader=["weight_in_kg", "height_in_cm", "bmi"],
+        lookbehind_days=interval_days,
+        resolve_multiple_fn=["latest"],
+        fallback=[np.nan],
+        allowed_nan_value_prop=allowed_nan_value_prop,
+        prefix=project_info.prefix.predictor,
+    ).create_combinations()
+
+    return lab_results + medications + diagnoses + demographics
+
+
 def get_feature_specs(project_info: ProjectInfo) -> list[_AnySpec]:
     """Get a spec set."""
 
diff --git a/src/psycop_feature_generation/application_modules/filter_prediction_times.py b/src/psycop_feature_generation/application_modules/filter_prediction_times.py
index 9b95f161..a9091b65 100644
--- a/src/psycop_feature_generation/application_modules/filter_prediction_times.py
+++ b/src/psycop_feature_generation/application_modules/filter_prediction_times.py
@@ -25,7 +25,6 @@ def __init__(
                 Any prediction times within the quarantine_interval_days after this timestamp will be dropped.
             quarantine_days (int, optional): Number of days to quarantine.
             entity_id_col_name (str): Name of the entity_id_col_name column.
-
         """
 
         self.prediction_times_df = prediction_times_df
diff --git a/src/psycop_feature_generation/application_modules/project_setup.py b/src/psycop_feature_generation/application_modules/project_setup.py
index 08a50b99..7ec03648 100644
--- a/src/psycop_feature_generation/application_modules/project_setup.py
+++ b/src/psycop_feature_generation/application_modules/project_setup.py
@@ -7,12 +7,12 @@
 from typing import Literal
 
 import wandb
-
-from psycop_feature_generation.utils import RELATIVE_PROJECT_ROOT, SHARED_RESOURCES_PATH
 from timeseriesflattener.feature_spec_objects import (  # pylint: disable=no-name-in-module
     BaseModel,
 )
 
+from psycop_feature_generation.utils import RELATIVE_PROJECT_ROOT, SHARED_RESOURCES_PATH
+
 log = logging.getLogger(__name__)
 
 
diff --git a/src/psycop_feature_generation/application_modules/utils.py b/src/psycop_feature_generation/application_modules/utils.py
index 889d63d7..e57a593f 100644
--- a/src/psycop_feature_generation/application_modules/utils.py
+++ b/src/psycop_feature_generation/application_modules/utils.py
@@ -1,3 +1,4 @@
+"""Utility functions for the application modules."""
 import functools
 import logging
 from typing import Callable
diff --git a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py
index 725a5b63..fbe02568 100644
--- a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py
+++ b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py
@@ -7,10 +7,10 @@
 import numpy as np
 import pandas as pd
 from timeseriesflattener.feature_spec_objects import (
-    _AnySpec,
     PredictorSpec,
     StaticSpec,
     TemporalSpec,
+    _AnySpec,
 )
 from wasabi import Printer
 
diff --git a/src/psycop_feature_generation/loaders/raw/load_medications.py b/src/psycop_feature_generation/loaders/raw/load_medications.py
index 0eedfdca..0962d205 100644
--- a/src/psycop_feature_generation/loaders/raw/load_medications.py
+++ b/src/psycop_feature_generation/loaders/raw/load_medications.py
@@ -4,7 +4,6 @@
 from typing import Optional, Union
 
 import pandas as pd
-from wasabi import msg
 
 from psycop_feature_generation.loaders.raw.utils import load_from_codes
 from psycop_feature_generation.utils import data_loaders
diff --git a/src/psycop_feature_generation/loaders/raw/load_visits.py b/src/psycop_feature_generation/loaders/raw/load_visits.py
index 2b56856b..dc698163 100644
--- a/src/psycop_feature_generation/loaders/raw/load_visits.py
+++ b/src/psycop_feature_generation/loaders/raw/load_visits.py
@@ -4,7 +4,6 @@
 from typing import Optional
 
 import pandas as pd
-from wasabi import msg
 
 from psycop_feature_generation.loaders.raw.sql_load import sql_load
 from psycop_feature_generation.utils import data_loaders
@@ -98,7 +97,8 @@ def physical_visits(
 
 @data_loaders.register("physical_visits_to_psychiatry")
 def physical_visits_to_psychiatry(
-    n_rows: Optional[int] = None, timestamps_only: bool = True
+    n_rows: Optional[int] = None,
+    timestamps_only: bool = True,
 ) -> pd.DataFrame:
     """Load physical visits to psychiatry."""
     df = physical_visits(shak_code=6600, shak_sql_operator="=", n_rows=n_rows)
diff --git a/src/psycop_feature_generation/utils_for_testing.py b/src/psycop_feature_generation/utils_for_testing.py
index fd702526..e4f4353e 100644
--- a/src/psycop_feature_generation/utils_for_testing.py
+++ b/src/psycop_feature_generation/utils_for_testing.py
@@ -52,6 +52,7 @@ def str_to_df(
         convert_str_to_float (bool): Whether to convert strings to floats. Defaults to False.
         add_pred_time_uuid (bool): Whether to infer a pred_time_uuid column from entity_id and timestamp columns. Defaults to False.
         entity_id_colname (str): The name of the entity_id column. Defaults to "entity_id".
+        timestamp_col_name (str): The name of the timestamp column. Defaults to "timestamp".
 
     Returns:
         DataFrame: A dataframe.
diff --git a/src/timeseriesflattener b/src/timeseriesflattener
deleted file mode 160000
index 087f843e..00000000
--- a/src/timeseriesflattener
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 087f843e48be74ddd35bd1b9afcb89b6c6bf1cc0

From 4c8538e8cee60a54efebcc037fe394bb67fb1744 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Thu, 15 Dec 2022 11:47:29 +0100
Subject: [PATCH 8/8] deps: update timeseriesflattener dependency

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index fe3b87ed..4acac540 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,7 @@ pyarrow = ">=9.0.0,<9.1.0"
 psycopmlutils = ">=0.2.4, <0.3.0"
 protobuf = "<=3.20.3" # Other versions give errors with pytest, super weird!
 frozendict = "^2.3.4"
-timeseriesflattener = "0.20.2"
+timeseriesflattener = "0.21.0"
 
 [tool.poetry.dev-dependencies]
 pytest = ">=7.1.3, <7.1.5"