From d78f46592213b8245229d6618d40f1a1ff4d80eb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakob=20Gr=C3=B8hn?= <bokajgd@gmail.com>
Date: Thu, 17 Nov 2022 09:09:28 +0100
Subject: [PATCH 1/9] feat: allow load_medications to concat a list of
 medications Fixes #54

---
 .../loaders/raw/load_medications.py           |  4 +-
 .../loaders/raw/utils.py                      | 97 +++++++++++++++++++
 2 files changed, 99 insertions(+), 2 deletions(-)
 create mode 100644 src/psycop_feature_generation/loaders/raw/utils.py

diff --git a/src/psycop_feature_generation/loaders/raw/load_medications.py b/src/psycop_feature_generation/loaders/raw/load_medications.py
index b3af1949..bddada3e 100644
--- a/src/psycop_feature_generation/loaders/raw/load_medications.py
+++ b/src/psycop_feature_generation/loaders/raw/load_medications.py
@@ -18,7 +18,7 @@ def _load_one_source(
     wildcard_icd_code: Optional[bool] = False,
     n_rows: Optional[int] = None,
 ) -> pd.DataFrame:
-    """Load the prescribed medications that match atc. If wildcard_icd_code,
+    """Load the prescribed medications that match atc. If wildcard_atc_code,
     match from atc_code*. Aggregates all that match. Beware that data is
     incomplete prior to sep. 2016 for prescribed medications.
 
@@ -75,7 +75,7 @@ def load(
     n_rows: Optional[int] = None,
 ) -> pd.DataFrame:
     """Load medications. Aggregates prescribed/administered if both true. If
-    wildcard_icd_code, match from atc_code*. Aggregates all that match. Beware
+    wildcard_atc_code, match from atc_code*. Aggregates all that match. Beware
     that data is incomplete prior to sep. 2016 for prescribed medications.
 
     Args:
diff --git a/src/psycop_feature_generation/loaders/raw/utils.py b/src/psycop_feature_generation/loaders/raw/utils.py
new file mode 100644
index 00000000..9c5d804b
--- /dev/null
+++ b/src/psycop_feature_generation/loaders/raw/utils.py
@@ -0,0 +1,97 @@
+"""Example of."""
+
+from typing import Optional, Union
+
+import pandas as pd
+
+from psycop_feature_generation.loaders.raw.sql_load import sql_load
+
+
+def load_from_list(
+    codes_to_match: Union[list[str], str],
+    code_col_name: str,
+    source_timestamp_col_name: str,
+    fct: str,
+    output_col_name: Optional[str] = None,
+    wildcard_code: Optional[bool] = True,
+    n_rows: Optional[int] = None,
+) -> pd.DataFrame:
+    """Load the visits that have diagnoses that match icd_code or atc code from
+    the beginning of their adiagnosekode or atc code string. Aggregates all
+    that match.
+
+    Args:
+        codes_to_match (Union[list[str], str]): Substring(s) to match diagnoses or medications for. # noqa: DAR102
+            Matches any diagnoses, whether a-diagnosis, b-diagnosis or any atc code etc. If a list is passed, will
+            count as a match if any of the icd_codes or atc codes in the list match.
+        code_col_name (str): Name of column containing either diagnosis (icd) or medication (atc) codes.
+            Takes either 'diagnosegruppestreng' or 'atc' as input.
+        source_timestamp_col_name (str): Name of the timestamp column in the SQL
+            view.
+        fct (str): Name of the SQL view to load from.
+        output_col_name (str, optional): Name of new column string. Defaults to
+            None.
+        wildcard_code (bool, optional): Whether to match on icd_code* / atc_code*.
+            Defaults to true.
+        n_rows: Number of rows to return. Defaults to None.
+
+    Returns:
+        pd.DataFrame: A pandas dataframe with dw_ek_borger, timestamp and
+            output_col_name = 1
+    """
+    fct = f"[{fct}]"
+
+    # Must be able to split a string like this:
+    #   A:DF431#+:ALFC3#B:DF329
+    # Which means that if wildcard_code is False, we must match on icd_code# or icd_code followed by nothing.
+    # If it's true, we can match on icd_code*.
+
+    # Handle if there are multiple ICD codes to count together.
+    if isinstance(codes_to_match, list):
+        match_col_sql_strings = []
+
+        for code_str in codes_to_match:  # pylint: disable=not-an-iterable
+            if wildcard_code:
+                match_col_sql_strings.append(
+                    f"lower({code_col_name}) LIKE '%{code_str.lower()}%'",
+                )
+            else:
+                # If the string is at the end of diagnosegruppestreng, it doesn't end with a hashtag
+                match_col_sql_strings.append(
+                    f"lower({code_col_name}) LIKE '%{code_str.lower()}'",
+                )
+
+                # But if it is not at the end, it is followed by a hashtag
+                match_col_sql_strings.append(
+                    f"lower({code_col_name}) LIKE '%{code_str.lower()}#%'",
+                )
+
+        match_col_sql_str = " OR ".join(match_col_sql_strings)
+    else:
+        if wildcard_code:
+            match_col_sql_str = (
+                f"lower({code_col_name}) LIKE '%{codes_to_match.lower()}%'"
+            )
+
+        else:
+            match_col_sql_str = f"lower({code_col_name}) LIKE '%{codes_to_match.lower()}' OR lower({code_col_name}) LIKE '%{codes_to_match.lower()}#%'"
+
+    sql = (
+        f"SELECT dw_ek_borger, {source_timestamp_col_name}, {code_col_name}"
+        + f" FROM [fct].{fct} WHERE {source_timestamp_col_name} IS NOT NULL AND ({match_col_sql_str})"
+    )
+
+    df = sql_load(sql, database="USR_PS_FORSK", chunksize=None, n_rows=n_rows)
+
+    if output_col_name is None:
+        output_col_name = codes_to_match
+
+    df[output_col_name] = 1
+
+    df.drop([f"{code_col_name}"], axis="columns", inplace=True)
+
+    return df.rename(
+        columns={
+            source_timestamp_col_name: "timestamp",
+        },
+    )

From da59110978469b0743ce2d625005fc90950fb436 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakob=20Gr=C3=B8hn?= <bokajgd@gmail.com>
Date: Thu, 17 Nov 2022 13:09:31 +0100
Subject: [PATCH 2/9] fix: remove original functions

---
 .../loaders/raw/load_diagnoses.py             | 100 ++----------------
 .../loaders/raw/load_medications.py           |  73 ++-----------
 2 files changed, 18 insertions(+), 155 deletions(-)

diff --git a/src/psycop_feature_generation/loaders/raw/load_diagnoses.py b/src/psycop_feature_generation/loaders/raw/load_diagnoses.py
index 526ba3ec..1ce8920f 100644
--- a/src/psycop_feature_generation/loaders/raw/load_diagnoses.py
+++ b/src/psycop_feature_generation/loaders/raw/load_diagnoses.py
@@ -11,95 +11,10 @@
 import pandas as pd
 
 from psycop_feature_generation.loaders.raw.sql_load import sql_load
+from psycop_feature_generation.loaders.raw.utils import load_from_list
 from psycop_feature_generation.utils import data_loaders
 
 
-def _load(
-    icd_code: Union[list[str], str],
-    source_timestamp_col_name: str,
-    fct: str,
-    output_col_name: Optional[str] = None,
-    wildcard_icd_code: Optional[bool] = True,
-    n_rows: Optional[int] = None,
-) -> pd.DataFrame:
-    """Load the visits that have diagnoses that match icd_code from the
-    beginning of their adiagnosekode string. Aggregates all that match.
-
-    Args:
-        icd_code (Union[list[str], str]): Substring(s) to match diagnoses for. # noqa: DAR102
-            Matches any diagnoses, whether a-diagnosis, b-diagnosis etc. If a list is passed, will
-            count a diagnosis as a match if any of the icd_codes in the list match.
-        source_timestamp_col_name (str): Name of the timestamp column in the SQL
-            view.
-        fct (str): Name of the SQL view to load from.
-        output_col_name (str, optional): Name of new column string. Defaults to
-            None.
-        wildcard_icd_code (bool, optional): Whether to match on icd_code*.
-            Defaults to true.
-        n_rows: Number of rows to return. Defaults to None.
-
-    Returns:
-        pd.DataFrame: A pandas dataframe with dw_ek_borger, timestamp and
-            output_col_name = 1
-    """
-    fct = f"[{fct}]"
-
-    # Must be able to split a string like this:
-    #   A:DF431#+:ALFC3#B:DF329
-    # Which means that if wildcard_icd_code is False, we must match on icd_code# or icd_code followed by nothing.
-    # If it's true, we can match on icd_code*.
-
-    # Handle if there are multiple ICD codes to count together.
-    if isinstance(icd_code, list):
-        match_col_sql_strings = []
-
-        for code_str in icd_code:  # pylint: disable=not-an-iterable
-            if wildcard_icd_code:
-                match_col_sql_strings.append(
-                    f"lower(diagnosegruppestreng) LIKE '%{code_str.lower()}%'",
-                )
-            else:
-                # If the string is at the end of diagnosegruppestreng, it doesn't end with a hashtag
-                match_col_sql_strings.append(
-                    f"lower(diagnosegruppestreng) LIKE '%{code_str.lower()}'",
-                )
-
-                # But if it is at the end, it does
-                match_col_sql_strings.append(
-                    f"lower(diagnosegruppestreng) LIKE '%{code_str.lower()}#%'",
-                )
-
-        match_col_sql_str = " OR ".join(match_col_sql_strings)
-    else:
-        if wildcard_icd_code:
-            match_col_sql_str = (
-                f"lower(diagnosegruppestreng) LIKE '%{icd_code.lower()}%'"
-            )
-
-        else:
-            match_col_sql_str = f"lower(diagnosegruppestreng) LIKE '%{icd_code.lower()}' OR lower(diagnosegruppestreng) LIKE '%{icd_code.lower()}#%'"
-
-    sql = (
-        f"SELECT dw_ek_borger, {source_timestamp_col_name}, diagnosegruppestreng"
-        + f" FROM [fct].{fct} WHERE {source_timestamp_col_name} IS NOT NULL AND ({match_col_sql_str})"
-    )
-
-    df = sql_load(sql, database="USR_PS_FORSK", chunksize=None, n_rows=n_rows)
-
-    if output_col_name is None:
-        output_col_name = icd_code
-
-    df[output_col_name] = 1
-
-    df.drop(["diagnosegruppestreng"], axis="columns", inplace=True)
-
-    return df.rename(
-        columns={
-            source_timestamp_col_name: "timestamp",
-        },
-    )
-
-
 def concat_from_physical_visits(
     icd_codes: list[str],
     output_col_name: str,
@@ -141,10 +56,11 @@ def concat_from_physical_visits(
     # Using ._load is faster than from_physical_visits since it can process all icd_codes in the SQL request at once,
     # rather than processing one at a time and aggregating.
     dfs = [
-        _load(
-            icd_code=icd_codes,
+        load_from_list(
+            codes_to_match=icd_codes,
+            column_name="diagnosegruppestreng",
             output_col_name=output_col_name,
-            wildcard_icd_code=wildcard_icd_code,
+            wildcard_code=wildcard_icd_code,
             n_rows=n_rows,
             **kwargs,
         )
@@ -199,10 +115,10 @@ def from_physical_visits(
         n_rows_per_df = None
 
     dfs = [
-        _load(
-            icd_code=icd_code,
+        load_from_list(
+            codes_to_match=icd_code,
+            column_name="diagnosegruppestreng",
             output_col_name=output_col_name,
-            wildcard_icd_code=wildcard_icd_code,
             n_rows=n_rows_per_df,
             **kwargs,
         )
diff --git a/src/psycop_feature_generation/loaders/raw/load_medications.py b/src/psycop_feature_generation/loaders/raw/load_medications.py
index bddada3e..3cde4839 100644
--- a/src/psycop_feature_generation/loaders/raw/load_medications.py
+++ b/src/psycop_feature_generation/loaders/raw/load_medications.py
@@ -5,73 +5,18 @@
 from wasabi import msg
 
 from psycop_feature_generation.loaders.raw.sql_load import sql_load
+from psycop_feature_generation.loaders.raw.utils import load_from_list
 from psycop_feature_generation.utils import data_loaders
 
 # pylint: disable=missing-function-docstring
 
 
-def _load_one_source(
-    atc_code: str,
-    source_timestamp_col_name: str,
-    view: str,
-    output_col_name: Optional[str] = None,
-    wildcard_icd_code: Optional[bool] = False,
-    n_rows: Optional[int] = None,
-) -> pd.DataFrame:
-    """Load the prescribed medications that match atc. If wildcard_atc_code,
-    match from atc_code*. Aggregates all that match. Beware that data is
-    incomplete prior to sep. 2016 for prescribed medications.
-
-    Args:
-        atc_code (str): ATC string to match on. # noqa: DAR102
-        source_timestamp_col_name (str): Name of the timestamp column in the SQL
-            table.
-        view (str): Which view to use, e.g.
-            "FOR_Medicin_ordineret_inkl_2021_feb2022"
-        output_col_name (str, optional): Name of new column string. Defaults to
-            None.
-        wildcard_icd_code (bool, optional): Whether to match on atc_code* or
-            atc_code.
-        n_rows (int, optional): Number of rows to return. Defaults to None.
-
-    Returns:
-        pd.DataFrame: A pandas dataframe with dw_ek_borger, timestamp and
-            output_col_name = 1
-    """
-
-    if wildcard_icd_code:
-        end_of_sql = "%"
-    else:
-        end_of_sql = ""  # noqa
-
-    view = f"[{view}]"
-    sql = (
-        f"SELECT dw_ek_borger, {source_timestamp_col_name}, atc FROM [fct].{view}"
-        + f" WHERE {source_timestamp_col_name} IS NOT NULL AND (lower(atc)) LIKE lower('{atc_code}{end_of_sql}')"
-    )
-
-    df = sql_load(sql, database="USR_PS_FORSK", chunksize=None, n_rows=n_rows)
-
-    if output_col_name is None:
-        output_col_name = atc_code
-
-    df[output_col_name] = 1
-
-    df.drop(["atc"], axis="columns", inplace=True)
-
-    return df.rename(
-        columns={
-            source_timestamp_col_name: "timestamp",
-        },
-    )
-
-
 def load(
     atc_code: str,
     output_col_name: Optional[str] = None,
     load_prescribed: Optional[bool] = False,
     load_administered: Optional[bool] = True,
-    wildcard_icd_code: Optional[bool] = True,
+    wildcard_code: Optional[bool] = True,
     n_rows: Optional[int] = None,
 ) -> pd.DataFrame:
     """Load medications. Aggregates prescribed/administered if both true. If
@@ -105,24 +50,26 @@ def load(
     df = pd.DataFrame()
 
     if load_prescribed:
-        df_medication_prescribed = _load_one_source(
-            atc_code=atc_code,
+        df_medication_prescribed = load_from_list(
+            codes_to_match=atc_code,
+            code_col_name="atc",
             source_timestamp_col_name="datotid_ordinationstart",
             view="FOR_Medicin_ordineret_inkl_2021_feb2022",
             output_col_name=output_col_name,
-            wildcard_icd_code=wildcard_icd_code,
+            wildcard_code=wildcard_code,
             n_rows=n_rows,
         )
 
         df = pd.concat([df, df_medication_prescribed])
 
     if load_administered:
-        df_medication_administered = _load_one_source(
-            atc_code=atc_code,
+        df_medication_administered = load_from_list(
+            codes_to_match=atc_code,
+            code_col_name="atc",
             source_timestamp_col_name="datotid_administration_start",
             view="FOR_Medicin_administreret_inkl_2021_feb2022",
             output_col_name=output_col_name,
-            wildcard_icd_code=wildcard_icd_code,
+            wildcard_code=wildcard_code,
             n_rows=n_rows,
         )
         df = pd.concat([df, df_medication_administered])

From 1f208d45e8805163e2ae305e9627e8f3beb25a79 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <ryqiem@gmail.com>
Date: Thu, 17 Nov 2022 15:11:57 +0100
Subject: [PATCH 3/9] refactor: finish loader refactor (load_from_list -> load_from_codes)

---
 example/loaders/load_medications.py           |   2 +-
 .../data_checks/raw/check_predictor_lists.py  |   9 ++
 .../loaders/raw/load_diagnoses.py             |  34 ++---
 .../loaders/raw/load_lab_results.py           |   4 +-
 .../loaders/raw/load_medications.py           |  90 +++++++------
 .../loaders/raw/utils.py                      | 125 ++++++++++++------
 6 files changed, 159 insertions(+), 105 deletions(-)

diff --git a/example/loaders/load_medications.py b/example/loaders/load_medications.py
index 81bd2d19..0e4fc2b4 100644
--- a/example/loaders/load_medications.py
+++ b/example/loaders/load_medications.py
@@ -3,4 +3,4 @@
 import psycop_feature_generation.loaders.raw.load_medications as m
 
 if __name__ == "__main__":
-    df = m.antipsychotics()
+    df = m.first_gen_antipsychotics(n_rows=1000)
diff --git a/src/psycop_feature_generation/data_checks/raw/check_predictor_lists.py b/src/psycop_feature_generation/data_checks/raw/check_predictor_lists.py
index ca57b2e3..75eeb9b2 100644
--- a/src/psycop_feature_generation/data_checks/raw/check_predictor_lists.py
+++ b/src/psycop_feature_generation/data_checks/raw/check_predictor_lists.py
@@ -36,6 +36,15 @@ def check_df_conforms_to_arg_dict(
         ValueError: If df does not conform to d.
     """
 
+    if required_columns is None:
+        required_columns = ["dw_ek_borger", "timestamp", "value"]
+
+    if subset_duplicates_columns is None:
+        subset_duplicates_columns = ["dw_ek_borger", "timestamp", "value"]
+
+    if expected_val_dtypes is None:
+        expected_val_dtypes = ["float64", "int64"]
+
     msg = Printer(timestamp=True)
 
     allowed_nan_value_prop = (
diff --git a/src/psycop_feature_generation/loaders/raw/load_diagnoses.py b/src/psycop_feature_generation/loaders/raw/load_diagnoses.py
index 1ce8920f..d86a4c20 100644
--- a/src/psycop_feature_generation/loaders/raw/load_diagnoses.py
+++ b/src/psycop_feature_generation/loaders/raw/load_diagnoses.py
@@ -10,8 +10,7 @@
 
 import pandas as pd
 
-from psycop_feature_generation.loaders.raw.sql_load import sql_load
-from psycop_feature_generation.loaders.raw.utils import load_from_list
+from psycop_feature_generation.loaders.raw.utils import load_from_codes
 from psycop_feature_generation.utils import data_loaders
 
 
@@ -36,35 +35,34 @@ def concat_from_physical_visits(
 
     diagnoses_source_table_info = {
         "lpr3": {
-            "fct": "FOR_LPR3kontakter_psyk_somatik_inkl_2021_feb2022",
+            "view": "FOR_LPR3kontakter_psyk_somatik_inkl_2021_feb2022",
             "source_timestamp_col_name": "datotid_lpr3kontaktstart",
         },
         "lpr2_inpatient": {
-            "fct": "FOR_indlaeggelser_psyk_somatik_LPR2_inkl_2021_feb2022",
+            "view": "FOR_indlaeggelser_psyk_somatik_LPR2_inkl_2021_feb2022",
             "source_timestamp_col_name": "datotid_indlaeggelse",
         },
         "lpr2_acute_outpatient": {
-            "fct": "FOR_akutambulantekontakter_psyk_somatik_LPR2_inkl_2021_feb2022",
+            "view": "FOR_akutambulantekontakter_psyk_somatik_LPR2_inkl_2021_feb2022",
             "source_timestamp_col_name": "datotid_start",
         },
         "lpr2_outpatient": {
-            "fct": "FOR_besoeg_psyk_somatik_LPR2_inkl_2021_feb2022",
+            "view": "FOR_besoeg_psyk_somatik_LPR2_inkl_2021_feb2022",
             "source_timestamp_col_name": "datotid_start",
         },
     }
 
-    # Using ._load is faster than from_physical_visits since it can process all icd_codes in the SQL request at once,
-    # rather than processing one at a time and aggregating.
     dfs = [
-        load_from_list(
+        load_from_codes(
             codes_to_match=icd_codes,
             column_name="diagnosegruppestreng",
             output_col_name=output_col_name,
             wildcard_code=wildcard_icd_code,
             n_rows=n_rows,
+            load_diagnoses=True,
             **kwargs,
         )
-        for source_name, kwargs in diagnoses_source_table_info.items()
+        for _, kwargs in diagnoses_source_table_info.items()
     ]
 
     df = pd.concat(dfs).drop_duplicates(
@@ -75,7 +73,7 @@ def concat_from_physical_visits(
 
 
 def from_physical_visits(
-    icd_code: str,
+    icd_code: Union[list[str], str],
     output_col_name: Optional[str] = "value",
     n_rows: Optional[int] = None,
     wildcard_icd_code: Optional[bool] = False,
@@ -96,15 +94,15 @@ def from_physical_visits(
 
     diagnoses_source_table_info = {
         "lpr3": {
-            "fct": "FOR_LPR3kontakter_psyk_somatik_inkl_2021",
+            "view": "FOR_LPR3kontakter_psyk_somatik_inkl_2021",
             "source_timestamp_col_name": "datotid_lpr3kontaktstart",
         },
         "lpr2_inpatient": {
-            "fct": "FOR_indlaeggelser_psyk_somatik_LPR2_inkl_2021",
+            "view": "FOR_indlaeggelser_psyk_somatik_LPR2_inkl_2021",
             "source_timestamp_col_name": "datotid_indlaeggelse",
         },
         "lpr2_outpatient": {
-            "fct": "FOR_besoeg_psyk_somatik_LPR2_inkl_2021",
+            "view": "FOR_besoeg_psyk_somatik_LPR2_inkl_2021",
             "source_timestamp_col_name": "datotid_start",
         },
     }
@@ -115,14 +113,16 @@ def from_physical_visits(
         n_rows_per_df = None
 
     dfs = [
-        load_from_list(
+        load_from_codes(
             codes_to_match=icd_code,
-            column_name="diagnosegruppestreng",
+            code_col_name="diagnosegruppestreng",
             output_col_name=output_col_name,
             n_rows=n_rows_per_df,
+            wildcard_code=wildcard_icd_code,
             **kwargs,
+            load_diagnoses=True,
         )
-        for source_name, kwargs in diagnoses_source_table_info.items()
+        for _, kwargs in diagnoses_source_table_info.items()
     ]
 
     df = pd.concat(dfs).drop_duplicates(
diff --git a/src/psycop_feature_generation/loaders/raw/load_lab_results.py b/src/psycop_feature_generation/loaders/raw/load_lab_results.py
index 39089bc9..c5ab8cf9 100644
--- a/src/psycop_feature_generation/loaders/raw/load_lab_results.py
+++ b/src/psycop_feature_generation/loaders/raw/load_lab_results.py
@@ -55,9 +55,7 @@ def load_non_numerical_values_and_coerce_inequalities(
         inplace=True,
     )
 
-    if ineq2mult:
-        return multiply_inequalities_in_df(df, ineq2mult=ineq2mult)
-    return multiply_inequalities_in_df(df)
+    return multiply_inequalities_in_df(df, ineq2mult=ineq2mult)
 
 
 def load_numerical_values(
diff --git a/src/psycop_feature_generation/loaders/raw/load_medications.py b/src/psycop_feature_generation/loaders/raw/load_medications.py
index 3cde4839..466c34a7 100644
--- a/src/psycop_feature_generation/loaders/raw/load_medications.py
+++ b/src/psycop_feature_generation/loaders/raw/load_medications.py
@@ -1,18 +1,17 @@
 """Loaders for medications."""
-from typing import Optional
+from typing import Optional, Union
 
 import pandas as pd
 from wasabi import msg
 
-from psycop_feature_generation.loaders.raw.sql_load import sql_load
-from psycop_feature_generation.loaders.raw.utils import load_from_list
+from psycop_feature_generation.loaders.raw.utils import load_from_codes
 from psycop_feature_generation.utils import data_loaders
 
 # pylint: disable=missing-function-docstring
 
 
 def load(
-    atc_code: str,
+    atc_code: Union[str, list[str]],
     output_col_name: Optional[str] = None,
     load_prescribed: Optional[bool] = False,
     load_administered: Optional[bool] = True,
@@ -24,7 +23,7 @@ def load(
     that data is incomplete prior to sep. 2016 for prescribed medications.
 
     Args:
-        atc_code (str): ATC-code prefix to load. Matches atc_code_prefix*. # noqa: DAR102
+        atc_code (str): ATC-code prefix to load. Matches atc_code_prefix*.
             Aggregates all.
         output_col_name (str, optional): Name of output_col_name. Contains 1 if
             atc_code matches atc_code_prefix, 0 if not.Defaults to
@@ -33,7 +32,7 @@ def load(
             False. Beware incomplete until sep 2016.
         load_administered (bool, optional): Whether to load administrations.
             Defaults to True.
-        wildcard_icd_code (bool, optional): Whether to match on atc_code* or
+        wildcard_code (bool, optional): Whether to match on atc_code* or
             atc_code.
         n_rows (int, optional): Number of rows to return. Defaults to None.
 
@@ -49,8 +48,11 @@ def load(
 
     df = pd.DataFrame()
 
+    if load_prescribed and load_administered:
+        n_rows = int(n_rows / 2) if n_rows else None
+
     if load_prescribed:
-        df_medication_prescribed = load_from_list(
+        df_medication_prescribed = load_from_codes(
             codes_to_match=atc_code,
             code_col_name="atc",
             source_timestamp_col_name="datotid_ordinationstart",
@@ -58,12 +60,13 @@ def load(
             output_col_name=output_col_name,
             wildcard_code=wildcard_code,
             n_rows=n_rows,
+            load_diagnoses=False,
         )
 
         df = pd.concat([df, df_medication_prescribed])
 
     if load_administered:
-        df_medication_administered = load_from_list(
+        df_medication_administered = load_from_codes(
             codes_to_match=atc_code,
             code_col_name="atc",
             source_timestamp_col_name="datotid_administration_start",
@@ -71,15 +74,20 @@ def load(
             output_col_name=output_col_name,
             wildcard_code=wildcard_code,
             n_rows=n_rows,
+            load_diagnoses=False,
         )
         df = pd.concat([df, df_medication_administered])
 
     if output_col_name is None:
-        output_col_name = atc_code
+        if isinstance(atc_code, list):
+            # Joint list of atc_codes
+            output_col_name = "_".join(atc_code)
+        else:
+            output_col_name = atc_code
 
     df.rename(
         columns={
-            atc_code: "value",
+            output_col_name: "value",
         },
         inplace=True,
     )
@@ -132,7 +140,7 @@ def antipsychotics(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code="N05A",
         load_prescribed=True,
         load_administered=True,
-        wildcard_icd_code=True,
+        wildcard_code=True,
         n_rows=n_rows,
     )
 
@@ -152,7 +160,7 @@ def first_gen_antipsychotics(n_rows: Optional[int] = None) -> pd.DataFrame:
         ],
         load_prescribed=True,
         load_administered=True,
-        wildcard_icd_code=False,
+        wildcard_code=False,
         n_rows=n_rows,
     )
 
@@ -178,7 +186,7 @@ def second_gen_antipsychotics(n_rows: Optional[int] = None) -> pd.DataFrame:
         ],
         load_prescribed=True,
         load_administered=True,
-        wildcard_icd_code=False,
+        wildcard_code=False,
         n_rows=n_rows,
     )
 
@@ -189,7 +197,7 @@ def olanzapine(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code="N05AH03",
         load_prescribed=True,
         load_administered=True,
-        wildcard_icd_code=False,
+        wildcard_code=False,
         n_rows=n_rows,
     )
 
@@ -200,7 +208,7 @@ def clozapine(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code="N05AH02",
         load_prescribed=True,
         load_administered=True,
-        wildcard_icd_code=False,
+        wildcard_code=False,
         n_rows=n_rows,
     )
 
@@ -211,7 +219,7 @@ def anxiolytics(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code="N05B",
         load_prescribed=False,
         load_administered=True,
-        wildcard_icd_code=True,
+        wildcard_code=True,
         n_rows=n_rows,
     )
 
@@ -222,7 +230,7 @@ def hypnotics(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code="N05C",
         load_prescribed=False,
         load_administered=True,
-        wildcard_icd_code=True,
+        wildcard_code=True,
         n_rows=n_rows,
     )
 
@@ -233,7 +241,7 @@ def antidepressives(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code="N06A",
         load_prescribed=False,
         load_administered=True,
-        wildcard_icd_code=True,
+        wildcard_code=True,
         n_rows=n_rows,
     )
 
@@ -245,7 +253,7 @@ def ssri(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code=["N06AB10", "N06AB04", "N06AB08", "N06AB03", "N06AB05", "N06AB06"],
         load_prescribed=True,
         load_administered=True,
-        wildcard_icd_code=False,
+        wildcard_code=False,
         n_rows=n_rows,
     )
 
@@ -257,7 +265,7 @@ def snri(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code=["N06AX21", "N06AX16"],
         load_prescribed=True,
         load_administered=True,
-        wildcard_icd_code=False,
+        wildcard_code=False,
         n_rows=n_rows,
     )
 
@@ -269,7 +277,7 @@ def tca(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code=["N06AA09", "N06AA04", "N06AA02", "N06AA10", "N06AA16"],
         load_prescribed=True,
         load_administered=True,
-        wildcard_icd_code=False,
+        wildcard_code=False,
         n_rows=n_rows,
     )
 
@@ -280,7 +288,7 @@ def lithium(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code="N05AN01",
         load_prescribed=True,
         load_administered=True,
-        wildcard_icd_code=False,
+        wildcard_code=False,
         n_rows=n_rows,
     )
 
@@ -291,7 +299,7 @@ def hyperactive_disorders_medications(n_rows: Optional[int] = None) -> pd.DataFr
         atc_code="N06B",
         load_prescribed=False,
         load_administered=True,
-        wildcard_icd_code=True,
+        wildcard_code=True,
         n_rows=n_rows,
     )
 
@@ -302,7 +310,7 @@ def dementia_medications(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code="N06D",
         load_prescribed=False,
         load_administered=True,
-        wildcard_icd_code=True,
+        wildcard_code=True,
         n_rows=n_rows,
     )
 
@@ -313,7 +321,7 @@ def anti_epileptics(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code="N03",
         load_prescribed=False,
         load_administered=True,
-        wildcard_icd_code=True,
+        wildcard_code=True,
         n_rows=n_rows,
     )
 
@@ -325,7 +333,7 @@ def alcohol_abstinence(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code=["A11DA01", "A11EA", "N05BA02", "N03AA02"],
         load_prescribed=True,
         load_administered=True,
-        wildcard_icd_code=False,
+        wildcard_code=False,
         n_rows=n_rows,
     )
 
@@ -337,7 +345,7 @@ def alimentary_medications(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code="A",
         load_prescribed=False,
         load_administered=True,
-        wildcard_icd_code=True,
+        wildcard_code=True,
         n_rows=n_rows,
     )
 
@@ -348,7 +356,7 @@ def blood_medications(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code="B",
         load_prescribed=False,
         load_administered=True,
-        wildcard_icd_code=True,
+        wildcard_code=True,
         n_rows=n_rows,
     )
 
@@ -359,7 +367,7 @@ def cardiovascular_medications(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code="C",
         load_prescribed=False,
         load_administered=True,
-        wildcard_icd_code=True,
+        wildcard_code=True,
         n_rows=n_rows,
     )
 
@@ -370,7 +378,7 @@ def dermatological_medications(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code="D",
         load_prescribed=False,
         load_administered=True,
-        wildcard_icd_code=True,
+        wildcard_code=True,
         n_rows=n_rows,
     )
 
@@ -381,7 +389,7 @@ def genito_sex_medications(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code="G",
         load_prescribed=False,
         load_administered=True,
-        wildcard_icd_code=True,
+        wildcard_code=True,
         n_rows=n_rows,
     )
 
@@ -392,7 +400,7 @@ def hormonal_medications(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code="H",
         load_prescribed=False,
         load_administered=True,
-        wildcard_icd_code=True,
+        wildcard_code=True,
         n_rows=n_rows,
     )
 
@@ -403,7 +411,7 @@ def antiinfectives(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code="J",
         load_prescribed=False,
         load_administered=True,
-        wildcard_icd_code=True,
+        wildcard_code=True,
         n_rows=n_rows,
     )
 
@@ -414,7 +422,7 @@ def antineoplastic(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code="L",
         load_prescribed=False,
         load_administered=True,
-        wildcard_icd_code=True,
+        wildcard_code=True,
         n_rows=n_rows,
     )
 
@@ -425,7 +433,7 @@ def musculoskeletal_medications(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code="M",
         load_prescribed=False,
         load_administered=True,
-        wildcard_icd_code=True,
+        wildcard_code=True,
         n_rows=n_rows,
     )
 
@@ -436,7 +444,7 @@ def nervous_system_medications(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code="N",
         load_prescribed=False,
         load_administered=True,
-        wildcard_icd_code=True,
+        wildcard_code=True,
         n_rows=n_rows,
     )
 
@@ -447,7 +455,7 @@ def analgesic(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code="N02",
         load_prescribed=False,
         load_administered=True,
-        wildcard_icd_code=True,
+        wildcard_code=True,
         n_rows=n_rows,
     )
 
@@ -458,7 +466,7 @@ def antiparasitic(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code="P",
         load_prescribed=False,
         load_administered=True,
-        wildcard_icd_code=True,
+        wildcard_code=True,
         n_rows=n_rows,
     )
 
@@ -469,7 +477,7 @@ def respiratory_medications(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code="R",
         load_prescribed=False,
         load_administered=True,
-        wildcard_icd_code=True,
+        wildcard_code=True,
         n_rows=n_rows,
     )
 
@@ -480,7 +488,7 @@ def sensory_medications(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code="S",
         load_prescribed=False,
         load_administered=True,
-        wildcard_icd_code=True,
+        wildcard_code=True,
         n_rows=n_rows,
     )
 
@@ -491,6 +499,6 @@ def various_medications(n_rows: Optional[int] = None) -> pd.DataFrame:
         atc_code="V",
         load_prescribed=False,
         load_administered=True,
-        wildcard_icd_code=True,
+        wildcard_code=True,
         n_rows=n_rows,
     )
diff --git a/src/psycop_feature_generation/loaders/raw/utils.py b/src/psycop_feature_generation/loaders/raw/utils.py
index 9c5d804b..5ed058e0 100644
--- a/src/psycop_feature_generation/loaders/raw/utils.py
+++ b/src/psycop_feature_generation/loaders/raw/utils.py
@@ -7,13 +7,63 @@
 from psycop_feature_generation.loaders.raw.sql_load import sql_load
 
 
-def load_from_list(
+def str_to_sql_match_logic(
+    codes_to_match: str,
+    code_col_name: str,
+    wildcard_code: bool,
+    load_diagnoses: bool,
+):
+    """Generate SQL match logic from a single string."""
+    if wildcard_code:
+        match_col_sql_str = f"lower({code_col_name}) LIKE '%{codes_to_match.lower()}%'"
+    else:
+        match_col_sql_str = f"lower({code_col_name}) LIKE '%{codes_to_match.lower()}'"
+
+        if load_diagnoses:
+            match_col_sql_str += (
+                f" OR lower({code_col_name}) LIKE '%{codes_to_match.lower()}#%'"
+            )
+
+    return match_col_sql_str
+
+
+def list_to_sql_logic(
+    codes_to_match: list[str],
+    load_diagnoses: bool,
+    code_col_name: str,
+    wildcard_code: bool,
+):
+    """Generate SQL match logic from a list of strings."""
+    match_col_sql_strings = []
+
+    for code_str in codes_to_match:
+        if wildcard_code:
+            match_col_sql_strings.append(
+                f"lower({code_col_name}) LIKE '%{code_str.lower()}%'",
+            )
+        else:
+            # If the string is at the end of diagnosegruppestreng, it doesn't end with a hashtag
+            match_col_sql_strings.append(
+                f"lower({code_col_name}) LIKE '%{code_str.lower()}'",
+            )
+
+            if load_diagnoses:
+                # If the string is at the beginning of diagnosegruppestreng, it doesn't start with a hashtag
+                match_col_sql_strings.append(
+                    f"lower({code_col_name}) LIKE '{code_str.lower()}%'",
+                )
+
+    return " OR ".join(match_col_sql_strings)
+
+
+def load_from_codes(
     codes_to_match: Union[list[str], str],
+    load_diagnoses: bool,
     code_col_name: str,
     source_timestamp_col_name: str,
-    fct: str,
+    view: str,
     output_col_name: Optional[str] = None,
-    wildcard_code: Optional[bool] = True,
+    wildcard_code: bool = True,
     n_rows: Optional[int] = None,
 ) -> pd.DataFrame:
     """Load the visits that have diagnoses that match icd_code or atc code from
@@ -21,14 +71,18 @@ def load_from_list(
     that match.
 
     Args:
-        codes_to_match (Union[list[str], str]): Substring(s) to match diagnoses or medictions for. # noqa: DAR102
-            Matches any diagnoses, whether a-diagnosis, b-diagnosis or any atc code etc. If a list is passed, will
-            count as a match if any of the icd_codes or act codes in the list match.
+        codes_to_match (Union[list[str], str]): Substring(s) to match diagnoses or medications for.
+            Diagnoses: Matches any diagnoses, whether a-diagnosis, b-diagnosis.
+            Both: If a list is passed, will count as a match if any of the icd_codes or atc codes in the list match.
+        load_diagnoses (bool): Determines which mathing logic is employed. If True, will load diagnoses. If False, will load medications.
+            Diagnoses must be able to split a string like this:
+                A:DF431#+:ALFC3#B:DF329
+            Which means that if wildcard_code is False, we must match on *icd_code# or *icd_code followed by nothing. If it's true, we can match on *icd_code*.
         code_col_name (str): Name of column containing either diagnosis (icd) or medication (atc) codes.
             Takes either 'diagnosegruppestreng' or 'atc' as input.
         source_timestamp_col_name (str): Name of the timestamp column in the SQL
             view.
-        fct (str): Name of the SQL view to load from.
+        view (str): Name of the SQL view to load from.
         output_col_name (str, optional): Name of new column string. Defaults to
             None.
         wildcard_code (bool, optional): Whether to match on icd_code* / atc_code*.
@@ -39,42 +93,24 @@ def load_from_list(
         pd.DataFrame: A pandas dataframe with dw_ek_borger, timestamp and
             output_col_name = 1
     """
-    fct = f"[{fct}]"
-
-    # Must be able to split a string like this:
-    #   A:DF431#+:ALFC3#B:DF329
-    # Which means that if wildcard_code is False, we must match on icd_code# or icd_code followed by nothing.
-    # If it's true, we can match on icd_code*.
-
-    # Handle if there are multiple ICD codes to count together.
-    if isinstance(codes_to_match, list):
-        match_col_sql_strings = []
-
-        for code_str in codes_to_match:  # pylint: disable=not-an-iterable
-            if wildcard_code:
-                match_col_sql_strings.append(
-                    f"lower({code_col_name}) LIKE '%{code_str.lower()}%'",
-                )
-            else:
-                # If the string is at the end of diagnosegruppestreng, it doesn't end with a hashtag
-                match_col_sql_strings.append(
-                    f"lower({code_col_name}) LIKE '%{code_str.lower()}'",
-                )
-
-                # But if it is at the end, it does
-                match_col_sql_strings.append(
-                    f"lower({code_col_name}) LIKE '%{code_str.lower()}#%'",
-                )
-
-        match_col_sql_str = " OR ".join(match_col_sql_strings)
+    fct = f"[{view}]"
+
+    if isinstance(codes_to_match, list) and len(codes_to_match) > 1:
+        match_col_sql_str = list_to_sql_logic(
+            codes_to_match=codes_to_match,
+            load_diagnoses=load_diagnoses,
+            code_col_name=code_col_name,
+            wildcard_code=wildcard_code,
+        )
+    elif isinstance(codes_to_match, str):
+        match_col_sql_str = str_to_sql_match_logic(
+            codes_to_match=codes_to_match,
+            code_col_name=code_col_name,
+            wildcard_code=wildcard_code,
+            load_diagnoses=load_diagnoses,
+        )
     else:
-        if wildcard_code:
-            match_col_sql_str = (
-                f"lower({code_col_name}) LIKE '%{codes_to_match.lower()}%'"
-            )
-
-        else:
-            match_col_sql_str = f"lower({code_col_name}) LIKE '%{codes_to_match.lower()}' OR lower({code_col_name}) LIKE '%{codes_to_match.lower()}#%'"
+        raise ValueError("codes_to_match must be either a list or a string.")
 
     sql = (
         f"SELECT dw_ek_borger, {source_timestamp_col_name}, {code_col_name}"
@@ -84,7 +120,10 @@ def load_from_list(
     df = sql_load(sql, database="USR_PS_FORSK", chunksize=None, n_rows=n_rows)
 
     if output_col_name is None:
-        output_col_name = codes_to_match
+        if isinstance(codes_to_match, list):
+            output_col_name = "_".join(codes_to_match)
+        else:
+            output_col_name = codes_to_match
 
     df[output_col_name] = 1
 

From 9aad0af6205af2e3deffb573676af5a20401bae1 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <ryqiem@gmail.com>
Date: Thu, 17 Nov 2022 15:16:25 +0100
Subject: [PATCH 4/9] docs: improve docs

---
 .../loaders/raw/utils.py                      | 64 ++++++++++++-------
 1 file changed, 41 insertions(+), 23 deletions(-)

diff --git a/src/psycop_feature_generation/loaders/raw/utils.py b/src/psycop_feature_generation/loaders/raw/utils.py
index 5ed058e0..4ac68ec3 100644
--- a/src/psycop_feature_generation/loaders/raw/utils.py
+++ b/src/psycop_feature_generation/loaders/raw/utils.py
@@ -8,20 +8,31 @@
 
 
 def str_to_sql_match_logic(
-    codes_to_match: str,
-    code_col_name: str,
-    wildcard_code: bool,
+    code_to_match: str,
+    code_sql_col_name: str,
     load_diagnoses: bool,
+    match_with_wildcard: bool,
 ):
-    """Generate SQL match logic from a single string."""
-    if wildcard_code:
-        match_col_sql_str = f"lower({code_col_name}) LIKE '%{codes_to_match.lower()}%'"
+    """Generate SQL match logic from a single string.
+
+    Args:
+        code_to_match (list[str]): List of strings to match.
+        code_sql_col_name (str): Name of the SQL column containing the codes.
+        load_diagnoses (bool): Whether to load diagnoses or medications. Determines the logic. See calling function for more.
+        match_with_wildcard (bool): Whether to match on icd_code* / atc_code*.
+    """
+    if match_with_wildcard:
+        match_col_sql_str = (
+            f"lower({code_sql_col_name}) LIKE '%{code_to_match.lower()}%'"
+        )
     else:
-        match_col_sql_str = f"lower({code_col_name}) LIKE '%{codes_to_match.lower()}'"
+        match_col_sql_str = (
+            f"lower({code_sql_col_name}) LIKE '%{code_to_match.lower()}'"
+        )
 
         if load_diagnoses:
             match_col_sql_str += (
-                f" OR lower({code_col_name}) LIKE '%{codes_to_match.lower()}#%'"
+                f" OR lower({code_sql_col_name}) LIKE '%{code_to_match.lower()}#%'"
             )
 
     return match_col_sql_str
@@ -29,28 +40,35 @@ def str_to_sql_match_logic(
 
 def list_to_sql_logic(
     codes_to_match: list[str],
+    code_sql_col_name: str,
     load_diagnoses: bool,
-    code_col_name: str,
-    wildcard_code: bool,
+    match_with_wildcard: bool,
 ):
-    """Generate SQL match logic from a list of strings."""
+    """Generate SQL match logic from a list of strings.
+
+    Args:
+        codes_to_match (list[str]): List of strings to match.
+        code_sql_col_name (str): Name of the SQL column containing the codes.
+        load_diagnoses (bool): Whether to load diagnoses or medications. Determines the logic. See calling function for more.
+        match_with_wildcard (bool): Whether to match on icd_code* / atc_code*.
+    """
     match_col_sql_strings = []
 
     for code_str in codes_to_match:
-        if wildcard_code:
+        if match_with_wildcard:
             match_col_sql_strings.append(
-                f"lower({code_col_name}) LIKE '%{code_str.lower()}%'",
+                f"lower({code_sql_col_name}) LIKE '%{code_str.lower()}%'",
             )
         else:
             # If the string is at the end of diagnosegruppestreng, it doesn't end with a hashtag
             match_col_sql_strings.append(
-                f"lower({code_col_name}) LIKE '%{code_str.lower()}'",
+                f"lower({code_sql_col_name}) LIKE '%{code_str.lower()}'",
             )
 
             if load_diagnoses:
                 # If the string is at the beginning of diagnosegruppestreng, it doesn't start with a hashtag
                 match_col_sql_strings.append(
-                    f"lower({code_col_name}) LIKE '{code_str.lower()}%'",
+                    f"lower({code_sql_col_name}) LIKE '{code_str.lower()}%'",
                 )
 
     return " OR ".join(match_col_sql_strings)
@@ -63,7 +81,7 @@ def load_from_codes(
     source_timestamp_col_name: str,
     view: str,
     output_col_name: Optional[str] = None,
-    wildcard_code: bool = True,
+    match_with_wildcard: bool = True,
     n_rows: Optional[int] = None,
 ) -> pd.DataFrame:
     """Load the visits that have diagnoses that match icd_code or atc code from
@@ -77,7 +95,7 @@ def load_from_codes(
         load_diagnoses (bool): Determines which mathing logic is employed. If True, will load diagnoses. If False, will load medications.
             Diagnoses must be able to split a string like this:
                 A:DF431#+:ALFC3#B:DF329
-            Which means that if wildcard_code is False, we must match on *icd_code# or *icd_code followed by nothing. If it's true, we can match on *icd_code*.
+            Which means that if match_with_wildcard is False, we must match on *icd_code# or *icd_code followed by nothing. If it's true, we can match on *icd_code*.
         code_col_name (str): Name of column containing either diagnosis (icd) or medication (atc) codes.
             Takes either 'diagnosegruppestreng' or 'atc' as input.
         source_timestamp_col_name (str): Name of the timestamp column in the SQL
@@ -85,7 +103,7 @@ def load_from_codes(
         view (str): Name of the SQL view to load from.
         output_col_name (str, optional): Name of new column string. Defaults to
             None.
-        wildcard_code (bool, optional): Whether to match on icd_code* / atc_code*.
+        match_with_wildcard (bool, optional): Whether to match on icd_code* / atc_code*.
             Defaults to true.
         n_rows: Number of rows to return. Defaults to None.
 
@@ -98,16 +116,16 @@ def load_from_codes(
     if isinstance(codes_to_match, list) and len(codes_to_match) > 1:
         match_col_sql_str = list_to_sql_logic(
             codes_to_match=codes_to_match,
+            code_sql_col_name=code_col_name,
             load_diagnoses=load_diagnoses,
-            code_col_name=code_col_name,
-            wildcard_code=wildcard_code,
+            match_with_wildcard=match_with_wildcard,
         )
     elif isinstance(codes_to_match, str):
         match_col_sql_str = str_to_sql_match_logic(
-            codes_to_match=codes_to_match,
-            code_col_name=code_col_name,
-            wildcard_code=wildcard_code,
+            code_to_match=codes_to_match,
+            code_sql_col_name=code_col_name,
             load_diagnoses=load_diagnoses,
+            match_with_wildcard=match_with_wildcard,
         )
     else:
         raise ValueError("codes_to_match must be either a list or a string.")

From 4338f68bd1d9df97d512434f010a64e71946b756 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <ryqiem@gmail.com>
Date: Thu, 17 Nov 2022 15:18:27 +0100
Subject: [PATCH 5/9] refactor: correctly rename arguments

---
 src/psycop_feature_generation/loaders/raw/load_diagnoses.py  | 5 +++--
 .../loaders/raw/load_medications.py                          | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/psycop_feature_generation/loaders/raw/load_diagnoses.py b/src/psycop_feature_generation/loaders/raw/load_diagnoses.py
index 58577ddf..525ee48b 100644
--- a/src/psycop_feature_generation/loaders/raw/load_diagnoses.py
+++ b/src/psycop_feature_generation/loaders/raw/load_diagnoses.py
@@ -13,6 +13,7 @@
 from psycop_feature_generation.loaders.raw.utils import load_from_codes
 from psycop_feature_generation.utils import data_loaders
 
+
 def concat_from_physical_visits(
     icd_codes: list[str],
     output_col_name: str,
@@ -56,7 +57,7 @@ def concat_from_physical_visits(
             codes_to_match=icd_codes,
             column_name="diagnosegruppestreng",
             output_col_name=output_col_name,
-            wildcard_code=wildcard_icd_code,
+            match_with_wildcard=wildcard_icd_code,
             n_rows=n_rows,
             load_diagnoses=True,
             **kwargs,
@@ -117,7 +118,7 @@ def from_physical_visits(
             code_col_name="diagnosegruppestreng",
             output_col_name=output_col_name,
             n_rows=n_rows_per_df,
-            wildcard_code=wildcard_icd_code,
+            match_with_wildcard=wildcard_icd_code,
             **kwargs,
             load_diagnoses=True,
         )
diff --git a/src/psycop_feature_generation/loaders/raw/load_medications.py b/src/psycop_feature_generation/loaders/raw/load_medications.py
index 466c34a7..9174fba2 100644
--- a/src/psycop_feature_generation/loaders/raw/load_medications.py
+++ b/src/psycop_feature_generation/loaders/raw/load_medications.py
@@ -58,7 +58,7 @@ def load(
             source_timestamp_col_name="datotid_ordinationstart",
             view="FOR_Medicin_ordineret_inkl_2021_feb2022",
             output_col_name=output_col_name,
-            wildcard_code=wildcard_code,
+            match_with_wildcard=wildcard_code,
             n_rows=n_rows,
             load_diagnoses=False,
         )
@@ -72,7 +72,7 @@ def load(
             source_timestamp_col_name="datotid_administration_start",
             view="FOR_Medicin_administreret_inkl_2021_feb2022",
             output_col_name=output_col_name,
-            wildcard_code=wildcard_code,
+            match_with_wildcard=wildcard_code,
             n_rows=n_rows,
             load_diagnoses=False,
         )

From a24e490e921cb38a1dac2d0a5a19519c3b3243c9 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Thu, 17 Nov 2022 15:35:32 +0100
Subject: [PATCH 6/9] refactor: further refactoring

---
 .../loaders/raw/utils.py                      | 68 +++++++++----------
 1 file changed, 31 insertions(+), 37 deletions(-)

diff --git a/src/psycop_feature_generation/loaders/raw/utils.py b/src/psycop_feature_generation/loaders/raw/utils.py
index 4ac68ec3..e7cf86d5 100644
--- a/src/psycop_feature_generation/loaders/raw/utils.py
+++ b/src/psycop_feature_generation/loaders/raw/utils.py
@@ -8,10 +8,10 @@
 
 
 def str_to_sql_match_logic(
-    code_to_match: str,
-    code_sql_col_name: str,
-    load_diagnoses: bool,
-    match_with_wildcard: bool,
+        code_to_match: str,
+        code_sql_col_name: str,
+        load_diagnoses: bool,
+        match_with_wildcard: bool,
 ):
     """Generate SQL match logic from a single string.
 
@@ -19,30 +19,24 @@ def str_to_sql_match_logic(
         code_to_match (list[str]): List of strings to match.
         code_sql_col_name (str): Name of the SQL column containing the codes.
         load_diagnoses (bool): Whether to load diagnoses or medications. Determines the logic. See calling function for more.
-        match_with_wildcard (bool): Whether to match on icd_code* / atc_code*.
+        match_with_wildcard (bool): Whether to match on icd_code* / atc_code* or only icd_code / atc_code.
     """
+    base_query = f"lower({code_sql_col_name}) LIKE '%{code_to_match.lower()}"
+
     if match_with_wildcard:
-        match_col_sql_str = (
-            f"lower({code_sql_col_name}) LIKE '%{code_to_match.lower()}%'"
-        )
-    else:
-        match_col_sql_str = (
-            f"lower({code_sql_col_name}) LIKE '%{code_to_match.lower()}'"
-        )
+        return f"{base_query}%'"
 
-        if load_diagnoses:
-            match_col_sql_str += (
-                f" OR lower({code_sql_col_name}) LIKE '%{code_to_match.lower()}#%'"
-            )
+    if load_diagnoses:
+        return f"{base_query} OR {base_query}#%'"
 
-    return match_col_sql_str
+    return base_query
 
 
 def list_to_sql_logic(
-    codes_to_match: list[str],
-    code_sql_col_name: str,
-    load_diagnoses: bool,
-    match_with_wildcard: bool,
+        codes_to_match: list[str],
+        code_sql_col_name: str,
+        load_diagnoses: bool,
+        match_with_wildcard: bool,
 ):
     """Generate SQL match logic from a list of strings.
 
@@ -50,39 +44,39 @@ def list_to_sql_logic(
         codes_to_match (list[str]): List of strings to match.
         code_sql_col_name (str): Name of the SQL column containing the codes.
         load_diagnoses (bool): Whether to load diagnoses or medications. Determines the logic. See calling function for more.
-        match_with_wildcard (bool): Whether to match on icd_code* / atc_code*.
+        match_with_wildcard (bool): Whether to match on icd_code* / atc_code* or only icd_code / atc_code.
     """
     match_col_sql_strings = []
 
     for code_str in codes_to_match:
+        base_query = f"lower({code_sql_col_name}) LIKE '%{code_str.lower()}"
+
         if match_with_wildcard:
             match_col_sql_strings.append(
-                f"lower({code_sql_col_name}) LIKE '%{code_str.lower()}%'",
+                f"{base_query}%'",
             )
         else:
             # If the string is at the end of diagnosegruppestreng, it doesn't end with a hashtag
-            match_col_sql_strings.append(
-                f"lower({code_sql_col_name}) LIKE '%{code_str.lower()}'",
-            )
+            match_col_sql_strings.append(base_query)
 
             if load_diagnoses:
                 # If the string is at the beginning of diagnosegruppestreng, it doesn't start with a hashtag
                 match_col_sql_strings.append(
-                    f"lower({code_sql_col_name}) LIKE '{code_str.lower()}%'",
+                    f"lower({code_sql_col_name}) LIKE '{code_str.lower()}#%'",
                 )
 
     return " OR ".join(match_col_sql_strings)
 
 
 def load_from_codes(
-    codes_to_match: Union[list[str], str],
-    load_diagnoses: bool,
-    code_col_name: str,
-    source_timestamp_col_name: str,
-    view: str,
-    output_col_name: Optional[str] = None,
-    match_with_wildcard: bool = True,
-    n_rows: Optional[int] = None,
+        codes_to_match: Union[list[str], str],
+        load_diagnoses: bool,
+        code_col_name: str,
+        source_timestamp_col_name: str,
+        view: str,
+        output_col_name: Optional[str] = None,
+        match_with_wildcard: bool = True,
+        n_rows: Optional[int] = None,
 ) -> pd.DataFrame:
     """Load the visits that have diagnoses that match icd_code or atc code from
     the beginning of their adiagnosekode or atc code string. Aggregates all
@@ -131,8 +125,8 @@ def load_from_codes(
         raise ValueError("codes_to_match must be either a list or a string.")
 
     sql = (
-        f"SELECT dw_ek_borger, {source_timestamp_col_name}, {code_col_name}"
-        + f" FROM [fct].{fct} WHERE {source_timestamp_col_name} IS NOT NULL AND ({match_col_sql_str})"
+            f"SELECT dw_ek_borger, {source_timestamp_col_name}, {code_col_name}"
+            + f" FROM [fct].{fct} WHERE {source_timestamp_col_name} IS NOT NULL AND ({match_col_sql_str})"
     )
 
     df = sql_load(sql, database="USR_PS_FORSK", chunksize=None, n_rows=n_rows)

From 8c4a979678feae197989b7e9b17685aa0e226d85 Mon Sep 17 00:00:00 2001
From: Martin Bernstorff <martinbernstorff@gmail.com>
Date: Thu, 17 Nov 2022 15:41:19 +0100
Subject: [PATCH 7/9] style: black

---
 .../loaders/raw/utils.py                      | 36 +++++++++----------
 .../timeseriesflattener/flattened_dataset.py  |  4 ++-
 2 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/src/psycop_feature_generation/loaders/raw/utils.py b/src/psycop_feature_generation/loaders/raw/utils.py
index e7cf86d5..e9989e8e 100644
--- a/src/psycop_feature_generation/loaders/raw/utils.py
+++ b/src/psycop_feature_generation/loaders/raw/utils.py
@@ -8,10 +8,10 @@
 
 
 def str_to_sql_match_logic(
-        code_to_match: str,
-        code_sql_col_name: str,
-        load_diagnoses: bool,
-        match_with_wildcard: bool,
+    code_to_match: str,
+    code_sql_col_name: str,
+    load_diagnoses: bool,
+    match_with_wildcard: bool,
 ):
     """Generate SQL match logic from a single string.
 
@@ -33,10 +33,10 @@ def str_to_sql_match_logic(
 
 
 def list_to_sql_logic(
-        codes_to_match: list[str],
-        code_sql_col_name: str,
-        load_diagnoses: bool,
-        match_with_wildcard: bool,
+    codes_to_match: list[str],
+    code_sql_col_name: str,
+    load_diagnoses: bool,
+    match_with_wildcard: bool,
 ):
     """Generate SQL match logic from a list of strings.
 
@@ -69,14 +69,14 @@ def list_to_sql_logic(
 
 
 def load_from_codes(
-        codes_to_match: Union[list[str], str],
-        load_diagnoses: bool,
-        code_col_name: str,
-        source_timestamp_col_name: str,
-        view: str,
-        output_col_name: Optional[str] = None,
-        match_with_wildcard: bool = True,
-        n_rows: Optional[int] = None,
+    codes_to_match: Union[list[str], str],
+    load_diagnoses: bool,
+    code_col_name: str,
+    source_timestamp_col_name: str,
+    view: str,
+    output_col_name: Optional[str] = None,
+    match_with_wildcard: bool = True,
+    n_rows: Optional[int] = None,
 ) -> pd.DataFrame:
     """Load the visits that have diagnoses that match icd_code or atc code from
     the beginning of their adiagnosekode or atc code string. Aggregates all
@@ -125,8 +125,8 @@ def load_from_codes(
         raise ValueError("codes_to_match must be either a list or a string.")
 
     sql = (
-            f"SELECT dw_ek_borger, {source_timestamp_col_name}, {code_col_name}"
-            + f" FROM [fct].{fct} WHERE {source_timestamp_col_name} IS NOT NULL AND ({match_col_sql_str})"
+        f"SELECT dw_ek_borger, {source_timestamp_col_name}, {code_col_name}"
+        + f" FROM [fct].{fct} WHERE {source_timestamp_col_name} IS NOT NULL AND ({match_col_sql_str})"
     )
 
     df = sql_load(sql, database="USR_PS_FORSK", chunksize=None, n_rows=n_rows)
diff --git a/src/psycop_feature_generation/timeseriesflattener/flattened_dataset.py b/src/psycop_feature_generation/timeseriesflattener/flattened_dataset.py
index 05bc2016..54b328f5 100644
--- a/src/psycop_feature_generation/timeseriesflattener/flattened_dataset.py
+++ b/src/psycop_feature_generation/timeseriesflattener/flattened_dataset.py
@@ -635,7 +635,9 @@ def add_static_info(
         # Try to infer value col name if not provided
         if static_spec.input_col_name_override is None:
             possible_value_cols = [
-                col for col in static_spec.values_df.columns if col not in self.id_col_name
+                col
+                for col in static_spec.values_df.columns
+                if col not in self.id_col_name
             ]
 
             if len(possible_value_cols) == 1:

From be0ac9fa07140e7326943b03a0c8ded130dc675d Mon Sep 17 00:00:00 2001
From: github-actions <action@github.com>
Date: Thu, 17 Nov 2022 15:47:34 +0000
Subject: [PATCH 8/9] 0.8.0

Automatically generated by python-semantic-release
---
 CHANGELOG.md   | 10 ++++++++++
 pyproject.toml |  2 +-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ffe58afc..1678e1ed 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,16 @@
 
 <!--next-version-placeholder-->
 
+## v0.8.0 (2022-11-17)
+### Feature
+* Allow load_medications to concat a list of medications ([`d78f465`](https://github.com/Aarhus-Psychiatry-Research/psycop-feature-generation/commit/d78f46592213b8245229d6618d40f1a1ff4d80eb))
+
+### Fix
+* Remove original functions ([`da59110`](https://github.com/Aarhus-Psychiatry-Research/psycop-feature-generation/commit/da59110978469b0743ce2d625005fc90950fb436))
+
+### Documentation
+* Improve docs ([`9aad0af`](https://github.com/Aarhus-Psychiatry-Research/psycop-feature-generation/commit/9aad0af6205af2e3deffb573676af5a20401bae1))
+
 ## v0.7.0 (2022-11-16)
 ### Feature
 * Full run ([`142212f`](https://github.com/Aarhus-Psychiatry-Research/psycop-feature-generation/commit/142212fc63a59662048b6569dc874def92dfe62f))
diff --git a/pyproject.toml b/pyproject.toml
index 4ce8700c..cb9efcdf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "psycop_feature_generation"
-version = "0.7.0"
+version = "0.8.0"
 description = ""
 authors = ["Your Name <you@example.com>"]
 

From 214cdaef8f650831c4fb2e796ea7753a2c36f5e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakob=20Gr=C3=B8hn?= <bokajgd@gmail.com>
Date: Fri, 18 Nov 2022 11:18:42 +0100
Subject: [PATCH 9/9] build: remove torch as dependency, broke package
 installation

---
 pyproject.toml                                |   1 -
 .../loaders/raw/load_text.py                  | 859 +++++++++---------
 2 files changed, 430 insertions(+), 430 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index cb9efcdf..dec68587 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,7 +21,6 @@ psutil = ">=5.9.1, <6.0.0"
 pandas = ">=1.4.0,<1.6.0"
 catalogue = ">=2.0.0, <2.1.0"
 numpy = ">=1.23.3,<1.23.5"
-torch = "^1.12.1"
 transformers = "^4.22.2"
 pyarrow = ">=9.0.0,<9.1.0"
 psycopmlutils = ">=0.2.4, <0.3.0"
diff --git a/src/psycop_feature_generation/loaders/raw/load_text.py b/src/psycop_feature_generation/loaders/raw/load_text.py
index 7c20e2a6..2f532e54 100644
--- a/src/psycop_feature_generation/loaders/raw/load_text.py
+++ b/src/psycop_feature_generation/loaders/raw/load_text.py
@@ -1,138 +1,139 @@
-"""Load text data from a database and featurise it using a tf-idf
-vectorizer."""
-
-# pylint: disable=E0211,E0213,missing-function-docstring
-
-from functools import partial
-from multiprocessing import Pool
-from pathlib import Path
-from typing import Optional, Union
-
-import dill as pkl
-import pandas as pd
-import torch
-from transformers import AutoModel, AutoTokenizer
-from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions
-
-from psycop_feature_generation.loaders.raw.sql_load import sql_load
-from psycop_feature_generation.utils import PROJECT_ROOT, data_loaders
-
-
-def get_all_valid_note_types() -> set[str]:
-    """Returns a set of valid note types. Notice that 'Konklusion' is replaced
-    by 'Vurdering/konklusion' in 2020, so make sure to use both. 'Ordination'
-    was replaced by 'Ordination, Psykiatry' in 2022, but 'Ordination,
-    Psykiatri' is not included in the table. Use with caution.
-
-    Returns:
-        Set[str]: Set of valid note types
-    """
-    return {
-        "Observation af patient, Psykiatri",
-        "Samtale med behandlingssigte",
-        "Ordination",  # OBS replaced "Ordination, Psykiatri" in 01/02-22
-        # but is not included in this table. Use with caution
-        "Aktuelt psykisk",
-        "Aktuelt socialt, Psykiatri",
-        "Aftaler, Psykiatri",
-        "Medicin",
-        "Aktuelt somatisk, Psykiatri",
-        "Objektivt psykisk",
-        "Kontaktårsag",
-        "Telefonkonsultation",
-        "Journalnotat",
-        "Telefonnotat",
-        "Objektivt, somatisk",
-        "Plan",
-        "Semistruktureret diagnostisk interview",
-        "Vurdering/konklusion",
-    }
-
-
-def _load_notes_for_year(
-    note_types: Union[str, list[str]],
-    year: str,
-    view: Optional[str] = "FOR_SFI_fritekst_resultat_udfoert_i_psykiatrien_aendret",
-    n_rows: Optional[int] = None,
-) -> pd.DataFrame:
-    """Loads clinical notes from sql from a specified year and matching
-    specified note types.
-
-    Args:
-        note_names (Union[str, list[str]]): Which types of notes to load.
-        year (str): Which year to load
-        view (str, optional): Which table to load.
-            Defaults to "[FOR_SFI_fritekst_resultat_udfoert_i_psykiatrien_aendret".
-        n_rows (Optional[int], optional): Number of rows to load. Defaults to None.
-
-    Returns:
-        pd.DataFrame: Dataframe with clinical notes
-    """
-
-    sql = (
-        "SELECT dw_ek_borger, datotid_senest_aendret_i_sfien, fritekst"
-        + f" FROM [fct].[{view}_{year}_inkl_2021_feb2022]"
-        + f" WHERE overskrift IN {note_types}"
-    )
-    return sql_load(
-        sql,
-        database="USR_PS_FORSK",
-        chunksize=None,
-        n_rows=n_rows,
-    )
-
-
-def _tfidf_featurize(
-    df: pd.DataFrame,
-    tfidf_path: Path,
-    text_col: str = "text",
-) -> pd.DataFrame:
-    """TF-IDF featurize text. Assumes `df` to have a column named `text`.
-
-    Args:
-        df (pd.DataFrame): Dataframe with text column
-        tfidf_path (Optional[Path]): Path to a sklearn tf-idf vectorizer
-        text_col (str, optional): Name of text column. Defaults to "text".
-
-    Returns:
-        pd.DataFrame: Original dataframe with tf-idf features appended
-    """
-    with open(tfidf_path, "rb") as f:
-        tfidf = pkl.load(f)
-
-    vocab = ["tfidf-" + word for word in tfidf.get_feature_names()]
-
-    text = df[text_col].values
-    df = df.drop(text_col, axis=1).reset_index(drop=True)
-
-    text = tfidf.transform(text)
-    text = pd.DataFrame(text.toarray(), columns=vocab)
-    return pd.concat([df, text], axis=1)
-
-
-def _mean_pooling(
-    model_output: BaseModelOutputWithPoolingAndCrossAttentions,
-    attention_mask: torch.Tensor,
-) -> torch.Tensor:
-    """Mean Pooling - take attention mask into account for correct averaging.
-
-    Args:
-        model_output (BaseModelOutputWithPoolingAndCrossAttentions): model output from pretrained Huggingface transformer
-        attention_mask (torch.Tensor): attention mask from from pretrained Hugginface tokenizer
-
-    Returns:
-        np.ndarray: numpy array with mean pooled embeddings
-    """
-    token_embeddings = model_output[
-        0
-    ]  # first element of model_output contains all token embeddings
-    input_mask_expanded = (
-        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-    )
-    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
-        input_mask_expanded.sum(1),
-        min=1e-9,
-    )
+# """Load text data from a database and featurise it using a tf-idf
+# vectorizer."""
+
+# # pylint: disable=E0211,E0213,missing-function-docstring
+
+# from functools import partial
+# from multiprocessing import Pool
+# from pathlib import Path
+# from typing import Optional, Union
+
+# import dill as pkl
+# import pandas as pd
+
+# # import torch
+# from transformers import AutoModel, AutoTokenizer
+# from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions
+
+# from psycop_feature_generation.loaders.raw.sql_load import sql_load
+# from psycop_feature_generation.utils import PROJECT_ROOT, data_loaders
+
+
+# def get_all_valid_note_types() -> set[str]:
+#     """Returns a set of valid note types. Notice that 'Konklusion' is replaced
+#     by 'Vurdering/konklusion' in 2020, so make sure to use both. 'Ordination'
+#     was replaced by 'Ordination, Psykiatri' in 2022, but 'Ordination,
+#     Psykiatri' is not included in the table. Use with caution.
+
+#     Returns:
+#         Set[str]: Set of valid note types
+#     """
+#     return {
+#         "Observation af patient, Psykiatri",
+#         "Samtale med behandlingssigte",
+#         "Ordination",  # OBS replaced "Ordination, Psykiatri" in 01/02-22
+#         # but is not included in this table. Use with caution
+#         "Aktuelt psykisk",
+#         "Aktuelt socialt, Psykiatri",
+#         "Aftaler, Psykiatri",
+#         "Medicin",
+#         "Aktuelt somatisk, Psykiatri",
+#         "Objektivt psykisk",
+#         "Kontaktårsag",
+#         "Telefonkonsultation",
+#         "Journalnotat",
+#         "Telefonnotat",
+#         "Objektivt, somatisk",
+#         "Plan",
+#         "Semistruktureret diagnostisk interview",
+#         "Vurdering/konklusion",
+#     }
+
+
+# def _load_notes_for_year(
+#     note_types: Union[str, list[str]],
+#     year: str,
+#     view: Optional[str] = "FOR_SFI_fritekst_resultat_udfoert_i_psykiatrien_aendret",
+#     n_rows: Optional[int] = None,
+# ) -> pd.DataFrame:
+#     """Loads clinical notes from sql from a specified year and matching
+#     specified note types.
+
+#     Args:
+#         note_types (Union[str, list[str]]): Which types of notes to load.
+#         year (str): Which year to load
+#         view (str, optional): Which table to load.
+#             Defaults to "FOR_SFI_fritekst_resultat_udfoert_i_psykiatrien_aendret".
+#         n_rows (Optional[int], optional): Number of rows to load. Defaults to None.
+
+#     Returns:
+#         pd.DataFrame: Dataframe with clinical notes
+#     """
+
+#     sql = (
+#         "SELECT dw_ek_borger, datotid_senest_aendret_i_sfien, fritekst"
+#         + f" FROM [fct].[{view}_{year}_inkl_2021_feb2022]"
+#         + f" WHERE overskrift IN {note_types}"
+#     )
+#     return sql_load(
+#         sql,
+#         database="USR_PS_FORSK",
+#         chunksize=None,
+#         n_rows=n_rows,
+#     )
+
+
+# def _tfidf_featurize(
+#     df: pd.DataFrame,
+#     tfidf_path: Path,
+#     text_col: str = "text",
+# ) -> pd.DataFrame:
+#     """TF-IDF featurize text. Assumes `df` to have a column named `text`.
+
+#     Args:
+#         df (pd.DataFrame): Dataframe with text column
+#         tfidf_path (Optional[Path]): Path to a sklearn tf-idf vectorizer
+#         text_col (str, optional): Name of text column. Defaults to "text".
+
+#     Returns:
+#         pd.DataFrame: Original dataframe with tf-idf features appended
+#     """
+#     with open(tfidf_path, "rb") as f:
+#         tfidf = pkl.load(f)
+
+#     vocab = ["tfidf-" + word for word in tfidf.get_feature_names()]
+
+#     text = df[text_col].values
+#     df = df.drop(text_col, axis=1).reset_index(drop=True)
+
+#     text = tfidf.transform(text)
+#     text = pd.DataFrame(text.toarray(), columns=vocab)
+#     return pd.concat([df, text], axis=1)
+
+
+# def _mean_pooling(
+#     model_output: BaseModelOutputWithPoolingAndCrossAttentions,
+#     attention_mask: torch.Tensor,
+# ) -> torch.Tensor:
+#     """Mean Pooling - take attention mask into account for correct averaging.
+
+#     Args:
+#         model_output (BaseModelOutputWithPoolingAndCrossAttentions): model output from pretrained Huggingface transformer
+#         attention_mask (torch.Tensor): attention mask from pretrained Huggingface tokenizer
+
+#     Returns:
+#         torch.Tensor: tensor with mean pooled embeddings
+#     """
+#     token_embeddings = model_output[
+#         0
+#     ]  # first element of model_output contains all token embeddings
+#     input_mask_expanded = (
+#         attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+#     )
+#     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
+#         input_mask_expanded.sum(1),
+#         min=1e-9,
+#     )
 
 
 def _chunk_text(text: str, seq_length: int) -> list[str]:
@@ -163,297 +164,297 @@ def _chunk_text(text: str, seq_length: int) -> list[str]:
         return chunks
 
 
-def _huggingface_featurize(
-    df: pd.DataFrame,
-    model_id: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
-    text_col: str = "text",
-) -> pd.DataFrame:
-    """Featurize text using a huggingface model and generate a dataframe with
-    the embeddings. If the text is longer than the maximum sequence length of
-    the model, the text is split into chunks and embeddings are averaged across
-    chunks.
-
-    Args:
-        df (pd.DataFrame): Dataframe with text column
-        model_id (str): Which huggingface model to use. See https://huggingface.co/models for a list of models. Assumes the model is a transformer model and has both a tokenizer and a model.
-        text_col (str, optional): Name of text column. Defaults to "text".
-
-    Returns:
-        pd.DataFrame: Original dataframe with huggingface embeddings appended
-
-    Example:
-        >>> p = PROJECT_ROOT / "tests" / "test_data" / "raw"
-        >>> huggingface_model_id = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
-        >>> df_p = p / "synth_txt_data.csv"
-
-        >>> df = pd.read_csv(df_p)
-        >>> df = df.dropna()
-
-        >>> x = _huggingface_featurize(df, huggingface_model_id)
-    """
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    model = AutoModel.from_pretrained(model_id)
-
-    df = df[df[text_col].notna()]
-    text = df[text_col].values
-    df = df.drop(text_col, axis=1)
-
-    max_seq_length = int(
-        tokenizer.model_max_length / 1.5,
-    )  # allowing space for more word piece tokens than words in original sequence
-
-    list_of_embeddings = []
-    for txt in text:
-        chunks = _chunk_text(txt, max_seq_length)
-
-        encoded_input = tokenizer(
-            chunks,
-            padding=True,
-            truncation=True,
-            return_tensors="pt",
-        )
-
-        with torch.no_grad():
-            model_output = model(**encoded_input)
-
-        embedding = _mean_pooling(model_output, encoded_input["attention_mask"])
-
-        if len(chunks) > 1:
-            list_of_embeddings.append(torch.mean(embedding, axis=0).numpy())  # type: ignore
-        else:
-            list_of_embeddings.append(embedding.numpy()[0])
-
-    embeddings_df = pd.DataFrame(list_of_embeddings)
-    embeddings_df.columns = [
-        "embedding-" + str(dimension) for dimension in range(embeddings_df.shape[1])
-    ]
-
-    return pd.concat([df, embeddings_df], axis=1)
-
-
-def _load_and_featurize_notes_per_year(
-    year: str,
-    note_types: Union[str, list[str]],
-    view: str,
-    n_rows: int,
-    featurizer: str,
-    featurizer_kwargs: dict,
-) -> pd.DataFrame:
-    """Loads clinical notes and features them.
-
-    Args:
-        note_types (Union[str, list[str]]): Which note types to load.
-        year (str): Which year to load
-        view (str): Which view to load
-        n_rows (int): How many rows to load
-        featurizer (str): Which featurizer to use (tfidf or huggingface)
-        featurizer_kwargs (dict): kwargs for the featurizer
-
-    Returns:
-        pd.DataFrame: Dataframe of notes and features
-    """
-
-    df = _load_notes_for_year(
-        note_types=note_types,
-        year=year,
-        view=view,
-        n_rows=n_rows,
-    )
-    if featurizer == "tfidf":
-        df = _tfidf_featurize(df, **featurizer_kwargs)
-    elif featurizer == "huggingface":
-        df = _huggingface_featurize(df, **featurizer_kwargs)
-    return df
-
-
-def load_and_featurize_notes(
-    note_types: Union[str, list[str]],
-    featurizer: str,
-    featurizer_kwargs: Optional[dict] = None,
-    n_rows: Optional[int] = None,
-) -> pd.DataFrame:
-    """Loads all clinical notes that match the specified note from all years.
-    Featurizes the notes using the specified featurizer (tf-idf or huggingface
-    model). Kwargs passed to.
-
-    Args:
-        note_types (Union[str, list[str]]): Which note types to load. See
-            `get_all_valid_note_types()` for valid note types.
-        featurizer (str): Which featurizer to use. Either 'tf-idf' or 'huggingface' or
-            `None` to return the raw text.
-        featurizer_kwargs (Optional[dict]): Kwargs passed to the featurizer. Defaults to None.
-            For tf-idf, this is `tfidf_path` to the vectorizer. For huggingface,
-            this is `model_id` to the model.
-        n_rows (Optional[int], optional): How many rows to load. Defaults to None.
-
-    Raises:
-        ValueError: If given invalid featurizer
-        ValueError: If given invlaid note type
-
-    Returns:
-        pd.DataFrame: Featurized clinical notes
-    """
-
-    valid_featurizers = {"tfidf", "huggingface", None}
-    if featurizer not in valid_featurizers:
-        raise ValueError(
-            f"featurizer must be one of {valid_featurizers}, got {featurizer}",
-        )
-
-    if isinstance(note_types, str):
-        note_types = list(note_types)  # pylint: disable=W0642
-    # check for invalid note types
-    if not set(note_types).issubset(get_all_valid_note_types()):
-        raise ValueError(
-            "Invalid note type. Valid note types are: "
-            + str(get_all_valid_note_types()),
-        )
-
-    # convert note_types to sql query
-    note_types = "('" + "', '".join(note_types) + "')"  # pylint: disable=W0642
-
-    view = "FOR_SFI_fritekst_resultat_udfoert_i_psykiatrien_aendret"
-
-    load_and_featurize = partial(
-        _load_and_featurize_notes_per_year,
-        note_types=note_types,
-        view=view,
-        n_rows=n_rows,
-        featurizer=featurizer,
-        featurizer_kwargs=featurizer_kwargs,
-    )
-
-    years = list(range(2011, 2022))
-
-    with Pool(processes=len(years)) as p:
-        dfs = p.map(load_and_featurize, [str(y) for y in years])
-
-    df = pd.concat(dfs)
-
-    df = df.rename(
-        {"datotid_senest_aendret_i_sfien": "timestamp", "fritekst": "text"},
-        axis=1,
-    )
-    return df
-
-
-@data_loaders.register("all_notes")
-def load_all_notes(
-    featurizer: str,
-    n_rows: Optional[int] = None,
-    featurizer_kwargs: Optional[dict] = None,
-) -> pd.DataFrame:
-    """Returns all notes from all years. Featurizes the notes using the
-    specified featurizer ('tfidf', 'huggingface', or `None` for raw text).
-    `featurizer_kwargs` are passed to the featurizer (e.g. "tfidf_path" for
-    tfidf, and "model_id" for huggingface).
-
-    Args:
-        featurizer (str): Which featurizer to use. Either 'tf-idf', 'huggingface', or None
-        n_rows (Optional[int], optional): Number of rows to load. Defaults to None.
-        featurizer_kwargs (Optional[dict], optional): Keyword arguments passed to
-            the featurizer. Defaults to None.
-
-    Returns:
-        pd.DataFrame: (Featurized) notes
-    """
-    return load_and_featurize_notes(
-        note_types=get_all_valid_note_types(),
-        featurizer=featurizer,
-        n_rows=n_rows,
-        featurizer_kwargs=featurizer_kwargs,
-    )
-
-
-@data_loaders.register("aktuelt_psykisk")
-def load_aktuel_psykisk(
-    featurizer: str,
-    n_rows: Optional[int] = None,
-    featurizer_kwargs: Optional[dict] = None,
-) -> pd.DataFrame:
-    """Returns 'Aktuelt psykisk' notes from all years. Featurizes the notes
-    using the specified featurizer ('tfidf', 'huggingface', or `None` for raw
-    text). `featurizer_kwargs` are passed to the featurizer (e.g. "tfidf_path"
-    for tfidf, and "model_id" for huggingface).
-
-    Args:
-        featurizer (str): Which featurizer to use. Either 'tf-idf', 'huggingface', or None
-        n_rows (Optional[int], optional): Number of rows to load. Defaults to None.
-        featurizer_kwargs (Optional[dict], optional): Keyword arguments passed to
-            the featurizer. Defaults to None.
-
-    Returns:
-        pd.DataFrame: (Featurized) notes
-    """
-    return load_and_featurize_notes(
-        note_types="Aktuelt psykisk",
-        featurizer=featurizer,
-        n_rows=n_rows,
-        featurizer_kwargs=featurizer_kwargs,
-    )
-
-
-@data_loaders.register("load_note_types")
-def load_arbitrary_notes(
-    note_names: Union[str, list[str]],
-    featurizer: str,
-    n_rows: Optional[int] = None,
-    featurizer_kwargs: Optional[dict] = None,
-) -> pd.DataFrame:
-    """Returns one or multiple note types from all years. Featurizes the notes
-    using the specified featurizer ('tfidf', 'huggingface', or `None` for raw
-    text). `featurizer_kwargs` are passed to the featurizer (e.g. "tfidf_path"
-    for tfidf, and "model_id" for huggingface).
-
-    Args:
-        note_names (Union[str, list[str]]): Which note types to load. See
-            `get_all_valid_note_types()` for a list of valid note types.
-        featurizer (str): Which featurizer to use. Either 'tf-idf', 'huggingface', or None
-        n_rows (Optional[int], optional): Number of rows to load. Defaults to None.
-        featurizer_kwargs (Optional[dict], optional): Keyword arguments passed to
-            the featurizer. Defaults to None.
-
-    Returns:
-        pd.DataFrame: (Featurized) notes
-    """
-    return load_and_featurize_notes(
-        note_names,
-        featurizer=featurizer,
-        n_rows=n_rows,
-        featurizer_kwargs=featurizer_kwargs,
-    )
-
-
-@data_loaders.register("synth_notes")
-def load_synth_notes(featurizer: str, **featurizer_kwargs) -> pd.DataFrame:
-    """Load (featurized) synthetic notes for testing.
-
-    Args:
-        featurizer (str): Which featurizer to use
-        **featurizer_kwargs: Keyword arguments passed to the featurizer
-
-    Raises:
-        ValueError: If given invalid featurizer
-
-    Returns:
-        pd.DataFrame: (Featurized) synthetic notes
-    """
-    p = PROJECT_ROOT / "tests" / "test_data"
-    df = pd.read_csv(
-        p / "raw" / "synth_txt_data.csv",
-    ).drop("Unnamed: 0", axis=1)
-    df = df.dropna()
-    df["timestamp"] = pd.to_datetime(df["timestamp"])
-
-    if featurizer == "tfidf":
-        return _tfidf_featurize(
-            df,
-            tfidf_path=p / "test_tfidf" / "tfidf_10.pkl",
-        )
-    elif featurizer == "huggingface":
-        return _huggingface_featurize(
-            df,
-            **featurizer_kwargs,
-        )
-
-    raise ValueError("Only tfidf or huggingface featurizer supported for synth notes")
+# def _huggingface_featurize(
+#     df: pd.DataFrame,
+#     model_id: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
+#     text_col: str = "text",
+# ) -> pd.DataFrame:
+#     """Featurize text using a huggingface model and generate a dataframe with
+#     the embeddings. If the text is longer than the maximum sequence length of
+#     the model, the text is split into chunks and embeddings are averaged across
+#     chunks.
+
+#     Args:
+#         df (pd.DataFrame): Dataframe with text column
+#         model_id (str): Which huggingface model to use. See https://huggingface.co/models for a list of models. Assumes the model is a transformer model and has both a tokenizer and a model.
+#         text_col (str, optional): Name of text column. Defaults to "text".
+
+#     Returns:
+#         pd.DataFrame: Original dataframe with huggingface embeddings appended
+
+#     Example:
+#         >>> p = PROJECT_ROOT / "tests" / "test_data" / "raw"
+#         >>> huggingface_model_id = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
+#         >>> df_p = p / "synth_txt_data.csv"
+
+#         >>> df = pd.read_csv(df_p)
+#         >>> df = df.dropna()
+
+#         >>> x = _huggingface_featurize(df, huggingface_model_id)
+#     """
+#     tokenizer = AutoTokenizer.from_pretrained(model_id)
+#     model = AutoModel.from_pretrained(model_id)
+
+#     df = df[df[text_col].notna()]
+#     text = df[text_col].values
+#     df = df.drop(text_col, axis=1)
+
+#     max_seq_length = int(
+#         tokenizer.model_max_length / 1.5,
+#     )  # allowing space for more word piece tokens than words in original sequence
+
+#     list_of_embeddings = []
+#     for txt in text:
+#         chunks = _chunk_text(txt, max_seq_length)
+
+#         encoded_input = tokenizer(
+#             chunks,
+#             padding=True,
+#             truncation=True,
+#             return_tensors="pt",
+#         )
+
+#         with torch.no_grad():
+#             model_output = model(**encoded_input)
+
+#         embedding = _mean_pooling(model_output, encoded_input["attention_mask"])
+
+#         if len(chunks) > 1:
+#             list_of_embeddings.append(torch.mean(embedding, axis=0).numpy())  # type: ignore
+#         else:
+#             list_of_embeddings.append(embedding.numpy()[0])
+
+#     embeddings_df = pd.DataFrame(list_of_embeddings)
+#     embeddings_df.columns = [
+#         "embedding-" + str(dimension) for dimension in range(embeddings_df.shape[1])
+#     ]
+
+#     return pd.concat([df, embeddings_df], axis=1)
+
+
+# def _load_and_featurize_notes_per_year(
+#     year: str,
+#     note_types: Union[str, list[str]],
+#     view: str,
+#     n_rows: int,
+#     featurizer: str,
+#     featurizer_kwargs: dict,
+# ) -> pd.DataFrame:
+#     """Loads clinical notes and features them.
+
+#     Args:
+#         note_types (Union[str, list[str]]): Which note types to load.
+#         year (str): Which year to load
+#         view (str): Which view to load
+#         n_rows (int): How many rows to load
+#         featurizer (str): Which featurizer to use (tfidf or huggingface)
+#         featurizer_kwargs (dict): kwargs for the featurizer
+
+#     Returns:
+#         pd.DataFrame: Dataframe of notes and features
+#     """
+
+#     df = _load_notes_for_year(
+#         note_types=note_types,
+#         year=year,
+#         view=view,
+#         n_rows=n_rows,
+#     )
+#     if featurizer == "tfidf":
+#         df = _tfidf_featurize(df, **featurizer_kwargs)
+#     elif featurizer == "huggingface":
+#         df = _huggingface_featurize(df, **featurizer_kwargs)
+#     return df
+
+
+# def load_and_featurize_notes(
+#     note_types: Union[str, list[str]],
+#     featurizer: str,
+#     featurizer_kwargs: Optional[dict] = None,
+#     n_rows: Optional[int] = None,
+# ) -> pd.DataFrame:
+#     """Loads all clinical notes that match the specified note from all years.
+#     Featurizes the notes using the specified featurizer (tf-idf or huggingface
+#     model). `featurizer_kwargs` are passed to the featurizer.
+
+#     Args:
+#         note_types (Union[str, list[str]]): Which note types to load. See
+#             `get_all_valid_note_types()` for valid note types.
+#         featurizer (str): Which featurizer to use. Either 'tfidf' or 'huggingface' or
+#             `None` to return the raw text.
+#         featurizer_kwargs (Optional[dict]): Kwargs passed to the featurizer. Defaults to None.
+#             For tf-idf, this is `tfidf_path` to the vectorizer. For huggingface,
+#             this is `model_id` to the model.
+#         n_rows (Optional[int], optional): How many rows to load. Defaults to None.
+
+#     Raises:
+#         ValueError: If given invalid featurizer
+#         ValueError: If given invalid note type
+
+#     Returns:
+#         pd.DataFrame: Featurized clinical notes
+#     """
+
+#     valid_featurizers = {"tfidf", "huggingface", None}
+#     if featurizer not in valid_featurizers:
+#         raise ValueError(
+#             f"featurizer must be one of {valid_featurizers}, got {featurizer}",
+#         )
+
+#     if isinstance(note_types, str):
+#         note_types = list(note_types)  # pylint: disable=W0642
+#     # check for invalid note types
+#     if not set(note_types).issubset(get_all_valid_note_types()):
+#         raise ValueError(
+#             "Invalid note type. Valid note types are: "
+#             + str(get_all_valid_note_types()),
+#         )
+
+#     # convert note_types to sql query
+#     note_types = "('" + "', '".join(note_types) + "')"  # pylint: disable=W0642
+
+#     view = "FOR_SFI_fritekst_resultat_udfoert_i_psykiatrien_aendret"
+
+#     load_and_featurize = partial(
+#         _load_and_featurize_notes_per_year,
+#         note_types=note_types,
+#         view=view,
+#         n_rows=n_rows,
+#         featurizer=featurizer,
+#         featurizer_kwargs=featurizer_kwargs,
+#     )
+
+#     years = list(range(2011, 2022))
+
+#     with Pool(processes=len(years)) as p:
+#         dfs = p.map(load_and_featurize, [str(y) for y in years])
+
+#     df = pd.concat(dfs)
+
+#     df = df.rename(
+#         {"datotid_senest_aendret_i_sfien": "timestamp", "fritekst": "text"},
+#         axis=1,
+#     )
+#     return df
+
+
+# @data_loaders.register("all_notes")
+# def load_all_notes(
+#     featurizer: str,
+#     n_rows: Optional[int] = None,
+#     featurizer_kwargs: Optional[dict] = None,
+# ) -> pd.DataFrame:
+#     """Returns all notes from all years. Featurizes the notes using the
+#     specified featurizer ('tfidf', 'huggingface', or `None` for raw text).
+#     `featurizer_kwargs` are passed to the featurizer (e.g. "tfidf_path" for
+#     tfidf, and "model_id" for huggingface).
+
+#     Args:
+#         featurizer (str): Which featurizer to use. Either 'tfidf', 'huggingface', or None
+#         n_rows (Optional[int], optional): Number of rows to load. Defaults to None.
+#         featurizer_kwargs (Optional[dict], optional): Keyword arguments passed to
+#             the featurizer. Defaults to None.
+
+#     Returns:
+#         pd.DataFrame: (Featurized) notes
+#     """
+#     return load_and_featurize_notes(
+#         note_types=get_all_valid_note_types(),
+#         featurizer=featurizer,
+#         n_rows=n_rows,
+#         featurizer_kwargs=featurizer_kwargs,
+#     )
+
+
+# @data_loaders.register("aktuelt_psykisk")
+# def load_aktuel_psykisk(
+#     featurizer: str,
+#     n_rows: Optional[int] = None,
+#     featurizer_kwargs: Optional[dict] = None,
+# ) -> pd.DataFrame:
+#     """Returns 'Aktuelt psykisk' notes from all years. Featurizes the notes
+#     using the specified featurizer ('tfidf', 'huggingface', or `None` for raw
+#     text). `featurizer_kwargs` are passed to the featurizer (e.g. "tfidf_path"
+#     for tfidf, and "model_id" for huggingface).
+
+#     Args:
+#         featurizer (str): Which featurizer to use. Either 'tfidf', 'huggingface', or None
+#         n_rows (Optional[int], optional): Number of rows to load. Defaults to None.
+#         featurizer_kwargs (Optional[dict], optional): Keyword arguments passed to
+#             the featurizer. Defaults to None.
+
+#     Returns:
+#         pd.DataFrame: (Featurized) notes
+#     """
+#     return load_and_featurize_notes(
+#         note_types="Aktuelt psykisk",
+#         featurizer=featurizer,
+#         n_rows=n_rows,
+#         featurizer_kwargs=featurizer_kwargs,
+#     )
+
+
+# @data_loaders.register("load_note_types")
+# def load_arbitrary_notes(
+#     note_names: Union[str, list[str]],
+#     featurizer: str,
+#     n_rows: Optional[int] = None,
+#     featurizer_kwargs: Optional[dict] = None,
+# ) -> pd.DataFrame:
+#     """Returns one or multiple note types from all years. Featurizes the notes
+#     using the specified featurizer ('tfidf', 'huggingface', or `None` for raw
+#     text). `featurizer_kwargs` are passed to the featurizer (e.g. "tfidf_path"
+#     for tfidf, and "model_id" for huggingface).
+
+#     Args:
+#         note_names (Union[str, list[str]]): Which note types to load. See
+#             `get_all_valid_note_types()` for a list of valid note types.
+#         featurizer (str): Which featurizer to use. Either 'tfidf', 'huggingface', or None
+#         n_rows (Optional[int], optional): Number of rows to load. Defaults to None.
+#         featurizer_kwargs (Optional[dict], optional): Keyword arguments passed to
+#             the featurizer. Defaults to None.
+
+#     Returns:
+#         pd.DataFrame: (Featurized) notes
+#     """
+#     return load_and_featurize_notes(
+#         note_names,
+#         featurizer=featurizer,
+#         n_rows=n_rows,
+#         featurizer_kwargs=featurizer_kwargs,
+#     )
+
+
+# @data_loaders.register("synth_notes")
+# def load_synth_notes(featurizer: str, **featurizer_kwargs) -> pd.DataFrame:
+#     """Load (featurized) synthetic notes for testing.
+
+#     Args:
+#         featurizer (str): Which featurizer to use
+#         **featurizer_kwargs: Keyword arguments passed to the featurizer
+
+#     Raises:
+#         ValueError: If given invalid featurizer
+
+#     Returns:
+#         pd.DataFrame: (Featurized) synthetic notes
+#     """
+#     p = PROJECT_ROOT / "tests" / "test_data"
+#     df = pd.read_csv(
+#         p / "raw" / "synth_txt_data.csv",
+#     ).drop("Unnamed: 0", axis=1)
+#     df = df.dropna()
+#     df["timestamp"] = pd.to_datetime(df["timestamp"])
+
+#     if featurizer == "tfidf":
+#         return _tfidf_featurize(
+#             df,
+#             tfidf_path=p / "test_tfidf" / "tfidf_10.pkl",
+#         )
+#     elif featurizer == "huggingface":
+#         return _huggingface_featurize(
+#             df,
+#             **featurizer_kwargs,
+#         )
+
+#     raise ValueError("Only tfidf or huggingface featurizer supported for synth notes")