diff --git a/CHANGELOG.md b/CHANGELOG.md index ffe58afc..1678e1ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,16 @@ +## v0.8.0 (2022-11-17) +### Feature +* Allow load_medications to concat a list of medications ([`d78f465`](https://github.com/Aarhus-Psychiatry-Research/psycop-feature-generation/commit/d78f46592213b8245229d6618d40f1a1ff4d80eb)) + +### Fix +* Remove original functions ([`da59110`](https://github.com/Aarhus-Psychiatry-Research/psycop-feature-generation/commit/da59110978469b0743ce2d625005fc90950fb436)) + +### Documentation +* Improve docs ([`9aad0af`](https://github.com/Aarhus-Psychiatry-Research/psycop-feature-generation/commit/9aad0af6205af2e3deffb573676af5a20401bae1)) + ## v0.7.0 (2022-11-16) ### Feature * Full run ([`142212f`](https://github.com/Aarhus-Psychiatry-Research/psycop-feature-generation/commit/142212fc63a59662048b6569dc874def92dfe62f)) diff --git a/example/loaders/load_medications.py b/example/loaders/load_medications.py index 81bd2d19..0e4fc2b4 100644 --- a/example/loaders/load_medications.py +++ b/example/loaders/load_medications.py @@ -3,4 +3,4 @@ import psycop_feature_generation.loaders.raw.load_medications as m if __name__ == "__main__": - df = m.antipsychotics() + df = m.first_gen_antipsychotics(n_rows=1000) diff --git a/pyproject.toml b/pyproject.toml index 4ce8700c..dec68587 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "psycop_feature_generation" -version = "0.7.0" +version = "0.8.0" description = "" authors = ["Your Name "] @@ -21,7 +21,6 @@ psutil = ">=5.9.1, <6.0.0" pandas = ">=1.4.0,<1.6.0" catalogue = ">=2.0.0, <2.1.0" numpy = ">=1.23.3,<1.23.5" -torch = "^1.12.1" transformers = "^4.22.2" pyarrow = ">=9.0.0,<9.1.0" psycopmlutils = ">=0.2.4, <0.3.0" diff --git a/src/psycop_feature_generation/data_checks/raw/check_predictor_lists.py b/src/psycop_feature_generation/data_checks/raw/check_predictor_lists.py index 6e546700..2fae584f 100644 --- 
a/src/psycop_feature_generation/data_checks/raw/check_predictor_lists.py +++ b/src/psycop_feature_generation/data_checks/raw/check_predictor_lists.py @@ -37,6 +37,15 @@ def check_df_conforms_to_feature_spec( ValueError: If df does not conform to d. """ + if required_columns is None: + required_columns = ["dw_ek_borger", "timestamp", "value"] + + if subset_duplicates_columns is None: + subset_duplicates_columns = ["dw_ek_borger", "timestamp", "value"] + + if expected_val_dtypes is None: + expected_val_dtypes = ["float64", "int64"] + msg = Printer(timestamp=True) allowed_nan_value_prop = ( diff --git a/src/psycop_feature_generation/loaders/raw/load_diagnoses.py b/src/psycop_feature_generation/loaders/raw/load_diagnoses.py index e6139183..525ee48b 100644 --- a/src/psycop_feature_generation/loaders/raw/load_diagnoses.py +++ b/src/psycop_feature_generation/loaders/raw/load_diagnoses.py @@ -10,97 +10,10 @@ import pandas as pd -from psycop_feature_generation.loaders.raw.sql_load import sql_load +from psycop_feature_generation.loaders.raw.utils import load_from_codes from psycop_feature_generation.utils import data_loaders -def _load( - icd_code: Union[list[str], str], - source_timestamp_col_name: str, - fct: str, - output_col_name_override: Optional[str] = None, - wildcard_icd_code: Optional[bool] = True, - n_rows: Optional[int] = None, -) -> pd.DataFrame: - """Load the visits that have diagnoses that match icd_code from the - beginning of their adiagnosekode string. Aggregates all that match. - - Args: - icd_code (Union[list[str], str]): Substring(s) to match diagnoses for. # noqa: DAR102 - Matches any diagnoses, whether a-diagnosis, b-diagnosis etc. If a list is passed, will - count a diagnosis as a match if any of the icd_codes in the list match. - source_timestamp_col_name (str): Name of the timestamp column in the SQL - view. - fct (str): Name of the SQL view to load from. - output_col_name_override (str, optional): Name of new column string. 
If not specified, defaults to the icd_code. - wildcard_icd_code (bool, optional): Whether to match on icd_code*. - Defaults to true. - n_rows: Number of rows to return. Defaults to None. - - Returns: - pd.DataFrame: A pandas dataframe with dw_ek_borger, timestamp and - output_col_name = 1 - """ - fct = f"[{fct}]" - - # Must be able to split a string like this: - # A:DF431#+:ALFC3#B:DF329 - # Which means that if wildcard_icd_code is False, we must match on icd_code# or icd_code followed by nothing. - # If it's true, we can match on icd_code*. - - # Handle if there are multiple ICD codes to count together. - if isinstance(icd_code, list): - match_col_sql_strings = [] - - for code_str in icd_code: # pylint: disable=not-an-iterable - if wildcard_icd_code: - match_col_sql_strings.append( - f"lower(diagnosegruppestreng) LIKE '%{code_str.lower()}%'", - ) - else: - # If the string is at the end of diagnosegruppestreng, it doesn't end with a hashtag - match_col_sql_strings.append( - f"lower(diagnosegruppestreng) LIKE '%{code_str.lower()}'", - ) - - # But if it is at the end, it does - match_col_sql_strings.append( - f"lower(diagnosegruppestreng) LIKE '%{code_str.lower()}#%'", - ) - - match_col_sql_str = " OR ".join(match_col_sql_strings) - else: - if wildcard_icd_code: - match_col_sql_str = ( - f"lower(diagnosegruppestreng) LIKE '%{icd_code.lower()}%'" - ) - - else: - match_col_sql_str = f"lower(diagnosegruppestreng) LIKE '%{icd_code.lower()}' OR lower(diagnosegruppestreng) LIKE '%{icd_code.lower()}#%'" - - sql = ( - f"SELECT dw_ek_borger, {source_timestamp_col_name}, diagnosegruppestreng" - + f" FROM [fct].{fct} WHERE {source_timestamp_col_name} IS NOT NULL AND ({match_col_sql_str})" - ) - - df = sql_load(sql, database="USR_PS_FORSK", chunksize=None, n_rows=n_rows) - - if output_col_name_override is None: - output_col_name = icd_code - else: - output_col_name = output_col_name_override - - df[output_col_name] = 1 - - df.drop(["diagnosegruppestreng"], axis="columns", 
inplace=True) - - return df.rename( - columns={ - source_timestamp_col_name: "timestamp", - }, - ) - - def concat_from_physical_visits( icd_codes: list[str], output_col_name: str, @@ -122,34 +35,34 @@ def concat_from_physical_visits( diagnoses_source_table_info = { "lpr3": { - "fct": "FOR_LPR3kontakter_psyk_somatik_inkl_2021_feb2022", + "view": "FOR_LPR3kontakter_psyk_somatik_inkl_2021_feb2022", "source_timestamp_col_name": "datotid_lpr3kontaktstart", }, "lpr2_inpatient": { - "fct": "FOR_indlaeggelser_psyk_somatik_LPR2_inkl_2021_feb2022", + "view": "FOR_indlaeggelser_psyk_somatik_LPR2_inkl_2021_feb2022", "source_timestamp_col_name": "datotid_indlaeggelse", }, "lpr2_acute_outpatient": { - "fct": "FOR_akutambulantekontakter_psyk_somatik_LPR2_inkl_2021_feb2022", + "view": "FOR_akutambulantekontakter_psyk_somatik_LPR2_inkl_2021_feb2022", "source_timestamp_col_name": "datotid_start", }, "lpr2_outpatient": { - "fct": "FOR_besoeg_psyk_somatik_LPR2_inkl_2021_feb2022", + "view": "FOR_besoeg_psyk_somatik_LPR2_inkl_2021_feb2022", "source_timestamp_col_name": "datotid_start", }, } - # Using ._load is faster than from_physical_visits since it can process all icd_codes in the SQL request at once, - # rather than processing one at a time and aggregating. 
dfs = [ - _load( - icd_code=icd_codes, - output_col_name_override=output_col_name, - wildcard_icd_code=wildcard_icd_code, + load_from_codes( + codes_to_match=icd_codes, + code_col_name="diagnosegruppestreng", + output_col_name=output_col_name, + match_with_wildcard=wildcard_icd_code, n_rows=n_rows, + load_diagnoses=True, **kwargs, ) - for source_name, kwargs in diagnoses_source_table_info.items() + for _, kwargs in diagnoses_source_table_info.items() ] df = pd.concat(dfs).drop_duplicates( @@ -160,8 +73,8 @@ def concat_from_physical_visits( def from_physical_visits( - icd_code: str, - output_col_name_override: Optional[str] = "value", + icd_code: Union[list[str], str], + output_col_name: Optional[str] = "value", n_rows: Optional[int] = None, wildcard_icd_code: Optional[bool] = False, ) -> pd.DataFrame: @@ -181,15 +94,15 @@ def from_physical_visits( diagnoses_source_table_info = { "lpr3": { - "fct": "FOR_LPR3kontakter_psyk_somatik_inkl_2021", + "view": "FOR_LPR3kontakter_psyk_somatik_inkl_2021", "source_timestamp_col_name": "datotid_lpr3kontaktstart", }, "lpr2_inpatient": { - "fct": "FOR_indlaeggelser_psyk_somatik_LPR2_inkl_2021", + "view": "FOR_indlaeggelser_psyk_somatik_LPR2_inkl_2021", "source_timestamp_col_name": "datotid_indlaeggelse", }, "lpr2_outpatient": { - "fct": "FOR_besoeg_psyk_somatik_LPR2_inkl_2021", + "view": "FOR_besoeg_psyk_somatik_LPR2_inkl_2021", "source_timestamp_col_name": "datotid_start", }, } @@ -200,12 +113,14 @@ def from_physical_visits( n_rows_per_df = None dfs = [ - _load( - icd_code=icd_code, - output_col_name_override=output_col_name_override, - wildcard_icd_code=wildcard_icd_code, + load_from_codes( + codes_to_match=icd_code, + code_col_name="diagnosegruppestreng", + output_col_name=output_col_name, n_rows=n_rows_per_df, + match_with_wildcard=wildcard_icd_code, **kwargs, + load_diagnoses=True, ) for _, kwargs in diagnoses_source_table_info.items() ] diff --git a/src/psycop_feature_generation/loaders/raw/load_lab_results.py
b/src/psycop_feature_generation/loaders/raw/load_lab_results.py index 83fa21b7..4077f8f7 100644 --- a/src/psycop_feature_generation/loaders/raw/load_lab_results.py +++ b/src/psycop_feature_generation/loaders/raw/load_lab_results.py @@ -55,9 +55,7 @@ def load_non_numerical_values_and_coerce_inequalities( inplace=True, ) - if ineq2mult: - return multiply_inequalities_in_df(df, ineq2mult=ineq2mult) - return multiply_inequalities_in_df(df) + return multiply_inequalities_in_df(df, ineq2mult=ineq2mult) def load_numerical_values( diff --git a/src/psycop_feature_generation/loaders/raw/load_medications.py b/src/psycop_feature_generation/loaders/raw/load_medications.py index b3af1949..9174fba2 100644 --- a/src/psycop_feature_generation/loaders/raw/load_medications.py +++ b/src/psycop_feature_generation/loaders/raw/load_medications.py @@ -1,85 +1,29 @@ """Loaders for medications.""" -from typing import Optional +from typing import Optional, Union import pandas as pd from wasabi import msg -from psycop_feature_generation.loaders.raw.sql_load import sql_load +from psycop_feature_generation.loaders.raw.utils import load_from_codes from psycop_feature_generation.utils import data_loaders # pylint: disable=missing-function-docstring -def _load_one_source( - atc_code: str, - source_timestamp_col_name: str, - view: str, - output_col_name: Optional[str] = None, - wildcard_icd_code: Optional[bool] = False, - n_rows: Optional[int] = None, -) -> pd.DataFrame: - """Load the prescribed medications that match atc. If wildcard_icd_code, - match from atc_code*. Aggregates all that match. Beware that data is - incomplete prior to sep. 2016 for prescribed medications. - - Args: - atc_code (str): ATC string to match on. # noqa: DAR102 - source_timestamp_col_name (str): Name of the timestamp column in the SQL - table. - view (str): Which view to use, e.g. - "FOR_Medicin_ordineret_inkl_2021_feb2022" - output_col_name (str, optional): Name of new column string. Defaults to - None. 
- wildcard_icd_code (bool, optional): Whether to match on atc_code* or - atc_code. - n_rows (int, optional): Number of rows to return. Defaults to None. - - Returns: - pd.DataFrame: A pandas dataframe with dw_ek_borger, timestamp and - output_col_name = 1 - """ - - if wildcard_icd_code: - end_of_sql = "%" - else: - end_of_sql = "" # noqa - - view = f"[{view}]" - sql = ( - f"SELECT dw_ek_borger, {source_timestamp_col_name}, atc FROM [fct].{view}" - + f" WHERE {source_timestamp_col_name} IS NOT NULL AND (lower(atc)) LIKE lower('{atc_code}{end_of_sql}')" - ) - - df = sql_load(sql, database="USR_PS_FORSK", chunksize=None, n_rows=n_rows) - - if output_col_name is None: - output_col_name = atc_code - - df[output_col_name] = 1 - - df.drop(["atc"], axis="columns", inplace=True) - - return df.rename( - columns={ - source_timestamp_col_name: "timestamp", - }, - ) - - def load( - atc_code: str, + atc_code: Union[str, list[str]], output_col_name: Optional[str] = None, load_prescribed: Optional[bool] = False, load_administered: Optional[bool] = True, - wildcard_icd_code: Optional[bool] = True, + wildcard_code: Optional[bool] = True, n_rows: Optional[int] = None, ) -> pd.DataFrame: """Load medications. Aggregates prescribed/administered if both true. If - wildcard_icd_code, match from atc_code*. Aggregates all that match. Beware + wildcard_code, match from atc_code*. Aggregates all that match. Beware that data is incomplete prior to sep. 2016 for prescribed medications. Args: - atc_code (str): ATC-code prefix to load. Matches atc_code_prefix*. # noqa: DAR102 + atc_code (Union[str, list[str]]): ATC-code prefix, or list of prefixes, to load. Matches atc_code_prefix*. Aggregates all. output_col_name (str, optional): Name of output_col_name. Contains 1 if atc_code matches atc_code_prefix, 0 if not.Defaults to @@ -88,7 +32,7 @@ def load( False. Beware incomplete until sep 2016. load_administered (bool, optional): Whether to load administrations. Defaults to True. 
- wildcard_icd_code (bool, optional): Whether to match on atc_code* or + wildcard_code (bool, optional): Whether to match on atc_code* or atc_code. n_rows (int, optional): Number of rows to return. Defaults to None. @@ -104,35 +48,46 @@ def load( df = pd.DataFrame() + if load_prescribed and load_administered: + n_rows = int(n_rows / 2) if n_rows else None + if load_prescribed: - df_medication_prescribed = _load_one_source( - atc_code=atc_code, + df_medication_prescribed = load_from_codes( + codes_to_match=atc_code, + code_col_name="atc", source_timestamp_col_name="datotid_ordinationstart", view="FOR_Medicin_ordineret_inkl_2021_feb2022", output_col_name=output_col_name, - wildcard_icd_code=wildcard_icd_code, + match_with_wildcard=wildcard_code, n_rows=n_rows, + load_diagnoses=False, ) df = pd.concat([df, df_medication_prescribed]) if load_administered: - df_medication_administered = _load_one_source( - atc_code=atc_code, + df_medication_administered = load_from_codes( + codes_to_match=atc_code, + code_col_name="atc", source_timestamp_col_name="datotid_administration_start", view="FOR_Medicin_administreret_inkl_2021_feb2022", output_col_name=output_col_name, - wildcard_icd_code=wildcard_icd_code, + match_with_wildcard=wildcard_code, n_rows=n_rows, + load_diagnoses=False, ) df = pd.concat([df, df_medication_administered]) if output_col_name is None: - output_col_name = atc_code + if isinstance(atc_code, list): + # Joint list of atc_codes + output_col_name = "_".join(atc_code) + else: + output_col_name = atc_code df.rename( columns={ - atc_code: "value", + output_col_name: "value", }, inplace=True, ) @@ -185,7 +140,7 @@ def antipsychotics(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code="N05A", load_prescribed=True, load_administered=True, - wildcard_icd_code=True, + wildcard_code=True, n_rows=n_rows, ) @@ -205,7 +160,7 @@ def first_gen_antipsychotics(n_rows: Optional[int] = None) -> pd.DataFrame: ], load_prescribed=True, load_administered=True, - 
wildcard_icd_code=False, + wildcard_code=False, n_rows=n_rows, ) @@ -231,7 +186,7 @@ def second_gen_antipsychotics(n_rows: Optional[int] = None) -> pd.DataFrame: ], load_prescribed=True, load_administered=True, - wildcard_icd_code=False, + wildcard_code=False, n_rows=n_rows, ) @@ -242,7 +197,7 @@ def olanzapine(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code="N05AH03", load_prescribed=True, load_administered=True, - wildcard_icd_code=False, + wildcard_code=False, n_rows=n_rows, ) @@ -253,7 +208,7 @@ def clozapine(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code="N05AH02", load_prescribed=True, load_administered=True, - wildcard_icd_code=False, + wildcard_code=False, n_rows=n_rows, ) @@ -264,7 +219,7 @@ def anxiolytics(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code="N05B", load_prescribed=False, load_administered=True, - wildcard_icd_code=True, + wildcard_code=True, n_rows=n_rows, ) @@ -275,7 +230,7 @@ def hypnotics(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code="N05C", load_prescribed=False, load_administered=True, - wildcard_icd_code=True, + wildcard_code=True, n_rows=n_rows, ) @@ -286,7 +241,7 @@ def antidepressives(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code="N06A", load_prescribed=False, load_administered=True, - wildcard_icd_code=True, + wildcard_code=True, n_rows=n_rows, ) @@ -298,7 +253,7 @@ def ssri(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code=["N06AB10", "N06AB04", "N06AB08", "N06AB03", "N06AB05", "N06AB06"], load_prescribed=True, load_administered=True, - wildcard_icd_code=False, + wildcard_code=False, n_rows=n_rows, ) @@ -310,7 +265,7 @@ def snri(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code=["N06AX21", "N06AX16"], load_prescribed=True, load_administered=True, - wildcard_icd_code=False, + wildcard_code=False, n_rows=n_rows, ) @@ -322,7 +277,7 @@ def tca(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code=["N06AA09", "N06AA04", "N06AA02", "N06AA10", "N06AA16"], load_prescribed=True, 
load_administered=True, - wildcard_icd_code=False, + wildcard_code=False, n_rows=n_rows, ) @@ -333,7 +288,7 @@ def lithium(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code="N05AN01", load_prescribed=True, load_administered=True, - wildcard_icd_code=False, + wildcard_code=False, n_rows=n_rows, ) @@ -344,7 +299,7 @@ def hyperactive_disorders_medications(n_rows: Optional[int] = None) -> pd.DataFr atc_code="N06B", load_prescribed=False, load_administered=True, - wildcard_icd_code=True, + wildcard_code=True, n_rows=n_rows, ) @@ -355,7 +310,7 @@ def dementia_medications(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code="N06D", load_prescribed=False, load_administered=True, - wildcard_icd_code=True, + wildcard_code=True, n_rows=n_rows, ) @@ -366,7 +321,7 @@ def anti_epileptics(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code="N03", load_prescribed=False, load_administered=True, - wildcard_icd_code=True, + wildcard_code=True, n_rows=n_rows, ) @@ -378,7 +333,7 @@ def alcohol_abstinence(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code=["A11DA01", "A11EA", "N05BA02", "N03AA02"], load_prescribed=True, load_administered=True, - wildcard_icd_code=False, + wildcard_code=False, n_rows=n_rows, ) @@ -390,7 +345,7 @@ def alimentary_medications(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code="A", load_prescribed=False, load_administered=True, - wildcard_icd_code=True, + wildcard_code=True, n_rows=n_rows, ) @@ -401,7 +356,7 @@ def blood_medications(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code="B", load_prescribed=False, load_administered=True, - wildcard_icd_code=True, + wildcard_code=True, n_rows=n_rows, ) @@ -412,7 +367,7 @@ def cardiovascular_medications(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code="C", load_prescribed=False, load_administered=True, - wildcard_icd_code=True, + wildcard_code=True, n_rows=n_rows, ) @@ -423,7 +378,7 @@ def dermatological_medications(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code="D", 
load_prescribed=False, load_administered=True, - wildcard_icd_code=True, + wildcard_code=True, n_rows=n_rows, ) @@ -434,7 +389,7 @@ def genito_sex_medications(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code="G", load_prescribed=False, load_administered=True, - wildcard_icd_code=True, + wildcard_code=True, n_rows=n_rows, ) @@ -445,7 +400,7 @@ def hormonal_medications(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code="H", load_prescribed=False, load_administered=True, - wildcard_icd_code=True, + wildcard_code=True, n_rows=n_rows, ) @@ -456,7 +411,7 @@ def antiinfectives(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code="J", load_prescribed=False, load_administered=True, - wildcard_icd_code=True, + wildcard_code=True, n_rows=n_rows, ) @@ -467,7 +422,7 @@ def antineoplastic(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code="L", load_prescribed=False, load_administered=True, - wildcard_icd_code=True, + wildcard_code=True, n_rows=n_rows, ) @@ -478,7 +433,7 @@ def musculoskeletal_medications(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code="M", load_prescribed=False, load_administered=True, - wildcard_icd_code=True, + wildcard_code=True, n_rows=n_rows, ) @@ -489,7 +444,7 @@ def nervous_system_medications(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code="N", load_prescribed=False, load_administered=True, - wildcard_icd_code=True, + wildcard_code=True, n_rows=n_rows, ) @@ -500,7 +455,7 @@ def analgesic(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code="N02", load_prescribed=False, load_administered=True, - wildcard_icd_code=True, + wildcard_code=True, n_rows=n_rows, ) @@ -511,7 +466,7 @@ def antiparasitic(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code="P", load_prescribed=False, load_administered=True, - wildcard_icd_code=True, + wildcard_code=True, n_rows=n_rows, ) @@ -522,7 +477,7 @@ def respiratory_medications(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code="R", load_prescribed=False, load_administered=True, 
- wildcard_icd_code=True, + wildcard_code=True, n_rows=n_rows, ) @@ -533,7 +488,7 @@ def sensory_medications(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code="S", load_prescribed=False, load_administered=True, - wildcard_icd_code=True, + wildcard_code=True, n_rows=n_rows, ) @@ -544,6 +499,6 @@ def various_medications(n_rows: Optional[int] = None) -> pd.DataFrame: atc_code="V", load_prescribed=False, load_administered=True, - wildcard_icd_code=True, + wildcard_code=True, n_rows=n_rows, ) diff --git a/src/psycop_feature_generation/loaders/raw/load_text.py b/src/psycop_feature_generation/loaders/raw/load_text.py index 7c20e2a6..2f532e54 100644 --- a/src/psycop_feature_generation/loaders/raw/load_text.py +++ b/src/psycop_feature_generation/loaders/raw/load_text.py @@ -1,138 +1,139 @@ -"""Load text data from a database and featurise it using a tf-idf -vectorizer.""" - -# pylint: disable=E0211,E0213,missing-function-docstring - -from functools import partial -from multiprocessing import Pool -from pathlib import Path -from typing import Optional, Union - -import dill as pkl -import pandas as pd -import torch -from transformers import AutoModel, AutoTokenizer -from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions - -from psycop_feature_generation.loaders.raw.sql_load import sql_load -from psycop_feature_generation.utils import PROJECT_ROOT, data_loaders - - -def get_all_valid_note_types() -> set[str]: - """Returns a set of valid note types. Notice that 'Konklusion' is replaced - by 'Vurdering/konklusion' in 2020, so make sure to use both. 'Ordination' - was replaced by 'Ordination, Psykiatry' in 2022, but 'Ordination, - Psykiatri' is not included in the table. Use with caution. - - Returns: - Set[str]: Set of valid note types - """ - return { - "Observation af patient, Psykiatri", - "Samtale med behandlingssigte", - "Ordination", # OBS replaced "Ordination, Psykiatri" in 01/02-22 - # but is not included in this table. 
Use with caution - "Aktuelt psykisk", - "Aktuelt socialt, Psykiatri", - "Aftaler, Psykiatri", - "Medicin", - "Aktuelt somatisk, Psykiatri", - "Objektivt psykisk", - "Kontaktårsag", - "Telefonkonsultation", - "Journalnotat", - "Telefonnotat", - "Objektivt, somatisk", - "Plan", - "Semistruktureret diagnostisk interview", - "Vurdering/konklusion", - } - - -def _load_notes_for_year( - note_types: Union[str, list[str]], - year: str, - view: Optional[str] = "FOR_SFI_fritekst_resultat_udfoert_i_psykiatrien_aendret", - n_rows: Optional[int] = None, -) -> pd.DataFrame: - """Loads clinical notes from sql from a specified year and matching - specified note types. - - Args: - note_names (Union[str, list[str]]): Which types of notes to load. - year (str): Which year to load - view (str, optional): Which table to load. - Defaults to "[FOR_SFI_fritekst_resultat_udfoert_i_psykiatrien_aendret". - n_rows (Optional[int], optional): Number of rows to load. Defaults to None. - - Returns: - pd.DataFrame: Dataframe with clinical notes - """ - - sql = ( - "SELECT dw_ek_borger, datotid_senest_aendret_i_sfien, fritekst" - + f" FROM [fct].[{view}_{year}_inkl_2021_feb2022]" - + f" WHERE overskrift IN {note_types}" - ) - return sql_load( - sql, - database="USR_PS_FORSK", - chunksize=None, - n_rows=n_rows, - ) - - -def _tfidf_featurize( - df: pd.DataFrame, - tfidf_path: Path, - text_col: str = "text", -) -> pd.DataFrame: - """TF-IDF featurize text. Assumes `df` to have a column named `text`. - - Args: - df (pd.DataFrame): Dataframe with text column - tfidf_path (Optional[Path]): Path to a sklearn tf-idf vectorizer - text_col (str, optional): Name of text column. Defaults to "text". 
- - Returns: - pd.DataFrame: Original dataframe with tf-idf features appended - """ - with open(tfidf_path, "rb") as f: - tfidf = pkl.load(f) - - vocab = ["tfidf-" + word for word in tfidf.get_feature_names()] - - text = df[text_col].values - df = df.drop(text_col, axis=1).reset_index(drop=True) - - text = tfidf.transform(text) - text = pd.DataFrame(text.toarray(), columns=vocab) - return pd.concat([df, text], axis=1) - - -def _mean_pooling( - model_output: BaseModelOutputWithPoolingAndCrossAttentions, - attention_mask: torch.Tensor, -) -> torch.Tensor: - """Mean Pooling - take attention mask into account for correct averaging. - - Args: - model_output (BaseModelOutputWithPoolingAndCrossAttentions): model output from pretrained Huggingface transformer - attention_mask (torch.Tensor): attention mask from from pretrained Hugginface tokenizer - - Returns: - np.ndarray: numpy array with mean pooled embeddings - """ - token_embeddings = model_output[ - 0 - ] # first element of model_output contains all token embeddings - input_mask_expanded = ( - attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() - ) - return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp( - input_mask_expanded.sum(1), - min=1e-9, - ) +# """Load text data from a database and featurise it using a tf-idf +# vectorizer.""" + +# # pylint: disable=E0211,E0213,missing-function-docstring + +# from functools import partial +# from multiprocessing import Pool +# from pathlib import Path +# from typing import Optional, Union + +# import dill as pkl +# import pandas as pd + +# # import torch +# from transformers import AutoModel, AutoTokenizer +# from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions + +# from psycop_feature_generation.loaders.raw.sql_load import sql_load +# from psycop_feature_generation.utils import PROJECT_ROOT, data_loaders + + +# def get_all_valid_note_types() -> set[str]: +# """Returns a set of valid note types. 
Notice that 'Konklusion' is replaced +# by 'Vurdering/konklusion' in 2020, so make sure to use both. 'Ordination' +# was replaced by 'Ordination, Psykiatry' in 2022, but 'Ordination, +# Psykiatri' is not included in the table. Use with caution. + +# Returns: +# Set[str]: Set of valid note types +# """ +# return { +# "Observation af patient, Psykiatri", +# "Samtale med behandlingssigte", +# "Ordination", # OBS replaced "Ordination, Psykiatri" in 01/02-22 +# # but is not included in this table. Use with caution +# "Aktuelt psykisk", +# "Aktuelt socialt, Psykiatri", +# "Aftaler, Psykiatri", +# "Medicin", +# "Aktuelt somatisk, Psykiatri", +# "Objektivt psykisk", +# "Kontaktårsag", +# "Telefonkonsultation", +# "Journalnotat", +# "Telefonnotat", +# "Objektivt, somatisk", +# "Plan", +# "Semistruktureret diagnostisk interview", +# "Vurdering/konklusion", +# } + + +# def _load_notes_for_year( +# note_types: Union[str, list[str]], +# year: str, +# view: Optional[str] = "FOR_SFI_fritekst_resultat_udfoert_i_psykiatrien_aendret", +# n_rows: Optional[int] = None, +# ) -> pd.DataFrame: +# """Loads clinical notes from sql from a specified year and matching +# specified note types. + +# Args: +# note_names (Union[str, list[str]]): Which types of notes to load. +# year (str): Which year to load +# view (str, optional): Which table to load. +# Defaults to "[FOR_SFI_fritekst_resultat_udfoert_i_psykiatrien_aendret". +# n_rows (Optional[int], optional): Number of rows to load. Defaults to None. + +# Returns: +# pd.DataFrame: Dataframe with clinical notes +# """ + +# sql = ( +# "SELECT dw_ek_borger, datotid_senest_aendret_i_sfien, fritekst" +# + f" FROM [fct].[{view}_{year}_inkl_2021_feb2022]" +# + f" WHERE overskrift IN {note_types}" +# ) +# return sql_load( +# sql, +# database="USR_PS_FORSK", +# chunksize=None, +# n_rows=n_rows, +# ) + + +# def _tfidf_featurize( +# df: pd.DataFrame, +# tfidf_path: Path, +# text_col: str = "text", +# ) -> pd.DataFrame: +# """TF-IDF featurize text. 
Assumes `df` to have a column named `text`. + +# Args: +# df (pd.DataFrame): Dataframe with text column +# tfidf_path (Optional[Path]): Path to a sklearn tf-idf vectorizer +# text_col (str, optional): Name of text column. Defaults to "text". + +# Returns: +# pd.DataFrame: Original dataframe with tf-idf features appended +# """ +# with open(tfidf_path, "rb") as f: +# tfidf = pkl.load(f) + +# vocab = ["tfidf-" + word for word in tfidf.get_feature_names()] + +# text = df[text_col].values +# df = df.drop(text_col, axis=1).reset_index(drop=True) + +# text = tfidf.transform(text) +# text = pd.DataFrame(text.toarray(), columns=vocab) +# return pd.concat([df, text], axis=1) + + +# def _mean_pooling( +# model_output: BaseModelOutputWithPoolingAndCrossAttentions, +# attention_mask: torch.Tensor, +# ) -> torch.Tensor: +# """Mean Pooling - take attention mask into account for correct averaging. + +# Args: +# model_output (BaseModelOutputWithPoolingAndCrossAttentions): model output from pretrained Huggingface transformer +# attention_mask (torch.Tensor): attention mask from from pretrained Hugginface tokenizer + +# Returns: +# np.ndarray: numpy array with mean pooled embeddings +# """ +# token_embeddings = model_output[ +# 0 +# ] # first element of model_output contains all token embeddings +# input_mask_expanded = ( +# attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() +# ) +# return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp( +# input_mask_expanded.sum(1), +# min=1e-9, +# ) def _chunk_text(text: str, seq_length: int) -> list[str]: @@ -163,297 +164,297 @@ def _chunk_text(text: str, seq_length: int) -> list[str]: return chunks -def _huggingface_featurize( - df: pd.DataFrame, - model_id: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", - text_col: str = "text", -) -> pd.DataFrame: - """Featurize text using a huggingface model and generate a dataframe with - the embeddings. 
If the text is longer than the maximum sequence length of - the model, the text is split into chunks and embeddings are averaged across - chunks. - - Args: - df (pd.DataFrame): Dataframe with text column - model_id (str): Which huggingface model to use. See https://huggingface.co/models for a list of models. Assumes the model is a transformer model and has both a tokenizer and a model. - text_col (str, optional): Name of text column. Defaults to "text". - - Returns: - pd.DataFrame: Original dataframe with huggingface embeddings appended - - Example: - >>> p = PROJECT_ROOT / "tests" / "test_data" / "raw" - >>> huggingface_model_id = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" - >>> df_p = p / "synth_txt_data.csv" - - >>> df = pd.read_csv(df_p) - >>> df = df.dropna() - - >>> x = _huggingface_featurize(df, huggingface_model_id) - """ - tokenizer = AutoTokenizer.from_pretrained(model_id) - model = AutoModel.from_pretrained(model_id) - - df = df[df[text_col].notna()] - text = df[text_col].values - df = df.drop(text_col, axis=1) - - max_seq_length = int( - tokenizer.model_max_length / 1.5, - ) # allowing space for more word piece tokens than words in original sequence - - list_of_embeddings = [] - for txt in text: - chunks = _chunk_text(txt, max_seq_length) - - encoded_input = tokenizer( - chunks, - padding=True, - truncation=True, - return_tensors="pt", - ) - - with torch.no_grad(): - model_output = model(**encoded_input) - - embedding = _mean_pooling(model_output, encoded_input["attention_mask"]) - - if len(chunks) > 1: - list_of_embeddings.append(torch.mean(embedding, axis=0).numpy()) # type: ignore - else: - list_of_embeddings.append(embedding.numpy()[0]) - - embeddings_df = pd.DataFrame(list_of_embeddings) - embeddings_df.columns = [ - "embedding-" + str(dimension) for dimension in range(embeddings_df.shape[1]) - ] - - return pd.concat([df, embeddings_df], axis=1) - - -def _load_and_featurize_notes_per_year( - year: str, - note_types: Union[str, 
list[str]], - view: str, - n_rows: int, - featurizer: str, - featurizer_kwargs: dict, -) -> pd.DataFrame: - """Loads clinical notes and features them. - - Args: - note_types (Union[str, list[str]]): Which note types to load. - year (str): Which year to load - view (str): Which view to load - n_rows (int): How many rows to load - featurizer (str): Which featurizer to use (tfidf or huggingface) - featurizer_kwargs (dict): kwargs for the featurizer - - Returns: - pd.DataFrame: Dataframe of notes and features - """ - - df = _load_notes_for_year( - note_types=note_types, - year=year, - view=view, - n_rows=n_rows, - ) - if featurizer == "tfidf": - df = _tfidf_featurize(df, **featurizer_kwargs) - elif featurizer == "huggingface": - df = _huggingface_featurize(df, **featurizer_kwargs) - return df - - -def load_and_featurize_notes( - note_types: Union[str, list[str]], - featurizer: str, - featurizer_kwargs: Optional[dict] = None, - n_rows: Optional[int] = None, -) -> pd.DataFrame: - """Loads all clinical notes that match the specified note from all years. - Featurizes the notes using the specified featurizer (tf-idf or huggingface - model). Kwargs passed to. - - Args: - note_types (Union[str, list[str]]): Which note types to load. See - `get_all_valid_note_types()` for valid note types. - featurizer (str): Which featurizer to use. Either 'tf-idf' or 'huggingface' or - `None` to return the raw text. - featurizer_kwargs (Optional[dict]): Kwargs passed to the featurizer. Defaults to None. - For tf-idf, this is `tfidf_path` to the vectorizer. For huggingface, - this is `model_id` to the model. - n_rows (Optional[int], optional): How many rows to load. Defaults to None. 
- - Raises: - ValueError: If given invalid featurizer - ValueError: If given invlaid note type - - Returns: - pd.DataFrame: Featurized clinical notes - """ - - valid_featurizers = {"tfidf", "huggingface", None} - if featurizer not in valid_featurizers: - raise ValueError( - f"featurizer must be one of {valid_featurizers}, got {featurizer}", - ) - - if isinstance(note_types, str): - note_types = list(note_types) # pylint: disable=W0642 - # check for invalid note types - if not set(note_types).issubset(get_all_valid_note_types()): - raise ValueError( - "Invalid note type. Valid note types are: " - + str(get_all_valid_note_types()), - ) - - # convert note_types to sql query - note_types = "('" + "', '".join(note_types) + "')" # pylint: disable=W0642 - - view = "FOR_SFI_fritekst_resultat_udfoert_i_psykiatrien_aendret" - - load_and_featurize = partial( - _load_and_featurize_notes_per_year, - note_types=note_types, - view=view, - n_rows=n_rows, - featurizer=featurizer, - featurizer_kwargs=featurizer_kwargs, - ) - - years = list(range(2011, 2022)) - - with Pool(processes=len(years)) as p: - dfs = p.map(load_and_featurize, [str(y) for y in years]) - - df = pd.concat(dfs) - - df = df.rename( - {"datotid_senest_aendret_i_sfien": "timestamp", "fritekst": "text"}, - axis=1, - ) - return df - - -@data_loaders.register("all_notes") -def load_all_notes( - featurizer: str, - n_rows: Optional[int] = None, - featurizer_kwargs: Optional[dict] = None, -) -> pd.DataFrame: - """Returns all notes from all years. Featurizes the notes using the - specified featurizer ('tfidf', 'huggingface', or `None` for raw text). - `featurizer_kwargs` are passed to the featurizer (e.g. "tfidf_path" for - tfidf, and "model_id" for huggingface). - - Args: - featurizer (str): Which featurizer to use. Either 'tf-idf', 'huggingface', or None - n_rows (Optional[int], optional): Number of rows to load. Defaults to None. 
- featurizer_kwargs (Optional[dict], optional): Keyword arguments passed to - the featurizer. Defaults to None. - - Returns: - pd.DataFrame: (Featurized) notes - """ - return load_and_featurize_notes( - note_types=get_all_valid_note_types(), - featurizer=featurizer, - n_rows=n_rows, - featurizer_kwargs=featurizer_kwargs, - ) - - -@data_loaders.register("aktuelt_psykisk") -def load_aktuel_psykisk( - featurizer: str, - n_rows: Optional[int] = None, - featurizer_kwargs: Optional[dict] = None, -) -> pd.DataFrame: - """Returns 'Aktuelt psykisk' notes from all years. Featurizes the notes - using the specified featurizer ('tfidf', 'huggingface', or `None` for raw - text). `featurizer_kwargs` are passed to the featurizer (e.g. "tfidf_path" - for tfidf, and "model_id" for huggingface). - - Args: - featurizer (str): Which featurizer to use. Either 'tf-idf', 'huggingface', or None - n_rows (Optional[int], optional): Number of rows to load. Defaults to None. - featurizer_kwargs (Optional[dict], optional): Keyword arguments passed to - the featurizer. Defaults to None. - - Returns: - pd.DataFrame: (Featurized) notes - """ - return load_and_featurize_notes( - note_types="Aktuelt psykisk", - featurizer=featurizer, - n_rows=n_rows, - featurizer_kwargs=featurizer_kwargs, - ) - - -@data_loaders.register("load_note_types") -def load_arbitrary_notes( - note_names: Union[str, list[str]], - featurizer: str, - n_rows: Optional[int] = None, - featurizer_kwargs: Optional[dict] = None, -) -> pd.DataFrame: - """Returns one or multiple note types from all years. Featurizes the notes - using the specified featurizer ('tfidf', 'huggingface', or `None` for raw - text). `featurizer_kwargs` are passed to the featurizer (e.g. "tfidf_path" - for tfidf, and "model_id" for huggingface). - - Args: - note_names (Union[str, list[str]]): Which note types to load. See - `get_all_valid_note_types()` for a list of valid note types. - featurizer (str): Which featurizer to use. 
Either 'tf-idf', 'huggingface', or None - n_rows (Optional[int], optional): Number of rows to load. Defaults to None. - featurizer_kwargs (Optional[dict], optional): Keyword arguments passed to - the featurizer. Defaults to None. - - Returns: - pd.DataFrame: (Featurized) notes - """ - return load_and_featurize_notes( - note_names, - featurizer=featurizer, - n_rows=n_rows, - featurizer_kwargs=featurizer_kwargs, - ) - - -@data_loaders.register("synth_notes") -def load_synth_notes(featurizer: str, **featurizer_kwargs) -> pd.DataFrame: - """Load (featurized) synthetic notes for testing. - - Args: - featurizer (str): Which featurizer to use - **featurizer_kwargs: Keyword arguments passed to the featurizer - - Raises: - ValueError: If given invalid featurizer - - Returns: - pd.DataFrame: (Featurized) synthetic notes - """ - p = PROJECT_ROOT / "tests" / "test_data" - df = pd.read_csv( - p / "raw" / "synth_txt_data.csv", - ).drop("Unnamed: 0", axis=1) - df = df.dropna() - df["timestamp"] = pd.to_datetime(df["timestamp"]) - - if featurizer == "tfidf": - return _tfidf_featurize( - df, - tfidf_path=p / "test_tfidf" / "tfidf_10.pkl", - ) - elif featurizer == "huggingface": - return _huggingface_featurize( - df, - **featurizer_kwargs, - ) - - raise ValueError("Only tfidf or huggingface featurizer supported for synth notes") +# def _huggingface_featurize( +# df: pd.DataFrame, +# model_id: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", +# text_col: str = "text", +# ) -> pd.DataFrame: +# """Featurize text using a huggingface model and generate a dataframe with +# the embeddings. If the text is longer than the maximum sequence length of +# the model, the text is split into chunks and embeddings are averaged across +# chunks. + +# Args: +# df (pd.DataFrame): Dataframe with text column +# model_id (str): Which huggingface model to use. See https://huggingface.co/models for a list of models. 
Assumes the model is a transformer model and has both a tokenizer and a model. +# text_col (str, optional): Name of text column. Defaults to "text". + +# Returns: +# pd.DataFrame: Original dataframe with huggingface embeddings appended + +# Example: +# >>> p = PROJECT_ROOT / "tests" / "test_data" / "raw" +# >>> huggingface_model_id = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" +# >>> df_p = p / "synth_txt_data.csv" + +# >>> df = pd.read_csv(df_p) +# >>> df = df.dropna() + +# >>> x = _huggingface_featurize(df, huggingface_model_id) +# """ +# tokenizer = AutoTokenizer.from_pretrained(model_id) +# model = AutoModel.from_pretrained(model_id) + +# df = df[df[text_col].notna()] +# text = df[text_col].values +# df = df.drop(text_col, axis=1) + +# max_seq_length = int( +# tokenizer.model_max_length / 1.5, +# ) # allowing space for more word piece tokens than words in original sequence + +# list_of_embeddings = [] +# for txt in text: +# chunks = _chunk_text(txt, max_seq_length) + +# encoded_input = tokenizer( +# chunks, +# padding=True, +# truncation=True, +# return_tensors="pt", +# ) + +# with torch.no_grad(): +# model_output = model(**encoded_input) + +# embedding = _mean_pooling(model_output, encoded_input["attention_mask"]) + +# if len(chunks) > 1: +# list_of_embeddings.append(torch.mean(embedding, axis=0).numpy()) # type: ignore +# else: +# list_of_embeddings.append(embedding.numpy()[0]) + +# embeddings_df = pd.DataFrame(list_of_embeddings) +# embeddings_df.columns = [ +# "embedding-" + str(dimension) for dimension in range(embeddings_df.shape[1]) +# ] + +# return pd.concat([df, embeddings_df], axis=1) + + +# def _load_and_featurize_notes_per_year( +# year: str, +# note_types: Union[str, list[str]], +# view: str, +# n_rows: int, +# featurizer: str, +# featurizer_kwargs: dict, +# ) -> pd.DataFrame: +# """Loads clinical notes and features them. + +# Args: +# note_types (Union[str, list[str]]): Which note types to load. 
+# year (str): Which year to load +# view (str): Which view to load +# n_rows (int): How many rows to load +# featurizer (str): Which featurizer to use (tfidf or huggingface) +# featurizer_kwargs (dict): kwargs for the featurizer + +# Returns: +# pd.DataFrame: Dataframe of notes and features +# """ + +# df = _load_notes_for_year( +# note_types=note_types, +# year=year, +# view=view, +# n_rows=n_rows, +# ) +# if featurizer == "tfidf": +# df = _tfidf_featurize(df, **featurizer_kwargs) +# elif featurizer == "huggingface": +# df = _huggingface_featurize(df, **featurizer_kwargs) +# return df + + +# def load_and_featurize_notes( +# note_types: Union[str, list[str]], +# featurizer: str, +# featurizer_kwargs: Optional[dict] = None, +# n_rows: Optional[int] = None, +# ) -> pd.DataFrame: +# """Loads all clinical notes that match the specified note from all years. +# Featurizes the notes using the specified featurizer (tf-idf or huggingface +# model). Kwargs passed to. + +# Args: +# note_types (Union[str, list[str]]): Which note types to load. See +# `get_all_valid_note_types()` for valid note types. +# featurizer (str): Which featurizer to use. Either 'tf-idf' or 'huggingface' or +# `None` to return the raw text. +# featurizer_kwargs (Optional[dict]): Kwargs passed to the featurizer. Defaults to None. +# For tf-idf, this is `tfidf_path` to the vectorizer. For huggingface, +# this is `model_id` to the model. +# n_rows (Optional[int], optional): How many rows to load. Defaults to None. 
+ +# Raises: +# ValueError: If given invalid featurizer +# ValueError: If given invlaid note type + +# Returns: +# pd.DataFrame: Featurized clinical notes +# """ + +# valid_featurizers = {"tfidf", "huggingface", None} +# if featurizer not in valid_featurizers: +# raise ValueError( +# f"featurizer must be one of {valid_featurizers}, got {featurizer}", +# ) + +# if isinstance(note_types, str): +# note_types = list(note_types) # pylint: disable=W0642 +# # check for invalid note types +# if not set(note_types).issubset(get_all_valid_note_types()): +# raise ValueError( +# "Invalid note type. Valid note types are: " +# + str(get_all_valid_note_types()), +# ) + +# # convert note_types to sql query +# note_types = "('" + "', '".join(note_types) + "')" # pylint: disable=W0642 + +# view = "FOR_SFI_fritekst_resultat_udfoert_i_psykiatrien_aendret" + +# load_and_featurize = partial( +# _load_and_featurize_notes_per_year, +# note_types=note_types, +# view=view, +# n_rows=n_rows, +# featurizer=featurizer, +# featurizer_kwargs=featurizer_kwargs, +# ) + +# years = list(range(2011, 2022)) + +# with Pool(processes=len(years)) as p: +# dfs = p.map(load_and_featurize, [str(y) for y in years]) + +# df = pd.concat(dfs) + +# df = df.rename( +# {"datotid_senest_aendret_i_sfien": "timestamp", "fritekst": "text"}, +# axis=1, +# ) +# return df + + +# @data_loaders.register("all_notes") +# def load_all_notes( +# featurizer: str, +# n_rows: Optional[int] = None, +# featurizer_kwargs: Optional[dict] = None, +# ) -> pd.DataFrame: +# """Returns all notes from all years. Featurizes the notes using the +# specified featurizer ('tfidf', 'huggingface', or `None` for raw text). +# `featurizer_kwargs` are passed to the featurizer (e.g. "tfidf_path" for +# tfidf, and "model_id" for huggingface). + +# Args: +# featurizer (str): Which featurizer to use. Either 'tf-idf', 'huggingface', or None +# n_rows (Optional[int], optional): Number of rows to load. Defaults to None. 
+# featurizer_kwargs (Optional[dict], optional): Keyword arguments passed to +# the featurizer. Defaults to None. + +# Returns: +# pd.DataFrame: (Featurized) notes +# """ +# return load_and_featurize_notes( +# note_types=get_all_valid_note_types(), +# featurizer=featurizer, +# n_rows=n_rows, +# featurizer_kwargs=featurizer_kwargs, +# ) + + +# @data_loaders.register("aktuelt_psykisk") +# def load_aktuel_psykisk( +# featurizer: str, +# n_rows: Optional[int] = None, +# featurizer_kwargs: Optional[dict] = None, +# ) -> pd.DataFrame: +# """Returns 'Aktuelt psykisk' notes from all years. Featurizes the notes +# using the specified featurizer ('tfidf', 'huggingface', or `None` for raw +# text). `featurizer_kwargs` are passed to the featurizer (e.g. "tfidf_path" +# for tfidf, and "model_id" for huggingface). + +# Args: +# featurizer (str): Which featurizer to use. Either 'tf-idf', 'huggingface', or None +# n_rows (Optional[int], optional): Number of rows to load. Defaults to None. +# featurizer_kwargs (Optional[dict], optional): Keyword arguments passed to +# the featurizer. Defaults to None. + +# Returns: +# pd.DataFrame: (Featurized) notes +# """ +# return load_and_featurize_notes( +# note_types="Aktuelt psykisk", +# featurizer=featurizer, +# n_rows=n_rows, +# featurizer_kwargs=featurizer_kwargs, +# ) + + +# @data_loaders.register("load_note_types") +# def load_arbitrary_notes( +# note_names: Union[str, list[str]], +# featurizer: str, +# n_rows: Optional[int] = None, +# featurizer_kwargs: Optional[dict] = None, +# ) -> pd.DataFrame: +# """Returns one or multiple note types from all years. Featurizes the notes +# using the specified featurizer ('tfidf', 'huggingface', or `None` for raw +# text). `featurizer_kwargs` are passed to the featurizer (e.g. "tfidf_path" +# for tfidf, and "model_id" for huggingface). + +# Args: +# note_names (Union[str, list[str]]): Which note types to load. See +# `get_all_valid_note_types()` for a list of valid note types. 
+# featurizer (str): Which featurizer to use. Either 'tf-idf', 'huggingface', or None +# n_rows (Optional[int], optional): Number of rows to load. Defaults to None. +# featurizer_kwargs (Optional[dict], optional): Keyword arguments passed to +# the featurizer. Defaults to None. + +# Returns: +# pd.DataFrame: (Featurized) notes +# """ +# return load_and_featurize_notes( +# note_names, +# featurizer=featurizer, +# n_rows=n_rows, +# featurizer_kwargs=featurizer_kwargs, +# ) + + +# @data_loaders.register("synth_notes") +# def load_synth_notes(featurizer: str, **featurizer_kwargs) -> pd.DataFrame: +# """Load (featurized) synthetic notes for testing. + +# Args: +# featurizer (str): Which featurizer to use +# **featurizer_kwargs: Keyword arguments passed to the featurizer + +# Raises: +# ValueError: If given invalid featurizer + +# Returns: +# pd.DataFrame: (Featurized) synthetic notes +# """ +# p = PROJECT_ROOT / "tests" / "test_data" +# df = pd.read_csv( +# p / "raw" / "synth_txt_data.csv", +# ).drop("Unnamed: 0", axis=1) +# df = df.dropna() +# df["timestamp"] = pd.to_datetime(df["timestamp"]) + +# if featurizer == "tfidf": +# return _tfidf_featurize( +# df, +# tfidf_path=p / "test_tfidf" / "tfidf_10.pkl", +# ) +# elif featurizer == "huggingface": +# return _huggingface_featurize( +# df, +# **featurizer_kwargs, +# ) + +# raise ValueError("Only tfidf or huggingface featurizer supported for synth notes") diff --git a/src/psycop_feature_generation/loaders/raw/utils.py b/src/psycop_feature_generation/loaders/raw/utils.py new file mode 100644 index 00000000..e9989e8e --- /dev/null +++ b/src/psycop_feature_generation/loaders/raw/utils.py @@ -0,0 +1,148 @@ +"""Example of.""" + +from typing import Optional, Union + +import pandas as pd + +from psycop_feature_generation.loaders.raw.sql_load import sql_load + + +def str_to_sql_match_logic( + code_to_match: str, + code_sql_col_name: str, + load_diagnoses: bool, + match_with_wildcard: bool, +): + """Generate SQL match logic from a 
single string. + + Args: + code_to_match (list[str]): List of strings to match. + code_sql_col_name (str): Name of the SQL column containing the codes. + load_diagnoses (bool): Whether to load diagnoses or medications. Determines the logic. See calling function for more. + match_with_wildcard (bool): Whether to match on icd_code* / atc_code* or only icd_code / atc_code. + """ + base_query = f"lower({code_sql_col_name}) LIKE '%{code_to_match.lower()}" + + if match_with_wildcard: + return f"{base_query}%'" + + if load_diagnoses: + return f"{base_query} OR {base_query}#%'" + + return base_query + + +def list_to_sql_logic( + codes_to_match: list[str], + code_sql_col_name: str, + load_diagnoses: bool, + match_with_wildcard: bool, +): + """Generate SQL match logic from a list of strings. + + Args: + codes_to_match (list[str]): List of strings to match. + code_sql_col_name (str): Name of the SQL column containing the codes. + load_diagnoses (bool): Whether to load diagnoses or medications. Determines the logic. See calling function for more. + match_with_wildcard (bool): Whether to match on icd_code* / atc_code* or only icd_code / atc_code. 
+ """ + match_col_sql_strings = [] + + for code_str in codes_to_match: + base_query = f"lower({code_sql_col_name}) LIKE '%{code_str.lower()}" + + if match_with_wildcard: + match_col_sql_strings.append( + f"{base_query}%'", + ) + else: + # If the string is at the end of diagnosegruppestreng, it doesn't end with a hashtag + match_col_sql_strings.append(base_query) + + if load_diagnoses: + # If the string is at the beginning of diagnosegruppestreng, it doesn't start with a hashtag + match_col_sql_strings.append( + f"lower({code_sql_col_name}) LIKE '{code_str.lower()}#%'", + ) + + return " OR ".join(match_col_sql_strings) + + +def load_from_codes( + codes_to_match: Union[list[str], str], + load_diagnoses: bool, + code_col_name: str, + source_timestamp_col_name: str, + view: str, + output_col_name: Optional[str] = None, + match_with_wildcard: bool = True, + n_rows: Optional[int] = None, +) -> pd.DataFrame: + """Load the visits that have diagnoses that match icd_code or atc code from + the beginning of their adiagnosekode or atc code string. Aggregates all + that match. + + Args: + codes_to_match (Union[list[str], str]): Substring(s) to match diagnoses or medications for. + Diagnoses: Matches any diagnoses, whether a-diagnosis, b-diagnosis. + Both: If a list is passed, will count as a match if any of the icd_codes or at codes in the list match. + load_diagnoses (bool): Determines which mathing logic is employed. If True, will load diagnoses. If False, will load medications. + Diagnoses must be able to split a string like this: + A:DF431#+:ALFC3#B:DF329 + Which means that if match_with_wildcard is False, we must match on *icd_code# or *icd_code followed by nothing. If it's true, we can match on *icd_code*. + code_col_name (str): Name of column containing either diagnosis (icd) or medication (atc) codes. + Takes either 'diagnosegruppestreng' or 'atc' as input. + source_timestamp_col_name (str): Name of the timestamp column in the SQL + view. 
+ view (str): Name of the SQL view to load from. + output_col_name (str, optional): Name of new column string. Defaults to + None. + match_with_wildcard (bool, optional): Whether to match on icd_code* / atc_code*. + Defaults to true. + n_rows: Number of rows to return. Defaults to None. + + Returns: + pd.DataFrame: A pandas dataframe with dw_ek_borger, timestamp and + output_col_name = 1 + """ + fct = f"[{view}]" + + if isinstance(codes_to_match, list) and len(codes_to_match) > 1: + match_col_sql_str = list_to_sql_logic( + codes_to_match=codes_to_match, + code_sql_col_name=code_col_name, + load_diagnoses=load_diagnoses, + match_with_wildcard=match_with_wildcard, + ) + elif isinstance(codes_to_match, str): + match_col_sql_str = str_to_sql_match_logic( + code_to_match=codes_to_match, + code_sql_col_name=code_col_name, + load_diagnoses=load_diagnoses, + match_with_wildcard=match_with_wildcard, + ) + else: + raise ValueError("codes_to_match must be either a list or a string.") + + sql = ( + f"SELECT dw_ek_borger, {source_timestamp_col_name}, {code_col_name}" + + f" FROM [fct].{fct} WHERE {source_timestamp_col_name} IS NOT NULL AND ({match_col_sql_str})" + ) + + df = sql_load(sql, database="USR_PS_FORSK", chunksize=None, n_rows=n_rows) + + if output_col_name is None: + if isinstance(codes_to_match, list): + output_col_name = "_".join(codes_to_match) + else: + output_col_name = codes_to_match + + df[output_col_name] = 1 + + df.drop([f"{code_col_name}"], axis="columns", inplace=True) + + return df.rename( + columns={ + source_timestamp_col_name: "timestamp", + }, + ) diff --git a/src/psycop_feature_generation/timeseriesflattener/flattened_dataset.py b/src/psycop_feature_generation/timeseriesflattener/flattened_dataset.py index 05bc2016..54b328f5 100644 --- a/src/psycop_feature_generation/timeseriesflattener/flattened_dataset.py +++ b/src/psycop_feature_generation/timeseriesflattener/flattened_dataset.py @@ -635,7 +635,9 @@ def add_static_info( # Try to infer value col 
name if not provided if static_spec.input_col_name_override is None: possible_value_cols = [ - col for col in static_spec.values_df.columns if col not in self.id_col_name + col + for col in static_spec.values_df.columns + if col not in self.id_col_name ] if len(possible_value_cols) == 1: