Skip to content
This repository has been archived by the owner on May 3, 2023. It is now read-only.

Commit

Permalink
Merge branch 'jakdam/remove_torch_dep' into bokajgd/issue87
Browse files Browse the repository at this point in the history
merge: remove torch dependency
  • Loading branch information
bokajgd committed Nov 18, 2022
2 parents d92f798 + 214cdae commit 60fd7db
Show file tree
Hide file tree
Showing 10 changed files with 683 additions and 646 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,16 @@

<!--next-version-placeholder-->

## v0.8.0 (2022-11-17)
### Feature
* Allow load_medications to concat a list of medications ([`d78f465`](https://github.com/Aarhus-Psychiatry-Research/psycop-feature-generation/commit/d78f46592213b8245229d6618d40f1a1ff4d80eb))

### Fix
* Remove original functions ([`da59110`](https://github.com/Aarhus-Psychiatry-Research/psycop-feature-generation/commit/da59110978469b0743ce2d625005fc90950fb436))

### Documentation
* Improve docs ([`9aad0af`](https://github.com/Aarhus-Psychiatry-Research/psycop-feature-generation/commit/9aad0af6205af2e3deffb573676af5a20401bae1))

## v0.7.0 (2022-11-16)
### Feature
* Full run ([`142212f`](https://github.com/Aarhus-Psychiatry-Research/psycop-feature-generation/commit/142212fc63a59662048b6569dc874def92dfe62f))
Expand Down
2 changes: 1 addition & 1 deletion example/loaders/load_medications.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
import psycop_feature_generation.loaders.raw.load_medications as m

if __name__ == "__main__":
df = m.antipsychotics()
df = m.first_gen_antipsychotics(n_rows=1000)
3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "psycop_feature_generation"
version = "0.7.0"
version = "0.8.0"
description = ""
authors = ["Your Name <you@example.com>"]

Expand All @@ -21,7 +21,6 @@ psutil = ">=5.9.1, <6.0.0"
pandas = ">=1.4.0,<1.6.0"
catalogue = ">=2.0.0, <2.1.0"
numpy = ">=1.23.3,<1.23.5"
torch = "^1.12.1"
transformers = "^4.22.2"
pyarrow = ">=9.0.0,<9.1.0"
psycopmlutils = ">=0.2.4, <0.3.0"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,15 @@ def check_df_conforms_to_feature_spec(
ValueError: If df does not conform to d.
"""

if required_columns is None:
required_columns = ["dw_ek_borger", "timestamp", "value"]

if subset_duplicates_columns is None:
subset_duplicates_columns = ["dw_ek_borger", "timestamp", "value"]

if expected_val_dtypes is None:
expected_val_dtypes = ["float64", "int64"]

msg = Printer(timestamp=True)

allowed_nan_value_prop = (
Expand Down
131 changes: 23 additions & 108 deletions src/psycop_feature_generation/loaders/raw/load_diagnoses.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,97 +10,10 @@

import pandas as pd

from psycop_feature_generation.loaders.raw.sql_load import sql_load
from psycop_feature_generation.loaders.raw.utils import load_from_codes
from psycop_feature_generation.utils import data_loaders


def _load(
    icd_code: Union[list[str], str],
    source_timestamp_col_name: str,
    fct: str,
    output_col_name_override: Optional[str] = None,
    wildcard_icd_code: Optional[bool] = True,
    n_rows: Optional[int] = None,
) -> pd.DataFrame:
    """Load the visits whose diagnosegruppestreng contains a matching ICD code.

    The match is case-insensitive and is made against the whole
    diagnosegruppestreng column, so a-diagnoses, b-diagnoses etc. all count.
    When several codes are given, a row is kept if any of them matches.

    Args:
        icd_code (Union[list[str], str]): Substring(s) to match diagnoses for. # noqa: DAR102
            If a list is passed, a diagnosis counts as a match if any of the
            codes in the list match.
        source_timestamp_col_name (str): Name of the timestamp column in the SQL
            view. Rows where it is NULL are excluded.
        fct (str): Name of the SQL view to load from.
        output_col_name_override (str, optional): Name of the new indicator
            column. If not specified, icd_code itself is used as the column key.
        wildcard_icd_code (bool, optional): Whether to match on icd_code*.
            If falsy, the code must be followed by "#" or by the end of the
            string. Defaults to True.
        n_rows: Number of rows to return. Defaults to None.

    Returns:
        pd.DataFrame: A pandas dataframe with dw_ek_borger, timestamp and
            output_col_name = 1
    """
    # Treat a single code as a one-element list so both cases share one code
    # path; the generated SQL is the same as when the cases were handled
    # separately.
    codes = icd_code if isinstance(icd_code, list) else [icd_code]

    # Must be able to split a string like this:
    # A:DF431#+:ALFC3#B:DF329
    # Which means that if wildcard_icd_code is False, we must match on
    # icd_code# or icd_code followed by nothing.
    # If it's true, we can match on icd_code*.
    if wildcard_icd_code:
        like_patterns = [f"%{code.lower()}%" for code in codes]
    else:
        like_patterns = []
        for code in codes:
            needle = code.lower()
            # If the code is the last entry in diagnosegruppestreng, it is not
            # followed by a hashtag ...
            like_patterns.append(f"%{needle}")
            # ... but if it is NOT the last entry, it is.
            like_patterns.append(f"%{needle}#%")

    match_col_sql_str = " OR ".join(
        f"lower(diagnosegruppestreng) LIKE '{pattern}'" for pattern in like_patterns
    )

    # NOTE(review): the codes and identifiers are interpolated directly into
    # the SQL text; this assumes they come from trusted, in-repo callers.
    sql = (
        f"SELECT dw_ek_borger, {source_timestamp_col_name}, diagnosegruppestreng"
        f" FROM [fct].[{fct}]"
        f" WHERE {source_timestamp_col_name} IS NOT NULL AND ({match_col_sql_str})"
    )

    df = sql_load(sql, database="USR_PS_FORSK", chunksize=None, n_rows=n_rows)

    # NOTE(review): when icd_code is a list and no override is given, the list
    # itself becomes the column key - callers passing a list presumably always
    # supply output_col_name_override; confirm.
    output_col_name = (
        icd_code if output_col_name_override is None else output_col_name_override
    )
    df[output_col_name] = 1

    # The raw diagnosis string was only needed for filtering in SQL.
    df = df.drop(columns=["diagnosegruppestreng"])

    return df.rename(columns={source_timestamp_col_name: "timestamp"})


def concat_from_physical_visits(
icd_codes: list[str],
output_col_name: str,
Expand All @@ -122,34 +35,34 @@ def concat_from_physical_visits(

diagnoses_source_table_info = {
"lpr3": {
"fct": "FOR_LPR3kontakter_psyk_somatik_inkl_2021_feb2022",
"view": "FOR_LPR3kontakter_psyk_somatik_inkl_2021_feb2022",
"source_timestamp_col_name": "datotid_lpr3kontaktstart",
},
"lpr2_inpatient": {
"fct": "FOR_indlaeggelser_psyk_somatik_LPR2_inkl_2021_feb2022",
"view": "FOR_indlaeggelser_psyk_somatik_LPR2_inkl_2021_feb2022",
"source_timestamp_col_name": "datotid_indlaeggelse",
},
"lpr2_acute_outpatient": {
"fct": "FOR_akutambulantekontakter_psyk_somatik_LPR2_inkl_2021_feb2022",
"view": "FOR_akutambulantekontakter_psyk_somatik_LPR2_inkl_2021_feb2022",
"source_timestamp_col_name": "datotid_start",
},
"lpr2_outpatient": {
"fct": "FOR_besoeg_psyk_somatik_LPR2_inkl_2021_feb2022",
"view": "FOR_besoeg_psyk_somatik_LPR2_inkl_2021_feb2022",
"source_timestamp_col_name": "datotid_start",
},
}

# Using load_from_codes with the full list is faster than calling from_physical_visits once per code,
# since all icd_codes are matched in one SQL request per source table rather than one at a time and then aggregated.
dfs = [
_load(
icd_code=icd_codes,
output_col_name_override=output_col_name,
wildcard_icd_code=wildcard_icd_code,
load_from_codes(
codes_to_match=icd_codes,
column_name="diagnosegruppestreng",
output_col_name=output_col_name,
match_with_wildcard=wildcard_icd_code,
n_rows=n_rows,
load_diagnoses=True,
**kwargs,
)
for source_name, kwargs in diagnoses_source_table_info.items()
for _, kwargs in diagnoses_source_table_info.items()
]

df = pd.concat(dfs).drop_duplicates(
Expand All @@ -160,8 +73,8 @@ def concat_from_physical_visits(


def from_physical_visits(
icd_code: str,
output_col_name_override: Optional[str] = "value",
icd_code: Union[list[str], str],
output_col_name: Optional[str] = "value",
n_rows: Optional[int] = None,
wildcard_icd_code: Optional[bool] = False,
) -> pd.DataFrame:
Expand All @@ -181,15 +94,15 @@ def from_physical_visits(

diagnoses_source_table_info = {
"lpr3": {
"fct": "FOR_LPR3kontakter_psyk_somatik_inkl_2021",
"view": "FOR_LPR3kontakter_psyk_somatik_inkl_2021",
"source_timestamp_col_name": "datotid_lpr3kontaktstart",
},
"lpr2_inpatient": {
"fct": "FOR_indlaeggelser_psyk_somatik_LPR2_inkl_2021",
"view": "FOR_indlaeggelser_psyk_somatik_LPR2_inkl_2021",
"source_timestamp_col_name": "datotid_indlaeggelse",
},
"lpr2_outpatient": {
"fct": "FOR_besoeg_psyk_somatik_LPR2_inkl_2021",
"view": "FOR_besoeg_psyk_somatik_LPR2_inkl_2021",
"source_timestamp_col_name": "datotid_start",
},
}
Expand All @@ -200,12 +113,14 @@ def from_physical_visits(
n_rows_per_df = None

dfs = [
_load(
icd_code=icd_code,
output_col_name_override=output_col_name_override,
wildcard_icd_code=wildcard_icd_code,
load_from_codes(
codes_to_match=icd_code,
code_col_name="diagnosegruppestreng",
output_col_name=output_col_name,
n_rows=n_rows_per_df,
match_with_wildcard=wildcard_icd_code,
**kwargs,
load_diagnoses=True,
)
for _, kwargs in diagnoses_source_table_info.items()
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,7 @@ def load_non_numerical_values_and_coerce_inequalities(
inplace=True,
)

if ineq2mult:
return multiply_inequalities_in_df(df, ineq2mult=ineq2mult)
return multiply_inequalities_in_df(df)
return multiply_inequalities_in_df(df, ineq2mult=ineq2mult)


def load_numerical_values(
Expand Down
Loading

0 comments on commit 60fd7db

Please sign in to comment.