From 84c696a138ab0f589b2698f67c3097cb32d68200 Mon Sep 17 00:00:00 2001 From: sarakolding Date: Tue, 18 Apr 2023 15:50:24 +0200 Subject: [PATCH 01/10] feat: add feature descriptions for text features --- .../flattened/feature_describer.py | 45 +++++++++++++++---- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py index 3ee290a1..130d52bc 100644 --- a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py +++ b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py @@ -4,6 +4,7 @@ from collections.abc import Sequence from pathlib import Path +import re import numpy as np import pandas as pd @@ -16,6 +17,7 @@ StaticSpec, TemporalSpec, _AnySpec, + TextPredictorSpec, ) from wasabi import Printer @@ -92,10 +94,11 @@ def create_unicode_hist(series: pd.Series) -> pd.Series: def generate_temporal_feature_description( series: pd.Series, predictor_spec: TemporalSpec, + feature_name: str, ): """Generate a row with feature description for a temporal predictor.""" d = { - "Predictor df": predictor_spec.feature_name, + "Predictor df": feature_name, "Lookbehind days": predictor_spec.interval_days, "Resolve multiple": predictor_spec.resolve_multiple_fn.__name__, "N unique": series.nunique(), @@ -134,12 +137,14 @@ def generate_static_feature_description(series: pd.Series, predictor_spec: Stati def generate_feature_description_row( series: pd.Series, predictor_spec: _AnySpec, + feature_name: str, ) -> dict: """Generate a row with feature description. Args: series (pd.Series): Series with data to describe. predictor_spec (PredictorSpec): Predictor specification. + feature_name (str): Name of the feature. Returns: dict: dictionary with feature description. @@ -148,7 +153,9 @@ def generate_feature_description_row( if isinstance(predictor_spec, StaticSpec): d = generate_static_feature_description(series, predictor_spec) elif isinstance(predictor_spec, TemporalSpec): - d = generate_temporal_feature_description(series, predictor_spec) + d = generate_temporal_feature_description( + series, predictor_spec, feature_name=feature_name + ) return d @@ -172,12 +179,34 @@ def generate_feature_description_df( for spec in predictor_specs: column_name = spec.get_col_str() - rows.append( - generate_feature_description_row( - series=df[column_name], - predictor_spec=spec, - ), - ) + if isinstance(spec, TextPredictorSpec): + last_part = column_name.split(f"{spec.prefix}_{spec.feature_name}")[1] + first_part = column_name.split(last_part)[0] + string_match = f"{first_part}[\dA-Za-z\-]+{last_part}" + + column_names = [ + re.match(string_match, column)[0] + for column in df.columns + if re.match(string_match, column) is not None + ] + + for column_name in column_names: + rows.append( + generate_feature_description_row( + series=df[column_name], + predictor_spec=spec, + feature_name=column_name, + ), + ) + + else: + rows.append( + generate_feature_description_row( + series=df[column_name], + predictor_spec=spec, + feature_name=spec.feature_name, + ), + ) # Convert to dataframe feature_description_df = pd.DataFrame(rows) From 28ce302ed719f5344e8e680c6be8a331582305ba Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 18 Apr 2023 13:53:54 +0000 Subject: [PATCH 02/10] style: linting --- .../data_checks/flattened/feature_describer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py index 130d52bc..0e6e02f5 100644 --- a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py +++ b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py @@ -2,9 +2,9 @@ df.""" from __future__ import annotations +import re from collections.abc import Sequence from pathlib import Path -import re import numpy as np import pandas as pd @@ -16,8 +16,8 @@ PredictorSpec, StaticSpec, TemporalSpec, - _AnySpec, TextPredictorSpec, + _AnySpec, ) from wasabi import Printer @@ -154,7 +154,7 @@ def generate_feature_description_row( d = generate_static_feature_description(series, predictor_spec) elif isinstance(predictor_spec, TemporalSpec): d = generate_temporal_feature_description( - series, predictor_spec, feature_name=feature_name + series, predictor_spec, feature_name=feature_name, ) return d @@ -182,7 +182,7 @@ def generate_feature_description_df( if isinstance(spec, TextPredictorSpec): last_part = column_name.split(f"{spec.prefix}_{spec.feature_name}")[1] first_part = column_name.split(last_part)[0] - string_match = f"{first_part}[\dA-Za-z\-]+{last_part}" + string_match = f"{first_part}[\\dA-Za-z\\-]+{last_part}" column_names = [ re.match(string_match, column)[0] From 04809cb76019ce687e68a7ffb4b070ba52fcfccb Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 18 Apr 2023 13:54:43 +0000 Subject: [PATCH 03/10] style: linting --- .../data_checks/flattened/feature_describer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py index 0e6e02f5..2c7c9979 100644 --- a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py +++ b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py @@ -154,7 +154,9 @@ def generate_feature_description_row( d = generate_static_feature_description(series, predictor_spec) elif isinstance(predictor_spec, TemporalSpec): d = generate_temporal_feature_description( - series, predictor_spec, feature_name=feature_name, + series, + predictor_spec, + feature_name=feature_name, ) return d From 09364d9d4c84ea221fe3f252453eb339f3d8524c Mon Sep 17 00:00:00 2001 From: sarakolding Date: Tue, 18 Apr 2023 16:35:56 +0200 Subject: [PATCH 04/10] add optional arg and if-guard --- .../data_checks/flattened/feature_describer.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py index 130d52bc..56ccc465 100644 --- a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py +++ b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py @@ -5,6 +5,7 @@ from collections.abc import Sequence from pathlib import Path import re +from typing import Optional import numpy as np import pandas as pd @@ -94,9 +95,14 @@ def create_unicode_hist(series: pd.Series) -> pd.Series: def generate_temporal_feature_description( series: pd.Series, predictor_spec: TemporalSpec, - feature_name: str, + feature_name: Optional[str] = None, ): """Generate a row with feature description for a temporal predictor.""" + if feature_name is not None: + feature_name = feature_name + else: + feature_name = predictor_spec.feature_name + d = { "Predictor df": feature_name, "Lookbehind days": predictor_spec.interval_days, @@ -137,14 +143,14 @@ def generate_static_feature_description(series: pd.Series, predictor_spec: Stati def generate_feature_description_row( series: pd.Series, predictor_spec: _AnySpec, - feature_name: str, + feature_name: Optional[str] = None, ) -> dict: """Generate a row with feature description. Args: series (pd.Series): Series with data to describe. predictor_spec (PredictorSpec): Predictor specification. - feature_name (str): Name of the feature. + feature_name (str, optional): Name of the feature. Defaults to None. Returns: dict: dictionary with feature description. @@ -156,7 +162,6 @@ def generate_feature_description_row( d = generate_temporal_feature_description( series, predictor_spec, feature_name=feature_name ) - return d @@ -204,7 +209,6 @@ def generate_feature_description_df( generate_feature_description_row( series=df[column_name], predictor_spec=spec, - feature_name=spec.feature_name, ), ) From 0bbfc75bb6e457db001ad1ad6180d99234007cc9 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 18 Apr 2023 14:51:46 +0000 Subject: [PATCH 05/10] style: linting --- .../data_checks/flattened/feature_describer.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py index 97e8690b..96fc0de6 100644 --- a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py +++ b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py @@ -6,10 +6,6 @@ from collections.abc import Sequence from pathlib import Path -import re -from typing import Optional - - import numpy as np import pandas as pd from psycop_feature_generation.data_checks.utils import save_df_to_pretty_html_table @@ -98,7 +94,7 @@ def create_unicode_hist(series: pd.Series) -> pd.Series: def generate_temporal_feature_description( series: pd.Series, predictor_spec: TemporalSpec, - feature_name: Optional[str] = None, + feature_name: str | None = None, ): """Generate a row with feature description for a temporal predictor.""" if feature_name is not None: @@ -146,7 +142,7 @@ def generate_static_feature_description(series: pd.Series, predictor_spec: Stati def generate_feature_description_row( series: pd.Series, predictor_spec: _AnySpec, - feature_name: Optional[str] = None, + feature_name: str | None = None, ) -> dict: """Generate a row with feature description. From b74b5ef1264e9b96b197fd61128bd473de9ed841 Mon Sep 17 00:00:00 2001 From: sarakolding Date: Wed, 19 Apr 2023 16:14:00 +0200 Subject: [PATCH 06/10] pull git changes --- .../data_checks/flattened/feature_describer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py index 96fc0de6..fee6e0d7 100644 --- a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py +++ b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py @@ -88,7 +88,7 @@ def create_unicode_hist(series: pd.Series) -> pd.Series: [UNICODE_HIST[_find_nearest(key_vector, val)] for val in hist], ) - return ucode_to_print + return pd.Series(ucode_to_print) def generate_temporal_feature_description( From 7f6f5b9143144ba2dac04618a0d8e776d22e373d Mon Sep 17 00:00:00 2001 From: sarakolding Date: Wed, 19 Apr 2023 16:34:13 +0200 Subject: [PATCH 07/10] style: linting --- .../data_checks/flattened/feature_describer.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py index fee6e0d7..673506ce 100644 --- a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py +++ b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py @@ -5,6 +5,7 @@ import re from collections.abc import Sequence from pathlib import Path +from typing import Literal, Union import numpy as np import pandas as pd @@ -141,7 +142,7 @@ def generate_static_feature_description(series: pd.Series, predictor_spec: Stati def generate_feature_description_row( series: pd.Series, - predictor_spec: _AnySpec, + predictor_spec: Union[StaticSpec, TemporalSpec], feature_name: str | None = None, ) -> dict: """Generate a row with feature description. @@ -168,13 +169,13 @@ def generate_feature_description_row( def generate_feature_description_df( df: pd.DataFrame, - predictor_specs: list[PredictorSpec], + predictor_specs: list[Union[PredictorSpec, StaticSpec, TemporalSpec]], ) -> pd.DataFrame: """Generate a data frame with feature descriptions. Args: df (pd.DataFrame): Data frame with data to describe. - predictor_specs (PredictorSpec): Predictor specifications. + predictor_specs (Union[PredictorSpec, StaticSpec, TemporalSpec]): Predictor specifications. Returns: pd.DataFrame: Data frame with feature descriptions. @@ -269,7 +270,7 @@ def save_feature_descriptive_stats_from_dir( ) # Writing html table as well save_df_to_pretty_html_table( + df=feature_descriptive_stats, path=out_dir / f"{split}_feature_descriptive_stats.html", title="Feature descriptive stats", - df=feature_descriptive_stats, ) From 1425eac6d23288be5a2ebc6900b3b87a0cc19561 Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 19 Apr 2023 14:35:10 +0000 Subject: [PATCH 08/10] style: linting --- .../data_checks/flattened/feature_describer.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py index 673506ce..a26b1d9b 100644 --- a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py +++ b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py @@ -5,7 +5,6 @@ import re from collections.abc import Sequence from pathlib import Path -from typing import Literal, Union import numpy as np import pandas as pd @@ -18,7 +17,6 @@ StaticSpec, TemporalSpec, TextPredictorSpec, - _AnySpec, ) from wasabi import Printer @@ -142,7 +140,7 @@ def generate_static_feature_description(series: pd.Series, predictor_spec: Stati def generate_feature_description_row( series: pd.Series, - predictor_spec: Union[StaticSpec, TemporalSpec], + predictor_spec: StaticSpec | TemporalSpec, feature_name: str | None = None, ) -> dict: """Generate a row with feature description. @@ -169,7 +167,7 @@ def generate_feature_description_row( def generate_feature_description_df( df: pd.DataFrame, - predictor_specs: list[Union[PredictorSpec, StaticSpec, TemporalSpec]], + predictor_specs: list[PredictorSpec | StaticSpec | TemporalSpec], ) -> pd.DataFrame: """Generate a data frame with feature descriptions. From dd38605ede50da6c575eb07b1ce375d578493a93 Mon Sep 17 00:00:00 2001 From: sarakolding Date: Mon, 24 Apr 2023 22:11:14 +0200 Subject: [PATCH 09/10] fix mypy --- .../data_checks/flattened/feature_describer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py index a26b1d9b..82c93863 100644 --- a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py +++ b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py @@ -190,7 +190,7 @@ def generate_feature_description_df( string_match = f"{first_part}[\\dA-Za-z\\-]+{last_part}" column_names = [ - re.match(string_match, column)[0] + re.match(string_match, column)[0] # type: ignore for column in df.columns if re.match(string_match, column) is not None ] From cf572739e0e6cf2a394899f4b07034266680b956 Mon Sep 17 00:00:00 2001 From: github-actions Date: Mon, 24 Apr 2023 20:12:12 +0000 Subject: [PATCH 10/10] style: linting --- .../data_checks/flattened/feature_describer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py index 82c93863..b4ef45d8 100644 --- a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py +++ b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py @@ -190,7 +190,7 @@ def generate_feature_description_df( string_match = f"{first_part}[\\dA-Za-z\\-]+{last_part}" column_names = [ - re.match(string_match, column)[0] # type: ignore + re.match(string_match, column)[0] # type: ignore for column in df.columns if re.match(string_match, column) is not None ]