diff --git a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py index 3ee290a1..b4ef45d8 100644 --- a/src/psycop_feature_generation/data_checks/flattened/feature_describer.py +++ b/src/psycop_feature_generation/data_checks/flattened/feature_describer.py @@ -2,6 +2,7 @@ df.""" from __future__ import annotations +import re from collections.abc import Sequence from pathlib import Path @@ -15,7 +16,7 @@ PredictorSpec, StaticSpec, TemporalSpec, - _AnySpec, + TextPredictorSpec, ) from wasabi import Printer @@ -86,16 +87,22 @@ def create_unicode_hist(series: pd.Series) -> pd.Series: [UNICODE_HIST[_find_nearest(key_vector, val)] for val in hist], ) - return ucode_to_print + return pd.Series(ucode_to_print) def generate_temporal_feature_description( series: pd.Series, predictor_spec: TemporalSpec, + feature_name: str | None = None, ): """Generate a row with feature description for a temporal predictor.""" + if feature_name is not None: + feature_name = feature_name + else: + feature_name = predictor_spec.feature_name + d = { - "Predictor df": predictor_spec.feature_name, + "Predictor df": feature_name, "Lookbehind days": predictor_spec.interval_days, "Resolve multiple": predictor_spec.resolve_multiple_fn.__name__, "N unique": series.nunique(), @@ -133,13 +140,15 @@ def generate_static_feature_description(series: pd.Series, predictor_spec: Stati def generate_feature_description_row( series: pd.Series, - predictor_spec: _AnySpec, + predictor_spec: StaticSpec | TemporalSpec, + feature_name: str | None = None, ) -> dict: """Generate a row with feature description. Args: series (pd.Series): Series with data to describe. predictor_spec (PredictorSpec): Predictor specification. + feature_name (str, optional): Name of the feature. Defaults to None. Returns: dict: dictionary with feature description. @@ -148,20 +157,23 @@ def generate_feature_description_row( if isinstance(predictor_spec, StaticSpec): d = generate_static_feature_description(series, predictor_spec) elif isinstance(predictor_spec, TemporalSpec): - d = generate_temporal_feature_description(series, predictor_spec) - + d = generate_temporal_feature_description( + series, + predictor_spec, + feature_name=feature_name, + ) return d def generate_feature_description_df( df: pd.DataFrame, - predictor_specs: list[PredictorSpec], + predictor_specs: list[PredictorSpec | StaticSpec | TemporalSpec], ) -> pd.DataFrame: """Generate a data frame with feature descriptions. Args: df (pd.DataFrame): Data frame with data to describe. - predictor_specs (PredictorSpec): Predictor specifications. + predictor_specs (Union[PredictorSpec, StaticSpec, TemporalSpec]): Predictor specifications. Returns: pd.DataFrame: Data frame with feature descriptions. @@ -172,12 +184,33 @@ def generate_feature_description_df( for spec in predictor_specs: column_name = spec.get_col_str() - rows.append( - generate_feature_description_row( - series=df[column_name], - predictor_spec=spec, - ), - ) + if isinstance(spec, TextPredictorSpec): + last_part = column_name.split(f"{spec.prefix}_{spec.feature_name}")[1] + first_part = column_name.split(last_part)[0] + string_match = f"{first_part}[\\dA-Za-z\\-]+{last_part}" + + column_names = [ + re.match(string_match, column)[0] # type: ignore + for column in df.columns + if re.match(string_match, column) is not None + ] + + for column_name in column_names: + rows.append( + generate_feature_description_row( + series=df[column_name], + predictor_spec=spec, + feature_name=column_name, + ), + ) + + else: + rows.append( + generate_feature_description_row( + series=df[column_name], + predictor_spec=spec, + ), + ) # Convert to dataframe feature_description_df = pd.DataFrame(rows) @@ -235,7 +268,7 @@ def save_feature_descriptive_stats_from_dir( ) # Writing html table as well save_df_to_pretty_html_table( + df=feature_descriptive_stats, path=out_dir / f"{split}_feature_descriptive_stats.html", title="Feature descriptive stats", - df=feature_descriptive_stats, )