From 661b7f365e496c54fd1f16cd2c3d1cb3042db27f Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Mon, 16 Dec 2024 22:39:19 +1100 Subject: [PATCH 1/7] add pivot_wider_spec for pandas --- janitor/functions/__init__.py | 8 +- janitor/functions/pivot.py | 119 ++++++++++++++++++++ tests/functions/test_pivot_wider_spec.py | 133 +++++++++++++++++++++++ 3 files changed, 259 insertions(+), 1 deletion(-) create mode 100644 tests/functions/test_pivot_wider_spec.py diff --git a/janitor/functions/__init__.py b/janitor/functions/__init__.py index 8af27b3ff..6e0f14fc6 100644 --- a/janitor/functions/__init__.py +++ b/janitor/functions/__init__.py @@ -57,7 +57,12 @@ from .limit_column_characters import limit_column_characters from .min_max_scale import min_max_scale from .move import move -from .pivot import pivot_longer, pivot_longer_spec, pivot_wider +from .pivot import ( + pivot_longer, + pivot_longer_spec, + pivot_wider, + pivot_wider_spec, +) from .process_text import process_text from .remove_columns import remove_columns from .remove_empty import remove_empty @@ -138,6 +143,7 @@ "pivot_longer", "pivot_longer_spec", "pivot_wider", + "pivot_wider_spec", "process_text", "remove_columns", "remove_empty", diff --git a/janitor/functions/pivot.py b/janitor/functions/pivot.py index 9522ad084..893b41f6d 100644 --- a/janitor/functions/pivot.py +++ b/janitor/functions/pivot.py @@ -327,10 +327,14 @@ def pivot_longer( Should be either a single column name, or a list/tuple of column names. `index` should be a list of tuples if the columns are a MultiIndex. + Column selection is possible using the + [`select`][janitor.functions.select.select] syntax. column_names: Name(s) of columns to unpivot. Should be either a single column name or a list/tuple of column names. `column_names` should be a list of tuples if the columns are a MultiIndex. + Column selection is possible using the + [`select`][janitor.functions.select.select] syntax. 
names_to: Name of new column as a string that will contain what were previously the column names in `column_names`. The default is `variable` if no value is provided. It can @@ -2380,3 +2384,118 @@ def _check_tuples_multiindex(indexer, args, param): ) return args + + +def pivot_wider_spec( + df: pd.DataFrame, + spec: pd.DataFrame, + index: list | tuple | str | Pattern = None, + reset_index: bool = True, +) -> pd.DataFrame: + """ + Provide specification to convert DataFrame from long to wide form. + + !!! abstract "Version Changed" + + - 0.24.0 + - Added `reset_index`, `names_expand` and `index_expand` parameters. + + Args: + df: A pandas DataFrame. + spec: A specification DataFrame. + At a minimum, the spec DataFrame + must have both a '.name' and a '.value' column. + The '.name' column should contain + the names of the columns in the output DataFrame. + The '.value' column should contain the name of the column(s) + in the source DataFrame that will serve as the values. + Additional columns in spec will serve as the columns + to be flipped to wide form. + Note that these additional columns should already exist + in the source DataFrame. + index: Name(s) of columns to use as identifier variables. + It should be either a single column name, or a list of column names. + If `index` is not provided, the DataFrame's index is used. + Column selection is possible using the + [`select`][janitor.functions.select.select] syntax. + reset_index: Determines whether to reset the `index`. + Applicable only if `index` is provided. + + Returns: + A pandas DataFrame that has been pivoted from long to wide form. + """ # noqa: E501 + check("spec", spec, [pd.DataFrame]) + check("reset_index", reset_index, [bool]) + if not spec.columns.is_unique: + raise ValueError("Kindly ensure the spec's columns is unique.") + if ".name" not in spec.columns: + raise KeyError( + "Kindly ensure the spec DataFrame has a `.name` column." 
+ ) + if ".value" not in spec.columns: + raise KeyError( + "Kindly ensure the spec DataFrame has a `.value` column." + ) + if spec.columns.tolist()[:2] != [".name", ".value"]: + raise ValueError( + "The first two columns of the spec DataFrame " + "should be '.name' and '.value', " + "with '.name' coming before '.value'." + ) + if spec.columns.size == 2: + raise ValueError( + "Kindly provide the column(s) " + "to use to make new frame’s columns" + ) + columns = spec.columns[2:] + values = spec[".value"].unique() + if index is not None: + index = _select_index([index], df, axis="columns") + index = df.columns[index].tolist() + df = df.pivot(index=index, columns=columns, values=values) + _index = spec.columns[1:].tolist() + spec = spec.set_index(_index).squeeze() + df = df.reindex(columns=spec.index) + df.columns = df.columns.map(spec) + if reset_index and index: + return df.reset_index() + return df + # if _index: + # df = df.set_index(_index) + # # use a pivot, then rename + # # the below code may work for polars? 
+ # if len(grouper) == 1: + # _grouper = grouper[0] + # else: + # _grouper = grouper + # grouped = df.groupby(_grouper, sort=False, observed=True) + # mapper = defaultdict(dict) + # if len(grouper) > 1: + # spec_grouper = pd.MultiIndex.from_frame(spec.loc[:, grouper]) + # else: + # spec_grouper = spec[grouper[0]] + # for grouper, old_name, new_name in zip( + # spec_grouper, spec[".value"], spec[".name"] + # ): + # mapper[grouper].update({old_name: new_name}) + # frames = [] + # for grouper, frame in grouped: + # mapping = mapper[grouper] + # frame = frame.loc[:, [*mapping]] + # frame.columns = frame.columns.map(mapping) + # frames.append(frame) + # frames = pd.concat(frames, axis=1) + # return frames + + +# names_from -> .name -> columns(pandas) +# values_from -> .value-> values(pandas) +# index = df.columns +# - .name.unique() +# - .value.unique() +# - remaining columns +# if no idex, then df.index is used + +# where does .value and other columns intersect? +# group by other columns, and select .value +# df.groupby(index + other column)[.value.unique()] diff --git a/tests/functions/test_pivot_wider_spec.py b/tests/functions/test_pivot_wider_spec.py new file mode 100644 index 000000000..6e2a82ac2 --- /dev/null +++ b/tests/functions/test_pivot_wider_spec.py @@ -0,0 +1,133 @@ +import re + +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal + +from janitor import pivot_wider_spec + + +@pytest.fixture +def df_checks(): + """pytest fixture""" + return pd.DataFrame( + [ + {"famid": 1, "birth": 1, "age": 1, "ht": 2.8}, + {"famid": 1, "birth": 1, "age": 2, "ht": 3.4}, + {"famid": 1, "birth": 2, "age": 1, "ht": 2.9}, + {"famid": 1, "birth": 2, "age": 2, "ht": 3.8}, + {"famid": 1, "birth": 3, "age": 1, "ht": 2.2}, + {"famid": 1, "birth": 3, "age": 2, "ht": 2.9}, + {"famid": 2, "birth": 1, "age": 1, "ht": 2.0}, + {"famid": 2, "birth": 1, "age": 2, "ht": 3.2}, + {"famid": 2, "birth": 2, "age": 1, "ht": 1.8}, + {"famid": 2, "birth": 2, "age": 2, 
"ht": 2.8}, + {"famid": 2, "birth": 3, "age": 1, "ht": 1.9}, + {"famid": 2, "birth": 3, "age": 2, "ht": 2.4}, + {"famid": 3, "birth": 1, "age": 1, "ht": 2.2}, + {"famid": 3, "birth": 1, "age": 2, "ht": 3.3}, + {"famid": 3, "birth": 2, "age": 1, "ht": 2.3}, + {"famid": 3, "birth": 2, "age": 2, "ht": 3.4}, + {"famid": 3, "birth": 3, "age": 1, "ht": 2.1}, + {"famid": 3, "birth": 3, "age": 2, "ht": 2.9}, + ] + ) + + +spec = {".name": ["ht1", "ht2"], ".value": ["ht", "ht"], "age": [1, 2]} +spec = pd.DataFrame(spec) + + +def test_spec_is_a_dataframe(df_checks): + """Raise Error if spec is not a DataFrame.""" + with pytest.raises( + TypeError, + match="spec should be one of.+", + ): + df_checks.pipe(pivot_wider_spec, spec={".name": "name"}) + + +def test_spec_columns_has_dot_name(df_checks): + """Raise KeyError if '.name' not in spec's columns.""" + with pytest.raises( + KeyError, + match="Kindly ensure the spec DataFrame has a `.name` column.", + ): + df_checks.pipe( + pivot_wider_spec, + spec=spec.set_axis(labels=[".value", ".blabla", "age"], axis=1), + ) + + +def test_spec_columns_has_dot_value(df_checks): + """Raise KeyError if '.value' not in spec's columns.""" + with pytest.raises( + KeyError, + match="Kindly ensure the spec DataFrame has a `.value` column.", + ): + df_checks.pipe( + pivot_wider_spec, + spec=spec.set_axis(labels=[".name", ".blabla", "age"], axis=1), + ) + + +def test_spec_columns_name_value_order(df_checks): + """ + Raise ValueError if '.name' and '.value' + are not the first two labels + in spec's columns. + """ + msg = "The first two columns of the spec DataFrame " + msg += "should be '.name' and '.value',.+" + with pytest.raises( + ValueError, + match=msg, + ): + df_checks.pipe( + pivot_wider_spec, + spec=spec.loc[:, [".value", ".name", "age"]], + ) + + +def test_spec_columns_len_2(df_checks): + """ + Raise ValueError if '.name' and '.value' + are the only columns in spec. 
+ """ + msg = "Kindly provide the column(s) " + msg += "to use to make new frame’s columns" + with pytest.raises( + ValueError, + match=re.escape(msg), + ): + df_checks.pipe( + pivot_wider_spec, + spec=spec.loc[:, [".name", ".value"]], + ) + + +def test_spec_columns_not_unique(df_checks): + """Raise ValueError if the spec's columns is not unique.""" + with pytest.raises( + ValueError, match="Kindly ensure the spec's columns is unique." + ): + df_checks.pipe( + pivot_wider_spec, + spec=spec.set_axis(labels=[".name", ".name", "age"], axis=1), + ) + + +def test_pivot_wider_spec(df_checks): + """ + Test output + """ + expected = ( + df_checks.pivot(index=["famid", "birth"], columns="age", values="ht") + .add_prefix("ht") + .rename_axis(columns=None) + .reset_index() + ) + actual = df_checks.pipe( + pivot_wider_spec, spec=spec, index=["famid", "birth"] + ) + assert_frame_equal(actual, expected) From 0391ad93a46b61580bb65b57a9d0522c5c7a4906 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Mon, 16 Dec 2024 22:54:35 +1100 Subject: [PATCH 2/7] add examples --- janitor/functions/pivot.py | 79 +++++++++++++++++++++++++++++++++++--- 1 file changed, 74 insertions(+), 5 deletions(-) diff --git a/janitor/functions/pivot.py b/janitor/functions/pivot.py index 893b41f6d..8f2851f8c 100644 --- a/janitor/functions/pivot.py +++ b/janitor/functions/pivot.py @@ -2392,13 +2392,82 @@ def pivot_wider_spec( index: list | tuple | str | Pattern = None, reset_index: bool = True, ) -> pd.DataFrame: - """ - Provide specification to convert DataFrame from long to wide form. + """A declarative interface to pivot a DataFrame from long to wide form, + where you describe how the data will be pivoted, + using a DataFrame. This gives you, the user, + more control over pivoting, where you create a “spec” + data frame that describes exactly how data stored + in the column names becomes variables. 
+ It can come in handy for situations where + `pd.DataFrame.pivot` + seems inadequate for the transformation. - !!! abstract "Version Changed" + !!! info "New in version 0.31.0" - - 0.24.0 - - Added `reset_index`, `names_expand` and `index_expand` parameters. + Examples: + >>> import pandas as pd + >>> import janitor + >>> df = pd.DataFrame( + ... [ + ... {"famid": 1, "birth": 1, "age": 1, "ht": 2.8}, + ... {"famid": 1, "birth": 1, "age": 2, "ht": 3.4}, + ... {"famid": 1, "birth": 2, "age": 1, "ht": 2.9}, + ... {"famid": 1, "birth": 2, "age": 2, "ht": 3.8}, + ... {"famid": 1, "birth": 3, "age": 1, "ht": 2.2}, + ... {"famid": 1, "birth": 3, "age": 2, "ht": 2.9}, + ... {"famid": 2, "birth": 1, "age": 1, "ht": 2.0}, + ... {"famid": 2, "birth": 1, "age": 2, "ht": 3.2}, + ... {"famid": 2, "birth": 2, "age": 1, "ht": 1.8}, + ... {"famid": 2, "birth": 2, "age": 2, "ht": 2.8}, + ... {"famid": 2, "birth": 3, "age": 1, "ht": 1.9}, + ... {"famid": 2, "birth": 3, "age": 2, "ht": 2.4}, + ... {"famid": 3, "birth": 1, "age": 1, "ht": 2.2}, + ... {"famid": 3, "birth": 1, "age": 2, "ht": 3.3}, + ... {"famid": 3, "birth": 2, "age": 1, "ht": 2.3}, + ... {"famid": 3, "birth": 2, "age": 2, "ht": 3.4}, + ... {"famid": 3, "birth": 3, "age": 1, "ht": 2.1}, + ... {"famid": 3, "birth": 3, "age": 2, "ht": 2.9}, + ... ] + ... ) + >>> df + famid birth age ht + 0 1 1 1 2.8 + 1 1 1 2 3.4 + 2 1 2 1 2.9 + 3 1 2 2 3.8 + 4 1 3 1 2.2 + 5 1 3 2 2.9 + 6 2 1 1 2.0 + 7 2 1 2 3.2 + 8 2 2 1 1.8 + 9 2 2 2 2.8 + 10 2 3 1 1.9 + 11 2 3 2 2.4 + 12 3 1 1 2.2 + 13 3 1 2 3.3 + 14 3 2 1 2.3 + 15 3 2 2 3.4 + 16 3 3 1 2.1 + 17 3 3 2 2.9 + >>> spec = {".name": ["ht1", "ht2"], + ... ".value": ["ht", "ht"], + ... "age": [1, 2]} + ... 
spec = pd.DataFrame(spec) + >>> spec + .name .value age + 0 ht1 ht 1 + 1 ht2 ht 2 + >>> pivot_wider_spec(df=df,spec=spec) + famid birth ht1 ht2 + 0 1 1 2.8 3.4 + 1 1 2 2.9 3.8 + 2 1 3 2.2 2.9 + 3 2 1 2.0 3.2 + 4 2 2 1.8 2.8 + 5 2 3 1.9 2.4 + 6 3 1 2.2 3.3 + 7 3 2 2.3 3.4 + 8 3 3 2.1 2.9 Args: df: A pandas DataFrame. From 929d98553f8c429d1286e3752f215635a438e0f9 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Mon, 16 Dec 2024 22:58:30 +1100 Subject: [PATCH 3/7] cleanup --- janitor/functions/pivot.py | 41 +------------------------------------- 1 file changed, 1 insertion(+), 40 deletions(-) diff --git a/janitor/functions/pivot.py b/janitor/functions/pivot.py index 8f2851f8c..d73109078 100644 --- a/janitor/functions/pivot.py +++ b/janitor/functions/pivot.py @@ -2406,7 +2406,7 @@ def pivot_wider_spec( Examples: >>> import pandas as pd - >>> import janitor + >>> from janitor import pivot_wider_spec >>> df = pd.DataFrame( ... [ ... {"famid": 1, "birth": 1, "age": 1, "ht": 2.8}, @@ -2529,42 +2529,3 @@ def pivot_wider_spec( if reset_index and index: return df.reset_index() return df - # if _index: - # df = df.set_index(_index) - # # use a pivot, then rename - # # the below code may work for polars? 
- # if len(grouper) == 1: - # _grouper = grouper[0] - # else: - # _grouper = grouper - # grouped = df.groupby(_grouper, sort=False, observed=True) - # mapper = defaultdict(dict) - # if len(grouper) > 1: - # spec_grouper = pd.MultiIndex.from_frame(spec.loc[:, grouper]) - # else: - # spec_grouper = spec[grouper[0]] - # for grouper, old_name, new_name in zip( - # spec_grouper, spec[".value"], spec[".name"] - # ): - # mapper[grouper].update({old_name: new_name}) - # frames = [] - # for grouper, frame in grouped: - # mapping = mapper[grouper] - # frame = frame.loc[:, [*mapping]] - # frame.columns = frame.columns.map(mapping) - # frames.append(frame) - # frames = pd.concat(frames, axis=1) - # return frames - - -# names_from -> .name -> columns(pandas) -# values_from -> .value-> values(pandas) -# index = df.columns -# - .name.unique() -# - .value.unique() -# - remaining columns -# if no idex, then df.index is used - -# where does .value and other columns intersect? -# group by other columns, and select .value -# df.groupby(index + other column)[.value.unique()] From b74fe7e2160f7edd08f5188613d52e73e2cc2f2d Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Mon, 16 Dec 2024 23:02:49 +1100 Subject: [PATCH 4/7] fix example --- janitor/functions/pivot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/janitor/functions/pivot.py b/janitor/functions/pivot.py index d73109078..71843c3a9 100644 --- a/janitor/functions/pivot.py +++ b/janitor/functions/pivot.py @@ -2452,7 +2452,7 @@ def pivot_wider_spec( >>> spec = {".name": ["ht1", "ht2"], ... ".value": ["ht", "ht"], ... "age": [1, 2]} - ... 
spec = pd.DataFrame(spec) + >>> spec = pd.DataFrame(spec) >>> spec .name .value age 0 ht1 ht 1 From a9a5965dc7ba60f2bb59334cb5b841e8045650be Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Mon, 16 Dec 2024 23:03:45 +1100 Subject: [PATCH 5/7] fix example --- janitor/functions/pivot.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/janitor/functions/pivot.py b/janitor/functions/pivot.py index 71843c3a9..859e35bfd 100644 --- a/janitor/functions/pivot.py +++ b/janitor/functions/pivot.py @@ -2394,10 +2394,13 @@ def pivot_wider_spec( ) -> pd.DataFrame: """A declarative interface to pivot a DataFrame from long to wide form, where you describe how the data will be pivoted, - using a DataFrame. This gives you, the user, + using a DataFrame. + + This gives you, the user, more control over pivoting, where you create a “spec” data frame that describes exactly how data stored in the column names becomes variables. + It can come in handy for situations where `pd.DataFrame.pivot` seems inadequate for the transformation. 
From b7ffe750cdc2d3d3af788cfb5b0103734e65f7a8 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Mon, 16 Dec 2024 23:07:35 +1100 Subject: [PATCH 6/7] fix failing test --- tests/functions/test_pivot_wider_spec.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/functions/test_pivot_wider_spec.py b/tests/functions/test_pivot_wider_spec.py index 6e2a82ac2..a67b59be2 100644 --- a/tests/functions/test_pivot_wider_spec.py +++ b/tests/functions/test_pivot_wider_spec.py @@ -130,4 +130,7 @@ def test_pivot_wider_spec(df_checks): actual = df_checks.pipe( pivot_wider_spec, spec=spec, index=["famid", "birth"] ) - assert_frame_equal(actual, expected) + assert_frame_equal( + actual.sort_values(expected.columns.tolist(), ignore_index=True), + expected.sort_values(expected.columns.tolist(), ignore_index=True), + ) From f629419ebd559d9ea3ec2632a98eb8c7fa206225 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Mon, 16 Dec 2024 23:10:54 +1100 Subject: [PATCH 7/7] fix example --- janitor/functions/pivot.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/janitor/functions/pivot.py b/janitor/functions/pivot.py index 859e35bfd..810243fb0 100644 --- a/janitor/functions/pivot.py +++ b/janitor/functions/pivot.py @@ -424,10 +424,13 @@ def pivot_longer_spec( ) -> pd.DataFrame: """A declarative interface to pivot a DataFrame from wide to long form, where you describe how the data will be unpivoted, - using a DataFrame. This gives you, the user, + using a DataFrame. + + This gives you, the user, more control over unpivoting, where you create a “spec” data frame that describes exactly how data stored in the column names becomes variables. + It can come in handy for situations where [`pivot_longer`][janitor.functions.pivot.pivot_longer] seems inadequate for the transformation. 
@@ -2460,7 +2463,7 @@ def pivot_wider_spec( .name .value age 0 ht1 ht 1 1 ht2 ht 2 - >>> pivot_wider_spec(df=df,spec=spec) + >>> pivot_wider_spec(df=df,spec=spec, index=['famid','birth']) famid birth ht1 ht2 0 1 1 2.8 3.4 1 1 2 2.9 3.8