diff --git a/janitor/functions/__init__.py b/janitor/functions/__init__.py index 8af27b3ff..6e0f14fc6 100644 --- a/janitor/functions/__init__.py +++ b/janitor/functions/__init__.py @@ -57,7 +57,12 @@ from .limit_column_characters import limit_column_characters from .min_max_scale import min_max_scale from .move import move -from .pivot import pivot_longer, pivot_longer_spec, pivot_wider +from .pivot import ( + pivot_longer, + pivot_longer_spec, + pivot_wider, + pivot_wider_spec, +) from .process_text import process_text from .remove_columns import remove_columns from .remove_empty import remove_empty @@ -138,6 +143,7 @@ "pivot_longer", "pivot_longer_spec", "pivot_wider", + "pivot_wider_spec", "process_text", "remove_columns", "remove_empty", diff --git a/janitor/functions/pivot.py b/janitor/functions/pivot.py index 9522ad084..810243fb0 100644 --- a/janitor/functions/pivot.py +++ b/janitor/functions/pivot.py @@ -327,10 +327,14 @@ def pivot_longer( Should be either a single column name, or a list/tuple of column names. `index` should be a list of tuples if the columns are a MultiIndex. + Column selection is possible using the + [`select`][janitor.functions.select.select] syntax. column_names: Name(s) of columns to unpivot. Should be either a single column name or a list/tuple of column names. `column_names` should be a list of tuples if the columns are a MultiIndex. + Column selection is possible using the + [`select`][janitor.functions.select.select] syntax. names_to: Name of new column as a string that will contain what were previously the column names in `column_names`. The default is `variable` if no value is provided. It can @@ -420,10 +424,13 @@ def pivot_longer_spec( ) -> pd.DataFrame: """A declarative interface to pivot a DataFrame from wide to long form, where you describe how the data will be unpivoted, - using a DataFrame. This gives you, the user, + using a DataFrame. 
def pivot_wider_spec(
    df: pd.DataFrame,
    spec: pd.DataFrame,
    index: list | tuple | str | Pattern = None,
    reset_index: bool = True,
) -> pd.DataFrame:
    """A declarative interface to pivot a DataFrame from long to wide form,
    where you describe how the data will be pivoted,
    using a DataFrame.

    This gives you, the user,
    more control over pivoting, where you create a “spec”
    data frame that describes exactly how values stored
    in the rows are spread into new, named columns.

    It can come in handy for situations where
    `pd.DataFrame.pivot`
    seems inadequate for the transformation.

    !!! info "New in version 0.31.0"

    Examples:
        >>> import pandas as pd
        >>> from janitor import pivot_wider_spec
        >>> df = pd.DataFrame(
        ...     [
        ...         {"famid": 1, "birth": 1, "age": 1, "ht": 2.8},
        ...         {"famid": 1, "birth": 1, "age": 2, "ht": 3.4},
        ...         {"famid": 1, "birth": 2, "age": 1, "ht": 2.9},
        ...         {"famid": 1, "birth": 2, "age": 2, "ht": 3.8},
        ...         {"famid": 1, "birth": 3, "age": 1, "ht": 2.2},
        ...         {"famid": 1, "birth": 3, "age": 2, "ht": 2.9},
        ...         {"famid": 2, "birth": 1, "age": 1, "ht": 2.0},
        ...         {"famid": 2, "birth": 1, "age": 2, "ht": 3.2},
        ...         {"famid": 2, "birth": 2, "age": 1, "ht": 1.8},
        ...         {"famid": 2, "birth": 2, "age": 2, "ht": 2.8},
        ...         {"famid": 2, "birth": 3, "age": 1, "ht": 1.9},
        ...         {"famid": 2, "birth": 3, "age": 2, "ht": 2.4},
        ...         {"famid": 3, "birth": 1, "age": 1, "ht": 2.2},
        ...         {"famid": 3, "birth": 1, "age": 2, "ht": 3.3},
        ...         {"famid": 3, "birth": 2, "age": 1, "ht": 2.3},
        ...         {"famid": 3, "birth": 2, "age": 2, "ht": 3.4},
        ...         {"famid": 3, "birth": 3, "age": 1, "ht": 2.1},
        ...         {"famid": 3, "birth": 3, "age": 2, "ht": 2.9},
        ...     ]
        ... )
        >>> df
            famid  birth  age   ht
        0       1      1    1  2.8
        1       1      1    2  3.4
        2       1      2    1  2.9
        3       1      2    2  3.8
        4       1      3    1  2.2
        5       1      3    2  2.9
        6       2      1    1  2.0
        7       2      1    2  3.2
        8       2      2    1  1.8
        9       2      2    2  2.8
        10      2      3    1  1.9
        11      2      3    2  2.4
        12      3      1    1  2.2
        13      3      1    2  3.3
        14      3      2    1  2.3
        15      3      2    2  3.4
        16      3      3    1  2.1
        17      3      3    2  2.9
        >>> spec = {".name": ["ht1", "ht2"],
        ...         ".value": ["ht", "ht"],
        ...         "age": [1, 2]}
        >>> spec = pd.DataFrame(spec)
        >>> spec
          .name .value  age
        0   ht1     ht    1
        1   ht2     ht    2
        >>> pivot_wider_spec(df=df,spec=spec, index=['famid','birth'])
           famid  birth  ht1  ht2
        0      1      1  2.8  3.4
        1      1      2  2.9  3.8
        2      1      3  2.2  2.9
        3      2      1  2.0  3.2
        4      2      2  1.8  2.8
        5      2      3  1.9  2.4
        6      3      1  2.2  3.3
        7      3      2  2.3  3.4
        8      3      3  2.1  2.9

    Args:
        df: A pandas DataFrame.
        spec: A specification DataFrame.
            At a minimum, the spec DataFrame
            must have a '.name' column and a '.value' column,
            in that order, as its first two columns.
            The '.name' column should contain
            the names of the columns in the output DataFrame.
            The '.value' column should contain the name of the column(s)
            in the source DataFrame that will serve as the values.
            Additional columns in spec will serve as the columns
            to be flipped to wide form.
            Note that these additional columns should already exist
            in the source DataFrame.
        index: Name(s) of columns to use as identifier variables.
            It should be either a single column name, or a list of column names.
            If `index` is not provided, the DataFrame's index is used.
            Column selection is possible using the
            [`select`][janitor.functions.select.select] syntax.
        reset_index: Determines whether to reset the `index`.
            Applicable only if `index` is provided.

    Raises:
        ValueError: If the spec's columns are not unique,
            if '.name' and '.value' are not the first two columns
            (in that order), or if the spec has no additional columns.
        KeyError: If the spec lacks a '.name' or a '.value' column.

    Returns:
        A pandas DataFrame that has been pivoted from long to wide form.
    """  # noqa: E501
    check("spec", spec, [pd.DataFrame])
    check("reset_index", reset_index, [bool])
    if not spec.columns.is_unique:
        raise ValueError("Kindly ensure the spec's columns is unique.")
    if ".name" not in spec.columns:
        raise KeyError(
            "Kindly ensure the spec DataFrame has a `.name` column."
        )
    if ".value" not in spec.columns:
        raise KeyError(
            "Kindly ensure the spec DataFrame has a `.value` column."
        )
    if spec.columns.tolist()[:2] != [".name", ".value"]:
        raise ValueError(
            "The first two columns of the spec DataFrame "
            "should be '.name' and '.value', "
            "with '.name' coming before '.value'."
        )
    if spec.columns.size == 2:
        raise ValueError(
            "Kindly provide the column(s) "
            "to use to make new frame’s columns"
        )
    # Every spec column after '.name'/'.value' names a source column
    # whose values are spread into the new header.
    pivot_kwargs = {
        "columns": spec.columns[2:],
        "values": spec[".value"].unique(),
    }
    if index is not None:
        index = _select_index([index], df, axis="columns")
        index = df.columns[index].tolist()
        # Only forward `index` when the caller supplied one, so that pandas
        # falls back to the frame's existing index otherwise.
        # NOTE(review): recent pandas releases appear to treat an explicit
        # `index=None` as a column label - confirm against supported versions.
        pivot_kwargs["index"] = index
    df = df.pivot(**pivot_kwargs)
    # Build a ('.value', *extra columns) -> '.name' lookup. Selecting the
    # column by label always yields a Series; `.squeeze()` would collapse
    # a single-row spec into a scalar and break the reindex below.
    lookup_levels = spec.columns[1:].tolist()
    name_lookup = spec.set_index(lookup_levels)[".name"]
    # Keep only the combinations listed in the spec, in spec order,
    # then swap the MultiIndex header for the requested flat names.
    df = df.reindex(columns=name_lookup.index)
    df.columns = df.columns.map(name_lookup)
    if reset_index and index:
        return df.reset_index()
    return df
import re

import pandas as pd
import pytest
from pandas.testing import assert_frame_equal

from janitor import pivot_wider_spec


@pytest.fixture
def df_checks():
    """Long-form frame with one height per (famid, birth, age) combination."""
    return pd.DataFrame(
        [
            {"famid": 1, "birth": 1, "age": 1, "ht": 2.8},
            {"famid": 1, "birth": 1, "age": 2, "ht": 3.4},
            {"famid": 1, "birth": 2, "age": 1, "ht": 2.9},
            {"famid": 1, "birth": 2, "age": 2, "ht": 3.8},
            {"famid": 1, "birth": 3, "age": 1, "ht": 2.2},
            {"famid": 1, "birth": 3, "age": 2, "ht": 2.9},
            {"famid": 2, "birth": 1, "age": 1, "ht": 2.0},
            {"famid": 2, "birth": 1, "age": 2, "ht": 3.2},
            {"famid": 2, "birth": 2, "age": 1, "ht": 1.8},
            {"famid": 2, "birth": 2, "age": 2, "ht": 2.8},
            {"famid": 2, "birth": 3, "age": 1, "ht": 1.9},
            {"famid": 2, "birth": 3, "age": 2, "ht": 2.4},
            {"famid": 3, "birth": 1, "age": 1, "ht": 2.2},
            {"famid": 3, "birth": 1, "age": 2, "ht": 3.3},
            {"famid": 3, "birth": 2, "age": 1, "ht": 2.3},
            {"famid": 3, "birth": 2, "age": 2, "ht": 3.4},
            {"famid": 3, "birth": 3, "age": 1, "ht": 2.1},
            {"famid": 3, "birth": 3, "age": 2, "ht": 2.9},
        ]
    )


# Shared spec: spread `ht` into `ht1`/`ht2`, keyed on the `age` column.
spec = pd.DataFrame(
    {".name": ["ht1", "ht2"], ".value": ["ht", "ht"], "age": [1, 2]}
)


def test_spec_is_a_dataframe(df_checks):
    """Raise TypeError if spec is not a DataFrame."""
    with pytest.raises(
        TypeError,
        match="spec should be one of.+",
    ):
        df_checks.pipe(pivot_wider_spec, spec={".name": "name"})


def test_spec_columns_has_dot_name(df_checks):
    """Raise KeyError if '.name' not in spec's columns."""
    # `match` is a regular expression, so the literal message is escaped.
    msg = "Kindly ensure the spec DataFrame has a `.name` column."
    with pytest.raises(KeyError, match=re.escape(msg)):
        df_checks.pipe(
            pivot_wider_spec,
            spec=spec.set_axis(labels=[".value", ".blabla", "age"], axis=1),
        )


def test_spec_columns_has_dot_value(df_checks):
    """Raise KeyError if '.value' not in spec's columns."""
    msg = "Kindly ensure the spec DataFrame has a `.value` column."
    with pytest.raises(KeyError, match=re.escape(msg)):
        df_checks.pipe(
            pivot_wider_spec,
            spec=spec.set_axis(labels=[".name", ".blabla", "age"], axis=1),
        )


def test_spec_columns_name_value_order(df_checks):
    """
    Raise ValueError if '.name' and '.value'
    are not the first two labels
    in spec's columns.
    """
    prefix = re.escape(
        "The first two columns of the spec DataFrame "
        "should be '.name' and '.value',"
    )
    with pytest.raises(ValueError, match=prefix + ".+"):
        df_checks.pipe(
            pivot_wider_spec,
            spec=spec.loc[:, [".value", ".name", "age"]],
        )


def test_spec_columns_len_2(df_checks):
    """
    Raise ValueError if '.name' and '.value'
    are the only columns in spec.
    """
    msg = "Kindly provide the column(s) "
    msg += "to use to make new frame’s columns"
    with pytest.raises(
        ValueError,
        match=re.escape(msg),
    ):
        df_checks.pipe(
            pivot_wider_spec,
            spec=spec.loc[:, [".name", ".value"]],
        )


def test_spec_columns_not_unique(df_checks):
    """Raise ValueError if the spec's columns is not unique."""
    msg = "Kindly ensure the spec's columns is unique."
    with pytest.raises(ValueError, match=re.escape(msg)):
        df_checks.pipe(
            pivot_wider_spec,
            spec=spec.set_axis(labels=[".name", ".name", "age"], axis=1),
        )


def test_pivot_wider_spec(df_checks):
    """Output matches a plain `DataFrame.pivot` with prefixed column names."""
    expected = (
        df_checks.pivot(index=["famid", "birth"], columns="age", values="ht")
        .add_prefix("ht")
        .rename_axis(columns=None)
        .reset_index()
    )
    actual = df_checks.pipe(
        pivot_wider_spec, spec=spec, index=["famid", "birth"]
    )
    # Sort both frames on the same keys so row order cannot cause a mismatch.
    sort_keys = expected.columns.tolist()
    assert_frame_equal(
        actual.sort_values(sort_keys, ignore_index=True),
        expected.sort_values(sort_keys, ignore_index=True),
    )