From a14061c61bb71cbd5041c3d95f95f54507a9f08e Mon Sep 17 00:00:00 2001 From: Samuel Oranyeli Date: Sun, 14 Jul 2024 08:03:34 +1000 Subject: [PATCH] `row_to_names` improvement (#1379) This change improves `row_to_names` for polars dataframes, primarily with speed enhancements. --- janitor/functions/row_to_names.py | 187 +++++++++++++----- janitor/polars/complete.py | 2 +- janitor/polars/pivot_longer.py | 2 +- janitor/polars/row_to_names.py | 144 ++++++++------ tests/functions/test_row_to_names.py | 34 +++- .../functions/test_row_to_names_polars.py | 86 ++++---- 6 files changed, 284 insertions(+), 171 deletions(-) diff --git a/janitor/functions/row_to_names.py b/janitor/functions/row_to_names.py index 2f529c8b5..ee97d8531 100644 --- a/janitor/functions/row_to_names.py +++ b/janitor/functions/row_to_names.py @@ -2,7 +2,7 @@ from __future__ import annotations -import warnings +from functools import singledispatch import numpy as np import pandas as pd @@ -15,7 +15,7 @@ @deprecated_alias(row_number="row_numbers", remove_row="remove_rows") def row_to_names( df: pd.DataFrame, - row_numbers: int | list = 0, + row_numbers: int | list | slice = 0, remove_rows: bool = False, remove_rows_above: bool = False, reset_index: bool = False, @@ -47,7 +47,7 @@ def row_to_names( 1 9 y >>> df.row_to_names([0,1], remove_rows=True, reset_index=True) nums chars - 6 x + 6 x 0 9 y Remove rows above the elevated row and the elevated row itself. @@ -72,8 +72,7 @@ def row_to_names( Args: df: A pandas DataFrame. row_numbers: Position of the row(s) containing the variable names. - Note that indexing starts from 0. It can also be a list, - in which case, a MultiIndex column is created. + It can be an integer, a list or a slice. Defaults to 0 (first row). remove_rows: Whether the row(s) should be removed from the DataFrame. remove_rows_above: Whether the row(s) above the selected row should @@ -83,53 +82,149 @@ def row_to_names( Returns: A pandas DataFrame with set column names. 
""" # noqa: E501 - if not pd.options.mode.copy_on_write: - df = df.copy() - - check("row_numbers", row_numbers, [int, list]) - if isinstance(row_numbers, list): - for entry in row_numbers: - check("entry in the row_numbers argument", entry, [int]) - - warnings.warn( - "The function row_to_names will, in the official 1.0 release, " - "change its behaviour to reset the dataframe's index by default. " - "You can prepare for this change right now by explicitly setting " - "`reset_index=True` when calling on `row_to_names`." + + return _row_to_names( + row_numbers, + df=df, + remove_rows=remove_rows, + remove_rows_above=remove_rows_above, + reset_index=reset_index, + ) + + +@singledispatch +def _row_to_names( + row_numbers, df, remove_rows, remove_rows_above, reset_index +) -> pd.DataFrame: + """ + Base function for row_to_names. + """ + raise TypeError( + "row_numbers should be either an integer, " + "a slice or a list; " + f"instead got type {type(row_numbers).__name__}" ) - # should raise if positional indexers are missing - # IndexError: positional indexers are out-of-bounds - headers = df.iloc[row_numbers] + + +@_row_to_names.register(int) # noqa: F811 +def _row_to_names_dispatch( # noqa: F811 + row_numbers, df, remove_rows, remove_rows_above, reset_index +): + df_ = df[:] + headers = df_.iloc[row_numbers] + df_.columns = headers + df_.columns.name = None + if not remove_rows and not remove_rows_above and not reset_index: + return df_ + if not remove_rows and not remove_rows_above and reset_index: + return df_.reset_index(drop=True) + + len_df = len(df_) + arrays = [arr._values for _, arr in df_.items()] + if remove_rows_above and remove_rows: + indexer = np.arange(row_numbers + 1, len_df) + elif remove_rows_above: + indexer = np.arange(row_numbers, len_df) + elif remove_rows: + indexer = np.arange(len_df) + mask = np.ones(len_df, dtype=np.bool_) + mask[row_numbers] = False + indexer = indexer[mask] + arrays = {num: arr[indexer] for num, arr in enumerate(arrays)} + 
if reset_index: + df_index = pd.RangeIndex(start=0, stop=indexer.size) + else: + df_index = df_.index[indexer] + _df = pd.DataFrame(data=arrays, index=df_index, copy=False) + _df.columns = df_.columns + return _df + + +@_row_to_names.register(slice) # noqa: F811 +def _row_to_names_dispatch( # noqa: F811 + row_numbers, df, remove_rows, remove_rows_above, reset_index +): + if row_numbers.step is not None: + raise ValueError( + "The step argument for slice is not supported in row_to_names." + ) + df_ = df[:] + headers = df_.iloc[row_numbers] if isinstance(headers, pd.DataFrame) and (len(headers) == 1): headers = headers.squeeze() - if isinstance(headers, pd.Series): - headers = pd.Index(headers) + df_.columns = headers + df_.columns.name = None else: - headers = [entry.array for _, entry in headers.items()] + headers = [array._values for _, array in headers.items()] headers = pd.MultiIndex.from_tuples(headers) + df_.columns = headers + if not remove_rows and not remove_rows_above and not reset_index: + return df_ + if not remove_rows and not remove_rows_above and reset_index: + return df_.reset_index(drop=True) + len_df = len(df_) + arrays = [arr._values for _, arr in df_.items()] + if remove_rows_above and remove_rows: + indexer = np.arange(row_numbers.stop, len_df) + elif remove_rows_above: + indexer = np.arange(row_numbers.start, len_df) + elif remove_rows: + indexer = np.arange(len_df) + mask = np.ones(len_df, dtype=np.bool_) + mask[row_numbers] = False + indexer = indexer[mask] + arrays = {num: arr[indexer] for num, arr in enumerate(arrays)} + if reset_index: + df_index = pd.RangeIndex(start=0, stop=indexer.size) + else: + df_index = df_.index[indexer] + _df = pd.DataFrame(data=arrays, index=df_index, copy=False) + _df.columns = df_.columns + return _df - df.columns = headers - df.columns.name = None - df_index = df.index +@_row_to_names.register(list) # noqa: F811 +def _row_to_names_dispatch( # noqa: F811 + row_numbers, df, remove_rows, remove_rows_above, 
reset_index +): if remove_rows_above: - if isinstance(row_numbers, list): - if not (np.diff(row_numbers) == 1).all(): - raise ValueError( - "The remove_rows_above argument is applicable " - "only if the row_numbers argument is an integer, " - "or the integers in a list are consecutive increasing, " - "with a difference of 1." - ) - tail = row_numbers[0] - else: - tail = row_numbers - df = df.iloc[tail:] - if remove_rows: - if isinstance(row_numbers, int): - row_numbers = [row_numbers] - df_index = df.index.symmetric_difference(df_index[row_numbers]) - df = df.loc[df_index] + raise ValueError( + "The remove_rows_above argument is applicable " + "only if the row_numbers argument is an integer " + "or a slice." + ) + + for entry in row_numbers: + check("entry in the row_numbers argument", entry, [int]) + + df_ = df[:] + headers = df_.iloc[row_numbers] + if isinstance(headers, pd.DataFrame) and (len(headers) == 1): + headers = headers.squeeze() + df_.columns = headers + df_.columns.name = None + else: + headers = [array._values for _, array in headers.items()] + headers = pd.MultiIndex.from_tuples(headers) + df_.columns = headers + + if not remove_rows and reset_index: + return df_.reset_index(drop=True) + if not remove_rows and not reset_index: + return df_ + + len_df = len(df_) + arrays = [arr._values for _, arr in df_.items()] + indexer = np.arange(len_df) + mask = np.ones(len_df, dtype=np.bool_) + mask[row_numbers] = False + indexer = indexer[mask] + + arrays = {num: arr[indexer] for num, arr in enumerate(arrays)} if reset_index: - df.index = range(len(df)) - return df + df_index = pd.RangeIndex(start=0, stop=indexer.size) + else: + df_index = df_.index[indexer] + _df = pd.DataFrame(data=arrays, index=df_index, copy=False) + _df.columns = df_.columns + return _df diff --git a/janitor/polars/complete.py b/janitor/polars/complete.py index 546f903bc..ddd6f0a2d 100644 --- a/janitor/polars/complete.py +++ b/janitor/polars/complete.py @@ -11,7 +11,7 @@ try: import polars 
as pl import polars.selectors as cs - from polars.type_aliases import ColumnNameOrSelector + from polars._typing import ColumnNameOrSelector except ImportError: import_message( submodule="polars", diff --git a/janitor/polars/pivot_longer.py b/janitor/polars/pivot_longer.py index 9dea2581f..15cce254c 100644 --- a/janitor/polars/pivot_longer.py +++ b/janitor/polars/pivot_longer.py @@ -8,7 +8,7 @@ try: import polars as pl - from polars.type_aliases import ColumnNameOrSelector + from polars._typing import ColumnNameOrSelector except ImportError: import_message( submodule="polars", diff --git a/janitor/polars/row_to_names.py b/janitor/polars/row_to_names.py index d67f30f6b..9c90e8e89 100644 --- a/janitor/polars/row_to_names.py +++ b/janitor/polars/row_to_names.py @@ -2,9 +2,11 @@ from __future__ import annotations +from functools import singledispatch + from janitor.utils import check, import_message -from .polars_flavor import register_dataframe_method, register_lazyframe_method +from .polars_flavor import register_dataframe_method try: import polars as pl @@ -17,20 +19,17 @@ ) -@register_lazyframe_method @register_dataframe_method def row_to_names( - df: pl.DataFrame | pl.LazyFrame, - row_numbers: int | list = 0, + df: pl.DataFrame, + row_numbers: int | list | slice = 0, remove_rows: bool = False, remove_rows_above: bool = False, separator: str = "_", -) -> pl.DataFrame | pl.LazyFrame: +) -> pl.DataFrame: """ Elevates a row, or rows, to be the column names of a DataFrame. - `row_to_names` can also be applied to a LazyFrame. - Examples: Replace column names with the first row. @@ -104,8 +103,7 @@ def row_to_names( Args: row_numbers: Position of the row(s) containing the variable names. - Note that indexing starts from 0. It can also be a list. - Defaults to 0 (first row). + It can be an integer, list or a slice. remove_rows: Whether the row(s) should be removed from the DataFrame. 
remove_rows_above: Whether the row(s) above the selected row should be removed from the DataFrame. @@ -113,71 +111,93 @@ def row_to_names( if row_numbers is a list of integers. Default is '_'. Returns: - A polars DataFrame/LazyFrame. + A polars DataFrame. """ # noqa: E501 return _row_to_names( + row_numbers, df=df, - row_numbers=row_numbers, remove_rows=remove_rows, remove_rows_above=remove_rows_above, separator=separator, ) +@singledispatch def _row_to_names( - df: pl.DataFrame | pl.LazyFrame, - row_numbers: int | list, - remove_rows: bool, - remove_rows_above: bool, - separator: str, -) -> pl.DataFrame | pl.LazyFrame: + row_numbers, df, remove_rows, remove_rows_above, separator +) -> pl.DataFrame: """ - Function to convert rows in the DataFrame to column names. + Base function for row_to_names. """ - check("separator", separator, [str]) - check("row_numbers", row_numbers, [int, list]) - row_numbers_is_a_list = False - if isinstance(row_numbers, list): - row_numbers_is_a_list = True - for entry in row_numbers: - check("entry in the row_numbers argument", entry, [int]) - expression = ( - pl.all() - .gather(row_numbers) - .cast(pl.String) - .implode() - .list.join(separator=separator) + raise TypeError( + "row_numbers should be either an integer, " + "a slice or a list; " + f"instead got type {type(row_numbers).__name__}" + ) + + +@_row_to_names.register(int) # noqa: F811 +def _row_to_names_dispatch( # noqa: F811 + row_numbers, df, remove_rows, remove_rows_above, separator +): + headers = df.row(row_numbers, named=True) + headers = {col: str(repl) for col, repl in headers.items()} + df = df.rename(mapping=headers) + if remove_rows_above and remove_rows: + return df.slice(row_numbers + 1) + elif remove_rows_above: + return df.slice(row_numbers) + elif remove_rows: + expression = pl.int_range(pl.len()).ne(row_numbers) + return df.filter(expression) + return df + + +@_row_to_names.register(slice) # noqa: F811 +def _row_to_names_dispatch( # noqa: F811 + row_numbers, df, 
remove_rows, remove_rows_above, separator +): + if row_numbers.step is not None: + raise ValueError( + "The step argument for slice is not supported in row_to_names." ) - expression = pl.struct(expression) - else: - expression = pl.all().gather(row_numbers).cast(pl.String) - expression = pl.struct(expression) - mapping = df.select(expression) - if isinstance(mapping, pl.LazyFrame): - mapping = mapping.collect() - mapping = mapping.to_series(0)[0] - df = df.rename(mapping=mapping) - if remove_rows_above: - if row_numbers_is_a_list: - if not pl.Series(row_numbers).diff().drop_nulls().eq(1).all(): - raise ValueError( - "The remove_rows_above argument is applicable " - "only if the row_numbers argument is an integer, " - "or the integers in a list are consecutive increasing, " - "with a difference of 1." - ) - if remove_rows: - tail = row_numbers[-1] if row_numbers_is_a_list else row_numbers - tail += 1 - else: - tail = row_numbers[0] if row_numbers_is_a_list else row_numbers - df = df.slice(offset=tail) + headers = df.slice(row_numbers.start, row_numbers.stop - row_numbers.start) + expression = pl.all().str.concat(delimiter=separator) + headers = headers.select(expression).row(0, named=True) + headers = {col: str(repl) for col, repl in headers.items()} + df = df.rename(mapping=headers) + if remove_rows_above and remove_rows: + return df.slice(row_numbers.stop) + elif remove_rows_above: + return df.slice(row_numbers.start) elif remove_rows: - idx = "".join(df.columns) - df = df.with_row_index(name=idx) - if row_numbers_is_a_list: - df = df.filter(~pl.col(idx).is_in(row_numbers)) - else: - df = df.filter(pl.col(idx) != row_numbers) - df = df.drop(idx) + expression = pl.int_range(pl.len()).is_between( + row_numbers.start, row_numbers.stop, closed="left" + ) + return df.filter(~expression) + return df + + +@_row_to_names.register(list) # noqa: F811 +def _row_to_names_dispatch( # noqa: F811 + row_numbers, df, remove_rows, remove_rows_above, separator +): + if 
remove_rows_above: + raise ValueError( + "The remove_rows_above argument is applicable " + "only if the row_numbers argument is an integer " + "or a slice." + ) + + for entry in row_numbers: + check("entry in the row_numbers argument", entry, [int]) + + expression = pl.all().gather(row_numbers) + expression = expression.str.concat(delimiter=separator) + headers = df.select(expression).row(0, named=True) + headers = {col: str(repl) for col, repl in headers.items()} + df = df.rename(mapping=headers) + if remove_rows: + expression = pl.int_range(pl.len()).is_in(row_numbers) + return df.filter(~expression) return df diff --git a/tests/functions/test_row_to_names.py b/tests/functions/test_row_to_names.py index f966a4966..758afe44d 100644 --- a/tests/functions/test_row_to_names.py +++ b/tests/functions/test_row_to_names.py @@ -64,9 +64,9 @@ def test_row_to_names_delete_above(dataframe): @pytest.mark.functions -def test_row_to_names_delete_above_list(dataframe): - "Test output if row_numbers is a list" - df = dataframe.row_to_names([2, 3], remove_rows_above=True) +def test_row_to_names_delete_above_slice(dataframe): + "Test output if row_numbers is a slice" + df = dataframe.row_to_names(slice(2, 4), remove_rows_above=True) assert df.iloc[0, 0] == 3 assert df.iloc[0, 1] == 3.234_612_5 assert df.iloc[0, 2] == 3 @@ -75,13 +75,27 @@ def test_row_to_names_delete_above_list(dataframe): @pytest.mark.functions -def test_row_to_names_delete_above_list_non_consecutive(dataframe): - "Raise if row_numbers is a list, but non consecutive" - msg = "The remove_rows_above argument is applicable " - msg += "only if the row_numbers argument is an integer, " - msg += "or the integers in a list are consecutive increasing, " - msg += "with a difference of 1." 
- with pytest.raises(ValueError, match=msg): +def test_row_to_names_delete_above_delete_rows(dataframe): + """ + Test output for remove_rows=True + and remove_rows_above=True + """ + df = dataframe.row_to_names( + slice(2, 4), remove_rows=True, remove_rows_above=True + ) + assert df.iloc[0, 0] == 2 + assert df.iloc[0, 1] == 2.456234 + assert df.iloc[0, 2] == 2 + assert df.iloc[0, 3] == "leopard" + assert df.iloc[0, 4] == "Shanghai" + + +@pytest.mark.functions +def test_row_to_names_delete_above_is_a_list(dataframe): + "Raise if row_numbers is a list" + with pytest.raises( + ValueError, match=r"The remove_rows_above argument is applicable.+" + ): dataframe.row_to_names([1, 3], remove_rows_above=True) diff --git a/tests/polars/functions/test_row_to_names_polars.py b/tests/polars/functions/test_row_to_names_polars.py index 47c01fb92..1c81660e0 100644 --- a/tests/polars/functions/test_row_to_names_polars.py +++ b/tests/polars/functions/test_row_to_names_polars.py @@ -3,35 +3,36 @@ import janitor.polars # noqa: F401 -df = pl.DataFrame( - { - "Bell__Chart": [1.234_523_45, 2.456_234, 3.234_612_5] * 3, - "decorated-elephant": [1, 2, 3] * 3, - "animals@#$%^": ["rabbit", "leopard", "lion"] * 3, - "cities": ["Cambridge", "Shanghai", "Basel"] * 3, - } -) - - -@pytest.mark.parametrize("df", [df, df.lazy()]) -def test_separator_type(df): - """ - Raise if separator is not a string - """ - with pytest.raises(TypeError, match="separator should be.+"): - df.row_to_names([1, 2], separator=1) + +@pytest.fixture +def df(): + """fixture for tests""" + return pl.DataFrame( + { + "Bell__Chart": [1.234_523_45, 2.456_234, 3.234_612_5] * 3, + "decorated-elephant": [1, 2, 3] * 3, + "animals@#$%^": ["rabbit", "leopard", "lion"] * 3, + "cities": ["Cambridge", "Shanghai", "Basel"] * 3, + } + ) -@pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_numbers_type(df): """ - Raise if row_numbers is not an int/list + Raise if row_numbers is not an int/slice/list """ with 
pytest.raises(TypeError, match="row_numbers should be.+"): df.row_to_names({1, 2}) -@pytest.mark.parametrize("df", [df, df.lazy()]) +def test_row_numbers_slice_step(df): + """ + Raise if row_numbers is a slice and step is passed. + """ + with pytest.raises(ValueError, match="The step argument for slice.+"): + df.row_to_names(slice(1, 3, 1)) + + def test_row_numbers_list_type(df): """ Raise if row_numbers is a list @@ -43,7 +44,6 @@ def test_row_numbers_list_type(df): df.row_to_names(["1", 2]) -@pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names(df): df = df.row_to_names(2) assert df.columns[0] == "3.2346125" @@ -52,7 +52,14 @@ def test_row_to_names(df): assert df.columns[3] == "Basel" -@pytest.mark.parametrize("df", [df, df.lazy()]) +def test_row_to_names_slice(df): + df = df.row_to_names(slice(2, 3)) + assert df.columns[0] == "3.2346125" + assert df.columns[1] == "3" + assert df.columns[2] == "lion" + assert df.columns[3] == "Basel" + + def test_row_to_names_single_list(df): "Test output if row_numbers is a list, and contains a single item." df = df.row_to_names([2]) @@ -62,7 +69,6 @@ def test_row_to_names_single_list(df): assert df.columns[3] == "Basel" -@pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names_list(df): "Test output if row_numbers is a list." 
df = df.row_to_names([1, 2]) @@ -72,87 +78,65 @@ def test_row_to_names_list(df): assert df.columns[3] == "Shanghai_Basel" -@pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names_delete_this_row(df): df = df.row_to_names(2, remove_rows=True) - if isinstance(df, pl.LazyFrame): - df = df.collect() assert df.to_series(0)[0] == 1.234_523_45 assert df.to_series(1)[0] == 1 assert df.to_series(2)[0] == "rabbit" assert df.to_series(3)[0] == "Cambridge" -@pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names_list_delete_this_row(df): df = df.row_to_names([2], remove_rows=True) - if isinstance(df, pl.LazyFrame): - df = df.collect() assert df.to_series(0)[0] == 1.234_523_45 assert df.to_series(1)[0] == 1 assert df.to_series(2)[0] == "rabbit" assert df.to_series(3)[0] == "Cambridge" -@pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names_delete_above(df): df = df.row_to_names(2, remove_rows_above=True) - if isinstance(df, pl.LazyFrame): - df = df.collect() assert df.to_series(0)[0] == 3.234_612_5 assert df.to_series(1)[0] == 3 assert df.to_series(2)[0] == "lion" assert df.to_series(3)[0] == "Basel" -@pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names_delete_above_list(df): "Test output if row_numbers is a list" - df = df.row_to_names([2, 3], remove_rows_above=True) - if isinstance(df, pl.LazyFrame): - df = df.collect() + df = df.row_to_names(slice(2, 4), remove_rows_above=True) assert df.to_series(0)[0] == 3.234_612_5 assert df.to_series(1)[0] == 3 assert df.to_series(2)[0] == "lion" assert df.to_series(3)[0] == "Basel" -@pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names_delete_above_delete_rows(df): """ Test output for remove_rows=True and remove_rows_above=True """ - df = df.row_to_names([2, 3], remove_rows=True, remove_rows_above=True) - if isinstance(df, pl.LazyFrame): - df = df.collect() + df = df.row_to_names(slice(2, 4), remove_rows=True, remove_rows_above=True) assert df.to_series(0)[0] == 
2.456234 assert df.to_series(1)[0] == 2 assert df.to_series(2)[0] == "leopard" assert df.to_series(3)[0] == "Shanghai" -@pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names_delete_above_delete_rows_scalar(df): """ Test output for remove_rows=True and remove_rows_above=True """ df = df.row_to_names(2, remove_rows=True, remove_rows_above=True) - if isinstance(df, pl.LazyFrame): - df = df.collect() assert df.to_series(0)[0] == 1.23452345 assert df.to_series(1)[0] == 1 assert df.to_series(2)[0] == "rabbit" assert df.to_series(3)[0] == "Cambridge" -@pytest.mark.parametrize("df", [df, df.lazy()]) -def test_row_to_names_delete_above_list_non_consecutive(df): - "Raise if row_numbers is a list, but non consecutive" - msg = "The remove_rows_above argument is applicable " - msg += "only if the row_numbers argument is an integer, " - msg += "or the integers in a list are consecutive increasing, " - msg += "with a difference of 1." - with pytest.raises(ValueError, match=msg): +def test_row_to_names_not_a_slice_remove_rows_above(df): + with pytest.raises( + ValueError, match=r"The remove_rows_above argument is applicable.+" + ): df.row_to_names([1, 3], remove_rows_above=True)