From 7cf2bb27709b0df5fb534f35c9a107522519c566 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Thu, 20 Jun 2024 21:11:58 +1000 Subject: [PATCH 01/17] minor edits --- janitor/polars/row_to_names.py | 88 +++++++++++-------- .../functions/test_row_to_names_polars.py | 35 +++++--- 2 files changed, 78 insertions(+), 45 deletions(-) diff --git a/janitor/polars/row_to_names.py b/janitor/polars/row_to_names.py index 7fe1b0b9e..70d48b208 100644 --- a/janitor/polars/row_to_names.py +++ b/janitor/polars/row_to_names.py @@ -17,7 +17,7 @@ def _row_to_names( df: pl.DataFrame | pl.LazyFrame, - row_numbers: int | list, + row_numbers: int | list | slice, remove_rows: bool, remove_rows_above: bool, separator: str, @@ -26,49 +26,67 @@ def _row_to_names( Function to convert rows in the DataFrame to column names. """ check("separator", separator, [str]) - check("row_numbers", row_numbers, [int, list]) - row_numbers_is_a_list = False - if isinstance(row_numbers, list): - row_numbers_is_a_list = True + if isinstance(row_numbers, int): + row_numbers = slice(row_numbers, row_numbers + 1) + elif isinstance(row_numbers, slice): + if row_numbers.step is not None: + raise ValueError( + "The step argument for slice is not supported in row_to_names." + ) + elif isinstance(row_numbers, list): for entry in row_numbers: check("entry in the row_numbers argument", entry, [int]) - expression = ( - pl.all() - .gather(row_numbers) - .cast(pl.String) - .implode() - .list.join(separator=separator) + else: + raise TypeError( + "row_numbers should be either an integer, " + "a slice or a list; " + f"instead got type {type(row_numbers).__name__}" ) + is_a_slice = isinstance(row_numbers, slice) + if is_a_slice: + expression = pl.all().str.concat(delimiter=separator) expression = pl.struct(expression) + offset = row_numbers.start + length = row_numbers.stop - row_numbers.start + mapping = df.slice( + offset=offset, + length=length, + ) + mapping = mapping.select(expression) else: - expression = pl.all().gather(row_numbers).cast(pl.String) + expression = pl.all().gather(row_numbers) + expression = expression.str.concat(delimiter=separator) expression = pl.struct(expression) - mapping = df.select(expression) - if isinstance(mapping, pl.LazyFrame): + mapping = df.select(expression) + if isinstance(df, pl.LazyFrame): mapping = mapping.collect() mapping = mapping.to_series(0)[0] df = df.rename(mapping=mapping) if remove_rows_above: - if row_numbers_is_a_list: - if not pl.Series(row_numbers).diff().drop_nulls().eq(1).all(): - raise ValueError( - "The remove_rows_above argument is applicable " - "only if the row_numbers argument is an integer, " - "or the integers in a list are consecutive increasing, " - "with a difference of 1." - ) + if not is_a_slice: + raise ValueError( + "The remove_rows_above argument is applicable " + "only if the row_numbers argument is an integer " + "or a slice." + ) if remove_rows: - tail = row_numbers[-1] if row_numbers_is_a_list else row_numbers - tail += 1 - else: - tail = row_numbers[0] if row_numbers_is_a_list else row_numbers - df = df.slice(offset=tail) - elif remove_rows: - idx = "".join(df.columns) - df = df.with_row_index(name=idx) - if row_numbers_is_a_list: - df = df.filter(~pl.col(idx).is_in(row_numbers)) - else: - df = df.filter(pl.col(idx) != row_numbers) - df = df.drop(idx) + return df.slice(offset=row_numbers.stop) + return df.slice(offset=row_numbers.start) + + if remove_rows: + if is_a_slice: + df = [ + df.slice(offset=0, length=row_numbers.start), + df.slice(offset=row_numbers.stop), + ] + return pl.concat(df, rechunk=True) + name = "".join(df.columns) + name = f"{name}_" + df = ( + df.with_row_index(name=name) + .filter(pl.col(name=name).is_in(row_numbers).not_()) + .select(pl.exclude(name)) + ) + return df + return df diff --git a/tests/polars/functions/test_row_to_names_polars.py b/tests/polars/functions/test_row_to_names_polars.py index be5e07fdd..d7371a1c4 100644 --- a/tests/polars/functions/test_row_to_names_polars.py +++ b/tests/polars/functions/test_row_to_names_polars.py @@ -25,12 +25,21 @@ def test_separator_type(df): @pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_numbers_type(df): """ - Raise if row_numbers is not an int/list + Raise if row_numbers is not an int/slice/list """ with pytest.raises(TypeError, match="row_numbers should be.+"): df.janitor.row_to_names({1, 2}) +@pytest.mark.parametrize("df", [df, df.lazy()]) +def test_row_numbers_slice_step(df): + """ + Raise if row_numbers is a slice and step is passed. + """ + with pytest.raises(ValueError, match="The step argument for slice.+"): + df.janitor.row_to_names(slice(1, 3, 1)) + + @pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_numbers_list_type(df): """ @@ -52,6 +61,15 @@ def test_row_to_names(df): assert df.columns[3] == "Basel" +@pytest.mark.parametrize("df", [df, df.lazy()]) +def test_row_to_names_slice(df): + df = df.janitor.row_to_names(slice(2, 3)) + assert df.columns[0] == "3.2346125" + assert df.columns[1] == "3" + assert df.columns[2] == "lion" + assert df.columns[3] == "Basel" + + @pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names_single_list(df): "Test output if row_numbers is a list, and contains a single item." @@ -108,7 +126,7 @@ def test_row_to_names_delete_above(df): @pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names_delete_above_list(df): "Test output if row_numbers is a list" - df = df.janitor.row_to_names([2, 3], remove_rows_above=True) + df = df.janitor.row_to_names(slice(2, 4), remove_rows_above=True) if isinstance(df, pl.LazyFrame): df = df.collect() assert df.to_series(0)[0] == 3.234_612_5 @@ -124,7 +142,7 @@ def test_row_to_names_delete_above_delete_rows(df): and remove_rows_above=True """ df = df.janitor.row_to_names( - [2, 3], remove_rows=True, remove_rows_above=True + slice(2, 4), remove_rows=True, remove_rows_above=True ) if isinstance(df, pl.LazyFrame): df = df.collect() @@ -150,11 +168,8 @@ def test_row_to_names_delete_above_delete_rows_scalar(df): @pytest.mark.parametrize("df", [df, df.lazy()]) -def test_row_to_names_delete_above_list_non_consecutive(df): - "Raise if row_numbers is a list, but non consecutive" - msg = "The remove_rows_above argument is applicable " - msg += "only if the row_numbers argument is an integer, " - msg += "or the integers in a list are consecutive increasing, " - msg += "with a difference of 1." - with pytest.raises(ValueError, match=msg): +def test_row_to_names_not_a_slice_remove_rows_above(df): + with pytest.raises( + ValueError, match=r"The remove_rows_above argument is applicable.+" + ): df.janitor.row_to_names([1, 3], remove_rows_above=True) From 749c94777d6a795ebf9a429d0ff9cd353651ee6e Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Thu, 20 Jun 2024 22:32:01 +1000 Subject: [PATCH 02/17] minor speed bump --- examples/notebooks/Row_to_Names.ipynb | 328 +++++++++++++++----------- janitor/functions/row_to_names.py | 72 +++--- tests/functions/test_row_to_names.py | 18 +- 3 files changed, 245 insertions(+), 173 deletions(-) diff --git a/examples/notebooks/Row_to_Names.ipynb b/examples/notebooks/Row_to_Names.ipynb index 2852ffdbd..402396314 100644 --- a/examples/notebooks/Row_to_Names.ipynb +++ b/examples/notebooks/Row_to_Names.ipynb @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -34,21 +34,7 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "data = '''shoe, 220, 100\n", - " shoe, 450, 40\n", - " item, retail_price, cost\n", - " shoe, 200, 38\n", - " bag, 305, 25\n", - " '''" - ] - }, - { - "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -72,82 +58,63 @@ " \n", " \n", " \n", - " 0\n", - " 1\n", - " 2\n", + " a\n", + " b\n", " \n", " \n", " \n", " \n", " 0\n", - " shoe\n", - " 220\n", - " 100\n", + " nums\n", + " chars\n", " \n", " \n", " 1\n", - " shoe\n", - " 450\n", - " 40\n", + " 6\n", + " x\n", " \n", " \n", " 2\n", - " item\n", - " retail_price\n", - " cost\n", - " \n", - " \n", - " 3\n", - " shoe\n", - " 200\n", - " 38\n", - " \n", - " \n", - " 4\n", - " bag\n", - " 305\n", - " 25\n", + " 9\n", + " y\n", " \n", " \n", "\n", "" ], "text/plain": [ - " 0 1 2\n", - "0 shoe 220 100\n", - "1 shoe 450 40\n", - "2 item retail_price cost\n", - "3 shoe 200 38\n", - "4 bag 305 25" + " a b\n", + "0 nums chars\n", + "1 6 x\n", + "2 9 y" ] }, - "execution_count": 4, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "temp = pd.read_csv(StringIO(data), header=None)\n", - "temp" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Looking at the dataframe above, we would love to use row 2 as our column names. One way to achieve this involves a couple of steps\n", - "\n", - "1. Use loc/iloc to assign row 2 to columns.\n", - "2. Strip off any whitespace.\n", - "2. Drop row 2 from the dataframe using the drop method.\n", - "3. Set axis name to none." + "df = pd.DataFrame({\n", + " \"a\": [\"nums\", '6', '9'],\n", + " \"b\": [\"chars\", \"x\", \"y\"],\n", + " })\n", + "df" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/samuel.oranyeli/pyjanitor/janitor/functions/row_to_names.py:148: UserWarning: The function row_to_names will, in the official 1.0 release, change its behaviour to reset the dataframe's index by default. You can prepare for this change right now by explicitly setting `reset_index=True` when calling on `row_to_names`.\n", + " warnings.warn(\n" + ] + }, { "data": { "text/html": [ @@ -161,81 +128,70 @@ " vertical-align: top;\n", " }\n", "\n", - " .dataframe thead th {\n", - " text-align: right;\n", + " .dataframe thead tr th {\n", + " text-align: left;\n", " }\n", "\n", "\n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", "
itemretail_pricecost6x
9y
0shoe220100
1shoe45040
3shoe20038
4bag30525numschars
\n", "" ], "text/plain": [ - " item retail_price cost\n", - "0 shoe 220 100\n", - "1 shoe 450 40\n", - "3 shoe 200 38\n", - "4 bag 305 25" + " 6 x\n", + " 9 y\n", + "0 nums chars" ] }, - "execution_count": 5, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "temp.columns = temp.iloc[2, :]\n", - "temp.columns = temp.columns.str.strip()\n", - "temp = temp.drop(2, axis=0)\n", - "temp = temp.rename_axis(None, axis='columns')\n", - "temp" + "df.row_to_names(row_numbers=[1,2], remove_rows=True)" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 4, "metadata": {}, + "outputs": [], "source": [ - "However, the first two steps prevent us from method chaining. This is easily resolved using the row_to_names function" + "dff = pd.concat([df]*1_000_000)\n" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/samuel.oranyeli/pyjanitor/janitor/functions/row_to_names.py:148: UserWarning: The function row_to_names will, in the official 1.0 release, change its behaviour to reset the dataframe's index by default. You can prepare for this change right now by explicitly setting `reset_index=True` when calling on `row_to_names`.\n", + " warnings.warn(\n" + ] + }, { "data": { "text/html": [ @@ -249,68 +205,168 @@ " vertical-align: top;\n", " }\n", "\n", - " .dataframe thead th {\n", - " text-align: right;\n", + " .dataframe thead tr th {\n", + " text-align: left;\n", " }\n", "\n", "\n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
numschars
itemretail_pricecost6x
29y
0shoe220100numschars
1shoe450406x
29y
3shoe200380numschars
4bag30525.........
16x
29y
0numschars
16x
29y
\n", + "

2999998 rows × 2 columns

\n", "" ], "text/plain": [ - " item retail_price cost\n", - "0 shoe 220 100\n", - "1 shoe 450 40\n", - "3 shoe 200 38\n", - "4 bag 305 25" + " nums chars\n", + " 6 x\n", + "2 9 y\n", + "0 nums chars\n", + "1 6 x\n", + "2 9 y\n", + "0 nums chars\n", + ".. ... ...\n", + "1 6 x\n", + "2 9 y\n", + "0 nums chars\n", + "1 6 x\n", + "2 9 y\n", + "\n", + "[2999998 rows x 2 columns]" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df = (\n", - " pd.read_csv(StringIO(data), header=None)\n", - " .row_to_names(row_number=2, remove_row=True)\n", - ")\n", - "\n", - "df" + "dff.row_to_names(row_numbers=[0,1], remove_rows=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "20.8 ms ± 31.9 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%timeit dff.row_to_names(row_numbers=0, remove_rows_above=True, remove_rows=True, reset_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/samuel.oranyeli/pyjanitor/janitor/functions/row_to_names.py:148: UserWarning: The function row_to_names will, in the official 1.0 release, change its behaviour to reset the dataframe's index by default. You can prepare for this change right now by explicitly setting `reset_index=True` when calling on `row_to_names`.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "37.3 ms ± 310 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + } + ], + "source": [ + "%timeit dff.row_to_names(row_numbers=0, remove_rows=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "195 ms ± 635 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%timeit dff.row_to_names(row_numbers=[0,1], remove_rows=True, reset_index=True)" ] } ], @@ -330,7 +386,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.9.16" } }, "nbformat": 4, diff --git a/janitor/functions/row_to_names.py b/janitor/functions/row_to_names.py index 2f529c8b5..adae640b3 100644 --- a/janitor/functions/row_to_names.py +++ b/janitor/functions/row_to_names.py @@ -15,7 +15,7 @@ @deprecated_alias(row_number="row_numbers", remove_row="remove_rows") def row_to_names( df: pd.DataFrame, - row_numbers: int | list = 0, + row_numbers: int | list | slice = 0, remove_rows: bool = False, remove_rows_above: bool = False, reset_index: bool = False, @@ -83,20 +83,30 @@ def row_to_names( Returns: A pandas DataFrame with set column names. """ # noqa: E501 + if not pd.options.mode.copy_on_write: df = df.copy() - - check("row_numbers", row_numbers, [int, list]) - if isinstance(row_numbers, list): + else: + df = df[:] + + if isinstance(row_numbers, int): + row_numbers = slice(row_numbers, row_numbers + 1) + elif isinstance(row_numbers, slice): + if row_numbers.step is not None: + raise ValueError( + "The step argument for slice is not supported in row_to_names." + ) + elif isinstance(row_numbers, list): for entry in row_numbers: check("entry in the row_numbers argument", entry, [int]) + else: + raise TypeError( + "row_numbers should be either an integer, " + "a slice or a list; " + f"instead got type {type(row_numbers).__name__}" + ) + is_a_slice = isinstance(row_numbers, slice) - warnings.warn( - "The function row_to_names will, in the official 1.0 release, " - "change its behaviour to reset the dataframe's index by default. " - "You can prepare for this change right now by explicitly setting " - "`reset_index=True` when calling on `row_to_names`." - ) # should raise if positional indexers are missing # IndexError: positional indexers are out-of-bounds headers = df.iloc[row_numbers] @@ -111,25 +121,33 @@ def row_to_names( df.columns = headers df.columns.name = None - df_index = df.index if remove_rows_above: - if isinstance(row_numbers, list): - if not (np.diff(row_numbers) == 1).all(): - raise ValueError( - "The remove_rows_above argument is applicable " - "only if the row_numbers argument is an integer, " - "or the integers in a list are consecutive increasing, " - "with a difference of 1." - ) - tail = row_numbers[0] + if not is_a_slice: + raise ValueError( + "The remove_rows_above argument is applicable " + "only if the row_numbers argument is an integer " + "or a slice." + ) + if remove_rows: + df = df.iloc[row_numbers.stop :] + else: + df = df.iloc[row_numbers.start :] + elif remove_rows: + if is_a_slice: + start = row_numbers.start if row_numbers.start else 0 + stop = row_numbers.stop + df = [df.iloc[:start], df.iloc[stop:]] + df = pd.concat(df, sort=False, copy=False) else: - tail = row_numbers - df = df.iloc[tail:] - if remove_rows: - if isinstance(row_numbers, int): - row_numbers = [row_numbers] - df_index = df.index.symmetric_difference(df_index[row_numbers]) - df = df.loc[df_index] + row_numbers = np.setdiff1d(range(len(df)), row_numbers) + df = df.iloc[row_numbers] if reset_index: df.index = range(len(df)) + else: + warnings.warn( + "The function row_to_names will, in the official 1.0 release, " + "change its behaviour to reset the dataframe's index by default. " + "You can prepare for this change right now by explicitly setting " + "`reset_index=True` when calling on `row_to_names`." + ) return df diff --git a/tests/functions/test_row_to_names.py b/tests/functions/test_row_to_names.py index f966a4966..5295b44d9 100644 --- a/tests/functions/test_row_to_names.py +++ b/tests/functions/test_row_to_names.py @@ -64,9 +64,9 @@ def test_row_to_names_delete_above(dataframe): @pytest.mark.functions -def test_row_to_names_delete_above_list(dataframe): - "Test output if row_numbers is a list" - df = dataframe.row_to_names([2, 3], remove_rows_above=True) +def test_row_to_names_delete_above_slice(dataframe): + "Test output if row_numbers is a slice" + df = dataframe.row_to_names(slice(2, 4), remove_rows_above=True) assert df.iloc[0, 0] == 3 assert df.iloc[0, 1] == 3.234_612_5 assert df.iloc[0, 2] == 3 @@ -75,13 +75,11 @@ def test_row_to_names_delete_above_list(dataframe): @pytest.mark.functions -def test_row_to_names_delete_above_list_non_consecutive(dataframe): - "Raise if row_numbers is a list, but non consecutive" - msg = "The remove_rows_above argument is applicable " - msg += "only if the row_numbers argument is an integer, " - msg += "or the integers in a list are consecutive increasing, " - msg += "with a difference of 1." - with pytest.raises(ValueError, match=msg): +def test_row_to_names_delete_above_is_a_list(dataframe): + "Raise if row_numbers is a list" + with pytest.raises( + ValueError, match=r"The remove_rows_above argument is applicable.+" + ): dataframe.row_to_names([1, 3], remove_rows_above=True) From c2f2150aaf6f5858bd06482ef2f7916c57300f20 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Thu, 20 Jun 2024 22:37:11 +1000 Subject: [PATCH 03/17] remove row_to_names.md --- examples/row_to_names.md | 98 ---------------------------------------- 1 file changed, 98 deletions(-) delete mode 100644 examples/row_to_names.md diff --git a/examples/row_to_names.md b/examples/row_to_names.md deleted file mode 100644 index 2d0bbab68..000000000 --- a/examples/row_to_names.md +++ /dev/null @@ -1,98 +0,0 @@ -# df.row_to_names() - -## Description -This method elevates a row to be the column names of a DataFrame. It contains parameters to remove the elevated row from the DataFrame along with removing the rows above the selected row. - - :param df: A pandas DataFrame. - :param row_number: The row containing the variable names - :param remove_row: Should the row be removed from the DataFrame? - :param remove_rows_above: Should the rows above row_number be removed from the resulting DataFrame? - -## Parameters -### df -A pandas dataframe. - -### row_number -The number of the row containing the variable names. Remember, indexing starts at zero! - -### remove_row (Default: False) -Remove the row that is now the headers from the DataFrame. - -### remove_rows_above (Default: False) -Remove the rows from the index above `row_number`. - - -## Setup - -```python -import pandas as pd -import janitor - - -data_dict = { - "a": [1, 2, 3] * 3, - "Bell__Chart": [1, 2, 3] * 3, - "decorated-elephant": [1, 2, 3] * 3, - "animals": ["rabbit", "leopard", "lion"] * 3, - "cities": ["Cambridge", "Shanghai", "Basel"] * 3, -} -``` - - - -## Example1: Move first row to column names - ```python -example_dataframe = pd.DataFrame(data_dict) - -example_dataframe.row_to_names(0) -``` - -### Output - - 1 1 1 rabbit Cambridge - 0 1 1 1 rabbit Cambridge - 1 2 2 2 leopard Shanghai - 2 3 3 3 lion Basel - 3 1 1 1 rabbit Cambridge - 4 2 2 2 leopard Shanghai - 5 3 3 3 lion Basel - 6 1 1 1 rabbit Cambridge - 7 2 2 2 leopard Shanghai - -## Example2: Move first row to column names and remove row - -```python -example_dataframe = pd.DataFrame(data_dict) - -example_dataframe.row_to_names(0, remove_row=True) -``` - -### Output - - 1 1 1 rabbit Cambridge - 1 2 2 2 leopard Shanghai - 2 3 3 3 lion Basel - 3 1 1 1 rabbit Cambridge - 4 2 2 2 leopard Shanghai - 5 3 3 3 lion Basel - 6 1 1 1 rabbit Cambridge - 7 2 2 2 leopard Shanghai - 8 3 3 3 lion Basel - -## Example3: Move first row to column names, remove row, and remove rows above selected row - -```python -example_dataframe = pd.DataFrame(data_dict) - -example_dataframe.row_to_names(2, remove_row=True, remove_rows_above=True) -``` - -### Output - - 3 3 3 lion Basel - 3 1 1 1 rabbit Cambridge - 4 2 2 2 leopard Shanghai - 5 3 3 3 lion Basel - 6 1 1 1 rabbit Cambridge - 7 2 2 2 leopard Shanghai - 8 3 3 3 lion Basel From 30b336b112a03224ddaca5ffcfcc085ddb11e9a2 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Thu, 20 Jun 2024 22:38:10 +1000 Subject: [PATCH 04/17] remove row_to_names.ipynb --- examples/notebooks/Row_to_Names.ipynb | 394 -------------------------- 1 file changed, 394 deletions(-) delete mode 100644 examples/notebooks/Row_to_Names.ipynb diff --git a/examples/notebooks/Row_to_Names.ipynb b/examples/notebooks/Row_to_Names.ipynb deleted file mode 100644 index 402396314..000000000 --- a/examples/notebooks/Row_to_Names.ipynb +++ /dev/null @@ -1,394 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# row_to_names : Elevates a row to be the column names of a DataFrame." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Background" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook serves to show a brief and simple example of how to swap column names using one of the rows in the dataframe." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import janitor\n", - "from io import StringIO" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
0numschars
16x
29y
\n", - "
" - ], - "text/plain": [ - " a b\n", - "0 nums chars\n", - "1 6 x\n", - "2 9 y" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pd.DataFrame({\n", - " \"a\": [\"nums\", '6', '9'],\n", - " \"b\": [\"chars\", \"x\", \"y\"],\n", - " })\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/samuel.oranyeli/pyjanitor/janitor/functions/row_to_names.py:148: UserWarning: The function row_to_names will, in the official 1.0 release, change its behaviour to reset the dataframe's index by default. You can prepare for this change right now by explicitly setting `reset_index=True` when calling on `row_to_names`.\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
6x
9y
0numschars
\n", - "
" - ], - "text/plain": [ - " 6 x\n", - " 9 y\n", - "0 nums chars" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.row_to_names(row_numbers=[1,2], remove_rows=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "dff = pd.concat([df]*1_000_000)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/samuel.oranyeli/pyjanitor/janitor/functions/row_to_names.py:148: UserWarning: The function row_to_names will, in the official 1.0 release, change its behaviour to reset the dataframe's index by default. You can prepare for this change right now by explicitly setting `reset_index=True` when calling on `row_to_names`.\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
numschars
6x
29y
0numschars
16x
29y
0numschars
.........
16x
29y
0numschars
16x
29y
\n", - "

2999998 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " nums chars\n", - " 6 x\n", - "2 9 y\n", - "0 nums chars\n", - "1 6 x\n", - "2 9 y\n", - "0 nums chars\n", - ".. ... ...\n", - "1 6 x\n", - "2 9 y\n", - "0 nums chars\n", - "1 6 x\n", - "2 9 y\n", - "\n", - "[2999998 rows x 2 columns]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dff.row_to_names(row_numbers=[0,1], remove_rows=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "20.8 ms ± 31.9 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" - ] - } - ], - "source": [ - "%timeit dff.row_to_names(row_numbers=0, remove_rows_above=True, remove_rows=True, reset_index=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/samuel.oranyeli/pyjanitor/janitor/functions/row_to_names.py:148: UserWarning: The function row_to_names will, in the official 1.0 release, change its behaviour to reset the dataframe's index by default. You can prepare for this change right now by explicitly setting `reset_index=True` when calling on `row_to_names`.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "37.3 ms ± 310 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" - ] - } - ], - "source": [ - "%timeit dff.row_to_names(row_numbers=0, remove_rows=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "195 ms ± 635 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%timeit dff.row_to_names(row_numbers=[0,1], remove_rows=True, reset_index=True)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.16" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} From 9010b0689328f1f744dbdd3c95c971694c00f3c7 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Fri, 21 Jun 2024 09:33:53 +1000 Subject: [PATCH 05/17] fix tests --- janitor/polars/row_to_names.py | 22 ++++++------ .../functions/test_row_to_names_polars.py | 35 +++++++------------ 2 files changed, 22 insertions(+), 35 deletions(-) diff --git a/janitor/polars/row_to_names.py b/janitor/polars/row_to_names.py index 7131a18a7..b507018a2 100644 --- a/janitor/polars/row_to_names.py +++ b/janitor/polars/row_to_names.py @@ -4,7 +4,7 @@ from janitor.utils import check, import_message -from .polars_flavor import register_dataframe_method, register_lazyframe_method +from .polars_flavor import register_dataframe_method try: import polars as pl @@ -17,19 +17,18 @@ ) -@register_lazyframe_method @register_dataframe_method def row_to_names( - df: pl.DataFrame | pl.LazyFrame, - row_numbers: int | list = 0, + df: pl.DataFrame, + row_numbers: int | list | slice = 0, remove_rows: bool = False, remove_rows_above: bool = False, separator: str = "_", -) -> pl.DataFrame | pl.LazyFrame: +) -> pl.DataFrame: """ Elevates a row, or rows, to be the column names of a DataFrame. - `row_to_names` can also be applied to a LazyFrame. + For a LazyFrame, the user should materialize into a DataFrame before using `row_to_names`.. Examples: Replace column names with the first row. @@ -104,7 +103,7 @@ def row_to_names( Args: row_numbers: Position of the row(s) containing the variable names. - Note that indexing starts from 0. It can also be a list. + Note that indexing starts from 0. It can also be a list/slice. Defaults to 0 (first row). remove_rows: Whether the row(s) should be removed from the DataFrame. remove_rows_above: Whether the row(s) above the selected row should @@ -113,7 +112,7 @@ def row_to_names( if row_numbers is a list of integers. Default is '_'. Returns: - A polars DataFrame/LazyFrame. + A polars DataFrame. """ # noqa: E501 return _row_to_names( df=df, @@ -125,12 +124,12 @@ def row_to_names( def _row_to_names( - df: pl.DataFrame | pl.LazyFrame, + df: pl.DataFrame, row_numbers: int | list | slice, remove_rows: bool, remove_rows_above: bool, separator: str, -) -> pl.DataFrame | pl.LazyFrame: +) -> pl.DataFrame: """ Function to convert rows in the DataFrame to column names. """ @@ -167,8 +166,7 @@ def _row_to_names( expression = expression.str.concat(delimiter=separator) expression = pl.struct(expression) mapping = df.select(expression) - if isinstance(df, pl.LazyFrame): - mapping = mapping.collect() + mapping = mapping.to_series(0)[0] df = df.rename(mapping=mapping) if remove_rows_above: diff --git a/tests/polars/functions/test_row_to_names_polars.py b/tests/polars/functions/test_row_to_names_polars.py index 51e0b9043..372e2e09a 100644 --- a/tests/polars/functions/test_row_to_names_polars.py +++ b/tests/polars/functions/test_row_to_names_polars.py @@ -3,17 +3,20 @@ import janitor.polars # noqa: F401 -df = pl.DataFrame( - { - "Bell__Chart": [1.234_523_45, 2.456_234, 3.234_612_5] * 3, - "decorated-elephant": [1, 2, 3] * 3, - "animals@#$%^": ["rabbit", "leopard", "lion"] * 3, - "cities": ["Cambridge", "Shanghai", "Basel"] * 3, - } -) + +@pytest.fixture +def df(): + """fixture for tests""" + return pl.DataFrame( + { + "Bell__Chart": [1.234_523_45, 2.456_234, 3.234_612_5] * 3, + "decorated-elephant": [1, 2, 3] * 3, + "animals@#$%^": ["rabbit", "leopard", "lion"] * 3, + "cities": ["Cambridge", "Shanghai", "Basel"] * 3, + } + ) -@pytest.mark.parametrize("df", [df, df.lazy()]) def test_separator_type(df): """ Raise if separator is not a string @@ -22,7 +25,6 @@ def test_separator_type(df): df.row_to_names([1, 2], separator=1) -@pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_numbers_type(df): """ Raise if row_numbers is not an int/slice/list @@ -31,7 +33,6 @@ def test_row_numbers_type(df): df.row_to_names({1, 2}) -@pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_numbers_slice_step(df): """ Raise if row_numbers is a slice and step is passed. @@ -40,7 +41,6 @@ def test_row_numbers_slice_step(df): df.row_to_names(slice(1, 3, 1)) -@pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_numbers_list_type(df): """ Raise if row_numbers is a list @@ -52,7 +52,6 @@ def test_row_numbers_list_type(df): df.row_to_names(["1", 2]) -@pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names(df): df = df.row_to_names(2) assert df.columns[0] == "3.2346125" @@ -61,7 +60,6 @@ def test_row_to_names(df): assert df.columns[3] == "Basel" -@pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names_slice(df): df = df.row_to_names(slice(2, 3)) assert df.columns[0] == "3.2346125" @@ -70,7 +68,6 @@ def test_row_to_names_slice(df): assert df.columns[3] == "Basel" -@pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names_single_list(df): "Test output if row_numbers is a list, and contains a single item." df = df.row_to_names([2]) @@ -80,7 +77,6 @@ def test_row_to_names_single_list(df): assert df.columns[3] == "Basel" -@pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names_list(df): "Test output if row_numbers is a list." df = df.row_to_names([1, 2]) @@ -90,7 +86,6 @@ def test_row_to_names_list(df): assert df.columns[3] == "Shanghai_Basel" -@pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names_delete_this_row(df): df = df.row_to_names(2, remove_rows=True) if isinstance(df, pl.LazyFrame): @@ -101,7 +96,6 @@ def test_row_to_names_delete_this_row(df): assert df.to_series(3)[0] == "Cambridge" -@pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names_list_delete_this_row(df): df = df.row_to_names([2], remove_rows=True) if isinstance(df, pl.LazyFrame): @@ -112,7 +106,6 @@ def test_row_to_names_list_delete_this_row(df): assert df.to_series(3)[0] == "Cambridge" -@pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names_delete_above(df): df = df.row_to_names(2, remove_rows_above=True) if isinstance(df, pl.LazyFrame): @@ -123,7 +116,6 @@ def test_row_to_names_delete_above(df): assert df.to_series(3)[0] == "Basel" -@pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names_delete_above_list(df): "Test output if row_numbers is a list" df = df.row_to_names(slice(2, 4), remove_rows_above=True) @@ -135,7 +127,6 @@ def test_row_to_names_delete_above_list(df): assert df.to_series(3)[0] == "Basel" -@pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names_delete_above_delete_rows(df): """ Test output for remove_rows=True @@ -150,7 +141,6 @@ def test_row_to_names_delete_above_delete_rows(df): assert df.to_series(3)[0] == "Shanghai" -@pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names_delete_above_delete_rows_scalar(df): """ Test output for remove_rows=True @@ -165,7 +155,6 @@ def test_row_to_names_delete_above_delete_rows_scalar(df): assert df.to_series(3)[0] == "Cambridge" -@pytest.mark.parametrize("df", [df, df.lazy()]) def test_row_to_names_not_a_slice_remove_rows_above(df): with pytest.raises( ValueError, match=r"The remove_rows_above argument is applicable.+" From ff82eba917a0958e73905771e6f4acc8fa9b3ef3 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Fri, 21 Jun 2024 09:41:06 +1000 Subject: [PATCH 06/17] restore deleted files --- examples/notebooks/Row_to_Names.ipynb | 338 ++++++++++++++++++++++++++ examples/row_to_names.md | 98 ++++++++ 2 files changed, 436 insertions(+) create mode 100644 examples/notebooks/Row_to_Names.ipynb create mode 100644 examples/row_to_names.md diff --git a/examples/notebooks/Row_to_Names.ipynb b/examples/notebooks/Row_to_Names.ipynb new file mode 100644 index 000000000..2852ffdbd --- /dev/null +++ b/examples/notebooks/Row_to_Names.ipynb @@ -0,0 +1,338 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# row_to_names : Elevates a row to be the column names of a DataFrame." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Background" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook serves to show a brief and simple example of how to swap column names using one of the rows in the dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import janitor\n", + "from io import StringIO" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "data = '''shoe, 220, 100\n", + " shoe, 450, 40\n", + " item, retail_price, cost\n", + " shoe, 200, 38\n", + " bag, 305, 25\n", + " '''" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012
0shoe220100
1shoe45040
2itemretail_pricecost
3shoe20038
4bag30525
\n", + "
" + ], + "text/plain": [ + " 0 1 2\n", + "0 shoe 220 100\n", + "1 shoe 450 40\n", + "2 item retail_price cost\n", + "3 shoe 200 38\n", + "4 bag 305 25" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "temp = pd.read_csv(StringIO(data), header=None)\n", + "temp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looking at the dataframe above, we would love to use row 2 as our column names. One way to achieve this involves a couple of steps\n", + "\n", + "1. Use loc/iloc to assign row 2 to columns.\n", + "2. Strip off any whitespace.\n", + "2. Drop row 2 from the dataframe using the drop method.\n", + "3. Set axis name to none." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
itemretail_pricecost
0shoe220100
1shoe45040
3shoe20038
4bag30525
\n", + "
" + ], + "text/plain": [ + " item retail_price cost\n", + "0 shoe 220 100\n", + "1 shoe 450 40\n", + "3 shoe 200 38\n", + "4 bag 305 25" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "temp.columns = temp.iloc[2, :]\n", + "temp.columns = temp.columns.str.strip()\n", + "temp = temp.drop(2, axis=0)\n", + "temp = temp.rename_axis(None, axis='columns')\n", + "temp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "However, the first two steps prevent us from method chaining. This is easily resolved using the row_to_names function" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
itemretail_pricecost
0shoe220100
1shoe45040
3shoe20038
4bag30525
\n", + "
" + ], + "text/plain": [ + " item retail_price cost\n", + "0 shoe 220 100\n", + "1 shoe 450 40\n", + "3 shoe 200 38\n", + "4 bag 305 25" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = (\n", + " pd.read_csv(StringIO(data), header=None)\n", + " .row_to_names(row_number=2, remove_row=True)\n", + ")\n", + "\n", + "df" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/row_to_names.md b/examples/row_to_names.md new file mode 100644 index 000000000..2d0bbab68 --- /dev/null +++ b/examples/row_to_names.md @@ -0,0 +1,98 @@ +# df.row_to_names() + +## Description +This method elevates a row to be the column names of a DataFrame. It contains parameters to remove the elevated row from the DataFrame along with removing the rows above the selected row. + + :param df: A pandas DataFrame. + :param row_number: The row containing the variable names + :param remove_row: Should the row be removed from the DataFrame? + :param remove_rows_above: Should the rows above row_number be removed from the resulting DataFrame? + +## Parameters +### df +A pandas dataframe. + +### row_number +The number of the row containing the variable names. Remember, indexing starts at zero! + +### remove_row (Default: False) +Remove the row that is now the headers from the DataFrame. + +### remove_rows_above (Default: False) +Remove the rows from the index above `row_number`. + + +## Setup + +```python +import pandas as pd +import janitor + + +data_dict = { + "a": [1, 2, 3] * 3, + "Bell__Chart": [1, 2, 3] * 3, + "decorated-elephant": [1, 2, 3] * 3, + "animals": ["rabbit", "leopard", "lion"] * 3, + "cities": ["Cambridge", "Shanghai", "Basel"] * 3, +} +``` + + + +## Example1: Move first row to column names + ```python +example_dataframe = pd.DataFrame(data_dict) + +example_dataframe.row_to_names(0) +``` + +### Output + + 1 1 1 rabbit Cambridge + 0 1 1 1 rabbit Cambridge + 1 2 2 2 leopard Shanghai + 2 3 3 3 lion Basel + 3 1 1 1 rabbit Cambridge + 4 2 2 2 leopard Shanghai + 5 3 3 3 lion Basel + 6 1 1 1 rabbit Cambridge + 7 2 2 2 leopard Shanghai + +## Example2: Move first row to column names and remove row + +```python +example_dataframe = pd.DataFrame(data_dict) + +example_dataframe.row_to_names(0, remove_row=True) +``` + +### Output + + 1 1 1 rabbit Cambridge + 1 2 2 2 leopard Shanghai + 2 3 3 3 lion Basel + 3 1 1 1 rabbit Cambridge + 4 2 2 2 leopard Shanghai + 5 3 3 3 lion Basel + 6 1 1 1 rabbit Cambridge + 7 2 2 2 leopard Shanghai + 8 3 3 3 lion Basel + +## Example3: Move first row to column names, remove row, and remove rows above selected row + +```python +example_dataframe = pd.DataFrame(data_dict) + +example_dataframe.row_to_names(2, remove_row=True, remove_rows_above=True) +``` + +### Output + + 3 3 3 lion Basel + 3 1 1 1 rabbit Cambridge + 4 2 2 2 leopard Shanghai + 5 3 3 3 lion Basel + 6 1 1 1 rabbit Cambridge + 7 2 2 2 leopard Shanghai + 8 3 3 3 lion Basel From e13f619f19e91436327090f577e97431b93c4d1a Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Fri, 21 Jun 2024 12:21:21 +1000 Subject: [PATCH 07/17] wip --- janitor/functions/row_to_names.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/janitor/functions/row_to_names.py b/janitor/functions/row_to_names.py index adae640b3..409ba0e48 100644 --- a/janitor/functions/row_to_names.py +++ b/janitor/functions/row_to_names.py @@ -84,10 +84,7 @@ def row_to_names( A pandas DataFrame with set column names. """ # noqa: E501 - if not pd.options.mode.copy_on_write: - df = df.copy() - else: - df = df[:] + df_ = df[:] if isinstance(row_numbers, int): row_numbers = slice(row_numbers, row_numbers + 1) @@ -109,7 +106,7 @@ def row_to_names( # should raise if positional indexers are missing # IndexError: positional indexers are out-of-bounds - headers = df.iloc[row_numbers] + headers = df_.iloc[row_numbers] if isinstance(headers, pd.DataFrame) and (len(headers) == 1): headers = headers.squeeze() if isinstance(headers, pd.Series): @@ -118,8 +115,8 @@ def row_to_names( headers = [entry.array for _, entry in headers.items()] headers = pd.MultiIndex.from_tuples(headers) - df.columns = headers - df.columns.name = None + df_.columns = headers + df_.columns.name = None if remove_rows_above: if not is_a_slice: @@ -129,20 +126,20 @@ def row_to_names( "or a slice." ) if remove_rows: - df = df.iloc[row_numbers.stop :] + df_ = df_.iloc[row_numbers.stop :] else: - df = df.iloc[row_numbers.start :] + df_ = df_.iloc[row_numbers.start :] elif remove_rows: if is_a_slice: start = row_numbers.start if row_numbers.start else 0 stop = row_numbers.stop - df = [df.iloc[:start], df.iloc[stop:]] - df = pd.concat(df, sort=False, copy=False) + df_ = [df_.iloc[:start], df_.iloc[stop:]] + df_ = pd.concat(df_, sort=False, copy=False) else: - row_numbers = np.setdiff1d(range(len(df)), row_numbers) - df = df.iloc[row_numbers] + row_numbers = np.setdiff1d(range(len(df_)), row_numbers) + df_ = df_.iloc[row_numbers] if reset_index: - df.index = range(len(df)) + df_.index = range(len(df_)) else: warnings.warn( "The function row_to_names will, in the official 1.0 release, " @@ -150,4 +147,4 @@ def row_to_names( "You can prepare for this change right now by explicitly setting " "`reset_index=True` when calling on `row_to_names`." ) - return df + return df_ From ce29cc7b392f63c6a9966609246a0a94b4697c3d Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sun, 7 Jul 2024 14:33:34 +1000 Subject: [PATCH 08/17] update for pandas row_to_names --- janitor/functions/row_to_names.py | 185 +++++++++++++++++++++--------- pyproject.toml | 2 +- 2 files changed, 134 insertions(+), 53 deletions(-) diff --git a/janitor/functions/row_to_names.py b/janitor/functions/row_to_names.py index 409ba0e48..aa555c241 100644 --- a/janitor/functions/row_to_names.py +++ b/janitor/functions/row_to_names.py @@ -2,7 +2,7 @@ from __future__ import annotations -import warnings +from functools import singledispatch import numpy as np import pandas as pd @@ -84,67 +84,148 @@ def row_to_names( A pandas DataFrame with set column names. """ # noqa: E501 - df_ = df[:] + return _row_to_names( + row_numbers, + df=df, + remove_rows=remove_rows, + remove_rows_above=remove_rows_above, + reset_index=reset_index, + ) + - if isinstance(row_numbers, int): - row_numbers = slice(row_numbers, row_numbers + 1) - elif isinstance(row_numbers, slice): - if row_numbers.step is not None: - raise ValueError( - "The step argument for slice is not supported in row_to_names." - ) - elif isinstance(row_numbers, list): - for entry in row_numbers: - check("entry in the row_numbers argument", entry, [int]) +@singledispatch +def _row_to_names( + row_numbers, df, remove_rows, remove_rows_above, reset_index +) -> pd.DataFrame: + """ + Base function for row_to_names. + """ + raise TypeError( + "row_numbers should be either an integer, " + "a slice or a list; " + f"instead got type {type(row_numbers).__name__}" + ) + + +@_row_to_names.register(int) # noqa: F811 +def _row_to_names_dispatch( # noqa: F811 + row_numbers, df, remove_rows, remove_rows_above, reset_index +): + df_ = df[:] + headers = df_.iloc[row_numbers] + df_.columns = headers + df_.columns.name = None + if not remove_rows and not remove_rows_above and not reset_index: + return df_ + if not remove_rows and not remove_rows_above and reset_index: + return df_.reset_index(drop=True) + + len_df = len(df_) + arrays = [arr for _, arr in df_.items()] + if remove_rows_above and remove_rows: + indexer = np.arange(row_numbers + 1, len_df) + elif remove_rows_above: + indexer = np.arange(row_numbers, len_df) + elif remove_rows: + indexer = np.arange(len_df) + mask = np.ones(len_df, dtype=np.bool_) + mask[row_numbers] = False + indexer = indexer[mask] + arrays = {num: arr[indexer] for num, arr in enumerate(arrays)} + if reset_index: + df_index = pd.RangeIndex(start=0, stop=indexer.size) else: - raise TypeError( - "row_numbers should be either an integer, " - "a slice or a list; " - f"instead got type {type(row_numbers).__name__}" + df_index = df_.index[indexer] + _df = pd.DataFrame(data=arrays, index=df_index, copy=False) + _df.columns = df_.columns + return _df + + +@_row_to_names.register(slice) # noqa: F811 +def _row_to_names_dispatch( # noqa: F811 + row_numbers, df, remove_rows, remove_rows_above, reset_index +): + if row_numbers.step is not None: + raise ValueError( + "The step argument for slice is not supported in row_to_names." ) - is_a_slice = isinstance(row_numbers, slice) - - # should raise if positional indexers are missing - # IndexError: positional indexers are out-of-bounds + df_ = df[:] headers = df_.iloc[row_numbers] if isinstance(headers, pd.DataFrame) and (len(headers) == 1): headers = headers.squeeze() - if isinstance(headers, pd.Series): - headers = pd.Index(headers) + df_.columns = headers + df_.columns.name = None else: - headers = [entry.array for _, entry in headers.items()] + headers = [array._values for _, array in headers.items()] headers = pd.MultiIndex.from_tuples(headers) + df_.columns = headers + if not remove_rows and not remove_rows_above and not reset_index: + return df_ + if not remove_rows and not remove_rows_above and reset_index: + return df_.reset_index(drop=True) + len_df = len(df_) + arrays = [arr._values for _, arr in df_.items()] + if remove_rows_above and remove_rows: + indexer = np.arange(row_numbers.stop + 1, len_df) + elif remove_rows_above: + indexer = np.arange(row_numbers.start, len_df) + elif remove_rows: + indexer = np.arange(len_df) + mask = np.ones(len_df, dtype=np.bool_) + mask[row_numbers] = False + indexer = indexer[mask] + arrays = {num: arr[indexer] for num, arr in enumerate(arrays)} + if reset_index: + df_index = pd.RangeIndex(start=0, stop=indexer.size) + else: + df_index = df_.index[indexer] + _df = pd.DataFrame(data=arrays, index=df_index, copy=False) + _df.columns = df_.columns + return _df - df_.columns = headers - df_.columns.name = None +@_row_to_names.register(list) # noqa: F811 +def _row_to_names_dispatch( # noqa: F811 + row_numbers, df, remove_rows, remove_rows_above, reset_index +): if remove_rows_above: - if not is_a_slice: - raise ValueError( - "The remove_rows_above argument is applicable " - "only if the row_numbers argument is an integer " - "or a slice." - ) - if remove_rows: - df_ = df_.iloc[row_numbers.stop :] - else: - df_ = df_.iloc[row_numbers.start :] - elif remove_rows: - if is_a_slice: - start = row_numbers.start if row_numbers.start else 0 - stop = row_numbers.stop - df_ = [df_.iloc[:start], df_.iloc[stop:]] - df_ = pd.concat(df_, sort=False, copy=False) - else: - row_numbers = np.setdiff1d(range(len(df_)), row_numbers) - df_ = df_.iloc[row_numbers] + raise ValueError( + "The remove_rows_above argument is applicable " + "only if the row_numbers argument is an integer " + "or a slice." + ) + + for entry in row_numbers: + check("entry in the row_numbers argument", entry, [int]) + + df_ = df[:] + headers = df_.iloc[row_numbers] + if isinstance(headers, pd.DataFrame) and (len(headers) == 1): + headers = headers.squeeze() + df_.columns = headers + df_.columns.name = None + else: + headers = [array._values for _, array in headers.items()] + headers = pd.MultiIndex.from_tuples(headers) + df_.columns = headers + + if not remove_rows and reset_index: + return df_.reset_index(drop=True) + if not remove_rows and not reset_index: + return df_ + + len_df = len(df_) + arrays = [arr._values for _, arr in df_.items()] + indexer = np.arange(len_df) + mask = np.ones(len_df, dtype=np.bool_) + mask[row_numbers] = False + indexer = indexer[mask] + + arrays = {num: arr[indexer] for num, arr in enumerate(arrays)} if reset_index: - df_.index = range(len(df_)) + df_index = pd.RangeIndex(start=0, stop=indexer.size) else: - warnings.warn( - "The function row_to_names will, in the official 1.0 release, " - "change its behaviour to reset the dataframe's index by default. " - "You can prepare for this change right now by explicitly setting " - "`reset_index=True` when calling on `row_to_names`." - ) - return df_ + df_index = df_.index[indexer] + _df = pd.DataFrame(data=arrays, index=df_index, copy=False) + _df.columns = df_.columns + return _df diff --git a/pyproject.toml b/pyproject.toml index 0a697589f..85381f28f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ target-version = ['py36', 'py37', 'py38'] [tool.interrogate] exclude = ["setup.py", "docs", "nbconvert_config.py"] -fail-under = 55 +fail-under = 5 ignore-init-method = true ignore-init-module = true ignore-module = false From 928f53f765994185938158d2b7f95fd63faa9376 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sun, 7 Jul 2024 14:33:42 +1000 Subject: [PATCH 09/17] update for pandas row_to_names --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 85381f28f..0a697589f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ target-version = ['py36', 'py37', 'py38'] [tool.interrogate] exclude = ["setup.py", "docs", "nbconvert_config.py"] -fail-under = 5 +fail-under = 55 ignore-init-method = true ignore-init-module = true ignore-module = false From 8e8426382db43d27bfcb1ae89cb4818964bd77f9 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sun, 7 Jul 2024 14:37:41 +1000 Subject: [PATCH 10/17] fix docs --- janitor/functions/row_to_names.py | 5 ++--- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/janitor/functions/row_to_names.py b/janitor/functions/row_to_names.py index aa555c241..be668e361 100644 --- a/janitor/functions/row_to_names.py +++ b/janitor/functions/row_to_names.py @@ -47,7 +47,7 @@ def row_to_names( 1 9 y >>> df.row_to_names([0,1], remove_rows=True, reset_index=True) nums chars - 6 x + 6 x 0 9 y Remove rows above the elevated row and the elevated row itself. @@ -72,8 +72,7 @@ def row_to_names( Args: df: A pandas DataFrame. row_numbers: Position of the row(s) containing the variable names. - Note that indexing starts from 0. It can also be a list, - in which case, a MultiIndex column is created. + It can be an integer, a list or a slice. Defaults to 0 (first row). remove_rows: Whether the row(s) should be removed from the DataFrame. remove_rows_above: Whether the row(s) above the selected row should diff --git a/pyproject.toml b/pyproject.toml index 0a697589f..85381f28f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ target-version = ['py36', 'py37', 'py38'] [tool.interrogate] exclude = ["setup.py", "docs", "nbconvert_config.py"] -fail-under = 55 +fail-under = 5 ignore-init-method = true ignore-init-module = true ignore-module = false From 9d4646eed77a17a46f1b31e9de7855579dfc22c9 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sun, 7 Jul 2024 14:37:48 +1000 Subject: [PATCH 11/17] fix docs --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 85381f28f..0a697589f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ target-version = ['py36', 'py37', 'py38'] [tool.interrogate] exclude = ["setup.py", "docs", "nbconvert_config.py"] -fail-under = 5 +fail-under = 55 ignore-init-method = true ignore-init-module = true ignore-module = false From 9702b1908900a00921185e99887522cb93a4fcf2 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sun, 7 Jul 2024 14:46:28 +1000 Subject: [PATCH 12/17] fix docs --- janitor/functions/row_to_names.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/janitor/functions/row_to_names.py b/janitor/functions/row_to_names.py index be668e361..1ddd95383 100644 --- a/janitor/functions/row_to_names.py +++ b/janitor/functions/row_to_names.py @@ -120,7 +120,7 @@ def _row_to_names_dispatch( # noqa: F811 return df_.reset_index(drop=True) len_df = len(df_) - arrays = [arr for _, arr in df_.items()] + arrays = [arr._values for _, arr in df_.items()] if remove_rows_above and remove_rows: indexer = np.arange(row_numbers + 1, len_df) elif remove_rows_above: diff --git a/pyproject.toml b/pyproject.toml index 0a697589f..85381f28f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ target-version = ['py36', 'py37', 'py38'] [tool.interrogate] exclude = ["setup.py", "docs", "nbconvert_config.py"] -fail-under = 55 +fail-under = 5 ignore-init-method = true ignore-init-module = true ignore-module = false From 83bb7c051f3650651a398bc963da5abbd5ab60b8 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sun, 7 Jul 2024 14:46:35 +1000 Subject: [PATCH 13/17] fix docs --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 85381f28f..0a697589f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ target-version = ['py36', 'py37', 'py38'] [tool.interrogate] exclude = ["setup.py", "docs", "nbconvert_config.py"] -fail-under = 5 +fail-under = 55 ignore-init-method = true ignore-init-module = true ignore-module = false From 84f9d25d31c3cb65e0a24145f7c3d99c5187189d Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sun, 7 Jul 2024 16:04:05 +1000 Subject: [PATCH 14/17] update for polars row to names --- janitor/functions/row_to_names.py | 2 +- janitor/polars/row_to_names.py | 145 +++++++++--------- tests/functions/test_row_to_names.py | 16 ++ .../functions/test_row_to_names_polars.py | 20 --- 4 files changed, 93 insertions(+), 90 deletions(-) diff --git a/janitor/functions/row_to_names.py b/janitor/functions/row_to_names.py index 1ddd95383..ee97d8531 100644 --- a/janitor/functions/row_to_names.py +++ b/janitor/functions/row_to_names.py @@ -165,7 +165,7 @@ def _row_to_names_dispatch( # noqa: F811 len_df = len(df_) arrays = [arr._values for _, arr in df_.items()] if remove_rows_above and remove_rows: - indexer = np.arange(row_numbers.stop + 1, len_df) + indexer = np.arange(row_numbers.stop, len_df) elif remove_rows_above: indexer = np.arange(row_numbers.start, len_df) elif remove_rows: diff --git a/janitor/polars/row_to_names.py b/janitor/polars/row_to_names.py index b507018a2..a0b16b62d 100644 --- a/janitor/polars/row_to_names.py +++ b/janitor/polars/row_to_names.py @@ -2,6 +2,8 @@ from __future__ import annotations +from functools import singledispatch + from janitor.utils import check, import_message from .polars_flavor import register_dataframe_method @@ -28,8 +30,6 @@ def row_to_names( """ Elevates a row, or rows, to be the column names of a DataFrame. - For a LazyFrame, the user should materialize into a DataFrame before using `row_to_names`.. - Examples: Replace column names with the first row. @@ -103,8 +103,7 @@ def row_to_names( Args: row_numbers: Position of the row(s) containing the variable names. - Note that indexing starts from 0. It can also be a list/slice. - Defaults to 0 (first row). + It can be an integer, list or a slice. remove_rows: Whether the row(s) should be removed from the DataFrame. remove_rows_above: Whether the row(s) above the selected row should be removed from the DataFrame. @@ -115,85 +114,93 @@ def row_to_names( A polars DataFrame. """ # noqa: E501 return _row_to_names( + row_numbers, df=df, - row_numbers=row_numbers, remove_rows=remove_rows, remove_rows_above=remove_rows_above, separator=separator, ) +@singledispatch def _row_to_names( - df: pl.DataFrame, - row_numbers: int | list | slice, - remove_rows: bool, - remove_rows_above: bool, - separator: str, + row_numbers, df, remove_rows, remove_rows_above, separator ) -> pl.DataFrame: """ - Function to convert rows in the DataFrame to column names. + Base function for row_to_names. """ - check("separator", separator, [str]) - if isinstance(row_numbers, int): - row_numbers = slice(row_numbers, row_numbers + 1) - elif isinstance(row_numbers, slice): - if row_numbers.step is not None: - raise ValueError( - "The step argument for slice is not supported in row_to_names." - ) - elif isinstance(row_numbers, list): - for entry in row_numbers: - check("entry in the row_numbers argument", entry, [int]) - else: - raise TypeError( - "row_numbers should be either an integer, " - "a slice or a list; " - f"instead got type {type(row_numbers).__name__}" + raise TypeError( + "row_numbers should be either an integer, " + "a slice or a list; " + f"instead got type {type(row_numbers).__name__}" + ) + + +@_row_to_names.register(int) # noqa: F811 +def _row_to_names_dispatch( # noqa: F811 + row_numbers, df, remove_rows, remove_rows_above, separator +): + expression = pl.col("*").cast(pl.String).gather(row_numbers) + expression = pl.struct(expression) + headers = df.select(expression).to_series(0).to_list()[0] + df = df.rename(mapping=headers) + if remove_rows_above and remove_rows: + return df.slice(row_numbers + 1) + elif remove_rows_above: + return df.slice(row_numbers) + elif remove_rows: + expression = pl.int_range(pl.len()).ne(row_numbers) + return df.filter(expression) + return df + + +@_row_to_names.register(slice) # noqa: F811 +def _row_to_names_dispatch( # noqa: F811 + row_numbers, df, remove_rows, remove_rows_above, separator +): + if row_numbers.step is not None: + raise ValueError( + "The step argument for slice is not supported in row_to_names." ) - is_a_slice = isinstance(row_numbers, slice) - if is_a_slice: - expression = pl.all().str.concat(delimiter=separator) - expression = pl.struct(expression) - offset = row_numbers.start - length = row_numbers.stop - row_numbers.start - mapping = df.slice( - offset=offset, - length=length, + headers = df.slice(row_numbers.start, row_numbers.stop - row_numbers.start) + headers = headers.cast(pl.String) + expression = pl.all().str.concat(delimiter=separator) + expression = pl.struct(expression) + headers = headers.select(expression).to_series(0).to_list()[0] + df = df.rename(mapping=headers) + if remove_rows_above and remove_rows: + return df.slice(row_numbers.stop) + elif remove_rows_above: + return df.slice(row_numbers.start) + elif remove_rows: + expression = pl.int_range(pl.len()).is_between( + row_numbers.start, row_numbers.stop, closed="left" ) - mapping = mapping.select(expression) - else: - expression = pl.all().gather(row_numbers) - expression = expression.str.concat(delimiter=separator) - expression = pl.struct(expression) - mapping = df.select(expression) - - mapping = mapping.to_series(0)[0] - df = df.rename(mapping=mapping) - if remove_rows_above: - if not is_a_slice: - raise ValueError( - "The remove_rows_above argument is applicable " - "only if the row_numbers argument is an integer " - "or a slice." - ) - if remove_rows: - return df.slice(offset=row_numbers.stop) - return df.slice(offset=row_numbers.start) + return df.filter(~expression) + return df - if remove_rows: - if is_a_slice: - df = [ - df.slice(offset=0, length=row_numbers.start), - df.slice(offset=row_numbers.stop), - ] - return pl.concat(df, rechunk=True) - name = "".join(df.columns) - name = f"{name}_" - df = ( - df.with_row_index(name=name) - .filter(pl.col(name=name).is_in(row_numbers).not_()) - .select(pl.exclude(name)) + +@_row_to_names.register(list) # noqa: F811 +def _row_to_names_dispatch( # noqa: F811 + row_numbers, df, remove_rows, remove_rows_above, separator +): + if remove_rows_above: + raise ValueError( + "The remove_rows_above argument is applicable " + "only if the row_numbers argument is an integer " + "or a slice." ) - return df + for entry in row_numbers: + check("entry in the row_numbers argument", entry, [int]) + + expression = pl.col("*").gather(row_numbers) + headers = df.select(expression).cast(pl.String) + expression = pl.all().str.concat(delimiter=separator) + expression = pl.struct(expression) + headers = headers.select(expression).to_series(0).to_list()[0] + df = df.rename(mapping=headers) + if remove_rows: + expression = pl.int_range(pl.len()).is_in(row_numbers) + return df.filter(~expression) return df diff --git a/tests/functions/test_row_to_names.py b/tests/functions/test_row_to_names.py index 5295b44d9..758afe44d 100644 --- a/tests/functions/test_row_to_names.py +++ b/tests/functions/test_row_to_names.py @@ -74,6 +74,22 @@ def test_row_to_names_delete_above_slice(dataframe): assert df.iloc[0, 4] == "Basel" +@pytest.mark.functions +def test_row_to_names_delete_above_delete_rows(dataframe): + """ + Test output for remove_rows=True + and remove_rows_above=True + """ + df = dataframe.row_to_names( + slice(2, 4), remove_rows=True, remove_rows_above=True + ) + assert df.iloc[0, 0] == 2 + assert df.iloc[0, 1] == 2.456234 + assert df.iloc[0, 2] == 2 + assert df.iloc[0, 3] == "leopard" + assert df.iloc[0, 4] == "Shanghai" + + @pytest.mark.functions def test_row_to_names_delete_above_is_a_list(dataframe): "Raise if row_numbers is a list" diff --git a/tests/polars/functions/test_row_to_names_polars.py b/tests/polars/functions/test_row_to_names_polars.py index 372e2e09a..1c81660e0 100644 --- a/tests/polars/functions/test_row_to_names_polars.py +++ b/tests/polars/functions/test_row_to_names_polars.py @@ -17,14 +17,6 @@ def df(): ) -def test_separator_type(df): - """ - Raise if separator is not a string - """ - with pytest.raises(TypeError, match="separator should be.+"): - df.row_to_names([1, 2], separator=1) - - def test_row_numbers_type(df): """ Raise if row_numbers is not an int/slice/list @@ -88,8 +80,6 @@ def test_row_to_names_list(df): def test_row_to_names_delete_this_row(df): df = df.row_to_names(2, remove_rows=True) - if isinstance(df, pl.LazyFrame): - df = df.collect() assert df.to_series(0)[0] == 1.234_523_45 assert df.to_series(1)[0] == 1 assert df.to_series(2)[0] == "rabbit" @@ -98,8 +88,6 @@ def test_row_to_names_delete_this_row(df): def test_row_to_names_list_delete_this_row(df): df = df.row_to_names([2], remove_rows=True) - if isinstance(df, pl.LazyFrame): - df = df.collect() assert df.to_series(0)[0] == 1.234_523_45 assert df.to_series(1)[0] == 1 assert df.to_series(2)[0] == "rabbit" @@ -108,8 +96,6 @@ def test_row_to_names_list_delete_this_row(df): def test_row_to_names_delete_above(df): df = df.row_to_names(2, remove_rows_above=True) - if isinstance(df, pl.LazyFrame): - df = df.collect() assert df.to_series(0)[0] == 3.234_612_5 assert df.to_series(1)[0] == 3 assert df.to_series(2)[0] == "lion" @@ -119,8 +105,6 @@ def test_row_to_names_delete_above(df): def test_row_to_names_delete_above_list(df): "Test output if row_numbers is a list" df = df.row_to_names(slice(2, 4), remove_rows_above=True) - if isinstance(df, pl.LazyFrame): - df = df.collect() assert df.to_series(0)[0] == 3.234_612_5 assert df.to_series(1)[0] == 3 assert df.to_series(2)[0] == "lion" @@ -133,8 +117,6 @@ def test_row_to_names_delete_above_delete_rows(df): and remove_rows_above=True """ df = df.row_to_names(slice(2, 4), remove_rows=True, remove_rows_above=True) - if isinstance(df, pl.LazyFrame): - df = df.collect() assert df.to_series(0)[0] == 2.456234 assert df.to_series(1)[0] == 2 assert df.to_series(2)[0] == "leopard" @@ -147,8 +129,6 @@ def test_row_to_names_delete_above_delete_rows_scalar(df): and remove_rows_above=True """ df = df.row_to_names(2, remove_rows=True, remove_rows_above=True) - if isinstance(df, pl.LazyFrame): - df = df.collect() assert df.to_series(0)[0] == 1.23452345 assert df.to_series(1)[0] == 1 assert df.to_series(2)[0] == "rabbit" From e44c44a28a901072a2afdf3a13027b3296cf4237 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sun, 7 Jul 2024 16:44:11 +1000 Subject: [PATCH 15/17] update polars row to names --- janitor/polars/row_to_names.py | 19 ++++++++----------- pyproject.toml | 2 +- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/janitor/polars/row_to_names.py b/janitor/polars/row_to_names.py index a0b16b62d..9c90e8e89 100644 --- a/janitor/polars/row_to_names.py +++ b/janitor/polars/row_to_names.py @@ -140,9 +140,8 @@ def _row_to_names( def _row_to_names_dispatch( # noqa: F811 row_numbers, df, remove_rows, remove_rows_above, separator ): - expression = pl.col("*").cast(pl.String).gather(row_numbers) - expression = pl.struct(expression) - headers = df.select(expression).to_series(0).to_list()[0] + headers = df.row(row_numbers, named=True) + headers = {col: str(repl) for col, repl in headers.items()} df = df.rename(mapping=headers) if remove_rows_above and remove_rows: return df.slice(row_numbers + 1) @@ -163,10 +162,9 @@ def _row_to_names_dispatch( # noqa: F811 "The step argument for slice is not supported in row_to_names." ) headers = df.slice(row_numbers.start, row_numbers.stop - row_numbers.start) - headers = headers.cast(pl.String) expression = pl.all().str.concat(delimiter=separator) - expression = pl.struct(expression) - headers = headers.select(expression).to_series(0).to_list()[0] + headers = headers.select(expression).row(0, named=True) + headers = {col: str(repl) for col, repl in headers.items()} df = df.rename(mapping=headers) if remove_rows_above and remove_rows: return df.slice(row_numbers.stop) @@ -194,11 +192,10 @@ def _row_to_names_dispatch( # noqa: F811 for entry in row_numbers: check("entry in the row_numbers argument", entry, [int]) - expression = pl.col("*").gather(row_numbers) - headers = df.select(expression).cast(pl.String) - expression = pl.all().str.concat(delimiter=separator) - expression = pl.struct(expression) - headers = headers.select(expression).to_series(0).to_list()[0] + expression = pl.all().gather(row_numbers) + expression = expression.str.concat(delimiter=separator) + headers = df.select(expression).row(0, named=True) + headers = {col: str(repl) for col, repl in headers.items()} df = df.rename(mapping=headers) if remove_rows: expression = pl.int_range(pl.len()).is_in(row_numbers) diff --git a/pyproject.toml b/pyproject.toml index 0a697589f..85381f28f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ target-version = ['py36', 'py37', 'py38'] [tool.interrogate] exclude = ["setup.py", "docs", "nbconvert_config.py"] -fail-under = 55 +fail-under = 5 ignore-init-method = true ignore-init-module = true ignore-module = false From c345965a9d90b15c47af15b7762e852e228d4e4b Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sun, 7 Jul 2024 16:44:20 +1000 Subject: [PATCH 16/17] update polars row to names --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 85381f28f..0a697589f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ target-version = ['py36', 'py37', 'py38'] [tool.interrogate] exclude = ["setup.py", "docs", "nbconvert_config.py"] -fail-under = 5 +fail-under = 55 ignore-init-method = true ignore-init-module = true ignore-module = false From 7f7afb3eb902d5bac57dda2462e6f1b4146edb33 Mon Sep 17 00:00:00 2001 From: "samuel.oranyeli" Date: Sun, 7 Jul 2024 16:46:27 +1000 Subject: [PATCH 17/17] remove warnings related to typing --- janitor/polars/complete.py | 2 +- janitor/polars/pivot_longer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/janitor/polars/complete.py b/janitor/polars/complete.py index 546f903bc..ddd6f0a2d 100644 --- a/janitor/polars/complete.py +++ b/janitor/polars/complete.py @@ -11,7 +11,7 @@ try: import polars as pl import polars.selectors as cs - from polars.type_aliases import ColumnNameOrSelector + from polars._typing import ColumnNameOrSelector except ImportError: import_message( submodule="polars", diff --git a/janitor/polars/pivot_longer.py b/janitor/polars/pivot_longer.py index 9dea2581f..15cce254c 100644 --- a/janitor/polars/pivot_longer.py +++ b/janitor/polars/pivot_longer.py @@ -8,7 +8,7 @@ try: import polars as pl - from polars.type_aliases import ColumnNameOrSelector + from polars._typing import ColumnNameOrSelector except ImportError: import_message( submodule="polars",