Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Wrap dropna #26

Merged
merged 9 commits into from
Apr 11, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 132 additions & 0 deletions src/nested_pandas/nestedframe/core.py
dougbrn marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
# typing.Self and "|" union syntax don't exist in Python 3.9
from __future__ import annotations

import numpy as np
import pandas as pd
from pandas._libs import lib
dougbrn marked this conversation as resolved.
Show resolved Hide resolved
from pandas._typing import AnyAll, Axis, IndexLabel

from nested_pandas.series import packer
from nested_pandas.series.dtype import NestedDtype
Expand Down Expand Up @@ -154,3 +157,132 @@ def query(self, expr) -> Self: # type: ignore[name-defined] # noqa: F821
# TODO: does not work with queries that empty the dataframe
result[expr] = result[expr].nest.query_flat(exprs_to_use[expr])
return result

def dropna(
    self,
    *,
    axis: Axis = 0,
    how: AnyAll | lib.NoDefault = lib.no_default,
    thresh: int | lib.NoDefault = lib.no_default,
    on_nested: str | bool = False,
    subset: IndexLabel | None = None,
    inplace: bool = False,
    ignore_index: bool = False,
) -> NestedFrame | None:
    """
    Remove missing values from the base frame or from one nested layer.

    The call targets exactly one layer: the base dataframe (default) or a
    single nested dataframe, selected through `on_nested` and/or through
    the prefixes of the labels in `subset`.

    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Determine if rows or columns which contain missing values are
        removed.

        * 0, or 'index' : Drop rows which contain missing values.
        * 1, or 'columns' : Drop columns which contain missing values.

        Only a single axis is allowed.

    how : {'any', 'all'}, default 'any'
        Determine if row or column is removed from DataFrame, when we have
        at least one NA or all NA.

        * 'any' : If any NA values are present, drop that row or column.
        * 'all' : If all values are NA, drop that row or column.
    thresh : int, optional
        Require that many non-NA values. Cannot be combined with how.
    on_nested : str or bool, optional
        If not False, applies the call to the nested dataframe in the
        column with label equal to the provided string. If specified,
        the nested dataframe should align with any columns given in
        `subset`.
    subset : column label or sequence of labels, optional
        Labels along other axis to consider, e.g. if you are dropping rows
        these would be a list of columns to include.

        Access nested columns using `nested_df.nested_col` (where
        `nested_df` refers to a particular nested dataframe and
        `nested_col` is a column of that nested dataframe).
    inplace : bool, default False
        Whether to modify this frame rather than creating a new one.
    ignore_index : bool, default ``False``
        If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
        Only forwarded when the base frame is the target; see Notes.

    Returns
    -------
    NestedFrame or None
        Frame with NA entries dropped from the targeted layer, or None if
        ``inplace=True``.

    Raises
    ------
    ValueError
        If `subset` addresses more than one layer, if it matches no base
        column or nested layer, if `on_nested` is not a nested column, or
        if `on_nested` and `subset` point at different layers.

    Notes
    -----
    Operations that target a particular nested structure return a dataframe
    with rows of that particular nested structure affected.

    When a nested layer is targeted, `ignore_index` is not applied to the
    flattened nested frame: its index is what the flat rows are re-packed
    by, so relabelling it would break the association with the base rows.
    """

    # --- Determine which layer the `subset` labels point at ---------------
    subset_target = ""  # stays falsy when `subset` gives no hint
    if subset:
        if isinstance(subset, str):
            subset = [subset]

        layers = []
        for col in subset:
            # "layer.column" addresses a nested column; a bare label is
            # either a nested layer name or a base column.
            layer = col.split(".")[0]
            if layer in self.nested_columns:
                layers.append(layer)
            elif layer in self.columns:
                layers.append("base")

        # Exactly one layer may be addressed per call
        unique_layers = np.unique(layers)
        if len(unique_layers) > 1:  # prohibit multi-target operations
            raise ValueError(
                f"Targeted multiple nested structures ({unique_layers}), write one command per target dataframe"  # noqa
            )
        elif len(unique_layers) == 0:
            raise ValueError(
                "Provided base columns or nested layer did not match any found in the nestedframe"
            )
        subset_target = unique_layers[0]

    # --- Validate `on_nested` and reconcile it with `subset` --------------
    if on_nested and on_nested not in self.nested_columns:
        raise ValueError("Provided nested layer not found in nested dataframes")

    if on_nested and subset_target and on_nested != subset_target:
        raise ValueError(
            f"Provided on_nested={on_nested}, but subset columns are from {subset_target}. Make sure these are aligned or just use subset."  # noqa
        )

    # After the checks above both hints agree whenever both are given
    target = str(subset_target or on_nested or "base")

    if target == "base":
        return super().dropna(
            axis=axis, how=how, thresh=thresh, subset=subset, inplace=inplace, ignore_index=ignore_index
        )

    # --- Nested target: flatten, drop, re-pack ----------------------------
    nested_subset = None
    if subset is not None:
        # strip the "layer." prefix so labels match the flat frame's columns
        nested_subset = [col.split(".")[-1] for col in subset]

    # Never drop in place on the temporary flat frame: that would return
    # None and leave nothing to re-pack. `ignore_index` is deliberately not
    # forwarded either (see Notes in the docstring).
    flat = self[target].nest.to_flat().dropna(axis=axis, how=how, thresh=thresh, subset=nested_subset)
    packed = packer.pack_flat(flat)

    if inplace:
        self[target] = packed
        return None

    # Leave the caller's frame untouched when not operating in place
    result = self.copy()
    result[target] = packed
    return result
63 changes: 63 additions & 0 deletions tests/nested_pandas/nestedframe/test_nestedframe.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import numpy as np
import pandas as pd
import pytest
from nested_pandas import NestedFrame
Expand Down Expand Up @@ -101,3 +102,65 @@ def test_query():

nest_queried = base.query("(nested.c > 1) and (nested.d>2)")
assert len(nest_queried.nested.nest.to_flat()) == 4


def test_dropna():
    """Test that dropna works on the base layer and on a nested layer."""

    # Base row 1 has a NaN in "b"; the nested rows of index 1 hold one NaN in "c"
    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, np.nan, 6]}, index=[0, 1, 2])

    nested = pd.DataFrame(
        data={"c": [0, 2, 4, 1, np.nan, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )

    base = base.add_nested(nested, "nested")

    # Test basic functionality: dropping a base row removes its nested rows too
    dn_base = base.dropna(subset=["b"])
    assert len(dn_base) == 2
    assert len(dn_base["nested"].nest.to_flat()) == 6

    # Test on_nested kwarg: only the single NaN nested row is removed
    dn_on_nested = base.dropna(on_nested="nested")
    assert len(dn_on_nested) == 3
    assert len(dn_on_nested["nested"].nest.to_flat()) == 8

    # Test hierarchical column subset
    dn_hierarchical = base.dropna(subset="nested.c")
    assert len(dn_hierarchical) == 3
    assert len(dn_hierarchical["nested"].nest.to_flat()) == 8

    # Test hierarchical column subset and on_nested
    dn_hierarchical = base.dropna(on_nested="nested", subset="nested.c")
    assert len(dn_hierarchical) == 3
    assert len(dn_hierarchical["nested"].nest.to_flat()) == 8


def test_dropna_errors():
    """Test that each dropna validation error is raised for its own reason."""

    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, np.nan, 6]}, index=[0, 1, 2])

    nested = pd.DataFrame(
        data={"c": [0, 2, 4, 1, np.nan, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )

    base = base.add_nested(nested, "nested")

    # `match` pins each case to its own message, so a case cannot pass by
    # tripping a different ValueError than the one it is meant to cover.

    # Test multi-target
    with pytest.raises(ValueError, match="Targeted multiple nested structures"):
        base.dropna(subset=["b", "nested.c"])

    # Test no-target
    with pytest.raises(ValueError, match="did not match any"):
        base.dropna(subset=["not_nested.c"])

    # Test bad on-nested value
    with pytest.raises(ValueError, match="not found in nested dataframes"):
        base.dropna(on_nested="not_nested")

    # Test on-nested + subset disagreement
    with pytest.raises(ValueError, match="Make sure these are aligned"):
        base.dropna(on_nested="nested", subset=["b"])