lincc-frameworks · dougbrn · Apr 11, 2024 · Apr 10, 2024 · Apr 10, 2024 · Apr 10, 2024
diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py
@@ -1,7 +1,10 @@
 # typing.Self and "|" union syntax don't exist in Python 3.9
 from __future__ import annotations
 
+import numpy as np
 import pandas as pd
+from pandas._libs import lib
+from pandas._typing import AnyAll, Axis, IndexLabel
 
 from nested_pandas.series import packer
 from nested_pandas.series.dtype import NestedDtype
@@ -154,3 +157,132 @@ def query(self, expr) -> Self:  # type: ignore[name-defined] # noqa: F821
                 # TODO: does not work with queries that empty the dataframe
                 result[expr] = result[expr].nest.query_flat(exprs_to_use[expr])
         return result
+
+    def dropna(
+        self,
+        *,
+        axis: Axis = 0,
+        how: AnyAll | lib.NoDefault = lib.no_default,
+        thresh: int | lib.NoDefault = lib.no_default,
+        on_nested: str | bool = False,
+        subset: IndexLabel | None = None,
+        inplace: bool = False,
+        ignore_index: bool = False,
+    ) -> NestedFrame | None:
+        """
+        Remove missing values for one layer of the NestedFrame.
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            Determine if rows or columns which contain missing values are
+            removed.
+
+            * 0, or 'index' : Drop rows which contain missing values.
+            * 1, or 'columns' : Drop columns which contain missing value.
+
+            Only a single axis is allowed.
+
+        how : {'any', 'all'}, default 'any'
+            Determine if row or column is removed from DataFrame, when we have
+            at least one NA or all NA.
+
+            * 'any' : If any NA values are present, drop that row or column.
+            * 'all' : If all values are NA, drop that row or column.
+        thresh : int, optional
+            Require that many non-NA values. Cannot be combined with how.
+        on_nested : str or bool, optional
+            If not False, applies the call to the nested dataframe in the
+            column with label equal to the provided string. If specified,
+            the nested dataframe should align with any columns given in
+            `subset`.
+        subset : column label or sequence of labels, optional
+            Labels along other axis to consider, e.g. if you are dropping rows
+            these would be a list of columns to include.
+
+            Access nested columns using `nested_df.nested_col` (where
+            `nested_df` refers to a particular nested dataframe and
+            `nested_col` is a column of that nested dataframe).
+        inplace : bool, default False
+            Whether to modify the DataFrame rather than creating a new one.
+        ignore_index : bool, default ``False``
+            If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
+
+            .. versionadded:: 2.0.0
+
+        Returns
+        -------
+        NestedFrame or None
+            NestedFrame with NA entries dropped from the target layer or None if ``inplace=True``.
+
+        Raises
+        ------
+        ValueError
+            If `subset` names more than one layer or matches no known column, if
+            `on_nested` is not a nested column, or if `on_nested` and `subset` disagree.
+
+        Notes
+        -----
+        Only one layer (the base frame or a single nested frame) is targeted per call.
+        When a nested layer is targeted, only that nested frame is changed.
+        """
+        # Determine the target layer, starting with the layer(s) named by subset
+        subset_target = []
+        if subset:
+            if isinstance(subset, str):
+                subset = [subset]
+
+            for col in subset:
+                col = col.split(".")[0]
+                if col in self.nested_columns:
+                    subset_target.append(col)
+                elif col in self.columns:
+                    subset_target.append("base")
+
+            # Check for 1 target
+            subset_target = np.unique(subset_target)
+            if len(subset_target) > 1:  # prohibit multi-target operations
+                raise ValueError(
+                    f"Targeted multiple nested structures ({subset_target}), write one command per target dataframe"  # noqa
+                )
+            elif len(subset_target) == 0:
+                raise ValueError(
+                    "Provided base columns or nested layer did not match any found in the nestedframe"
+                )
+            subset_target = subset_target[0]
+
+        # Next check the on_nested kwarg input
+        if on_nested and on_nested not in self.nested_columns:
+            raise ValueError("Provided nested layer not found in nested dataframes")
+
+        # Resolve target layer
+        target = "base"
+        if on_nested and subset_target:
+            if on_nested != subset_target:
+                raise ValueError(
+                    f"Provided on_nested={on_nested}, but subset columns are from {subset_target}. Make sure these are aligned or just use subset."  # noqa
+                )
+            else:
+                target = str(subset_target)
+        elif on_nested:
+            target = str(on_nested)
+        elif subset_target:
+            target = str(subset_target)
+
+        if target == "base":
+            return super().dropna(
+                axis=axis, how=how, thresh=thresh, subset=subset, inplace=inplace, ignore_index=ignore_index
+            )
+
+        if subset is not None:
+            subset = [col.split(".")[-1] for col in subset]
+        # Never forward inplace to the flat frame: inplace=True returns None, which cannot be packed
+        # NOTE(review): ignore_index=True relabels the flat index used for packing -- confirm intent
+        flat = self[target].nest.to_flat().dropna(
+            axis=axis, how=how, thresh=thresh, subset=subset, ignore_index=ignore_index
+        )
+
+        # Write into self only when inplace; otherwise leave self untouched and return a copy
+        result = self if inplace else self.copy()
+        result[target] = packer.pack_flat(flat)
+        return None if inplace else result
diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py
@@ -1,3 +1,4 @@
+import numpy as np
 import pandas as pd
 import pytest
 from nested_pandas import NestedFrame
@@ -101,3 +102,65 @@ def test_query():
 
     nest_queried = base.query("(nested.c > 1) and (nested.d>2)")
     assert len(nest_queried.nested.nest.to_flat()) == 4
+
+
+def test_dropna():
+    """Test that dropna works on the base layer and on a nested layer"""
+
+    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, np.nan, 6]}, index=[0, 1, 2])
+
+    nested = pd.DataFrame(
+        data={"c": [0, 2, 4, 1, np.nan, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
+        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
+    )
+
+    base = base.add_nested(nested, "nested")
+
+    # Base layer: row 1 has NaN in "b", so 2 base rows and their 6 nested rows remain
+    dn_base = base.dropna(subset=["b"])
+    assert len(dn_base) == 2
+    assert len(dn_base["nested"].nest.to_flat()) == 6
+
+    # on_nested kwarg: one nested row has NaN in "c", so 8 of 9 nested rows remain
+    dn_on_nested = base.dropna(on_nested="nested")
+    assert len(dn_on_nested) == 3
+    assert len(dn_on_nested["nested"].nest.to_flat()) == 8
+
+    # Hierarchical column subset ("layer.column") selects the nested layer on its own
+    dn_hierarchical = base.dropna(subset="nested.c")
+    assert len(dn_hierarchical) == 3
+    assert len(dn_hierarchical["nested"].nest.to_flat()) == 8
+
+    # Hierarchical column subset together with a matching on_nested
+    dn_hierarchical = base.dropna(on_nested="nested", subset="nested.c")
+    assert len(dn_hierarchical) == 3
+    assert len(dn_hierarchical["nested"].nest.to_flat()) == 8
+
+
+def test_dropna_errors():
+    """Test that each dropna validation branch raises its own ValueError"""
+
+    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, np.nan, 6]}, index=[0, 1, 2])
+
+    nested = pd.DataFrame(
+        data={"c": [0, 2, 4, 1, np.nan, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
+        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
+    )
+
+    base = base.add_nested(nested, "nested")
+
+    # Multi-target: "b" is a base column while "nested.c" belongs to the nested layer
+    with pytest.raises(ValueError, match="Targeted multiple nested structures"):
+        base.dropna(subset=["b", "nested.c"])
+
+    # No target: "not_nested" is neither a base column nor a nested layer
+    with pytest.raises(ValueError, match="did not match any"):
+        base.dropna(subset=["not_nested.c"])
+
+    # Bad on_nested value: the named layer does not exist
+    with pytest.raises(ValueError, match="nested layer not found"):
+        base.dropna(on_nested="not_nested")
+
+    # on_nested + subset disagreement: subset points at the base layer
+    with pytest.raises(ValueError, match="Make sure these are aligned"):
+        base.dropna(on_nested="nested", subset=["b"])