Add "how" argument to add_nested (#125)

* Add "how" argument to add_nested * Ruff formatting * Add assert * Rearrange test assertions
lincc-frameworks · Jul 26, 2024 · bc85088 · bc85088
1 parent bcaf1b8
commit bc85088
Show file tree

Hide file tree

Showing 2 changed files with 91 additions and 9 deletions.
diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py
@@ -66,7 +66,12 @@ def _is_known_column(self, colname) -> bool:
         return colname in self.columns or self._is_known_hierarchical_column(colname)
 
     def add_nested(
-        self, obj, name: str, *, dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None
+        self,
+        obj,
+        name: str,
+        *,
+        how: str = "left",
+        dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
     ) -> Self:  # type: ignore[name-defined] # noqa: F821
         """Packs input object to a nested column and adds it to the NestedFrame
 
@@ -86,6 +91,16 @@ def add_nested(
             missing values.
         name : str
             The name of the nested column to be added to the NestedFrame.
+        how : {'left', 'right', 'outer', 'inner'}, default: 'left'
+            How to handle the operation of the two objects:
+
+            - left: use calling frame's index.
+            - right: use the packed nested column's index; calling-frame rows
+              with no match are dropped and new index values get NaN base columns.
+            - outer: form union of calling frame's index with other frame's
+              index, and sort it lexicographically.
+            - inner: form intersection of calling frame's index with other
+              frame's index, preserving the order of the calling index.
         dtype : dtype or None
             NestedDtype to use for the nested column; pd.ArrowDtype or
             pa.DataType can also be used to specify the nested dtype. If None,
@@ -98,10 +113,8 @@ def add_nested(
         """
         # Add sources to objects
         packed = packer.pack(obj, name=name, dtype=dtype)
-        label = packed.name
         new_df = self.copy()
-        new_df[label] = packed
-        return new_df
+        return new_df.join(packed, how=how)
 
     @classmethod
     def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nested"):

diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py
@@ -89,13 +89,82 @@ def test_add_nested_with_flat_df_and_mismatched_index():
 
     nested = pd.DataFrame(
         data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
-        index=[0, 0, 0, 1, 1, 1, 1, 1, 1],  # no data for index value of "2"
+        # no data for base index value of "2" and introduces new index value "4"
+        index=[0, 0, 0, 1, 1, 1, 1, 4, 4],
     )
 
-    base = base.add_nested(nested, "nested")
-
-    assert "nested" in base.columns
-    assert pd.isna(base.loc[2]["nested"])
+    # Add the nested frame in a "left" fashion, where the index of the "left"
+    # frame (our base layer) is preserved
+    left_res = base.add_nested(nested, "nested", how="left")
+    assert "nested" in left_res.columns
+    # Check that the index of the base layer is being used
+    assert (left_res.index == base.index).all()
+    for idx in left_res.index:
+        # Check that the nested column is aligned correctly to the base layer
+        if idx in nested.index:
+            assert left_res.loc[idx]["nested"] is not None
+        else:  # idx only in base.index
+            assert left_res.loc[idx]["nested"] is None
+
+    # Test that the default behavior is the same as how="left" by comparing the pandas dataframes
+    default_res = base.add_nested(nested, "nested")
+    assert_frame_equal(left_res, default_res)
+
+    # Test adding the nested frame in a "right" fashion, where the index of the "right"
+    # frame (our nested layer) is preserved
+    right_res = base.add_nested(nested, "nested", how="right")
+    assert "nested" in right_res.columns
+    # Check that the index of the nested layer is being used. Note that, unlike
+    # a traditional join, the result index is not the flat nested index itself
+    # but its unique values: base rows without a match in the nested layer are
+    # dropped, and nested-only index values are kept.
+    assert (right_res.index == nested.index.unique()).all()
+    # For each index check that the base layer is aligned correctly to the nested layer
+    for idx in right_res.index:
+        # Check that the nested column is aligned correctly to the base layer. Here
+        # it should never be None
+        assert right_res.loc[idx]["nested"] is not None
+        # Check the values for each column in our "base" layer
+        for col in base.columns:
+            assert col in right_res.columns
+            if idx not in base.index:
+                # We expect a NaN value in the base layer due to the "right" join
+                assert pd.isna(right_res.loc[idx][col])
+            else:
+                assert not pd.isna(right_res.loc[idx][col])
+
+    # Test the "outer" behavior
+    outer_res = base.add_nested(nested, "nested", how="outer")
+    assert "nested" in outer_res.columns
+    # We expect the new index to be the union of the base and nested indices
+    assert set(outer_res.index) == set(base.index).union(set(nested.index))
+    for idx in outer_res.index:
+        # Check that the nested column is aligned correctly to the base layer
+        if idx in nested.index:
+            assert outer_res.loc[idx]["nested"] is not None
+        else:  # idx only in base.index
+            assert outer_res.loc[idx]["nested"] is None
+        # Check the values for each column in our "base" layer
+        for col in base.columns:
+            assert col in outer_res.columns
+            if idx not in base.index:
+                # We expect a NaN value in the base layer due to the "outer" join
+                assert pd.isna(outer_res.loc[idx][col])
+            else:
+                assert not pd.isna(outer_res.loc[idx][col])
+
+    # Test the "inner" behavior
+    inner_res = base.add_nested(nested, "nested", how="inner")
+    assert "nested" in inner_res.columns
+    # We expect the new index to be the set intersection of the base and nested indices
+    assert set(inner_res.index) == set(base.index).intersection(set(nested.index))
+    for idx in inner_res.index:
+        # None of our nested values should be None
+        assert inner_res.loc[idx]["nested"] is not None
+        # Check the values for each column in our "base" layer
+        for col in base.columns:
+            assert col in inner_res.columns
+            assert not pd.isna(inner_res.loc[idx][col])
 
 
 def test_add_nested_with_series():