From bc85088ee66584c5f956817c6a84d48072028124 Mon Sep 17 00:00:00 2001 From: Wilson Beebe Date: Fri, 26 Jul 2024 14:54:10 -0700 Subject: [PATCH] Add "how" argument to add_nested (#125) * Add "how" argument to add_nested * Ruff formatting * Add assert * Rearrange test assertions --- src/nested_pandas/nestedframe/core.py | 21 ++++- .../nestedframe/test_nestedframe.py | 79 +++++++++++++++++-- 2 files changed, 91 insertions(+), 9 deletions(-) diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index 285c598..aa15401 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -66,7 +66,12 @@ def _is_known_column(self, colname) -> bool: return colname in self.columns or self._is_known_hierarchical_column(colname) def add_nested( - self, obj, name: str, *, dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None + self, + obj, + name: str, + *, + how: str = "left", + dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None, ) -> Self: # type: ignore[name-defined] # noqa: F821 """Packs input object to a nested column and adds it to the NestedFrame @@ -86,6 +91,16 @@ def add_nested( missing values. name : str The name of the nested column to be added to the NestedFrame. + how : {'left', 'right', 'outer', 'inner'}, default: 'left' + How to handle the operation of the two objects: + + - left: use calling frame's index. + - right: use the other frame's index; the calling frame's columns + are missing for index values absent from the calling frame. + - outer: form union of calling frame's index with other frame's + index, and sort it lexicographically. + - inner: form intersection of calling frame's index with other + frame's index, preserving the order of the calling index. dtype : dtype or None NestedDtype to use for the nested column; pd.ArrowDtype or pa.DataType can also be used to specify the nested dtype. 
If None, @@ -98,10 +113,8 @@ def add_nested( """ # Add sources to objects packed = packer.pack(obj, name=name, dtype=dtype) - label = packed.name new_df = self.copy() - new_df[label] = packed - return new_df + return new_df.join(packed, how=how) @classmethod def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nested"): diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py index 019f5f9..4b7c010 100644 --- a/tests/nested_pandas/nestedframe/test_nestedframe.py +++ b/tests/nested_pandas/nestedframe/test_nestedframe.py @@ -89,13 +89,82 @@ def test_add_nested_with_flat_df_and_mismatched_index(): nested = pd.DataFrame( data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]}, - index=[0, 0, 0, 1, 1, 1, 1, 1, 1], # no data for index value of "2" + # no data for base index value of "2" and introduces new index value "4" + index=[0, 0, 0, 1, 1, 1, 1, 4, 4], ) - base = base.add_nested(nested, "nested") - - assert "nested" in base.columns - assert pd.isna(base.loc[2]["nested"]) + # Add the nested frame in a "left" fashion, where the index of the "left" + # frame (our base layer) is preserved + left_res = base.add_nested(nested, "nested", how="left") + assert "nested" in left_res.columns + # Check that the index of the base layer is being used + assert (left_res.index == base.index).all() + for idx in left_res.index: + # Check that the nested column is aligned correctly to the base layer + if idx in nested.index: + assert left_res.loc[idx]["nested"] is not None + else: # idx only in base.index + assert left_res.loc[idx]["nested"] is None + + # Test that the default behavior is the same as how="left" by comparing the pandas dataframes + default_res = base.add_nested(nested, "nested") + assert_frame_equal(left_res, default_res) + + # Test adding the nested frame in a "right" fashion, where the index of the "right" + # frame (our nested layer) is preserved + right_res = 
base.add_nested(nested, "nested", how="right") + assert "nested" in right_res.columns + # Check that the index of the nested layer is being used. Note that unlike + # a traditional join the result does not repeat the nested layer's index: + # each nested index value appears once, and base layer rows without a match + # in the nested layer are dropped. + assert (right_res.index == nested.index.unique()).all() + # For each index check that the base layer is aligned correctly to the nested layer + for idx in right_res.index: + # Check that the nested column is aligned correctly to the base layer. Here + # it should never be None + assert right_res.loc[idx]["nested"] is not None + # Check the values for each column in our "base" layer + for col in base.columns: + assert col in right_res.columns + if idx not in base.index: + # We expect a NaN value in the base layer due to the "right" join + assert pd.isna(right_res.loc[idx][col]) + else: + assert not pd.isna(right_res.loc[idx][col]) + + # Test the "outer" behavior + outer_res = base.add_nested(nested, "nested", how="outer") + assert "nested" in outer_res.columns + # We expect the new index to be the union of the base and nested indices + assert set(outer_res.index) == set(base.index).union(set(nested.index)) + for idx in outer_res.index: + # Check that the nested column is aligned correctly to the base layer + if idx in nested.index: + assert outer_res.loc[idx]["nested"] is not None + else: # idx only in base.index + assert outer_res.loc[idx]["nested"] is None + # Check the values for each column in our "base" layer + for col in base.columns: + assert col in outer_res.columns + if idx not in base.index: + # We expect a NaN value in the base layer due to the "outer" join + assert pd.isna(outer_res.loc[idx][col]) + else: + assert not pd.isna(outer_res.loc[idx][col]) + + # Test the "inner" behavior + inner_res = base.add_nested(nested, "nested", how="inner") + assert "nested" in inner_res.columns + # We expect the new index to be 
the set intersection of the base and nested indices + assert set(inner_res.index) == set(base.index).intersection(set(nested.index)) + for idx in inner_res.index: + # None of our nested values should be None + assert inner_res.loc[idx]["nested"] is not None + # Check the values for each column in our "base" layer + for col in base.columns: + assert col in inner_res.columns + assert not pd.isna(inner_res.loc[idx][col]) def test_add_nested_with_series():