diff --git a/src/nested_pandas/__init__.py b/src/nested_pandas/__init__.py
index 4577a8f..613f651 100644
--- a/src/nested_pandas/__init__.py
+++ b/src/nested_pandas/__init__.py
@@ -1,7 +1,8 @@
 from .example_module import greetings, meaning
+from .nestedframe import NestedFrame
 
 # Import for registering
 from .series.accessor import NestSeriesAccessor  # noqa: F401
 from .series.dtype import NestedDtype
 
-__all__ = ["greetings", "meaning", "NestedDtype"]
+__all__ = ["greetings", "meaning", "NestedDtype", "NestedFrame"]
diff --git a/src/nested_pandas/nestedframe/__init__.py b/src/nested_pandas/nestedframe/__init__.py
new file mode 100644
index 0000000..54af689
--- /dev/null
+++ b/src/nested_pandas/nestedframe/__init__.py
@@ -0,0 +1 @@
+from .core import NestedFrame  # noqa
diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py
new file mode 100644
index 0000000..dfe4db6
--- /dev/null
+++ b/src/nested_pandas/nestedframe/core.py
@@ -0,0 +1,88 @@
+# typing.Self and "|" union syntax don't exist in Python 3.9
+from __future__ import annotations
+
+import pandas as pd
+
+from nested_pandas.series import packer
+from nested_pandas.series.dtype import NestedDtype
+
+
+class NestedFrame(pd.DataFrame):
+    """A pandas DataFrame subclass with support for nested columns.
+
+    See https://pandas.pydata.org/docs/development/extending.html#subclassing-pandas-data-structures
+    """
+
+    # normal properties
+    _metadata = ["added_property"]
+
+    @property
+    def _constructor(self) -> Self:  # type: ignore[name-defined] # noqa: F821
+        return NestedFrame
+
+    @property
+    def _constructor_expanddim(self) -> Self:  # type: ignore[name-defined] # noqa: F821
+        return NestedFrame
+
+    @property
+    def all_columns(self) -> dict:
+        """Return the column labels of the base frame and of each nested column.
+
+        The ``"base"`` key maps to ``self.columns``; each column with a
+        ``NestedDtype`` adds a key (its label) that maps to its nested fields.
+        """
+        all_columns = {"base": self.columns}
+        for column in self.nested_columns:
+            all_columns[column] = self[column].nest.fields
+        return all_columns
+
+    @property
+    def nested_columns(self) -> list:
+        """Return the labels of all columns whose dtype is a ``NestedDtype``."""
+        return [column for column in self.columns if isinstance(self[column].dtype, NestedDtype)]
+
+    def _is_known_hierarchical_column(self, colname) -> bool:
+        """Determine whether a label is a known hierarchical column name.
+
+        A hierarchical name has the form ``"<nested column>.<field>"``.
+
+        Parameters
+        ----------
+        colname
+            Candidate column label.
+
+        Returns
+        -------
+        bool
+            True when the text before the first "." is a nested column of this
+            frame and the remainder is one of that column's fields.
+        """
+        # Column labels can be any hashable; only strings can be hierarchical.
+        if not isinstance(colname, str):
+            return False
+        # Split on the first "." only: unpacking ``colname.split(".")`` raises
+        # ValueError as soon as the label contains more than one dot.
+        left, sep, right = colname.partition(".")
+        if not sep or left not in self.nested_columns:
+            return False
+        return right in self.all_columns[left]
+
+    def add_nested(self, nested, name) -> Self:  # type: ignore[name-defined] # noqa: F821
+        """Pack a flat dataframe into a nested column and add it to this frame.
+
+        Parameters
+        ----------
+        nested
+            Flat dataframe to pack; forwarded to ``packer.pack_flat``.
+        name
+            Name forwarded to ``packer.pack_flat`` for the packed series.
+
+        Returns
+        -------
+        NestedFrame
+            Result of ``self.assign`` with the packed series added as a column
+            labelled with the packed series' name.
+        """
+        packed = packer.pack_flat(nested, name=name)
+        label = packed.name
+        return self.assign(**{f"{label}": packed})
diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py
new file mode 100644
index 0000000..8c70da3
--- /dev/null
+++ b/tests/nested_pandas/nestedframe/test_nestedframe.py
@@ -0,0 +1,78 @@
+import pandas as pd
+from nested_pandas import NestedFrame
+
+
+def test_nestedframe_construction():
+    """Test NestedFrame construction"""
+    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
+
+    assert isinstance(base, NestedFrame)
+
+
+def test_all_columns():
+    """Test the all_columns function"""
+
+    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
+
+    assert list(base.all_columns.keys()) == ["base"]
+    assert list(base.all_columns["base"]) == list(base.columns)
+
+    nested = pd.DataFrame(
+        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
+        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
+    )
+
+    base = base.add_nested(nested, "nested")
+
+    assert list(base.all_columns.keys()) == ["base", "nested"]
+    assert list(base.all_columns["nested"]) == list(nested.columns)
+
+
+def test_nested_columns():
+    """Test that nested_columns correctly retrieves the nested base columns"""
+
+    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
+
+    nested = pd.DataFrame(
+        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
+        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
+    )
+
+    base = base.add_nested(nested, "nested")
+
+    assert base.nested_columns == ["nested"]
+
+
+def test_is_known_hierarchical_column():
+    """Test that hierarchical column labels can be identified"""
+
+    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
+
+    nested = pd.DataFrame(
+        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
+        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
+    )
+
+    base = base.add_nested(nested, "nested")
+
+    assert base._is_known_hierarchical_column("nested.c")
+    assert not base._is_known_hierarchical_column("nested.b")
+    assert not base._is_known_hierarchical_column("base.a")
+    assert not base._is_known_hierarchical_column("nested.c.d")
+    assert not base._is_known_hierarchical_column(0)
+
+
+def test_add_nested():
+    """Test that add_nested correctly adds a nested column to the base df"""
+
+    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
+
+    nested = pd.DataFrame(
+        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
+        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
+    )
+
+    base = base.add_nested(nested, "nested")
+
+    assert "nested" in base.columns
+    assert base.nested.nest.to_flat().equals(nested)