Skip to content

Commit

Permalink
Merge pull request #5 from lincc-frameworks/init_nestedframe
Browse files Browse the repository at this point in the history
initialize nestedframe
  • Loading branch information
dougbrn authored Apr 4, 2024
2 parents 9ca3810 + 5504dba commit e04f558
Show file tree
Hide file tree
Showing 4 changed files with 139 additions and 1 deletion.
3 changes: 2 additions & 1 deletion src/nested_pandas/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from .example_module import greetings, meaning
from .nestedframe import NestedFrame

# Import for registering
from .series.accessor import NestSeriesAccessor # noqa: F401
from .series.dtype import NestedDtype

__all__ = ["greetings", "meaning", "NestedDtype"]
__all__ = ["greetings", "meaning", "NestedDtype", "NestedFrame"]
1 change: 1 addition & 0 deletions src/nested_pandas/nestedframe/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .core import NestedFrame # noqa
60 changes: 60 additions & 0 deletions src/nested_pandas/nestedframe/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# typing.Self and "|" union syntax don't exist in Python 3.9
from __future__ import annotations

import pandas as pd

from nested_pandas.series import packer
from nested_pandas.series.dtype import NestedDtype


class NestedFrame(pd.DataFrame):
    """A Pandas Dataframe extension with support for nested structure.

    Columns whose dtype is ``NestedDtype`` are treated as nested dataframes
    packed into a single column; their inner fields can be addressed with the
    hierarchical label ``"<nested column>.<field>"``.

    See https://pandas.pydata.org/docs/development/extending.html#subclassing-pandas-data-structures
    """

    # normal properties
    # NOTE(review): "added_property" looks like the placeholder name from the
    # pandas subclassing guide; nothing in this class sets it -- confirm
    # whether it is still needed.
    _metadata = ["added_property"]

    @property
    def _constructor(self) -> "type[NestedFrame]":
        """Return the class used to build new frames (``NestedFrame`` itself)."""
        return NestedFrame

    @property
    def _constructor_expanddim(self) -> "type[NestedFrame]":
        """Return the class used for expanded-dimension results (``NestedFrame``)."""
        return NestedFrame

    @property
    def all_columns(self) -> dict:
        """Return the column labels of the base frame and of each nested frame.

        The key ``"base"`` maps to ``self.columns``; every nested column name
        maps to the fields reported by that column's ``.nest`` accessor.
        """
        columns = {"base": self.columns}
        columns.update({name: self[name].nest.fields for name in self.nested_columns})
        return columns

    @property
    def nested_columns(self) -> list:
        """Return the names of the base columns that hold nested dataframes."""
        return [column for column in self.columns if isinstance(self[column].dtype, NestedDtype)]

    def _is_known_hierarchical_column(self, colname) -> bool:
        """Determine whether a label is a known hierarchical column name.

        A hierarchical name has the form ``"<nested column>.<field>"``. The
        label is split on the first ``"."`` only, so labels with several dots
        are answered with ``False`` (unless the remainder is literally a field
        name) instead of raising on tuple unpacking.

        Parameters
        ----------
        colname
            The label to check. Non-string labels are never hierarchical.

        Returns
        -------
        bool
            True when the part before the first dot is a nested column and the
            part after it is one of that column's fields.
        """
        # Non-string labels (e.g. integer column names) cannot contain a dot.
        if not isinstance(colname, str) or "." not in colname:
            return False
        base, _, field = colname.partition(".")
        if base not in self.nested_columns:
            return False
        # Look up just this nested column rather than rebuilding all_columns.
        return field in self[base].nest.fields

    def add_nested(self, nested, name) -> "NestedFrame":
        """Pack a flat dataframe into a nested column and return the new frame.

        Parameters
        ----------
        nested
            The flat dataframe to pack; passed to ``packer.pack_flat``.
        name
            The name given to the packed series, used as the new column label.

        Returns
        -------
        NestedFrame
            The result of ``self.assign`` with the packed column added; this
            method does not itself modify ``self``.
        """
        packed = packer.pack_flat(nested, name=name)
        # assign() takes keyword arguments, so the label must be a string.
        return self.assign(**{str(packed.name): packed})
76 changes: 76 additions & 0 deletions tests/nested_pandas/nestedframe/test_nestedframe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import pandas as pd
from nested_pandas import NestedFrame


def test_nestedframe_construction():
    """A NestedFrame built from column data is an instance of NestedFrame."""
    frame = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
    assert isinstance(frame, NestedFrame)


def test_all_columns():
    """all_columns lists the base columns and, once added, the nested fields."""
    frame = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])

    # Before nesting, only the "base" entry exists.
    columns_before = frame.all_columns
    assert list(columns_before.keys()) == ["base"]
    assert list(columns_before["base"]) == list(frame.columns)

    flat = pd.DataFrame(
        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )
    frame = frame.add_nested(flat, "nested")

    # After nesting, the nested column appears with the flat frame's columns.
    columns_after = frame.all_columns
    assert list(columns_after.keys()) == ["base", "nested"]
    assert list(columns_after["nested"]) == list(flat.columns)


def test_nested_columns():
    """nested_columns reports the base column that holds the packed frame."""
    flat = pd.DataFrame(
        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )
    frame = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])

    with_nested = frame.add_nested(flat, "nested")

    assert with_nested.nested_columns == ["nested"]


def test_is_known_hierarchical_column():
    """Only "<nested column>.<field>" labels are recognised as hierarchical."""
    flat = pd.DataFrame(
        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )
    frame = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
    frame = frame.add_nested(flat, "nested")

    # A real field of the nested column is recognised.
    assert frame._is_known_hierarchical_column("nested.c")
    # "b" is a base column, not a field of "nested".
    assert not frame._is_known_hierarchical_column("nested.b")
    # "base" is not the name of a nested column.
    assert not frame._is_known_hierarchical_column("base.a")


def test_add_nested():
    """add_nested adds a nested column that unpacks back to the flat frame."""
    flat = pd.DataFrame(
        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )
    frame = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])

    frame = frame.add_nested(flat, "nested")

    assert "nested" in frame.columns
    # Round trip: flattening the packed column reproduces the input frame.
    round_tripped = frame.nested.nest.to_flat()
    assert round_tripped.equals(flat)

0 comments on commit e04f558

Please sign in to comment.