Skip to content

Commit

Permalink
Merge pull request #78 from lincc-frameworks/packer-multiindex
Browse files Browse the repository at this point in the history
Multiindex support for pack_df
  • Loading branch information
hombit authored May 10, 2024
2 parents 92b422d + 9b7554b commit 725da0c
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 90 deletions.
57 changes: 18 additions & 39 deletions src/nested_pandas/series/packer.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,36 +57,6 @@ def pack(
return pack_seq(obj, name=name, index=index, dtype=dtype)


def pack_flat_into_df(df: pd.DataFrame, name=None) -> pd.DataFrame:
    """Turn a "flat" dataframe into a "nested" dataframe of list columns.

    The input dataframe is expected to have repeated index values. In the
    result every original column is replaced by a column of lists and,
    when ``name`` is given, an extra "structure" column holding a
    structure of lists built from the original columns is appended.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe, with repeated indexes.
    name : str, optional
        Name of the structure column. With the default, None, no
        structure column is added.

    Returns
    -------
    pd.DataFrame
        Output dataframe.
    """
    # TODO: we can optimize name=None case a bit
    nested_series = pack_flat(df, name=name)
    lists_df = nested_series.nest.to_lists()
    # Without a name there is nothing more to attach: return the list columns only.
    if name is None:
        return lists_df
    lists_df[name] = nested_series
    return lists_df


def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
"""Make a structure of lists representation of a "flat" dataframe.
Expand Down Expand Up @@ -116,7 +86,7 @@ def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
nested_pandas.series.packer.pack_lists : Pack a dataframe of nested arrays.
"""

# TODO: think about the case when the data is pre-sorted and we don't need a data copy.
# pandas knows when index is pre-sorted, so it would do nothing if it is already sorted
flat = df.sort_index(kind="stable")
return pack_sorted_df_into_struct(flat, name=name)

Expand Down Expand Up @@ -177,6 +147,9 @@ def pack_sorted_df_into_struct(df: pd.DataFrame, name: str | None = None) -> pd.
pd.Series
Output series, with unique indexes.
"""
if not df.index.is_monotonic_increasing:
raise ValueError("The index of the input dataframe must be sorted")

packed_df = view_sorted_df_as_list_arrays(df)
# No need to validate the dataframe, the length of the nested arrays is forced to be the same by
# the view_sorted_df_as_list_arrays function.
Expand Down Expand Up @@ -243,8 +216,11 @@ def view_sorted_df_as_list_arrays(df: pd.DataFrame) -> pd.DataFrame:
Output dataframe, with unique indexes. It is a view over the input
dataframe, so it would mutate the input dataframe if modified.
"""
if not df.index.is_monotonic_increasing:
raise ValueError("The index of the input dataframe must be sorted")

offset_array = calculate_sorted_index_offsets(df.index)
unique_index = df.index.values[offset_array[:-1]]
unique_index = df.index[offset_array[:-1]]

series_ = {
column: view_sorted_series_as_list_array(df[column], offset_array, unique_index)
Expand Down Expand Up @@ -278,10 +254,13 @@ def view_sorted_series_as_list_array(
Output series, with unique indexes. It is a view over the input series,
so it would mutate the input series if modified.
"""
if not series.index.is_monotonic_increasing:
raise ValueError("The index of the input series must be sorted")

if offset is None:
offset = calculate_sorted_index_offsets(series.index)
if unique_index is None:
unique_index = series.index.values[offset[:-1]]
unique_index = series.index[offset[:-1]]

list_array = pa.ListArray.from_arrays(
offset,
Expand Down Expand Up @@ -310,12 +289,12 @@ def calculate_sorted_index_offsets(index: pd.Index) -> np.ndarray:
Output array of offsets, one element more than the number of unique
index values.
"""
# TODO: implement multi-index support
index_diff = np.diff(index.values, prepend=index.values[0] - 1, append=index.values[-1] + 1)

if np.any(index_diff < 0):
raise ValueError("Table index must be strictly sorted.")
if not index.is_monotonic_increasing:
raise ValueError("The index must be sorted")

offset = np.nonzero(index_diff)[0]
# pd.Index.duplicated returns False for the first occurrence and True for all others.
# So our offsets would be indexes of these False values with the array length in the end.
offset_but_last = np.nonzero(~index.duplicated(keep="first"))[0]
offset = np.append(offset_but_last, len(index))

return offset
126 changes: 75 additions & 51 deletions tests/nested_pandas/series/test_packer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
import pandas as pd
import pyarrow as pa
import pytest
from nested_pandas import NestedDtype
from nested_pandas.series import packer
from numpy.testing import assert_array_equal
Expand All @@ -14,7 +15,7 @@ def test_pack_with_flat_df():
"a": [1, 2, 3, 4],
"b": [0, 1, 0, 1],
},
index=[1, 2, 1, 2],
index=pd.MultiIndex.from_arrays(([1, 1, 1, 1], [1, 2, 1, 2])),
)
series = packer.pack(df, name="series")

Expand All @@ -23,7 +24,7 @@ def test_pack_with_flat_df():
(np.array([1, 3]), np.array([0, 0])),
(np.array([2, 4]), np.array([1, 1])),
],
index=[1, 2],
index=pd.MultiIndex.from_arrays(([1, 1], [1, 2])),
dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
name="series",
)
Expand Down Expand Up @@ -87,55 +88,6 @@ def test_pack_with_series_of_dfs():
assert_series_equal(series, desired)


def test_pack_flat_into_df():
    """Check pack_flat_into_df() output: list columns plus the named struct column."""
    flat = pd.DataFrame(
        data={
            "a": [7, 8, 9, 1, 2, 3, 4, 5, 6],
            "b": [0, 1, 0, 0, 1, 0, 1, 0, 1],
        },
        index=[4, 4, 4, 1, 1, 2, 2, 3, 3],
    )
    actual = packer.pack_flat_into_df(flat, name="struct")

    # Expected per-index lists, in the order of the sorted unique index.
    unique_index = [1, 2, 3, 4]
    list_dtype = pd.ArrowDtype(pa.list_(pa.int64()))
    a_lists = [
        np.array([1, 2]),
        np.array([3, 4]),
        np.array([5, 6]),
        np.array([7, 8, 9]),
    ]
    b_lists = [
        np.array([0, 1]),
        np.array([0, 1]),
        np.array([0, 1]),
        np.array([0, 1, 0]),
    ]

    desired = pd.DataFrame(
        data={
            "a": pd.Series(data=a_lists, dtype=list_dtype, index=unique_index),
            "b": pd.Series(data=b_lists, dtype=list_dtype, index=unique_index),
            # Each struct row pairs the "a" list with the "b" list of the same index.
            "struct": pd.Series(
                data=list(zip(a_lists, b_lists)),
                dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
                index=unique_index,
            ),
        },
    )

    assert_frame_equal(actual, desired)


def test_pack_flat():
"""Test pack_flat()."""
df = pd.DataFrame(
Expand Down Expand Up @@ -186,6 +138,19 @@ def test_pack_sorted_df_into_struct():
assert_series_equal(actual, desired)


def test_pack_sorted_df_into_struct_raises_when_not_sorted():
    """pack_sorted_df_into_struct() must raise ValueError for an unsorted index."""
    columns = {
        "a": [1, 2, 3, 4, 5, 6, 7, 8, 9],
        "b": [0, 1, 0, 1, 0, 1, 0, 1, 0],
    }
    # The index goes 1, 2, 1, 2, ... so it is not monotonically increasing.
    unsorted_df = pd.DataFrame(data=columns, index=[1, 2, 1, 2, 3, 3, 4, 4, 4])
    with pytest.raises(ValueError):
        packer.pack_sorted_df_into_struct(unsorted_df)


def test_pack_lists():
"""Test pack_lists()."""
packed_df = pd.DataFrame(
Expand Down Expand Up @@ -362,6 +327,19 @@ def test_view_sorted_df_as_list_arrays():
assert_frame_equal(nested_df, desired_nested)


def test_view_sorted_df_as_list_arrays_raises_when_not_sorted():
    """view_sorted_df_as_list_arrays() must raise ValueError for an unsorted index."""
    columns = {
        "a": [1, 2, 3, 4, 5, 6, 7, 8, 9],
        "b": [0, 1, 0, 1, 0, 1, 0, 1, 0],
    }
    # The index goes 1, 2, 1, 2, ... so it is not monotonically increasing.
    unsorted_flat_df = pd.DataFrame(data=columns, index=[1, 2, 1, 2, 3, 3, 4, 4, 4])
    with pytest.raises(ValueError):
        packer.view_sorted_df_as_list_arrays(unsorted_flat_df)


def test_view_sorted_series_as_list_array():
"""Test view_sorted_series_as_list_array()."""
series = pd.Series(
Expand All @@ -386,3 +364,49 @@ def test_view_sorted_series_as_list_array():
name="my_series",
)
assert_series_equal(nested, desired_nested)


def test_view_sorted_series_as_list_array_raises_when_not_sorted():
    """view_sorted_series_as_list_array() must raise ValueError for an unsorted index."""
    values = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    # The index goes 1, 2, 1, 2, ... so it is not monotonically increasing.
    unsorted_series = pd.Series(data=values, index=[1, 2, 1, 2, 3, 3, 4, 4, 4])
    with pytest.raises(ValueError):
        packer.view_sorted_series_as_list_array(unsorted_series)


@pytest.mark.parametrize(
    "index,offsets",
    [
        # Every index value is unique: one offset per element plus the total length.
        (pd.Index([1, 2, 3, 4]), np.array([0, 1, 2, 3, 4])),
        # Repeated values with runs of different lengths.
        (pd.Index([1, 1, 2, 2, 3, 3, 4, 4, 4]), np.array([0, 2, 4, 6, 9])),
        # A single value repeated for the whole index.
        (pd.Index([1, 1, 1, 1, 1, 1, 1, 1, 1]), np.array([0, 9])),
        # Mix of single-element and multi-element runs.
        (pd.Index([1, 2, 2, 2, 3, 3, 4]), np.array([0, 1, 4, 6, 7])),
        # Two-level MultiIndex built as a product with repeated second-level values.
        (
            pd.MultiIndex.from_product([[1, 2, 3], ["a", "a", "b", "b", "b"]]),
            np.array([0, 2, 5, 7, 10, 12, 15]),
        ),
        # Named three-level MultiIndex built from explicit arrays.
        (
            pd.MultiIndex.from_arrays(
                (
                    [1, 1, 1, 1, 1, 1, 2, 2],
                    ["a", "a", "a", "a", "b", "b", "z", "z"],
                    [1, 2, 2, 2, 9, 9, 9, 9],
                ),
                names=["id1", "id2", "id3"],
            ),
            np.array([0, 1, 4, 6, 8]),
        ),
    ],
)
def test_calculate_sorted_index_offsets(index, offsets):
    """Compare calculate_sorted_index_offsets() output with the expected offsets."""
    computed = packer.calculate_sorted_index_offsets(index)
    assert_array_equal(computed, offsets)


def test_calculate_sorted_index_offsets_raises_when_not_sorted():
    """calculate_sorted_index_offsets() must raise ValueError for an unsorted index."""
    # The index goes 1, 2, 1, 2, ... so it is not monotonically increasing.
    unsorted_index = pd.Index([1, 2, 1, 2, 3, 3, 4, 4, 4])
    with pytest.raises(ValueError):
        packer.calculate_sorted_index_offsets(unsorted_index)

0 comments on commit 725da0c

Please sign in to comment.