Merge pull request #78 from lincc-frameworks/packer-multiindex

Multiindex support for pack_df
lincc-frameworks · May 10, 2024 · 725da0c · 725da0c
2 parents 92b422d + 9b7554b
commit 725da0c
Show file tree

Hide file tree

Showing 2 changed files with 93 additions and 90 deletions.
diff --git a/src/nested_pandas/series/packer.py b/src/nested_pandas/series/packer.py
@@ -57,36 +57,6 @@ def pack(
     return pack_seq(obj, name=name, index=index, dtype=dtype)
 
 
-def pack_flat_into_df(df: pd.DataFrame, name=None) -> pd.DataFrame:
-    """Pack a "flat" dataframe into a "nested" dataframe.
-
-    For the input dataframe with repeated indexes, make a pandas.DataFrame,
-    where each original column is replaced by a column of lists, and,
-    optionally, a "structure" column is added, containing a structure of
-    lists with the original columns.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        Input dataframe, with repeated indexes.
-
-    name : str, optional
-        Name of the structure column. The default is None, which means no
-        structure column is added.
-
-    Returns
-    -------
-    pd.DataFrame
-        Output dataframe.
-    """
-    # TODO: we can optimize name=None case a bit
-    struct_series = pack_flat(df, name=name)
-    packed_df = struct_series.nest.to_lists()
-    if name is not None:
-        packed_df[name] = struct_series
-    return packed_df
-
-
 def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
     """Make a structure of lists representation of a "flat" dataframe.
 
@@ -116,7 +86,7 @@ def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
     nested_pandas.series.packer.pack_lists : Pack a dataframe of nested arrays.
     """
 
-    # TODO: think about the case when the data is pre-sorted and we don't need a data copy.
+    # pandas knows when the index is pre-sorted, so sort_index skips the sort if it is already sorted
     flat = df.sort_index(kind="stable")
     return pack_sorted_df_into_struct(flat, name=name)
 
@@ -177,6 +147,9 @@ def pack_sorted_df_into_struct(df: pd.DataFrame, name: str | None = None) -> pd.
     pd.Series
         Output series, with unique indexes.
     """
+    if not df.index.is_monotonic_increasing:
+        raise ValueError("The index of the input dataframe must be sorted")
+
     packed_df = view_sorted_df_as_list_arrays(df)
     # No need to validate the dataframe, the length of the nested arrays is forced to be the same by
     # the view_sorted_df_as_list_arrays function.
@@ -243,8 +216,11 @@ def view_sorted_df_as_list_arrays(df: pd.DataFrame) -> pd.DataFrame:
         Output dataframe, with unique indexes. It is a view over the input
         dataframe, so it would mute the input dataframe if modified.
     """
+    if not df.index.is_monotonic_increasing:
+        raise ValueError("The index of the input dataframe must be sorted")
+
     offset_array = calculate_sorted_index_offsets(df.index)
-    unique_index = df.index.values[offset_array[:-1]]
+    unique_index = df.index[offset_array[:-1]]
 
     series_ = {
         column: view_sorted_series_as_list_array(df[column], offset_array, unique_index)
@@ -278,10 +254,13 @@ def view_sorted_series_as_list_array(
         Output series, with unique indexes. It is a view over the input series,
         so it would mute the input series if modified.
     """
+    if not series.index.is_monotonic_increasing:
+        raise ValueError("The index of the input series must be sorted")
+
     if offset is None:
         offset = calculate_sorted_index_offsets(series.index)
     if unique_index is None:
-        unique_index = series.index.values[offset[:-1]]
+        unique_index = series.index[offset[:-1]]
 
     list_array = pa.ListArray.from_arrays(
         offset,
@@ -310,12 +289,12 @@ def calculate_sorted_index_offsets(index: pd.Index) -> np.ndarray:
         Output array of offsets, one element more than the number of unique
         index values.
     """
-    # TODO: implement multi-index support
-    index_diff = np.diff(index.values, prepend=index.values[0] - 1, append=index.values[-1] + 1)
-
-    if np.any(index_diff < 0):
-        raise ValueError("Table index must be strictly sorted.")
+    if not index.is_monotonic_increasing:
+        raise ValueError("The index must be sorted")
 
-    offset = np.nonzero(index_diff)[0]
+    # pd.Index.duplicated returns False for the first occurrence and True for all others.
+    # So our offsets are the positions of these False values, with the array length appended at the end.
+    offset_but_last = np.nonzero(~index.duplicated(keep="first"))[0]
+    offset = np.append(offset_but_last, len(index))
 
     return offset
diff --git a/tests/nested_pandas/series/test_packer.py b/tests/nested_pandas/series/test_packer.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pandas as pd
 import pyarrow as pa
+import pytest
 from nested_pandas import NestedDtype
 from nested_pandas.series import packer
 from numpy.testing import assert_array_equal
@@ -14,7 +15,7 @@ def test_pack_with_flat_df():
             "a": [1, 2, 3, 4],
             "b": [0, 1, 0, 1],
         },
-        index=[1, 2, 1, 2],
+        index=pd.MultiIndex.from_arrays(([1, 1, 1, 1], [1, 2, 1, 2])),
     )
     series = packer.pack(df, name="series")
 
@@ -23,7 +24,7 @@ def test_pack_with_flat_df():
             (np.array([1, 3]), np.array([0, 0])),
             (np.array([2, 4]), np.array([1, 1])),
         ],
-        index=[1, 2],
+        index=pd.MultiIndex.from_arrays(([1, 1], [1, 2])),
         dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
         name="series",
     )
@@ -87,55 +88,6 @@ def test_pack_with_series_of_dfs():
     assert_series_equal(series, desired)
 
 
-def test_pack_flat_into_df():
-    """Test pack_flat_into_df()."""
-    df = pd.DataFrame(
-        data={
-            "a": [7, 8, 9, 1, 2, 3, 4, 5, 6],
-            "b": [0, 1, 0, 0, 1, 0, 1, 0, 1],
-        },
-        index=[4, 4, 4, 1, 1, 2, 2, 3, 3],
-    )
-    actual = packer.pack_flat_into_df(df, name="struct")
-
-    desired = pd.DataFrame(
-        data={
-            "a": pd.Series(
-                data=[
-                    np.array([1, 2]),
-                    np.array([3, 4]),
-                    np.array([5, 6]),
-                    np.array([7, 8, 9]),
-                ],
-                dtype=pd.ArrowDtype(pa.list_(pa.int64())),
-                index=[1, 2, 3, 4],
-            ),
-            "b": pd.Series(
-                data=[
-                    np.array([0, 1]),
-                    np.array([0, 1]),
-                    np.array([0, 1]),
-                    np.array([0, 1, 0]),
-                ],
-                dtype=pd.ArrowDtype(pa.list_(pa.int64())),
-                index=[1, 2, 3, 4],
-            ),
-            "struct": pd.Series(
-                data=[
-                    (np.array([1, 2]), np.array([0, 1])),
-                    (np.array([3, 4]), np.array([0, 1])),
-                    (np.array([5, 6]), np.array([0, 1])),
-                    (np.array([7, 8, 9]), np.array([0, 1, 0])),
-                ],
-                dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
-                index=[1, 2, 3, 4],
-            ),
-        },
-    )
-
-    assert_frame_equal(actual, desired)
-
-
 def test_pack_flat():
     """Test pack_flat()."""
     df = pd.DataFrame(
@@ -186,6 +138,19 @@ def test_pack_sorted_df_into_struct():
     assert_series_equal(actual, desired)
 
 
+def test_pack_sorted_df_into_struct_raises_when_not_sorted():
+    """Test pack_sorted_df_into_struct() raises when not sorted."""
+    df = pd.DataFrame(
+        data={
+            "a": [1, 2, 3, 4, 5, 6, 7, 8, 9],
+            "b": [0, 1, 0, 1, 0, 1, 0, 1, 0],
+        },
+        index=[1, 2, 1, 2, 3, 3, 4, 4, 4],
+    )
+    with pytest.raises(ValueError):
+        packer.pack_sorted_df_into_struct(df)
+
+
 def test_pack_lists():
     """Test pack_lists()."""
     packed_df = pd.DataFrame(
@@ -362,6 +327,19 @@ def test_view_sorted_df_as_list_arrays():
     assert_frame_equal(nested_df, desired_nested)
 
 
+def test_view_sorted_df_as_list_arrays_raises_when_not_sorted():
+    """Test view_sorted_df_as_list_arrays() raises when not sorted."""
+    flat_df = pd.DataFrame(
+        data={
+            "a": [1, 2, 3, 4, 5, 6, 7, 8, 9],
+            "b": [0, 1, 0, 1, 0, 1, 0, 1, 0],
+        },
+        index=[1, 2, 1, 2, 3, 3, 4, 4, 4],
+    )
+    with pytest.raises(ValueError):
+        packer.view_sorted_df_as_list_arrays(flat_df)
+
+
 def test_view_sorted_series_as_list_array():
     """Test view_sorted_series_as_list_array()."""
     series = pd.Series(
@@ -386,3 +364,49 @@ def test_view_sorted_series_as_list_array():
         name="my_series",
     )
     assert_series_equal(nested, desired_nested)
+
+
+def test_view_sorted_series_as_list_array_raises_when_not_sorted():
+    """Test view_sorted_series_as_list_array() raises when not sorted."""
+    series = pd.Series(
+        data=[1, 2, 3, 4, 5, 6, 7, 8, 9],
+        index=[1, 2, 1, 2, 3, 3, 4, 4, 4],
+    )
+    with pytest.raises(ValueError):
+        packer.view_sorted_series_as_list_array(series)
+
+
+@pytest.mark.parametrize(
+    "index,offsets",
+    [
+        (pd.Index([1, 2, 3, 4]), np.array([0, 1, 2, 3, 4])),
+        (pd.Index([1, 1, 2, 2, 3, 3, 4, 4, 4]), np.array([0, 2, 4, 6, 9])),
+        (pd.Index([1, 1, 1, 1, 1, 1, 1, 1, 1]), np.array([0, 9])),
+        (pd.Index([1, 2, 2, 2, 3, 3, 4]), np.array([0, 1, 4, 6, 7])),
+        (
+            pd.MultiIndex.from_product([[1, 2, 3], ["a", "a", "b", "b", "b"]]),
+            np.array([0, 2, 5, 7, 10, 12, 15]),
+        ),
+        (
+            pd.MultiIndex.from_arrays(
+                (
+                    [1, 1, 1, 1, 1, 1, 2, 2],
+                    ["a", "a", "a", "a", "b", "b", "z", "z"],
+                    [1, 2, 2, 2, 9, 9, 9, 9],
+                ),
+                names=["id1", "id2", "id3"],
+            ),
+            np.array([0, 1, 4, 6, 8]),
+        ),
+    ],
+)
+def test_calculate_sorted_index_offsets(index, offsets):
+    """Test calculate_sorted_index_offsets()."""
+    assert_array_equal(packer.calculate_sorted_index_offsets(index), offsets)
+
+
+def test_calculate_sorted_index_offsets_raises_when_not_sorted():
+    """Test calculate_sorted_index_offsets() raises when not sorted."""
+    index = pd.Index([1, 2, 1, 2, 3, 3, 4, 4, 4])
+    with pytest.raises(ValueError):
+        packer.calculate_sorted_index_offsets(index)