From ced6af244294de12008238b6e529bbddec75c56c Mon Sep 17 00:00:00 2001
From: Doug Branton <brantd@uw.edu>
Date: Mon, 22 Apr 2024 14:46:31 -0700
Subject: [PATCH 1/2] speedup with to_lists()

---
 src/nested_pandas/nestedframe/core.py         | 26 +++++++++++++------
 src/nested_pandas/utils/utils.py              |  7 +++--
 .../nestedframe/test_nestedframe.py           | 12 ++++++++-
 3 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py
index c9d6297..b26b2de 100644
--- a/src/nested_pandas/nestedframe/core.py
+++ b/src/nested_pandas/nestedframe/core.py
@@ -379,22 +379,32 @@ def my_sum(col1, col2):
         if len(requested_columns) < len(args):
             extra_args = args[len(requested_columns) :]
 
+        # find targeted layers
+        layers = np.unique([col[0] for col in requested_columns])
+
+        # build a flat dataframe with array columns to apply to the function
+        apply_df = NestedFrame()
+        for layer in layers:
+            if layer == "base":
+                columns = [col[1] for col in requested_columns if col[0] == layer]
+                apply_df = apply_df.join(self[columns], how="right")
+            else:
+                # TODO: It should be faster to pass these columns to to_lists, but its 20x slower
+                # columns = [col[1] for col in requested_columns if col[0] == layer]
+                apply_df = apply_df.join(self[layer].nest.to_lists(), how="right")
+
         # Translates the requested columns into the scalars or arrays we pass to func.
         def translate_cols(frame, layer, col):
             if layer == "base":
                 # We pass the "base" column as a scalar
                 return frame[col]
-            return frame[layer][col].to_numpy()
+            return np.array(frame[col])
 
-        # Note that this applys the function to each row of the nested dataframe. For
-        # the columns within packed frames, note taht we're directly accessing the dataframe
-        # within the cell of that row without having to unpack and flatten.
-        result = self.apply(
+        # send arrays along to the apply call
+        result = apply_df.apply(
             lambda x: func(
                 *[translate_cols(x, layer, col) for layer, col in requested_columns], *extra_args, **kwargs
             ),
-            axis=1,  # to apply func on each row of our nested frame
-            result_type="expand",  # to return a DataFrame when possible
+            axis=1,  # to apply func on each row of our nested frame
         )
-
         return result
diff --git a/src/nested_pandas/utils/utils.py b/src/nested_pandas/utils/utils.py
index 662e1d2..a1b0a9f 100644
--- a/src/nested_pandas/utils/utils.py
+++ b/src/nested_pandas/utils/utils.py
@@ -28,9 +28,12 @@ def count_nested(df, nested, by=None, join=True) -> NestedFrame:
     """
 
     if by is None:
-        # to_flat() is faster than direct apply in this case
-        counts = df[nested].nest.to_flat().groupby(level=0).apply(lambda x: len(x)).rename(f"n_{nested}")
+        field_to_len = df[nested].nest.fields[0]
+        counts = (
+            df[nested].nest.to_lists().apply(lambda x: len(x[field_to_len]), axis=1).rename(f"n_{nested}")
+        )
     else:
+        # this may be able to be sped up using to_lists() as well
         counts = df[nested].apply(lambda x: x[by].value_counts())
         counts = counts.rename(columns={colname: f"n_{nested}_{colname}" for colname in counts.columns})
     if join:
diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py
index 782dbf7..077140f 100644
--- a/tests/nested_pandas/nestedframe/test_nestedframe.py
+++ b/tests/nested_pandas/nestedframe/test_nestedframe.py
@@ -242,7 +242,17 @@ def test_reduce():
 
     to_pack2 = pd.DataFrame(
         data={
-            "time": [1, 2, 3, 1, 2, 3, 1, 2, 4],
+            "time2": [
+                1,
+                2,
+                3,
+                1,
+                2,
+                3,
+                1,
+                2,
+                4,
+            ],  # TODO: fix duplicate name in join once the to_lists subset bug is fixed
             "e": [2, 9, 4, 1, 23, 3, 1, 4, 1],
             "f": [5, 4, 7, 5, 3, 25, 9, 3, 4],
         },

From fe20c57a2176bb63bf08df3ebe3479b6300be11b Mon Sep 17 00:00:00 2001
From: Doug Branton <brantd@uw.edu>
Date: Tue, 23 Apr 2024 11:52:23 -0700
Subject: [PATCH 2/2] right->outer

---
 src/nested_pandas/nestedframe/core.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py
index b26b2de..af40efa 100644
--- a/src/nested_pandas/nestedframe/core.py
+++ b/src/nested_pandas/nestedframe/core.py
@@ -387,11 +387,11 @@ def my_sum(col1, col2):
         for layer in layers:
             if layer == "base":
                 columns = [col[1] for col in requested_columns if col[0] == layer]
-                apply_df = apply_df.join(self[columns], how="right")
+                apply_df = apply_df.join(self[columns], how="outer")
             else:
                 # TODO: It should be faster to pass these columns to to_lists, but its 20x slower
                 # columns = [col[1] for col in requested_columns if col[0] == layer]
-                apply_df = apply_df.join(self[layer].nest.to_lists(), how="right")
+                apply_df = apply_df.join(self[layer].nest.to_lists(), how="outer")
 
         # Translates the requested columns into the scalars or arrays we pass to func.
         def translate_cols(frame, layer, col):