From ced6af244294de12008238b6e529bbddec75c56c Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Mon, 22 Apr 2024 14:46:31 -0700 Subject: [PATCH 1/2] speedup with to_lists() --- src/nested_pandas/nestedframe/core.py | 26 +++++++++++++------ src/nested_pandas/utils/utils.py | 7 +++-- .../nestedframe/test_nestedframe.py | 12 ++++++++- 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index c9d6297..b26b2de 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -379,22 +379,32 @@ def my_sum(col1, col2): if len(requested_columns) < len(args): extra_args = args[len(requested_columns) :] + # find targeted layers + layers = np.unique([col[0] for col in requested_columns]) + + # build a flat dataframe with array columns to apply to the function + apply_df = NestedFrame() + for layer in layers: + if layer == "base": + columns = [col[1] for col in requested_columns if col[0] == layer] + apply_df = apply_df.join(self[columns], how="right") + else: + # TODO: It should be faster to pass these columns to to_lists, but its 20x slower + # columns = [col[1] for col in requested_columns if col[0] == layer] + apply_df = apply_df.join(self[layer].nest.to_lists(), how="right") + # Translates the requested columns into the scalars or arrays we pass to func. def translate_cols(frame, layer, col): if layer == "base": # We pass the "base" column as a scalar return frame[col] - return frame[layer][col].to_numpy() + return np.array(frame[col]) - # Note that this applys the function to each row of the nested dataframe. For - # the columns within packed frames, note taht we're directly accessing the dataframe - # within the cell of that row without having to unpack and flatten. - result = self.apply( + # send arrays along to the apply call + result = apply_df.apply( lambda x: func( *[translate_cols(x, layer, col) for layer, col in requested_columns], *extra_args, **kwargs ), - axis=1, # to apply func on each row of our nested frame - result_type="expand", # to return a DataFrame when possible + axis=1, # to apply func on each row of our nested frame) ) - return result diff --git a/src/nested_pandas/utils/utils.py b/src/nested_pandas/utils/utils.py index 662e1d2..a1b0a9f 100644 --- a/src/nested_pandas/utils/utils.py +++ b/src/nested_pandas/utils/utils.py @@ -28,9 +28,12 @@ def count_nested(df, nested, by=None, join=True) -> NestedFrame: """ if by is None: - # to_flat() is faster than direct apply in this case - counts = df[nested].nest.to_flat().groupby(level=0).apply(lambda x: len(x)).rename(f"n_{nested}") + field_to_len = df[nested].nest.fields[0] + counts = ( + df[nested].nest.to_lists().apply(lambda x: len(x[field_to_len]), axis=1).rename(f"n_{nested}") + ) else: + # this may be able to be sped up using tolists() as well counts = df[nested].apply(lambda x: x[by].value_counts()) counts = counts.rename(columns={colname: f"n_{nested}_{colname}" for colname in counts.columns}) if join: diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py index 782dbf7..077140f 100644 --- a/tests/nested_pandas/nestedframe/test_nestedframe.py +++ b/tests/nested_pandas/nestedframe/test_nestedframe.py @@ -242,7 +242,17 @@ def test_reduce(): to_pack2 = pd.DataFrame( data={ - "time": [1, 2, 3, 1, 2, 3, 1, 2, 4], + "time2": [ + 1, + 2, + 3, + 1, + 2, + 3, + 1, + 2, + 4, + ], # TODO: fix duplicate name in join once to_list subset bug fixed "e": [2, 9, 4, 1, 23, 3, 1, 4, 1], "f": [5, 4, 7, 5, 3, 25, 9, 3, 4], }, From fe20c57a2176bb63bf08df3ebe3479b6300be11b Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Tue, 23 Apr 2024 11:52:23 -0700 Subject: [PATCH 2/2] right->outer --- src/nested_pandas/nestedframe/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index b26b2de..af40efa 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -387,11 +387,11 @@ def my_sum(col1, col2): for layer in layers: if layer == "base": columns = [col[1] for col in requested_columns if col[0] == layer] - apply_df = apply_df.join(self[columns], how="right") + apply_df = apply_df.join(self[columns], how="outer") else: # TODO: It should be faster to pass these columns to to_lists, but its 20x slower # columns = [col[1] for col in requested_columns if col[0] == layer] - apply_df = apply_df.join(self[layer].nest.to_lists(), how="right") + apply_df = apply_df.join(self[layer].nest.to_lists(), how="outer") # Translates the requested columns into the scalars or arrays we pass to func. def translate_cols(frame, layer, col):