Skip to content

Commit

Permalink
Merge pull request #34 from lincc-frameworks/to_lists_speedup
Browse files Browse the repository at this point in the history
speedup with to_lists()
  • Loading branch information
dougbrn authored Apr 23, 2024
2 parents 025ad96 + fe20c57 commit 659b639
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 11 deletions.
26 changes: 18 additions & 8 deletions src/nested_pandas/nestedframe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,22 +379,32 @@ def my_sum(col1, col2):
if len(requested_columns) < len(args):
extra_args = args[len(requested_columns) :]

# find targeted layers
layers = np.unique([col[0] for col in requested_columns])

# build a flat dataframe with array columns to apply to the function
apply_df = NestedFrame()
for layer in layers:
if layer == "base":
columns = [col[1] for col in requested_columns if col[0] == layer]
apply_df = apply_df.join(self[columns], how="outer")
else:
# TODO: It should be faster to pass these columns to to_lists, but it's 20x slower
# columns = [col[1] for col in requested_columns if col[0] == layer]
apply_df = apply_df.join(self[layer].nest.to_lists(), how="outer")

# Translates the requested columns into the scalars or arrays we pass to func.
def translate_cols(frame, layer, col):
if layer == "base":
# We pass the "base" column as a scalar
return frame[col]
return frame[layer][col].to_numpy()
return np.array(frame[col])

# Note that this applies the function to each row of the nested dataframe. For
# the columns within packed frames, note that we're directly accessing the dataframe
# within the cell of that row without having to unpack and flatten.
result = self.apply(
# send arrays along to the apply call
result = apply_df.apply(
lambda x: func(
*[translate_cols(x, layer, col) for layer, col in requested_columns], *extra_args, **kwargs
),
axis=1, # to apply func on each row of our nested frame
result_type="expand", # to return a DataFrame when possible
axis=1, # to apply func on each row of our nested frame)
)

return result
7 changes: 5 additions & 2 deletions src/nested_pandas/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,12 @@ def count_nested(df, nested, by=None, join=True) -> NestedFrame:
"""

if by is None:
# to_flat() is faster than direct apply in this case
counts = df[nested].nest.to_flat().groupby(level=0).apply(lambda x: len(x)).rename(f"n_{nested}")
field_to_len = df[nested].nest.fields[0]
counts = (
df[nested].nest.to_lists().apply(lambda x: len(x[field_to_len]), axis=1).rename(f"n_{nested}")
)
else:
# this may be able to be sped up using to_lists() as well
counts = df[nested].apply(lambda x: x[by].value_counts())
counts = counts.rename(columns={colname: f"n_{nested}_{colname}" for colname in counts.columns})
if join:
Expand Down
12 changes: 11 additions & 1 deletion tests/nested_pandas/nestedframe/test_nestedframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,17 @@ def test_reduce():

to_pack2 = pd.DataFrame(
data={
"time": [1, 2, 3, 1, 2, 3, 1, 2, 4],
"time2": [
1,
2,
3,
1,
2,
3,
1,
2,
4,
], # TODO: fix duplicate name in join once to_list subset bug fixed
"e": [2, 9, 4, 1, 23, 3, 1, 4, 1],
"f": [5, 4, 7, 5, 3, 25, 9, 3, 4],
},
Expand Down

0 comments on commit 659b639

Please sign in to comment.