From b5cb137e2562148ab68d2b12734f708f3d6d9f4c Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Wed, 18 Sep 2024 07:39:45 +0100 Subject: [PATCH] docs(python): Improve `over` docs, add example with `order_by` (#18796) --- py-polars/polars/expr/expr.py | 106 +++++++++++++++++++--------------- 1 file changed, 60 insertions(+), 46 deletions(-) diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index bb2430671234..1d3b2a07fdcb 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -3425,9 +3425,7 @@ def over( ... "c": [5, 4, 3, 2, 1], ... } ... ) - >>> df.with_columns( - ... pl.col("c").max().over("a").name.suffix("_max"), - ... ) + >>> df.with_columns(c_max=pl.col("c").max().over("a")) shape: (5, 4) ┌─────┬─────┬─────┬───────┐ │ a ┆ b ┆ c ┆ c_max │ @@ -3441,11 +3439,9 @@ def over( │ b ┆ 3 ┆ 1 ┆ 3 │ └─────┴─────┴─────┴───────┘ - Expression input is supported. + Expression input is also supported. - >>> df.with_columns( - ... pl.col("c").max().over(pl.col("b") // 2).name.suffix("_max"), - ... ) + >>> df.with_columns(c_max=pl.col("c").max().over(pl.col("b") // 2)) shape: (5, 4) ┌─────┬─────┬─────┬───────┐ │ a ┆ b ┆ c ┆ c_max │ @@ -3459,29 +3455,9 @@ def over( │ b ┆ 3 ┆ 1 ┆ 4 │ └─────┴─────┴─────┴───────┘ - Group by multiple columns by passing a list of column names or expressions. - - >>> df.with_columns( - ... pl.col("c").min().over(["a", "b"]).name.suffix("_min"), - ... ) - shape: (5, 4) - ┌─────┬─────┬─────┬───────┐ - │ a ┆ b ┆ c ┆ c_min │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╪═══════╡ - │ a ┆ 1 ┆ 5 ┆ 5 │ - │ a ┆ 2 ┆ 4 ┆ 4 │ - │ b ┆ 3 ┆ 3 ┆ 1 │ - │ b ┆ 5 ┆ 2 ┆ 2 │ - │ b ┆ 3 ┆ 1 ┆ 1 │ - └─────┴─────┴─────┴───────┘ - - Or use positional arguments to group by multiple columns in the same way. + Group by multiple columns by passing multiple column names or expressions. - >>> df.with_columns( - ... pl.col("c").min().over("a", pl.col("b") % 2).name.suffix("_min"), - ... ) + >>> df.with_columns(c_min=pl.col("c").min().over("a", pl.col("b") % 2)) shape: (5, 4) ┌─────┬─────┬─────┬───────┐ │ a ┆ b ┆ c ┆ c_min │ @@ -3495,25 +3471,63 @@ def over( │ b ┆ 3 ┆ 1 ┆ 1 │ └─────┴─────┴─────┴───────┘ - Aggregate values from each group using `mapping_strategy="explode"`. + You can use non-elementwise expressions with `over` too. By default they are + evaluated using row-order, but you can specify a different one using `order_by`. - >>> df.select( - ... pl.col("a").head(2).over("a", mapping_strategy="explode"), - ... pl.col("b").sort_by("b").head(2).over("a", mapping_strategy="explode"), - ... pl.col("c").sort_by("b").head(2).over("a", mapping_strategy="explode"), + >>> from datetime import date + >>> df = pl.DataFrame( + ... { + ... "store_id": ["a", "a", "b", "b"], + ... "date": [ + ... date(2024, 9, 18), + ... date(2024, 9, 17), + ... date(2024, 9, 18), + ... date(2024, 9, 16), + ... ], + ... "sales": [7, 9, 8, 10], + ... } ... ) - shape: (4, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ a ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 3 │ - │ b ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘ - + >>> df.with_columns( + ... cumulative_sales=pl.col("sales") + ... .cum_sum() + ... .over("store_id", order_by="date") + ... ) + shape: (4, 4) + ┌──────────┬────────────┬───────┬──────────────────┐ + │ store_id ┆ date ┆ sales ┆ cumulative_sales │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ date ┆ i64 ┆ i64 │ + ╞══════════╪════════════╪═══════╪══════════════════╡ + │ a ┆ 2024-09-18 ┆ 7 ┆ 16 │ + │ a ┆ 2024-09-17 ┆ 9 ┆ 9 │ + │ b ┆ 2024-09-18 ┆ 8 ┆ 18 │ + │ b ┆ 2024-09-16 ┆ 10 ┆ 10 │ + └──────────┴────────────┴───────┴──────────────────┘ + + If you don't require that the group order be preserved, then the more performant + option is to use `mapping_strategy='explode'` - be careful however to only ever + use this in a `select` statement, not a `with_columns` one. + + >>> window = { + ... "partition_by": "store_id", + ... "order_by": "date", + ... "mapping_strategy": "explode", + ... } + >>> df.select( + ... pl.all().over(**window), + ... cumulative_sales=pl.col("sales").cum_sum().over(**window), + ... ) + shape: (4, 4) + ┌──────────┬────────────┬───────┬──────────────────┐ + │ store_id ┆ date ┆ sales ┆ cumulative_sales │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ date ┆ i64 ┆ i64 │ + ╞══════════╪════════════╪═══════╪══════════════════╡ + │ a ┆ 2024-09-17 ┆ 9 ┆ 9 │ + │ a ┆ 2024-09-18 ┆ 7 ┆ 16 │ + │ b ┆ 2024-09-16 ┆ 10 ┆ 10 │ + │ b ┆ 2024-09-18 ┆ 8 ┆ 18 │ + └──────────┴────────────┴───────┴──────────────────┘ """ partition_by = parse_into_list_of_expressions(partition_by, *more_exprs) if order_by is not None: