From 135fdc7daf385f0c3c390eb05e8b9155eb23148b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 21 Nov 2023 12:20:52 -0800 Subject: [PATCH] Added rle and rle_id methods to Series and Expr --- CHANGELOG.md | 1 + ext/polars/Cargo.toml | 1 + ext/polars/src/expr/general.rs | 8 +++++ ext/polars/src/lib.rs | 2 ++ lib/polars/expr.rb | 53 ++++++++++++++++++++++++++++++++++ lib/polars/series.rb | 53 ++++++++++++++++++++++++++++++++++ 6 files changed, 118 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 473dd0389a..06f7009fd4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.7.1 (unreleased) - Added `cut` and `qcut` methods to `Series` and `Expr` +- Added `rle` and `rle_id` methods to `Series` and `Expr` - Added `bottom_k` method to `Series` - Fixed error with `top_k` method diff --git a/ext/polars/Cargo.toml b/ext/polars/Cargo.toml index 9891688134..a966c40c76 100644 --- a/ext/polars/Cargo.toml +++ b/ext/polars/Cargo.toml @@ -78,6 +78,7 @@ features = [ "range", "reinterpret", "repeat_by", + "rle", "rolling_window", "round_series", "row_hash", diff --git a/ext/polars/src/expr/general.rs b/ext/polars/src/expr/general.rs index 75c02f627a..275b5e5c80 100644 --- a/ext/polars/src/expr/general.rs +++ b/ext/polars/src/expr/general.rs @@ -209,6 +209,14 @@ impl RbExpr { .into() } + pub fn rle(&self) -> Self { + self.inner.clone().rle().into() + } + + pub fn rle_id(&self) -> Self { + self.inner.clone().rle_id().into() + } + pub fn agg_groups(&self) -> Self { self.clone().inner.agg_groups().into() } diff --git a/ext/polars/src/lib.rs b/ext/polars/src/lib.rs index 1d9948d83e..8998811a4f 100644 --- a/ext/polars/src/lib.rs +++ b/ext/polars/src/lib.rs @@ -312,6 +312,8 @@ fn init(ruby: &Ruby) -> RbResult<()> { class.define_method("cut", method!(RbExpr::cut, 4))?; class.define_method("qcut", method!(RbExpr::qcut, 5))?; class.define_method("qcut_uniform", method!(RbExpr::qcut_uniform, 5))?; + class.define_method("rle", method!(RbExpr::rle, 0))?; + class.define_method("rle_id", method!(RbExpr::rle_id, 0))?; class.define_method("agg_groups", method!(RbExpr::agg_groups, 0))?; class.define_method("count", method!(RbExpr::count, 0))?; class.define_method("value_counts", method!(RbExpr::value_counts, 2))?; diff --git a/lib/polars/expr.rb b/lib/polars/expr.rb index f2b9f9c644..b5fe66279f 100644 --- a/lib/polars/expr.rb +++ b/lib/polars/expr.rb @@ -2625,6 +2625,59 @@ def qcut(quantiles, labels: nil, left_closed: false, allow_duplicates: false, in wrap_expr(rbexpr) end + # Get the lengths of runs of identical values. + # + # @return [Expr] + # + # @example + # df = Polars::DataFrame.new(Polars::Series.new("s", [1, 1, 2, 1, nil, 1, 3, 3])) + # df.select(Polars.col("s").rle).unnest("s") + # # => + # # shape: (6, 2) + # # ┌─────────┬────────┐ + # # │ lengths ┆ values │ + # # │ --- ┆ --- │ + # # │ i32 ┆ i64 │ + # # ╞═════════╪════════╡ + # # │ 2 ┆ 1 │ + # # │ 1 ┆ 2 │ + # # │ 1 ┆ 1 │ + # # │ 1 ┆ null │ + # # │ 1 ┆ 1 │ + # # │ 2 ┆ 3 │ + # # └─────────┴────────┘ + def rle + wrap_expr(_rbexpr.rle) + end + + # Map values to run IDs. + # + # Similar to RLE, but it maps each value to an ID corresponding to the run into + # which it falls. This is especially useful when you want to define groups by + # runs of identical values rather than the values themselves. + # + # @return [Expr] + # + # @example + # df = Polars::DataFrame.new({"a" => [1, 2, 1, 1, 1], "b" => ["x", "x", nil, "y", "y"]}) + # df.with_columns([Polars.col("a").rle_id.alias("a_r"), Polars.struct(["a", "b"]).rle_id.alias("ab_r")]) + # # => + # # shape: (5, 4) + # # ┌─────┬──────┬─────┬──────┐ + # # │ a ┆ b ┆ a_r ┆ ab_r │ + # # │ --- ┆ --- ┆ --- ┆ --- │ + # # │ i64 ┆ str ┆ u32 ┆ u32 │ + # # ╞═════╪══════╪═════╪══════╡ + # # │ 1 ┆ x ┆ 0 ┆ 0 │ + # # │ 2 ┆ x ┆ 1 ┆ 1 │ + # # │ 1 ┆ null ┆ 2 ┆ 2 │ + # # │ 1 ┆ y ┆ 2 ┆ 3 │ + # # │ 1 ┆ y ┆ 2 ┆ 3 │ + # # └─────┴──────┴─────┴──────┘ + def rle_id + wrap_expr(_rbexpr.rle_id) + end + # Filter a single column. # # Mostly useful in an aggregation context. If you want to filter on a DataFrame diff --git a/lib/polars/series.rb b/lib/polars/series.rb index 46d43d167e..6b9e13c606 100644 --- a/lib/polars/series.rb +++ b/lib/polars/series.rb @@ -888,6 +888,59 @@ def qcut(quantiles, labels: nil, left_closed: false, allow_duplicates: false, in result end + # Get the lengths of runs of identical values. + # + # @return [Series] + # + # @example + # s = Polars::Series.new("s", [1, 1, 2, 1, nil, 1, 3, 3]) + # s.rle.struct.unnest + # # => + # # shape: (6, 2) + # # ┌─────────┬────────┐ + # # │ lengths ┆ values │ + # # │ --- ┆ --- │ + # # │ i32 ┆ i64 │ + # # ╞═════════╪════════╡ + # # │ 2 ┆ 1 │ + # # │ 1 ┆ 2 │ + # # │ 1 ┆ 1 │ + # # │ 1 ┆ null │ + # # │ 1 ┆ 1 │ + # # │ 2 ┆ 3 │ + # # └─────────┴────────┘ + def rle + super + end + + # Map values to run IDs. + # + # Similar to RLE, but it maps each value to an ID corresponding to the run into + # which it falls. This is especially useful when you want to define groups by + # runs of identical values rather than the values themselves. + # + # @return [Series] + # + # @example + # s = Polars::Series.new("s", [1, 1, 2, 1, nil, 1, 3, 3]) + # s.rle_id() + # # => + # # shape: (8,) + # # Series: 's' [u32] + # # [ + # # 0 + # # 0 + # # 1 + # # 2 + # # 3 + # # 4 + # # 5 + # # 5 + # # ] + def rle_id + super + end + # Count the unique values in a Series. # # @param sort [Boolean]