diff --git a/CHANGELOG.md b/CHANGELOG.md
index 10bd3d1453..aacc28720e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,6 @@
 ## 0.7.1 (unreleased)
 
+- Added `cut` method to `Series` and `Expr`
 - Added `bottom_k` method to `Series`
 - Fixed error with `top_k` method
 
diff --git a/ext/polars/Cargo.toml b/ext/polars/Cargo.toml
index 13797ff5c8..9891688134 100644
--- a/ext/polars/Cargo.toml
+++ b/ext/polars/Cargo.toml
@@ -35,6 +35,7 @@ features = [
     "csv",
     "cum_agg",
     "cumulative_eval",
+    "cutqcut",
     "dataframe_arithmetic",
     "date_offset",
     "diagonal_concat",
diff --git a/ext/polars/src/expr/general.rs b/ext/polars/src/expr/general.rs
index 45485afbc0..7ed79d605a 100644
--- a/ext/polars/src/expr/general.rs
+++ b/ext/polars/src/expr/general.rs
@@ -162,6 +162,19 @@ impl RbExpr {
             .into()
     }
 
+    pub fn cut(
+        &self,
+        breaks: Vec<f64>,
+        labels: Option<Vec<String>>,
+        left_closed: bool,
+        include_breaks: bool,
+    ) -> Self {
+        self.inner
+            .clone()
+            .cut(breaks, labels, left_closed, include_breaks)
+            .into()
+    }
+
     pub fn agg_groups(&self) -> Self {
         self.clone().inner.agg_groups().into()
     }
diff --git a/ext/polars/src/lib.rs b/ext/polars/src/lib.rs
index ec7b3948de..f4efef6a88 100644
--- a/ext/polars/src/lib.rs
+++ b/ext/polars/src/lib.rs
@@ -309,6 +309,7 @@ fn init(ruby: &Ruby) -> RbResult<()> {
     class.define_method("last", method!(RbExpr::last, 0))?;
     class.define_method("implode", method!(RbExpr::implode, 0))?;
     class.define_method("quantile", method!(RbExpr::quantile, 2))?;
+    class.define_method("cut", method!(RbExpr::cut, 4))?;
     class.define_method("agg_groups", method!(RbExpr::agg_groups, 0))?;
     class.define_method("count", method!(RbExpr::count, 0))?;
     class.define_method("value_counts", method!(RbExpr::value_counts, 2))?;
diff --git a/lib/polars/expr.rb b/lib/polars/expr.rb
index 6ac1206db4..a9003021b0 100644
--- a/lib/polars/expr.rb
+++ b/lib/polars/expr.rb
@@ -2478,6 +2478,62 @@ def quantile(quantile, interpolation: "nearest")
       wrap_expr(_rbexpr.quantile(quantile._rbexpr, interpolation))
     end
 
+    # Bin continuous values into discrete categories.
+    #
+    # @param breaks [Array]
+    #   List of unique cut points.
+    # @param labels [Array]
+    #   Names of the categories. The number of labels must be equal to the number
+    #   of cut points plus one.
+    # @param left_closed [Boolean]
+    #   Set the intervals to be left-closed instead of right-closed.
+    # @param include_breaks [Boolean]
+    #   Include a column with the right endpoint of the bin each observation falls
+    #   in. This will change the data type of the output from a
+    #   `Categorical` to a `Struct`.
+    #
+    # @return [Expr]
+    #
+    # @example Divide a column into three categories.
+    #   df = Polars::DataFrame.new({"foo" => [-2, -1, 0, 1, 2]})
+    #   df.with_columns(
+    #     Polars.col("foo").cut([-1, 1], labels: ["a", "b", "c"]).alias("cut")
+    #   )
+    #   # =>
+    #   # shape: (5, 2)
+    #   # ┌─────┬─────┐
+    #   # │ foo ┆ cut │
+    #   # │ --- ┆ --- │
+    #   # │ i64 ┆ cat │
+    #   # ╞═════╪═════╡
+    #   # │ -2  ┆ a   │
+    #   # │ -1  ┆ a   │
+    #   # │ 0   ┆ b   │
+    #   # │ 1   ┆ b   │
+    #   # │ 2   ┆ c   │
+    #   # └─────┴─────┘
+    #
+    # @example Add both the category and the breakpoint.
+    #   df.with_columns(
+    #     Polars.col("foo").cut([-1, 1], include_breaks: true).alias("cut")
+    #   ).unnest("cut")
+    #   # =>
+    #   # shape: (5, 3)
+    #   # ┌─────┬──────┬────────────┐
+    #   # │ foo ┆ brk  ┆ foo_bin    │
+    #   # │ --- ┆ ---  ┆ ---        │
+    #   # │ i64 ┆ f64  ┆ cat        │
+    #   # ╞═════╪══════╪════════════╡
+    #   # │ -2  ┆ -1.0 ┆ (-inf, -1] │
+    #   # │ -1  ┆ -1.0 ┆ (-inf, -1] │
+    #   # │ 0   ┆ 1.0  ┆ (-1, 1]    │
+    #   # │ 1   ┆ 1.0  ┆ (-1, 1]    │
+    #   # │ 2   ┆ inf  ┆ (1, inf]   │
+    #   # └─────┴──────┴────────────┘
+    def cut(breaks, labels: nil, left_closed: false, include_breaks: false)
+      wrap_expr(_rbexpr.cut(breaks, labels, left_closed, include_breaks))
+    end
+
     # Filter a single column.
     #
     # Mostly useful in an aggregation context. If you want to filter on a DataFrame
diff --git a/lib/polars/series.rb b/lib/polars/series.rb
index f605477ba7..1bf97dfb6a 100644
--- a/lib/polars/series.rb
+++ b/lib/polars/series.rb
@@ -735,6 +735,73 @@ def to_dummies(separator: "_", drop_first: false)
       Utils.wrap_df(_s.to_dummies(separator, drop_first))
     end
 
+    # Bin continuous values into discrete categories.
+    #
+    # @param breaks [Array]
+    #   List of unique cut points.
+    # @param labels [Array]
+    #   Names of the categories. The number of labels must be equal to the number
+    #   of cut points plus one.
+    # @param left_closed [Boolean]
+    #   Set the intervals to be left-closed instead of right-closed.
+    # @param include_breaks [Boolean]
+    #   Include a column with the right endpoint of the bin each observation falls
+    #   in. This will change the data type of the output from a
+    #   `Categorical` to a `Struct`.
+    #
+    # @return [Series]
+    #
+    # @example Divide the column into three categories.
+    #   s = Polars::Series.new("foo", [-2, -1, 0, 1, 2])
+    #   s.cut([-1, 1], labels: ["a", "b", "c"])
+    #   # =>
+    #   # shape: (5,)
+    #   # Series: 'foo' [cat]
+    #   # [
+    #   #         "a"
+    #   #         "a"
+    #   #         "b"
+    #   #         "b"
+    #   #         "c"
+    #   # ]
+    #
+    # @example Create a DataFrame with the breakpoint and category for each value.
+    #   cut = s.cut([-1, 1], include_breaks: true).alias("cut")
+    #   s.to_frame.with_columns(cut).unnest("cut")
+    #   # =>
+    #   # shape: (5, 3)
+    #   # ┌─────┬─────────────┬────────────┐
+    #   # │ foo ┆ break_point ┆ category   │
+    #   # │ --- ┆ ---         ┆ ---        │
+    #   # │ i64 ┆ f64         ┆ cat        │
+    #   # ╞═════╪═════════════╪════════════╡
+    #   # │ -2  ┆ -1.0        ┆ (-inf, -1] │
+    #   # │ -1  ┆ -1.0        ┆ (-inf, -1] │
+    #   # │ 0   ┆ 1.0         ┆ (-1, 1]    │
+    #   # │ 1   ┆ 1.0         ┆ (-1, 1]    │
+    #   # │ 2   ┆ inf         ┆ (1, inf]   │
+    #   # └─────┴─────────────┴────────────┘
+    def cut(breaks, labels: nil, left_closed: false, include_breaks: false)
+      result = (
+        to_frame
+        .select(
+          Polars.col(name).cut(
+            breaks,
+            labels: labels,
+            left_closed: left_closed,
+            include_breaks: include_breaks
+          )
+        )
+        .to_series
+      )
+
+      if include_breaks
+        result = result.struct.rename_fields(["break_point", "category"])
+      end
+
+      result
+    end
+
     # Count the unique values in a Series.
     #
     # @param sort [Boolean]