Skip to content

Commit

Permalink
Added cut method to Series and Expr - closes #38
Browse files — browse the repository at this point in the history
  • Loading branch information
ankane committed Nov 21, 2023
1 parent d11ff23 commit ec58719
Show file tree
Hide file tree
Showing 6 changed files with 139 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
## 0.7.1 (unreleased)

- Added `cut` method to `Series` and `Expr`
- Added `bottom_k` method to `Series`
- Fixed error with `top_k` method

Expand Down
1 change: 1 addition & 0 deletions ext/polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ features = [
"csv",
"cum_agg",
"cumulative_eval",
"cutqcut",
"dataframe_arithmetic",
"date_offset",
"diagonal_concat",
Expand Down
13 changes: 13 additions & 0 deletions ext/polars/src/expr/general.rs
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,19 @@ impl RbExpr {
.into()
}

/// Bins the values of this expression into discrete categories.
///
/// * `breaks` - the cut points separating the bins.
/// * `labels` - optional names for the resulting categories.
/// * `left_closed` - make the intervals left-closed instead of right-closed.
/// * `include_breaks` - also emit the right endpoint of each bin.
///
/// The wrapped expression is cloned, so `self` is left untouched.
pub fn cut(
    &self,
    breaks: Vec<f64>,
    labels: Option<Vec<String>>,
    left_closed: bool,
    include_breaks: bool,
) -> Self {
    let expr = self.inner.clone();
    let binned = expr.cut(breaks, labels, left_closed, include_breaks);
    binned.into()
}

/// Calls `agg_groups` on a clone of the wrapped expression and wraps the
/// result back into `Self`.
pub fn agg_groups(&self) -> Self {
    let expr = self.inner.clone();
    expr.agg_groups().into()
}
Expand Down
1 change: 1 addition & 0 deletions ext/polars/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,7 @@ fn init(ruby: &Ruby) -> RbResult<()> {
class.define_method("last", method!(RbExpr::last, 0))?;
class.define_method("implode", method!(RbExpr::implode, 0))?;
class.define_method("quantile", method!(RbExpr::quantile, 2))?;
class.define_method("cut", method!(RbExpr::cut, 4))?;
class.define_method("agg_groups", method!(RbExpr::agg_groups, 0))?;
class.define_method("count", method!(RbExpr::count, 0))?;
class.define_method("value_counts", method!(RbExpr::value_counts, 2))?;
Expand Down
56 changes: 56 additions & 0 deletions lib/polars/expr.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2478,6 +2478,62 @@ def quantile(quantile, interpolation: "nearest")
wrap_expr(_rbexpr.quantile(quantile._rbexpr, interpolation))
end

# Bin continuous values into discrete categories.
#
# @param breaks [Array]
# List of unique cut points.
# @param labels [Array]
# Names of the categories. The number of labels must be equal to the number
# of cut points plus one.
# @param left_closed [Boolean]
# Set the intervals to be left-closed instead of right-closed.
# @param include_breaks [Boolean]
# Include a column with the right endpoint of the bin each observation falls
# in. This will change the data type of the output from a
# `Categorical` to a `Struct`.
#
# @return [Expr]
#
# @example Divide a column into three categories.
# df = Polars::DataFrame.new({"foo" => [-2, -1, 0, 1, 2]})
# df.with_columns(
# Polars.col("foo").cut([-1, 1], labels: ["a", "b", "c"]).alias("cut")
# )
# # =>
# # shape: (5, 2)
# # ┌─────┬─────┐
# # │ foo ┆ cut │
# # │ --- ┆ --- │
# # │ i64 ┆ cat │
# # ╞═════╪═════╡
# # │ -2 ┆ a │
# # │ -1 ┆ a │
# # │ 0 ┆ b │
# # │ 1 ┆ b │
# # │ 2 ┆ c │
# # └─────┴─────┘
#
# @example Add both the category and the breakpoint.
# df.with_columns(
# Polars.col("foo").cut([-1, 1], include_breaks: true).alias("cut")
# ).unnest("cut")
# # =>
# # shape: (5, 3)
# # ┌─────┬──────┬────────────┐
# # │ foo ┆ brk ┆ foo_bin │
# # │ --- ┆ --- ┆ --- │
# # │ i64 ┆ f64 ┆ cat │
# # ╞═════╪══════╪════════════╡
# # │ -2 ┆ -1.0 ┆ (-inf, -1] │
# # │ -1 ┆ -1.0 ┆ (-inf, -1] │
# # │ 0 ┆ 1.0 ┆ (-1, 1] │
# # │ 1 ┆ 1.0 ┆ (-1, 1] │
# # │ 2 ┆ inf ┆ (1, inf] │
# # └─────┴──────┴────────────┘
def cut(breaks, labels: nil, left_closed: false, include_breaks: false)
  # Hand the arguments positionally to the native expression, then wrap
  # the native result back into a Ruby-level Expr.
  binned = _rbexpr.cut(breaks, labels, left_closed, include_breaks)
  wrap_expr(binned)
end

# Filter a single column.
#
# Mostly useful in an aggregation context. If you want to filter on a DataFrame
Expand Down
67 changes: 67 additions & 0 deletions lib/polars/series.rb
Original file line number Diff line number Diff line change
Expand Up @@ -735,6 +735,73 @@ def to_dummies(separator: "_", drop_first: false)
Utils.wrap_df(_s.to_dummies(separator, drop_first))
end

# Bin continuous values into discrete categories.
#
# @param breaks [Array]
# List of unique cut points.
# @param labels [Array]
# Names of the categories. The number of labels must be equal to the number
# of cut points plus one.
# @param left_closed [Boolean]
# Set the intervals to be left-closed instead of right-closed.
# @param include_breaks [Boolean]
# Include a column with the right endpoint of the bin each observation falls
# in. This will change the data type of the output from a
# `Categorical` to a `Struct`.
#
# @return [Series]
#
# @example Divide the column into three categories.
# s = Polars::Series.new("foo", [-2, -1, 0, 1, 2])
# s.cut([-1, 1], labels: ["a", "b", "c"])
# # =>
# # shape: (5,)
# # Series: 'foo' [cat]
# # [
# # "a"
# # "a"
# # "b"
# # "b"
# # "c"
# # ]
#
# @example Create a DataFrame with the breakpoint and category for each value.
# cut = s.cut([-1, 1], include_breaks: true).alias("cut")
# s.to_frame.with_columns(cut).unnest("cut")
# # =>
# # shape: (5, 3)
# # ┌─────┬─────────────┬────────────┐
# # │ foo ┆ break_point ┆ category │
# # │ --- ┆ --- ┆ --- │
# # │ i64 ┆ f64 ┆ cat │
# # ╞═════╪═════════════╪════════════╡
# # │ -2 ┆ -1.0 ┆ (-inf, -1] │
# # │ -1 ┆ -1.0 ┆ (-inf, -1] │
# # │ 0 ┆ 1.0 ┆ (-1, 1] │
# # │ 1 ┆ 1.0 ┆ (-1, 1] │
# # │ 2 ┆ inf ┆ (1, inf] │
# # └─────┴─────────────┴────────────┘
def cut(breaks, labels: nil, left_closed: false, include_breaks: false)
  # Build the expression-level cut against this series' own column name.
  cut_expr = Polars.col(name).cut(
    breaks,
    labels: labels,
    left_closed: left_closed,
    include_breaks: include_breaks
  )

  # Evaluate it through a one-column frame and pull the single result back out.
  binned = to_frame.select(cut_expr).to_series
  return binned unless include_breaks

  # With breaks included the result is a struct; give its fields stable names.
  binned.struct.rename_fields(["break_point", "category"])
end

# Count the unique values in a Series.
#
# @param sort [Boolean]
Expand Down

0 comments on commit ec58719

Please sign in to comment.