Skip to content

Commit

Permalink
Added coalesce option to join and join_asof methods (#83)
Browse files Browse the repository at this point in the history
  • Loading branch information
simpl1g authored Oct 11, 2024
1 parent 125e4f0 commit 837e61b
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 10 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
- Added support for array of name-type pairs to `schema` option
- Added `cast` method to `DataFrame` and `LazyFrame`
- Fixed `read_database` connection leasing for Active Record 7.2
- Added `coalesce` option to `join` and `join_asof` methods

## 0.14.0 (2024-09-17)

Expand Down
14 changes: 14 additions & 0 deletions ext/polars/src/lazyframe/general.rs
Original file line number Diff line number Diff line change
Expand Up @@ -539,7 +539,13 @@ impl RbLazyFrame {
strategy: Wrap<AsofStrategy>,
tolerance: Option<Wrap<AnyValue<'_>>>,
tolerance_str: Option<String>,
coalesce: bool,
) -> RbResult<Self> {
let coalesce = if coalesce {
JoinCoalesce::CoalesceColumns
} else {
JoinCoalesce::KeepColumns
};
let ldf = self.ldf.borrow().clone();
let other = other.ldf.borrow().clone();
let left_on = left_on.inner.clone();
Expand All @@ -551,6 +557,7 @@ impl RbLazyFrame {
.right_on([right_on])
.allow_parallel(allow_parallel)
.force_parallel(force_parallel)
.coalesce(coalesce)
.how(JoinType::AsOf(AsOfOptions {
strategy: strategy.0,
left_by: left_by.map(strings_to_pl_smallstr),
Expand All @@ -574,7 +581,13 @@ impl RbLazyFrame {
join_nulls: bool,
how: Wrap<JoinType>,
suffix: String,
coalesce: Option<bool>,
) -> RbResult<Self> {
let coalesce = match coalesce {
None => JoinCoalesce::JoinSpecific,
Some(true) => JoinCoalesce::CoalesceColumns,
Some(false) => JoinCoalesce::KeepColumns,
};
let ldf = self.ldf.borrow().clone();
let other = other.ldf.borrow().clone();
let left_on = rb_exprs_to_exprs(left_on)?;
Expand All @@ -590,6 +603,7 @@ impl RbLazyFrame {
.join_nulls(join_nulls)
.how(how.0)
.suffix(suffix)
.coalesce(coalesce)
.finish()
.into())
}
Expand Down
4 changes: 2 additions & 2 deletions ext/polars/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -755,8 +755,8 @@ fn init(ruby: &Ruby) -> RbResult<()> {
method!(RbLazyFrame::group_by_dynamic, 9),
)?;
class.define_method("with_context", method!(RbLazyFrame::with_context, 1))?;
class.define_method("join_asof", method!(RbLazyFrame::join_asof, 11))?;
class.define_method("join", method!(RbLazyFrame::join, 8))?;
class.define_method("join_asof", method!(RbLazyFrame::join_asof, 12))?;
class.define_method("join", method!(RbLazyFrame::join, 9))?;
class.define_method("with_column", method!(RbLazyFrame::with_column, 1))?;
class.define_method("with_columns", method!(RbLazyFrame::with_columns, 1))?;
class.define_method(
Expand Down
30 changes: 26 additions & 4 deletions lib/polars/data_frame.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2234,6 +2234,11 @@ def upsample(
# @param force_parallel [Boolean]
# Force the physical plan to evaluate the computation of both DataFrames up to
# the join in parallel.
# @param coalesce [Boolean]
# Coalescing behavior (merging of join columns).
# - true: -> Always coalesce join columns.
# - false: -> Never coalesce join columns.
# Note that joining on any other expressions than `col` will turn off coalescing.
#
# @return [DataFrame]
#
Expand Down Expand Up @@ -2287,7 +2292,8 @@ def join_asof(
suffix: "_right",
tolerance: nil,
allow_parallel: true,
force_parallel: false
force_parallel: false,
coalesce: true
)
lazy
.join_asof(
Expand All @@ -2302,7 +2308,8 @@ def join_asof(
suffix: suffix,
tolerance: tolerance,
allow_parallel: allow_parallel,
force_parallel: force_parallel
force_parallel: force_parallel,
coalesce: coalesce
)
.collect(no_optimization: true)
end
Expand All @@ -2323,6 +2330,12 @@ def join_asof(
# Suffix to append to columns with a duplicate name.
# @param join_nulls [Boolean]
# Join on null values. By default null values will never produce matches.
# @param coalesce [Boolean]
# Coalescing behavior (merging of join columns).
# - nil: -> join specific.
# - true: -> Always coalesce join columns.
# - false: -> Never coalesce join columns.
# Note that joining on any other expressions than `col` will turn off coalescing.
#
# @return [DataFrame]
#
Expand Down Expand Up @@ -2405,7 +2418,15 @@ def join_asof(
# # ╞═════╪═════╪═════╡
# # │ 3 ┆ 8.0 ┆ c │
# # └─────┴─────┴─────┘
def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right", join_nulls: false)
def join(other,
left_on: nil,
right_on: nil,
on: nil,
how: "inner",
suffix: "_right",
join_nulls: false,
coalesce: nil
)
lazy
.join(
other.lazy,
Expand All @@ -2414,7 +2435,8 @@ def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_ri
on: on,
how: how,
suffix: suffix,
join_nulls: join_nulls
join_nulls: join_nulls,
coalesce: coalesce
)
.collect(no_optimization: true)
end
Expand Down
23 changes: 19 additions & 4 deletions lib/polars/lazy_frame.rb
Original file line number Diff line number Diff line change
Expand Up @@ -1581,6 +1581,11 @@ def group_by_dynamic(
# @param force_parallel [Boolean]
# Force the physical plan to evaluate the computation of both DataFrames up to
# the join in parallel.
# @param coalesce [Boolean]
# Coalescing behavior (merging of join columns).
# - true: -> Always coalesce join columns.
# - false: -> Never coalesce join columns.
# Note that joining on any other expressions than `col` will turn off coalescing.
#
# @return [LazyFrame]
def join_asof(
Expand All @@ -1595,7 +1600,8 @@ def join_asof(
suffix: "_right",
tolerance: nil,
allow_parallel: true,
force_parallel: false
force_parallel: false,
coalesce: true
)
if !other.is_a?(LazyFrame)
raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
Expand Down Expand Up @@ -1650,7 +1656,8 @@ def join_asof(
suffix,
strategy,
tolerance_num,
tolerance_str
tolerance_str,
coalesce
)
)
end
Expand Down Expand Up @@ -1678,6 +1685,12 @@ def join_asof(
# @param force_parallel [Boolean]
# Force the physical plan to evaluate the computation of both DataFrames up to
# the join in parallel.
# @param coalesce [Boolean]
# Coalescing behavior (merging of join columns).
# - nil: -> join specific.
# - true: -> Always coalesce join columns.
# - false: -> Never coalesce join columns.
# Note that joining on any other expressions than `col` will turn off coalescing.
#
# @return [LazyFrame]
#
Expand Down Expand Up @@ -1769,7 +1782,8 @@ def join(
suffix: "_right",
join_nulls: false,
allow_parallel: true,
force_parallel: false
force_parallel: false,
coalesce: nil
)
if !other.is_a?(LazyFrame)
raise ArgumentError, "Expected a `LazyFrame` as join table, got #{other.class.name}"
Expand All @@ -1780,7 +1794,7 @@ def join(
elsif how == "cross"
return _from_rbldf(
_ldf.join(
other._ldf, [], [], allow_parallel, join_nulls, force_parallel, how, suffix
other._ldf, [], [], allow_parallel, join_nulls, force_parallel, how, suffix, coalesce
)
)
end
Expand All @@ -1806,6 +1820,7 @@ def join(
join_nulls,
how,
suffix,
coalesce
)
)
end
Expand Down
13 changes: 13 additions & 0 deletions test/data_frame_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,19 @@ def test_join_outer
assert_frame expected, df3.sort("L1", nulls_last: true)
end

def test_join_outer_coalesce
df1 = Polars::DataFrame.new({"L1" => ["a", "b", "c"], "L2" => [1, 2, 3]})
df2 = Polars::DataFrame.new({"L1" => ["a", "c", "d"], "R2" => [7, 8, 9]})
df3 = df1.join(df2, on: "L1", how: "full", coalesce: true)
expected =
Polars::DataFrame.new({
"L1" => ["a", "b", "c", "d"],
"L2" => [1, 2, 3, nil],
"R2" => [7, nil, 8, 9]
})
assert_frame expected, df3.sort("L1")
end

def test_join_cross
df1 = Polars::DataFrame.new({a: [1, 2]})
df2 = Polars::DataFrame.new({b: ["three", "four"]})
Expand Down

0 comments on commit 837e61b

Please sign in to comment.