Skip to content

Commit

Permalink
feat(python): add support for "outer" mode to frame update method (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie authored Oct 13, 2023
1 parent 0cf3487 commit 717b4df
Show file tree
Hide file tree
Showing 3 changed files with 133 additions and 59 deletions.
66 changes: 46 additions & 20 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -9725,7 +9725,7 @@ def update(
on: str | Sequence[str] | None = None,
left_on: str | Sequence[str] | None = None,
right_on: str | Sequence[str] | None = None,
how: Literal["left", "inner"] = "left",
how: Literal["left", "inner", "outer"] = "left",
) -> DataFrame:
"""
Update the values in this `DataFrame` with the non-null values in `other`.
Expand All @@ -9750,10 +9750,12 @@ def update(
Join column(s) of the left DataFrame.
right_on
Join column(s) of the right DataFrame.
how : {'left', 'inner'}
'left' will keep all rows from the left table. Rows may be duplicated if
multiple rows in right frame match left row's `on` key.
'inner' will remove rows that are not found in other
how : {'left', 'inner', 'outer'}
* 'left' will keep all rows from the left table; rows may be duplicated
if multiple rows in the right frame match the left row's key.
* 'inner' keeps only those rows where the key exists in both frames.
* 'outer' will update existing rows where the key matches while also
adding any new rows contained in the given frame.
Examples
--------
Expand All @@ -9777,32 +9779,56 @@ def update(
└─────┴─────┘
>>> new_df = pl.DataFrame(
... {
... "B": [4, None, 6],
... "C": [7, 8, 9],
... "B": [-66, None, -99],
... "C": [5, 3, 1],
... }
... )
>>> new_df
shape: (3, 2)
┌──────┬─────┐
│ B ┆ C │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞══════╪═════╡
│ 4 ┆ 7 │
│ null ┆ 8 │
│ 6 ┆ 9 │
└──────┴─────┘
Update `df` values with the non-null values in `new_df`, by row index:
>>> df.update(new_df)
shape: (4, 2)
┌─────┬─────┐
│ A ┆ B │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1 ┆ 4 │
│ 1 ┆ -66 │
│ 2 ┆ 500 │
│ 3 ┆ 6 │
│ 3 ┆ -99 │
│ 4 ┆ 700 │
└─────┴─────┘
Update `df` values with the non-null values in `new_df`, by row index,
but only keeping those rows that are common to both frames:
>>> df.update(new_df, how="inner")
shape: (3, 2)
┌─────┬─────┐
│ A ┆ B │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1 ┆ -66 │
│ 2 ┆ 500 │
│ 3 ┆ -99 │
└─────┴─────┘
Update `df` values with the non-null values in `new_df`, using an outer join
strategy that defines explicit join columns in each frame:
>>> df.update(new_df, left_on=["A"], right_on=["C"], how="outer")
shape: (5, 2)
┌─────┬─────┐
│ A ┆ B │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1 ┆ -99 │
│ 2 ┆ 500 │
│ 3 ┆ 600 │
│ 4 ┆ 700 │
│ 5 ┆ -66 │
└─────┴─────┘
"""
Expand Down
96 changes: 60 additions & 36 deletions py-polars/polars/lazyframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5560,45 +5560,42 @@ def update(
on: str | Sequence[str] | None = None,
left_on: str | Sequence[str] | None = None,
right_on: str | Sequence[str] | None = None,
how: Literal["left", "inner"] = "left",
how: Literal["left", "inner", "outer"] = "left",
) -> Self:
"""
Update the values in this `LazyFrame` with the non-null values in `other`.
Warnings
--------
This functionality is experimental and may change without it being considered a
breaking change.
Parameters
----------
other
LazyFrame that will be used to update the values
on
Column names that will be joined on.
If none given the row count is used.
Column names that will be joined on; if ``None`` (and neither `left_on` nor
`right_on` is given) the implicit row index is used as the join key instead.
left_on
Join column(s) of the left LazyFrame.
right_on
Join column(s) of the right LazyFrame.
how : {'left', 'inner'}
'left' will keep all rows from the left table. Rows may be duplicated if
multiple rows in right frame match left row's `on` key.
'inner' will remove rows that are not found in other
how : {'left', 'inner', 'outer'}
* 'left' will keep all rows from the left table; rows may be duplicated
if multiple rows in the right frame match the left row's key.
* 'inner' keeps only those rows where the key exists in both frames.
* 'outer' will update existing rows where the key matches while also
adding any new rows contained in the given frame.
Notes
-----
This is syntactic sugar for a left/inner join + coalesce
This is syntactic sugar for a join + coalesce (upsert) operation.
Examples
--------
>>> df = pl.DataFrame(
>>> lf = pl.LazyFrame(
... {
... "A": [1, 2, 3, 4],
... "B": [400, 500, 600, 700],
... }
... )
>>> df
>>> lf.collect()
shape: (4, 2)
┌─────┬─────┐
│ A ┆ B │
Expand All @@ -5610,39 +5607,67 @@ def update(
│ 3 ┆ 600 │
│ 4 ┆ 700 │
└─────┴─────┘
>>> new_df = pl.DataFrame(
>>> new_lf = pl.LazyFrame(
... {
... "B": [4, None, 6],
... "C": [7, 8, 9],
... "B": [-66, None, -99],
... "C": [5, 3, 1],
... }
... )
>>> new_df
shape: (3, 2)
┌──────┬─────┐
│ B ┆ C │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞══════╪═════╡
│ 4 ┆ 7 │
│ null ┆ 8 │
│ 6 ┆ 9 │
└──────┴─────┘
>>> df.update(new_df)
Update `lf` values with the non-null values in `new_lf`, by row index:
>>> lf.update(new_lf).collect()
shape: (4, 2)
┌─────┬─────┐
│ A ┆ B │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1 ┆ 4 │
│ 1 ┆ -66 │
│ 2 ┆ 500 │
│ 3 ┆ 6 │
│ 3 ┆ -99 │
│ 4 ┆ 700 │
└─────┴─────┘
Update `lf` values with the non-null values in `new_lf`, by row index,
but only keeping those rows that are common to both frames:
>>> lf.update(new_lf, how="inner").collect()
shape: (3, 2)
┌─────┬─────┐
│ A ┆ B │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1 ┆ -66 │
│ 2 ┆ 500 │
│ 3 ┆ -99 │
└─────┴─────┘
Update `lf` values with the non-null values in `new_lf`, using an outer join
strategy that defines explicit join columns in each frame:
>>> lf.update(new_lf, left_on=["A"], right_on=["C"], how="outer").collect()
shape: (5, 2)
┌─────┬─────┐
│ A ┆ B │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1 ┆ -99 │
│ 2 ┆ 500 │
│ 3 ┆ 600 │
│ 4 ┆ 700 │
│ 5 ┆ -66 │
└─────┴─────┘
"""
row_count_used = False
if how not in ("left", "inner", "outer"):
raise ValueError(
f"`how` must be one of {{'left', 'inner', 'outer'}}; found {how!r}"
)

row_count_used = False
if on is None:
if left_on is None and right_on is None:
# no keys provided--use row count
Expand All @@ -5657,7 +5682,6 @@ def update(
raise ValueError("missing join columns for left frame")
if right_on is None:
raise ValueError("missing join columns for right frame")

else:
# move on into left/right_on to simplify logic
left_on = right_on = on
Expand All @@ -5676,8 +5700,8 @@ def update(
if name not in right_names:
raise ValueError(f"right join column {name!r} not found")

# no need to join if only join columns are in other
if len(other.columns) == len(right_on):
# no need to join if *only* join columns are in other (inner/left update only)
if how != "outer" and len(other.columns) == len(right_on):
if row_count_used:
return self.drop(row_count_name)
return self
Expand Down
30 changes: 27 additions & 3 deletions py-polars/tests/unit/operations/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -544,11 +544,35 @@ def test_update() -> None:

assert df1.update(df2, on="a").to_dict(False) == {"a": [1, 2, 3], "b": [4, 8, 9]}

a = pl.DataFrame({"a": [1, 2, 3]})
b = pl.DataFrame({"b": [4, 5]})
a = pl.LazyFrame({"a": [1, 2, 3]})
b = pl.LazyFrame({"b": [4, 5], "c": [3, 1]})
c = a.update(b)

assert c.rows() == a.rows()
assert_frame_equal(a, c)

# check behaviour of 'how' param
assert [1, 2, 3] == list(
a.update(b, left_on="a", right_on="c").collect().to_series()
)
assert [1, 3] == list(
a.update(b, how="inner", left_on="a", right_on="c").collect().to_series()
)
assert [1, 2, 3, 4, 5] == sorted(
a.update(b.rename({"b": "a"}), how="outer", on="a").collect().to_series()
)

# edge-case #11684
x = pl.DataFrame({"a": [0, 1]})
y = pl.DataFrame({"a": [2, 3]})
assert [0, 1, 2, 3] == sorted(x.update(y, on="a", how="outer")["a"].to_list())

# disallowed join strategies
for join_strategy in ("cross", "anti", "semi"):
with pytest.raises(
ValueError,
match=f"`how` must be one of {{'left', 'inner', 'outer'}}; found '{join_strategy}'",
):
a.update(b, how=join_strategy) # type: ignore[arg-type]


def test_join_frame_consistency() -> None:
Expand Down

0 comments on commit 717b4df

Please sign in to comment.