Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Fix padding for non-ASCII strings #12008

Merged
merged 3 commits into from
Oct 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions crates/polars-ops/src/chunked_array/strings/pad.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ pub(super) fn pad_end<'a>(ca: &'a Utf8Chunked, length: usize, fill_char: char) -
// amortize allocation
let mut buf = String::new();
let f = |s: &'a str| {
let padding = length.saturating_sub(s.len());
let padding = length.saturating_sub(s.chars().count());
if padding == 0 {
s
} else {
Expand All @@ -28,7 +28,7 @@ pub(super) fn pad_start<'a>(ca: &'a Utf8Chunked, length: usize, fill_char: char)
// amortize allocation
let mut buf = String::new();
let f = |s: &'a str| {
let padding = length.saturating_sub(s.len());
let padding = length.saturating_sub(s.chars().count());
if padding == 0 {
s
} else {
Expand Down
5 changes: 5 additions & 0 deletions py-polars/polars/expr/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -854,6 +854,11 @@ def zfill(self, length: int) -> Expr:
--------
pad_start
Notes
-----
This method is intended for padding numeric strings. If your data contains
non-ASCII characters, use :func:`pad_start` instead.
Examples
--------
>>> df = pl.DataFrame({"a": [-1, 123, 999999, None]})
Expand Down
5 changes: 5 additions & 0 deletions py-polars/polars/series/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -1355,6 +1355,11 @@ def zfill(self, length: int) -> Series:
--------
pad_start
Notes
-----
This method is intended for padding numeric strings. If your data contains
non-ASCII characters, use :func:`pad_start` instead.
Examples
--------
>>> s = pl.Series([-1, 123, 999999, None])
Expand Down
27 changes: 27 additions & 0 deletions py-polars/tests/unit/namespaces/string/test_pad.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,30 @@ def test_str_rjust_deprecated() -> None:

expected = pl.Series([" a", " bc", " def"])
assert_series_equal(result, expected)


def test_pad_end_unicode() -> None:
    """Check ``str.pad_end`` on a column mixing ASCII, accented and CJK strings.

    Each non-null value is padded on the right with "日" up to a target
    length of 6; the null entry stays null.
    """
    frame = pl.LazyFrame({"a": ["Café", "345", "東京", None]})
    padded = frame.select(pl.col("a").str.pad_end(6, "日"))

    # "Café" gains 2 fill characters, "345" gains 3 and "東京" gains 4.
    assert_frame_equal(
        padded,
        pl.LazyFrame({"a": ["Café日日", "345日日日", "東京日日日日", None]}),
    )


def test_pad_start_unicode() -> None:
    """Check ``str.pad_start`` on a column mixing ASCII, accented and CJK strings.

    Each non-null value is padded on the left with "日" up to a target
    length of 6; the null entry stays null.
    """
    frame = pl.LazyFrame({"a": ["Café", "345", "東京", None]})
    padded = frame.select(pl.col("a").str.pad_start(6, "日"))

    # "Café" gains 2 fill characters, "345" gains 3 and "東京" gains 4.
    assert_frame_equal(
        padded,
        pl.LazyFrame({"a": ["日日Café", "日日日345", "日日日日東京", None]}),
    )


def test_str_zfill_unicode_not_respected() -> None:
    """Pin the current ``str.zfill`` output for non-ASCII input.

    With a target length of 6, "Café" receives a single leading zero and
    "東京" receives none, while the ASCII value "345" is filled to six
    characters.
    """
    # NOTE(review): presumably zfill measures UTF-8 bytes rather than
    # characters, which would explain these expected values - confirm
    # against the implementation.
    frame = pl.LazyFrame({"a": ["Café", "345", "東京", None]})
    filled = frame.select(pl.col("a").str.zfill(6))

    assert_frame_equal(
        filled,
        pl.LazyFrame({"a": ["0Café", "000345", "東京", None]}),
    )