diff --git a/crates/polars-ops/src/chunked_array/strings/pad.rs b/crates/polars-ops/src/chunked_array/strings/pad.rs index 19ed3ebc6719..66ff80953da6 100644 --- a/crates/polars-ops/src/chunked_array/strings/pad.rs +++ b/crates/polars-ops/src/chunked_array/strings/pad.rs @@ -6,7 +6,7 @@ pub(super) fn pad_end<'a>(ca: &'a Utf8Chunked, length: usize, fill_char: char) - // amortize allocation let mut buf = String::new(); let f = |s: &'a str| { - let padding = length.saturating_sub(s.len()); + let padding = length.saturating_sub(s.chars().count()); if padding == 0 { s } else { @@ -28,7 +28,7 @@ pub(super) fn pad_start<'a>(ca: &'a Utf8Chunked, length: usize, fill_char: char) // amortize allocation let mut buf = String::new(); let f = |s: &'a str| { - let padding = length.saturating_sub(s.len()); + let padding = length.saturating_sub(s.chars().count()); if padding == 0 { s } else { diff --git a/py-polars/polars/expr/string.py b/py-polars/polars/expr/string.py index 96caa774aafb..695d0cba55fd 100644 --- a/py-polars/polars/expr/string.py +++ b/py-polars/polars/expr/string.py @@ -854,6 +854,11 @@ def zfill(self, length: int) -> Expr: -------- pad_start + Notes + ----- + This method is intended for padding numeric strings. It counts bytes, not + characters, so use :func:`pad_start` for data with non-ASCII characters. + Examples -------- >>> df = pl.DataFrame({"a": [-1, 123, 999999, None]}) diff --git a/py-polars/polars/series/string.py b/py-polars/polars/series/string.py index 57e613c04c64..1b8d29d2d693 100644 --- a/py-polars/polars/series/string.py +++ b/py-polars/polars/series/string.py @@ -1355,6 +1355,11 @@ def zfill(self, length: int) -> Series: -------- pad_start + Notes + ----- + This method is intended for padding numeric strings. It counts bytes, not + characters, so use :func:`pad_start` for data with non-ASCII characters. 
+ Examples -------- >>> s = pl.Series([-1, 123, 999999, None]) diff --git a/py-polars/tests/unit/namespaces/string/test_pad.py b/py-polars/tests/unit/namespaces/string/test_pad.py index ff28febcd7ef..2b8e5c032817 100644 --- a/py-polars/tests/unit/namespaces/string/test_pad.py +++ b/py-polars/tests/unit/namespaces/string/test_pad.py @@ -86,3 +86,30 @@ def test_str_rjust_deprecated() -> None: expected = pl.Series([" a", " bc", " def"]) assert_series_equal(result, expected) + + +def test_pad_end_unicode() -> None: + lf = pl.LazyFrame({"a": ["Café", "345", "東京", None]}) + + result = lf.select(pl.col("a").str.pad_end(6, "日")) + + expected = pl.LazyFrame({"a": ["Café日日", "345日日日", "東京日日日日", None]}) + assert_frame_equal(result, expected) + + +def test_pad_start_unicode() -> None: + lf = pl.LazyFrame({"a": ["Café", "345", "東京", None]}) + + result = lf.select(pl.col("a").str.pad_start(6, "日")) + + expected = pl.LazyFrame({"a": ["日日Café", "日日日345", "日日日日東京", None]}) + assert_frame_equal(result, expected) + + +def test_str_zfill_unicode_not_respected() -> None: + lf = pl.LazyFrame({"a": ["Café", "345", "東京", None]}) + + result = lf.select(pl.col("a").str.zfill(6)) + + expected = pl.LazyFrame({"a": ["0Café", "000345", "東京", None]}) + assert_frame_equal(result, expected)