From a5c207e0c81ec1005d7fe7195b032d98b6023aa5 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Wed, 25 Oct 2023 08:58:41 +0200 Subject: [PATCH 1/3] Add tests --- .../tests/unit/namespaces/string/test_pad.py | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/py-polars/tests/unit/namespaces/string/test_pad.py b/py-polars/tests/unit/namespaces/string/test_pad.py index ff28febcd7ef..2b8e5c032817 100644 --- a/py-polars/tests/unit/namespaces/string/test_pad.py +++ b/py-polars/tests/unit/namespaces/string/test_pad.py @@ -86,3 +86,30 @@ def test_str_rjust_deprecated() -> None: expected = pl.Series([" a", " bc", " def"]) assert_series_equal(result, expected) + + +def test_pad_end_unicode() -> None: + lf = pl.LazyFrame({"a": ["Café", "345", "東京", None]}) + + result = lf.select(pl.col("a").str.pad_end(6, "日")) + + expected = pl.LazyFrame({"a": ["Café日日", "345日日日", "東京日日日日", None]}) + assert_frame_equal(result, expected) + + +def test_pad_start_unicode() -> None: + lf = pl.LazyFrame({"a": ["Café", "345", "東京", None]}) + + result = lf.select(pl.col("a").str.pad_start(6, "日")) + + expected = pl.LazyFrame({"a": ["日日Café", "日日日345", "日日日日東京", None]}) + assert_frame_equal(result, expected) + + +def test_str_zfill_unicode_not_respected() -> None: + lf = pl.LazyFrame({"a": ["Café", "345", "東京", None]}) + + result = lf.select(pl.col("a").str.zfill(6)) + + expected = pl.LazyFrame({"a": ["0Café", "000345", "東京", None]}) + assert_frame_equal(result, expected) From 9cedacca748644dc7723bf043f123a3ff0760ecc Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Wed, 25 Oct 2023 08:58:52 +0200 Subject: [PATCH 2/3] Fix implementation --- crates/polars-ops/src/chunked_array/strings/pad.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/polars-ops/src/chunked_array/strings/pad.rs b/crates/polars-ops/src/chunked_array/strings/pad.rs index 19ed3ebc6719..66ff80953da6 100644 --- a/crates/polars-ops/src/chunked_array/strings/pad.rs +++ 
b/crates/polars-ops/src/chunked_array/strings/pad.rs @@ -6,7 +6,7 @@ pub(super) fn pad_end<'a>(ca: &'a Utf8Chunked, length: usize, fill_char: char) - // amortize allocation let mut buf = String::new(); let f = |s: &'a str| { - let padding = length.saturating_sub(s.len()); + let padding = length.saturating_sub(s.chars().count()); if padding == 0 { s } else { @@ -28,7 +28,7 @@ pub(super) fn pad_start<'a>(ca: &'a Utf8Chunked, length: usize, fill_char: char) // amortize allocation let mut buf = String::new(); let f = |s: &'a str| { - let padding = length.saturating_sub(s.len()); + let padding = length.saturating_sub(s.chars().count()); if padding == 0 { s } else { From 155bb847396674ff3258748deeb79db372791dd9 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Wed, 25 Oct 2023 08:58:57 +0200 Subject: [PATCH 3/3] Add note --- py-polars/polars/expr/string.py | 5 +++++ py-polars/polars/series/string.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/py-polars/polars/expr/string.py b/py-polars/polars/expr/string.py index 96caa774aafb..695d0cba55fd 100644 --- a/py-polars/polars/expr/string.py +++ b/py-polars/polars/expr/string.py @@ -854,6 +854,11 @@ def zfill(self, length: int) -> Expr: -------- pad_start + Notes + ----- + This method is intended for padding numeric strings. It measures length in + bytes, so for non-ASCII data use :func:`pad_start` instead. + Examples -------- >>> df = pl.DataFrame({"a": [-1, 123, 999999, None]}) diff --git a/py-polars/polars/series/string.py b/py-polars/polars/series/string.py index 57e613c04c64..1b8d29d2d693 100644 --- a/py-polars/polars/series/string.py +++ b/py-polars/polars/series/string.py @@ -1355,6 +1355,11 @@ def zfill(self, length: int) -> Series: -------- pad_start + Notes + ----- + This method is intended for padding numeric strings. It measures length in + bytes, so for non-ASCII data use :func:`pad_start` instead. + Examples -------- >>> s = pl.Series([-1, 123, 999999, None])