From f23f2c3977656cade7bfd60955ede6269b3ed4e0 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Tue, 10 Oct 2023 10:53:43 +0200 Subject: [PATCH] depr(python, rust!): Rename `.list.lengths` and `.str.lengths` (#11613) --- crates/polars-arrow/src/kernels/string.rs | 4 +- .../src/chunked_array/strings/namespace.rs | 8 +- .../polars-plan/src/dsl/function_expr/mod.rs | 4 +- .../src/dsl/function_expr/strings.rs | 20 +-- crates/polars-plan/src/dsl/list.rs | 6 +- crates/polars-plan/src/dsl/string.rs | 25 +++- crates/polars-sql/src/functions.rs | 6 +- docs/_build/API_REFERENCE_LINKS.yml | 16 +-- .../python/user-guide/expressions/lists.py | 2 +- .../python/user-guide/expressions/strings.py | 4 +- .../expressions/user-defined-functions.py | 2 +- .../rust/user-guide/expressions/strings.rs | 4 +- docs/user-guide/expressions/strings.md | 2 +- .../source/reference/expressions/list.rst | 1 + .../source/reference/expressions/string.rst | 2 + .../docs/source/reference/series/list.rst | 1 + .../docs/source/reference/series/string.rst | 2 + py-polars/polars/config.py | 2 +- py-polars/polars/expr/expr.py | 19 ++- py-polars/polars/expr/list.py | 34 +++-- py-polars/polars/expr/string.py | 123 ++++++++++++------ py-polars/polars/interchange/buffer.py | 2 +- py-polars/polars/series/list.py | 25 +++- py-polars/polars/series/series.py | 6 +- py-polars/polars/series/string.py | 65 ++++++--- py-polars/polars/utils/various.py | 4 +- py-polars/src/expr/list.rs | 4 +- py-polars/src/expr/string.rs | 8 +- .../tests/unit/datatypes/test_categorical.py | 2 +- py-polars/tests/unit/datatypes/test_list.py | 4 +- .../tests/unit/functions/test_whenthen.py | 4 +- py-polars/tests/unit/io/test_parquet.py | 8 +- py-polars/tests/unit/namespaces/test_list.py | 15 +++ .../tests/unit/namespaces/test_string.py | 40 +++--- py-polars/tests/unit/series/test_series.py | 4 +- 35 files changed, 311 insertions(+), 167 deletions(-) diff --git a/crates/polars-arrow/src/kernels/string.rs 
b/crates/polars-arrow/src/kernels/string.rs index 5d4770b2b13e..e348ac1f9548 100644 --- a/crates/polars-arrow/src/kernels/string.rs +++ b/crates/polars-arrow/src/kernels/string.rs @@ -5,7 +5,7 @@ use arrow::datatypes::DataType; use crate::prelude::*; use crate::trusted_len::TrustedLenPush; -pub fn string_lengths(array: &Utf8Array) -> ArrayRef { +pub fn string_len_bytes(array: &Utf8Array) -> ArrayRef { let values = array .offsets() .as_slice() @@ -16,7 +16,7 @@ pub fn string_lengths(array: &Utf8Array) -> ArrayRef { Box::new(array) } -pub fn string_nchars(array: &Utf8Array) -> ArrayRef { +pub fn string_len_chars(array: &Utf8Array) -> ArrayRef { let values = array.values_iter().map(|x| x.chars().count() as u32); let values: Buffer<_> = Vec::from_trusted_len_iter(values).into(); let array = UInt32Array::new(DataType::UInt32, values, array.validity().cloned()); diff --git a/crates/polars-ops/src/chunked_array/strings/namespace.rs b/crates/polars-ops/src/chunked_array/strings/namespace.rs index c594094e9030..b38f6e9f9590 100644 --- a/crates/polars-ops/src/chunked_array/strings/namespace.rs +++ b/crates/polars-ops/src/chunked_array/strings/namespace.rs @@ -145,15 +145,15 @@ pub trait Utf8NameSpaceImpl: AsUtf8 { } /// Get the length of the string values as number of chars. - fn str_n_chars(&self) -> UInt32Chunked { + fn str_len_chars(&self) -> UInt32Chunked { let ca = self.as_utf8(); - ca.apply_kernel_cast(&string_nchars) + ca.apply_kernel_cast(&string_len_chars) } /// Get the length of the string values as number of bytes. - fn str_lengths(&self) -> UInt32Chunked { + fn str_len_bytes(&self) -> UInt32Chunked { let ca = self.as_utf8(); - ca.apply_kernel_cast(&string_lengths) + ca.apply_kernel_cast(&string_len_bytes) } /// Return a copy of the string left filled with ASCII '0' digits to make a string of length width. 
diff --git a/crates/polars-plan/src/dsl/function_expr/mod.rs b/crates/polars-plan/src/dsl/function_expr/mod.rs index 89215127dd70..da6869a2cca8 100644 --- a/crates/polars-plan/src/dsl/function_expr/mod.rs +++ b/crates/polars-plan/src/dsl/function_expr/mod.rs @@ -772,8 +772,8 @@ impl From for SpecialEq> { ExtractGroups { pat, dtype } => { map!(strings::extract_groups, &pat, &dtype) }, - NChars => map!(strings::n_chars), - Length => map!(strings::lengths), + LenBytes => map!(strings::len_bytes), + LenChars => map!(strings::len_chars), #[cfg(feature = "string_justify")] Zfill(alignment) => { map!(strings::zfill, alignment) diff --git a/crates/polars-plan/src/dsl/function_expr/strings.rs b/crates/polars-plan/src/dsl/function_expr/strings.rs index 2da29bf58158..296aefed0ae7 100644 --- a/crates/polars-plan/src/dsl/function_expr/strings.rs +++ b/crates/polars-plan/src/dsl/function_expr/strings.rs @@ -44,8 +44,8 @@ pub enum StringFunction { }, #[cfg(feature = "string_from_radix")] FromRadix(u32, bool), - NChars, - Length, + LenBytes, + LenChars, #[cfg(feature = "string_justify")] LJust { width: usize, @@ -114,8 +114,8 @@ impl StringFunction { FromRadix { .. } => mapper.with_dtype(DataType::Int32), #[cfg(feature = "extract_jsonpath")] JsonExtract { dtype, .. } => mapper.with_opt_dtype(dtype.clone()), - Length => mapper.with_dtype(DataType::UInt32), - NChars => mapper.with_dtype(DataType::UInt32), + LenBytes => mapper.with_dtype(DataType::UInt32), + LenChars => mapper.with_dtype(DataType::UInt32), #[cfg(feature = "regex")] Replace { .. } => mapper.with_same_dtype(), #[cfg(feature = "temporal")] @@ -173,9 +173,9 @@ impl Display for StringFunction { StringFunction::JsonExtract { .. } => "json_extract", #[cfg(feature = "string_justify")] StringFunction::LJust { .. 
} => "ljust", - StringFunction::Length => "lengths", + StringFunction::LenBytes => "len_bytes", StringFunction::Lowercase => "lowercase", - StringFunction::NChars => "n_chars", + StringFunction::LenChars => "len_chars", #[cfg(feature = "string_justify")] StringFunction::RJust { .. } => "rjust", #[cfg(feature = "regex")] @@ -234,14 +234,14 @@ pub(super) fn titlecase(s: &Series) -> PolarsResult { Ok(ca.to_titlecase().into_series()) } -pub(super) fn n_chars(s: &Series) -> PolarsResult { +pub(super) fn len_chars(s: &Series) -> PolarsResult { let ca = s.utf8()?; - Ok(ca.str_n_chars().into_series()) + Ok(ca.str_len_chars().into_series()) } -pub(super) fn lengths(s: &Series) -> PolarsResult { +pub(super) fn len_bytes(s: &Series) -> PolarsResult { let ca = s.utf8()?; - Ok(ca.str_lengths().into_series()) + Ok(ca.str_len_bytes().into_series()) } #[cfg(feature = "regex")] diff --git a/crates/polars-plan/src/dsl/list.rs b/crates/polars-plan/src/dsl/list.rs index 0e19de499679..6e9bde5b68eb 100644 --- a/crates/polars-plan/src/dsl/list.rs +++ b/crates/polars-plan/src/dsl/list.rs @@ -34,8 +34,10 @@ impl ListNameSpace { .map_private(FunctionExpr::ListExpr(ListFunction::DropNulls)) } - /// Get lengths of the arrays in the List type. - pub fn lengths(self) -> Expr { + /// Return the number of elements in each list. + /// + /// Null values are treated like regular elements in this context. + pub fn len(self) -> Expr { self.0 .map_private(FunctionExpr::ListExpr(ListFunction::Length)) } diff --git a/crates/polars-plan/src/dsl/string.rs b/crates/polars-plan/src/dsl/string.rs index 70e4cfa21373..1fdebe23f676 100644 --- a/crates/polars-plan/src/dsl/string.rs +++ b/crates/polars-plan/src/dsl/string.rs @@ -369,16 +369,29 @@ impl StringNameSpace { ))) } - /// Return the number of characters in the string (not bytes). - pub fn n_chars(self) -> Expr { + /// Return the length of each string as the number of bytes. 
+ /// + /// When working with non-ASCII text, the length in bytes is not the same + /// as the length in characters. You may want to use + /// [`len_chars`] instead. Note that `len_bytes` is much more + /// performant (_O(1)_) than [`len_chars`] (_O(n)_). + /// + /// [`len_chars`]: StringNameSpace::len_chars + pub fn len_bytes(self) -> Expr { self.0 - .map_private(FunctionExpr::StringExpr(StringFunction::NChars)) + .map_private(FunctionExpr::StringExpr(StringFunction::LenBytes)) } - /// Return the number of bytes in the string (not characters). - pub fn lengths(self) -> Expr { + /// Return the length of each string as the number of characters. + /// + /// When working with ASCII text, use [`len_bytes`] instead to achieve + /// equivalent output with much better performance: + /// [`len_bytes`] runs in _O(1)_, while `len_chars` runs in _O(n)_. + /// + /// [`len_bytes`]: StringNameSpace::len_bytes + pub fn len_chars(self) -> Expr { self.0 - .map_private(FunctionExpr::StringExpr(StringFunction::Length)) + .map_private(FunctionExpr::StringExpr(StringFunction::LenChars)) } /// Slice the string values. 
diff --git a/crates/polars-sql/src/functions.rs b/crates/polars-sql/src/functions.rs index f34289937aee..5822aa8086bf 100644 --- a/crates/polars-sql/src/functions.rs +++ b/crates/polars-sql/src/functions.rs @@ -631,7 +631,7 @@ impl SqlFunctionVisitor<'_> { } })) }), - Length => self.visit_unary(|e| e.str().n_chars()), + Length => self.visit_unary(|e| e.str().len_chars()), Lower => self.visit_unary(|e| e.str().to_lowercase()), LTrim => match function.args.len() { 1 => self.visit_unary(|e| e.str().strip_chars_start(lit(Null))), @@ -641,7 +641,7 @@ impl SqlFunctionVisitor<'_> { function.args.len() ), }, - OctetLength => self.visit_unary(|e| e.str().lengths()), + OctetLength => self.visit_unary(|e| e.str().len_bytes()), RegexpLike => match function.args.len() { 2 => self.visit_binary(|e, s| e.str().contains(s, true)), 3 => self.try_visit_ternary(|e, pat, flags| { @@ -714,7 +714,7 @@ impl SqlFunctionVisitor<'_> { // ---- ArrayContains => self.visit_binary::(|e, s| e.list().contains(s)), ArrayGet => self.visit_binary(|e, i| e.list().get(i)), - ArrayLength => self.visit_unary(|e| e.list().lengths()), + ArrayLength => self.visit_unary(|e| e.list().len()), ArrayMax => self.visit_unary(|e| e.list().max()), ArrayMean => self.visit_unary(|e| e.list().mean()), ArrayMin => self.visit_unary(|e| e.list().min()), diff --git a/docs/_build/API_REFERENCE_LINKS.yml b/docs/_build/API_REFERENCE_LINKS.yml index d68415e84f5f..bad064287873 100644 --- a/docs/_build/API_REFERENCE_LINKS.yml +++ b/docs/_build/API_REFERENCE_LINKS.yml @@ -142,8 +142,8 @@ python: str.replace_all: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.replace_all.html str.to_datetime: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.to_datetime.html str.to_date: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.to_date.html - str.n_chars: 
https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.n_chars.html - str.lengths: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.lengths.html + str.len_chars: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.len_chars.html + str.len_bytes: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.len_bytes.html struct.field: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.struct.field.html struct.rename_fields: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.struct.rename_fields.html @@ -354,12 +354,12 @@ rust: name: str.replace_all link: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.to_datetime feature_flags: [dtype-datetime] - str.n_chars: - name: str.n_chars - link: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.n_chars - str.lengths: - name: str.lengths - link: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.lengths + str.len_chars: + name: str.len_chars + link: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.len_chars + str.len_bytes: + name: str.len_bytes + link: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.len_bytes struct.rename_fields: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/struct.StructNameSpace.html#method.rename_fields struct.field: diff --git a/docs/src/python/user-guide/expressions/lists.py b/docs/src/python/user-guide/expressions/lists.py index d81dac154461..5703a01a5518 100644 --- a/docs/src/python/user-guide/expressions/lists.py +++ b/docs/src/python/user-guide/expressions/lists.py @@ -35,7 +35,7 @@ out = 
weather.with_columns(pl.col("temperatures").str.split(" ")).with_columns( pl.col("temperatures").list.head(3).alias("top3"), pl.col("temperatures").list.slice(-3, 3).alias("bottom_3"), - pl.col("temperatures").list.lengths().alias("obs"), + pl.col("temperatures").list.len().alias("obs"), ) print(out) # --8<-- [end:list_ops] diff --git a/docs/src/python/user-guide/expressions/strings.py b/docs/src/python/user-guide/expressions/strings.py index 9bec188f8930..379c20358feb 100644 --- a/docs/src/python/user-guide/expressions/strings.py +++ b/docs/src/python/user-guide/expressions/strings.py @@ -8,8 +8,8 @@ df = pl.DataFrame({"animal": ["Crab", "cat and dog", "rab$bit", None]}) out = df.select( - pl.col("animal").str.lengths().alias("byte_count"), - pl.col("animal").str.n_chars().alias("letter_count"), + pl.col("animal").str.len_bytes().alias("byte_count"), + pl.col("animal").str.len_chars().alias("letter_count"), ) print(out) # --8<-- [end:df] diff --git a/docs/src/python/user-guide/expressions/user-defined-functions.py b/docs/src/python/user-guide/expressions/user-defined-functions.py index 89fa51420554..920812babd93 100644 --- a/docs/src/python/user-guide/expressions/user-defined-functions.py +++ b/docs/src/python/user-guide/expressions/user-defined-functions.py @@ -50,7 +50,7 @@ def add_counter(val: int) -> int: pl.struct(["keys", "values"]) .map_elements(lambda x: len(x["keys"]) + x["values"]) .alias("solution_apply"), - (pl.col("keys").str.lengths() + pl.col("values")).alias("solution_expr"), + (pl.col("keys").str.len_bytes() + pl.col("values")).alias("solution_expr"), ) print(out) # --8<-- [end:combine] diff --git a/docs/src/rust/user-guide/expressions/strings.rs b/docs/src/rust/user-guide/expressions/strings.rs index f3020e4fa2ce..0b606095ca92 100644 --- a/docs/src/rust/user-guide/expressions/strings.rs +++ b/docs/src/rust/user-guide/expressions/strings.rs @@ -12,8 +12,8 @@ fn main() -> Result<(), Box> { .clone() .lazy() .select([ - 
col("animal").str().lengths().alias("byte_count"), - col("animal").str().n_chars().alias("letter_count"), + col("animal").str().len_bytes().alias("byte_count"), + col("animal").str().len_chars().alias("letter_count"), ]) .collect()?; diff --git a/docs/user-guide/expressions/strings.md b/docs/user-guide/expressions/strings.md index 9f00f7b1268e..93b1c4de93f7 100644 --- a/docs/user-guide/expressions/strings.md +++ b/docs/user-guide/expressions/strings.md @@ -8,7 +8,7 @@ String processing functions are available in the `str` namespace. The `str` namespace can be accessed through the `.str` attribute of a column with `Utf8` data type. In the following example, we create a column named `animal` and compute the length of each element in the column in terms of the number of bytes and the number of characters. If you are working with ASCII text, then the results of these two computations will be the same, and using `lengths` is recommended since it is faster. -{{code_block('user-guide/expressions/strings','df',['str.lengths','str.n_chars'])}} +{{code_block('user-guide/expressions/strings','df',['str.len_bytes','str.len_chars'])}} ```python exec="on" result="text" session="user-guide/strings" --8<-- "python/user-guide/expressions/strings.py:setup" diff --git a/py-polars/docs/source/reference/expressions/list.rst b/py-polars/docs/source/reference/expressions/list.rst index bfbcdf957a2f..d56b44abcc30 100644 --- a/py-polars/docs/source/reference/expressions/list.rst +++ b/py-polars/docs/source/reference/expressions/list.rst @@ -28,6 +28,7 @@ The following methods are available under the `expr.list` attribute. 
Expr.list.intersection Expr.list.join Expr.list.last + Expr.list.len Expr.list.lengths Expr.list.max Expr.list.mean diff --git a/py-polars/docs/source/reference/expressions/string.rst b/py-polars/docs/source/reference/expressions/string.rst index ff4e73d7cc4a..2412f2444499 100644 --- a/py-polars/docs/source/reference/expressions/string.rst +++ b/py-polars/docs/source/reference/expressions/string.rst @@ -22,6 +22,8 @@ The following methods are available under the `expr.str` attribute. Expr.str.extract_groups Expr.str.json_extract Expr.str.json_path_match + Expr.str.len_bytes + Expr.str.len_chars Expr.str.lengths Expr.str.ljust Expr.str.lstrip diff --git a/py-polars/docs/source/reference/series/list.rst b/py-polars/docs/source/reference/series/list.rst index ad346388938a..7f3b709e80db 100644 --- a/py-polars/docs/source/reference/series/list.rst +++ b/py-polars/docs/source/reference/series/list.rst @@ -28,6 +28,7 @@ The following methods are available under the `Series.list` attribute. Series.list.join Series.list.intersection Series.list.last + Series.list.len Series.list.lengths Series.list.max Series.list.mean diff --git a/py-polars/docs/source/reference/series/string.rst b/py-polars/docs/source/reference/series/string.rst index a47113010c5b..5fa2efc643a8 100644 --- a/py-polars/docs/source/reference/series/string.rst +++ b/py-polars/docs/source/reference/series/string.rst @@ -22,6 +22,8 @@ The following methods are available under the `Series.str` attribute. Series.str.extract_groups Series.str.json_extract Series.str.json_path_match + Series.str.len_bytes + Series.str.len_chars Series.str.lengths Series.str.ljust Series.str.lstrip diff --git a/py-polars/polars/config.py b/py-polars/polars/config.py index ad57e463fda8..2a327d297997 100644 --- a/py-polars/polars/config.py +++ b/py-polars/polars/config.py @@ -440,7 +440,7 @@ def set_fmt_str_lengths(cls, n: int | None) -> type[Config]: ... ] ... } ... 
) - >>> df.with_columns(pl.col("txt").str.lengths().alias("len")) + >>> df.with_columns(pl.col("txt").str.len_bytes().alias("len")) shape: (2, 2) ┌───────────────────────────────────┬─────┐ │ txt ┆ len │ diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index db0b18707135..67529f9e4e35 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -1241,15 +1241,15 @@ def agg_groups(self) -> Self: def count(self) -> Self: """ - Count the number of values in this expression. + Return the number of elements in the column. .. warning:: - `null` is deemed a value in this context. + Null values are treated like regular elements in this context. Examples -------- >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) - >>> df.select(pl.all().count()) # counts nulls + >>> df.select(pl.all().count()) shape: (1, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -1264,19 +1264,16 @@ def count(self) -> Self: def len(self) -> Self: """ - Count the number of values in this expression. + Return the number of elements in the column. + + Null values are treated like regular elements in this context. Alias for :func:`count`. Examples -------- - >>> df = pl.DataFrame( - ... { - ... "a": [8, 9, 10], - ... "b": [None, 4, 4], - ... } - ... ) - >>> df.select(pl.all().len()) # counts nulls + >>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]}) + >>> df.select(pl.all().len()) shape: (1, 2) ┌─────┬─────┐ │ a ┆ b │ diff --git a/py-polars/polars/expr/list.py b/py-polars/polars/expr/list.py index 2ea3d1291ed6..5a41f4e2f213 100644 --- a/py-polars/polars/expr/list.py +++ b/py-polars/polars/expr/list.py @@ -86,26 +86,33 @@ def any(self) -> Expr: """ return wrap_expr(self._pyexpr.list_any()) - def lengths(self) -> Expr: + def len(self) -> Expr: """ - Get the length of the arrays as UInt32. + Return the number of elements in each list. + + Null values are treated like regular elements in this context. 
+ + Returns + ------- + Expr + Expression of data type :class:`UInt32`. Examples -------- - >>> df = pl.DataFrame({"foo": [1, 2], "bar": [["a", "b"], ["c"]]}) - >>> df.select(pl.col("bar").list.lengths()) + >>> df = pl.DataFrame({"a": [[1, 2, None], [5]]}) + >>> df.select(pl.col("a").list.len()) shape: (2, 1) ┌─────┐ - │ bar │ + │ a │ │ --- │ │ u32 │ ╞═════╡ - │ 2 │ + │ 3 │ │ 1 │ └─────┘ """ - return wrap_expr(self._pyexpr.list_lengths()) + return wrap_expr(self._pyexpr.list_len()) def drop_nulls(self) -> Expr: """ @@ -217,7 +224,7 @@ def mean(self) -> Expr: def sort(self, *, descending: bool = False) -> Expr: """ - Sort the arrays in this column. + Sort the lists in this column. Parameters ---------- @@ -1128,3 +1135,14 @@ def count_match(self, element: IntoExpr) -> Expr: """ return self.count_matches(element) + + @deprecate_renamed_function("len", version="0.19.8") + def lengths(self) -> Expr: + """ + Return the number of elements in each list. + + .. deprecated:: 0.19.8 + This method has been renamed to :func:`len`. + + """ + return self.len() diff --git a/py-polars/polars/expr/string.py b/py-polars/polars/expr/string.py index a43e96445929..0ed81e7b1b83 100644 --- a/py-polars/polars/expr/string.py +++ b/py-polars/polars/expr/string.py @@ -369,71 +369,88 @@ def to_decimal( """ return wrap_expr(self._pyexpr.str_to_decimal(inference_length)) - def lengths(self) -> Expr: + def len_bytes(self) -> Expr: """ - Get length of the strings as UInt32 (as number of bytes). + Return the length of each string as the number of bytes. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + See Also + -------- + len_chars Notes ----- - The returned lengths are equal to the number of bytes in the UTF8 string. If you - need the length in terms of the number of characters, use ``n_chars`` instead. + When working with non-ASCII text, the length in bytes is not the same as the + length in characters. You may want to use :func:`len_chars` instead. 
+ Note that :func:`len_bytes` is much more performant (_O(1)_) than + :func:`len_chars` (_O(n)_). Examples -------- - >>> df = pl.DataFrame({"s": ["Café", None, "345", "東京"]}).with_columns( - ... [ - ... pl.col("s").str.lengths().alias("length"), - ... pl.col("s").str.n_chars().alias("nchars"), - ... ] + >>> df = pl.DataFrame({"a": ["Café", "345", "東京", None]}) + >>> df.with_columns( + ... pl.col("a").str.len_bytes().alias("n_bytes"), + ... pl.col("a").str.len_chars().alias("n_chars"), ... ) - >>> df shape: (4, 3) - ┌──────┬────────┬────────┐ - │ s ┆ length ┆ nchars │ - │ --- ┆ --- ┆ --- │ - │ str ┆ u32 ┆ u32 │ - ╞══════╪════════╪════════╡ - │ Café ┆ 5 ┆ 4 │ - │ null ┆ null ┆ null │ - │ 345 ┆ 3 ┆ 3 │ - │ 東京 ┆ 6 ┆ 2 │ - └──────┴────────┴────────┘ + ┌──────┬─────────┬─────────┐ + │ a ┆ n_bytes ┆ n_chars │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u32 ┆ u32 │ + ╞══════╪═════════╪═════════╡ + │ Café ┆ 5 ┆ 4 │ + │ 345 ┆ 3 ┆ 3 │ + │ 東京 ┆ 6 ┆ 2 │ + │ null ┆ null ┆ null │ + └──────┴─────────┴─────────┘ """ - return wrap_expr(self._pyexpr.str_lengths()) + return wrap_expr(self._pyexpr.str_len_bytes()) - def n_chars(self) -> Expr: + def len_chars(self) -> Expr: """ - Get length of the strings as UInt32 (as number of chars). + Return the length of each string as the number of characters. + + Returns + ------- + Expr + Expression of data type :class:`UInt32`. + + See Also + -------- + len_bytes Notes ----- - If you know that you are working with ASCII text, ``lengths`` will be - equivalent, and faster (returns length in terms of the number of bytes). + When working with ASCII text, use :func:`len_bytes` instead to achieve + equivalent output with much better performance: + :func:`len_bytes` runs in _O(1)_, while :func:`len_chars` runs in _O(n)_. Examples -------- - >>> df = pl.DataFrame({"s": ["Café", None, "345", "東京"]}).with_columns( - ... [ - ... pl.col("s").str.n_chars().alias("nchars"), - ... pl.col("s").str.lengths().alias("length"), - ... 
] + >>> df = pl.DataFrame({"a": ["Café", "345", "東京", None]}) + >>> df.with_columns( + ... pl.col("a").str.len_chars().alias("n_chars"), + ... pl.col("a").str.len_bytes().alias("n_bytes"), ... ) - >>> df shape: (4, 3) - ┌──────┬────────┬────────┐ - │ s ┆ nchars ┆ length │ - │ --- ┆ --- ┆ --- │ - │ str ┆ u32 ┆ u32 │ - ╞══════╪════════╪════════╡ - │ Café ┆ 4 ┆ 5 │ - │ null ┆ null ┆ null │ - │ 345 ┆ 3 ┆ 3 │ - │ 東京 ┆ 2 ┆ 6 │ - └──────┴────────┴────────┘ + ┌──────┬─────────┬─────────┐ + │ a ┆ n_chars ┆ n_bytes │ + │ --- ┆ --- ┆ --- │ + │ str ┆ u32 ┆ u32 │ + ╞══════╪═════════╪═════════╡ + │ Café ┆ 4 ┆ 5 │ + │ 345 ┆ 3 ┆ 3 │ + │ 東京 ┆ 2 ┆ 6 │ + │ null ┆ null ┆ null │ + └──────┴─────────┴─────────┘ """ - return wrap_expr(self._pyexpr.str_n_chars()) + return wrap_expr(self._pyexpr.str_len_chars()) def concat(self, delimiter: str = "-") -> Expr: """ @@ -2055,6 +2072,28 @@ def count_match(self, pattern: str | Expr) -> Expr: """ return self.count_matches(pattern) + @deprecate_renamed_function("len_bytes", version="0.19.8") + def lengths(self) -> Expr: + """ + Return the length of each string as the number of bytes. + + .. deprecated:: 0.19.8 + This method has been renamed to :func:`len_bytes`. + + """ + return self.len_bytes() + + @deprecate_renamed_function("len_chars", version="0.19.8") + def n_chars(self) -> Expr: + """ + Return the length of each string as the number of characters. + + .. deprecated:: 0.19.8 + This method has been renamed to :func:`len_chars`. 
+ + """ + return self.len_chars() + def _validate_format_argument(format: str | None) -> None: if format is not None and ".%f" in format: diff --git a/py-polars/polars/interchange/buffer.py b/py-polars/polars/interchange/buffer.py index 81c9e01d7dcc..99f1840b07a3 100644 --- a/py-polars/polars/interchange/buffer.py +++ b/py-polars/polars/interchange/buffer.py @@ -46,7 +46,7 @@ def bufsize(self) -> int: dtype = polars_dtype_to_dtype(self._data.dtype) if dtype[0] == DtypeKind.STRING: - return self._data.str.lengths().sum() # type: ignore[return-value] + return self._data.str.len_bytes().sum() # type: ignore[return-value] n_bits = self._data.len() * dtype[1] diff --git a/py-polars/polars/series/list.py b/py-polars/polars/series/list.py index 8f607ee1e42e..3b883df4dea3 100644 --- a/py-polars/polars/series/list.py +++ b/py-polars/polars/series/list.py @@ -81,14 +81,21 @@ def any(self) -> Expr: """ - def lengths(self) -> Series: + def len(self) -> Series: """ - Get the length of the arrays as UInt32. + Return the number of elements in each list. + + Null values are treated like regular elements in this context. + + Returns + ------- + Series + Series of data type :class:`UInt32`. Examples -------- - >>> s = pl.Series([[1, 2, 3], [5]]) - >>> s.list.lengths() + >>> s = pl.Series([[1, 2, None], [5]]) + >>> s.list.len() shape: (2,) Series: '' [u32] [ @@ -730,3 +737,13 @@ def count_match( An expression that produces a single value """ + + @deprecate_renamed_function("len", version="0.19.8") + def lengths(self) -> Series: + """ + Return the number of elements in each list. + + .. deprecated:: 0.19.8 + This method has been renamed to :func:`len`. + + """ diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index 0c2834de5fd6..40be89470220 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -3719,11 +3719,13 @@ def series_equal( def len(self) -> int: """ - Length of this Series. 
+ Return the number of elements in this Series. + + Null values are treated like regular elements in this context. Examples -------- - >>> s = pl.Series("a", [1, 2, 3]) + >>> s = pl.Series("a", [1, 2, None]) >>> s.len() 3 diff --git a/py-polars/polars/series/string.py b/py-polars/polars/series/string.py index ff484633c98e..36b716e230a7 100644 --- a/py-polars/polars/series/string.py +++ b/py-polars/polars/series/string.py @@ -318,60 +318,71 @@ def to_decimal( """ - def lengths(self) -> Series: + def len_bytes(self) -> Series: """ - Get length of the string values in the Series (as number of bytes). - - Notes - ----- - The returned lengths are equal to the number of bytes in the UTF8 string. If you - need the length in terms of the number of characters, use ``n_chars`` instead. + Return the length of each string as the number of bytes. Returns ------- Series Series of data type :class:`UInt32`. + See Also + -------- + len_chars + + Notes + ----- + When working with non-ASCII text, the length in bytes is not the same as the + length in characters. You may want to use :func:`len_chars` instead. + Note that :func:`len_bytes` is much more performant (_O(1)_) than + :func:`len_chars` (_O(n)_). + Examples -------- - >>> s = pl.Series(["Café", None, "345", "東京"]) - >>> s.str.lengths() + >>> s = pl.Series(["Café", "345", "東京", None]) + >>> s.str.len_bytes() shape: (4,) Series: '' [u32] [ 5 - null 3 6 + null ] """ - def n_chars(self) -> Series: + def len_chars(self) -> Series: """ - Get length of the string values in the Series (as number of chars). + Return the length of each string as the number of characters. Returns ------- Series Series of data type :class:`UInt32`. + See Also + -------- + len_bytes + Notes ----- - If you know that you are working with ASCII text, ``lengths`` will be - equivalent, and faster (returns length in terms of the number of bytes). 
+ When working with ASCII text, use :func:`len_bytes` instead to achieve + equivalent output with much better performance: + :func:`len_bytes` runs in _O(1)_, while :func:`len_chars` runs in _O(n)_. Examples -------- - >>> s = pl.Series(["Café", None, "345", "東京"]) - >>> s.str.n_chars() + >>> s = pl.Series(["Café", "345", "東京", None]) + >>> s.str.len_chars() shape: (4,) Series: '' [u32] [ 4 - null 3 2 + null ] """ @@ -1575,3 +1586,23 @@ def count_match(self, pattern: str | Series) -> Series: value is null. """ + + @deprecate_renamed_function("len_bytes", version="0.19.8") + def lengths(self) -> Series: + """ + Return the number of bytes in each string. + + .. deprecated:: 0.19.8 + This method has been renamed to :func:`len_bytes`. + + """ + + @deprecate_renamed_function("len_chars", version="0.19.8") + def n_chars(self) -> Series: + """ + Return the length of each string as the number of characters. + + .. deprecated:: 0.19.8 + This method has been renamed to :func:`len_chars`. + + """ diff --git a/py-polars/polars/utils/various.py b/py-polars/polars/utils/various.py index 192a14c0bad4..a7a0e93fc4f9 100644 --- a/py-polars/polars/utils/various.py +++ b/py-polars/polars/utils/various.py @@ -284,7 +284,7 @@ def str_duration_(td: str | None) -> int | None: tp_base = Datetime(tp.time_unit) # type: ignore[union-attr] d = F.col(c).str.replace(r"[A-Z ]+$", "") cast_cols[c] = ( - F.when(d.str.lengths() == 19) + F.when(d.str.len_bytes() == 19) .then(d + ".000000000") .otherwise(d + "000000000") .str.slice(0, 29) @@ -296,7 +296,7 @@ def str_duration_(td: str | None) -> int | None: cast_cols[c] = F.col(c).str.strptime(tp, "%Y-%m-%d") # type: ignore[arg-type] elif tp == Time: cast_cols[c] = ( - F.when(F.col(c).str.lengths() == 8) + F.when(F.col(c).str.len_bytes() == 8) .then(F.col(c) + ".000000000") .otherwise(F.col(c) + "000000000") .str.slice(0, 18) diff --git a/py-polars/src/expr/list.rs b/py-polars/src/expr/list.rs index accd5529cf0b..dbac07c08a3b 100644 --- 
a/py-polars/src/expr/list.rs +++ b/py-polars/src/expr/list.rs @@ -53,8 +53,8 @@ impl PyExpr { self.inner.clone().list().join(separator.inner).into() } - fn list_lengths(&self) -> Self { - self.inner.clone().list().lengths().into() + fn list_len(&self) -> Self { + self.inner.clone().list().len().into() } fn list_max(&self) -> Self { diff --git a/py-polars/src/expr/string.rs b/py-polars/src/expr/string.rs index 4aefb2f4162b..2c5d1a4be641 100644 --- a/py-polars/src/expr/string.rs +++ b/py-polars/src/expr/string.rs @@ -112,12 +112,12 @@ impl PyExpr { self.inner.clone().str().to_titlecase().into() } - fn str_lengths(&self) -> Self { - self.inner.clone().str().lengths().into() + fn str_len_bytes(&self) -> Self { + self.inner.clone().str().len_bytes().into() } - fn str_n_chars(&self) -> Self { - self.inner.clone().str().n_chars().into() + fn str_len_chars(&self) -> Self { + self.inner.clone().str().len_chars().into() } #[cfg(feature = "lazy_regex")] diff --git a/py-polars/tests/unit/datatypes/test_categorical.py b/py-polars/tests/unit/datatypes/test_categorical.py index 69c9d56a007d..af727b187a75 100644 --- a/py-polars/tests/unit/datatypes/test_categorical.py +++ b/py-polars/tests/unit/datatypes/test_categorical.py @@ -310,7 +310,7 @@ def test_nested_categorical_aggregation_7848() -> None: ).with_columns([pl.col("letter").cast(pl.Categorical)]).group_by( maintain_order=True, by=["group"] ).all().with_columns( - [pl.col("letter").list.lengths().alias("c_group")] + [pl.col("letter").list.len().alias("c_group")] ).group_by( by=["c_group"], maintain_order=True ).agg( diff --git a/py-polars/tests/unit/datatypes/test_list.py b/py-polars/tests/unit/datatypes/test_list.py index 358f62595ba7..abe5ec123780 100644 --- a/py-polars/tests/unit/datatypes/test_list.py +++ b/py-polars/tests/unit/datatypes/test_list.py @@ -133,7 +133,7 @@ def test_list_fill_null() -> None: df = pl.DataFrame({"C": [["a", "b", "c"], [], [], ["d", "e"]]}) assert df.with_columns( [ - 
pl.when(pl.col("C").list.lengths() == 0) + pl.when(pl.col("C").list.len() == 0) .then(None) .otherwise(pl.col("C")) .alias("C") @@ -144,7 +144,7 @@ def test_list_fill_null() -> None: def test_list_fill_list() -> None: assert pl.DataFrame({"a": [[1, 2, 3], []]}).select( [ - pl.when(pl.col("a").list.lengths() == 0) + pl.when(pl.col("a").list.len() == 0) .then([5]) .otherwise(pl.col("a")) .alias("filled") diff --git a/py-polars/tests/unit/functions/test_whenthen.py b/py-polars/tests/unit/functions/test_whenthen.py index c647ded4fa49..591ef9a6f4da 100644 --- a/py-polars/tests/unit/functions/test_whenthen.py +++ b/py-polars/tests/unit/functions/test_whenthen.py @@ -190,7 +190,7 @@ def test_when_then_edge_cases_3994() -> None: .group_by(["id"]) .agg(pl.col("type")) .with_columns( - pl.when(pl.col("type").list.lengths() == 0) + pl.when(pl.col("type").list.len() == 0) .then(pl.lit(None)) .otherwise(pl.col("type")) .keep_name() @@ -204,7 +204,7 @@ def test_when_then_edge_cases_3994() -> None: .group_by(["id"]) .agg(pl.col("type")) .with_columns( - pl.when(pl.col("type").list.lengths() == 0) + pl.when(pl.col("type").list.len() == 0) .then(pl.lit(None)) .otherwise(pl.col("type")) .keep_name() diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index 18a4d6863669..7aed3a5fdcf4 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -508,9 +508,5 @@ def test_nested_list_page_reads_to_end_11548() -> None: f.seek(0) - assert pl.read_parquet(f).select( - pl.col("x").list.lengths() - ).to_series().to_list() == [ - 2048, - 2048, - ] + result = pl.read_parquet(f).select(pl.col("x").list.len()) + assert result.to_series().to_list() == [2048, 2048] diff --git a/py-polars/tests/unit/namespaces/test_list.py b/py-polars/tests/unit/namespaces/test_list.py index 9eb39b89353f..9c0eeb51e063 100644 --- a/py-polars/tests/unit/namespaces/test_list.py +++ b/py-polars/tests/unit/namespaces/test_list.py @@ 
-612,3 +612,18 @@ def test_utf8_empty_series_arg_min_max_10703() -> None: "arg_min": [0, None], "arg_max": [0, None], } + + +def test_list_len() -> None: + s = pl.Series([[1, 2, None], [5]]) + result = s.list.len() + expected = pl.Series([3, 1], dtype=pl.UInt32) + assert_series_equal(result, expected) + + +def test_list_lengths_deprecated() -> None: + s = pl.Series([[1, 2, None], [5]]) + with pytest.deprecated_call(): + result = s.list.lengths() + expected = pl.Series([3, 1], dtype=pl.UInt32) + assert_series_equal(result, expected) diff --git a/py-polars/tests/unit/namespaces/test_string.py b/py-polars/tests/unit/namespaces/test_string.py index fce10f852d63..3e78970a1470 100644 --- a/py-polars/tests/unit/namespaces/test_string.py +++ b/py-polars/tests/unit/namespaces/test_string.py @@ -37,16 +37,34 @@ def test_str_concat_datetime() -> None: ) -def test_str_lengths() -> None: +def test_str_len_bytes() -> None: s = pl.Series(["Café", None, "345", "東京"]) + result = s.str.len_bytes() expected = pl.Series([5, None, 3, 6], dtype=pl.UInt32) - assert_series_equal(s.str.lengths(), expected) + assert_series_equal(result, expected) + + +def test_str_lengths_deprecated() -> None: + s = pl.Series(["Café", None, "345", "東京"]) + with pytest.deprecated_call(): + result = s.str.lengths() + expected = pl.Series([5, None, 3, 6], dtype=pl.UInt32) + assert_series_equal(result, expected) -def test_str_n_chars() -> None: +def test_str_len_chars() -> None: s = pl.Series(["Café", None, "345", "東京"]) + result = s.str.len_chars() expected = pl.Series([4, None, 3, 2], dtype=pl.UInt32) - assert_series_equal(s.str.n_chars(), expected) + assert_series_equal(result, expected) + + +def test_str_n_chars_deprecated() -> None: + s = pl.Series(["Café", None, "345", "東京"]) + with pytest.deprecated_call(): + result = s.str.n_chars() + expected = pl.Series([4, None, 3, 2], dtype=pl.UInt32) + assert_series_equal(result, expected) def test_str_contains() -> None: @@ -832,9 +850,9 @@ def 
test_ljust_and_rjust() -> None: df.select( [ pl.col("a").str.rjust(10).alias("rjust"), - pl.col("a").str.rjust(10).str.lengths().alias("rjust_len"), + pl.col("a").str.rjust(10).str.len_bytes().alias("rjust_len"), pl.col("a").str.ljust(10).alias("ljust"), - pl.col("a").str.ljust(10).str.lengths().alias("ljust_len"), + pl.col("a").str.ljust(10).str.len_bytes().alias("ljust_len"), ] ).to_dict(False) ) == { @@ -879,16 +897,6 @@ def test_json_path_match_type_4905() -> None: ).to_dict(False) == {"json_val": ['{"a":"hello"}']} -def test_length_vs_nchars() -> None: - df = pl.DataFrame({"s": ["café", "東京"]}).with_columns( - [ - pl.col("s").str.lengths().alias("length"), - pl.col("s").str.n_chars().alias("nchars"), - ] - ) - assert df.rows() == [("café", 5, 4), ("東京", 6, 2)] - - def test_decode_strict() -> None: df = pl.DataFrame( {"strings": ["0IbQvTc3", "0J%2FQldCf0JA%3D", "0J%2FRgNC%2B0YHRgtC%2B"]} diff --git a/py-polars/tests/unit/series/test_series.py b/py-polars/tests/unit/series/test_series.py index 01744c1f7348..12edd3e80ec9 100644 --- a/py-polars/tests/unit/series/test_series.py +++ b/py-polars/tests/unit/series/test_series.py @@ -1296,10 +1296,10 @@ def test_kurtosis() -> None: def test_arr_lengths() -> None: s = pl.Series("a", [[1, 2], [1, 2, 3]]) - assert_series_equal(s.list.lengths(), pl.Series("a", [2, 3], dtype=UInt32)) + assert_series_equal(s.list.len(), pl.Series("a", [2, 3], dtype=UInt32)) df = pl.DataFrame([s]) assert_series_equal( - df.select(pl.col("a").list.lengths())["a"], pl.Series("a", [2, 3], dtype=UInt32) + df.select(pl.col("a").list.len())["a"], pl.Series("a", [2, 3], dtype=UInt32) )