Skip to content

Commit

Permalink
depr(python, rust!): Rename .list.lengths and .str.lengths (#11613)
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego authored Oct 10, 2023
1 parent 543cea5 commit f23f2c3
Show file tree
Hide file tree
Showing 35 changed files with 311 additions and 167 deletions.
4 changes: 2 additions & 2 deletions crates/polars-arrow/src/kernels/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use arrow::datatypes::DataType;
use crate::prelude::*;
use crate::trusted_len::TrustedLenPush;

pub fn string_lengths(array: &Utf8Array<i64>) -> ArrayRef {
pub fn string_len_bytes(array: &Utf8Array<i64>) -> ArrayRef {
let values = array
.offsets()
.as_slice()
Expand All @@ -16,7 +16,7 @@ pub fn string_lengths(array: &Utf8Array<i64>) -> ArrayRef {
Box::new(array)
}

pub fn string_nchars(array: &Utf8Array<i64>) -> ArrayRef {
pub fn string_len_chars(array: &Utf8Array<i64>) -> ArrayRef {
let values = array.values_iter().map(|x| x.chars().count() as u32);
let values: Buffer<_> = Vec::from_trusted_len_iter(values).into();
let array = UInt32Array::new(DataType::UInt32, values, array.validity().cloned());
Expand Down
8 changes: 4 additions & 4 deletions crates/polars-ops/src/chunked_array/strings/namespace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -145,15 +145,15 @@ pub trait Utf8NameSpaceImpl: AsUtf8 {
}

/// Get the length of the string values as number of chars.
fn str_n_chars(&self) -> UInt32Chunked {
fn str_len_chars(&self) -> UInt32Chunked {
let ca = self.as_utf8();
ca.apply_kernel_cast(&string_nchars)
ca.apply_kernel_cast(&string_len_chars)
}

/// Get the length of the string values as number of bytes.
fn str_lengths(&self) -> UInt32Chunked {
fn str_len_bytes(&self) -> UInt32Chunked {
let ca = self.as_utf8();
ca.apply_kernel_cast(&string_lengths)
ca.apply_kernel_cast(&string_len_bytes)
}

/// Return a copy of the string left filled with ASCII '0' digits to make a string of length width.
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-plan/src/dsl/function_expr/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -772,8 +772,8 @@ impl From<StringFunction> for SpecialEq<Arc<dyn SeriesUdf>> {
ExtractGroups { pat, dtype } => {
map!(strings::extract_groups, &pat, &dtype)
},
NChars => map!(strings::n_chars),
Length => map!(strings::lengths),
LenBytes => map!(strings::len_bytes),
LenChars => map!(strings::len_chars),
#[cfg(feature = "string_justify")]
Zfill(alignment) => {
map!(strings::zfill, alignment)
Expand Down
20 changes: 10 additions & 10 deletions crates/polars-plan/src/dsl/function_expr/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ pub enum StringFunction {
},
#[cfg(feature = "string_from_radix")]
FromRadix(u32, bool),
NChars,
Length,
LenBytes,
LenChars,
#[cfg(feature = "string_justify")]
LJust {
width: usize,
Expand Down Expand Up @@ -114,8 +114,8 @@ impl StringFunction {
FromRadix { .. } => mapper.with_dtype(DataType::Int32),
#[cfg(feature = "extract_jsonpath")]
JsonExtract { dtype, .. } => mapper.with_opt_dtype(dtype.clone()),
Length => mapper.with_dtype(DataType::UInt32),
NChars => mapper.with_dtype(DataType::UInt32),
LenBytes => mapper.with_dtype(DataType::UInt32),
LenChars => mapper.with_dtype(DataType::UInt32),
#[cfg(feature = "regex")]
Replace { .. } => mapper.with_same_dtype(),
#[cfg(feature = "temporal")]
Expand Down Expand Up @@ -173,9 +173,9 @@ impl Display for StringFunction {
StringFunction::JsonExtract { .. } => "json_extract",
#[cfg(feature = "string_justify")]
StringFunction::LJust { .. } => "ljust",
StringFunction::Length => "lengths",
StringFunction::LenBytes => "len_bytes",
StringFunction::Lowercase => "lowercase",
StringFunction::NChars => "n_chars",
StringFunction::LenChars => "len_chars",
#[cfg(feature = "string_justify")]
StringFunction::RJust { .. } => "rjust",
#[cfg(feature = "regex")]
Expand Down Expand Up @@ -234,14 +234,14 @@ pub(super) fn titlecase(s: &Series) -> PolarsResult<Series> {
Ok(ca.to_titlecase().into_series())
}

pub(super) fn n_chars(s: &Series) -> PolarsResult<Series> {
pub(super) fn len_chars(s: &Series) -> PolarsResult<Series> {
let ca = s.utf8()?;
Ok(ca.str_n_chars().into_series())
Ok(ca.str_len_chars().into_series())
}

pub(super) fn lengths(s: &Series) -> PolarsResult<Series> {
pub(super) fn len_bytes(s: &Series) -> PolarsResult<Series> {
let ca = s.utf8()?;
Ok(ca.str_lengths().into_series())
Ok(ca.str_len_bytes().into_series())
}

#[cfg(feature = "regex")]
Expand Down
6 changes: 4 additions & 2 deletions crates/polars-plan/src/dsl/list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,10 @@ impl ListNameSpace {
.map_private(FunctionExpr::ListExpr(ListFunction::DropNulls))
}

/// Get lengths of the arrays in the List type.
pub fn lengths(self) -> Expr {
/// Return the number of elements in each list.
///
/// Null values are treated like regular elements in this context.
pub fn len(self) -> Expr {
self.0
.map_private(FunctionExpr::ListExpr(ListFunction::Length))
}
Expand Down
25 changes: 19 additions & 6 deletions crates/polars-plan/src/dsl/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -369,16 +369,29 @@ impl StringNameSpace {
)))
}

/// Return the number of characters in the string (not bytes).
pub fn n_chars(self) -> Expr {
/// Return the length of each string as the number of bytes.
///
/// When working with non-ASCII text, the length in bytes is not the same
/// as the length in characters. You may want to use
/// [`len_chars`] instead. Note that `len_bytes` is much more
/// performant (_O(1)_) than [`len_chars`] (_O(n)_).
///
/// [`len_chars`]: StringNameSpace::len_chars
pub fn len_bytes(self) -> Expr {
self.0
.map_private(FunctionExpr::StringExpr(StringFunction::NChars))
.map_private(FunctionExpr::StringExpr(StringFunction::LenBytes))
}

/// Return the number of bytes in the string (not characters).
pub fn lengths(self) -> Expr {
/// Return the length of each string as the number of characters.
///
/// When working with ASCII text, use [`len_bytes`] instead to achieve
/// equivalent output with much better performance:
/// [`len_bytes`] runs in _O(1)_, while `len_chars` runs in _O(n)_.
///
/// [`len_bytes`]: StringNameSpace::len_bytes
pub fn len_chars(self) -> Expr {
self.0
.map_private(FunctionExpr::StringExpr(StringFunction::Length))
.map_private(FunctionExpr::StringExpr(StringFunction::LenChars))
}

/// Slice the string values.
Expand Down
6 changes: 3 additions & 3 deletions crates/polars-sql/src/functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -631,7 +631,7 @@ impl SqlFunctionVisitor<'_> {
}
}))
}),
Length => self.visit_unary(|e| e.str().n_chars()),
Length => self.visit_unary(|e| e.str().len_chars()),
Lower => self.visit_unary(|e| e.str().to_lowercase()),
LTrim => match function.args.len() {
1 => self.visit_unary(|e| e.str().strip_chars_start(lit(Null))),
Expand All @@ -641,7 +641,7 @@ impl SqlFunctionVisitor<'_> {
function.args.len()
),
},
OctetLength => self.visit_unary(|e| e.str().lengths()),
OctetLength => self.visit_unary(|e| e.str().len_bytes()),
RegexpLike => match function.args.len() {
2 => self.visit_binary(|e, s| e.str().contains(s, true)),
3 => self.try_visit_ternary(|e, pat, flags| {
Expand Down Expand Up @@ -714,7 +714,7 @@ impl SqlFunctionVisitor<'_> {
// ----
ArrayContains => self.visit_binary::<Expr>(|e, s| e.list().contains(s)),
ArrayGet => self.visit_binary(|e, i| e.list().get(i)),
ArrayLength => self.visit_unary(|e| e.list().lengths()),
ArrayLength => self.visit_unary(|e| e.list().len()),
ArrayMax => self.visit_unary(|e| e.list().max()),
ArrayMean => self.visit_unary(|e| e.list().mean()),
ArrayMin => self.visit_unary(|e| e.list().min()),
Expand Down
16 changes: 8 additions & 8 deletions docs/_build/API_REFERENCE_LINKS.yml
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,8 @@ python:
str.replace_all: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.replace_all.html
str.to_datetime: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.to_datetime.html
str.to_date: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.to_date.html
str.n_chars: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.n_chars.html
str.lengths: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.lengths.html
str.len_chars: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.len_chars.html
str.len_bytes: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.len_bytes.html

struct.field: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.struct.field.html
struct.rename_fields: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.struct.rename_fields.html
Expand Down Expand Up @@ -354,12 +354,12 @@ rust:
name: str.replace_all
link: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.to_datetime
feature_flags: [dtype-datetime]
str.n_chars:
name: str.n_chars
link: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.n_chars
str.lengths:
name: str.lengths
link: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.lengths
str.len_chars:
name: str.len_chars
link: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.len_chars
str.len_bytes:
name: str.len_bytes
link: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.len_bytes

struct.rename_fields: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/struct.StructNameSpace.html#method.rename_fields
struct.field:
Expand Down
2 changes: 1 addition & 1 deletion docs/src/python/user-guide/expressions/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
out = weather.with_columns(pl.col("temperatures").str.split(" ")).with_columns(
pl.col("temperatures").list.head(3).alias("top3"),
pl.col("temperatures").list.slice(-3, 3).alias("bottom_3"),
pl.col("temperatures").list.lengths().alias("obs"),
pl.col("temperatures").list.len().alias("obs"),
)
print(out)
# --8<-- [end:list_ops]
Expand Down
4 changes: 2 additions & 2 deletions docs/src/python/user-guide/expressions/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
df = pl.DataFrame({"animal": ["Crab", "cat and dog", "rab$bit", None]})

out = df.select(
pl.col("animal").str.lengths().alias("byte_count"),
pl.col("animal").str.n_chars().alias("letter_count"),
pl.col("animal").str.len_bytes().alias("byte_count"),
pl.col("animal").str.len_chars().alias("letter_count"),
)
print(out)
# --8<-- [end:df]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def add_counter(val: int) -> int:
pl.struct(["keys", "values"])
.map_elements(lambda x: len(x["keys"]) + x["values"])
.alias("solution_apply"),
(pl.col("keys").str.lengths() + pl.col("values")).alias("solution_expr"),
(pl.col("keys").str.len_bytes() + pl.col("values")).alias("solution_expr"),
)
print(out)
# --8<-- [end:combine]
4 changes: 2 additions & 2 deletions docs/src/rust/user-guide/expressions/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
.clone()
.lazy()
.select([
col("animal").str().lengths().alias("byte_count"),
col("animal").str().n_chars().alias("letter_count"),
col("animal").str().len_bytes().alias("byte_count"),
col("animal").str().len_chars().alias("letter_count"),
])
.collect()?;

Expand Down
2 changes: 1 addition & 1 deletion docs/user-guide/expressions/strings.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ String processing functions are available in the `str` namespace.

The `str` namespace can be accessed through the `.str` attribute of a column with `Utf8` data type. In the following example, we create a column named `animal` and compute the length of each element in the column in terms of the number of bytes and the number of characters. If you are working with ASCII text, then the results of these two computations will be the same, and using `len_bytes` is recommended since it is faster.

{{code_block('user-guide/expressions/strings','df',['str.lengths','str.n_chars'])}}
{{code_block('user-guide/expressions/strings','df',['str.len_bytes','str.len_chars'])}}

```python exec="on" result="text" session="user-guide/strings"
--8<-- "python/user-guide/expressions/strings.py:setup"
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/expressions/list.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ The following methods are available under the `expr.list` attribute.
Expr.list.intersection
Expr.list.join
Expr.list.last
Expr.list.len
Expr.list.lengths
Expr.list.max
Expr.list.mean
Expand Down
2 changes: 2 additions & 0 deletions py-polars/docs/source/reference/expressions/string.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ The following methods are available under the `expr.str` attribute.
Expr.str.extract_groups
Expr.str.json_extract
Expr.str.json_path_match
Expr.str.len_bytes
Expr.str.len_chars
Expr.str.lengths
Expr.str.ljust
Expr.str.lstrip
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/series/list.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ The following methods are available under the `Series.list` attribute.
Series.list.join
Series.list.intersection
Series.list.last
Series.list.len
Series.list.lengths
Series.list.max
Series.list.mean
Expand Down
2 changes: 2 additions & 0 deletions py-polars/docs/source/reference/series/string.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ The following methods are available under the `Series.str` attribute.
Series.str.extract_groups
Series.str.json_extract
Series.str.json_path_match
Series.str.len_bytes
Series.str.len_chars
Series.str.lengths
Series.str.ljust
Series.str.lstrip
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,7 +440,7 @@ def set_fmt_str_lengths(cls, n: int | None) -> type[Config]:
... ]
... }
... )
>>> df.with_columns(pl.col("txt").str.lengths().alias("len"))
>>> df.with_columns(pl.col("txt").str.len_bytes().alias("len"))
shape: (2, 2)
┌───────────────────────────────────┬─────┐
│ txt ┆ len │
Expand Down
19 changes: 8 additions & 11 deletions py-polars/polars/expr/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -1241,15 +1241,15 @@ def agg_groups(self) -> Self:

def count(self) -> Self:
"""
Count the number of values in this expression.
Return the number of elements in the column.
.. warning::
`null` is deemed a value in this context.
Null values are treated like regular elements in this context.
Examples
--------
>>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]})
>>> df.select(pl.all().count()) # counts nulls
>>> df.select(pl.all().count())
shape: (1, 2)
┌─────┬─────┐
│ a ┆ b │
Expand All @@ -1264,19 +1264,16 @@ def count(self) -> Self:

def len(self) -> Self:
"""
Count the number of values in this expression.
Return the number of elements in the column.
Null values are treated like regular elements in this context.
Alias for :func:`count`.
Examples
--------
>>> df = pl.DataFrame(
... {
... "a": [8, 9, 10],
... "b": [None, 4, 4],
... }
... )
>>> df.select(pl.all().len()) # counts nulls
>>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]})
>>> df.select(pl.all().len())
shape: (1, 2)
┌─────┬─────┐
│ a ┆ b │
Expand Down
Loading

0 comments on commit f23f2c3

Please sign in to comment.