Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

depr(python, rust!): Rename .list.lengths and .str.lengths #11613

Merged
merged 10 commits into from
Oct 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions crates/polars-arrow/src/kernels/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use arrow::datatypes::DataType;
use crate::prelude::*;
use crate::trusted_len::TrustedLenPush;

pub fn string_lengths(array: &Utf8Array<i64>) -> ArrayRef {
pub fn string_len_bytes(array: &Utf8Array<i64>) -> ArrayRef {
let values = array
.offsets()
.as_slice()
Expand All @@ -16,7 +16,7 @@ pub fn string_lengths(array: &Utf8Array<i64>) -> ArrayRef {
Box::new(array)
}

pub fn string_nchars(array: &Utf8Array<i64>) -> ArrayRef {
pub fn string_len_chars(array: &Utf8Array<i64>) -> ArrayRef {
let values = array.values_iter().map(|x| x.chars().count() as u32);
let values: Buffer<_> = Vec::from_trusted_len_iter(values).into();
let array = UInt32Array::new(DataType::UInt32, values, array.validity().cloned());
Expand Down
8 changes: 4 additions & 4 deletions crates/polars-ops/src/chunked_array/strings/namespace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -145,15 +145,15 @@ pub trait Utf8NameSpaceImpl: AsUtf8 {
}

/// Get the length of the string values as number of chars.
fn str_n_chars(&self) -> UInt32Chunked {
fn str_len_chars(&self) -> UInt32Chunked {
let ca = self.as_utf8();
ca.apply_kernel_cast(&string_nchars)
ca.apply_kernel_cast(&string_len_chars)
}

/// Get the length of the string values as number of bytes.
fn str_lengths(&self) -> UInt32Chunked {
fn str_len_bytes(&self) -> UInt32Chunked {
let ca = self.as_utf8();
ca.apply_kernel_cast(&string_lengths)
ca.apply_kernel_cast(&string_len_bytes)
}

/// Return a copy of the string left filled with ASCII '0' digits to make a string of length width.
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-plan/src/dsl/function_expr/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -762,8 +762,8 @@ impl From<StringFunction> for SpecialEq<Arc<dyn SeriesUdf>> {
ExtractGroups { pat, dtype } => {
map!(strings::extract_groups, &pat, &dtype)
},
NChars => map!(strings::n_chars),
Length => map!(strings::lengths),
LenBytes => map!(strings::len_bytes),
LenChars => map!(strings::len_chars),
#[cfg(feature = "string_justify")]
Zfill(alignment) => {
map!(strings::zfill, alignment)
Expand Down
20 changes: 10 additions & 10 deletions crates/polars-plan/src/dsl/function_expr/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ pub enum StringFunction {
},
#[cfg(feature = "string_from_radix")]
FromRadix(u32, bool),
NChars,
Length,
LenBytes,
LenChars,
#[cfg(feature = "string_justify")]
LJust {
width: usize,
Expand Down Expand Up @@ -114,8 +114,8 @@ impl StringFunction {
FromRadix { .. } => mapper.with_dtype(DataType::Int32),
#[cfg(feature = "extract_jsonpath")]
JsonExtract { dtype, .. } => mapper.with_opt_dtype(dtype.clone()),
Length => mapper.with_dtype(DataType::UInt32),
NChars => mapper.with_dtype(DataType::UInt32),
LenBytes => mapper.with_dtype(DataType::UInt32),
LenChars => mapper.with_dtype(DataType::UInt32),
#[cfg(feature = "regex")]
Replace { .. } => mapper.with_same_dtype(),
#[cfg(feature = "temporal")]
Expand Down Expand Up @@ -173,9 +173,9 @@ impl Display for StringFunction {
StringFunction::JsonExtract { .. } => "json_extract",
#[cfg(feature = "string_justify")]
StringFunction::LJust { .. } => "ljust",
StringFunction::Length => "lengths",
StringFunction::LenBytes => "len_bytes",
StringFunction::Lowercase => "lowercase",
StringFunction::NChars => "n_chars",
StringFunction::LenChars => "len_chars",
#[cfg(feature = "string_justify")]
StringFunction::RJust { .. } => "rjust",
#[cfg(feature = "regex")]
Expand Down Expand Up @@ -234,14 +234,14 @@ pub(super) fn titlecase(s: &Series) -> PolarsResult<Series> {
Ok(ca.to_titlecase().into_series())
}

pub(super) fn n_chars(s: &Series) -> PolarsResult<Series> {
pub(super) fn len_chars(s: &Series) -> PolarsResult<Series> {
let ca = s.utf8()?;
Ok(ca.str_n_chars().into_series())
Ok(ca.str_len_chars().into_series())
}

pub(super) fn lengths(s: &Series) -> PolarsResult<Series> {
pub(super) fn len_bytes(s: &Series) -> PolarsResult<Series> {
let ca = s.utf8()?;
Ok(ca.str_lengths().into_series())
Ok(ca.str_len_bytes().into_series())
}

#[cfg(feature = "regex")]
Expand Down
6 changes: 4 additions & 2 deletions crates/polars-plan/src/dsl/list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,10 @@ impl ListNameSpace {
.map_private(FunctionExpr::ListExpr(ListFunction::DropNulls))
}

/// Get lengths of the arrays in the List type.
pub fn lengths(self) -> Expr {
/// Return the number of elements in each list.
///
/// Null values are treated like regular elements in this context.
pub fn len(self) -> Expr {
self.0
.map_private(FunctionExpr::ListExpr(ListFunction::Length))
}
Expand Down
25 changes: 19 additions & 6 deletions crates/polars-plan/src/dsl/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -369,16 +369,29 @@ impl StringNameSpace {
)))
}

/// Return the number of characters in the string (not bytes).
pub fn n_chars(self) -> Expr {
/// Return the length of each string as the number of bytes.
stinodego marked this conversation as resolved.
Show resolved Hide resolved
///
/// When working with non-ASCII text, the length in bytes is not the same
/// as the length in characters. You may want to use
/// [`len_chars`] instead. Note that `len_bytes` is much more
/// performant (_O(1)_) than [`len_chars`] (_O(n)_).
///
/// [`len_chars`]: StringNameSpace::len_chars
pub fn len_bytes(self) -> Expr {
self.0
.map_private(FunctionExpr::StringExpr(StringFunction::NChars))
.map_private(FunctionExpr::StringExpr(StringFunction::LenBytes))
}

/// Return the number of bytes in the string (not characters).
pub fn lengths(self) -> Expr {
/// Return the length of each string as the number of characters.
///
/// When working with ASCII text, use [`len_bytes`] instead to achieve
/// equivalent output with much better performance:
/// [`len_bytes`] runs in _O(1)_, while `len_chars` runs in _O(n)_.
///
/// [`len_bytes`]: StringNameSpace::len_bytes
pub fn len_chars(self) -> Expr {
self.0
.map_private(FunctionExpr::StringExpr(StringFunction::Length))
.map_private(FunctionExpr::StringExpr(StringFunction::LenChars))
}

/// Slice the string values.
Expand Down
6 changes: 3 additions & 3 deletions crates/polars-sql/src/functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -631,7 +631,7 @@ impl SqlFunctionVisitor<'_> {
}
}))
}),
Length => self.visit_unary(|e| e.str().n_chars()),
Length => self.visit_unary(|e| e.str().len_chars()),
Lower => self.visit_unary(|e| e.str().to_lowercase()),
LTrim => match function.args.len() {
1 => self.visit_unary(|e| e.str().strip_chars_start(lit(Null))),
Expand All @@ -641,7 +641,7 @@ impl SqlFunctionVisitor<'_> {
function.args.len()
),
},
OctetLength => self.visit_unary(|e| e.str().lengths()),
OctetLength => self.visit_unary(|e| e.str().len_bytes()),
RegexpLike => match function.args.len() {
2 => self.visit_binary(|e, s| e.str().contains(s, true)),
3 => self.try_visit_ternary(|e, pat, flags| {
Expand Down Expand Up @@ -714,7 +714,7 @@ impl SqlFunctionVisitor<'_> {
// ----
ArrayContains => self.visit_binary::<Expr>(|e, s| e.list().contains(s)),
ArrayGet => self.visit_binary(|e, i| e.list().get(i)),
ArrayLength => self.visit_unary(|e| e.list().lengths()),
ArrayLength => self.visit_unary(|e| e.list().len()),
ArrayMax => self.visit_unary(|e| e.list().max()),
ArrayMean => self.visit_unary(|e| e.list().mean()),
ArrayMin => self.visit_unary(|e| e.list().min()),
Expand Down
16 changes: 8 additions & 8 deletions docs/_build/API_REFERENCE_LINKS.yml
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,8 @@ python:
str.replace_all: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.replace_all.html
str.to_datetime: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.to_datetime.html
str.to_date: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.to_date.html
str.n_chars: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.n_chars.html
str.lengths: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.lengths.html
str.len_chars: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.len_chars.html
str.len_bytes: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.str.len_bytes.html

struct.field: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.struct.field.html
struct.rename_fields: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.struct.rename_fields.html
Expand Down Expand Up @@ -354,12 +354,12 @@ rust:
name: str.replace_all
link: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.to_datetime
feature_flags: [dtype-datetime]
str.n_chars:
name: str.n_chars
link: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.n_chars
str.lengths:
name: str.lengths
link: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.lengths
str.len_chars:
name: str.len_chars
link: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.len_chars
str.len_bytes:
name: str.len_bytes
link: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.len_bytes

struct.rename_fields: https://pola-rs.github.io/polars/docs/rust/dev/polars_lazy/dsl/struct.StructNameSpace.html#method.rename_fields
struct.field:
Expand Down
2 changes: 1 addition & 1 deletion docs/src/python/user-guide/expressions/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
out = weather.with_columns(pl.col("temperatures").str.split(" ")).with_columns(
pl.col("temperatures").list.head(3).alias("top3"),
pl.col("temperatures").list.slice(-3, 3).alias("bottom_3"),
pl.col("temperatures").list.lengths().alias("obs"),
pl.col("temperatures").list.len().alias("obs"),
)
print(out)
# --8<-- [end:list_ops]
Expand Down
4 changes: 2 additions & 2 deletions docs/src/python/user-guide/expressions/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
df = pl.DataFrame({"animal": ["Crab", "cat and dog", "rab$bit", None]})

out = df.select(
pl.col("animal").str.lengths().alias("byte_count"),
pl.col("animal").str.n_chars().alias("letter_count"),
pl.col("animal").str.len_bytes().alias("byte_count"),
pl.col("animal").str.len_chars().alias("letter_count"),
)
print(out)
# --8<-- [end:df]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def add_counter(val: int) -> int:
pl.struct(["keys", "values"])
.map_elements(lambda x: len(x["keys"]) + x["values"])
.alias("solution_apply"),
(pl.col("keys").str.lengths() + pl.col("values")).alias("solution_expr"),
(pl.col("keys").str.len_bytes() + pl.col("values")).alias("solution_expr"),
)
print(out)
# --8<-- [end:combine]
4 changes: 2 additions & 2 deletions docs/src/rust/user-guide/expressions/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
.clone()
.lazy()
.select([
col("animal").str().lengths().alias("byte_count"),
col("animal").str().n_chars().alias("letter_count"),
col("animal").str().len_bytes().alias("byte_count"),
col("animal").str().len_chars().alias("letter_count"),
])
.collect()?;

Expand Down
2 changes: 1 addition & 1 deletion docs/user-guide/expressions/strings.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ String processing functions are available in the `str` namespace.

The `str` namespace can be accessed through the `.str` attribute of a column with `Utf8` data type. In the following example, we create a column named `animal` and compute the length of each element in the column in terms of the number of bytes and the number of characters. If you are working with ASCII text, then the results of these two computations will be the same, and using `len_bytes` is recommended since it is faster.

{{code_block('user-guide/expressions/strings','df',['str.lengths','str.n_chars'])}}
{{code_block('user-guide/expressions/strings','df',['str.len_bytes','str.len_chars'])}}

```python exec="on" result="text" session="user-guide/strings"
--8<-- "python/user-guide/expressions/strings.py:setup"
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/expressions/list.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ The following methods are available under the `expr.list` attribute.
Expr.list.intersection
Expr.list.join
Expr.list.last
Expr.list.len
Expr.list.lengths
Expr.list.max
Expr.list.mean
Expand Down
2 changes: 2 additions & 0 deletions py-polars/docs/source/reference/expressions/string.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ The following methods are available under the `expr.str` attribute.
Expr.str.extract_groups
Expr.str.json_extract
Expr.str.json_path_match
Expr.str.len_bytes
Expr.str.len_chars
Expr.str.lengths
Expr.str.ljust
Expr.str.lstrip
Expand Down
1 change: 1 addition & 0 deletions py-polars/docs/source/reference/series/list.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ The following methods are available under the `Series.list` attribute.
Series.list.join
Series.list.intersection
Series.list.last
Series.list.len
Series.list.lengths
Series.list.max
Series.list.mean
Expand Down
2 changes: 2 additions & 0 deletions py-polars/docs/source/reference/series/string.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ The following methods are available under the `Series.str` attribute.
Series.str.extract_groups
Series.str.json_extract
Series.str.json_path_match
Series.str.len_bytes
Series.str.len_chars
Series.str.lengths
Series.str.ljust
Series.str.lstrip
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,7 +440,7 @@ def set_fmt_str_lengths(cls, n: int | None) -> type[Config]:
... ]
... }
... )
>>> df.with_columns(pl.col("txt").str.lengths().alias("len"))
>>> df.with_columns(pl.col("txt").str.len_bytes().alias("len"))
shape: (2, 2)
┌───────────────────────────────────┬─────┐
│ txt ┆ len │
Expand Down
19 changes: 8 additions & 11 deletions py-polars/polars/expr/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -1241,15 +1241,15 @@ def agg_groups(self) -> Self:

def count(self) -> Self:
"""
Count the number of values in this expression.
Return the number of elements in the column.

.. warning::
`null` is deemed a value in this context.
Null values are treated like regular elements in this context.

Examples
--------
>>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]})
>>> df.select(pl.all().count()) # counts nulls
>>> df.select(pl.all().count())
shape: (1, 2)
┌─────┬─────┐
│ a ┆ b │
Expand All @@ -1264,19 +1264,16 @@ def count(self) -> Self:

def len(self) -> Self:
"""
Count the number of values in this expression.
Return the number of elements in the column.

Null values are treated like regular elements in this context.

Alias for :func:`count`.

Examples
--------
>>> df = pl.DataFrame(
... {
... "a": [8, 9, 10],
... "b": [None, 4, 4],
... }
... )
>>> df.select(pl.all().len()) # counts nulls
>>> df = pl.DataFrame({"a": [8, 9, 10], "b": [None, 4, 4]})
>>> df.select(pl.all().len())
shape: (1, 2)
┌─────┬─────┐
│ a ┆ b │
Expand Down
Loading