diff --git a/Cargo.lock b/Cargo.lock index 5dfc53506e7a..884b50f2851a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3512,11 +3512,12 @@ dependencies = [ [[package]] name = "py-polars" -version = "1.3.0" +version = "1.4.0" dependencies = [ "ahash", "arboard", "built", + "bytemuck", "ciborium", "either", "itoa", diff --git a/crates/polars-arrow/src/array/binview/mod.rs b/crates/polars-arrow/src/array/binview/mod.rs index bdf378cf15a1..55f99b7e319b 100644 --- a/crates/polars-arrow/src/array/binview/mod.rs +++ b/crates/polars-arrow/src/array/binview/mod.rs @@ -232,6 +232,29 @@ impl BinaryViewArrayGeneric { ) } + /// Apply a function over the views. This can be used to update views in operations like slicing. + /// + /// # Safety + /// Update the views. All invariants of the views apply. + pub unsafe fn apply_views View>(&self, mut update_view: F) -> Self { + let arr = self.clone(); + let (views, buffers, validity, total_bytes_len, total_buffer_len) = arr.into_inner(); + + let mut views = views.make_mut(); + for v in views.iter_mut() { + let str_slice = T::from_bytes_unchecked(v.get_slice_unchecked(&buffers)); + *v = update_view(*v, str_slice); + } + Self::new_unchecked( + self.data_type.clone(), + views.into(), + buffers, + validity, + total_bytes_len, + total_buffer_len, + ) + } + pub fn try_new( data_type: ArrowDataType, views: Buffer, diff --git a/crates/polars-core/src/chunked_array/ops/apply.rs b/crates/polars-core/src/chunked_array/ops/apply.rs index 7119a48f58d3..de62f1eddc7f 100644 --- a/crates/polars-core/src/chunked_array/ops/apply.rs +++ b/crates/polars-core/src/chunked_array/ops/apply.rs @@ -573,3 +573,15 @@ where }); } } + +impl StringChunked { + /// # Safety + /// Update the views. All invariants of the views apply. + pub unsafe fn apply_views View + Copy>(&self, update_view: F) -> Self { + let mut out = self.clone(); + for arr in out.downcast_iter_mut() { + *arr = arr.apply_views(update_view); + } + out + } +} diff --git a/crates/polars-lazy/src/frame/mod.rs b/crates/polars-lazy/src/frame/mod.rs index 5c41740637c3..faa5e004c551 100644 --- a/crates/polars-lazy/src/frame/mod.rs +++ b/crates/polars-lazy/src/frame/mod.rs @@ -426,12 +426,9 @@ impl LazyFrame { fn _drop(self, columns: I, strict: bool) -> Self where I: IntoIterator, - T: AsRef, + T: Into, { - let to_drop = columns - .into_iter() - .map(|s| s.as_ref().to_string()) - .collect::>(); + let to_drop = columns.into_iter().map(|c| c.into()).collect(); let opt_state = self.get_opt_state(); let lp = self.get_plan_builder().drop(to_drop, strict).build(); @@ -444,11 +441,10 @@ impl LazyFrame { /// /// Any given columns that are not in the schema will give a [`PolarsError::ColumnNotFound`] /// error while materializing the [`LazyFrame`]. - #[inline] pub fn drop(self, columns: I) -> Self where I: IntoIterator, - T: AsRef, + T: Into, { self._drop(columns, true) } @@ -458,11 +454,10 @@ impl LazyFrame { /// and let the projection pushdown optimize away the unneeded columns. /// /// If a column name does not exist in the schema, it will quietly be ignored. - #[inline] pub fn drop_no_validate(self, columns: I) -> Self where I: IntoIterator, - T: AsRef, + T: Into, { self._drop(columns, false) } diff --git a/crates/polars-ops/src/chunked_array/strings/namespace.rs b/crates/polars-ops/src/chunked_array/strings/namespace.rs index 894b36d3c84b..b9c1e3041967 100644 --- a/crates/polars-ops/src/chunked_array/strings/namespace.rs +++ b/crates/polars-ops/src/chunked_array/strings/namespace.rs @@ -622,7 +622,7 @@ pub trait StringNameSpaceImpl: AsString { let ca = self.as_string(); let n = n.strict_cast(&DataType::Int64)?; - Ok(substring::head(ca, n.i64()?)) + substring::head(ca, n.i64()?) } /// Slice the last `n` values of the string. @@ -633,7 +633,7 @@ pub trait StringNameSpaceImpl: AsString { let ca = self.as_string(); let n = n.strict_cast(&DataType::Int64)?; - Ok(substring::tail(ca, n.i64()?)) + substring::tail(ca, n.i64()?) } } diff --git a/crates/polars-ops/src/chunked_array/strings/substring.rs b/crates/polars-ops/src/chunked_array/strings/substring.rs index 58e32061ea40..c9512f11bb2c 100644 --- a/crates/polars-ops/src/chunked_array/strings/substring.rs +++ b/crates/polars-ops/src/chunked_array/strings/substring.rs @@ -1,83 +1,100 @@ +use arrow::array::View; use polars_core::prelude::arity::{binary_elementwise, ternary_elementwise, unary_elementwise}; -use polars_core::prelude::{Int64Chunked, StringChunked, UInt64Chunked}; +use polars_core::prelude::{ChunkFullNull, Int64Chunked, StringChunked, UInt64Chunked}; +use polars_error::{polars_ensure, PolarsResult}; fn head_binary(opt_str_val: Option<&str>, opt_n: Option) -> Option<&str> { if let (Some(str_val), Some(n)) = (opt_str_val, opt_n) { - // `max_len` is guaranteed to be at least the total number of characters. - let max_len = str_val.len(); - if n == 0 { - Some("") - } else { - let end_idx = if n > 0 { - if n as usize >= max_len { - return opt_str_val; - } - // End after the nth codepoint. - str_val - .char_indices() - .nth(n as usize) - .map(|(idx, _)| idx) - .unwrap_or(max_len) - } else { - // End after the nth codepoint from the end. - str_val - .char_indices() - .rev() - .nth((-n - 1) as usize) - .map(|(idx, _)| idx) - .unwrap_or(0) - }; - Some(&str_val[..end_idx]) - } + let end_idx = head_binary_values(str_val, n); + Some(unsafe { str_val.get_unchecked(..end_idx) }) } else { None } } +fn head_binary_values(str_val: &str, n: i64) -> usize { + if n == 0 { + 0 + } else { + let end_idx = if n > 0 { + if n as usize >= str_val.len() { + return str_val.len(); + } + // End after the nth codepoint. + str_val + .char_indices() + .nth(n as usize) + .map(|(idx, _)| idx) + .unwrap_or(str_val.len()) + } else { + // End after the nth codepoint from the end. + str_val + .char_indices() + .rev() + .nth((-n - 1) as usize) + .map(|(idx, _)| idx) + .unwrap_or(0) + }; + end_idx + } +} + fn tail_binary(opt_str_val: Option<&str>, opt_n: Option) -> Option<&str> { if let (Some(str_val), Some(n)) = (opt_str_val, opt_n) { - // `max_len` is guaranteed to be at least the total number of characters. - let max_len = str_val.len(); - if n == 0 { - Some("") - } else { - let start_idx = if n > 0 { - if n as usize >= max_len { - return opt_str_val; - } - // Start from nth codepoint from the end - str_val - .char_indices() - .rev() - .nth((n - 1) as usize) - .map(|(idx, _)| idx) - .unwrap_or(0) - } else { - // Start after the nth codepoint - str_val - .char_indices() - .nth((-n) as usize) - .map(|(idx, _)| idx) - .unwrap_or(max_len) - }; - Some(&str_val[start_idx..]) - } + let start_idx = tail_binary_values(str_val, n); + Some(unsafe { str_val.get_unchecked(start_idx..) }) } else { None } } -fn substring_ternary( +fn tail_binary_values(str_val: &str, n: i64) -> usize { + // `max_len` is guaranteed to be at least the total number of characters. + let max_len = str_val.len(); + if n == 0 { + max_len + } else { + let start_idx = if n > 0 { + if n as usize >= max_len { + return 0; + } + // Start from nth codepoint from the end + str_val + .char_indices() + .rev() + .nth((n - 1) as usize) + .map(|(idx, _)| idx) + .unwrap_or(0) + } else { + // Start after the nth codepoint + str_val + .char_indices() + .nth((-n) as usize) + .map(|(idx, _)| idx) + .unwrap_or(max_len) + }; + start_idx + } +} + +fn substring_ternary_offsets( opt_str_val: Option<&str>, opt_offset: Option, opt_length: Option, -) -> Option<&str> { +) -> Option<(usize, usize)> { let str_val = opt_str_val?; let offset = opt_offset?; + Some(substring_ternary_offsets_value( + str_val, + offset, + opt_length.unwrap_or(u64::MAX), + )) +} +fn substring_ternary_offsets_value(str_val: &str, offset: i64, length: u64) -> (usize, usize) { // Fast-path: always empty string. - if opt_length == Some(0) || offset >= str_val.len() as i64 { - return Some(""); + if length == 0 || offset >= str_val.len() as i64 { + return (0, 0); } let mut indices = str_val.char_indices().map(|(o, _)| o); @@ -104,10 +121,36 @@ fn substring_ternary( let str_val = &str_val[start_byte_offset..]; let mut indices = str_val.char_indices().map(|(o, _)| o); - let stop_byte_offset = opt_length - .and_then(|l| indices.nth((l as usize).saturating_sub(length_reduction))) + let stop_byte_offset = indices + .nth((length as usize).saturating_sub(length_reduction)) .unwrap_or(str_val.len()); - Some(&str_val[..stop_byte_offset]) + (start_byte_offset, stop_byte_offset + start_byte_offset) +} + +fn substring_ternary( + opt_str_val: Option<&str>, + opt_offset: Option, + opt_length: Option, +) -> Option<&str> { + let (start, end) = substring_ternary_offsets(opt_str_val, opt_offset, opt_length)?; + unsafe { opt_str_val.map(|str_val| str_val.get_unchecked(start..end)) } +} + +fn update_view(mut view: View, start: usize, end: usize, val: &str) -> View { + let length = (end - start) as u32; + view.length = length; + + // SAFETY: we just compute the start /end. + let subval = unsafe { val.get_unchecked(start..end).as_bytes() }; + + if length <= 12 { + View::new_inline(subval) + } else { + view.offset += start as u32; + view.length = length; + view.prefix = u32::from_le_bytes(subval[0..4].try_into().unwrap()); + view + } } pub(super) fn substring( @@ -117,31 +160,34 @@ pub(super) fn substring( ) -> StringChunked { match (ca.len(), offset.len(), length.len()) { (1, 1, _) => { - // SAFETY: `ca` was verified to have least 1 element. - let str_val = unsafe { ca.get_unchecked(0) }; - // SAFETY: `offset` was verified to have at least 1 element. - let offset = unsafe { offset.get_unchecked(0) }; + let str_val = ca.get(0); + let offset = offset.get(0); unary_elementwise(length, |length| substring_ternary(str_val, offset, length)) .with_name(ca.name()) }, (_, 1, 1) => { - // SAFETY: `offset` was verified to have at least 1 element. - let offset = unsafe { offset.get_unchecked(0) }; - // SAFETY: `length` was verified to have at least 1 element. - let length = unsafe { length.get_unchecked(0) }; - unary_elementwise(ca, |str_val| substring_ternary(str_val, offset, length)) + let offset = offset.get(0); + let length = length.get(0).unwrap_or(u64::MAX); + + let Some(offset) = offset else { + return StringChunked::full_null(ca.name(), ca.len()); + }; + + unsafe { + ca.apply_views(|view, val| { + let (start, end) = substring_ternary_offsets_value(val, offset, length); + update_view(view, start, end, val) + }) + } }, (1, _, 1) => { - // SAFETY: `ca` was verified to have at least 1 element. - let str_val = unsafe { ca.get_unchecked(0) }; - // SAFETY: `length` was verified to have at least 1 element. - let length = unsafe { length.get_unchecked(0) }; + let str_val = ca.get(0); + let length = length.get(0); unary_elementwise(offset, |offset| substring_ternary(str_val, offset, length)) .with_name(ca.name()) }, (1, len_b, len_c) if len_b == len_c => { - // SAFETY: `ca` was verified to have at least 1 element. - let str_val = unsafe { ca.get_unchecked(0) }; + let str_val = ca.get(0); binary_elementwise(offset, length, |offset, length| { substring_ternary(str_val, offset, length) }) @@ -151,8 +197,7 @@ pub(super) fn substring( { f } - // SAFETY: index `0` is in bound. - let offset = unsafe { offset.get_unchecked(0) }; + let offset = offset.get(0); binary_elementwise( ca, length, @@ -164,8 +209,7 @@ pub(super) fn substring( { f } - // SAFETY: index `0` is in bound. - let length = unsafe { length.get_unchecked(0) }; + let length = length.get(0); binary_elementwise( ca, offset, @@ -176,34 +220,55 @@ pub(super) fn substring( } } -pub(super) fn head(ca: &StringChunked, n: &Int64Chunked) -> StringChunked { +pub(super) fn head(ca: &StringChunked, n: &Int64Chunked) -> PolarsResult { match (ca.len(), n.len()) { - (_, 1) => { - // SAFETY: `n` was verified to have at least 1 element. - let n = unsafe { n.get_unchecked(0) }; - unary_elementwise(ca, |str_val| head_binary(str_val, n)).with_name(ca.name()) + (len, 1) => { + let n = n.get(0); + let Some(n) = n else { + return Ok(StringChunked::full_null(ca.name(), len)); + }; + + Ok(unsafe { + ca.apply_views(|view, val| { + let end = head_binary_values(val, n); + update_view(view, 0, end, val) + }) + }) }, + // TODO! below should also work on only views (1, _) => { - // SAFETY: `ca` was verified to have at least 1 element. - let str_val = unsafe { ca.get_unchecked(0) }; - unary_elementwise(n, |n| head_binary(str_val, n)).with_name(ca.name()) + let str_val = ca.get(0); + Ok(unary_elementwise(n, |n| head_binary(str_val, n)).with_name(ca.name())) + }, + (a, b) => { + polars_ensure!(a == b, ShapeMismatch: "lengths of arguments do not align in 'str.head' got length: {} for column: {}, got length: {} for argument 'n'", a, ca.name(), b); + Ok(binary_elementwise(ca, n, head_binary)) }, - _ => binary_elementwise(ca, n, head_binary), } } -pub(super) fn tail(ca: &StringChunked, n: &Int64Chunked) -> StringChunked { - match (ca.len(), n.len()) { - (_, 1) => { - // SAFETY: `n` was verified to have at least 1 element. - let n = unsafe { n.get_unchecked(0) }; - unary_elementwise(ca, |str_val| tail_binary(str_val, n)).with_name(ca.name()) +pub(super) fn tail(ca: &StringChunked, n: &Int64Chunked) -> PolarsResult { + Ok(match (ca.len(), n.len()) { + (len, 1) => { + let n = n.get(0); + let Some(n) = n else { + return Ok(StringChunked::full_null(ca.name(), len)); + }; + unsafe { + ca.apply_views(|view, val| { + let start = tail_binary_values(val, n); + update_view(view, start, val.len(), val) + }) + } }, + // TODO! below should also work on only views (1, _) => { - // SAFETY: `ca` was verified to have at least 1 element. - let str_val = unsafe { ca.get_unchecked(0) }; + let str_val = ca.get(0); unary_elementwise(n, |n| tail_binary(str_val, n)).with_name(ca.name()) }, - _ => binary_elementwise(ca, n, tail_binary), - } + (a, b) => { + polars_ensure!(a == b, ShapeMismatch: "lengths of arguments do not align in 'str.tail' got length: {} for column: {}, got length: {} for argument 'n'", a, ca.name(), b); + binary_elementwise(ca, n, tail_binary) + }, + }) } diff --git a/crates/polars-plan/src/dsl/mod.rs b/crates/polars-plan/src/dsl/mod.rs index c34e03c20f68..c5cf50e1483c 100644 --- a/crates/polars-plan/src/dsl/mod.rs +++ b/crates/polars-plan/src/dsl/mod.rs @@ -63,7 +63,7 @@ use polars_core::prelude::*; use polars_core::series::ops::NullBehavior; use polars_core::series::IsSorted; use polars_core::utils::try_get_supertype; -pub(crate) use selector::Selector; +pub use selector::Selector; #[cfg(feature = "dtype-struct")] pub use struct_::*; pub use udf::UserDefinedFunction; diff --git a/crates/polars-plan/src/dsl/selector.rs b/crates/polars-plan/src/dsl/selector.rs index e889d9e5b83d..b19781de1024 100644 --- a/crates/polars-plan/src/dsl/selector.rs +++ b/crates/polars-plan/src/dsl/selector.rs @@ -16,8 +16,7 @@ pub enum Selector { } impl Selector { - #[cfg(feature = "meta")] - pub(crate) fn new(e: Expr) -> Self { + pub fn new(e: Expr) -> Self { Self::Root(Box::new(e)) } } @@ -56,3 +55,27 @@ impl Sub for Selector { Selector::Sub(Box::new(self), Box::new(rhs)) } } + +impl From<&str> for Selector { + fn from(value: &str) -> Self { + Selector::new(col(value)) + } +} + +impl From for Selector { + fn from(value: String) -> Self { + Selector::new(col(value.as_ref())) + } +} + +impl From for Selector { + fn from(value: ColumnName) -> Self { + Selector::new(Expr::Column(value)) + } +} + +impl From for Selector { + fn from(value: Expr) -> Self { + Selector::new(value) + } +} diff --git a/crates/polars-plan/src/plans/builder_dsl.rs b/crates/polars-plan/src/plans/builder_dsl.rs index 4bb111c77652..fcb767b0689b 100644 --- a/crates/polars-plan/src/plans/builder_dsl.rs +++ b/crates/polars-plan/src/plans/builder_dsl.rs @@ -219,7 +219,7 @@ impl DslBuilder { .into() } - pub fn drop(self, to_drop: PlHashSet, strict: bool) -> Self { + pub fn drop(self, to_drop: Vec, strict: bool) -> Self { self.map_private(DslFunction::Drop(DropFunction { to_drop, strict })) } diff --git a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs index 022881e20fb2..aa1c697aeeb6 100644 --- a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs +++ b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs @@ -13,6 +13,7 @@ use polars_io::path_utils::{expand_paths_hive, expanded_from_single_directory}; use super::stack_opt::ConversionOptimizer; use super::*; +use crate::plans::conversion::expr_expansion::expand_selectors; fn expand_expressions( input: Node, @@ -646,6 +647,9 @@ pub fn to_alp_impl( return run_conversion(lp, lp_arena, expr_arena, convert, "fill_nan"); }, DslFunction::Drop(DropFunction { to_drop, strict }) => { + let to_drop = expand_selectors(to_drop, &input_schema, &[])?; + let to_drop = to_drop.iter().map(|s| s.as_ref()).collect::>(); + if strict { for col_name in to_drop.iter() { polars_ensure!( diff --git a/crates/polars-plan/src/plans/conversion/expr_expansion.rs b/crates/polars-plan/src/plans/conversion/expr_expansion.rs index 50b6f333664f..9b5b532479f6 100644 --- a/crates/polars-plan/src/plans/conversion/expr_expansion.rs +++ b/crates/polars-plan/src/plans/conversion/expr_expansion.rs @@ -840,32 +840,70 @@ fn replace_selector(expr: Expr, schema: &Schema, keys: &[Expr]) -> PolarsResult< let mut swapped = Selector::Root(Box::new(Expr::Wildcard)); std::mem::swap(&mut s, &mut swapped); - let mut members = PlIndexSet::new(); - replace_selector_inner(swapped, &mut members, &mut vec![], schema, keys)?; - - if members.len() <= 1 { - Ok(Expr::Columns( - members - .into_iter() - .map(|e| { - let Expr::Column(name) = e else { - unreachable!() - }; - name - }) - .collect(), - )) - } else { - // Ensure that multiple columns returned from combined/nested selectors remain in schema order - let selected = schema - .iter_fields() - .map(|field| ColumnName::from(field.name().as_ref())) - .filter(|field_name| members.contains(&Expr::Column(field_name.clone()))) - .collect(); - - Ok(Expr::Columns(selected)) - } + let cols = expand_selector(swapped, schema, keys)?; + Ok(Expr::Columns(cols)) }, e => Ok(e), }) } + +pub(super) fn expand_selectors( + s: Vec, + schema: &Schema, + keys: &[Expr], +) -> PolarsResult> { + let mut columns = vec![]; + + for s in s { + match s { + Selector::Root(e) => match *e { + Expr::Column(name) => columns.push(name), + Expr::Columns(names) => columns.extend_from_slice(names.as_ref()), + Expr::Selector(s) => { + let names = expand_selector(s, schema, keys)?; + columns.extend_from_slice(names.as_ref()); + }, + e => { + let names = expand_selector(Selector::new(e), schema, keys)?; + columns.extend_from_slice(names.as_ref()); + }, + }, + other => { + let names = expand_selector(other, schema, keys)?; + columns.extend_from_slice(names.as_ref()); + }, + } + } + + Ok(Arc::from(columns)) +} + +pub(super) fn expand_selector( + s: Selector, + schema: &Schema, + keys: &[Expr], +) -> PolarsResult> { + let mut members = PlIndexSet::new(); + replace_selector_inner(s, &mut members, &mut vec![], schema, keys)?; + + if members.len() <= 1 { + Ok(members + .into_iter() + .map(|e| { + let Expr::Column(name) = e else { + unreachable!() + }; + name + }) + .collect()) + } else { + // Ensure that multiple columns returned from combined/nested selectors remain in schema order + let selected = schema + .iter_fields() + .map(|field| ColumnName::from(field.name().as_ref())) + .filter(|field_name| members.contains(&Expr::Column(field_name.clone()))) + .collect(); + + Ok(selected) + } +} diff --git a/crates/polars-plan/src/plans/functions/dsl.rs b/crates/polars-plan/src/plans/functions/dsl.rs index ee56c8dc6fb2..80314787e83e 100644 --- a/crates/polars-plan/src/plans/functions/dsl.rs +++ b/crates/polars-plan/src/plans/functions/dsl.rs @@ -30,7 +30,7 @@ pub enum DslFunction { #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct DropFunction { /// Columns that are going to be dropped - pub(crate) to_drop: PlHashSet, + pub(crate) to_drop: Vec, /// If `true`, performs a check for each item in `to_drop` against the schema. Returns an /// `ColumnNotFound` error if the column does not exist in the schema. pub(crate) strict: bool, diff --git a/crates/polars-sql/src/context.rs b/crates/polars-sql/src/context.rs index 9ff031a0a31c..ab1b9a53997c 100644 --- a/crates/polars-sql/src/context.rs +++ b/crates/polars-sql/src/context.rs @@ -1032,6 +1032,16 @@ impl SQLContext { polars_bail!(SQLSyntax: "UNNEST table must have an alias"); } }, + TableFactor::NestedJoin { + table_with_joins, + alias, + } => { + let lf = self.execute_from_statement(table_with_joins)?; + match alias { + Some(a) => Ok((a.name.value.clone(), lf)), + None => Ok(("".to_string(), lf)), + } + }, // Support bare table, optionally with an alias, for now _ => polars_bail!(SQLInterface: "not yet implemented: {}", relation), } diff --git a/docs/src/python/user-guide/expressions/aggregation.py b/docs/src/python/user-guide/expressions/aggregation.py index f2d75cbd3726..f67226fdc3d7 100644 --- a/docs/src/python/user-guide/expressions/aggregation.py +++ b/docs/src/python/user-guide/expressions/aggregation.py @@ -160,9 +160,7 @@ def get_person() -> pl.Expr: get_person().first().alias("youngest"), get_person().last().alias("oldest"), get_person().sort().first().alias("alphabetical_first"), - pl.col("gender") - .sort_by(pl.col("first_name").cast(pl.Categorical("lexical"))) - .first(), + pl.col("gender").sort_by(get_person()).first(), ) .sort("state") .limit(5) diff --git a/docs/user-guide/expressions/aggregation.md b/docs/user-guide/expressions/aggregation.md index 2a036ed82322..f4d963606ffb 100644 --- a/docs/user-guide/expressions/aggregation.md +++ b/docs/user-guide/expressions/aggregation.md @@ -17,9 +17,8 @@ Per GROUP `"first_name"` we - count the number of rows in the group: - - short form: `pl.count("party")` - - full form: `pl.col("party").count()` -- aggregate the gender values groups: + - full form: `pl.len()` +- combine the values of gender into a list by omitting an aggregate function: - full form: `pl.col("gender")` - get the first value of column `"last_name"` in the group: - short form: `pl.first("last_name")` (not available in Rust) @@ -94,7 +93,7 @@ However, **if** we also want to sort the names alphabetically, this breaks. Luck --8<-- "python/user-guide/expressions/aggregation.py:sort2" ``` -We can even sort by another column in the `group_by` context. If we want to know if the alphabetically sorted name is male or female we could add: `pl.col("gender").sort_by("first_name").first().alias("gender")` +We can even sort by another column in the `group_by` context. If we want to know if the alphabetically sorted name is male or female we could add: `pl.col("gender").sort_by(get_person()).first()` {{code_block('user-guide/expressions/aggregation','sort3',['group_by'])}} diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index 91f920b63207..64b8589de25d 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "py-polars" -version = "1.3.0" +version = "1.4.0" edition = "2021" [lib] @@ -23,6 +23,7 @@ polars-stream = { workspace = true } ahash = { workspace = true } arboard = { workspace = true, optional = true } +bytemuck = { workspace = true } ciborium = { workspace = true } either = { workspace = true } itoa = { workspace = true } diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 6bdadc7311b7..bda794596522 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -4719,7 +4719,7 @@ def drop( │ 8.0 │ └─────┘ """ - drop_cols = _expand_selectors(self, *columns) + drop_cols = parse_into_list_of_expressions(*columns) return self._from_pyldf(self._ldf.drop(drop_cols, strict=strict)) def rename(self, mapping: dict[str, str] | Callable[[str], str]) -> LazyFrame: diff --git a/py-polars/polars/selectors.py b/py-polars/polars/selectors.py index 74a8690d9d73..f08e8f9c0075 100644 --- a/py-polars/polars/selectors.py +++ b/py-polars/polars/selectors.py @@ -109,6 +109,8 @@ def is_selector(obj: Any) -> bool: return isinstance(obj, _selector_proxy_) and hasattr(obj, "_attrs") +# TODO: Don't use this. It collects a schema. +# This should all go to IR conversion. def expand_selector( target: DataFrame | LazyFrame | Mapping[str, PolarsDataType], selector: SelectorType | Expr, @@ -188,6 +190,8 @@ def expand_selector( return tuple(target.select(selector).collect_schema()) +# TODO: Don't use this. It collects a schema. +# This should all go to IR conversion. def _expand_selectors(frame: DataFrame | LazyFrame, *items: Any) -> list[Any]: """ Internal function that expands any selectors to column names in the given input. diff --git a/py-polars/src/expr/mod.rs b/py-polars/src/expr/mod.rs index 2089c6325573..0206f74ca0aa 100644 --- a/py-polars/src/expr/mod.rs +++ b/py-polars/src/expr/mod.rs @@ -12,6 +12,8 @@ mod serde; mod string; mod r#struct; +use std::mem::ManuallyDrop; + use polars::lazy::dsl::Expr; use pyo3::prelude::*; @@ -35,7 +37,14 @@ pub(crate) trait ToExprs { impl ToExprs for Vec { fn to_exprs(self) -> Vec { // SAFETY: repr is transparent. - unsafe { std::mem::transmute(self) } + unsafe { + let length = self.len(); + let capacity = self.capacity(); + let mut manual_drop_vec = ManuallyDrop::new(self); + let vec_ptr: *mut PyExpr = manual_drop_vec.as_mut_ptr(); + let ptr: *mut Expr = vec_ptr as *mut Expr; + Vec::from_raw_parts(ptr, length, capacity) + } } } @@ -46,6 +55,13 @@ pub(crate) trait ToPyExprs { impl ToPyExprs for Vec { fn to_pyexprs(self) -> Vec { // SAFETY: repr is transparent. - unsafe { std::mem::transmute(self) } + unsafe { + let length = self.len(); + let capacity = self.capacity(); + let mut manual_drop_vec = ManuallyDrop::new(self); + let vec_ptr: *mut Expr = manual_drop_vec.as_mut_ptr(); + let ptr: *mut PyExpr = vec_ptr as *mut PyExpr; + Vec::from_raw_parts(ptr, length, capacity) + } } } diff --git a/py-polars/src/lazyframe/mod.rs b/py-polars/src/lazyframe/mod.rs index d0f7d27b9c36..96d28b3e78f0 100644 --- a/py-polars/src/lazyframe/mod.rs +++ b/py-polars/src/lazyframe/mod.rs @@ -1162,8 +1162,9 @@ impl PyLazyFrame { .into() } - fn drop(&self, columns: Vec, strict: bool) -> Self { + fn drop(&self, columns: Vec, strict: bool) -> Self { let ldf = self.ldf.clone(); + let columns = columns.to_exprs(); if strict { ldf.drop(columns) } else { diff --git a/py-polars/tests/unit/sql/test_joins.py b/py-polars/tests/unit/sql/test_joins.py index 5c13aea6803c..97872d8bbdcc 100644 --- a/py-polars/tests/unit/sql/test_joins.py +++ b/py-polars/tests/unit/sql/test_joins.py @@ -466,3 +466,78 @@ def test_natural_joins_02(cols_constraint: str, expect_data: list[tuple[int]]) - expected = pl.DataFrame(expect_data, schema=actual.columns, orient="row") assert_frame_equal(actual, expected, check_row_order=False) + + +@pytest.mark.parametrize( + "join_clause", + [ + "df2 INNER JOIN df3 ON df2.CharacterID=df3.CharacterID", + "df2 INNER JOIN (df3 INNDER JOIN df4 ON df3.CharacterID=df4.CharacterID) ON df2.CharacterID=df3.CharacterID", + ], +) +def test_nested_join(join_clause: str) -> None: + df1 = pl.DataFrame( + { + "CharacterID": [1, 2, 3, 4], + "FirstName": ["Jernau Morat", "Cheradenine", "Byr", "Diziet"], + "LastName": ["Gurgeh", "Zakalwe", "Genar-Hofoen", "Sma"], + } + ) + df2 = pl.DataFrame( + { + "CharacterID": [1, 2, 3, 5], + "Role": ["Protagonist", "Protagonist", "Protagonist", "Antagonist"], + "Book": [ + "Player of Games", + "Use of Weapons", + "Excession", + "Consider Phlebas", + ], + } + ) + df3 = pl.DataFrame( + { + "CharacterID": [1, 2, 5, 6], + "Affiliation": ["Culture", "Culture", "Culture", "Shellworld"], + "Species": ["Pan-human", "Human", "Human", "Oct"], + } + ) + df4 = pl.DataFrame( + { + "CharacterID": [1, 2, 3, 6], + "Ship": [ + "Limiting Factor", + "Xenophobe", + "Grey Area", + "Falling Outside The Normal Moral Constraints", + ], + "Drone": ["Flere-Imsaho", "Skaffen-Amtiskaw", "Eccentric", "Psychopath"], + } + ) + + with pl.SQLContext( + {"df1": df1, "df2": df2, "df3": df3, "df4": df4}, eager=True + ) as ctx: + res = ctx.execute( + f""" + SELECT df1.CharacterID, df1.FirstName, df2.Role, df3.Species + FROM df1 + INNER JOIN ({join_clause}) + ON df1.CharacterID = df2.CharacterID + ORDER BY ALL + """ + ) + assert res.rows(named=True) == [ + { + "CharacterID": 1, + "FirstName": "Jernau Morat", + "Role": "Protagonist", + "Species": "Pan-human", + }, + { + "CharacterID": 2, + "FirstName": "Cheradenine", + "Role": "Protagonist", + "Species": "Human", + }, + ]