From ea7953ebe736abe1f677a65c2e352b3a2212cae2 Mon Sep 17 00:00:00 2001 From: Gijs Burghoorn Date: Mon, 23 Sep 2024 17:29:57 +0200 Subject: [PATCH] refactor: Keep scalar in more places (#18775) --- crates/polars-arrow/src/array/struct_/mod.rs | 16 +- .../metadata/interior_mutable.rs | 5 + crates/polars-core/src/chunked_array/mod.rs | 7 + .../object/extension/polars_extension.rs | 1 - .../src/chunked_array/ops/aggregate/mod.rs | 12 +- crates/polars-core/src/datatypes/any_value.rs | 183 ++++++++++++------ crates/polars-core/src/frame/column/mod.rs | 83 +++++--- crates/polars-core/src/frame/mod.rs | 72 +++++++ crates/polars-core/src/frame/row/av_buffer.rs | 4 +- .../src/series/implementations/boolean.rs | 6 +- .../src/series/implementations/date.rs | 12 +- .../src/series/implementations/floats.rs | 6 +- .../src/series/implementations/mod.rs | 6 +- .../src/series/implementations/time.rs | 7 +- crates/polars-core/src/series/series_trait.rs | 4 + crates/polars-lazy/src/dsl/eval.rs | 2 +- .../sinks/group_by/aggregates/first.rs | 8 +- .../sinks/group_by/aggregates/last.rs | 10 +- .../src/executors/sinks/sort/sink.rs | 2 +- crates/polars-python/src/dataframe/general.rs | 6 + crates/polars-python/src/functions/lazy.rs | 2 +- crates/polars-python/src/lazyframe/general.rs | 2 +- crates/polars-sql/src/context.rs | 2 +- py-polars/polars/dataframe/frame.py | 38 ++++ py-polars/polars/lazyframe/frame.py | 20 ++ 25 files changed, 390 insertions(+), 126 deletions(-) diff --git a/crates/polars-arrow/src/array/struct_/mod.rs b/crates/polars-arrow/src/array/struct_/mod.rs index efac13a481ea..decc95a2627a 100644 --- a/crates/polars-arrow/src/array/struct_/mod.rs +++ b/crates/polars-arrow/src/array/struct_/mod.rs @@ -32,6 +32,7 @@ use crate::compute::utils::combine_validities_and; #[derive(Clone)] pub struct StructArray { dtype: ArrowDataType, + // invariant: each array has the same length values: Vec>, validity: Option, } @@ -226,6 +227,17 @@ impl StructArray { impl StructArray { 
#[inline] fn len(&self) -> usize { + #[cfg(debug_assertions)] + if let Some(fst) = self.values.first() { + for arr in self.values.iter().skip(1) { + assert_eq!( + arr.len(), + fst.len(), + "StructArray invariant: each array has same length" + ); + } + } + self.values.first().map(|arr| arr.len()).unwrap_or(0) } @@ -242,7 +254,9 @@ impl StructArray { /// Returns the fields of this [`StructArray`]. pub fn fields(&self) -> &[Field] { - Self::get_fields(&self.dtype) + let fields = Self::get_fields(&self.dtype); + debug_assert_eq!(self.values().len(), fields.len()); + fields } } diff --git a/crates/polars-core/src/chunked_array/metadata/interior_mutable.rs b/crates/polars-core/src/chunked_array/metadata/interior_mutable.rs index 924be274b18f..2b55c22e89e4 100644 --- a/crates/polars-core/src/chunked_array/metadata/interior_mutable.rs +++ b/crates/polars-core/src/chunked_array/metadata/interior_mutable.rs @@ -27,6 +27,11 @@ where pub fn upcast(&'a self) -> &'a RwLock { &self.0 as &RwLock } + + /// Cast the [`IMMetadata`] to a boxed trait object of [`MetadataTrait`] + pub fn boxed_upcast(&'a self) -> Box { + Box::new(self.0.read().unwrap().clone()) as Box + } } impl IMMetadata { diff --git a/crates/polars-core/src/chunked_array/mod.rs b/crates/polars-core/src/chunked_array/mod.rs index c59b520bf8e8..a45ff1ae9c21 100644 --- a/crates/polars-core/src/chunked_array/mod.rs +++ b/crates/polars-core/src/chunked_array/mod.rs @@ -163,6 +163,13 @@ where pub fn metadata_dyn(&self) -> Option> { self.md.as_ref().upcast().try_read().ok() } + + /// Get a boxed clone of the trait object containing the [`ChunkedArray`]'s [`Metadata`] + /// + /// This blocks until a read lock on the metadata can be acquired. 
+ pub fn boxed_metadata_dyn<'a>(&'a self) -> Box { + self.md.as_ref().boxed_upcast() + } } impl ChunkedArray { diff --git a/crates/polars-core/src/chunked_array/object/extension/polars_extension.rs b/crates/polars-core/src/chunked_array/object/extension/polars_extension.rs index 424c8aaccf6c..f9a931a7846a 100644 --- a/crates/polars-core/src/chunked_array/object/extension/polars_extension.rs +++ b/crates/polars-core/src/chunked_array/object/extension/polars_extension.rs @@ -17,7 +17,6 @@ impl PolarsExtension { .get(0) .unwrap() .into_static() - .unwrap() } pub(crate) unsafe fn new(array: FixedSizeBinaryArray) -> Self { diff --git a/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs b/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs index b94d724a5185..cf79b0acb473 100644 --- a/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/aggregate/mod.rs @@ -483,11 +483,11 @@ impl StringChunked { impl ChunkAggSeries for StringChunked { fn max_reduce(&self) -> Scalar { let av: AnyValue = self.max_str().into(); - Scalar::new(DataType::String, av.into_static().unwrap()) + Scalar::new(DataType::String, av.into_static()) } fn min_reduce(&self) -> Scalar { let av: AnyValue = self.min_str().into(); - Scalar::new(DataType::String, av.into_static().unwrap()) + Scalar::new(DataType::String, av.into_static()) } } @@ -554,11 +554,11 @@ impl CategoricalChunked { impl ChunkAggSeries for CategoricalChunked { fn min_reduce(&self) -> Scalar { let av: AnyValue = self.min_categorical().into(); - Scalar::new(DataType::String, av.into_static().unwrap()) + Scalar::new(DataType::String, av.into_static()) } fn max_reduce(&self) -> Scalar { let av: AnyValue = self.max_categorical().into(); - Scalar::new(DataType::String, av.into_static().unwrap()) + Scalar::new(DataType::String, av.into_static()) } } @@ -618,11 +618,11 @@ impl ChunkAggSeries for BinaryChunked { } fn max_reduce(&self) -> Scalar { let av: AnyValue = 
self.max_binary().into(); - Scalar::new(self.dtype().clone(), av.into_static().unwrap()) + Scalar::new(self.dtype().clone(), av.into_static()) } fn min_reduce(&self) -> Scalar { let av: AnyValue = self.min_binary().into(); - Scalar::new(self.dtype().clone(), av.into_static().unwrap()) + Scalar::new(self.dtype().clone(), av.into_static()) } } diff --git a/crates/polars-core/src/datatypes/any_value.rs b/crates/polars-core/src/datatypes/any_value.rs index 05ad0647dbcf..fa8bf7caa3a1 100644 --- a/crates/polars-core/src/datatypes/any_value.rs +++ b/crates/polars-core/src/datatypes/any_value.rs @@ -1,12 +1,7 @@ use std::borrow::Cow; -#[cfg(feature = "dtype-struct")] -use arrow::legacy::trusted_len::TrustedLenPush; use arrow::types::PrimitiveType; use polars_utils::format_pl_smallstr; -use polars_utils::itertools::Itertools; -#[cfg(feature = "dtype-struct")] -use polars_utils::slice::GetSaferUnchecked; #[cfg(feature = "dtype-categorical")] use polars_utils::sync::SyncPtr; use polars_utils::total_ord::ToTotalOrd; @@ -907,12 +902,34 @@ impl<'a> AnyValue<'a> { } } + pub(crate) fn to_i128(&self) -> Option { + match self { + AnyValue::UInt8(v) => Some((*v).into()), + AnyValue::UInt16(v) => Some((*v).into()), + AnyValue::UInt32(v) => Some((*v).into()), + AnyValue::UInt64(v) => Some((*v).into()), + AnyValue::Int8(v) => Some((*v).into()), + AnyValue::Int16(v) => Some((*v).into()), + AnyValue::Int32(v) => Some((*v).into()), + AnyValue::Int64(v) => Some((*v).into()), + _ => None, + } + } + + pub(crate) fn to_f64(&self) -> Option { + match self { + AnyValue::Float32(v) => Some((*v).into()), + AnyValue::Float64(v) => Some(*v), + _ => None, + } + } + #[must_use] pub fn add(&self, rhs: &AnyValue) -> AnyValue<'static> { use AnyValue::*; match (self, rhs) { - (Null, r) => r.clone().into_static().unwrap(), - (l, Null) => l.clone().into_static().unwrap(), + (Null, r) => r.clone().into_static(), + (l, Null) => l.clone().into_static(), (Int32(l), Int32(r)) => Int32(l + r), (Int64(l), 
Int64(r)) => Int64(l + r), (UInt32(l), UInt32(r)) => UInt32(l + r), @@ -961,9 +978,9 @@ impl<'a> AnyValue<'a> { /// Try to coerce to an AnyValue with static lifetime. /// This can be done if it does not borrow any values. #[inline] - pub fn into_static(self) -> PolarsResult> { + pub fn into_static(self) -> AnyValue<'static> { use AnyValue::*; - let av = match self { + match self { Null => Null, Int8(v) => Int8(v), Int16(v) => Int16(v), @@ -997,7 +1014,7 @@ impl<'a> AnyValue<'a> { Object(v) => ObjectOwned(OwnedObject(v.to_boxed())), #[cfg(feature = "dtype-struct")] Struct(idx, arr, fields) => { - let avs = struct_to_avs_static(idx, arr, fields)?; + let avs = struct_to_avs_static(idx, arr, fields); StructOwned(Box::new((avs, fields.to_vec()))) }, #[cfg(feature = "dtype-struct")] @@ -1022,8 +1039,7 @@ impl<'a> AnyValue<'a> { Enum(v, rev, arr) => EnumOwned(v, Arc::new(rev.clone()), arr), #[cfg(feature = "dtype-categorical")] EnumOwned(v, rev, arr) => EnumOwned(v, rev, arr), - }; - Ok(av) + } } /// Get a reference to the `&str` contained within [`AnyValue`]. @@ -1070,6 +1086,37 @@ impl<'a> From> for Option { impl AnyValue<'_> { #[inline] pub fn eq_missing(&self, other: &Self, null_equal: bool) -> bool { + fn struct_owned_value_iter<'a>( + v: &'a (Vec>, Vec), + ) -> impl ExactSizeIterator> { + v.0.iter().map(|v| v.as_borrowed()) + } + fn struct_value_iter( + idx: usize, + arr: &StructArray, + ) -> impl ExactSizeIterator> { + assert!(idx < arr.len()); + + arr.values().iter().map(move |field_arr| unsafe { + // SAFETY: We asserted before that idx is smaller than the array length. Since it + // is an invariant of StructArray that all fields have the same length this is fine + // to do. 
+ field_arr.get_unchecked(idx) + }) + } + + fn struct_eq_missing<'a>( + l: impl ExactSizeIterator>, + r: impl ExactSizeIterator>, + null_equal: bool, + ) -> bool { + if l.len() != r.len() { + return false; + } + + l.zip(r).all(|(lv, rv)| lv.eq_missing(&rv, null_equal)) + } + use AnyValue::*; match (self, other) { // Map to borrowed. @@ -1150,25 +1197,31 @@ impl AnyValue<'_> { }, #[cfg(feature = "dtype-duration")] (Duration(l, tu_l), Duration(r, tu_r)) => l == r && tu_l == tu_r, + #[cfg(feature = "dtype-struct")] - (StructOwned(l), StructOwned(r)) => { - let l_av = &*l.0; - let r_av = &*r.0; - l_av == r_av - }, + (StructOwned(l), StructOwned(r)) => struct_eq_missing( + struct_owned_value_iter(l.as_ref()), + struct_owned_value_iter(r.as_ref()), + null_equal, + ), #[cfg(feature = "dtype-struct")] - (StructOwned(l), Struct(idx, arr, fields)) => { - l.0.iter() - .eq_by_(struct_av_iter(*idx, arr, fields), |lv, rv| *lv == rv) - }, + (StructOwned(l), Struct(idx, arr, _)) => struct_eq_missing( + struct_owned_value_iter(l.as_ref()), + struct_value_iter(*idx, arr), + null_equal, + ), #[cfg(feature = "dtype-struct")] - (Struct(idx, arr, fields), StructOwned(r)) => { - struct_av_iter(*idx, arr, fields).eq_by_(r.0.iter(), |lv, rv| lv == *rv) - }, + (Struct(idx, arr, _), StructOwned(r)) => struct_eq_missing( + struct_value_iter(*idx, arr), + struct_owned_value_iter(r.as_ref()), + null_equal, + ), #[cfg(feature = "dtype-struct")] - (Struct(l_idx, l_arr, l_fields), Struct(r_idx, r_arr, r_fields)) => { - struct_av_iter(*l_idx, l_arr, l_fields).eq(struct_av_iter(*r_idx, r_arr, r_fields)) - }, + (Struct(l_idx, l_arr, _), Struct(r_idx, r_arr, _)) => struct_eq_missing( + struct_value_iter(*l_idx, l_arr), + struct_value_iter(*r_idx, r_arr), + null_equal, + ), #[cfg(feature = "dtype-decimal")] (Decimal(l_v, l_s), Decimal(r_v, r_s)) => { // l_v / 10**l_s == r_v / 10**r_s @@ -1198,9 +1251,34 @@ impl AnyValue<'_> { }, #[cfg(feature = "object")] (Object(l), Object(r)) => l == r, + 
#[cfg(feature = "dtype-array")] + (Array(l_values, l_size), Array(r_values, r_size)) => { + if l_size != r_size { + return false; + } + + debug_assert_eq!(l_values.len(), *l_size); + debug_assert_eq!(r_values.len(), *r_size); + + let mut is_equal = true; + for i in 0..*l_size { + let l = unsafe { l_values.get_unchecked(i) }; + let r = unsafe { r_values.get_unchecked(i) }; + + is_equal &= l.eq_missing(&r, null_equal); + } + is_equal + }, + + (l, r) if l.to_i128().is_some() && r.to_i128().is_some() => l.to_i128() == r.to_i128(), + (l, r) if l.to_f64().is_some() && r.to_f64().is_some() => { + l.to_f64().unwrap().to_total_ord() == r.to_f64().unwrap().to_total_ord() + }, (_, _) => { - unimplemented!("ordering for mixed dtypes is not supported") + unimplemented!( + "scalar eq_missing for mixed dtypes {self:?} and {other:?} is not supported" + ) }, } } @@ -1346,7 +1424,9 @@ impl PartialOrd for AnyValue<'_> { }, (_, _) => { - unimplemented!("ordering for mixed dtypes is not supported") + unimplemented!( + "scalar ordering for mixed dtypes {self:?} and {other:?} is not supported" + ) }, } } @@ -1360,23 +1440,22 @@ impl TotalEq for AnyValue<'_> { } #[cfg(feature = "dtype-struct")] -fn struct_to_avs_static( - idx: usize, - arr: &StructArray, - fields: &[Field], -) -> PolarsResult>> { +fn struct_to_avs_static(idx: usize, arr: &StructArray, fields: &[Field]) -> Vec> { + assert!(idx < arr.len()); + let arrs = arr.values(); - let mut avs = Vec::with_capacity(arrs.len()); - // amortize loop counter - for i in 0..arrs.len() { - unsafe { - let arr = &**arrs.get_unchecked_release(i); - let field = fields.get_unchecked_release(i); - let av = arr_to_any_value(arr, idx, &field.dtype); - avs.push_unchecked(av.into_static()?); - } - } - Ok(avs) + + debug_assert_eq!(arrs.len(), fields.len()); + + arrs.iter() + .zip(fields) + .map(|(arr, field)| { + // SAFETY: We asserted above that the length of StructArray is larger than `idx`. 
Since + // StructArray has the invariant that each array is the same length. This is okay to do + // now. + unsafe { arr_to_any_value(arr.as_ref(), idx, &field.dtype) }.into_static() + }) + .collect() } #[cfg(feature = "dtype-categorical")] @@ -1397,20 +1476,6 @@ fn same_revmap( } } -#[cfg(feature = "dtype-struct")] -fn struct_av_iter<'a>( - idx: usize, - arr: &'a StructArray, - fields: &'a [Field], -) -> impl Iterator> { - let arrs = arr.values(); - (0..arrs.len()).map(move |i| unsafe { - let arr = &**arrs.get_unchecked_release(i); - let field = fields.get_unchecked_release(i); - arr_to_any_value(arr, idx, &field.dtype) - }) -} - pub trait GetAnyValue { /// # Safety /// diff --git a/crates/polars-core/src/frame/column/mod.rs b/crates/polars-core/src/frame/column/mod.rs index 2296b09a4a03..b39f66b543ea 100644 --- a/crates/polars-core/src/frame/column/mod.rs +++ b/crates/polars-core/src/frame/column/mod.rs @@ -8,7 +8,7 @@ use polars_utils::pl_str::PlSmallStr; use self::gather::check_bounds_ca; use crate::chunked_array::cast::CastOptions; -use crate::chunked_array::metadata::MetadataFlags; +use crate::chunked_array::metadata::{MetadataFlags, MetadataTrait}; use crate::prelude::*; use crate::series::{BitRepr, IsSorted, SeriesPhysIter}; use crate::utils::{slice_offsets, Container}; @@ -581,6 +581,14 @@ impl Column { } } + pub fn get_metadata<'a>(&'a self) -> Option> { + match self { + Column::Series(s) => s.boxed_metadata(), + // @scalar-opt + Column::Scalar(_) => None, + } + } + pub fn get_data_ptr(&self) -> usize { // @scalar-opt self.as_materialized_series().get_data_ptr() @@ -782,27 +790,37 @@ impl Column { } pub fn gather_every(&self, n: usize, offset: usize) -> Column { - // @scalar-opt - self.as_materialized_series().gather_every(n, offset).into() + if self.len().saturating_sub(offset) == 0 { + return self.clear(); + } + + match self { + Column::Series(s) => s.gather_every(n, offset).into(), + Column::Scalar(s) => s.resize(s.length - offset / n).into(), + } } 
pub fn extend_constant(&self, value: AnyValue, n: usize) -> PolarsResult { - self.as_materialized_series() - .extend_constant(value, n) - .map(Column::from) - // @scalar-opt: This currently fails because Scalar::partial_cmp cannot deal with Nulls - // - // match self { - // Column::Series(s) => s.extend_constant(value, n).map(Column::from), - // Column::Scalar(s) => { - // if s.scalar.as_any_value() == value && s.len() > 0 { - // Ok(s.resize(s.len() + n).into()) - // } else { - // // @scalar-opt - // s.as_materialized_series().extend_constant(value, n).map(Column::from) - // } - // }, - // } + if self.is_empty() { + return Ok(Self::new_scalar( + self.name().clone(), + Scalar::new(self.dtype().clone(), value.into_static()), + n, + )); + } + + match self { + Column::Series(s) => s.extend_constant(value, n).map(Column::from), + Column::Scalar(s) => { + if s.scalar.as_any_value() == value { + Ok(s.resize(s.len() + n).into()) + } else { + s.as_materialized_series() + .extend_constant(value, n) + .map(Column::from) + } + }, + } } pub fn is_finite(&self) -> PolarsResult { @@ -994,14 +1012,11 @@ impl From for Column { fn from(series: Series) -> Self { if series.len() == 1 { // SAFETY: We just did the bounds check - let value = unsafe { series.get_unchecked(0) }; - - if let Ok(value) = value.into_static() { - let value = Scalar::new(series.dtype().clone(), value); - let mut col = ScalarColumn::new(series.name().clone(), value, 1); - col.materialized = OnceLock::from(series); - return Self::Scalar(col); - } + let value = unsafe { series.get_unchecked(0) }.into_static(); + let value = Scalar::new(series.dtype().clone(), value); + let mut col = ScalarColumn::new(series.name().clone(), value, 1); + col.materialized = OnceLock::from(series); + return Self::Scalar(col); } Self::Series(series) @@ -1105,7 +1120,7 @@ impl ScalarColumn { pub fn from_single_value_series(series: Series, length: usize) -> PolarsResult { debug_assert_eq!(series.len(), 1); let value = series.get(0)?; - 
let value = value.into_static()?; + let value = value.into_static(); let value = Scalar::new(series.dtype().clone(), value); Ok(ScalarColumn::new(series.name().clone(), value, length)) } @@ -1114,6 +1129,14 @@ impl ScalarColumn { /// /// This reuses the materialized [`Series`], if `length <= self.length`. pub fn resize(&self, length: usize) -> ScalarColumn { + if self.length == length { + return self.clone(); + } + + // This violates an invariant if it triggers: the scalar value is undefined if + // self.length == 0, so we should never resize using that value. + debug_assert_ne!(self.length, 0); + let mut resized = Self { name: self.name.clone(), scalar: self.scalar.clone(), @@ -1145,7 +1168,7 @@ impl ScalarColumn { Self::new_empty(materialized.name().clone(), materialized.dtype().clone()) } else { // SAFETY: Just did bounds check - let scalar = unsafe { materialized.get_unchecked(0) }.into_static()?; + let scalar = unsafe { materialized.get_unchecked(0) }.into_static(); Self::new( materialized.name().clone(), Scalar::new(materialized.dtype().clone(), scalar), @@ -1193,7 +1216,7 @@ impl ScalarColumn { Self::new_empty(materialized.name().clone(), materialized.dtype().clone()) } else { // SAFETY: Just did bounds check - let scalar = unsafe { materialized.get_unchecked(0) }.into_static()?; + let scalar = unsafe { materialized.get_unchecked(0) }.into_static(); Self::new( materialized.name().clone(), Scalar::new(materialized.dtype().clone(), scalar), diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 444e98390854..236bd362a02a 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -6,6 +6,7 @@ use std::{mem, ops}; use polars_utils::itertools::Itertools; use rayon::prelude::*; +use crate::chunked_array::metadata::MetadataFlags; #[cfg(feature = "algorithm_group_by")] use crate::chunked_array::ops::unique::is_unique_helper; use crate::prelude::*; @@ -1920,6 +1921,77 @@ impl 
DataFrame { Ok(df) } + /// Create a `DataFrame` that has fields for all the known runtime metadata for each column. + /// + /// This dataframe does not necessarily have a specified schema and may be changed at any + /// point. It is primarily used for debugging. + pub fn _to_metadata(&self) -> DataFrame { + let num_columns = self.columns.len(); + + let mut column_names = + StringChunkedBuilder::new(PlSmallStr::from_static("column_name"), num_columns); + let mut repr_ca = StringChunkedBuilder::new(PlSmallStr::from_static("repr"), num_columns); + let mut sorted_asc_ca = + BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_asc"), num_columns); + let mut sorted_dsc_ca = + BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_dsc"), num_columns); + let mut fast_explode_list_ca = + BooleanChunkedBuilder::new(PlSmallStr::from_static("fast_explode_list"), num_columns); + let mut min_value_ca = + StringChunkedBuilder::new(PlSmallStr::from_static("min_value"), num_columns); + let mut max_value_ca = + StringChunkedBuilder::new(PlSmallStr::from_static("max_value"), num_columns); + let mut distinct_count_ca: Vec> = Vec::with_capacity(num_columns); + + for col in &self.columns { + let metadata = col.get_metadata(); + + let (flags, min_value, max_value, distinct_count) = + metadata.map_or((MetadataFlags::default(), None, None, None), |md| { + ( + md.get_flags(), + md.min_value(), + md.max_value(), + md.distinct_count(), + ) + }); + + let repr = match col { + Column::Series(_) => "series", + Column::Scalar(_) => "scalar", + }; + let sorted_asc = flags.contains(MetadataFlags::SORTED_ASC); + let sorted_dsc = flags.contains(MetadataFlags::SORTED_DSC); + let fast_explode_list = flags.contains(MetadataFlags::FAST_EXPLODE_LIST); + + column_names.append_value(col.name().clone()); + repr_ca.append_value(repr); + sorted_asc_ca.append_value(sorted_asc); + sorted_dsc_ca.append_value(sorted_dsc); + fast_explode_list_ca.append_value(fast_explode_list); + 
min_value_ca.append_option(min_value.map(|v| v.as_any_value().to_string())); + max_value_ca.append_option(max_value.map(|v| v.as_any_value().to_string())); + distinct_count_ca.push(distinct_count); + } + + unsafe { + DataFrame::new_no_checks(vec![ + column_names.finish().into_column(), + repr_ca.finish().into_column(), + sorted_asc_ca.finish().into_column(), + sorted_dsc_ca.finish().into_column(), + fast_explode_list_ca.finish().into_column(), + min_value_ca.finish().into_column(), + max_value_ca.finish().into_column(), + IdxCa::from_slice_options( + PlSmallStr::from_static("distinct_count"), + &distinct_count_ca[..], + ) + .into_column(), + ]) + } + } + /// Return a sorted clone of this [`DataFrame`]. /// /// # Example diff --git a/crates/polars-core/src/frame/row/av_buffer.rs b/crates/polars-core/src/frame/row/av_buffer.rs index f46332021ef1..5d8da9c55666 100644 --- a/crates/polars-core/src/frame/row/av_buffer.rs +++ b/crates/polars-core/src/frame/row/av_buffer.rs @@ -499,7 +499,7 @@ impl<'a> AnyValueBufferTrusted<'a> { } } }, - All(_, vals) => vals.push(val.clone().into_static().unwrap()), + All(_, vals) => vals.push(val.clone().into_static()), _ => self.add_physical(val), } }, @@ -540,7 +540,7 @@ impl<'a> AnyValueBufferTrusted<'a> { } } }, - All(_, vals) => vals.push(val.clone().into_static().unwrap()), + All(_, vals) => vals.push(val.clone().into_static()), _ => self.add_physical(val), } }, diff --git a/crates/polars-core/src/series/implementations/boolean.rs b/crates/polars-core/src/series/implementations/boolean.rs index 30c78b95943d..a4e9d662226d 100644 --- a/crates/polars-core/src/series/implementations/boolean.rs +++ b/crates/polars-core/src/series/implementations/boolean.rs @@ -103,7 +103,11 @@ impl private::PrivateSeries for SeriesWrap { impl SeriesTrait for SeriesWrap { fn get_metadata(&self) -> Option> { - self.metadata_dyn() + self.0.metadata_dyn() + } + + fn boxed_metadata<'a>(&'a self) -> Option> { + Some(self.0.boxed_metadata_dyn()) } fn 
bitxor(&self, other: &Series) -> PolarsResult { diff --git a/crates/polars-core/src/series/implementations/date.rs b/crates/polars-core/src/series/implementations/date.rs index 479478a94530..a2ef6ed0788c 100644 --- a/crates/polars-core/src/series/implementations/date.rs +++ b/crates/polars-core/src/series/implementations/date.rs @@ -144,6 +144,14 @@ impl SeriesTrait for SeriesWrap { self.0.rename(name); } + fn get_metadata(&self) -> Option> { + self.0.metadata_dyn() + } + + fn boxed_metadata<'a>(&'a self) -> Option> { + Some(self.0.boxed_metadata_dyn()) + } + fn chunk_lengths(&self) -> ChunkLenIter { self.0.chunk_lengths() } @@ -320,13 +328,13 @@ impl SeriesTrait for SeriesWrap { fn max_reduce(&self) -> PolarsResult { let sc = self.0.max_reduce(); - let av = sc.value().cast(self.dtype()).into_static().unwrap(); + let av = sc.value().cast(self.dtype()).into_static(); Ok(Scalar::new(self.dtype().clone(), av)) } fn min_reduce(&self) -> PolarsResult { let sc = self.0.min_reduce(); - let av = sc.value().cast(self.dtype()).into_static().unwrap(); + let av = sc.value().cast(self.dtype()).into_static(); Ok(Scalar::new(self.dtype().clone(), av)) } diff --git a/crates/polars-core/src/series/implementations/floats.rs b/crates/polars-core/src/series/implementations/floats.rs index de349c2a22f5..6f83811561e3 100644 --- a/crates/polars-core/src/series/implementations/floats.rs +++ b/crates/polars-core/src/series/implementations/floats.rs @@ -166,7 +166,11 @@ macro_rules! 
impl_dyn_series { } fn get_metadata(&self) -> Option> { - self.metadata_dyn() + self.0.metadata_dyn() + } + + fn boxed_metadata<'a>(&'a self) -> Option> { + Some(self.0.boxed_metadata_dyn()) } fn rename(&mut self, name: PlSmallStr) { diff --git a/crates/polars-core/src/series/implementations/mod.rs b/crates/polars-core/src/series/implementations/mod.rs index 4116df5a42fa..6094dff6a838 100644 --- a/crates/polars-core/src/series/implementations/mod.rs +++ b/crates/polars-core/src/series/implementations/mod.rs @@ -239,7 +239,11 @@ macro_rules! impl_dyn_series { } fn get_metadata(&self) -> Option> { - self.metadata_dyn() + self.0.metadata_dyn() + } + + fn boxed_metadata<'a>(&'a self) -> Option> { + Some(self.0.boxed_metadata_dyn()) } fn bitand(&self, other: &Series) -> PolarsResult { diff --git a/crates/polars-core/src/series/implementations/time.rs b/crates/polars-core/src/series/implementations/time.rs index ed810d34b3f4..870efc27de7e 100644 --- a/crates/polars-core/src/series/implementations/time.rs +++ b/crates/polars-core/src/series/implementations/time.rs @@ -290,21 +290,20 @@ impl SeriesTrait for SeriesWrap { fn max_reduce(&self) -> PolarsResult { let sc = self.0.max_reduce(); - let av = sc.value().cast(self.dtype()).into_static().unwrap(); + let av = sc.value().cast(self.dtype()).into_static(); Ok(Scalar::new(self.dtype().clone(), av)) } fn min_reduce(&self) -> PolarsResult { let sc = self.0.min_reduce(); - let av = sc.value().cast(self.dtype()).into_static().unwrap(); + let av = sc.value().cast(self.dtype()).into_static(); Ok(Scalar::new(self.dtype().clone(), av)) } fn median_reduce(&self) -> PolarsResult { let av = AnyValue::from(self.median().map(|v| v as i64)) .cast(self.dtype()) - .into_static() - .unwrap(); + .into_static(); Ok(Scalar::new(self.dtype().clone(), av)) } diff --git a/crates/polars-core/src/series/series_trait.rs b/crates/polars-core/src/series/series_trait.rs index 2804c5ce1840..d9e11e5c5e8c 100644 --- 
a/crates/polars-core/src/series/series_trait.rs +++ b/crates/polars-core/src/series/series_trait.rs @@ -209,6 +209,10 @@ pub trait SeriesTrait: None } + fn boxed_metadata<'a>(&'a self) -> Option> { + None + } + /// Get the lengths of the underlying chunks fn chunk_lengths(&self) -> ChunkLenIter; diff --git a/crates/polars-lazy/src/dsl/eval.rs b/crates/polars-lazy/src/dsl/eval.rs index dcb4853f0671..b25a30240a41 100644 --- a/crates/polars-lazy/src/dsl/eval.rs +++ b/crates/polars-lazy/src/dsl/eval.rs @@ -72,7 +72,7 @@ pub trait ExprEvalExtension: IntoExpr + Sized { "expected single value, got a result with length {}, {:?}", out.len(), out, ); - Ok(out.get(0).unwrap().into_static().unwrap()) + Ok(out.get(0).unwrap().into_static()) }; let avs = if parallel { diff --git a/crates/polars-pipe/src/executors/sinks/group_by/aggregates/first.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/first.rs index 60b50b144aa0..433e5a54adef 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/aggregates/first.rs +++ b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/first.rs @@ -28,7 +28,7 @@ impl AggregateFn for FirstAgg { let item = unsafe { item.next().unwrap_unchecked_release() }; if self.first.is_none() { self.chunk_idx = chunk_idx; - self.first = Some(item.into_static().unwrap()) + self.first = Some(item.into_static()) } } fn pre_agg_ordered( @@ -40,11 +40,7 @@ impl AggregateFn for FirstAgg { ) { if self.first.is_none() { self.chunk_idx = chunk_idx; - self.first = Some( - unsafe { values.get_unchecked(offset as usize) } - .into_static() - .unwrap(), - ) + self.first = Some(unsafe { values.get_unchecked(offset as usize) }.into_static()) } } diff --git a/crates/polars-pipe/src/executors/sinks/group_by/aggregates/last.rs b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/last.rs index 2a659d1aea01..c808fe815cbb 100644 --- a/crates/polars-pipe/src/executors/sinks/group_by/aggregates/last.rs +++ 
b/crates/polars-pipe/src/executors/sinks/group_by/aggregates/last.rs @@ -27,7 +27,7 @@ impl AggregateFn for LastAgg { fn pre_agg(&mut self, chunk_idx: IdxSize, item: &mut dyn ExactSizeIterator) { let item = unsafe { item.next().unwrap_unchecked_release() }; self.chunk_idx = chunk_idx; - self.last = Some(unsafe { item.into_static().unwrap_unchecked() }); + self.last = Some(item.into_static()); } fn pre_agg_ordered( &mut self, @@ -37,12 +37,8 @@ impl AggregateFn for LastAgg { values: &Series, ) { self.chunk_idx = chunk_idx; - self.last = Some(unsafe { - values - .get_unchecked((offset + length - 1) as usize) - .into_static() - .unwrap_unchecked() - }) + self.last = + Some(unsafe { values.get_unchecked((offset + length - 1) as usize) }.into_static()) } fn dtype(&self) -> DataType { diff --git a/crates/polars-pipe/src/executors/sinks/sort/sink.rs b/crates/polars-pipe/src/executors/sinks/sort/sink.rs index b6c5316485b7..43589c9783a1 100644 --- a/crates/polars-pipe/src/executors/sinks/sort/sink.rs +++ b/crates/polars-pipe/src/executors/sinks/sort/sink.rs @@ -124,7 +124,7 @@ impl SortSink { // SAFETY: we just asserted height > 0 let sample = unsafe { let s = &df.get_columns()[self.sort_idx]; - s.to_physical_repr().get_unchecked(0).into_static().unwrap() + s.to_physical_repr().get_unchecked(0).into_static() }; self.dist_sample.push(sample); diff --git a/crates/polars-python/src/dataframe/general.rs b/crates/polars-python/src/dataframe/general.rs index ff635c08898b..78727ffefd33 100644 --- a/crates/polars-python/src/dataframe/general.rs +++ b/crates/polars-python/src/dataframe/general.rs @@ -337,6 +337,12 @@ impl PyDataFrame { Ok(df.into()) } + pub fn _to_metadata(&self) -> Self { + Self { + df: self.df._to_metadata(), + } + } + pub fn group_by_map_groups( &self, by: Vec, diff --git a/crates/polars-python/src/functions/lazy.rs b/crates/polars-python/src/functions/lazy.rs index d649b7be4cba..7e3ea213667e 100644 --- a/crates/polars-python/src/functions/lazy.rs +++ 
b/crates/polars-python/src/functions/lazy.rs @@ -451,7 +451,7 @@ pub fn lit(value: &Bound<'_, PyAny>, allow_object: bool, is_scalar: bool) -> PyR let av = s .get(0) .map_err(|_| PyValueError::new_err("expected at least 1 value"))?; - let av = av.into_static().map_err(PyPolarsErr::from)?; + let av = av.into_static(); Ok(dsl::lit(Scalar::new(s.dtype().clone(), av)).into()) } else { Ok(dsl::lit(s).into()) diff --git a/crates/polars-python/src/lazyframe/general.rs b/crates/polars-python/src/lazyframe/general.rs index 86bcd3c2566b..57c31d0b05da 100644 --- a/crates/polars-python/src/lazyframe/general.rs +++ b/crates/polars-python/src/lazyframe/general.rs @@ -881,7 +881,7 @@ impl PyLazyFrame { strategy: strategy.0, left_by: left_by.map(strings_to_pl_smallstr), right_by: right_by.map(strings_to_pl_smallstr), - tolerance: tolerance.map(|t| t.0.into_static().unwrap()), + tolerance: tolerance.map(|t| t.0.into_static()), tolerance_str: tolerance_str.map(|s| s.into()), })) .suffix(suffix) diff --git a/crates/polars-sql/src/context.rs b/crates/polars-sql/src/context.rs index c5d9f4a371b3..e95bda62916d 100644 --- a/crates/polars-sql/src/context.rs +++ b/crates/polars-sql/src/context.rs @@ -451,7 +451,7 @@ impl SQLContext { Expr::Literal(value) => { value.to_any_value() .ok_or_else(|| polars_err!(SQLInterface: "invalid literal value: {:?}", value)) - .map(|av| av.into_static().unwrap()) + .map(|av| av.into_static()) }, _ => polars_bail!(SQLInterface: "VALUES clause expects literals; found {}", expr), } diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index a7178fb2bf77..7fafdd058735 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -11223,6 +11223,44 @@ def melt( value_name=value_name, ) + def _to_metadata( + self, + columns: None | str | list[str] = None, + stats: None | str | list[str] = None, + ) -> DataFrame: + """ + Get all runtime metadata for each column. 
+ + This is unstable and is meant for debugging purposes. + + Parameters + ---------- + columns + Column(s) to show the information for + stats + Statistics to show + """ + df = self + + if columns is not None: + if isinstance(columns, str): + columns = [columns] + + df = df.select(columns) + + md = self._from_pydf(df._df._to_metadata()) + + if stats is not None: + if isinstance(stats, str): + stats = [stats] + + if "column_name" not in stats: + stats = ["column_name"] + stats + + md = md.select(stats) + + return md + def _prepare_other_arg(other: Any, length: int | None = None) -> Series: # if not a series create singleton series such that it will broadcast diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index cf98067d1809..b26160bc7e75 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -6888,3 +6888,23 @@ def melt( value_name=value_name, streamable=streamable, ) + + def _to_metadata( + self, + columns: None | str | list[str] = None, + stats: None | str | list[str] = None, + ) -> DataFrame: + """ + Get all runtime metadata for each column. + + This is unstable and is meant for debugging purposes. + """ + lf = self + + if columns is not None: + if isinstance(columns, str): + columns = [columns] + + lf = lf.select(columns) + + return lf.collect()._to_metadata(stats=stats)