diff --git a/Cargo.lock b/Cargo.lock index 0a88b89e2edb..7aca7dc3acc5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3183,6 +3183,7 @@ dependencies = [ "polars-error", "polars-json", "polars-parquet", + "polars-schema", "polars-time", "polars-utils", "rayon", @@ -3317,6 +3318,7 @@ dependencies = [ "fallible-streaming-iterator", "flate2", "futures", + "hashbrown", "lz4", "lz4_flex", "num-traits", @@ -3366,6 +3368,7 @@ dependencies = [ "ahash", "bitflags", "bytemuck", + "bytes", "chrono", "chrono-tz", "ciborium", @@ -3402,6 +3405,7 @@ dependencies = [ "ahash", "arboard", "bytemuck", + "bytes", "ciborium", "either", "itoa", diff --git a/Makefile b/Makefile index 524e04eddc2c..ae13f7a525bc 100644 --- a/Makefile +++ b/Makefile @@ -21,7 +21,7 @@ FILTER_PIP_WARNINGS=| grep -v "don't match your environment"; test $${PIPESTATUS requirements: .venv ## Install/refresh Python project requirements @unset CONDA_PREFIX \ && $(VENV_BIN)/python -m pip install --upgrade uv \ - && $(VENV_BIN)/uv pip install --upgrade --compile-bytecode \ + && $(VENV_BIN)/uv pip install --upgrade --compile-bytecode --no-build \ -r py-polars/requirements-dev.txt \ -r py-polars/requirements-lint.txt \ -r py-polars/docs/requirements-docs.txt \ diff --git a/crates/polars-core/src/chunked_array/from_iterator.rs b/crates/polars-core/src/chunked_array/from_iterator.rs index 766ef94acc8e..72f2bc8c60cb 100644 --- a/crates/polars-core/src/chunked_array/from_iterator.rs +++ b/crates/polars-core/src/chunked_array/from_iterator.rs @@ -2,7 +2,7 @@ use std::borrow::{Borrow, Cow}; #[cfg(feature = "object")] -use arrow::bitmap::{Bitmap, MutableBitmap}; +use arrow::bitmap::MutableBitmap; use crate::chunked_array::builder::{get_list_builder, AnonymousOwnedListBuilder}; #[cfg(feature = "object")] @@ -268,17 +268,7 @@ impl FromIterator> for ObjectChunked { }) .collect(); - let null_bit_buffer: Option = null_mask_builder.into(); - let null_bitmap = null_bit_buffer; - - let len = values.len(); - - let arr = 
Box::new(ObjectArray { - values: Arc::new(values), - null_bitmap, - offset: 0, - len, - }); + let arr = Box::new(ObjectArray::from(values).with_validity(null_mask_builder.into())); ChunkedArray::new_with_compute_len( Arc::new(Field::new(PlSmallStr::EMPTY, get_object_type::())), vec![arr], diff --git a/crates/polars-core/src/chunked_array/iterator/mod.rs b/crates/polars-core/src/chunked_array/iterator/mod.rs index 7756153891c6..728ffc5a8cff 100644 --- a/crates/polars-core/src/chunked_array/iterator/mod.rs +++ b/crates/polars-core/src/chunked_array/iterator/mod.rs @@ -432,7 +432,7 @@ impl ObjectChunked { // we know that we only iterate over length == self.len() unsafe { self.downcast_iter() - .flat_map(|arr| arr.values().iter()) + .flat_map(|arr| arr.values_iter()) .trust_my_length(self.len()) } } diff --git a/crates/polars-core/src/chunked_array/mod.rs b/crates/polars-core/src/chunked_array/mod.rs index 8b68e0c9ef85..c59b520bf8e8 100644 --- a/crates/polars-core/src/chunked_array/mod.rs +++ b/crates/polars-core/src/chunked_array/mod.rs @@ -690,6 +690,14 @@ where } } + #[inline] + pub fn first(&self) -> Option> { + unsafe { + let arr = self.downcast_get_unchecked(0); + arr.get_unchecked(0) + } + } + #[inline] pub fn last(&self) -> Option> { unsafe { @@ -950,9 +958,12 @@ pub(crate) fn to_array( impl Default for ChunkedArray { fn default() -> Self { + let dtype = T::get_dtype(); + let arrow_dtype = dtype.to_physical().to_arrow(CompatLevel::newest()); ChunkedArray { - field: Arc::new(Field::new(PlSmallStr::EMPTY, DataType::Null)), - chunks: Default::default(), + field: Arc::new(Field::new(PlSmallStr::EMPTY, dtype)), + // Invariant: always has 1 chunk. 
+ chunks: vec![new_empty_array(arrow_dtype)], md: Arc::new(IMMetadata::default()), length: 0, null_count: 0, diff --git a/crates/polars-core/src/chunked_array/object/builder.rs b/crates/polars-core/src/chunked_array/object/builder.rs index 01524c018ec2..45f63847e97f 100644 --- a/crates/polars-core/src/chunked_array/object/builder.rs +++ b/crates/polars-core/src/chunked_array/object/builder.rs @@ -61,10 +61,8 @@ where .unwrap_or(0) as IdxSize; let arr = Box::new(ObjectArray { - values: Arc::new(self.values), - null_bitmap, - offset: 0, - len, + values: self.values.into(), + validity: null_bitmap, }); self.field.dtype = get_object_type::(); @@ -140,10 +138,8 @@ where let field = Arc::new(Field::new(name, DataType::Object(T::type_name(), None))); let len = v.len(); let arr = Box::new(ObjectArray { - values: Arc::new(v), - null_bitmap: None, - offset: 0, - len, + values: v.into(), + validity: None, }); unsafe { ObjectChunked::new_with_dims(field, vec![arr], len as IdxSize, 0) } @@ -154,10 +150,8 @@ where let len = v.len(); let null_count = validity.unset_bits(); let arr = Box::new(ObjectArray { - values: Arc::new(v), - null_bitmap: Some(validity), - offset: 0, - len, + values: v.into(), + validity: Some(validity), }); unsafe { diff --git a/crates/polars-core/src/chunked_array/object/mod.rs b/crates/polars-core/src/chunked_array/object/mod.rs index 1b018800dd98..a7e3d2f9952d 100644 --- a/crates/polars-core/src/chunked_array/object/mod.rs +++ b/crates/polars-core/src/chunked_array/object/mod.rs @@ -4,6 +4,7 @@ use std::hash::Hash; use arrow::bitmap::utils::{BitmapIter, ZipValidity}; use arrow::bitmap::{Bitmap, MutableBitmap}; +use arrow::buffer::Buffer; use polars_utils::total_ord::TotalHash; use crate::prelude::*; @@ -22,10 +23,8 @@ pub struct ObjectArray where T: PolarsObject, { - pub(crate) values: Arc>, - pub(crate) null_bitmap: Option, - pub(crate) offset: usize, - pub(crate) len: usize, + values: Buffer, + validity: Option, } /// Trimmed down object safe polars 
object @@ -80,23 +79,18 @@ impl ObjectArray where T: PolarsObject, { - /// Get a reference to the underlying data - pub fn values(&self) -> &Arc> { - &self.values - } - pub fn values_iter(&self) -> ObjectValueIter<'_, T> { self.values.iter() } /// Returns an iterator of `Option<&T>` over every element of this array. pub fn iter(&self) -> ZipValidity<&T, ObjectValueIter<'_, T>, BitmapIter> { - ZipValidity::new_with_validity(self.values_iter(), self.null_bitmap.as_ref()) + ZipValidity::new_with_validity(self.values_iter(), self.validity.as_ref()) } /// Get a value at a certain index location pub fn value(&self, index: usize) -> &T { - &self.values[self.offset + index] + &self.values[index] } pub fn get(&self, index: usize) -> Option<&T> { @@ -123,7 +117,7 @@ where /// No bounds checks #[inline] pub unsafe fn is_valid_unchecked(&self, i: usize) -> bool { - if let Some(b) = &self.null_bitmap { + if let Some(b) = &self.validity { b.get_bit_unchecked(i) } else { true @@ -157,7 +151,7 @@ where if matches!(&validity, Some(bitmap) if bitmap.len() != self.len()) { panic!("validity must be equal to the array's length") } - self.null_bitmap = validity; + self.validity = validity; } } @@ -182,10 +176,12 @@ where } unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) { - let len = std::cmp::min(self.len - offset, length); - - self.len = len; - self.offset = offset; + self.validity = self + .validity + .take() + .map(|bitmap| bitmap.sliced_unchecked(offset, length)) + .filter(|bitmap| bitmap.unset_bits() > 0); + self.values.slice_unchecked(offset, length); } fn split_at_boxed(&self, offset: usize) -> (Box, Box) { @@ -199,11 +195,11 @@ where } fn len(&self) -> usize { - self.len + self.values.len() } fn validity(&self) -> Option<&Bitmap> { - self.null_bitmap.as_ref() + self.validity.as_ref() } fn with_validity(&self, validity: Option) -> Box { @@ -219,7 +215,7 @@ where } fn null_count(&self) -> usize { - match &self.null_bitmap { + match &self.validity { None => 0, 
Some(validity) => validity.unset_bits(), } @@ -232,18 +228,16 @@ impl Splitable for ObjectArray { } unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) { + let (left_values, right_values) = unsafe { self.values.split_at_unchecked(offset) }; + let (left_validity, right_validity) = unsafe { self.validity.split_at_unchecked(offset) }; ( Self { - values: self.values.clone(), - null_bitmap: self.null_bitmap.clone(), - len: offset, - offset: self.offset, + values: left_values, + validity: left_validity, }, Self { - values: self.values.clone(), - null_bitmap: self.null_bitmap.clone(), - len: self.len() - offset, - offset: self.offset + offset, + values: right_values, + validity: right_validity, }, ) } @@ -273,10 +267,8 @@ impl StaticArray for ObjectArray { fn full_null(length: usize, _dtype: ArrowDataType) -> Self { ObjectArray { - values: Arc::new(vec![T::default(); length]), - null_bitmap: Some(Bitmap::new_with_value(false, length)), - offset: 0, - len: length, + values: vec![T::default(); length].into(), + validity: Some(Bitmap::new_with_value(false, length)), } } } @@ -324,3 +316,12 @@ where } } } + +impl From> for ObjectArray { + fn from(values: Vec) -> Self { + Self { + values: values.into(), + validity: None, + } + } +} diff --git a/crates/polars-core/src/datatypes/any_value.rs b/crates/polars-core/src/datatypes/any_value.rs index 5475b8fb5203..c3f5f57e0c68 100644 --- a/crates/polars-core/src/datatypes/any_value.rs +++ b/crates/polars-core/src/datatypes/any_value.rs @@ -499,6 +499,14 @@ impl<'a> AnyValue<'a> { ) } + pub fn is_nan(&self) -> bool { + match self { + AnyValue::Float32(f) => f.is_nan(), + AnyValue::Float64(f) => f.is_nan(), + _ => false, + } + } + pub fn is_null(&self) -> bool { matches!(self, AnyValue::Null) } diff --git a/crates/polars-core/src/datatypes/static_array_collect.rs b/crates/polars-core/src/datatypes/static_array_collect.rs index 02974d7b33a8..5974b3049a0d 100644 --- 
a/crates/polars-core/src/datatypes/static_array_collect.rs +++ b/crates/polars-core/src/datatypes/static_array_collect.rs @@ -1,7 +1,4 @@ -use std::sync::Arc; - use arrow::array::ArrayFromIter; -use arrow::bitmap::Bitmap; use crate::chunked_array::object::{ObjectArray, PolarsObject}; @@ -41,14 +38,6 @@ impl<'a, T: PolarsObject> ArrayFromIter> for ObjectArray { }) .collect::, E>>()?; - let null_bit_buffer: Option = null_mask_builder.into(); - let null_bitmap = null_bit_buffer; - let len = values.len(); - Ok(ObjectArray { - values: Arc::new(values), - null_bitmap, - offset: 0, - len, - }) + Ok(ObjectArray::from(values).with_validity(null_mask_builder.into())) } } diff --git a/crates/polars-core/src/scalar/mod.rs b/crates/polars-core/src/scalar/mod.rs index ed0ab1f98bfa..ac7a946ebebc 100644 --- a/crates/polars-core/src/scalar/mod.rs +++ b/crates/polars-core/src/scalar/mod.rs @@ -22,6 +22,11 @@ impl Scalar { self.value.is_null() } + #[inline(always)] + pub fn is_nan(&self) -> bool { + self.value.is_nan() + } + #[inline(always)] pub fn value(&self) -> &AnyValue<'static> { &self.value diff --git a/crates/polars-core/src/schema.rs b/crates/polars-core/src/schema.rs index b2ec55b528c7..d100cf91172f 100644 --- a/crates/polars-core/src/schema.rs +++ b/crates/polars-core/src/schema.rs @@ -90,97 +90,46 @@ impl SchemaExt for Schema { } } -/// This trait exists to be unify the API of polars Schema and arrows Schema. -pub trait IndexOfSchema: Debug { - /// Get the index of a column by name. - fn index_of(&self, name: &str) -> Option; - - /// Get a vector of all column names. 
- fn get_names(&self) -> Vec<&PlSmallStr>; - - fn get_names_str(&self) -> Vec<&str>; - - fn get_names_owned(&self) -> Vec; - - fn try_index_of(&self, name: &str) -> PolarsResult { - self.index_of(name).ok_or_else(|| { - polars_err!( - ColumnNotFound: - "unable to find column {:?}; valid columns: {:?}", name, self.get_names(), - ) - }) - } -} - -impl IndexOfSchema for Schema { - fn index_of(&self, name: &str) -> Option { - self.index_of(name) - } - - fn get_names(&self) -> Vec<&PlSmallStr> { - self.iter_names().collect() - } - - fn get_names_owned(&self) -> Vec { - self.iter_names().cloned().collect() - } +pub trait SchemaNamesAndDtypes { + const IS_ARROW: bool; + type DataType: Debug + Clone + Default + PartialEq; - fn get_names_str(&self) -> Vec<&str> { - self.iter_names().map(|x| x.as_str()).collect() - } + fn iter_names_and_dtypes( + &self, + ) -> impl ExactSizeIterator; } -impl IndexOfSchema for ArrowSchema { - fn index_of(&self, name: &str) -> Option { - self.iter_values().position(|f| f.name.as_str() == name) - } - - fn get_names(&self) -> Vec<&PlSmallStr> { - self.iter_values().map(|f| &f.name).collect() - } - - fn get_names_owned(&self) -> Vec { - self.iter_values().map(|f| f.name.clone()).collect() - } +impl SchemaNamesAndDtypes for ArrowSchema { + const IS_ARROW: bool = true; + type DataType = ArrowDataType; - fn get_names_str(&self) -> Vec<&str> { - self.iter_values().map(|f| f.name.as_str()).collect() + fn iter_names_and_dtypes( + &self, + ) -> impl ExactSizeIterator { + self.iter_values().map(|x| (&x.name, &x.dtype)) } } -pub trait SchemaNamesAndDtypes { - const IS_ARROW: bool; - type DataType: Debug + PartialEq; - - /// Get a vector of (name, dtype) pairs - fn get_names_and_dtypes(&'_ self) -> Vec<(&'_ str, Self::DataType)>; -} - impl SchemaNamesAndDtypes for Schema { const IS_ARROW: bool = false; type DataType = DataType; - fn get_names_and_dtypes(&'_ self) -> Vec<(&'_ str, Self::DataType)> { + fn iter_names_and_dtypes( + &self, + ) -> impl 
ExactSizeIterator { self.iter() - .map(|(name, dtype)| (name.as_str(), dtype.clone())) - .collect() - } -} - -impl SchemaNamesAndDtypes for ArrowSchema { - const IS_ARROW: bool = true; - type DataType = ArrowDataType; - - fn get_names_and_dtypes(&'_ self) -> Vec<(&'_ str, Self::DataType)> { - self.iter_values() - .map(|x| (x.name.as_str(), x.dtype.clone())) - .collect() } } -pub fn ensure_matching_schema(lhs: &S, rhs: &S) -> PolarsResult<()> { - let lhs = lhs.get_names_and_dtypes(); - let rhs = rhs.get_names_and_dtypes(); +pub fn ensure_matching_schema( + lhs: &polars_schema::Schema, + rhs: &polars_schema::Schema, +) -> PolarsResult<()> +where + polars_schema::Schema: SchemaNamesAndDtypes, +{ + let lhs = lhs.iter_names_and_dtypes(); + let rhs = rhs.iter_names_and_dtypes(); if lhs.len() != rhs.len() { polars_bail!( @@ -190,7 +139,7 @@ pub fn ensure_matching_schema(lhs: &S, rhs: &S) -> Pola ); } - for (i, ((l_name, l_dtype), (r_name, r_dtype))) in lhs.iter().zip(&rhs).enumerate() { + for (i, ((l_name, l_dtype), (r_name, r_dtype))) in lhs.zip(rhs).enumerate() { if l_name != r_name { polars_bail!( SchemaMismatch: @@ -199,18 +148,20 @@ pub fn ensure_matching_schema(lhs: &S, rhs: &S) -> Pola ) } if l_dtype != r_dtype - && (!S::IS_ARROW + && (!polars_schema::Schema::::IS_ARROW || unsafe { // For timezone normalization. Easier than writing out the entire PartialEq. 
DataType::from_arrow( - std::mem::transmute::<&::DataType, &ArrowDataType>( - l_dtype, - ), + std::mem::transmute::< + & as SchemaNamesAndDtypes>::DataType, + &ArrowDataType, + >(l_dtype), true, ) != DataType::from_arrow( - std::mem::transmute::<&::DataType, &ArrowDataType>( - r_dtype, - ), + std::mem::transmute::< + & as SchemaNamesAndDtypes>::DataType, + &ArrowDataType, + >(r_dtype), true, ) }) diff --git a/crates/polars-expr/src/expressions/slice.rs b/crates/polars-expr/src/expressions/slice.rs index d2bc9137a7d3..579c8d66635e 100644 --- a/crates/polars-expr/src/expressions/slice.rs +++ b/crates/polars-expr/src/expressions/slice.rs @@ -60,9 +60,16 @@ fn check_argument(arg: &Series, groups: &GroupsProxy, name: &str, expr: &Expr) - Ok(()) } -fn slice_groups_idx(offset: i64, length: usize, first: IdxSize, idx: &[IdxSize]) -> IdxItem { +fn slice_groups_idx(offset: i64, length: usize, mut first: IdxSize, idx: &[IdxSize]) -> IdxItem { let (offset, len) = slice_offsets(offset, length, idx.len()); - (first + offset as IdxSize, idx[offset..offset + len].into()) + + // If slice isn't out of bounds, we replace first. + // If slice is oob, the `idx` vec will be empty and `first` will be ignored + if let Some(f) = idx.get(offset) { + first = *f; + } + // This is a clone of the vec, which is unfortunate. Maybe we have a `sliceable` unitvec one day. 
+ (first, idx[offset..offset + len].into()) } fn slice_groups_slice(offset: i64, length: usize, first: IdxSize, len: IdxSize) -> [IdxSize; 2] { diff --git a/crates/polars-expr/src/reduce/min_max.rs b/crates/polars-expr/src/reduce/min_max.rs index 5d514ce218c5..ba011d7d95f0 100644 --- a/crates/polars-expr/src/reduce/min_max.rs +++ b/crates/polars-expr/src/reduce/min_max.rs @@ -25,7 +25,9 @@ struct MinReduceState { impl MinReduceState { fn update_with_value(&mut self, other: &AnyValue<'static>) { - if self.value.is_null() || !other.is_null() && other < self.value.value() { + if self.value.is_null() + || !other.is_null() && (other < self.value.value() || self.value.is_nan()) + { self.value.update(other.clone()); } } @@ -78,7 +80,9 @@ struct MaxReduceState { impl MaxReduceState { fn update_with_value(&mut self, other: &AnyValue<'static>) { - if self.value.is_null() || !other.is_null() && other > self.value.value() { + if self.value.is_null() + || !other.is_null() && (other > self.value.value() || self.value.is_nan()) + { self.value.update(other.clone()); } } diff --git a/crates/polars-io/Cargo.toml b/crates/polars-io/Cargo.toml index 9eb2addc8be5..ca3d313e08ae 100644 --- a/crates/polars-io/Cargo.toml +++ b/crates/polars-io/Cargo.toml @@ -13,6 +13,7 @@ polars-core = { workspace = true } polars-error = { workspace = true } polars-json = { workspace = true, optional = true } polars-parquet = { workspace = true, optional = true } +polars-schema = { workspace = true } polars-time = { workspace = true, features = [], optional = true } polars-utils = { workspace = true, features = ['mmap'] } @@ -94,6 +95,7 @@ timezones = [ "dtype-datetime", "arrow/timezones", "polars-json?/chrono-tz", + "polars-json?/timezones", ] dtype-time = ["polars-core/dtype-time", "polars-core/temporal", "polars-time/dtype-time"] dtype-struct = ["polars-core/dtype-struct"] diff --git a/crates/polars-io/src/csv/read/mod.rs b/crates/polars-io/src/csv/read/mod.rs index 969be1a58908..b9d48291f8ce 100644 --- 
a/crates/polars-io/src/csv/read/mod.rs +++ b/crates/polars-io/src/csv/read/mod.rs @@ -26,7 +26,7 @@ mod splitfields; mod utils; pub use options::{CommentPrefix, CsvEncoding, CsvParseOptions, CsvReadOptions, NullValues}; -pub use parser::count_rows; +pub use parser::{count_rows, count_rows_from_slice}; pub use read_impl::batched::{BatchedCsvReader, OwnedBatchedCsvReader}; pub use reader::CsvReader; pub use schema_inference::infer_file_schema; diff --git a/crates/polars-io/src/csv/read/options.rs b/crates/polars-io/src/csv/read/options.rs index 7659565918ef..83b356fabde8 100644 --- a/crates/polars-io/src/csv/read/options.rs +++ b/crates/polars-io/src/csv/read/options.rs @@ -2,7 +2,7 @@ use std::path::PathBuf; use std::sync::Arc; use polars_core::datatypes::{DataType, Field}; -use polars_core::schema::{IndexOfSchema, Schema, SchemaRef}; +use polars_core::schema::{Schema, SchemaRef}; use polars_error::PolarsResult; use polars_utils::pl_str::PlSmallStr; #[cfg(feature = "serde")] diff --git a/crates/polars-io/src/csv/read/parser.rs b/crates/polars-io/src/csv/read/parser.rs index 18e6ef5f3f6d..282a304003a3 100644 --- a/crates/polars-io/src/csv/read/parser.rs +++ b/crates/polars-io/src/csv/read/parser.rs @@ -1,9 +1,10 @@ -use std::path::PathBuf; +use std::path::Path; use memchr::memchr2_iter; use num_traits::Pow; use polars_core::prelude::*; use polars_core::{config, POOL}; +use polars_error::feature_gated; use polars_utils::index::Bounded; use polars_utils::slice::GetSaferUnchecked; use rayon::prelude::*; @@ -18,7 +19,7 @@ use crate::utils::maybe_decompress_bytes; /// Read the number of rows without parsing columns /// useful for count(*) queries pub fn count_rows( - path: &PathBuf, + path: &Path, separator: u8, quote_char: Option, comment_prefix: Option<&CommentPrefix>, @@ -26,32 +27,47 @@ pub fn count_rows( has_header: bool, ) -> PolarsResult { let file = if is_cloud_url(path) || config::force_async() { - #[cfg(feature = "cloud")] - { + feature_gated!("cloud", { 
crate::file_cache::FILE_CACHE .get_entry(path.to_str().unwrap()) // Safety: This was initialized by schema inference. .unwrap() .try_open_assume_latest()? - } - #[cfg(not(feature = "cloud"))] - { - panic!("required feature `cloud` is not enabled") - } + }) } else { polars_utils::open_file(path)? }; let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; let owned = &mut vec![]; - let mut reader_bytes = maybe_decompress_bytes(mmap.as_ref(), owned)?; + let reader_bytes = maybe_decompress_bytes(mmap.as_ref(), owned)?; - for _ in 0..reader_bytes.len() { - if reader_bytes[0] != eol_char { + count_rows_from_slice( + reader_bytes, + separator, + quote_char, + comment_prefix, + eol_char, + has_header, + ) +} + +/// Read the number of rows without parsing columns +/// useful for count(*) queries +pub fn count_rows_from_slice( + mut bytes: &[u8], + separator: u8, + quote_char: Option, + comment_prefix: Option<&CommentPrefix>, + eol_char: u8, + has_header: bool, +) -> PolarsResult { + for _ in 0..bytes.len() { + if bytes[0] != eol_char { break; } - reader_bytes = &reader_bytes[1..]; + bytes = &bytes[1..]; } const MIN_ROWS_PER_THREAD: usize = 1024; @@ -59,7 +75,7 @@ pub fn count_rows( // Determine if parallelism is beneficial and how many threads let n_threads = get_line_stats( - reader_bytes, + bytes, MIN_ROWS_PER_THREAD, eol_char, None, @@ -67,22 +83,16 @@ pub fn count_rows( quote_char, ) .map(|(mean, std)| { - let n_rows = (reader_bytes.len() as f32 / (mean - 0.01 * std)) as usize; + let n_rows = (bytes.len() as f32 / (mean - 0.01 * std)) as usize; (n_rows / MIN_ROWS_PER_THREAD).clamp(1, max_threads) }) .unwrap_or(1); - let file_chunks: Vec<(usize, usize)> = get_file_chunks( - reader_bytes, - n_threads, - None, - separator, - quote_char, - eol_char, - ); + let file_chunks: Vec<(usize, usize)> = + get_file_chunks(bytes, n_threads, None, separator, quote_char, eol_char); let iter = file_chunks.into_par_iter().map(|(start, stop)| { - let local_bytes = 
&reader_bytes[start..stop]; + let local_bytes = &bytes[start..stop]; let row_iterator = SplitLines::new(local_bytes, quote_char.unwrap_or(b'"'), eol_char); if comment_prefix.is_some() { Ok(row_iterator diff --git a/crates/polars-io/src/csv/write/writer.rs b/crates/polars-io/src/csv/write/writer.rs index 32c657b6e1a6..f3017ce189ec 100644 --- a/crates/polars-io/src/csv/write/writer.rs +++ b/crates/polars-io/src/csv/write/writer.rs @@ -2,7 +2,7 @@ use std::io::Write; use std::num::NonZeroUsize; use polars_core::frame::DataFrame; -use polars_core::schema::{IndexOfSchema, Schema}; +use polars_core::schema::Schema; use polars_core::POOL; use polars_error::PolarsResult; @@ -228,7 +228,11 @@ impl BatchedWriter { if !self.has_written_header { self.has_written_header = true; - let names = self.schema.get_names_str(); + let names = self + .schema + .iter_names() + .map(|x| x.as_str()) + .collect::>(); write_header(&mut self.writer.buffer, &names, &self.writer.options)?; }; diff --git a/crates/polars-io/src/hive.rs b/crates/polars-io/src/hive.rs index ddf1d8973b3e..17ace26d6be7 100644 --- a/crates/polars-io/src/hive.rs +++ b/crates/polars-io/src/hive.rs @@ -1,42 +1,74 @@ use polars_core::frame::DataFrame; -use polars_core::schema::IndexOfSchema; use polars_core::series::Series; /// Materializes hive partitions. /// We have a special num_rows arg, as df can be empty when a projection contains /// only hive partition columns. /// +/// The `hive_partition_columns` must be ordered by their position in the `reader_schema` +/// /// # Safety /// /// num_rows equals the height of the df when the df height is non-zero. 
-pub(crate) fn materialize_hive_partitions( +pub(crate) fn materialize_hive_partitions( df: &mut DataFrame, - reader_schema: &S, + reader_schema: &polars_schema::Schema, hive_partition_columns: Option<&[Series]>, num_rows: usize, ) { if let Some(hive_columns) = hive_partition_columns { - let Some(first) = hive_columns.first() else { + // Insert these hive columns in the order they are stored in the file. + if hive_columns.is_empty() { return; - }; - - if reader_schema.index_of(first.name()).is_some() { - // Insert these hive columns in the order they are stored in the file. - for s in hive_columns { - let i = match df.get_columns().binary_search_by_key( - &reader_schema.index_of(s.name()).unwrap_or(usize::MAX), - |s| reader_schema.index_of(s.name()).unwrap_or(usize::MIN), - ) { - Ok(i) => i, - Err(i) => i, - }; - - df.insert_column(i, s.new_from_index(0, num_rows)).unwrap(); - } - } else { - for s in hive_columns { - unsafe { df.with_column_unchecked(s.new_from_index(0, num_rows)) }; + } + + let hive_columns_iter = hive_columns.iter().map(|s| s.new_from_index(0, num_rows)); + + if reader_schema.index_of(hive_columns[0].name()).is_none() || df.width() == 0 { + // Fast-path - all hive columns are at the end + unsafe { df.get_columns_mut() }.extend(hive_columns_iter); + return; + } + + let out_width: usize = df.width() + hive_columns.len(); + let df_columns = df.get_columns(); + let mut out_columns = Vec::with_capacity(out_width); + + // We have a slightly involved algorithm here because `reader_schema` may contain extra + // columns that were excluded from a projection pushdown. 
+ + let hive_columns = hive_columns_iter.collect::>(); + // Safety: These are both non-empty at the start + let mut series_arr = [df_columns, hive_columns.as_slice()]; + let mut schema_idx_arr = [ + reader_schema.index_of(series_arr[0][0].name()).unwrap(), + reader_schema.index_of(series_arr[1][0].name()).unwrap(), + ]; + + loop { + let arg_min = if schema_idx_arr[0] < schema_idx_arr[1] { + 0 + } else { + 1 + }; + + out_columns.push(series_arr[arg_min][0].clone()); + series_arr[arg_min] = &series_arr[arg_min][1..]; + + if series_arr[arg_min].is_empty() { + break; } + + let Some(i) = reader_schema.index_of(series_arr[arg_min][0].name()) else { + break; + }; + + schema_idx_arr[arg_min] = i; } + + out_columns.extend_from_slice(series_arr[0]); + out_columns.extend_from_slice(series_arr[1]); + + *unsafe { df.get_columns_mut() } = out_columns; } } diff --git a/crates/polars-io/src/ipc/ipc_file.rs b/crates/polars-io/src/ipc/ipc_file.rs index aa6546c8dd5a..feaea44f5417 100644 --- a/crates/polars-io/src/ipc/ipc_file.rs +++ b/crates/polars-io/src/ipc/ipc_file.rs @@ -51,9 +51,7 @@ use crate::RowIndex; #[derive(Clone, Debug, PartialEq, Hash)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub struct IpcScanOptions { - pub memory_map: bool, -} +pub struct IpcScanOptions; /// Read Arrows IPC format into a DataFrame /// diff --git a/crates/polars-io/src/ipc/mmap.rs b/crates/polars-io/src/ipc/mmap.rs index 854bd4c8d9d7..f0343642482e 100644 --- a/crates/polars-io/src/ipc/mmap.rs +++ b/crates/polars-io/src/ipc/mmap.rs @@ -3,9 +3,10 @@ use arrow::io::ipc::read::{Dictionaries, FileMetadata}; use arrow::mmap::{mmap_dictionaries_unchecked, mmap_unchecked}; use arrow::record_batch::RecordBatch; use polars_core::prelude::*; +use polars_utils::mmap::MMapSemaphore; use super::ipc_file::IpcReader; -use crate::mmap::{MMapSemaphore, MmapBytesReader}; +use crate::mmap::MmapBytesReader; use crate::predicates::PhysicalIoExpr; use crate::shared::{finish_reader, ArrowReader}; use 
crate::utils::{apply_projection, columns_to_projection}; @@ -15,17 +16,9 @@ impl IpcReader { &mut self, predicate: Option>, ) -> PolarsResult { - #[cfg(target_family = "unix")] - use std::os::unix::fs::MetadataExt; match self.reader.to_file() { Some(file) => { - #[cfg(target_family = "unix")] - let metadata = file.metadata()?; - let mmap = unsafe { memmap::Mmap::map(file).unwrap() }; - #[cfg(target_family = "unix")] - let semaphore = MMapSemaphore::new(metadata.dev(), metadata.ino(), mmap); - #[cfg(not(target_family = "unix"))] - let semaphore = MMapSemaphore::new(mmap); + let semaphore = MMapSemaphore::new_from_file(file)?; let metadata = read::read_file_metadata(&mut std::io::Cursor::new(semaphore.as_ref()))?; diff --git a/crates/polars-io/src/json/infer.rs b/crates/polars-io/src/json/infer.rs index 0ff83225e97f..8ba7965e63ae 100644 --- a/crates/polars-io/src/json/infer.rs +++ b/crates/polars-io/src/json/infer.rs @@ -2,7 +2,7 @@ use std::num::NonZeroUsize; use polars_core::prelude::DataType; use polars_core::utils::try_get_supertype; -use polars_error::{polars_bail, PolarsResult}; +use polars_error::{polars_bail, PolarsError, PolarsResult}; use simd_json::BorrowedValue; pub(crate) fn json_values_to_supertype( @@ -10,7 +10,7 @@ pub(crate) fn json_values_to_supertype( infer_schema_len: NonZeroUsize, ) -> PolarsResult { // struct types may have missing fields so find supertype - values + let out_opt: Option> = values .iter() .take(infer_schema_len.into()) .map(|value| polars_json::json::infer(value).map(|dt| DataType::from(&dt))) @@ -18,8 +18,16 @@ pub(crate) fn json_values_to_supertype( let l = l?; let r = r?; try_get_supertype(&l, &r) - }) - .unwrap_or_else(|| polars_bail!(ComputeError: "could not infer data-type")) + }); + match (out_opt, values.is_empty()) { + (Some(out), true) => match out { + Ok(out) => Ok(out), + _ => Err(PolarsError::NoData("no data".into())), + }, + (Some(out), false) => out, + (None, true) => Err(PolarsError::NoData("no data".into())), + 
(None, false) => polars_bail!(ComputeError: "could not infer data-type"), + } } pub(crate) fn dtypes_to_supertype>( diff --git a/crates/polars-io/src/json/mod.rs b/crates/polars-io/src/json/mod.rs index 1a8f9eb8f5a4..122f917b5e3a 100644 --- a/crates/polars-io/src/json/mod.rs +++ b/crates/polars-io/src/json/mod.rs @@ -71,7 +71,7 @@ use std::ops::Deref; use arrow::legacy::conversion::chunk_to_struct; use polars_core::error::to_compute_err; use polars_core::prelude::*; -use polars_error::{polars_bail, PolarsResult}; +use polars_error::{polars_bail, PolarsError, PolarsResult}; use polars_json::json::write::FallibleStreamingIterator; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -274,27 +274,41 @@ where let mut bytes = rb.deref().to_vec(); let json_value = simd_json::to_borrowed_value(&mut bytes).map_err(to_compute_err)?; - - // struct type - let dtype = if let Some(mut schema) = self.schema { + let dtype_result = if let Some(mut schema) = self.schema { if let Some(overwrite) = self.schema_overwrite { let mut_schema = Arc::make_mut(&mut schema); overwrite_schema(mut_schema, overwrite)?; } - - DataType::Struct(schema.iter_fields().collect()).to_arrow(CompatLevel::newest()) + Ok(DataType::Struct(schema.iter_fields().collect()) + .to_arrow(CompatLevel::newest())) } else { // infer - let inner_dtype = if let BorrowedValue::Array(values) = &json_value { - infer::json_values_to_supertype( + let inner_dtype_result = if let BorrowedValue::Array(values) = &json_value { + let supertype = infer::json_values_to_supertype( values, self.infer_schema_len .unwrap_or(NonZeroUsize::new(usize::MAX).unwrap()), - )? - .to_arrow(CompatLevel::newest()) + ); + match supertype { + Ok(supertype) => Ok(supertype.to_arrow(CompatLevel::newest())), + Err(e) => Err(e), + } } else { - polars_json::json::infer(&json_value)? 
+ polars_json::json::infer(&json_value) }; + if inner_dtype_result.is_err() { + match &json_value { + BorrowedValue::Array(array) => { + if array.is_empty() { + return Ok(DataFrame::empty()); + } + }, + _ => { + polars_bail!(ComputeError: "could not infer data-type") + }, + } + } + let inner_dtype = inner_dtype_result?; if let Some(overwrite) = self.schema_overwrite { let ArrowDataType::Struct(fields) = inner_dtype else { @@ -304,18 +318,21 @@ where let mut schema = Schema::from_iter(fields.iter().map(Into::::into)); overwrite_schema(&mut schema, overwrite)?; - DataType::Struct( + Ok(DataType::Struct( schema .into_iter() .map(|(name, dt)| Field::new(name, dt)) .collect(), ) - .to_arrow(CompatLevel::newest()) + .to_arrow(CompatLevel::newest())) } else { - inner_dtype + Ok(inner_dtype) } }; - + if let Err(PolarsError::NoData(_)) = &dtype_result { + return Ok(DataFrame::empty()); + }; + let dtype = dtype_result?; let dtype = if let BorrowedValue::Array(_) = &json_value { ArrowDataType::LargeList(Box::new(arrow::datatypes::Field::new( PlSmallStr::from_static("item"), diff --git a/crates/polars-io/src/mmap.rs b/crates/polars-io/src/mmap.rs index 66ea8ed7b48b..df91f32942f9 100644 --- a/crates/polars-io/src/mmap.rs +++ b/crates/polars-io/src/mmap.rs @@ -1,84 +1,9 @@ -#[cfg(target_family = "unix")] -use std::collections::btree_map::Entry; -#[cfg(target_family = "unix")] -use std::collections::BTreeMap; use std::fs::File; use std::io::{BufReader, Cursor, Read, Seek}; use std::sync::Arc; -#[cfg(target_family = "unix")] -use std::sync::Mutex; -use memmap::Mmap; -#[cfg(target_family = "unix")] -use once_cell::sync::Lazy; use polars_core::config::verbose; -#[cfg(target_family = "unix")] -use polars_error::polars_bail; -use polars_error::PolarsResult; -use polars_utils::mmap::MemSlice; - -// Keep track of memory mapped files so we don't write to them while reading -// Use a btree as it uses less memory than a hashmap and this thing never shrinks. 
-// Write handle in Windows is exclusive, so this is only necessary in Unix. -#[cfg(target_family = "unix")] -static MEMORY_MAPPED_FILES: Lazy>> = - Lazy::new(|| Mutex::new(Default::default())); - -pub(crate) struct MMapSemaphore { - #[cfg(target_family = "unix")] - key: (u64, u64), - mmap: Mmap, -} - -impl MMapSemaphore { - #[cfg(target_family = "unix")] - pub(super) fn new(dev: u64, ino: u64, mmap: Mmap) -> Self { - let mut guard = MEMORY_MAPPED_FILES.lock().unwrap(); - let key = (dev, ino); - guard.insert(key, 1); - Self { key, mmap } - } - - #[cfg(not(target_family = "unix"))] - pub(super) fn new(mmap: Mmap) -> Self { - Self { mmap } - } -} - -impl AsRef<[u8]> for MMapSemaphore { - #[inline] - fn as_ref(&self) -> &[u8] { - self.mmap.as_ref() - } -} - -#[cfg(target_family = "unix")] -impl Drop for MMapSemaphore { - fn drop(&mut self) { - let mut guard = MEMORY_MAPPED_FILES.lock().unwrap(); - if let Entry::Occupied(mut e) = guard.entry(self.key) { - let v = e.get_mut(); - *v -= 1; - - if *v == 0 { - e.remove_entry(); - } - } - } -} - -pub fn ensure_not_mapped(#[allow(unused)] file: &File) -> PolarsResult<()> { - #[cfg(target_family = "unix")] - { - use std::os::unix::fs::MetadataExt; - let guard = MEMORY_MAPPED_FILES.lock().unwrap(); - let metadata = file.metadata()?; - if guard.contains_key(&(metadata.dev(), metadata.ino())) { - polars_bail!(ComputeError: "cannot write to file: already memory mapped"); - } - } - Ok(()) -} +use polars_utils::mmap::{MMapSemaphore, MemSlice}; /// Trait used to get a hold to file handler or to the underlying bytes /// without performing a Read. 
@@ -104,6 +29,12 @@ impl MmapBytesReader for BufReader { } } +impl MmapBytesReader for BufReader<&File> { + fn to_file(&self) -> Option<&File> { + Some(self.get_ref()) + } +} + impl MmapBytesReader for Cursor where T: AsRef<[u8]> + Send + Sync, @@ -137,7 +68,7 @@ impl MmapBytesReader for &mut T { pub enum ReaderBytes<'a> { Borrowed(&'a [u8]), Owned(Vec), - Mapped(memmap::Mmap, &'a File), + Mapped(MMapSemaphore, &'a File), } impl std::ops::Deref for ReaderBytes<'_> { @@ -146,7 +77,7 @@ impl std::ops::Deref for ReaderBytes<'_> { match self { Self::Borrowed(ref_bytes) => ref_bytes, Self::Owned(vec) => vec, - Self::Mapped(mmap, _) => mmap, + Self::Mapped(mmap, _) => mmap.as_ref(), } } } @@ -156,7 +87,7 @@ impl std::ops::Deref for ReaderBytes<'_> { impl ReaderBytes<'static> { pub fn into_mem_slice(self) -> MemSlice { match self { - ReaderBytes::Borrowed(v) => MemSlice::from_slice(v), + ReaderBytes::Borrowed(v) => MemSlice::from_static(v), ReaderBytes::Owned(v) => MemSlice::from_vec(v), ReaderBytes::Mapped(v, _) => MemSlice::from_mmap(Arc::new(v)), } @@ -174,7 +105,7 @@ impl<'a, T: 'a + MmapBytesReader> From<&'a mut T> for ReaderBytes<'a> { None => { if let Some(f) = m.to_file() { let f = unsafe { std::mem::transmute::<&File, &'a File>(f) }; - let mmap = unsafe { memmap::Mmap::map(f).unwrap() }; + let mmap = MMapSemaphore::new_from_file(f).unwrap(); ReaderBytes::Mapped(mmap, f) } else { if verbose() { diff --git a/crates/polars-io/src/parquet/metadata.rs b/crates/polars-io/src/parquet/metadata.rs index bc032651b837..ad62aecf36d3 100644 --- a/crates/polars-io/src/parquet/metadata.rs +++ b/crates/polars-io/src/parquet/metadata.rs @@ -2,7 +2,7 @@ use std::sync::Arc; -pub use polars_parquet::parquet::metadata::FileMetaData; +pub use polars_parquet::parquet::metadata::FileMetadata; pub use polars_parquet::read::statistics::{deserialize, Statistics as ParquetStatistics}; -pub type FileMetaDataRef = Arc; +pub type FileMetadataRef = Arc; diff --git 
a/crates/polars-io/src/parquet/read/async_impl.rs b/crates/polars-io/src/parquet/read/async_impl.rs index 466b223982b4..0c1ead03b85b 100644 --- a/crates/polars-io/src/parquet/read/async_impl.rs +++ b/crates/polars-io/src/parquet/read/async_impl.rs @@ -6,8 +6,8 @@ use bytes::Bytes; use object_store::path::Path as ObjectPath; use polars_core::config::{get_rg_prefetch_size, verbose}; use polars_core::prelude::*; -use polars_parquet::read::RowGroupMetaData; -use polars_parquet::write::FileMetaData; +use polars_parquet::read::RowGroupMetadata; +use polars_parquet::write::FileMetadata; use polars_utils::pl_str::PlSmallStr; use tokio::sync::mpsc::{channel, Receiver, Sender}; use tokio::sync::Mutex; @@ -17,8 +17,7 @@ use super::predicates::read_this_row_group; use crate::cloud::{ build_object_store, object_path_from_str, CloudLocation, CloudOptions, PolarsObjectStore, }; -use crate::parquet::metadata::FileMetaDataRef; -use crate::parquet::read::metadata::PartitionedColumnChunkMD; +use crate::parquet::metadata::FileMetadataRef; use crate::pl_async::get_runtime; use crate::predicates::PhysicalIoExpr; @@ -30,14 +29,14 @@ pub struct ParquetObjectStore { store: PolarsObjectStore, path: ObjectPath, length: Option, - metadata: Option, + metadata: Option, } impl ParquetObjectStore { pub async fn from_uri( uri: &str, options: Option<&CloudOptions>, - metadata: Option, + metadata: Option, ) -> PolarsResult { let (CloudLocation { prefix, .. }, store) = build_object_store(uri, options, false).await?; let path = object_path_from_str(&prefix)?; @@ -75,13 +74,13 @@ impl ParquetObjectStore { } /// Fetch the metadata of the parquet file, do not memoize it. - async fn fetch_metadata(&mut self) -> PolarsResult { + async fn fetch_metadata(&mut self) -> PolarsResult { let length = self.length().await?; fetch_metadata(&self.store, &self.path, length).await } /// Fetch and memoize the metadata of the parquet file. 
- pub async fn get_metadata(&mut self) -> PolarsResult<&FileMetaDataRef> { + pub async fn get_metadata(&mut self) -> PolarsResult<&FileMetadataRef> { if self.metadata.is_none() { self.metadata = Some(Arc::new(self.fetch_metadata().await?)); } @@ -108,7 +107,7 @@ pub async fn fetch_metadata( store: &PolarsObjectStore, path: &ObjectPath, file_byte_length: usize, -) -> PolarsResult { +) -> PolarsResult { let footer_header_bytes = store .get_range( path, @@ -166,7 +165,7 @@ pub async fn fetch_metadata( /// We concurrently download the columns for each field. async fn download_projection( fields: Arc<[PlSmallStr]>, - row_group: RowGroupMetaData, + row_group: RowGroupMetadata, async_reader: Arc, sender: QueueSend, rg_index: usize, @@ -178,16 +177,12 @@ async fn download_projection( let mut ranges = Vec::with_capacity(fields.len()); let mut offsets = Vec::with_capacity(fields.len()); fields.iter().for_each(|name| { - let columns = row_group.columns(); - // A single column can have multiple matches (structs). 
- let iter = columns.iter().filter_map(|meta| { - if meta.descriptor().path_in_schema[0] == name { - let (offset, len) = meta.byte_range(); - Some((offset, offset as usize..(offset + len) as usize)) - } else { - None - } + let iter = row_group.columns_under_root_iter(name).map(|meta| { + let byte_range = meta.byte_range(); + let offset = byte_range.start; + let byte_range = byte_range.start as usize..byte_range.end as usize; + (offset, byte_range) }); for (offset, range) in iter { @@ -210,38 +205,35 @@ async fn download_projection( } async fn download_row_group( - rg: RowGroupMetaData, + rg: RowGroupMetadata, async_reader: Arc, sender: QueueSend, rg_index: usize, ) -> bool { - if rg.columns().is_empty() { + if rg.n_columns() == 0 { return true; } - let offset = rg.columns().iter().map(|c| c.byte_range().0).min().unwrap(); - let (max_offset, len) = rg - .columns() - .iter() - .map(|c| c.byte_range()) - .max_by_key(|k| k.0) - .unwrap(); + + let full_byte_range = rg.full_byte_range(); + let full_byte_range = full_byte_range.start as usize..full_byte_range.end as usize; let result = async_reader - .get_range(offset as usize, (max_offset - offset + len) as usize) + .get_range( + full_byte_range.start, + full_byte_range.end - full_byte_range.start, + ) .await .map(|bytes| { - let base_offset = offset; ( rg_index, - rg.columns() - .iter() - .map(|c| { - let (offset, len) = c.byte_range(); - let slice_offset = offset - base_offset; - + rg.byte_ranges_iter() + .map(|range| { ( - offset, - bytes.slice(slice_offset as usize..(slice_offset + len) as usize), + range.start, + bytes.slice( + range.start as usize - full_byte_range.start + ..range.end as usize - full_byte_range.start, + ), ) }) .collect::(), @@ -263,7 +255,7 @@ impl FetchRowGroupsFromObjectStore { projection: Option<&[usize]>, predicate: Option>, row_group_range: Range, - row_groups: &[RowGroupMetaData], + row_groups: &[RowGroupMetadata], ) -> PolarsResult { let projected_fields: Option> = 
projection.map(|projection| { projection @@ -279,18 +271,8 @@ impl FetchRowGroupsFromObjectStore { .filter_map(|i| { let rg = &row_groups[i]; - // TODO! - // Optimize this. Now we partition the predicate columns twice. (later on reading as well) - // I think we must add metadata context where we can cache and amortize the partitioning. - let mut part_md = PartitionedColumnChunkMD::new(rg); - let live = pred.live_variables(); - part_md.set_partitions( - live.as_ref() - .map(|vars| vars.iter().map(|s| s.as_ref()).collect::>()) - .as_ref(), - ); let should_be_read = - matches!(read_this_row_group(Some(pred), &part_md, &schema), Ok(true)); + matches!(read_this_row_group(Some(pred), rg, &schema), Ok(true)); // Already add the row groups that will be skipped to the prefetched data. if !should_be_read { diff --git a/crates/polars-io/src/parquet/read/metadata.rs b/crates/polars-io/src/parquet/read/metadata.rs deleted file mode 100644 index 8f1a2c1642c8..000000000000 --- a/crates/polars-io/src/parquet/read/metadata.rs +++ /dev/null @@ -1,57 +0,0 @@ -use hashbrown::hash_map::RawEntryMut; -use polars_parquet::read::{ColumnChunkMetaData, RowGroupMetaData}; -use polars_utils::aliases::{PlHashMap, PlHashSet}; -use polars_utils::idx_vec::UnitVec; -use polars_utils::unitvec; - -/// This is a utility struct that Partitions the `ColumnChunkMetaData` by `field.name == descriptor.path_in_schema[0]` -/// This is required to fix quadratic behavior in wide parquet files. See #18319. 
-pub struct PartitionedColumnChunkMD<'a> { - partitions: Option>>, - metadata: &'a RowGroupMetaData, -} - -impl<'a> PartitionedColumnChunkMD<'a> { - pub fn new(metadata: &'a RowGroupMetaData) -> Self { - Self { - partitions: Default::default(), - metadata, - } - } - - pub(super) fn num_rows(&self) -> usize { - self.metadata.num_rows() - } - - pub fn set_partitions(&mut self, field_names: Option<&PlHashSet<&str>>) { - let mut partitions = PlHashMap::default(); - for (i, ccmd) in self.metadata.columns().iter().enumerate() { - let name = &ccmd.descriptor().path_in_schema[0]; - if field_names - .map(|field_names| field_names.contains(name.as_str())) - .unwrap_or(true) - { - let entry = partitions.raw_entry_mut().from_key(name.as_str()); - - match entry { - RawEntryMut::Vacant(slot) => { - slot.insert(name.to_string(), unitvec![i]); - }, - RawEntryMut::Occupied(mut slot) => { - slot.get_mut().push(i); - }, - }; - } - } - self.partitions = Some(partitions) - } - - pub fn get_partitions(&self, name: &str) -> Option> { - let columns = self.metadata.columns(); - self.partitions - .as_ref() - .expect("fields should be partitioned first") - .get(name) - .map(|idx| idx.iter().map(|i| &columns[*i]).collect::>()) - } -} diff --git a/crates/polars-io/src/parquet/read/mmap.rs b/crates/polars-io/src/parquet/read/mmap.rs index 84725fd7a2e1..04edfc8400f4 100644 --- a/crates/polars-io/src/parquet/read/mmap.rs +++ b/crates/polars-io/src/parquet/read/mmap.rs @@ -6,7 +6,7 @@ use bytes::Bytes; use polars_core::datatypes::PlHashMap; use polars_error::PolarsResult; use polars_parquet::read::{ - column_iter_to_arrays, BasicDecompressor, ColumnChunkMetaData, Filter, PageReader, + column_iter_to_arrays, BasicDecompressor, ColumnChunkMetadata, Filter, PageReader, }; use polars_utils::mmap::{MemReader, MemSlice}; @@ -31,8 +31,8 @@ pub enum ColumnStore { /// For cloud files the relevant memory regions should have been prefetched. 
pub(super) fn mmap_columns<'a>( store: &'a ColumnStore, - field_columns: &'a [&ColumnChunkMetaData], -) -> Vec<(&'a ColumnChunkMetaData, MemSlice)> { + field_columns: &'a [&ColumnChunkMetadata], +) -> Vec<(&'a ColumnChunkMetadata, MemSlice)> { field_columns .iter() .map(|meta| _mmap_single_column(store, meta)) @@ -41,16 +41,19 @@ pub(super) fn mmap_columns<'a>( fn _mmap_single_column<'a>( store: &'a ColumnStore, - meta: &'a ColumnChunkMetaData, -) -> (&'a ColumnChunkMetaData, MemSlice) { - let (start, len) = meta.byte_range(); + meta: &'a ColumnChunkMetadata, +) -> (&'a ColumnChunkMetadata, MemSlice) { + let byte_range = meta.byte_range(); let chunk = match store { - ColumnStore::Local(mem_slice) => mem_slice.slice((start as usize)..(start + len) as usize), + ColumnStore::Local(mem_slice) => { + mem_slice.slice(byte_range.start as usize..byte_range.end as usize) + }, #[cfg(all(feature = "async", feature = "parquet"))] ColumnStore::Fetched(fetched) => { - let entry = fetched.get(&start).unwrap_or_else(|| { + let entry = fetched.get(&byte_range.start).unwrap_or_else(|| { panic!( - "mmap_columns: column with start {start} must be prefetched in ColumnStore.\n" + "mmap_columns: column with start {} must be prefetched in ColumnStore.\n", + byte_range.start ) }); MemSlice::from_bytes(entry.clone()) @@ -62,7 +65,7 @@ fn _mmap_single_column<'a>( // similar to arrow2 serializer, except this accepts a slice instead of a vec. 
// this allows us to memory map pub fn to_deserializer( - columns: Vec<(&ColumnChunkMetaData, MemSlice)>, + columns: Vec<(&ColumnChunkMetadata, MemSlice)>, field: Field, filter: Option, ) -> PolarsResult> { diff --git a/crates/polars-io/src/parquet/read/mod.rs b/crates/polars-io/src/parquet/read/mod.rs index b6b337c3ff6e..14c24bce12ac 100644 --- a/crates/polars-io/src/parquet/read/mod.rs +++ b/crates/polars-io/src/parquet/read/mod.rs @@ -16,7 +16,6 @@ #[cfg(feature = "cloud")] mod async_impl; -mod metadata; mod mmap; mod options; mod predicates; @@ -40,7 +39,6 @@ pub use reader::{BatchedParquetReader, ParquetReader}; pub use utils::materialize_empty_df; pub mod _internal { - pub use super::metadata::PartitionedColumnChunkMD; pub use super::mmap::to_deserializer; pub use super::predicates::read_this_row_group; } diff --git a/crates/polars-io/src/parquet/read/predicates.rs b/crates/polars-io/src/parquet/read/predicates.rs index b08c5b6125ae..87615de1b8c2 100644 --- a/crates/polars-io/src/parquet/read/predicates.rs +++ b/crates/polars-io/src/parquet/read/predicates.rs @@ -1,7 +1,7 @@ use polars_core::prelude::*; use polars_parquet::read::statistics::{deserialize, Statistics}; +use polars_parquet::read::RowGroupMetadata; -use crate::parquet::read::metadata::PartitionedColumnChunkMD; use crate::predicates::{BatchStats, ColumnStats, PhysicalIoExpr}; impl ColumnStats { @@ -17,18 +17,20 @@ impl ColumnStats { /// Collect the statistics in a row-group pub(crate) fn collect_statistics( - part_md: &PartitionedColumnChunkMD, + md: &RowGroupMetadata, schema: &ArrowSchema, ) -> PolarsResult> { // TODO! fix this performance. This is a full sequential scan. 
let stats = schema .iter_values() - .map(|field| match part_md.get_partitions(&field.name) { - Some(md) => { - let st = deserialize(field, &md)?; - Ok(ColumnStats::from_arrow_stats(st, field)) - }, - None => Ok(ColumnStats::new(field.into(), None, None, None)), + .map(|field| { + let iter = md.columns_under_root_iter(&field.name); + + Ok(if iter.len() == 0 { + ColumnStats::new(field.into(), None, None, None) + } else { + ColumnStats::from_arrow_stats(deserialize(field, iter)?, field) + }) }) .collect::>>()?; @@ -39,18 +41,18 @@ pub(crate) fn collect_statistics( Ok(Some(BatchStats::new( Arc::new(Schema::from_arrow_schema(schema)), stats, - Some(part_md.num_rows()), + Some(md.num_rows()), ))) } pub fn read_this_row_group( predicate: Option<&dyn PhysicalIoExpr>, - part_md: &PartitionedColumnChunkMD, + md: &RowGroupMetadata, schema: &ArrowSchema, ) -> PolarsResult { if let Some(pred) = predicate { if let Some(pred) = pred.as_stats_evaluator() { - if let Some(stats) = collect_statistics(part_md, schema)? { + if let Some(stats) = collect_statistics(md, schema)? 
{ let should_read = pred.should_read(&stats); // a parquet file may not have statistics of all columns if matches!(should_read, Ok(false)) { diff --git a/crates/polars-io/src/parquet/read/read_impl.rs b/crates/polars-io/src/parquet/read/read_impl.rs index b31c0d1ff2b1..c621b698cebc 100644 --- a/crates/polars-io/src/parquet/read/read_impl.rs +++ b/crates/polars-io/src/parquet/read/read_impl.rs @@ -12,7 +12,7 @@ use polars_core::POOL; use polars_parquet::parquet::error::ParquetResult; use polars_parquet::parquet::statistics::Statistics; use polars_parquet::read::{ - self, ColumnChunkMetaData, FileMetaData, Filter, PhysicalType, RowGroupMetaData, + self, ColumnChunkMetadata, FileMetadata, Filter, PhysicalType, RowGroupMetadata, }; use polars_utils::mmap::MemSlice; use rayon::prelude::*; @@ -26,8 +26,7 @@ use super::utils::materialize_empty_df; use super::{mmap, ParallelStrategy}; use crate::hive::materialize_hive_partitions; use crate::mmap::{MmapBytesReader, ReaderBytes}; -use crate::parquet::metadata::FileMetaDataRef; -use crate::parquet::read::metadata::PartitionedColumnChunkMD; +use crate::parquet::metadata::FileMetadataRef; use crate::parquet::read::ROW_COUNT_OVERFLOW_ERR; use crate::predicates::{apply_predicate, PhysicalIoExpr}; use crate::utils::get_reader_bytes; @@ -64,7 +63,7 @@ fn assert_dtypes(dtype: &ArrowDataType) { fn column_idx_to_series( column_i: usize, // The metadata belonging to this column - field_md: &[&ColumnChunkMetaData], + field_md: &[&ColumnChunkMetadata], filter: Option, file_schema: &ArrowSchema, store: &mmap::ColumnStore, @@ -143,7 +142,7 @@ fn rg_to_dfs( row_group_start: usize, row_group_end: usize, slice: (usize, usize), - file_metadata: &FileMetaData, + file_metadata: &FileMetadata, schema: &ArrowSchemaRef, predicate: Option<&dyn PhysicalIoExpr>, row_index: Option, @@ -222,31 +221,13 @@ fn rg_to_dfs( } } -/// Collect a HashSet of the projected columns. -/// Returns `None` if all columns are projected. 
-fn projected_columns_set<'a>( - schema: &'a ArrowSchema, - projection: &[usize], -) -> Option> { - if projection.len() == schema.len() { - None - } else { - Some( - projection - .iter() - .map(|i| schema.get_at_index(*i).unwrap().0.as_str()) - .collect::>(), - ) - } -} - #[allow(clippy::too_many_arguments)] fn rg_to_dfs_prefiltered( store: &mmap::ColumnStore, previous_row_count: &mut IdxSize, row_group_start: usize, row_group_end: usize, - file_metadata: &FileMetaData, + file_metadata: &FileMetadata, schema: &ArrowSchemaRef, live_variables: Vec, predicate: &dyn PhysicalIoExpr, @@ -259,20 +240,6 @@ fn rg_to_dfs_prefiltered( polars_bail!(ComputeError: "Parquet file contains too many row groups (> {})", u32::MAX); } - let projected_columns = projected_columns_set(schema, projection); - - let part_mds = POOL.install(|| { - file_metadata - .row_groups - .par_iter() - .map(|rg| { - let mut part_md = PartitionedColumnChunkMD::new(rg); - part_md.set_partitions(projected_columns.as_ref()); - part_md - }) - .collect::>() - }); - let mut row_offset = *previous_row_count; let rg_offsets: Vec = match row_index { None => Vec::new(), @@ -335,10 +302,10 @@ fn rg_to_dfs_prefiltered( (row_group_start..row_group_end) .into_par_iter() .map(|rg_idx| { - let part_md = &part_mds[rg_idx]; + let md = &file_metadata.row_groups[rg_idx]; if use_statistics { - match read_this_row_group(Some(predicate), part_md, schema) { + match read_this_row_group(Some(predicate), md, schema) { Ok(false) => return Ok(None), Ok(true) => {}, Err(e) => return Err(e), @@ -352,7 +319,9 @@ fn rg_to_dfs_prefiltered( let col_idx = live_idx_to_col_idx[i]; let name = schema.get_at_index(col_idx).unwrap().0; - let field_md = part_mds[rg_idx].get_partitions(name).unwrap(); + let field_md = file_metadata.row_groups[rg_idx] + .columns_under_root_iter(name) + .collect::>(); column_idx_to_series(col_idx, field_md.as_slice(), None, schema, store) }) @@ -438,7 +407,9 @@ fn rg_to_dfs_prefiltered( let md = 
&file_metadata.row_groups[rg_idx]; debug_assert_eq!(md.num_rows(), mask.len()); } - let field_md = part_mds[rg_idx].get_partitions(name).unwrap(); + let field_md = file_metadata.row_groups[rg_idx] + .columns_under_root_iter(name) + .collect::>(); let pre = || { column_idx_to_series( @@ -504,7 +475,7 @@ fn rg_to_dfs_prefiltered( // We first add the columns with the live columns at the start. Then, we do a // projections that puts the columns at the right spot. df._add_columns(rg_columns, &rearranged_schema)?; - let df = df.select(schema.get_names_owned())?; + let df = df.select(schema.iter_names_cloned())?; PolarsResult::Ok(Some(df)) }) @@ -530,7 +501,7 @@ fn rg_to_dfs_optionally_par_over_columns( row_group_start: usize, row_group_end: usize, slice: (usize, usize), - file_metadata: &FileMetaData, + file_metadata: &FileMetadata, schema: &ArrowSchemaRef, predicate: Option<&dyn PhysicalIoExpr>, row_index: Option, @@ -549,16 +520,13 @@ fn rg_to_dfs_optionally_par_over_columns( for rg_idx in row_group_start..row_group_end { let md = &file_metadata.row_groups[rg_idx]; - // Set partitioned fields to prevent quadratic behavior. - let projected_columns = projected_columns_set(schema, projection); - let mut part_md = PartitionedColumnChunkMD::new(md); - part_md.set_partitions(projected_columns.as_ref()); - let rg_slice = split_slice_at_file(&mut n_rows_processed, md.num_rows(), slice.0, slice_end); let current_row_count = md.num_rows() as IdxSize; - if use_statistics && !read_this_row_group(predicate, &part_md, schema)? { + if use_statistics + && !read_this_row_group(predicate, &file_metadata.row_groups[rg_idx], schema)? 
+ { *previous_row_count += rg_slice.1 as IdxSize; continue; } @@ -574,7 +542,7 @@ fn rg_to_dfs_optionally_par_over_columns( .par_iter() .map(|column_i| { let name = schema.get_at_index(*column_i).unwrap().0; - let part = part_md.get_partitions(name).unwrap(); + let part = md.columns_under_root_iter(name).collect::>(); column_idx_to_series( *column_i, @@ -591,7 +559,7 @@ fn rg_to_dfs_optionally_par_over_columns( .iter() .map(|column_i| { let name = schema.get_at_index(*column_i).unwrap().0; - let part = part_md.get_partitions(name).unwrap(); + let part = md.columns_under_root_iter(name).collect::>(); column_idx_to_series( *column_i, @@ -637,7 +605,7 @@ fn rg_to_dfs_par_over_rg( row_group_end: usize, previous_row_count: &mut IdxSize, slice: (usize, usize), - file_metadata: &FileMetaData, + file_metadata: &FileMetadata, schema: &ArrowSchemaRef, predicate: Option<&dyn PhysicalIoExpr>, row_index: Option, @@ -672,27 +640,13 @@ fn rg_to_dfs_par_over_rg( let dfs = POOL.install(|| { // Set partitioned fields to prevent quadratic behavior. // Ensure all row groups are partitioned. - let part_mds = { - let projected_columns = projected_columns_set(schema, projection); - row_groups - .par_iter() - .map(|(_, rg, _, _)| { - let mut ccmd = PartitionedColumnChunkMD::new(rg); - ccmd.set_partitions(projected_columns.as_ref()); - ccmd - }) - .collect::>() - }; - row_groups .into_par_iter() .enumerate() .map(|(iter_idx, (_rg_idx, _md, slice, row_count_start))| { - let part_md = &part_mds[iter_idx]; + let md = &file_metadata.row_groups[iter_idx]; - if slice.1 == 0 - || use_statistics && !read_this_row_group(predicate, part_md, schema)? - { + if slice.1 == 0 || use_statistics && !read_this_row_group(predicate, md, schema)? 
{ return Ok(None); } // test we don't read the parquet file if this env var is set @@ -705,7 +659,7 @@ fn rg_to_dfs_par_over_rg( .iter() .map(|column_i| { let name = schema.get_at_index(*column_i).unwrap().0; - let field_md = part_md.get_partitions(name).unwrap(); + let field_md = md.columns_under_root_iter(name).collect::>(); column_idx_to_series( *column_i, @@ -747,7 +701,7 @@ pub fn read_parquet( slice: (usize, usize), projection: Option<&[usize]>, reader_schema: &ArrowSchemaRef, - metadata: Option, + metadata: Option, predicate: Option<&dyn PhysicalIoExpr>, mut parallel: ParallelStrategy, row_index: Option, @@ -901,7 +855,7 @@ pub(super) fn compute_row_group_range( row_group_start: usize, row_group_end: usize, slice: (usize, usize), - row_groups: &[RowGroupMetaData], + row_groups: &[RowGroupMetadata], ) -> std::ops::Range { let mut start = row_group_start; let mut cum_rows: usize = (0..row_group_start).map(|i| row_groups[i].num_rows()).sum(); @@ -947,7 +901,7 @@ pub struct BatchedParquetReader { slice: (usize, usize), projection: Arc<[usize]>, schema: ArrowSchemaRef, - metadata: FileMetaDataRef, + metadata: FileMetadataRef, predicate: Option>, row_index: Option, rows_read: IdxSize, @@ -967,7 +921,7 @@ impl BatchedParquetReader { #[allow(clippy::too_many_arguments)] pub fn new( row_group_fetcher: RowGroupFetcher, - metadata: FileMetaDataRef, + metadata: FileMetadataRef, schema: ArrowSchemaRef, slice: (usize, usize), projection: Option>, diff --git a/crates/polars-io/src/parquet/read/reader.rs b/crates/polars-io/src/parquet/read/reader.rs index 25e8852a92ce..0f6f3b70b4f3 100644 --- a/crates/polars-io/src/parquet/read/reader.rs +++ b/crates/polars-io/src/parquet/read/reader.rs @@ -18,7 +18,7 @@ use super::utils::materialize_empty_df; #[cfg(feature = "cloud")] use crate::cloud::CloudOptions; use crate::mmap::MmapBytesReader; -use crate::parquet::metadata::FileMetaDataRef; +use crate::parquet::metadata::FileMetadataRef; use crate::predicates::PhysicalIoExpr; use 
crate::prelude::*; use crate::RowIndex; @@ -35,7 +35,7 @@ pub struct ParquetReader { schema: Option, row_index: Option, low_memory: bool, - metadata: Option, + metadata: Option, predicate: Option>, hive_partition_columns: Option>, include_file_path: Option<(PlSmallStr, Arc)>, @@ -138,7 +138,7 @@ impl ParquetReader { self } - pub fn get_metadata(&mut self) -> PolarsResult<&FileMetaDataRef> { + pub fn get_metadata(&mut self) -> PolarsResult<&FileMetadataRef> { if self.metadata.is_none() { self.metadata = Some(Arc::new(read::read_metadata(&mut self.reader)?)); } @@ -267,7 +267,7 @@ impl ParquetAsyncReader { pub async fn from_uri( uri: &str, cloud_options: Option<&CloudOptions>, - metadata: Option, + metadata: Option, ) -> PolarsResult { Ok(ParquetAsyncReader { reader: ParquetObjectStore::from_uri(uri, cloud_options, metadata).await?, @@ -406,7 +406,7 @@ impl ParquetAsyncReader { ) } - pub async fn get_metadata(&mut self) -> PolarsResult<&FileMetaDataRef> { + pub async fn get_metadata(&mut self) -> PolarsResult<&FileMetadataRef> { self.reader.get_metadata().await } diff --git a/crates/polars-io/src/path_utils/mod.rs b/crates/polars-io/src/path_utils/mod.rs index 5c4e48f7e6e4..1795cda6ebd0 100644 --- a/crates/polars-io/src/path_utils/mod.rs +++ b/crates/polars-io/src/path_utils/mod.rs @@ -5,6 +5,7 @@ use std::sync::Arc; use once_cell::sync::Lazy; use polars_core::config; use polars_core::error::{polars_bail, to_compute_err, PolarsError, PolarsResult}; +use polars_utils::pl_str::PlSmallStr; use regex::Regex; #[cfg(feature = "cloud")] @@ -88,7 +89,7 @@ pub fn expand_paths( paths: &[PathBuf], glob: bool, #[allow(unused_variables)] cloud_options: Option<&CloudOptions>, -) -> PolarsResult>> { +) -> PolarsResult> { expand_paths_hive(paths, glob, cloud_options, false).map(|x| x.0) } @@ -129,13 +130,69 @@ pub fn expand_paths_hive( glob: bool, #[allow(unused_variables)] cloud_options: Option<&CloudOptions>, check_directory_level: bool, -) -> PolarsResult<(Arc>, usize)> { +) -> 
PolarsResult<(Arc<[PathBuf]>, usize)> { let Some(first_path) = paths.first() else { return Ok((vec![].into(), 0)); }; let is_cloud = is_cloud_url(first_path); - let mut out_paths = vec![]; + + /// Wrapper around `Vec` that also tracks file extensions, so that + /// we don't have to traverse the entire list again to validate extensions. + struct OutPaths { + paths: Vec, + exts: [Option<(PlSmallStr, usize)>; 2], + current_idx: usize, + } + + impl OutPaths { + fn update_ext_status( + current_idx: &mut usize, + exts: &mut [Option<(PlSmallStr, usize)>; 2], + value: &Path, + ) { + let ext = value + .extension() + .map(|x| PlSmallStr::from(x.to_str().unwrap())) + .unwrap_or(PlSmallStr::EMPTY); + + if exts[0].is_none() { + exts[0] = Some((ext, *current_idx)); + } else if exts[1].is_none() && ext != exts[0].as_ref().unwrap().0 { + exts[1] = Some((ext, *current_idx)); + } + + *current_idx += 1; + } + + fn push(&mut self, value: PathBuf) { + { + let current_idx = &mut self.current_idx; + let exts = &mut self.exts; + Self::update_ext_status(current_idx, exts, &value); + } + self.paths.push(value) + } + + fn extend(&mut self, values: impl IntoIterator) { + let current_idx = &mut self.current_idx; + let exts = &mut self.exts; + + self.paths.extend(values.into_iter().inspect(|x| { + Self::update_ext_status(current_idx, exts, x); + })) + } + + fn extend_from_slice(&mut self, values: &[PathBuf]) { + self.extend(values.iter().cloned()) + } + } + + let mut out_paths = OutPaths { + paths: vec![], + exts: [None, None], + current_idx: 0, + }; let mut hive_idx_tracker = HiveIdxTracker { idx: usize::MAX, @@ -337,31 +394,20 @@ pub fn expand_paths_hive( } } - let out_paths = if expanded_from_single_directory(paths, out_paths.as_ref()) { - // Require all file extensions to be the same when expanding a single directory. 
- let ext = out_paths[0].extension(); - - (0..out_paths.len()) - .map(|i| { - let path = out_paths[i].clone(); - - if path.extension() != ext { - polars_bail!( - InvalidOperation: r#"directory contained paths with different file extensions: \ - first path: {}, second path: {}. Please use a glob pattern to explicitly specify \ - which files to read (e.g. "dir/**/*", "dir/**/*.parquet")"#, - out_paths[i - 1].to_str().unwrap(), path.to_str().unwrap() - ); - }; + assert_eq!(out_paths.current_idx, out_paths.paths.len()); - Ok(path) - }) - .collect::>>()? - } else { - out_paths - }; + if expanded_from_single_directory(paths, out_paths.paths.as_slice()) { + if let [Some((_, i1)), Some((_, i2))] = out_paths.exts { + polars_bail!( + InvalidOperation: r#"directory contained paths with different file extensions: \ + first path: {}, second path: {}. Please use a glob pattern to explicitly specify \ + which files to read (e.g. "dir/**/*", "dir/**/*.parquet")"#, + &out_paths.paths[i1].to_string_lossy(), &out_paths.paths[i2].to_string_lossy() + ) + } + } - Ok((Arc::new(out_paths), hive_idx_tracker.idx)) + Ok((out_paths.paths.into(), hive_idx_tracker.idx)) } /// Ignores errors from `std::fs::create_dir_all` if the directory exists. 
diff --git a/crates/polars-io/src/utils/byte_source.rs b/crates/polars-io/src/utils/byte_source.rs index fce7e795ce46..e2dd3e876c2a 100644 --- a/crates/polars-io/src/utils/byte_source.rs +++ b/crates/polars-io/src/utils/byte_source.rs @@ -1,7 +1,7 @@ use std::ops::Range; use std::sync::Arc; -use polars_error::{to_compute_err, PolarsResult}; +use polars_error::PolarsResult; use polars_utils::_limit_path_len_io_err; use polars_utils::mmap::MemSlice; @@ -34,9 +34,8 @@ impl MemSliceByteSource { .into_std() .await, ); - let mmap = Arc::new(unsafe { memmap::Mmap::map(file.as_ref()) }.map_err(to_compute_err)?); - Ok(Self(MemSlice::from_mmap(mmap))) + Ok(Self(MemSlice::from_file(file.as_ref())?)) } } @@ -151,6 +150,12 @@ impl From for DynByteSource { } } +impl From for DynByteSource { + fn from(value: MemSlice) -> Self { + Self::MemSlice(MemSliceByteSource(value)) + } +} + #[derive(Clone, Debug)] pub enum DynByteSourceBuilder { Mmap, diff --git a/crates/polars-io/src/utils/other.rs b/crates/polars-io/src/utils/other.rs index 3c1ab1e248d8..1984e6ad480e 100644 --- a/crates/polars-io/src/utils/other.rs +++ b/crates/polars-io/src/utils/other.rs @@ -7,6 +7,7 @@ use polars_core::prelude::*; #[cfg(any(feature = "ipc_streaming", feature = "parquet"))] use polars_core::utils::{accumulate_dataframes_vertical_unchecked, split_df_as_ref}; use polars_error::to_compute_err; +use polars_utils::mmap::MMapSemaphore; use regex::{Regex, RegexBuilder}; use crate::mmap::{MmapBytesReader, ReaderBytes}; @@ -21,12 +22,15 @@ pub fn get_reader_bytes<'a, R: Read + MmapBytesReader + ?Sized>( .ok() .and_then(|offset| Some((reader.to_file()?, offset))) { - let mmap = unsafe { memmap::MmapOptions::new().offset(offset).map(file)? 
}; + let mut options = memmap::MmapOptions::new(); + options.offset(offset); // somehow bck thinks borrows alias // this is sound as file was already bound to 'a use std::fs::File; + let file = unsafe { std::mem::transmute::<&File, &'a File>(file) }; + let mmap = MMapSemaphore::new_from_file_with_options(file, options)?; Ok(ReaderBytes::Mapped(mmap, file)) } else { // we can get the bytes for free @@ -107,26 +111,10 @@ pub(crate) fn columns_to_projection( schema: &ArrowSchema, ) -> PolarsResult> { let mut prj = Vec::with_capacity(columns.len()); - if columns.len() > 100 { - let mut column_names = PlHashMap::with_capacity(schema.len()); - schema.iter_values().enumerate().for_each(|(i, c)| { - column_names.insert(c.name.as_str(), i); - }); - - for column in columns.iter() { - let Some(&i) = column_names.get(column.as_str()) else { - polars_bail!( - ColumnNotFound: - "unable to find column {:?}; valid columns: {:?}", column, schema.get_names(), - ); - }; - prj.push(i); - } - } else { - for column in columns.iter() { - let i = schema.try_index_of(column)?; - prj.push(i); - } + + for column in columns { + let i = schema.try_index_of(column)?; + prj.push(i); } Ok(prj) diff --git a/crates/polars-lazy/Cargo.toml b/crates/polars-lazy/Cargo.toml index 03fcc0d8b2c8..a3efd659a77f 100644 --- a/crates/polars-lazy/Cargo.toml +++ b/crates/polars-lazy/Cargo.toml @@ -169,6 +169,7 @@ is_between = ["polars-plan/is_between", "polars-expr/is_between"] is_unique = ["polars-plan/is_unique"] cross_join = ["polars-plan/cross_join", "polars-pipe?/cross_join", "polars-ops/cross_join"] asof_join = ["polars-plan/asof_join", "polars-time", "polars-ops/asof_join", "polars-mem-engine/asof_join"] +iejoin = ["polars-plan/iejoin"] business = ["polars-plan/business"] concat_str = ["polars-plan/concat_str"] range = ["polars-plan/range"] diff --git a/crates/polars-lazy/src/frame/mod.rs b/crates/polars-lazy/src/frame/mod.rs index d0125dd0bfbc..d6fd3d4364cb 100644 --- a/crates/polars-lazy/src/frame/mod.rs 
+++ b/crates/polars-lazy/src/frame/mod.rs @@ -184,11 +184,13 @@ impl LazyFrame { } /// Run nodes that are capably of doing so on the streaming engine. + #[cfg(feature = "streaming")] pub fn with_streaming(mut self, toggle: bool) -> Self { self.opt_state.set(OptFlags::STREAMING, toggle); self } + #[cfg(feature = "new_streaming")] pub fn with_new_streaming(mut self, toggle: bool) -> Self { self.opt_state.set(OptFlags::NEW_STREAMING, toggle); self @@ -731,7 +733,9 @@ impl LazyFrame { // Fallback to normal engine if error is due to not being implemented // and auto_new_streaming is set, otherwise propagate error. if auto_new_streaming - && e.downcast_ref::<&str>() == Some(&"not yet implemented") + && e.downcast_ref::<&str>() + .map(|s| s.starts_with("not yet implemented")) + .unwrap_or(false) { if polars_core::config::verbose() { eprintln!("caught unimplemented error in new streaming engine, falling back to normal engine"); @@ -2072,7 +2076,7 @@ impl JoinBuilder { let mut opt_state = self.lf.opt_state; let other = self.other.expect("with not set"); - // If any of the nodes reads from files we must activate this this plan as well. + // If any of the nodes reads from files we must activate this plan as well. if other.opt_state.contains(OptFlags::FILE_CACHING) { opt_state |= OptFlags::FILE_CACHING; } @@ -2104,4 +2108,41 @@ impl JoinBuilder { .build(); LazyFrame::from_logical_plan(lp, opt_state) } + + // Finish with join predicates + pub fn join_where(self, predicates: Vec) -> LazyFrame { + let mut opt_state = self.lf.opt_state; + let other = self.other.expect("with not set"); + + // If any of the nodes reads from files we must activate this plan as well. 
+ if other.opt_state.contains(OptFlags::FILE_CACHING) { + opt_state |= OptFlags::FILE_CACHING; + } + + let args = JoinArgs { + how: self.how, + validation: self.validation, + suffix: self.suffix, + slice: None, + join_nulls: self.join_nulls, + coalesce: self.coalesce, + }; + let options = JoinOptions { + allow_parallel: self.allow_parallel, + force_parallel: self.force_parallel, + args, + ..Default::default() + }; + + let lp = DslPlan::Join { + input_left: Arc::new(self.lf.logical_plan), + input_right: Arc::new(other.logical_plan), + left_on: Default::default(), + right_on: Default::default(), + predicates, + options: Arc::from(options), + }; + + LazyFrame::from_logical_plan(lp, opt_state) + } } diff --git a/crates/polars-lazy/src/lib.rs b/crates/polars-lazy/src/lib.rs index 024f2a26bffb..005a09186ba2 100644 --- a/crates/polars-lazy/src/lib.rs +++ b/crates/polars-lazy/src/lib.rs @@ -206,6 +206,7 @@ pub mod dsl; pub mod frame; pub mod physical_plan; pub mod prelude; + mod scan; #[cfg(test)] mod tests; diff --git a/crates/polars-lazy/src/scan/csv.rs b/crates/polars-lazy/src/scan/csv.rs index 54e9c77e2480..998f422820c6 100644 --- a/crates/polars-lazy/src/scan/csv.rs +++ b/crates/polars-lazy/src/scan/csv.rs @@ -5,6 +5,7 @@ use polars_io::cloud::CloudOptions; use polars_io::csv::read::{ infer_file_schema, CommentPrefix, CsvEncoding, CsvParseOptions, CsvReadOptions, NullValues, }; +use polars_io::mmap::ReaderBytes; use polars_io::path_utils::expand_paths; use polars_io::utils::get_reader_bytes; use polars_io::RowIndex; @@ -14,7 +15,7 @@ use crate::prelude::*; #[derive(Clone)] #[cfg(feature = "csv")] pub struct LazyCsvReader { - paths: Arc>, + sources: ScanSources, glob: bool, cache: bool, read_options: CsvReadOptions, @@ -30,13 +31,13 @@ impl LazyCsvReader { self } - pub fn new_paths(paths: Arc>) -> Self { - Self::new("").with_paths(paths) + pub fn new_paths(paths: Arc<[PathBuf]>) -> Self { + Self::new_with_sources(ScanSources::Paths(paths)) } - pub fn new(path: impl 
AsRef) -> Self { + pub fn new_with_sources(sources: ScanSources) -> Self { LazyCsvReader { - paths: Arc::new(vec![path.as_ref().to_path_buf()]), + sources, glob: true, cache: true, read_options: Default::default(), @@ -45,6 +46,10 @@ impl LazyCsvReader { } } + pub fn new(path: impl AsRef) -> Self { + Self::new_with_sources(ScanSources::Paths([path.as_ref().to_path_buf()].into())) + } + /// Skip this number of rows after the header location. #[must_use] pub fn with_skip_rows_after_header(mut self, offset: usize) -> Self { @@ -219,38 +224,71 @@ impl LazyCsvReader { where F: Fn(Schema) -> PolarsResult, { - // TODO: Path expansion should happen when converting to the IR - // https://github.com/pola-rs/polars/issues/17634 - let paths = expand_paths(self.paths(), self.glob(), self.cloud_options())?; + let mut n_threads = self.read_options.n_threads; + + let mut infer_schema = |reader_bytes: ReaderBytes| { + let skip_rows = self.read_options.skip_rows; + let parse_options = self.read_options.get_parse_options(); + + PolarsResult::Ok( + infer_file_schema( + &reader_bytes, + parse_options.separator, + self.read_options.infer_schema_length, + self.read_options.has_header, + // we set it to None and modify them after the schema is updated + None, + skip_rows, + self.read_options.skip_rows_after_header, + parse_options.comment_prefix.as_ref(), + parse_options.quote_char, + parse_options.eol_char, + None, + parse_options.try_parse_dates, + self.read_options.raise_if_empty, + &mut n_threads, + parse_options.decimal_comma, + )? 
+ .0, + ) + }; - let Some(path) = paths.first() else { - polars_bail!(ComputeError: "no paths specified for this reader"); + let schema = match self.sources.clone() { + ScanSources::Paths(paths) => { + // TODO: Path expansion should happen when converting to the IR + // https://github.com/pola-rs/polars/issues/17634 + let paths = expand_paths(&paths[..], self.glob(), self.cloud_options())?; + + let Some(path) = paths.first() else { + polars_bail!(ComputeError: "no paths specified for this reader"); + }; + + let mut file = polars_utils::open_file(path)?; + infer_schema(get_reader_bytes(&mut file).expect("could not mmap file"))? + }, + ScanSources::Files(files) => { + let Some(file) = files.first() else { + polars_bail!(ComputeError: "no buffers specified for this reader"); + }; + + infer_schema( + get_reader_bytes(&mut std::io::BufReader::new(file)) + .expect("could not mmap file"), + )? + }, + ScanSources::Buffers(buffers) => { + let Some(buffer) = buffers.first() else { + polars_bail!(ComputeError: "no buffers specified for this reader"); + }; + + infer_schema( + get_reader_bytes(&mut std::io::Cursor::new(buffer)) + .expect("could not mmap file"), + )? 
+ }, }; - let mut file = polars_utils::open_file(path)?; - - let reader_bytes = get_reader_bytes(&mut file).expect("could not mmap file"); - let skip_rows = self.read_options.skip_rows; - let parse_options = self.read_options.get_parse_options(); - - let (schema, _, _) = infer_file_schema( - &reader_bytes, - parse_options.separator, - self.read_options.infer_schema_length, - self.read_options.has_header, - // we set it to None and modify them after the schema is updated - None, - skip_rows, - self.read_options.skip_rows_after_header, - parse_options.comment_prefix.as_ref(), - parse_options.quote_char, - parse_options.eol_char, - None, - parse_options.try_parse_dates, - self.read_options.raise_if_empty, - &mut self.read_options.n_threads, - parse_options.decimal_comma, - )?; + self.read_options.n_threads = n_threads; let mut schema = f(schema)?; // the dtypes set may be for the new names, so update again @@ -273,7 +311,7 @@ impl LazyFileListReader for LazyCsvReader { /// Get the final [LazyFrame]. 
fn finish(self) -> PolarsResult { let mut lf: LazyFrame = DslBuilder::scan_csv( - self.paths, + self.sources.to_dsl(false), self.read_options, self.cache, self.cloud_options, @@ -294,12 +332,12 @@ impl LazyFileListReader for LazyCsvReader { self.glob } - fn paths(&self) -> &[PathBuf] { - &self.paths + fn sources(&self) -> &ScanSources { + &self.sources } - fn with_paths(mut self, paths: Arc>) -> Self { - self.paths = paths; + fn with_sources(mut self, sources: ScanSources) -> Self { + self.sources = sources; self } diff --git a/crates/polars-lazy/src/scan/file_list_reader.rs b/crates/polars-lazy/src/scan/file_list_reader.rs index f7b91d427200..28315c96f736 100644 --- a/crates/polars-lazy/src/scan/file_list_reader.rs +++ b/crates/polars-lazy/src/scan/file_list_reader.rs @@ -1,4 +1,5 @@ use std::path::PathBuf; +use std::sync::Arc; use polars_core::prelude::*; use polars_io::cloud::CloudOptions; @@ -18,8 +19,11 @@ pub trait LazyFileListReader: Clone { return self.finish_no_glob(); } - let lfs = self - .paths() + let ScanSources::Paths(paths) = self.sources() else { + unreachable!("opened-files or in-memory buffers should never be globbed"); + }; + + let lfs = paths .iter() .map(|path| { self.clone() @@ -27,7 +31,7 @@ pub trait LazyFileListReader: Clone { .with_n_rows(None) // Each individual reader should not apply a row index. .with_row_index(None) - .with_paths(Arc::new(vec![path.clone()])) + .with_paths([path.clone()].into()) .with_rechunk(false) .finish_no_glob() .map_err(|e| { @@ -40,7 +44,7 @@ pub trait LazyFileListReader: Clone { polars_ensure!( !lfs.is_empty(), - ComputeError: "no matching files found in {:?}", self.paths().iter().map(|x| x.to_str().unwrap()).collect::>() + ComputeError: "no matching files found in {:?}", paths.iter().map(|x| x.to_str().unwrap()).collect::>() ); let mut lf = self.concat_impl(lfs)?; @@ -79,11 +83,18 @@ pub trait LazyFileListReader: Clone { true } - fn paths(&self) -> &[PathBuf]; + /// Get the sources for this reader. 
+ fn sources(&self) -> &ScanSources; + + /// Set sources of the scanned files. + #[must_use] + fn with_sources(self, source: ScanSources) -> Self; /// Set paths of the scanned files. #[must_use] - fn with_paths(self, paths: Arc>) -> Self; + fn with_paths(self, paths: Arc<[PathBuf]>) -> Self { + self.with_sources(ScanSources::Paths(paths)) + } /// Configure the row limit. fn with_n_rows(self, n_rows: impl Into>) -> Self; diff --git a/crates/polars-lazy/src/scan/ipc.rs b/crates/polars-lazy/src/scan/ipc.rs index 9d981bc74c0e..a9f8c8b98b0f 100644 --- a/crates/polars-lazy/src/scan/ipc.rs +++ b/crates/polars-lazy/src/scan/ipc.rs @@ -13,7 +13,6 @@ pub struct ScanArgsIpc { pub cache: bool, pub rechunk: bool, pub row_index: Option, - pub memory_map: bool, pub cloud_options: Option, pub hive_options: HiveOptions, pub include_file_paths: Option, @@ -26,7 +25,6 @@ impl Default for ScanArgsIpc { cache: true, rechunk: false, row_index: None, - memory_map: true, cloud_options: Default::default(), hive_options: Default::default(), include_file_paths: None, @@ -37,29 +35,26 @@ impl Default for ScanArgsIpc { #[derive(Clone)] struct LazyIpcReader { args: ScanArgsIpc, - paths: Arc>, + sources: ScanSources, } impl LazyIpcReader { fn new(args: ScanArgsIpc) -> Self { Self { args, - paths: Arc::new(vec![]), + sources: ScanSources::default(), } } } impl LazyFileListReader for LazyIpcReader { fn finish(self) -> PolarsResult { - let paths = self.paths; let args = self.args; - let options = IpcScanOptions { - memory_map: args.memory_map, - }; + let options = IpcScanOptions {}; let mut lf: LazyFrame = DslBuilder::scan_ipc( - paths, + self.sources.to_dsl(false), options, args.n_rows, args.cache, @@ -80,12 +75,12 @@ impl LazyFileListReader for LazyIpcReader { unreachable!() } - fn paths(&self) -> &[PathBuf] { - &self.paths + fn sources(&self) -> &ScanSources { + &self.sources } - fn with_paths(mut self, paths: Arc>) -> Self { - self.paths = paths; + fn with_sources(mut self, sources: 
ScanSources) -> Self { + self.sources = sources; self } @@ -125,12 +120,17 @@ impl LazyFileListReader for LazyIpcReader { impl LazyFrame { /// Create a LazyFrame directly from a ipc scan. pub fn scan_ipc(path: impl AsRef, args: ScanArgsIpc) -> PolarsResult { - LazyIpcReader::new(args) - .with_paths(Arc::new(vec![path.as_ref().to_path_buf()])) - .finish() + Self::scan_ipc_sources( + ScanSources::Paths([path.as_ref().to_path_buf()].into()), + args, + ) + } + + pub fn scan_ipc_files(paths: Arc<[PathBuf]>, args: ScanArgsIpc) -> PolarsResult { + Self::scan_ipc_sources(ScanSources::Paths(paths), args) } - pub fn scan_ipc_files(paths: Arc>, args: ScanArgsIpc) -> PolarsResult { - LazyIpcReader::new(args).with_paths(paths).finish() + pub fn scan_ipc_sources(sources: ScanSources, args: ScanArgsIpc) -> PolarsResult { + LazyIpcReader::new(args).with_sources(sources).finish() } } diff --git a/crates/polars-lazy/src/scan/ndjson.rs b/crates/polars-lazy/src/scan/ndjson.rs index 0effd26d5497..e38270ec3e09 100644 --- a/crates/polars-lazy/src/scan/ndjson.rs +++ b/crates/polars-lazy/src/scan/ndjson.rs @@ -4,8 +4,8 @@ use std::sync::{Arc, Mutex, RwLock}; use polars_core::prelude::*; use polars_io::cloud::CloudOptions; -use polars_io::RowIndex; -use polars_plan::plans::{DslPlan, FileScan}; +use polars_io::{HiveOptions, RowIndex}; +use polars_plan::plans::{DslPlan, FileScan, ScanSources}; use polars_plan::prelude::{FileScanOptions, NDJsonReadOptions}; use crate::prelude::LazyFrame; @@ -13,7 +13,7 @@ use crate::scan::file_list_reader::LazyFileListReader; #[derive(Clone)] pub struct LazyJsonLineReader { - pub(crate) paths: Arc>, + pub(crate) sources: ScanSources, pub(crate) batch_size: Option, pub(crate) low_memory: bool, pub(crate) rechunk: bool, @@ -28,13 +28,13 @@ pub struct LazyJsonLineReader { } impl LazyJsonLineReader { - pub fn new_paths(paths: Arc>) -> Self { - Self::new(PathBuf::new()).with_paths(paths) + pub fn new_paths(paths: Arc<[PathBuf]>) -> Self { + 
Self::new_with_sources(ScanSources::Paths(paths)) } - pub fn new(path: impl AsRef) -> Self { + pub fn new_with_sources(sources: ScanSources) -> Self { LazyJsonLineReader { - paths: Arc::new(vec![path.as_ref().to_path_buf()]), + sources, batch_size: None, low_memory: false, rechunk: false, @@ -48,6 +48,11 @@ impl LazyJsonLineReader { cloud_options: None, } } + + pub fn new(path: impl AsRef) -> Self { + Self::new_with_sources(ScanSources::Paths([path.as_ref().to_path_buf()].into())) + } + /// Add a row index column. #[must_use] pub fn with_row_index(mut self, row_index: Option) -> Self { @@ -117,8 +122,6 @@ impl LazyJsonLineReader { impl LazyFileListReader for LazyJsonLineReader { fn finish(self) -> PolarsResult { - let paths = Arc::new(Mutex::new((self.paths, false))); - let file_options = FileScanOptions { slice: self.n_rows.map(|x| (0, x)), with_columns: None, @@ -126,7 +129,12 @@ impl LazyFileListReader for LazyJsonLineReader { row_index: self.row_index, rechunk: self.rechunk, file_counter: 0, - hive_options: Default::default(), + hive_options: HiveOptions { + enabled: Some(false), + hive_start_idx: 0, + schema: None, + try_parse_dates: true, + }, glob: true, include_file_paths: self.include_file_paths, }; @@ -147,7 +155,7 @@ impl LazyFileListReader for LazyJsonLineReader { }; Ok(LazyFrame::from(DslPlan::Scan { - paths, + sources: Arc::new(Mutex::new(self.sources.to_dsl(false))), file_info: Arc::new(RwLock::new(None)), hive_parts: None, predicate: None, @@ -160,12 +168,12 @@ impl LazyFileListReader for LazyJsonLineReader { unreachable!(); } - fn paths(&self) -> &[PathBuf] { - &self.paths + fn sources(&self) -> &ScanSources { + &self.sources } - fn with_paths(mut self, paths: Arc>) -> Self { - self.paths = paths; + fn with_sources(mut self, sources: ScanSources) -> Self { + self.sources = sources; self } diff --git a/crates/polars-lazy/src/scan/parquet.rs b/crates/polars-lazy/src/scan/parquet.rs index e87e90e3330a..9adb0f1838be 100644 --- 
a/crates/polars-lazy/src/scan/parquet.rs +++ b/crates/polars-lazy/src/scan/parquet.rs @@ -44,14 +44,14 @@ impl Default for ScanArgsParquet { #[derive(Clone)] struct LazyParquetReader { args: ScanArgsParquet, - paths: Arc>, + sources: ScanSources, } impl LazyParquetReader { fn new(args: ScanArgsParquet) -> Self { Self { args, - paths: Arc::new(vec![]), + sources: ScanSources::default(), } } } @@ -62,7 +62,7 @@ impl LazyFileListReader for LazyParquetReader { let row_index = self.args.row_index; let mut lf: LazyFrame = DslBuilder::scan_parquet( - self.paths, + self.sources.to_dsl(false), self.args.n_rows, self.args.cache, self.args.parallel, @@ -95,12 +95,12 @@ impl LazyFileListReader for LazyParquetReader { unreachable!(); } - fn paths(&self) -> &[PathBuf] { - &self.paths + fn sources(&self) -> &ScanSources { + &self.sources } - fn with_paths(mut self, paths: Arc>) -> Self { - self.paths = paths; + fn with_sources(mut self, sources: ScanSources) -> Self { + self.sources = sources; self } @@ -139,16 +139,19 @@ impl LazyFileListReader for LazyParquetReader { impl LazyFrame { /// Create a LazyFrame directly from a parquet scan. pub fn scan_parquet(path: impl AsRef, args: ScanArgsParquet) -> PolarsResult { - LazyParquetReader::new(args) - .with_paths(Arc::new(vec![path.as_ref().to_path_buf()])) - .finish() + Self::scan_parquet_sources( + ScanSources::Paths([path.as_ref().to_path_buf()].into()), + args, + ) + } + + /// Create a LazyFrame directly from a parquet scan. + pub fn scan_parquet_sources(sources: ScanSources, args: ScanArgsParquet) -> PolarsResult { + LazyParquetReader::new(args).with_sources(sources).finish() } /// Create a LazyFrame directly from a parquet scan. 
- pub fn scan_parquet_files( - paths: Arc>, - args: ScanArgsParquet, - ) -> PolarsResult { - LazyParquetReader::new(args).with_paths(paths).finish() + pub fn scan_parquet_files(paths: Arc<[PathBuf]>, args: ScanArgsParquet) -> PolarsResult { + Self::scan_parquet_sources(ScanSources::Paths(paths), args) } } diff --git a/crates/polars-lazy/src/tests/io.rs b/crates/polars-lazy/src/tests/io.rs index 57beafc63033..a1d3f2c050a8 100644 --- a/crates/polars-lazy/src/tests/io.rs +++ b/crates/polars-lazy/src/tests/io.rs @@ -417,7 +417,6 @@ fn test_ipc_globbing() -> PolarsResult<()> { cache: true, rechunk: false, row_index: None, - memory_map: true, cloud_options: None, hive_options: Default::default(), include_file_paths: None, diff --git a/crates/polars-mem-engine/src/executors/projection_simple.rs b/crates/polars-mem-engine/src/executors/projection_simple.rs index f88ad62c8956..c3102d3b7222 100644 --- a/crates/polars-mem-engine/src/executors/projection_simple.rs +++ b/crates/polars-mem-engine/src/executors/projection_simple.rs @@ -15,7 +15,7 @@ impl ProjectionSimple { impl Executor for ProjectionSimple { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { state.should_stop()?; - let columns = self.columns.get_names_owned(); + let columns = self.columns.iter_names_cloned().collect::>(); let profile_name = if state.has_node_timer() { let name = comma_delimited("simple-projection".to_string(), columns.as_slice()); diff --git a/crates/polars-mem-engine/src/executors/scan/csv.rs b/crates/polars-mem-engine/src/executors/scan/csv.rs index 50ed974e128b..0ebcb7632ae7 100644 --- a/crates/polars-mem-engine/src/executors/scan/csv.rs +++ b/crates/polars-mem-engine/src/executors/scan/csv.rs @@ -1,4 +1,3 @@ -use std::path::PathBuf; use std::sync::Arc; use polars_core::config; @@ -9,7 +8,7 @@ use polars_core::utils::{ use super::*; pub struct CsvExec { - pub paths: Arc>, + pub sources: ScanSources, pub file_info: FileInfo, pub options: CsvReadOptions, pub file_options: 
FileScanOptions, @@ -45,7 +44,7 @@ impl CsvExec { .with_row_index(None) .with_path::<&str>(None); - if self.paths.is_empty() { + if self.sources.is_empty() { let out = if let Some(schema) = options_base.schema { DataFrame::from_rows_and_schema(&[], schema.as_ref())? } else { @@ -56,56 +55,31 @@ impl CsvExec { let verbose = config::verbose(); let force_async = config::force_async(); - let run_async = force_async || is_cloud_url(self.paths.first().unwrap()); + let run_async = (self.sources.is_paths() && force_async) || self.sources.is_cloud_url(); - if force_async && verbose { + if self.sources.is_paths() && force_async && verbose { eprintln!("ASYNC READING FORCED"); } let finish_read = |i: usize, options: CsvReadOptions, predicate: Option>| { - let path = &self.paths[i]; - let mut df = if run_async { - #[cfg(feature = "cloud")] - { - let file = polars_io::file_cache::FILE_CACHE - .get_entry(path.to_str().unwrap()) - // Safety: This was initialized by schema inference. - .unwrap() - .try_open_assume_latest()?; - let owned = &mut vec![]; - let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; - - options - .into_reader_with_file_handle(std::io::Cursor::new( - maybe_decompress_bytes(mmap.as_ref(), owned)?, - )) - ._with_predicate(predicate.clone()) - .finish() - } - #[cfg(not(feature = "cloud"))] - { - panic!("required feature `cloud` is not enabled") - } - } else { - let file = polars_utils::open_file(path)?; - let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; - let owned = &mut vec![]; - - options - .into_reader_with_file_handle(std::io::Cursor::new(maybe_decompress_bytes( - mmap.as_ref(), - owned, - )?)) - ._with_predicate(predicate.clone()) - .finish() - }?; + let source = self.sources.at(i); + let owned = &mut vec![]; + + let memslice = source.to_memslice_async_latest(run_async)?; + + let reader = std::io::Cursor::new(maybe_decompress_bytes(&memslice, owned)?); + let mut df = options + .into_reader_with_file_handle(reader) + 
._with_predicate(predicate.clone()) + .finish()?; if let Some(col) = &self.file_options.include_file_paths { - let path = path.to_str().unwrap(); + let name = source.to_include_path_name(); + unsafe { df.with_column_unchecked( - StringChunked::full(col.clone(), path, df.height()).into_series(), + StringChunked::full(col.clone(), name, df.height()).into_series(), ) }; } @@ -123,14 +97,14 @@ impl CsvExec { } let mut n_rows_read = 0usize; - let mut out = Vec::with_capacity(self.paths.len()); + let mut out = Vec::with_capacity(self.sources.len()); // If we have n_rows or row_index then we need to count how many rows we read, so we need // to delay applying the predicate. let predicate_during_read = predicate .clone() .filter(|_| n_rows.is_none() && self.file_options.row_index.is_none()); - for i in 0..self.paths.len() { + for i in 0..self.sources.len() { let opts = options_base .clone() .with_row_index(self.file_options.row_index.clone().map(|mut ri| { @@ -175,10 +149,10 @@ impl CsvExec { if n_rows.is_some() && n_rows_read == n_rows.unwrap() { if verbose { eprintln!( - "reached n_rows = {} at file {} / {}", + "reached n_rows = {} at source {} / {}", n_rows.unwrap(), 1 + i, - self.paths.len() + self.sources.len() ) } break; @@ -203,10 +177,10 @@ impl CsvExec { let dfs = POOL.install(|| { let step = std::cmp::min(POOL.current_num_threads(), 128); - (0..self.paths.len()) + (0..self.sources.len()) .step_by(step) .map(|start| { - (start..std::cmp::min(start.saturating_add(step), self.paths.len())) + (start..std::cmp::min(start.saturating_add(step), self.sources.len())) .into_par_iter() .map(|i| finish_read(i, options_base.clone(), predicate.clone())) .collect::>>() @@ -235,9 +209,7 @@ impl CsvExec { impl Executor for CsvExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { let profile_name = if state.has_node_timer() { - let mut ids = vec![PlSmallStr::from_str( - self.paths[0].to_string_lossy().as_ref(), - )]; + let mut ids = vec![self.sources.id()]; if 
self.predicate.is_some() { ids.push("predicate".into()) } diff --git a/crates/polars-mem-engine/src/executors/scan/ipc.rs b/crates/polars-mem-engine/src/executors/scan/ipc.rs index 18d47c172bcd..78b31f268756 100644 --- a/crates/polars-mem-engine/src/executors/scan/ipc.rs +++ b/crates/polars-mem-engine/src/executors/scan/ipc.rs @@ -1,19 +1,20 @@ -use std::path::PathBuf; - use hive::HivePartitions; use polars_core::config; use polars_core::utils::accumulate_dataframes_vertical; +use polars_error::feature_gated; use polars_io::cloud::CloudOptions; use polars_io::path_utils::is_cloud_url; use polars_io::predicates::apply_predicate; +use polars_utils::mmap::MemSlice; use rayon::prelude::*; use super::*; pub struct IpcExec { - pub(crate) paths: Arc>, + pub(crate) sources: ScanSources, pub(crate) file_info: FileInfo, pub(crate) predicate: Option>, + #[allow(dead_code)] pub(crate) options: IpcScanOptions, pub(crate) file_options: FileScanOptions, pub(crate) hive_parts: Option>>, @@ -22,23 +23,20 @@ pub struct IpcExec { impl IpcExec { fn read(&mut self) -> PolarsResult { - let is_cloud = self.paths.iter().any(is_cloud_url); + let is_cloud = match &self.sources { + ScanSources::Paths(paths) => paths.iter().any(is_cloud_url), + ScanSources::Files(_) | ScanSources::Buffers(_) => false, + }; let force_async = config::force_async(); - let mut out = if is_cloud || force_async { - #[cfg(not(feature = "cloud"))] - { - panic!("activate cloud feature") - } - - #[cfg(feature = "cloud")] - { + let mut out = if is_cloud || (self.sources.is_paths() && force_async) { + feature_gated!("cloud", { if force_async && config::verbose() { eprintln!("ASYNC READING FORCED"); } polars_io::pl_async::get_runtime().block_on_potential_spawn(self.read_async())? - } + }) } else { self.read_sync()? 
}; @@ -50,9 +48,9 @@ impl IpcExec { Ok(out) } - fn read_impl PolarsResult + Send + Sync>( + fn read_impl( &mut self, - path_idx_to_file: F, + idx_to_cached_file: impl Fn(usize) -> Option> + Send + Sync, ) -> PolarsResult { if config::verbose() { eprintln!("executing ipc read sync with row_index = {:?}, n_rows = {:?}, predicate = {:?} for paths {:?}", @@ -62,7 +60,7 @@ impl IpcExec { x.1 }).as_ref(), self.predicate.is_some(), - self.paths + self.sources, ); } @@ -73,26 +71,36 @@ impl IpcExec { self.file_options.row_index.is_some(), ); - let read_path = |path_index: usize, n_rows: Option| { - IpcReader::new(path_idx_to_file(path_index)?) + let read_path = |index: usize, n_rows: Option| { + let source = self.sources.at(index); + + let memslice = match source { + ScanSourceRef::Path(path) => { + let file = match idx_to_cached_file(index) { + None => std::fs::File::open(path)?, + Some(f) => f?, + }; + + MemSlice::from_file(&file)? + }, + ScanSourceRef::File(file) => MemSlice::from_file(file)?, + ScanSourceRef::Buffer(buff) => MemSlice::from_bytes(buff.clone()), + }; + + IpcReader::new(std::io::Cursor::new(memslice)) .with_n_rows(n_rows) .with_row_index(self.file_options.row_index.clone()) .with_projection(projection.clone()) .with_hive_partition_columns( self.hive_parts .as_ref() - .map(|x| x[path_index].materialize_partition_columns()), + .map(|x| x[index].materialize_partition_columns()), ) - .with_include_file_path(self.file_options.include_file_paths.as_ref().map(|x| { - ( - x.clone(), - Arc::from(self.paths[path_index].to_str().unwrap().to_string()), - ) - })) - .memory_mapped( - self.options - .memory_map - .then(|| self.paths[path_index].clone()), + .with_include_file_path( + self.file_options + .include_file_paths + .as_ref() + .map(|x| (x.clone(), Arc::from(source.to_include_path_name()))), ) .finish() }; @@ -101,9 +109,9 @@ impl IpcExec { assert_eq!(x.0, 0); x.1 }) { - let mut out = Vec::with_capacity(self.paths.len()); + let mut out = 
Vec::with_capacity(self.sources.len()); - for i in 0..self.paths.len() { + for i in 0..self.sources.len() { let df = read_path(i, Some(n_rows))?; let df_height = df.height(); out.push(df); @@ -121,7 +129,7 @@ impl IpcExec { out } else { POOL.install(|| { - (0..self.paths.len()) + (0..self.sources.len()) .into_par_iter() .map(|i| read_path(i, None)) .collect::>>() @@ -157,8 +165,7 @@ impl IpcExec { } fn read_sync(&mut self) -> PolarsResult { - let paths = self.paths.clone(); - self.read_impl(move |i| std::fs::File::open(&paths[i]).map_err(Into::into)) + self.read_impl(|_| None) } #[cfg(feature = "cloud")] @@ -167,9 +174,11 @@ impl IpcExec { // concurrently. use polars_io::file_cache::init_entries_from_uri_list; + let paths = self.sources.into_paths().unwrap(); + tokio::task::block_in_place(|| { let cache_entries = init_entries_from_uri_list( - self.paths + paths .iter() .map(|x| Arc::from(x.to_str().unwrap())) .collect::>() @@ -177,7 +186,7 @@ impl IpcExec { self.cloud_options.as_ref(), )?; - self.read_impl(move |i| cache_entries[i].try_open_check_latest()) + self.read_impl(|i| Some(cache_entries[i].try_open_check_latest())) }) } } @@ -185,9 +194,7 @@ impl IpcExec { impl Executor for IpcExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { let profile_name = if state.has_node_timer() { - let mut ids = vec![PlSmallStr::from_str( - self.paths[0].to_string_lossy().as_ref(), - )]; + let mut ids = vec![self.sources.id()]; if self.predicate.is_some() { ids.push("predicate".into()) } diff --git a/crates/polars-mem-engine/src/executors/scan/ndjson.rs b/crates/polars-mem-engine/src/executors/scan/ndjson.rs index 680e5cbf3bed..a662760fd54b 100644 --- a/crates/polars-mem-engine/src/executors/scan/ndjson.rs +++ b/crates/polars-mem-engine/src/executors/scan/ndjson.rs @@ -1,12 +1,10 @@ -use std::path::PathBuf; - use polars_core::config; use polars_core::utils::accumulate_dataframes_vertical; use super::*; pub struct JsonExec { - paths: Arc>, + sources: 
ScanSources, options: NDJsonReadOptions, file_scan_options: FileScanOptions, file_info: FileInfo, @@ -15,14 +13,14 @@ pub struct JsonExec { impl JsonExec { pub fn new( - paths: Arc>, + sources: ScanSources, options: NDJsonReadOptions, file_scan_options: FileScanOptions, file_info: FileInfo, predicate: Option>, ) -> Self { Self { - paths, + sources, options, file_scan_options, file_info, @@ -41,9 +39,9 @@ impl JsonExec { let verbose = config::verbose(); let force_async = config::force_async(); - let run_async = force_async || is_cloud_url(self.paths.first().unwrap()); + let run_async = (self.sources.is_paths() && force_async) || self.sources.is_cloud_url(); - if force_async && verbose { + if self.sources.is_paths() && force_async && verbose { eprintln!("ASYNC READING FORCED"); } @@ -67,48 +65,27 @@ impl JsonExec { } let dfs = self - .paths + .sources .iter() - .map_while(|p| { + .map_while(|source| { if n_rows == Some(0) { return None; } - let file = if run_async { - #[cfg(feature = "cloud")] - { - match polars_io::file_cache::FILE_CACHE - .get_entry(p.to_str().unwrap()) - // Safety: This was initialized by schema inference. 
- .unwrap() - .try_open_assume_latest() - { - Ok(v) => v, - Err(e) => return Some(Err(e)), - } - } - #[cfg(not(feature = "cloud"))] - { - panic!("required feature `cloud` is not enabled") - } - } else { - match polars_utils::open_file(p.as_ref()) { - Ok(v) => v, - Err(e) => return Some(Err(e)), - } + let row_index = self.file_scan_options.row_index.as_mut(); + + let memslice = match source.to_memslice_async_latest(run_async) { + Ok(memslice) => memslice, + Err(err) => return Some(Err(err)), }; - let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; let owned = &mut vec![]; - let curs = - std::io::Cursor::new(match maybe_decompress_bytes(mmap.as_ref(), owned) { - Ok(v) => v, - Err(e) => return Some(Err(e)), - }); + let curs = std::io::Cursor::new(match maybe_decompress_bytes(&memslice, owned) { + Ok(v) => v, + Err(e) => return Some(Err(e)), + }); let reader = JsonLineReader::new(curs); - let row_index = self.file_scan_options.row_index.as_mut(); - let df = reader .with_schema(schema.clone()) .with_rechunk(self.file_scan_options.rechunk) @@ -131,10 +108,10 @@ impl JsonExec { } if let Some(col) = &self.file_scan_options.include_file_paths { - let path = p.to_str().unwrap(); + let name = source.to_include_path_name(); unsafe { df.with_column_unchecked( - StringChunked::full(col.clone(), path, df.height()).into_series(), + StringChunked::full(col.clone(), name, df.height()).into_series(), ) }; } @@ -150,7 +127,7 @@ impl JsonExec { impl Executor for JsonExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { let profile_name = if state.has_node_timer() { - let ids = vec![self.paths[0].to_string_lossy().clone()]; + let ids = vec![self.sources.id()]; let name = comma_delimited("ndjson".to_string(), &ids); Cow::Owned(name) } else { diff --git a/crates/polars-mem-engine/src/executors/scan/parquet.rs b/crates/polars-mem-engine/src/executors/scan/parquet.rs index bd3d87ff8832..b9012344abb9 100644 --- 
a/crates/polars-mem-engine/src/executors/scan/parquet.rs +++ b/crates/polars-mem-engine/src/executors/scan/parquet.rs @@ -1,20 +1,18 @@ -use std::path::PathBuf; - use hive::HivePartitions; use polars_core::config; #[cfg(feature = "cloud")] use polars_core::config::{get_file_prefetch_size, verbose}; use polars_core::utils::accumulate_dataframes_vertical; +use polars_error::feature_gated; use polars_io::cloud::CloudOptions; -use polars_io::parquet::metadata::FileMetaDataRef; -use polars_io::path_utils::is_cloud_url; +use polars_io::parquet::metadata::FileMetadataRef; use polars_io::utils::slice::split_slice_at_file; use polars_io::RowIndex; use super::*; pub struct ParquetExec { - paths: Arc>, + sources: ScanSources, file_info: FileInfo, hive_parts: Option>>, predicate: Option>, @@ -23,23 +21,23 @@ pub struct ParquetExec { cloud_options: Option, file_options: FileScanOptions, #[allow(dead_code)] - metadata: Option, + metadata: Option, } impl ParquetExec { #[allow(clippy::too_many_arguments)] pub(crate) fn new( - paths: Arc>, + sources: ScanSources, file_info: FileInfo, hive_parts: Option>>, predicate: Option>, options: ParquetOptions, cloud_options: Option, file_options: FileScanOptions, - metadata: Option, + metadata: Option, ) -> Self { ParquetExec { - paths, + sources, file_info, hive_parts, predicate, @@ -52,7 +50,7 @@ impl ParquetExec { fn read_par(&mut self) -> PolarsResult> { let parallel = match self.options.parallel { - ParallelStrategy::Auto if self.paths.len() > POOL.current_num_threads() => { + ParallelStrategy::Auto if self.sources.len() > POOL.current_num_threads() => { ParallelStrategy::RowGroups }, identity => identity, @@ -62,7 +60,7 @@ impl ParquetExec { let step = std::cmp::min(POOL.current_num_threads(), 128); // Modified if we have a negative slice - let mut first_file = 0; + let mut first_source = 0; // (offset, end) let (slice_offset, slice_end) = if let Some(slice) = self.file_options.slice { @@ -75,15 +73,16 @@ impl ParquetExec { let mut 
cum_rows = 0; let chunk_size = 8; POOL.install(|| { - for path_indexes in (0..self.paths.len()) + for path_indexes in (0..self.sources.len()) .rev() .collect::>() .chunks(chunk_size) { let row_counts = path_indexes .into_par_iter() - .map(|i| { - ParquetReader::new(std::fs::File::open(&self.paths[*i])?).num_rows() + .map(|&i| { + let memslice = self.sources.at(i).to_memslice()?; + ParquetReader::new(std::io::Cursor::new(memslice)).num_rows() }) .collect::>>()?; @@ -91,12 +90,12 @@ impl ParquetExec { cum_rows += rc; if cum_rows >= slice_start_as_n_from_end { - first_file = *path_idx; + first_source = *path_idx; break; } } - if first_file > 0 { + if first_source > 0 { break; } } @@ -125,10 +124,8 @@ impl ParquetExec { let base_row_index = self.file_options.row_index.take(); // Limit no. of files at a time to prevent open file limits. - for i in (first_file..self.paths.len()).step_by(step) { - let end = std::cmp::min(i.saturating_add(step), self.paths.len()); - let paths = &self.paths[i..end]; - let hive_parts = self.hive_parts.as_ref().map(|x| &x[i..end]); + for i in (first_source..self.sources.len()).step_by(step) { + let end = std::cmp::min(i.saturating_add(step), self.sources.len()); if current_offset >= slice_end && !result.is_empty() { return Ok(result); @@ -137,11 +134,13 @@ impl ParquetExec { // First initialize the readers, predicates and metadata. // This will be used to determine the slices. That way we can actually read all the // files in parallel even if we add row index columns or slices. 
- let iter = (0..paths.len()).into_par_iter().map(|i| { - let path = &paths[i]; - let hive_partitions = hive_parts.map(|x| x[i].materialize_partition_columns()); + let iter = (i..end).into_par_iter().map(|i| { + let source = self.sources.at(i); + let hive_partitions = self + .hive_parts + .as_ref() + .map(|x| x[i].materialize_partition_columns()); - let file = std::fs::File::open(path)?; let (projection, predicate) = prepare_scan_args( self.predicate.clone(), &mut self.file_options.with_columns.clone(), @@ -150,7 +149,9 @@ impl ParquetExec { hive_partitions.as_deref(), ); - let mut reader = ParquetReader::new(file) + let memslice = source.to_memslice()?; + + let mut reader = ParquetReader::new(std::io::Cursor::new(memslice)) .read_parallel(parallel) .set_low_memory(self.options.low_memory) .use_statistics(self.options.use_statistics) @@ -160,7 +161,7 @@ impl ParquetExec { self.file_options .include_file_paths .as_ref() - .map(|x| (x.clone(), Arc::from(paths[i].to_str().unwrap()))), + .map(|x| (x.clone(), Arc::from(source.to_include_path_name()))), ); reader @@ -221,6 +222,7 @@ impl ParquetExec { result.extend_from_slice(&out) } } + Ok(result) } @@ -231,6 +233,7 @@ impl ParquetExec { use polars_io::utils::slice::split_slice_at_file; let verbose = verbose(); + let paths = self.sources.into_paths().unwrap(); let first_metadata = &self.metadata; let cloud_options = self.cloud_options.as_ref(); @@ -254,13 +257,13 @@ impl ParquetExec { let slice_start_as_n_from_end = -slice.0 as usize; let mut cum_rows = 0; - let paths = &self.paths; + let paths = &paths; let cloud_options = Arc::new(self.cloud_options.clone()); let paths = paths.clone(); let cloud_options = cloud_options.clone(); - let mut iter = stream::iter((0..self.paths.len()).rev().map(|i| { + let mut iter = stream::iter((0..paths.len()).rev().map(|i| { let paths = paths.clone(); let cloud_options = cloud_options.clone(); @@ -312,9 +315,9 @@ impl ParquetExec { let base_row_index = 
self.file_options.row_index.take(); let mut processed = 0; - for batch_start in (first_file_idx..self.paths.len()).step_by(batch_size) { - let end = std::cmp::min(batch_start.saturating_add(batch_size), self.paths.len()); - let paths = &self.paths[batch_start..end]; + for batch_start in (first_file_idx..paths.len()).step_by(batch_size) { + let end = std::cmp::min(batch_start.saturating_add(batch_size), paths.len()); + let paths = &paths[batch_start..end]; let hive_parts = self.hive_parts.as_ref().map(|x| &x[batch_start..end]); if current_offset >= slice_end && !result.is_empty() { @@ -325,7 +328,7 @@ impl ParquetExec { eprintln!( "querying metadata of {}/{} files...", processed, - self.paths.len() + paths.len() ); } @@ -371,7 +374,7 @@ impl ParquetExec { let include_file_paths = self.file_options.include_file_paths.as_ref(); if verbose { - eprintln!("reading of {}/{} file...", processed, self.paths.len()); + eprintln!("reading of {}/{} file...", processed, paths.len()); } let iter = readers_and_metadata @@ -447,23 +450,17 @@ impl ParquetExec { .and_then(|_| self.predicate.take()) .map(phys_expr_to_io_expr); - let is_cloud = is_cloud_url(self.paths.first().unwrap()); + let is_cloud = self.sources.is_cloud_url(); let force_async = config::force_async(); - let out = if is_cloud || force_async { - #[cfg(not(feature = "cloud"))] - { - panic!("activate cloud feature") - } - - #[cfg(feature = "cloud")] - { + let out = if is_cloud || (self.sources.is_paths() && force_async) { + feature_gated!("cloud", { if force_async && config::verbose() { eprintln!("ASYNC READING FORCED"); } polars_io::pl_async::get_runtime().block_on_potential_spawn(self.read_async())? - } + }) } else { self.read_par()? 
}; @@ -482,7 +479,7 @@ impl ParquetExec { impl Executor for ParquetExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { let profile_name = if state.has_node_timer() { - let mut ids = vec![self.paths[0].to_string_lossy()]; + let mut ids = vec![self.sources.id()]; if self.predicate.is_some() { ids.push("predicate".into()) } diff --git a/crates/polars-mem-engine/src/planner/lp.rs b/crates/polars-mem-engine/src/planner/lp.rs index 523cd1e5c588..e1b53bea2151 100644 --- a/crates/polars-mem-engine/src/planner/lp.rs +++ b/crates/polars-mem-engine/src/planner/lp.rs @@ -276,7 +276,7 @@ fn create_physical_plan_impl( }, #[allow(unused_variables)] Scan { - paths, + sources, file_info, hive_parts, output_schema, @@ -306,7 +306,7 @@ fn create_physical_plan_impl( match scan_type { #[cfg(feature = "csv")] FileScan::Csv { options, .. } => Ok(Box::new(executors::CsvExec { - paths, + sources, file_info, options, predicate, @@ -318,7 +318,7 @@ fn create_physical_plan_impl( cloud_options, metadata, } => Ok(Box::new(executors::IpcExec { - paths, + sources, file_info, predicate, options, @@ -332,7 +332,7 @@ fn create_physical_plan_impl( cloud_options, metadata, } => Ok(Box::new(executors::ParquetExec::new( - paths, + sources, file_info, hive_parts, predicate, @@ -343,7 +343,7 @@ fn create_physical_plan_impl( ))), #[cfg(feature = "json")] FileScan::NDJson { options, .. 
} => Ok(Box::new(executors::JsonExec::new( - paths, + sources, options, file_options, file_info, @@ -430,7 +430,7 @@ fn create_physical_plan_impl( .transpose()?; Ok(Box::new(executors::DataFrameExec { df, - projection: output_schema.map(|s| s.iter_names().cloned().collect()), + projection: output_schema.map(|s| s.iter_names_cloned().collect()), filter: selection, predicate_has_windows: state.has_windows, })) diff --git a/crates/polars-mem-engine/src/utils.rs b/crates/polars-mem-engine/src/utils.rs index cb04d599a7f0..91bd0e17902a 100644 --- a/crates/polars-mem-engine/src/utils.rs +++ b/crates/polars-mem-engine/src/utils.rs @@ -1,22 +1,28 @@ -use std::path::PathBuf; +use std::path::Path; pub(crate) use polars_plan::plans::ArenaLpIter; -use polars_plan::plans::IR; +use polars_plan::plans::{ScanSources, IR}; use polars_utils::aliases::PlHashSet; use polars_utils::arena::{Arena, Node}; /// Get a set of the data source paths in this LogicalPlan -pub(crate) fn agg_source_paths( +/// +/// # Notes +/// +/// - Scan sources with opened files or in-memory buffers are ignored. +pub(crate) fn agg_source_paths<'a>( root_lp: Node, - acc_paths: &mut PlHashSet, - lp_arena: &Arena, + acc_paths: &mut PlHashSet<&'a Path>, + lp_arena: &'a Arena, ) { - lp_arena.iter(root_lp).for_each(|(_, lp)| { - use IR::*; - if let Scan { paths, .. } = lp { - for path in paths.as_ref() { - acc_paths.insert(path.clone()); + for (_, lp) in lp_arena.iter(root_lp) { + if let IR::Scan { sources, .. 
} = lp { + match sources { + ScanSources::Paths(paths) => acc_paths.extend(paths.iter().map(|p| p.as_path())), + ScanSources::Buffers(_) | ScanSources::Files(_) => { + // Ignore + }, } } - }) + } } diff --git a/crates/polars-ops/Cargo.toml b/crates/polars-ops/Cargo.toml index 0782f188b1df..2f37857c9cd2 100644 --- a/crates/polars-ops/Cargo.toml +++ b/crates/polars-ops/Cargo.toml @@ -117,6 +117,7 @@ pivot = ["polars-core/reinterpret", "polars-core/dtype-struct"] cross_join = [] chunked_ids = [] asof_join = [] +iejoin = [] semi_anti_join = [] array_any_all = ["dtype-array"] array_count = ["dtype-array"] diff --git a/crates/polars-ops/src/chunked_array/list/namespace.rs b/crates/polars-ops/src/chunked_array/list/namespace.rs index 02dc0fe3e68c..0c7a0975488c 100644 --- a/crates/polars-ops/src/chunked_array/list/namespace.rs +++ b/crates/polars-ops/src/chunked_array/list/namespace.rs @@ -447,7 +447,7 @@ pub trait ListNameSpaceImpl: AsList { use DataType::*; match idx.dtype() { - List(_) => { + List(boxed_dt) if boxed_dt.is_integer() => { let idx_ca = idx.list().unwrap(); let mut out = { list_ca diff --git a/crates/polars-ops/src/frame/join/args.rs b/crates/polars-ops/src/frame/join/args.rs index 61ccf86257e2..10eee5d765df 100644 --- a/crates/polars-ops/src/frame/join/args.rs +++ b/crates/polars-ops/src/frame/join/args.rs @@ -58,6 +58,8 @@ impl JoinCoalesce { }, #[cfg(feature = "asof_join")] AsOf(_) => matches!(self, JoinSpecific | CoalesceColumns), + #[cfg(feature = "iejoin")] + IEJoin(_) => false, Cross => false, #[cfg(feature = "semi_anti_join")] Semi | Anti => false, @@ -120,6 +122,8 @@ pub enum JoinType { Semi, #[cfg(feature = "semi_anti_join")] Anti, + #[cfg(feature = "iejoin")] + IEJoin(IEJoinOptions), } impl From for JoinArgs { @@ -138,6 +142,8 @@ impl Display for JoinType { Full { .. 
} => "FULL", #[cfg(feature = "asof_join")] AsOf(_) => "ASOF", + #[cfg(feature = "iejoin")] + IEJoin(_) => "IEJOIN", Cross => "CROSS", #[cfg(feature = "semi_anti_join")] Semi => "SEMI", diff --git a/crates/polars-ops/src/frame/join/iejoin/filtered_bit_array.rs b/crates/polars-ops/src/frame/join/iejoin/filtered_bit_array.rs new file mode 100644 index 000000000000..2c741a797f11 --- /dev/null +++ b/crates/polars-ops/src/frame/join/iejoin/filtered_bit_array.rs @@ -0,0 +1,49 @@ +use std::cmp::min; + +use arrow::bitmap::MutableBitmap; + +/// Bit array with a filter to speed up searching for set bits when sparse, +/// based on section 4.1 from Khayyat et al. 2015, +/// "Lightning Fast and Space Efficient Inequality Joins" +pub struct FilteredBitArray { + bit_array: MutableBitmap, + filter: MutableBitmap, +} + +impl FilteredBitArray { + const CHUNK_SIZE: usize = 1024; + + pub fn from_len_zeroed(len: usize) -> Self { + Self { + bit_array: MutableBitmap::from_len_zeroed(len), + filter: MutableBitmap::from_len_zeroed(len.div_ceil(Self::CHUNK_SIZE)), + } + } + + pub unsafe fn set_bit_unchecked(&mut self, index: usize) { + self.bit_array.set_unchecked(index, true); + self.filter.set_unchecked(index / Self::CHUNK_SIZE, true); + } + + pub unsafe fn on_set_bits_from(&self, start: usize, mut action: F) + where + F: FnMut(usize), + { + let start_chunk = start / Self::CHUNK_SIZE; + let mut chunk_offset = start % Self::CHUNK_SIZE; + for chunk_idx in start_chunk..self.filter.len() { + if self.filter.get_unchecked(chunk_idx) { + // There are some set bits in this chunk + let start = chunk_idx * Self::CHUNK_SIZE + chunk_offset; + let end = min((chunk_idx + 1) * Self::CHUNK_SIZE, self.bit_array.len()); + for bit_idx in start..end { + // SAFETY: `bit_idx` is always less than `self.bit_array.len()` + if self.bit_array.get_unchecked(bit_idx) { + action(bit_idx); + } + } + } + chunk_offset = 0; + } + } +} diff --git a/crates/polars-ops/src/frame/join/iejoin/l1_l2.rs 
b/crates/polars-ops/src/frame/join/iejoin/l1_l2.rs new file mode 100644 index 000000000000..67aa4cf6393b --- /dev/null +++ b/crates/polars-ops/src/frame/join/iejoin/l1_l2.rs @@ -0,0 +1,262 @@ +use polars_core::chunked_array::ChunkedArray; +use polars_core::datatypes::{IdxCa, PolarsNumericType}; +use polars_core::prelude::Series; +use polars_core::with_match_physical_numeric_polars_type; +use polars_error::PolarsResult; +use polars_utils::total_ord::TotalOrd; +use polars_utils::IdxSize; + +use super::*; + +/// Create a vector of L1 items from the array of LHS x values concatenated with RHS x values +/// and their ordering. +pub(super) fn build_l1_array( + ca: &ChunkedArray, + order: &IdxCa, + right_df_offset: IdxSize, +) -> PolarsResult>> +where + T: PolarsNumericType, +{ + assert_eq!(order.null_count(), 0); + assert_eq!(ca.chunks().len(), 1); + let arr = ca.downcast_get(0).unwrap(); + // Even if there are nulls, they will not be selected by order. + let values = arr.values().as_slice(); + + let mut array: Vec> = Vec::with_capacity(ca.len()); + + for order_arr in order.downcast_iter() { + for index in order_arr.values().as_slice().iter().copied() { + debug_assert!(arr.get(index as usize).is_some()); + let value = unsafe { *values.get_unchecked(index as usize) }; + let row_index = if index < right_df_offset { + // Row from LHS + index as i64 + 1 + } else { + // Row from RHS + -((index - right_df_offset) as i64) - 1 + }; + array.push(L1Item { row_index, value }); + } + } + + Ok(array) +} + +pub(super) fn build_l2_array(s: &Series, order: &[IdxSize]) -> PolarsResult> { + with_match_physical_numeric_polars_type!(s.dtype(), |$T| { + build_l2_array_impl::<$T>(s.as_ref().as_ref(), order) + }) +} + +/// Create a vector of L2 items from the array of y values ordered according to the L1 order, +/// and their ordering. We don't need to store actual y values but only track whether we're at +/// the end of a run of equal values. 
+fn build_l2_array_impl(ca: &ChunkedArray, order: &[IdxSize]) -> PolarsResult> +where + T: PolarsNumericType, + T::Native: TotalOrd, +{ + assert_eq!(ca.chunks().len(), 1); + + let mut array = Vec::with_capacity(ca.len()); + let mut prev_index = 0; + let mut prev_value = T::Native::default(); + + let arr = ca.downcast_get(0).unwrap(); + // Even if there are nulls, they will not be selected by order. + let values = arr.values().as_slice(); + + for (i, l1_index) in order.iter().copied().enumerate() { + debug_assert!(arr.get(l1_index as usize).is_some()); + let value = unsafe { *values.get_unchecked(l1_index as usize) }; + if i > 0 { + array.push(L2Item { + l1_index: prev_index, + run_end: value.tot_ne(&prev_value), + }); + } + prev_index = l1_index; + prev_value = value; + } + if !order.is_empty() { + array.push(L2Item { + l1_index: prev_index, + run_end: true, + }); + } + Ok(array) +} + +/// Item in L1 array used in the IEJoin algorithm +#[derive(Clone, Copy, Debug)] +pub(super) struct L1Item { + /// 1 based index for entries from the LHS df, or -1 based index for entries from the RHS + pub(super) row_index: i64, + /// X value + pub(super) value: T, +} + +/// Item in L2 array used in the IEJoin algorithm +#[derive(Clone, Copy, Debug)] +pub(super) struct L2Item { + /// Corresponding index into the L1 array of + pub(super) l1_index: IdxSize, + /// Whether this is the end of a run of equal y values + pub(super) run_end: bool, +} + +pub(super) trait L1Array { + unsafe fn process_entry( + &self, + l1_index: usize, + bit_array: &mut FilteredBitArray, + op1: InequalityOperator, + left_row_ids: &mut Vec, + right_row_ids: &mut Vec, + ) -> i64; + + unsafe fn process_lhs_entry( + &self, + l1_index: usize, + bit_array: &FilteredBitArray, + op1: InequalityOperator, + left_row_ids: &mut Vec, + right_row_ids: &mut Vec, + ) -> i64; + + unsafe fn mark_visited(&self, index: usize, bit_array: &mut FilteredBitArray); +} + +/// Find the position in the L1 array where we should begin 
checking for matches, +/// given the index in L1 corresponding to the current position in L2. +unsafe fn find_search_start_index( + l1_array: &[L1Item], + index: usize, + operator: InequalityOperator, +) -> usize +where + T: NumericNative, + T: TotalOrd, +{ + let sub_l1 = l1_array.get_unchecked_release(index..); + let value = l1_array.get_unchecked_release(index).value; + + match operator { + InequalityOperator::Gt => { + sub_l1.partition_point_exponential(|a| a.value.tot_ge(&value)) + index + }, + InequalityOperator::Lt => { + sub_l1.partition_point_exponential(|a| a.value.tot_le(&value)) + index + }, + InequalityOperator::GtEq => { + sub_l1.partition_point_exponential(|a| value.tot_lt(&a.value)) + index + }, + InequalityOperator::LtEq => { + sub_l1.partition_point_exponential(|a| value.tot_gt(&a.value)) + index + }, + } +} + +fn find_matches_in_l1( + l1_array: &[L1Item], + l1_index: usize, + row_index: i64, + bit_array: &FilteredBitArray, + op1: InequalityOperator, + left_row_ids: &mut Vec, + right_row_ids: &mut Vec, +) -> i64 +where + T: NumericNative, + T: TotalOrd, +{ + debug_assert!(row_index > 0); + let mut match_count = 0; + + // This entry comes from the left hand side DataFrame. + // Find all following entries in L1 (meaning they satisfy the first operator) + // that have already been visited (so satisfy the second operator). + // Because we use a stable sort for l2, we know that we won't find any + // matches for duplicate y values when traversing forwards in l1. + let start_index = unsafe { find_search_start_index(l1_array, l1_index, op1) }; + unsafe { + bit_array.on_set_bits_from(start_index, |set_bit: usize| { + // SAFETY + // set bit is within bounds. 
+ let right_row_index = l1_array.get_unchecked_release(set_bit).row_index; + debug_assert!(right_row_index < 0); + left_row_ids.push((row_index - 1) as IdxSize); + right_row_ids.push((-right_row_index) as IdxSize - 1); + match_count += 1; + }) + }; + + match_count +} + +impl L1Array for Vec> +where + T: NumericNative, +{ + unsafe fn process_entry( + &self, + l1_index: usize, + bit_array: &mut FilteredBitArray, + op1: InequalityOperator, + left_row_ids: &mut Vec, + right_row_ids: &mut Vec, + ) -> i64 { + let row_index = self.get_unchecked_release(l1_index).row_index; + let from_lhs = row_index > 0; + if from_lhs { + find_matches_in_l1( + self, + l1_index, + row_index, + bit_array, + op1, + left_row_ids, + right_row_ids, + ) + } else { + bit_array.set_bit_unchecked(l1_index); + 0 + } + } + + unsafe fn process_lhs_entry( + &self, + l1_index: usize, + bit_array: &FilteredBitArray, + op1: InequalityOperator, + left_row_ids: &mut Vec, + right_row_ids: &mut Vec, + ) -> i64 { + let row_index = self.get_unchecked_release(l1_index).row_index; + let from_lhs = row_index > 0; + if from_lhs { + find_matches_in_l1( + self, + l1_index, + row_index, + bit_array, + op1, + left_row_ids, + right_row_ids, + ) + } else { + 0 + } + } + + unsafe fn mark_visited(&self, index: usize, bit_array: &mut FilteredBitArray) { + let from_lhs = self.get_unchecked_release(index).row_index > 0; + // We only mark RHS entries as visited, + // so that we don't try to match LHS entries with other LHS entries. 
+ if !from_lhs { + bit_array.set_bit_unchecked(index); + } + } +} diff --git a/crates/polars-ops/src/frame/join/iejoin/mod.rs b/crates/polars-ops/src/frame/join/iejoin/mod.rs new file mode 100644 index 000000000000..d0698018c5bb --- /dev/null +++ b/crates/polars-ops/src/frame/join/iejoin/mod.rs @@ -0,0 +1,383 @@ +mod filtered_bit_array; +mod l1_l2; + +use filtered_bit_array::FilteredBitArray; +use l1_l2::*; +use polars_core::chunked_array::ChunkedArray; +use polars_core::datatypes::{IdxCa, NumericNative, PolarsNumericType}; +use polars_core::frame::DataFrame; +use polars_core::prelude::*; +use polars_core::utils::{_set_partition_size, split}; +use polars_core::{with_match_physical_numeric_polars_type, POOL}; +use polars_error::{polars_err, PolarsResult}; +use polars_utils::binary_search::ExponentialSearch; +use polars_utils::itertools::Itertools; +use polars_utils::slice::GetSaferUnchecked; +use polars_utils::total_ord::TotalEq; +use polars_utils::IdxSize; +use rayon::prelude::*; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +use crate::frame::_finish_join; + +#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub enum InequalityOperator { + #[default] + Lt, + LtEq, + Gt, + GtEq, +} + +impl InequalityOperator { + fn is_strict(&self) -> bool { + matches!(self, InequalityOperator::Gt | InequalityOperator::Lt) + } +} +#[derive(Clone, Debug, PartialEq, Eq, Default, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub struct IEJoinOptions { + pub operator1: InequalityOperator, + pub operator2: InequalityOperator, +} + +#[allow(clippy::too_many_arguments)] +fn ie_join_impl_t( + slice: Option<(i64, usize)>, + l1_order: IdxCa, + l2_order: &[IdxSize], + op1: InequalityOperator, + op2: InequalityOperator, + x: Series, + y_ordered_by_x: Series, + left_height: usize, +) -> PolarsResult<(Vec, Vec)> { + // Create a bit array with order corresponding to L1, + // 
denoting which entries have been visited while traversing L2. + let mut bit_array = FilteredBitArray::from_len_zeroed(l1_order.len()); + + let mut left_row_idx: Vec = vec![]; + let mut right_row_idx: Vec = vec![]; + + let slice_end = match slice { + Some((offset, len)) if offset >= 0 => Some(offset.saturating_add_unsigned(len as u64)), + _ => None, + }; + let mut match_count = 0; + + let ca: &ChunkedArray = x.as_ref().as_ref(); + let l1_array = build_l1_array(ca, &l1_order, left_height as IdxSize)?; + + if op2.is_strict() { + // For strict inequalities, we rely on using a stable sort of l2 so that + // p values only increase as we traverse a run of equal y values. + // To handle inclusive comparisons in x and duplicate x values we also need the + // sort of l1 to be stable, so that the left hand side entries come before the right + // hand side entries (as we mark visited entries from the right hand side). + for &p in l2_order { + match_count += unsafe { + l1_array.process_entry( + p as usize, + &mut bit_array, + op1, + &mut left_row_idx, + &mut right_row_idx, + ) + }; + + if slice_end.is_some_and(|end| match_count >= end) { + break; + } + } + } else { + let l2_array = build_l2_array(&y_ordered_by_x, l2_order)?; + + // For non-strict inequalities in l2, we need to track runs of equal y values and only + // check for matches after we reach the end of the run and have marked all rhs entries + // in the run as visited. 
+ let mut run_start = 0; + + for i in 0..l2_array.len() { + // Elide bound checks + unsafe { + let item = l2_array.get_unchecked_release(i); + let p = item.l1_index; + l1_array.mark_visited(p as usize, &mut bit_array); + + if item.run_end { + for l2_item in l2_array.get_unchecked_release(run_start..i + 1) { + let p = l2_item.l1_index; + match_count += l1_array.process_lhs_entry( + p as usize, + &bit_array, + op1, + &mut left_row_idx, + &mut right_row_idx, + ); + } + + run_start = i + 1; + + if slice_end.is_some_and(|end| match_count >= end) { + break; + } + } + } + } + } + Ok((left_row_idx, right_row_idx)) +} + +pub(super) fn iejoin_par( + left: &DataFrame, + right: &DataFrame, + selected_left: Vec, + selected_right: Vec, + options: &IEJoinOptions, + suffix: Option, + slice: Option<(i64, usize)>, +) -> PolarsResult { + let l1_descending = matches!( + options.operator1, + InequalityOperator::Gt | InequalityOperator::GtEq + ); + + let l1_sort_options = SortOptions::default() + .with_maintain_order(true) + .with_nulls_last(false) + .with_order_descending(l1_descending); + + let sl = &selected_left[0]; + let l1_s_l = sl + .arg_sort(l1_sort_options) + .slice(sl.null_count() as i64, sl.len() - sl.null_count()); + + let sr = &selected_right[0]; + let l1_s_r = sr + .arg_sort(l1_sort_options) + .slice(sr.null_count() as i64, sr.len() - sr.null_count()); + + // Because we do a cartesian product, the number of partitions is squared. 
+ // We take the sqrt, but we don't expect every partition to produce results and work can be + // imbalanced, so we multiply the number of partitions by 2, which leads to 2^2= 4 + let n_partitions = (_set_partition_size() as f32).sqrt() as usize * 2; + let splitted_a = split(&l1_s_l, n_partitions); + let splitted_b = split(&l1_s_r, n_partitions); + + let cartesian_prod = splitted_a + .iter() + .flat_map(|l| splitted_b.iter().map(move |r| (l, r))) + .collect::>(); + + let iter = cartesian_prod.par_iter().map(|(l_l1_idx, r_l1_idx)| { + if l_l1_idx.is_empty() || r_l1_idx.is_empty() { + return Ok(None); + } + fn get_extrema<'a>( + l1_idx: &'a IdxCa, + s: &'a Series, + ) -> Option<(AnyValue<'a>, AnyValue<'a>)> { + let first = l1_idx.first()?; + let last = l1_idx.last()?; + + let start = s.get(first as usize).unwrap(); + let end = s.get(last as usize).unwrap(); + + Some(if start < end { + (start, end) + } else { + (end, start) + }) + } + let Some((min_l, max_l)) = get_extrema(l_l1_idx, sl) else { + return Ok(None); + }; + let Some((min_r, max_r)) = get_extrema(r_l1_idx, sr) else { + return Ok(None); + }; + + let include_block = match options.operator1 { + InequalityOperator::Lt => min_l < max_r, + InequalityOperator::LtEq => min_l <= max_r, + InequalityOperator::Gt => max_l > min_r, + InequalityOperator::GtEq => max_l >= min_r, + }; + + if include_block { + let (l, r) = unsafe { + ( + selected_left + .iter() + .map(|s| s.take_unchecked(l_l1_idx)) + .collect_vec(), + selected_right + .iter() + .map(|s| s.take_unchecked(r_l1_idx)) + .collect_vec(), + ) + }; + + // Compute the row indexes + let (idx_l, idx_r) = iejoin_tuples(l, r, options, None)?; + + if idx_l.is_empty() { + return Ok(None); + } + + // These are row indexes in the slices we have given, so we use those to gather in the + // original l1 offset arrays. This gives us indexes in the original tables. 
+ unsafe { + Ok(Some(( + l_l1_idx.take_unchecked(&idx_l), + r_l1_idx.take_unchecked(&idx_r), + ))) + } + } else { + Ok(None) + } + }); + + let row_indices = POOL.install(|| iter.collect::>>())?; + + let mut left_idx = IdxCa::default(); + let mut right_idx = IdxCa::default(); + for (l, r) in row_indices.into_iter().flatten() { + left_idx.append(&l)?; + right_idx.append(&r)?; + } + if let Some((offset, end)) = slice { + left_idx = left_idx.slice(offset, end); + right_idx = right_idx.slice(offset, end); + } + + unsafe { materialize_join(left, right, &left_idx, &right_idx, suffix) } +} + +pub(super) fn iejoin( + left: &DataFrame, + right: &DataFrame, + selected_left: Vec, + selected_right: Vec, + options: &IEJoinOptions, + suffix: Option, + slice: Option<(i64, usize)>, +) -> PolarsResult { + let (left_row_idx, right_row_idx) = + iejoin_tuples(selected_left, selected_right, options, slice)?; + unsafe { materialize_join(left, right, &left_row_idx, &right_row_idx, suffix) } +} + +unsafe fn materialize_join( + left: &DataFrame, + right: &DataFrame, + left_row_idx: &IdxCa, + right_row_idx: &IdxCa, + suffix: Option, +) -> PolarsResult { + let (join_left, join_right) = { + POOL.join( + || left.take_unchecked(left_row_idx), + || right.take_unchecked(right_row_idx), + ) + }; + + _finish_join(join_left, join_right, suffix) +} + +/// Inequality join. Matches rows between two DataFrames using two inequality operators +/// (one of [<, <=, >, >=]). +/// Based on Khayyat et al. 2015, "Lightning Fast and Space Efficient Inequality Joins" +/// and extended to work with duplicate values. 
+fn iejoin_tuples( + selected_left: Vec, + selected_right: Vec, + options: &IEJoinOptions, + slice: Option<(i64, usize)>, +) -> PolarsResult<(IdxCa, IdxCa)> { + if selected_left.len() != 2 { + return Err( + polars_err!(ComputeError: "IEJoin requires exactly two expressions from the left DataFrame"), + ); + }; + if selected_right.len() != 2 { + return Err( + polars_err!(ComputeError: "IEJoin requires exactly two expressions from the right DataFrame"), + ); + }; + + let op1 = options.operator1; + let op2 = options.operator2; + + // Determine the sort order based on the comparison operators used. + // We want to sort L1 so that "x[i] op1 x[j]" is true for j > i, + // and L2 so that "y[i] op2 y[j]" is true for j < i + // (except in the case of duplicates and strict inequalities). + // Note that the algorithms published in Khayyat et al. have incorrect logic for + // determining whether to sort descending. + let l1_descending = matches!(op1, InequalityOperator::Gt | InequalityOperator::GtEq); + let l2_descending = matches!(op2, InequalityOperator::Lt | InequalityOperator::LtEq); + + let mut x = selected_left[0].to_physical_repr().into_owned(); + let left_height = x.len(); + + x.extend(&selected_right[0].to_physical_repr())?; + // Rechunk because we will gather. + let x = x.rechunk(); + + let mut y = selected_left[1].to_physical_repr().into_owned(); + y.extend(&selected_right[1].to_physical_repr())?; + // Rechunk because we will gather. 
+ let y = y.rechunk(); + + let l1_sort_options = SortOptions::default() + .with_maintain_order(true) + .with_nulls_last(false) + .with_order_descending(l1_descending); + // Get ordering of x, skipping any null entries as these cannot be matches + let l1_order = x + .arg_sort(l1_sort_options) + .slice(x.null_count() as i64, x.len() - x.null_count()); + + let y_ordered_by_x = unsafe { y.take_unchecked(&l1_order) }; + let l2_sort_options = SortOptions::default() + .with_maintain_order(true) + .with_nulls_last(false) + .with_order_descending(l2_descending); + // Get the indexes into l1, ordered by y values. + // l2_order is the same as "p" from Khayyat et al. + let l2_order = y_ordered_by_x + .arg_sort(l2_sort_options) + .slice( + y_ordered_by_x.null_count() as i64, + y_ordered_by_x.len() - y_ordered_by_x.null_count(), + ) + .rechunk(); + let l2_order = l2_order.downcast_get(0).unwrap().values().as_slice(); + + let (left_row_idx, right_row_idx) = with_match_physical_numeric_polars_type!(x.dtype(), |$T| { + ie_join_impl_t::<$T>( + slice, + l1_order, + l2_order, + op1, + op2, + x, + y_ordered_by_x, + left_height + ) + })?; + + debug_assert_eq!(left_row_idx.len(), right_row_idx.len()); + let left_row_idx = IdxCa::from_vec("".into(), left_row_idx); + let right_row_idx = IdxCa::from_vec("".into(), right_row_idx); + let (left_row_idx, right_row_idx) = match slice { + None => (left_row_idx, right_row_idx), + Some((offset, len)) => ( + left_row_idx.slice(offset, len), + right_row_idx.slice(offset, len), + ), + }; + Ok((left_row_idx, right_row_idx)) +} diff --git a/crates/polars-ops/src/frame/join/mod.rs b/crates/polars-ops/src/frame/join/mod.rs index 93f5b3193828..89507ac216c5 100644 --- a/crates/polars-ops/src/frame/join/mod.rs +++ b/crates/polars-ops/src/frame/join/mod.rs @@ -7,6 +7,8 @@ mod cross_join; mod dispatch_left_right; mod general; mod hash_join; +#[cfg(feature = "iejoin")] +mod iejoin; #[cfg(feature = "merge_sorted")] mod merge_sorted; @@ -28,6 +30,8 @@ use 
general::create_chunked_index_mapping; pub use general::{_coalesce_full_join, _finish_join, _join_suffix_name}; pub use hash_join::*; use hashbrown::hash_map::{Entry, RawEntryMut}; +#[cfg(feature = "iejoin")] +pub use iejoin::{IEJoinOptions, InequalityOperator}; #[cfg(feature = "merge_sorted")] pub use merge_sorted::_merge_sorted_dfs; use polars_core::hashing::_HASHMAP_INIT_SIZE; @@ -197,6 +201,25 @@ pub trait DataFrameJoinOps: IntoDf { } } + #[cfg(feature = "iejoin")] + if let JoinType::IEJoin(options) = args.how { + let func = if POOL.current_num_threads() > 1 && !left_df.is_empty() && !other.is_empty() + { + iejoin::iejoin_par + } else { + iejoin::iejoin + }; + return func( + left_df, + other, + selected_left, + selected_right, + &options, + args.suffix, + args.slice, + ); + } + // Single keys. if selected_left.len() == 1 { let s_left = &selected_left[0]; @@ -269,6 +292,10 @@ pub trait DataFrameJoinOps: IntoDf { panic!("expected by arguments on both sides") }, }, + #[cfg(feature = "iejoin")] + JoinType::IEJoin(_) => { + unreachable!() + }, JoinType::Cross => { unreachable!() }, @@ -293,6 +320,10 @@ pub trait DataFrameJoinOps: IntoDf { JoinType::AsOf(_) => polars_bail!( ComputeError: "asof join not supported for join on multiple keys" ), + #[cfg(feature = "iejoin")] + JoinType::IEJoin(_) => { + unreachable!() + }, JoinType::Cross => { unreachable!() }, diff --git a/crates/polars-ops/src/lib.rs b/crates/polars-ops/src/lib.rs index 00d10e87c76c..5889f915ef3d 100644 --- a/crates/polars-ops/src/lib.rs +++ b/crates/polars-ops/src/lib.rs @@ -1,7 +1,6 @@ #![cfg_attr(docsrs, feature(doc_auto_cfg))] #![cfg_attr(feature = "nightly", feature(unicode_internals))] #![cfg_attr(feature = "nightly", allow(internal_features))] -extern crate core; pub mod chunked_array; #[cfg(feature = "pivot")] diff --git a/crates/polars-parquet/Cargo.toml b/crates/polars-parquet/Cargo.toml index 5c62479ccaa3..26a57b22e713 100644 --- a/crates/polars-parquet/Cargo.toml +++ 
b/crates/polars-parquet/Cargo.toml @@ -20,6 +20,7 @@ bytemuck = { workspace = true } ethnum = { workspace = true } fallible-streaming-iterator = { workspace = true, optional = true } futures = { workspace = true, optional = true } +hashbrown = { workspace = true } num-traits = { workspace = true } polars-compute = { workspace = true } polars-error = { workspace = true } diff --git a/crates/polars-parquet/src/arrow/read/deserialize/mod.rs b/crates/polars-parquet/src/arrow/read/deserialize/mod.rs index 17e9434e1d3d..520f7f8596e1 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/mod.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/mod.rs @@ -26,7 +26,7 @@ use crate::parquet::schema::types::PrimitiveType; /// Creates a new iterator of compressed pages. pub fn get_page_iterator( - column_metadata: &ColumnChunkMetaData, + column_metadata: &ColumnChunkMetadata, reader: MemReader, buffer: Vec, max_header_size: usize, diff --git a/crates/polars-parquet/src/arrow/read/mod.rs b/crates/polars-parquet/src/arrow/read/mod.rs index b3e96d9254f3..1f00987fa074 100644 --- a/crates/polars-parquet/src/arrow/read/mod.rs +++ b/crates/polars-parquet/src/arrow/read/mod.rs @@ -15,7 +15,7 @@ pub use deserialize::{ #[cfg(feature = "async")] use futures::{AsyncRead, AsyncSeek}; use polars_error::PolarsResult; -pub use schema::{infer_schema, FileMetaData}; +pub use schema::{infer_schema, FileMetadata}; use crate::parquet::error::ParquetResult; #[cfg(feature = "async")] @@ -24,7 +24,7 @@ pub use crate::parquet::read::{get_page_stream, read_metadata_async as _read_met pub use crate::parquet::{ error::ParquetError, fallible_streaming_iterator, - metadata::{ColumnChunkMetaData, ColumnDescriptor, RowGroupMetaData}, + metadata::{ColumnChunkMetadata, ColumnDescriptor, RowGroupMetadata}, page::{CompressedDataPage, DataPageHeader, Page}, read::{ decompress, get_column_iterator, read_metadata as _read_metadata, BasicDecompressor, @@ -38,10 +38,10 @@ pub use crate::parquet::{ 
FallibleStreamingIterator, }; -/// Returns all [`ColumnChunkMetaData`] associated to `field_name`. +/// Returns all [`ColumnChunkMetadata`] associated to `field_name`. /// For non-nested parquet types, this returns a single column pub fn get_field_pages<'a, T>( - columns: &'a [ColumnChunkMetaData], + columns: &'a [ColumnChunkMetadata], items: &'a [T], field_name: &str, ) -> Vec<&'a T> { @@ -54,7 +54,7 @@ pub fn get_field_pages<'a, T>( } /// Reads parquets' metadata synchronously. -pub fn read_metadata(reader: &mut R) -> PolarsResult { +pub fn read_metadata(reader: &mut R) -> PolarsResult { Ok(_read_metadata(reader)?) } @@ -62,7 +62,7 @@ pub fn read_metadata(reader: &mut R) -> PolarsResult( reader: &mut R, -) -> PolarsResult { +) -> PolarsResult { Ok(_read_metadata_async(reader).await?) } diff --git a/crates/polars-parquet/src/arrow/read/schema/mod.rs b/crates/polars-parquet/src/arrow/read/schema/mod.rs index 50d937e7e840..347cd49faefd 100644 --- a/crates/polars-parquet/src/arrow/read/schema/mod.rs +++ b/crates/polars-parquet/src/arrow/read/schema/mod.rs @@ -10,7 +10,7 @@ pub use metadata::read_schema_from_metadata; use polars_error::PolarsResult; use self::metadata::parse_key_value_metadata; -pub use crate::parquet::metadata::{FileMetaData, KeyValue, SchemaDescriptor}; +pub use crate::parquet::metadata::{FileMetadata, KeyValue, SchemaDescriptor}; pub use crate::parquet::schema::types::ParquetType; /// Options when inferring schemas from Parquet @@ -33,7 +33,7 @@ impl Default for SchemaInferenceOptions { } } -/// Infers a [`ArrowSchema`] from parquet's [`FileMetaData`]. +/// Infers a [`ArrowSchema`] from parquet's [`FileMetadata`]. /// /// This first looks for the metadata key `"ARROW:schema"`; if it does not exist, it converts the /// Parquet types declared in the file's Parquet schema to Arrow's equivalent. 
@@ -41,13 +41,13 @@ impl Default for SchemaInferenceOptions { /// # Error /// This function errors iff the key `"ARROW:schema"` exists but is not correctly encoded, /// indicating that that the file's arrow metadata was incorrectly written. -pub fn infer_schema(file_metadata: &FileMetaData) -> PolarsResult { +pub fn infer_schema(file_metadata: &FileMetadata) -> PolarsResult { infer_schema_with_options(file_metadata, &None) } /// Like [`infer_schema`] but with configurable options which affects the behavior of inference pub fn infer_schema_with_options( - file_metadata: &FileMetaData, + file_metadata: &FileMetadata, options: &Option, ) -> PolarsResult { let mut metadata = parse_key_value_metadata(file_metadata.key_value_metadata()); diff --git a/crates/polars-parquet/src/arrow/read/statistics/mod.rs b/crates/polars-parquet/src/arrow/read/statistics/mod.rs index ef9ee28df79d..cb65827de9a6 100644 --- a/crates/polars-parquet/src/arrow/read/statistics/mod.rs +++ b/crates/polars-parquet/src/arrow/read/statistics/mod.rs @@ -13,7 +13,7 @@ use crate::parquet::schema::types::{ }; use crate::parquet::statistics::{PrimitiveStatistics, Statistics as ParquetStatistics}; use crate::parquet::types::int96_to_i64_ns; -use crate::read::ColumnChunkMetaData; +use crate::read::ColumnChunkMetadata; mod binary; mod binview; @@ -550,11 +550,13 @@ fn push( /// /// # Errors /// This function errors if the deserialization of the statistics fails (e.g. 
invalid utf8) -pub fn deserialize(field: &Field, field_md: &[&ColumnChunkMetaData]) -> PolarsResult { +pub fn deserialize<'a>( + field: &Field, + field_md: impl ExactSizeIterator, +) -> PolarsResult { let mut statistics = MutableStatistics::try_new(field)?; let mut stats = field_md - .iter() .map(|column| { Ok(( column.statistics().transpose()?, diff --git a/crates/polars-parquet/src/arrow/write/file.rs b/crates/polars-parquet/src/arrow/write/file.rs index d4162b8c08d5..0fd32deb5b07 100644 --- a/crates/polars-parquet/src/arrow/write/file.rs +++ b/crates/polars-parquet/src/arrow/write/file.rs @@ -4,7 +4,7 @@ use arrow::datatypes::ArrowSchema; use polars_error::{PolarsError, PolarsResult}; use super::schema::schema_to_metadata_key; -use super::{to_parquet_schema, ThriftFileMetaData, WriteOptions}; +use super::{to_parquet_schema, ThriftFileMetadata, WriteOptions}; use crate::parquet::metadata::{KeyValue, SchemaDescriptor}; use crate::parquet::write::{RowGroupIterColumns, WriteOptions as FileWriteOptions}; @@ -86,10 +86,10 @@ impl FileWriter { self.writer.into_inner() } - /// Returns the underlying writer and [`ThriftFileMetaData`] + /// Returns the underlying writer and [`ThriftFileMetadata`] /// # Panics /// This function panics if [`Self::end`] has not yet been called - pub fn into_inner_and_metadata(self) -> (W, ThriftFileMetaData) { + pub fn into_inner_and_metadata(self) -> (W, ThriftFileMetadata) { self.writer.into_inner_and_metadata() } } diff --git a/crates/polars-parquet/src/arrow/write/mod.rs b/crates/polars-parquet/src/arrow/write/mod.rs index b5f816518401..02f0165d04c7 100644 --- a/crates/polars-parquet/src/arrow/write/mod.rs +++ b/crates/polars-parquet/src/arrow/write/mod.rs @@ -38,7 +38,7 @@ pub use utils::write_def_levels; pub use crate::parquet::compression::{BrotliLevel, CompressionOptions, GzipLevel, ZstdLevel}; pub use crate::parquet::encoding::Encoding; pub use crate::parquet::metadata::{ - Descriptor, FileMetaData, KeyValue, SchemaDescriptor, 
ThriftFileMetaData, + Descriptor, FileMetadata, KeyValue, SchemaDescriptor, ThriftFileMetadata, }; pub use crate::parquet::page::{CompressedDataPage, CompressedPage, Page}; use crate::parquet::schema::types::PrimitiveType as ParquetPrimitiveType; diff --git a/crates/polars-parquet/src/parquet/bloom_filter/read.rs b/crates/polars-parquet/src/parquet/bloom_filter/read.rs index 5ebbc29f1218..deda00b36272 100644 --- a/crates/polars-parquet/src/parquet/bloom_filter/read.rs +++ b/crates/polars-parquet/src/parquet/bloom_filter/read.rs @@ -7,14 +7,14 @@ use parquet_format_safe::{ }; use crate::parquet::error::ParquetResult; -use crate::parquet::metadata::ColumnChunkMetaData; +use crate::parquet::metadata::ColumnChunkMetadata; -/// Reads the bloom filter associated to [`ColumnChunkMetaData`] into `bitset`. +/// Reads the bloom filter associated to [`ColumnChunkMetadata`] into `bitset`. /// Results in an empty `bitset` if there is no associated bloom filter or the algorithm is not supported. /// # Error /// Errors if the column contains no metadata or the filter can't be read or deserialized. pub fn read( - column_metadata: &ColumnChunkMetaData, + column_metadata: &ColumnChunkMetadata, mut reader: &mut R, bitset: &mut Vec, ) -> ParquetResult<()> { diff --git a/crates/polars-parquet/src/parquet/encoding/uleb128.rs b/crates/polars-parquet/src/parquet/encoding/uleb128.rs index 08459233961c..0740c9575a15 100644 --- a/crates/polars-parquet/src/parquet/encoding/uleb128.rs +++ b/crates/polars-parquet/src/parquet/encoding/uleb128.rs @@ -1,5 +1,6 @@ // Reads an uleb128 encoded integer with at most 56 bits (8 bytes with 7 bits worth of payload each). /// Returns the integer and the number of bytes that made up this integer. +/// /// If the returned length is bigger than 8 this means the integer required more than 8 bytes and the remaining bytes need to be read sequentially and combined with the return value. 
/// /// # Safety diff --git a/crates/polars-parquet/src/parquet/metadata/column_chunk_metadata.rs b/crates/polars-parquet/src/parquet/metadata/column_chunk_metadata.rs index ac24bc8199ac..30a606d6108a 100644 --- a/crates/polars-parquet/src/parquet/metadata/column_chunk_metadata.rs +++ b/crates/polars-parquet/src/parquet/metadata/column_chunk_metadata.rs @@ -21,11 +21,14 @@ mod serde_types { use serde_types::*; /// Metadata for a column chunk. -// This contains the `ColumnDescriptor` associated with the chunk so that deserializers have -// access to the descriptor (e.g. physical, converted, logical). -#[derive(Debug, Clone)] +/// +/// This contains the `ColumnDescriptor` associated with the chunk so that deserializers have +/// access to the descriptor (e.g. physical, converted, logical). +/// +/// This struct is intentionally not `Clone`, as it is a huge struct. +#[derive(Debug)] #[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] -pub struct ColumnChunkMetaData { +pub struct ColumnChunkMetadata { #[cfg_attr( feature = "serde_types", serde(serialize_with = "serialize_column_chunk") @@ -67,8 +70,8 @@ where } // Represents common operations for a column chunk. 
-impl ColumnChunkMetaData { - /// Returns a new [`ColumnChunkMetaData`] +impl ColumnChunkMetadata { + /// Returns a new [`ColumnChunkMetadata`] pub fn new(column_chunk: ColumnChunk, column_descr: ColumnDescriptor) -> Self { Self { column_chunk, @@ -164,15 +167,9 @@ impl ColumnChunkMetaData { } /// Returns the offset and length in bytes of the column chunk within the file - pub fn byte_range(&self) -> (u64, u64) { - let start = if let Some(dict_page_offset) = self.dictionary_page_offset() { - dict_page_offset as u64 - } else { - self.data_page_offset() as u64 - }; - let length = self.compressed_size() as u64; + pub fn byte_range(&self) -> core::ops::Range { // this has been validated in [`try_from_thrift`] - (start, length) + column_metadata_byte_range(self.metadata()) } /// Method to convert from Thrift. @@ -205,3 +202,15 @@ impl ColumnChunkMetaData { self.column_chunk } } + +pub(super) fn column_metadata_byte_range( + column_metadata: &ColumnMetaData, +) -> core::ops::Range { + let offset = if let Some(dict_page_offset) = column_metadata.dictionary_page_offset { + dict_page_offset as u64 + } else { + column_metadata.data_page_offset as u64 + }; + let len = column_metadata.total_compressed_size as u64; + offset..offset.checked_add(len).unwrap() +} diff --git a/crates/polars-parquet/src/parquet/metadata/file_metadata.rs b/crates/polars-parquet/src/parquet/metadata/file_metadata.rs index 7ae449c64d90..47c9f160781d 100644 --- a/crates/polars-parquet/src/parquet/metadata/file_metadata.rs +++ b/crates/polars-parquet/src/parquet/metadata/file_metadata.rs @@ -2,7 +2,7 @@ use parquet_format_safe::ColumnOrder as TColumnOrder; use super::column_order::ColumnOrder; use super::schema_descriptor::SchemaDescriptor; -use super::RowGroupMetaData; +use super::RowGroupMetadata; use crate::parquet::error::ParquetError; use crate::parquet::metadata::get_sort_order; pub use crate::parquet::thrift_format::KeyValue; @@ -11,7 +11,7 @@ pub use crate::parquet::thrift_format::KeyValue; // 
This is almost equal to [`parquet_format_safe::FileMetaData`] but contains the descriptors, // which are crucial to deserialize pages. #[derive(Debug, Clone)] -pub struct FileMetaData { +pub struct FileMetadata { /// version of this file. pub version: i32, /// number of rows in the file. @@ -26,7 +26,7 @@ pub struct FileMetaData { /// ``` pub created_by: Option, /// The row groups of this file - pub row_groups: Vec, + pub row_groups: Vec, /// key_value_metadata of this file. pub key_value_metadata: Option>, /// schema descriptor. @@ -41,7 +41,7 @@ pub struct FileMetaData { pub column_orders: Option>, } -impl FileMetaData { +impl FileMetadata { /// Returns the [`SchemaDescriptor`] that describes schema of this file. pub fn schema(&self) -> &SchemaDescriptor { &self.schema_descr @@ -61,7 +61,7 @@ impl FileMetaData { .unwrap_or(ColumnOrder::Undefined) } - /// Deserializes [`crate::parquet::thrift_format::FileMetaData`] into this struct + /// Deserializes [`crate::parquet::thrift_format::FileMetadata`] into this struct pub fn try_from_thrift( metadata: parquet_format_safe::FileMetaData, ) -> Result { @@ -70,14 +70,14 @@ impl FileMetaData { let row_groups = metadata .row_groups .into_iter() - .map(|rg| RowGroupMetaData::try_from_thrift(&schema_descr, rg)) + .map(|rg| RowGroupMetadata::try_from_thrift(&schema_descr, rg)) .collect::>()?; let column_orders = metadata .column_orders .map(|orders| parse_column_orders(&orders, &schema_descr)); - Ok(FileMetaData { + Ok(FileMetadata { version: metadata.version, num_rows: metadata.num_rows.try_into()?, created_by: metadata.created_by, @@ -87,25 +87,6 @@ impl FileMetaData { column_orders, }) } - - /// Serializes itself to thrift's [`parquet_format_safe::FileMetaData`]. 
- pub fn into_thrift(self) -> parquet_format_safe::FileMetaData { - parquet_format_safe::FileMetaData { - version: self.version, - schema: self.schema_descr.into_thrift(), - num_rows: self.num_rows as i64, - row_groups: self - .row_groups - .into_iter() - .map(|v| v.into_thrift()) - .collect(), - key_value_metadata: self.key_value_metadata, - created_by: self.created_by, - column_orders: None, // todo - encryption_algorithm: None, - footer_signing_key_metadata: None, - } - } } /// Parses [`ColumnOrder`] from Thrift definition. diff --git a/crates/polars-parquet/src/parquet/metadata/mod.rs b/crates/polars-parquet/src/parquet/metadata/mod.rs index 2dfe81138fdd..b7a80739e719 100644 --- a/crates/polars-parquet/src/parquet/metadata/mod.rs +++ b/crates/polars-parquet/src/parquet/metadata/mod.rs @@ -6,12 +6,12 @@ mod row_metadata; mod schema_descriptor; mod sort; -pub use column_chunk_metadata::ColumnChunkMetaData; +pub use column_chunk_metadata::ColumnChunkMetadata; pub use column_descriptor::{ColumnDescriptor, Descriptor}; pub use column_order::ColumnOrder; -pub use file_metadata::{FileMetaData, KeyValue}; -pub use row_metadata::RowGroupMetaData; +pub use file_metadata::{FileMetadata, KeyValue}; +pub use row_metadata::RowGroupMetadata; pub use schema_descriptor::SchemaDescriptor; pub use sort::*; -pub use crate::parquet::thrift_format::FileMetaData as ThriftFileMetaData; +pub use crate::parquet::thrift_format::FileMetaData as ThriftFileMetadata; diff --git a/crates/polars-parquet/src/parquet/metadata/row_metadata.rs b/crates/polars-parquet/src/parquet/metadata/row_metadata.rs index 54bf1d9ac718..013308ad7f12 100644 --- a/crates/polars-parquet/src/parquet/metadata/row_metadata.rs +++ b/crates/polars-parquet/src/parquet/metadata/row_metadata.rs @@ -1,38 +1,64 @@ +use std::sync::Arc; + +use hashbrown::hash_map::RawEntryMut; use parquet_format_safe::RowGroup; -#[cfg(feature = "serde_types")] -use serde::{Deserialize, Serialize}; +use polars_utils::aliases::{InitHashMaps, 
PlHashMap}; +use polars_utils::idx_vec::UnitVec; +use polars_utils::pl_str::PlSmallStr; +use polars_utils::unitvec; -use super::column_chunk_metadata::ColumnChunkMetaData; +use super::column_chunk_metadata::{column_metadata_byte_range, ColumnChunkMetadata}; use super::schema_descriptor::SchemaDescriptor; use crate::parquet::error::{ParquetError, ParquetResult}; -use crate::parquet::write::ColumnOffsetsMetadata; + +type ColumnLookup = PlHashMap>; + +trait InitColumnLookup { + fn add_column(&mut self, index: usize, column: &ColumnChunkMetadata); +} + +impl InitColumnLookup for ColumnLookup { + #[inline(always)] + fn add_column(&mut self, index: usize, column: &ColumnChunkMetadata) { + let root_name = &column.descriptor().path_in_schema[0]; + + match self.raw_entry_mut().from_key(root_name) { + RawEntryMut::Vacant(slot) => { + slot.insert(root_name.clone(), unitvec![index]); + }, + RawEntryMut::Occupied(mut slot) => { + slot.get_mut().push(index); + }, + }; + } +} /// Metadata for a row group. #[derive(Debug, Clone, Default)] -#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] -pub struct RowGroupMetaData { - columns: Vec, +pub struct RowGroupMetadata { + columns: Arc<[ColumnChunkMetadata]>, + column_lookup: PlHashMap>, num_rows: usize, total_byte_size: usize, + full_byte_range: core::ops::Range, } -impl RowGroupMetaData { - /// Create a new [`RowGroupMetaData`] - pub fn new( - columns: Vec, - num_rows: usize, - total_byte_size: usize, - ) -> RowGroupMetaData { - Self { - columns, - num_rows, - total_byte_size, - } +impl RowGroupMetadata { + #[inline(always)] + pub fn n_columns(&self) -> usize { + self.columns.len() } - /// Returns slice of column chunk metadata. - pub fn columns(&self) -> &[ColumnChunkMetaData] { - &self.columns + /// Fetch all columns under this root name. 
+ pub fn columns_under_root_iter( + &self, + root_name: &str, + ) -> impl ExactSizeIterator + DoubleEndedIterator { + self.column_lookup + .get(root_name) + .unwrap() + .iter() + .map(|&x| &self.columns[x]) } /// Number of rows in this row group. @@ -53,51 +79,60 @@ impl RowGroupMetaData { .sum::() } + pub fn full_byte_range(&self) -> core::ops::Range { + self.full_byte_range.clone() + } + + pub fn byte_ranges_iter(&self) -> impl '_ + ExactSizeIterator> { + self.columns.iter().map(|x| x.byte_range()) + } + /// Method to convert from Thrift. pub(crate) fn try_from_thrift( schema_descr: &SchemaDescriptor, rg: RowGroup, - ) -> ParquetResult { + ) -> ParquetResult { if schema_descr.columns().len() != rg.columns.len() { return Err(ParquetError::oos(format!("The number of columns in the row group ({}) must be equal to the number of columns in the schema ({})", rg.columns.len(), schema_descr.columns().len()))); } let total_byte_size = rg.total_byte_size.try_into()?; let num_rows = rg.num_rows.try_into()?; + + let mut column_lookup = ColumnLookup::with_capacity(rg.columns.len()); + let mut full_byte_range = if let Some(first_column_chunk) = rg.columns.first() { + let Some(metadata) = &first_column_chunk.meta_data else { + return Err(ParquetError::oos("Column chunk requires metadata")); + }; + column_metadata_byte_range(metadata) + } else { + 0..0 + }; + let columns = rg .columns .into_iter() .zip(schema_descr.columns()) - .map(|(column_chunk, descriptor)| { - ColumnChunkMetaData::try_from_thrift(descriptor.clone(), column_chunk) + .enumerate() + .map(|(i, (column_chunk, descriptor))| { + let column = + ColumnChunkMetadata::try_from_thrift(descriptor.clone(), column_chunk)?; + + column_lookup.add_column(i, &column); + + let byte_range = column.byte_range(); + full_byte_range = full_byte_range.start.min(byte_range.start) + ..full_byte_range.end.max(byte_range.end); + + Ok(column) }) - .collect::>>()?; + .collect::>>()?; - Ok(RowGroupMetaData { + Ok(RowGroupMetadata { 
columns, + column_lookup, num_rows, total_byte_size, + full_byte_range, }) } - - /// Method to convert to Thrift. - pub(crate) fn into_thrift(self) -> RowGroup { - let file_offset = self - .columns - .iter() - .map(|c| { - ColumnOffsetsMetadata::from_column_chunk_metadata(c).calc_row_group_file_offset() - }) - .next() - .unwrap_or(None); - let total_compressed_size = Some(self.compressed_size() as i64); - RowGroup { - columns: self.columns.into_iter().map(|v| v.into_thrift()).collect(), - total_byte_size: self.total_byte_size as i64, - num_rows: self.num_rows as i64, - sorting_columns: None, - file_offset, - total_compressed_size, - ordinal: None, - } - } } diff --git a/crates/polars-parquet/src/parquet/read/column/mod.rs b/crates/polars-parquet/src/parquet/read/column/mod.rs index d6bcda08fe2d..56f914ba568e 100644 --- a/crates/polars-parquet/src/parquet/read/column/mod.rs +++ b/crates/polars-parquet/src/parquet/read/column/mod.rs @@ -1,8 +1,10 @@ use std::vec::IntoIter; -use super::{get_field_columns, get_page_iterator, MemReader, PageReader}; +use polars_utils::idx_vec::UnitVec; + +use super::{get_page_iterator, MemReader, PageReader}; use crate::parquet::error::{ParquetError, ParquetResult}; -use crate::parquet::metadata::{ColumnChunkMetaData, RowGroupMetaData}; +use crate::parquet::metadata::{ColumnChunkMetadata, RowGroupMetadata}; use crate::parquet::page::CompressedPage; use crate::parquet::schema::types::ParquetType; @@ -13,16 +15,16 @@ use crate::parquet::schema::types::ParquetType; /// For primitive fields (e.g. `i64`), [`ColumnIterator`] yields exactly one column. /// For complex fields, it yields multiple columns. /// `max_page_size` is the maximum number of bytes allowed. 
-pub fn get_column_iterator( +pub fn get_column_iterator<'a>( reader: MemReader, - row_group: &RowGroupMetaData, + row_group: &'a RowGroupMetadata, field_name: &str, max_page_size: usize, -) -> ColumnIterator { - let columns = get_field_columns(row_group.columns(), field_name) - .cloned() - .collect::>(); - +) -> ColumnIterator<'a> { + let columns = row_group + .columns_under_root_iter(field_name) + .rev() + .collect::>(); ColumnIterator::new(reader, columns, max_page_size) } @@ -46,21 +48,20 @@ pub trait MutStreamingIterator: Sized { /// A [`MutStreamingIterator`] that reads column chunks one by one, /// returning a [`PageReader`] per column. -pub struct ColumnIterator { +pub struct ColumnIterator<'a> { reader: MemReader, - columns: Vec, + columns: UnitVec<&'a ColumnChunkMetadata>, max_page_size: usize, } -impl ColumnIterator { +impl<'a> ColumnIterator<'a> { /// Returns a new [`ColumnIterator`] /// `max_page_size` is the maximum allowed page size pub fn new( reader: MemReader, - mut columns: Vec, + columns: UnitVec<&'a ColumnChunkMetadata>, max_page_size: usize, ) -> Self { - columns.reverse(); Self { reader, columns, @@ -69,8 +70,8 @@ impl ColumnIterator { } } -impl Iterator for ColumnIterator { - type Item = ParquetResult<(PageReader, ColumnChunkMetaData)>; +impl<'a> Iterator for ColumnIterator<'a> { + type Item = ParquetResult<(PageReader, &'a ColumnChunkMetadata)>; fn next(&mut self) -> Option { if self.columns.is_empty() { @@ -79,7 +80,7 @@ impl Iterator for ColumnIterator { let column = self.columns.pop().unwrap(); let iter = - match get_page_iterator(&column, self.reader.clone(), Vec::new(), self.max_page_size) { + match get_page_iterator(column, self.reader.clone(), Vec::new(), self.max_page_size) { Err(e) => return Some(Err(e)), Ok(v) => v, }; @@ -93,11 +94,11 @@ pub struct ReadColumnIterator { field: ParquetType, chunks: Vec<( Vec>, - ColumnChunkMetaData, + ColumnChunkMetadata, )>, current: Option<( IntoIter>, - ColumnChunkMetaData, + 
ColumnChunkMetadata, )>, } @@ -107,7 +108,7 @@ impl ReadColumnIterator { field: ParquetType, chunks: Vec<( Vec>, - ColumnChunkMetaData, + ColumnChunkMetadata, )>, ) -> Self { Self { @@ -121,7 +122,7 @@ impl ReadColumnIterator { impl MutStreamingIterator for ReadColumnIterator { type Item = ( IntoIter>, - ColumnChunkMetaData, + ColumnChunkMetadata, ); type Error = ParquetError; diff --git a/crates/polars-parquet/src/parquet/read/metadata.rs b/crates/polars-parquet/src/parquet/read/metadata.rs index f92794fc2839..e14a2a60e997 100644 --- a/crates/polars-parquet/src/parquet/read/metadata.rs +++ b/crates/polars-parquet/src/parquet/read/metadata.rs @@ -2,9 +2,9 @@ use std::cmp::min; use std::io::{Read, Seek, SeekFrom}; use parquet_format_safe::thrift::protocol::TCompactInputProtocol; -use parquet_format_safe::FileMetaData as TFileMetaData; +use parquet_format_safe::FileMetaData as TFileMetadata; -use super::super::metadata::FileMetaData; +use super::super::metadata::FileMetadata; use super::super::{DEFAULT_FOOTER_READ_SIZE, FOOTER_SIZE, HEADER_SIZE, PARQUET_MAGIC}; use crate::parquet::error::{ParquetError, ParquetResult}; @@ -26,18 +26,18 @@ fn stream_len(seek: &mut impl Seek) -> std::result::Result Ok(len) } -/// Reads a [`FileMetaData`] from the reader, located at the end of the file. -pub fn read_metadata(reader: &mut R) -> ParquetResult { +/// Reads a [`FileMetadata`] from the reader, located at the end of the file. +pub fn read_metadata(reader: &mut R) -> ParquetResult { // check file is large enough to hold footer let file_size = stream_len(reader)?; read_metadata_with_size(reader, file_size) } -/// Reads a [`FileMetaData`] from the reader, located at the end of the file, with known file size. +/// Reads a [`FileMetadata`] from the reader, located at the end of the file, with known file size. 
pub fn read_metadata_with_size( reader: &mut R, file_size: u64, -) -> ParquetResult { +) -> ParquetResult { if file_size < HEADER_SIZE + FOOTER_SIZE { return Err(ParquetError::oos( "A parquet file must contain a header and footer with at least 12 bytes", @@ -92,9 +92,9 @@ pub fn read_metadata_with_size( } /// Parse loaded metadata bytes -pub fn deserialize_metadata(reader: R, max_size: usize) -> ParquetResult { +pub fn deserialize_metadata(reader: R, max_size: usize) -> ParquetResult { let mut prot = TCompactInputProtocol::new(reader, max_size); - let metadata = TFileMetaData::read_from_in_protocol(&mut prot)?; + let metadata = TFileMetadata::read_from_in_protocol(&mut prot)?; - FileMetaData::try_from_thrift(metadata) + FileMetadata::try_from_thrift(metadata) } diff --git a/crates/polars-parquet/src/parquet/read/mod.rs b/crates/polars-parquet/src/parquet/read/mod.rs index a66ac4817a8c..c3ec112e6864 100644 --- a/crates/polars-parquet/src/parquet/read/mod.rs +++ b/crates/polars-parquet/src/parquet/read/mod.rs @@ -19,33 +19,16 @@ use polars_utils::mmap::MemReader; pub use stream::read_metadata as read_metadata_async; use crate::parquet::error::ParquetResult; -use crate::parquet::metadata::{ColumnChunkMetaData, FileMetaData, RowGroupMetaData}; - -/// Filters row group metadata to only those row groups, -/// for which the predicate function returns true -pub fn filter_row_groups( - metadata: &FileMetaData, - predicate: &dyn Fn(&RowGroupMetaData, usize) -> bool, -) -> FileMetaData { - let mut filtered_row_groups = Vec::::new(); - for (i, row_group_metadata) in metadata.row_groups.iter().enumerate() { - if predicate(row_group_metadata, i) { - filtered_row_groups.push(row_group_metadata.clone()); - } - } - let mut metadata = metadata.clone(); - metadata.row_groups = filtered_row_groups; - metadata -} +use crate::parquet::metadata::ColumnChunkMetadata; /// Returns a new [`PageReader`] by seeking `reader` to the beginning of `column_chunk`. 
pub fn get_page_iterator( - column_chunk: &ColumnChunkMetaData, + column_chunk: &ColumnChunkMetadata, mut reader: MemReader, scratch: Vec, max_page_size: usize, ) -> ParquetResult { - let (col_start, _) = column_chunk.byte_range(); + let col_start = column_chunk.byte_range().start; reader.seek(SeekFrom::Start(col_start))?; Ok(PageReader::new( reader, @@ -54,14 +37,3 @@ pub fn get_page_iterator( max_page_size, )) } - -/// Returns all [`ColumnChunkMetaData`] associated to `field_name`. -/// For non-nested types, this returns an iterator with a single column -pub fn get_field_columns<'a>( - columns: &'a [ColumnChunkMetaData], - field_name: &'a str, -) -> impl Iterator { - columns - .iter() - .filter(move |x| x.descriptor().path_in_schema[0].as_str() == field_name) -} diff --git a/crates/polars-parquet/src/parquet/read/page/reader.rs b/crates/polars-parquet/src/parquet/read/page/reader.rs index f01cf55c4a8e..cd23af0499d7 100644 --- a/crates/polars-parquet/src/parquet/read/page/reader.rs +++ b/crates/polars-parquet/src/parquet/read/page/reader.rs @@ -7,14 +7,14 @@ use polars_utils::mmap::{MemReader, MemSlice}; use super::PageIterator; use crate::parquet::compression::Compression; use crate::parquet::error::{ParquetError, ParquetResult}; -use crate::parquet::metadata::{ColumnChunkMetaData, Descriptor}; +use crate::parquet::metadata::{ColumnChunkMetadata, Descriptor}; use crate::parquet::page::{ CompressedDataPage, CompressedDictPage, CompressedPage, DataPageHeader, PageType, ParquetPageHeader, }; use crate::parquet::CowBuffer; -/// This meta is a small part of [`ColumnChunkMetaData`]. +/// This meta is a small part of [`ColumnChunkMetadata`]. #[derive(Debug, Clone, PartialEq, Eq)] pub struct PageMetaData { /// The start offset of this column chunk in file. 
@@ -44,10 +44,10 @@ impl PageMetaData { } } -impl From<&ColumnChunkMetaData> for PageMetaData { - fn from(column: &ColumnChunkMetaData) -> Self { +impl From<&ColumnChunkMetadata> for PageMetaData { + fn from(column: &ColumnChunkMetadata) -> Self { Self { - column_start: column.byte_range().0, + column_start: column.byte_range().start, num_values: column.num_values(), compression: column.compression(), descriptor: column.descriptor().descriptor.clone(), @@ -89,7 +89,7 @@ impl PageReader { /// The parameter `max_header_size` pub fn new( reader: MemReader, - column: &ColumnChunkMetaData, + column: &ColumnChunkMetadata, scratch: Vec, max_page_size: usize, ) -> Self { diff --git a/crates/polars-parquet/src/parquet/read/page/stream.rs b/crates/polars-parquet/src/parquet/read/page/stream.rs index 0101196f3752..fbd36b3ccfe1 100644 --- a/crates/polars-parquet/src/parquet/read/page/stream.rs +++ b/crates/polars-parquet/src/parquet/read/page/stream.rs @@ -8,13 +8,13 @@ use polars_utils::mmap::MemSlice; use super::reader::{finish_page, PageMetaData}; use crate::parquet::compression::Compression; use crate::parquet::error::{ParquetError, ParquetResult}; -use crate::parquet::metadata::{ColumnChunkMetaData, Descriptor}; +use crate::parquet::metadata::{ColumnChunkMetadata, Descriptor}; use crate::parquet::page::{CompressedPage, DataPageHeader, ParquetPageHeader}; use crate::parquet::parquet_bridge::{Encoding, PageType}; /// Returns a stream of compressed data pages pub async fn get_page_stream<'a, RR: AsyncRead + Unpin + Send + AsyncSeek>( - column_metadata: &'a ColumnChunkMetaData, + column_metadata: &'a ColumnChunkMetadata, reader: &'a mut RR, scratch: Vec, max_page_size: usize, @@ -24,7 +24,7 @@ pub async fn get_page_stream<'a, RR: AsyncRead + Unpin + Send + AsyncSeek>( /// Returns a stream of compressed data pages from a reader that begins at the start of the column pub async fn get_page_stream_from_column_start<'a, R: AsyncRead + Unpin + Send>( - column_metadata: &'a 
ColumnChunkMetaData, + column_metadata: &'a ColumnChunkMetadata, reader: &'a mut R, scratch: Vec, max_header_size: usize, diff --git a/crates/polars-parquet/src/parquet/read/stream.rs b/crates/polars-parquet/src/parquet/read/stream.rs index ec8b26c3d31d..c3755106742b 100644 --- a/crates/polars-parquet/src/parquet/read/stream.rs +++ b/crates/polars-parquet/src/parquet/read/stream.rs @@ -2,7 +2,7 @@ use std::io::SeekFrom; use futures::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt}; -use super::super::metadata::FileMetaData; +use super::super::metadata::FileMetadata; use super::super::{DEFAULT_FOOTER_READ_SIZE, FOOTER_SIZE, PARQUET_MAGIC}; use super::metadata::{deserialize_metadata, metadata_len}; use crate::parquet::error::{ParquetError, ParquetResult}; @@ -26,7 +26,7 @@ async fn stream_len( /// Asynchronously reads the files' metadata pub async fn read_metadata( reader: &mut R, -) -> ParquetResult { +) -> ParquetResult { let file_size = stream_len(reader).await?; if file_size < HEADER_SIZE + FOOTER_SIZE { diff --git a/crates/polars-parquet/src/parquet/write/file.rs b/crates/polars-parquet/src/parquet/write/file.rs index e9a95be68e73..8dd3212bb76a 100644 --- a/crates/polars-parquet/src/parquet/write/file.rs +++ b/crates/polars-parquet/src/parquet/write/file.rs @@ -9,7 +9,7 @@ use super::row_group::write_row_group; use super::{RowGroupIterColumns, WriteOptions}; use crate::parquet::error::{ParquetError, ParquetResult}; pub use crate::parquet::metadata::KeyValue; -use crate::parquet::metadata::{SchemaDescriptor, ThriftFileMetaData}; +use crate::parquet::metadata::{SchemaDescriptor, ThriftFileMetadata}; use crate::parquet::write::State; use crate::parquet::{FOOTER_SIZE, PARQUET_MAGIC}; @@ -20,7 +20,7 @@ pub(super) fn start_file(writer: &mut W) -> ParquetResult { pub(super) fn end_file( mut writer: &mut W, - metadata: &ThriftFileMetaData, + metadata: &ThriftFileMetadata, ) -> ParquetResult { // Write metadata let mut protocol = TCompactOutputProtocol::new(&mut 
writer); @@ -67,7 +67,7 @@ pub struct FileWriter { /// Used to store the current state for writing the file state: State, // when the file is written, metadata becomes available - metadata: Option, + metadata: Option, } /// Writes a parquet file containing only the header and footer @@ -75,11 +75,11 @@ pub struct FileWriter { /// This is used to write the metadata as a separate Parquet file, usually when data /// is partitioned across multiple files. /// -/// Note: Recall that when combining row groups from [`ThriftFileMetaData`], the `file_path` on each +/// Note: Recall that when combining row groups from [`ThriftFileMetadata`], the `file_path` on each /// of their column chunks must be updated with their path relative to where they are written to. pub fn write_metadata_sidecar( writer: &mut W, - metadata: &ThriftFileMetaData, + metadata: &ThriftFileMetadata, ) -> ParquetResult { let mut len = start_file(writer)?; len += end_file(writer, metadata)?; @@ -98,11 +98,11 @@ impl FileWriter { &self.schema } - /// Returns the [`ThriftFileMetaData`]. This is Some iff the [`Self::end`] has been called. + /// Returns the [`ThriftFileMetadata`]. This is Some iff the [`Self::end`] has been called. 
/// /// This is used to write the metadata as a separate Parquet file, usually when data /// is partitioned across multiple files - pub fn metadata(&self) -> Option<&ThriftFileMetaData> { + pub fn metadata(&self) -> Option<&ThriftFileMetadata> { self.metadata.as_ref() } } @@ -225,7 +225,7 @@ impl FileWriter { ParquetResult::Ok(()) })?; - let metadata = ThriftFileMetaData::new( + let metadata = ThriftFileMetadata::new( self.options.version.into(), self.schema.clone().into_thrift(), num_rows, @@ -248,10 +248,10 @@ impl FileWriter { self.writer } - /// Returns the underlying writer and [`ThriftFileMetaData`] + /// Returns the underlying writer and [`ThriftFileMetadata`] /// # Panics /// This function panics if [`Self::end`] has not yet been called - pub fn into_inner_and_metadata(self) -> (W, ThriftFileMetaData) { + pub fn into_inner_and_metadata(self) -> (W, ThriftFileMetadata) { (self.writer, self.metadata.expect("File to have ended")) } } diff --git a/crates/polars-parquet/src/parquet/write/row_group.rs b/crates/polars-parquet/src/parquet/write/row_group.rs index 68c25a9c40fb..43404dc32a89 100644 --- a/crates/polars-parquet/src/parquet/write/row_group.rs +++ b/crates/polars-parquet/src/parquet/write/row_group.rs @@ -10,7 +10,7 @@ use super::column_chunk::write_column_chunk_async; use super::page::{is_data_page, PageWriteSpec}; use super::{DynIter, DynStreamingIterator}; use crate::parquet::error::{ParquetError, ParquetResult}; -use crate::parquet::metadata::{ColumnChunkMetaData, ColumnDescriptor}; +use crate::parquet::metadata::{ColumnChunkMetadata, ColumnDescriptor}; use crate::parquet::page::CompressedPage; pub struct ColumnOffsetsMetadata { @@ -34,7 +34,7 @@ impl ColumnOffsetsMetadata { } pub fn from_column_chunk_metadata( - column_chunk_metadata: &ColumnChunkMetaData, + column_chunk_metadata: &ColumnChunkMetadata, ) -> ColumnOffsetsMetadata { ColumnOffsetsMetadata { dictionary_page_offset: column_chunk_metadata.dictionary_page_offset(), diff --git 
a/crates/polars-parquet/src/parquet/write/stream.rs b/crates/polars-parquet/src/parquet/write/stream.rs index eadc4640e856..eca712db65dc 100644 --- a/crates/polars-parquet/src/parquet/write/stream.rs +++ b/crates/polars-parquet/src/parquet/write/stream.rs @@ -2,7 +2,7 @@ use std::io::Write; use futures::{AsyncWrite, AsyncWriteExt}; use parquet_format_safe::thrift::protocol::TCompactOutputStreamProtocol; -use parquet_format_safe::{FileMetaData, RowGroup}; +use parquet_format_safe::RowGroup; use super::row_group::write_row_group_async; use super::{RowGroupIterColumns, WriteOptions}; @@ -20,7 +20,7 @@ async fn start_file(writer: &mut W) -> ParquetResult async fn end_file( mut writer: &mut W, - metadata: FileMetaData, + metadata: parquet_format_safe::FileMetaData, ) -> ParquetResult { // Write file metadata let mut protocol = TCompactOutputStreamProtocol::new(&mut writer); @@ -169,7 +169,7 @@ impl FileStreamer { } } - let metadata = FileMetaData::new( + let metadata = parquet_format_safe::FileMetaData::new( self.options.version.into(), self.schema.clone().into_thrift(), num_rows, diff --git a/crates/polars-pipe/src/executors/operators/reproject.rs b/crates/polars-pipe/src/executors/operators/reproject.rs index 0c176b134af8..a4f6010bef79 100644 --- a/crates/polars-pipe/src/executors/operators/reproject.rs +++ b/crates/polars-pipe/src/executors/operators/reproject.rs @@ -1,6 +1,5 @@ use polars_core::error::PolarsResult; use polars_core::frame::DataFrame; -use polars_core::prelude::IndexOfSchema; use polars_core::schema::Schema; use crate::operators::DataChunk; @@ -15,12 +14,9 @@ pub(crate) fn reproject_chunk( // the positions for subsequent calls let chunk_schema = chunk.data.schema(); - let check_duplicates = false; - let out = chunk.data._select_with_schema_impl( - schema.get_names_owned().as_slice(), - &chunk_schema, - check_duplicates, - )?; + let out = chunk + .data + .select_with_schema_unchecked(schema.iter_names_cloned(), &chunk_schema)?; *positions = out 
.get_columns() diff --git a/crates/polars-pipe/src/executors/sinks/reproject.rs b/crates/polars-pipe/src/executors/sinks/reproject.rs index ecba66f188e4..bd9553b75f97 100644 --- a/crates/polars-pipe/src/executors/sinks/reproject.rs +++ b/crates/polars-pipe/src/executors/sinks/reproject.rs @@ -1,6 +1,5 @@ use std::any::Any; -use polars_core::prelude::IndexOfSchema; use polars_core::schema::SchemaRef; use crate::executors::sources::ReProjectSource; @@ -41,7 +40,7 @@ impl Sink for ReProjectSink { fn finalize(&mut self, context: &PExecutionContext) -> PolarsResult { Ok(match self.sink.finalize(context)? { FinalizedSink::Finished(df) => { - FinalizedSink::Finished(df._select_impl(self.schema.get_names_owned().as_slice())?) + FinalizedSink::Finished(df.select(self.schema.iter_names_cloned())?) }, FinalizedSink::Source(source) => { FinalizedSink::Source(Box::new(ReProjectSource::new(self.schema.clone(), source))) diff --git a/crates/polars-pipe/src/executors/sources/csv.rs b/crates/polars-pipe/src/executors/sources/csv.rs index 2c34228bada6..f3267ac1e90a 100644 --- a/crates/polars-pipe/src/executors/sources/csv.rs +++ b/crates/polars-pipe/src/executors/sources/csv.rs @@ -1,10 +1,11 @@ use std::fs::File; -use std::path::PathBuf; +use polars_core::error::feature_gated; use polars_core::{config, POOL}; use polars_io::csv::read::{BatchedCsvReader, CsvReadOptions, CsvReader}; use polars_io::path_utils::is_cloud_url; use polars_plan::global::_set_n_rows_for_scan; +use polars_plan::plans::ScanSources; use polars_plan::prelude::FileScanOptions; use polars_utils::itertools::Itertools; @@ -20,7 +21,7 @@ pub(crate) struct CsvSource { batched_reader: Option>, reader: Option>, n_threads: usize, - paths: Arc>, + sources: ScanSources, options: Option, file_options: FileScanOptions, verbose: bool, @@ -36,6 +37,10 @@ impl CsvSource { // otherwise all files would be opened during construction of the pipeline // leading to Too many Open files error fn init_next_reader(&mut self) -> 
PolarsResult<()> { + let paths = self + .sources + .as_paths() + .ok_or_else(|| polars_err!(nyi = "Streaming scanning of in-memory buffers"))?; let file_options = self.file_options.clone(); let n_rows = file_options.slice.map(|x| { @@ -43,12 +48,12 @@ impl CsvSource { x.1 }); - if self.current_path_idx == self.paths.len() + if self.current_path_idx == paths.len() || (n_rows.is_some() && n_rows.unwrap() <= self.n_rows_read) { return Ok(()); } - let path = &self.paths[self.current_path_idx]; + let path = &paths[self.current_path_idx]; let force_async = config::force_async(); let run_async = force_async || is_cloud_url(path); @@ -104,8 +109,7 @@ impl CsvSource { .with_row_index(row_index); let reader: CsvReader = if run_async { - #[cfg(feature = "cloud")] - { + feature_gated!("cloud", { options.into_reader_with_file_handle( polars_io::file_cache::FILE_CACHE .get_entry(path.to_str().unwrap()) @@ -113,11 +117,7 @@ impl CsvSource { .unwrap() .try_open_assume_latest()?, ) - } - #[cfg(not(feature = "cloud"))] - { - panic!("required feature `cloud` is not enabled") - } + }) } else { options .with_path(Some(path)) @@ -140,7 +140,7 @@ impl CsvSource { } pub(crate) fn new( - paths: Arc>, + sources: ScanSources, schema: SchemaRef, options: CsvReadOptions, file_options: FileScanOptions, @@ -151,7 +151,7 @@ impl CsvSource { reader: None, batched_reader: None, n_threads: POOL.current_num_threads(), - paths, + sources, options: Some(options), file_options, verbose, diff --git a/crates/polars-pipe/src/executors/sources/parquet.rs b/crates/polars-pipe/src/executors/sources/parquet.rs index cd0cb58f3574..7a0dabeb10df 100644 --- a/crates/polars-pipe/src/executors/sources/parquet.rs +++ b/crates/polars-pipe/src/executors/sources/parquet.rs @@ -10,7 +10,7 @@ use polars_core::error::*; use polars_core::prelude::Series; use polars_core::POOL; use polars_io::cloud::CloudOptions; -use polars_io::parquet::metadata::FileMetaDataRef; +use polars_io::parquet::metadata::FileMetadataRef; use 
polars_io::parquet::read::{BatchedParquetReader, ParquetOptions, ParquetReader}; use polars_io::path_utils::is_cloud_url; use polars_io::pl_async::get_runtime; @@ -20,7 +20,7 @@ use polars_io::prelude::materialize_projection; use polars_io::prelude::ParquetAsyncReader; use polars_io::utils::slice::split_slice_at_file; use polars_io::SerReader; -use polars_plan::plans::FileInfo; +use polars_plan::plans::{FileInfo, ScanSources}; use polars_plan::prelude::hive::HivePartitions; use polars_plan::prelude::FileScanOptions; use polars_utils::itertools::Itertools; @@ -36,12 +36,12 @@ pub struct ParquetSource { processed_paths: usize, processed_rows: AtomicUsize, iter: Range, - paths: Arc>, + sources: ScanSources, options: ParquetOptions, file_options: FileScanOptions, #[allow(dead_code)] cloud_options: Option, - metadata: Option, + metadata: Option, file_info: FileInfo, hive_parts: Option>>, verbose: bool, @@ -77,7 +77,11 @@ impl ParquetSource { usize, Option>, )> { - let path = &self.paths[index]; + let paths = self + .sources + .as_paths() + .ok_or_else(|| polars_err!(nyi = "Streaming scanning of in-memory buffers"))?; + let path = &paths[index]; let options = self.options; let file_options = self.file_options.clone(); let schema = self.file_info.schema.clone(); @@ -245,16 +249,19 @@ impl ParquetSource { #[allow(unused_variables)] #[allow(clippy::too_many_arguments)] pub(crate) fn new( - paths: Arc>, + sources: ScanSources, options: ParquetOptions, cloud_options: Option, - metadata: Option, + metadata: Option, file_options: FileScanOptions, file_info: FileInfo, hive_parts: Option>>, verbose: bool, predicate: Option>, ) -> PolarsResult { + let paths = sources + .as_paths() + .ok_or_else(|| polars_err!(nyi = "Streaming scanning of in-memory buffers"))?; let n_threads = POOL.current_num_threads(); let iter = 0..paths.len(); @@ -273,7 +280,7 @@ impl ParquetSource { options, file_options, iter, - paths, + sources, cloud_options, metadata, file_info, diff --git 
a/crates/polars-pipe/src/pipeline/convert.rs b/crates/polars-pipe/src/pipeline/convert.rs index 1e6f93eac9df..0a6a8946feba 100644 --- a/crates/polars-pipe/src/pipeline/convert.rs +++ b/crates/polars-pipe/src/pipeline/convert.rs @@ -67,14 +67,14 @@ where } // projection is free if let Some(schema) = output_schema { - let columns = schema.iter_names().cloned().collect::>(); + let columns = schema.iter_names_cloned().collect::>(); df = df._select_impl_unchecked(&columns)?; } } Ok(Box::new(sources::DataFrameSource::from_df(df)) as Box) }, Scan { - paths, + sources, file_info, hive_parts, file_options, @@ -82,6 +82,8 @@ where output_schema, scan_type, } => { + let paths = sources.into_paths(); + // Add predicate to operators. // Except for parquet, as that format can use statistics to prune file/row-groups. #[cfg(feature = "parquet")] @@ -102,7 +104,7 @@ where #[cfg(feature = "csv")] FileScan::Csv { options, .. } => { let src = sources::CsvSource::new( - paths, + sources, file_info.schema, options, file_options, @@ -144,7 +146,7 @@ where }) .transpose()?; let src = sources::ParquetSource::new( - paths, + sources, parquet_options, cloud_options, metadata, @@ -588,7 +590,7 @@ where let op = match lp_arena.get(node) { SimpleProjection { input, columns, .. 
} => { let input_schema = lp_arena.get(*input).schema(lp_arena); - let columns = columns.iter_names().cloned().collect(); + let columns = columns.iter_names_cloned().collect(); let op = operators::SimpleProjectionOperator::new(columns, input_schema.into_owned()); Box::new(op) as Box }, diff --git a/crates/polars-plan/Cargo.toml b/crates/polars-plan/Cargo.toml index b37b9b445f10..7edc15ea8616 100644 --- a/crates/polars-plan/Cargo.toml +++ b/crates/polars-plan/Cargo.toml @@ -26,6 +26,7 @@ ahash = { workspace = true } arrow = { workspace = true } bitflags = { workspace = true } bytemuck = { workspace = true } +bytes = { workspace = true } chrono = { workspace = true, optional = true } chrono-tz = { workspace = true, optional = true } ciborium = { workspace = true, optional = true } @@ -115,6 +116,7 @@ is_unique = ["polars-ops/is_unique"] is_between = ["polars-ops/is_between"] cross_join = ["polars-ops/cross_join"] asof_join = ["polars-time", "polars-ops/asof_join"] +iejoin = ["polars-ops/iejoin"] concat_str = [] business = ["polars-ops/business"] range = [] diff --git a/crates/polars-plan/src/client/check.rs b/crates/polars-plan/src/client/check.rs index a01addd9231d..84189840a3dd 100644 --- a/crates/polars-plan/src/client/check.rs +++ b/crates/polars-plan/src/client/check.rs @@ -2,7 +2,7 @@ use polars_core::error::{polars_err, PolarsResult}; use polars_io::path_utils::is_cloud_url; use crate::plans::options::SinkType; -use crate::plans::{DslPlan, FileScan}; +use crate::plans::{DslPlan, FileScan, ScanSources}; /// Assert that the given [`DslPlan`] is eligible to be executed on Polars Cloud. pub(super) fn assert_cloud_eligible(dsl: &DslPlan) -> PolarsResult<()> { @@ -10,15 +10,28 @@ pub(super) fn assert_cloud_eligible(dsl: &DslPlan) -> PolarsResult<()> { match plan_node { #[cfg(feature = "python")] DslPlan::PythonScan { .. } => return ineligible_error("contains Python scan"), - DslPlan::Scan { paths, .. 
} - if paths.lock().unwrap().0.iter().any(|p| !is_cloud_url(p)) => - { - return ineligible_error("contains scan of local file system") - }, DslPlan::Scan { - scan_type: FileScan::Anonymous { .. }, - .. - } => return ineligible_error("contains anonymous scan"), + sources, scan_type, .. + } => { + let sources_lock = sources.lock().unwrap(); + match &sources_lock.sources { + ScanSources::Paths(paths) => { + if paths.iter().any(|p| !is_cloud_url(p)) { + return ineligible_error("contains scan of local file system"); + } + }, + ScanSources::Files(_) => { + return ineligible_error("contains scan of opened files"); + }, + ScanSources::Buffers(_) => { + return ineligible_error("contains scan of in-memory buffer"); + }, + } + + if matches!(scan_type, FileScan::Anonymous { .. }) { + return ineligible_error("contains anonymous scan"); + } + }, DslPlan::Sink { payload, .. } => { if !matches!(payload, SinkType::Cloud { .. }) { return ineligible_error("contains sink to non-cloud location"); diff --git a/crates/polars-plan/src/dsl/expr.rs b/crates/polars-plan/src/dsl/expr.rs index f9cd0b046abd..a8c48cd17fb8 100644 --- a/crates/polars-plan/src/dsl/expr.rs +++ b/crates/polars-plan/src/dsl/expr.rs @@ -376,7 +376,7 @@ impl Display for Operator { } impl Operator { - pub(crate) fn is_comparison(&self) -> bool { + pub fn is_comparison(&self) -> bool { matches!( self, Self::Eq @@ -393,6 +393,29 @@ impl Operator { ) } + pub fn swap_operands(self) -> Self { + match self { + Operator::Eq => Operator::Eq, + Operator::Gt => Operator::Lt, + Operator::GtEq => Operator::LtEq, + Operator::LtEq => Operator::GtEq, + Operator::Or => Operator::Or, + Operator::LogicalAnd => Operator::LogicalAnd, + Operator::LogicalOr => Operator::LogicalOr, + Operator::Xor => Operator::Xor, + Operator::NotEq => Operator::NotEq, + Operator::EqValidity => Operator::EqValidity, + Operator::NotEqValidity => Operator::NotEqValidity, + Operator::Divide => Operator::Multiply, + Operator::Multiply => Operator::Divide, + 
Operator::And => Operator::And, + Operator::Plus => Operator::Minus, + Operator::Minus => Operator::Plus, + Operator::Lt => Operator::Gt, + _ => unimplemented!(), + } + } + pub fn is_arithmetic(&self) -> bool { !(self.is_comparison()) } diff --git a/crates/polars-plan/src/plans/aexpr/mod.rs b/crates/polars-plan/src/plans/aexpr/mod.rs index 35035ffee02e..70c6335bcd1e 100644 --- a/crates/polars-plan/src/plans/aexpr/mod.rs +++ b/crates/polars-plan/src/plans/aexpr/mod.rs @@ -426,19 +426,6 @@ impl AExpr { pub(crate) fn is_leaf(&self) -> bool { matches!(self, AExpr::Column(_) | AExpr::Literal(_) | AExpr::Len) } - pub(crate) fn new_null_count(input: &[ExprIR]) -> Self { - AExpr::Function { - input: input.to_vec(), - function: FunctionExpr::NullCount, - options: FunctionOptions { - collect_groups: ApplyOptions::GroupWise, - fmt_str: "", - cast_to_supertypes: None, - check_lengths: UnsafeBool::default(), - flags: FunctionFlags::ALLOW_GROUP_AWARE | FunctionFlags::RETURNS_SCALAR, - }, - } - } } impl IRAggExpr { diff --git a/crates/polars-plan/src/plans/builder_dsl.rs b/crates/polars-plan/src/plans/builder_dsl.rs index 60cc249ed48d..7efa55417509 100644 --- a/crates/polars-plan/src/plans/builder_dsl.rs +++ b/crates/polars-plan/src/plans/builder_dsl.rs @@ -1,5 +1,3 @@ -#[cfg(any(feature = "csv", feature = "ipc", feature = "parquet"))] -use std::path::PathBuf; use std::sync::{Arc, Mutex, RwLock}; use polars_core::prelude::*; @@ -60,7 +58,10 @@ impl DslBuilder { }; Ok(DslPlan::Scan { - paths: Arc::new(Mutex::new((Arc::new(vec![]), true))), + sources: Arc::new(Mutex::new(DslScanSources { + sources: ScanSources::Buffers(Arc::default()), + is_expanded: true, + })), file_info: Arc::new(RwLock::new(Some(file_info))), hive_parts: None, predicate: None, @@ -79,7 +80,7 @@ impl DslBuilder { #[cfg(feature = "parquet")] #[allow(clippy::too_many_arguments)] pub fn scan_parquet( - paths: Arc>, + sources: DslScanSources, n_rows: Option, cache: bool, parallel: 
polars_io::parquet::read::ParallelStrategy, @@ -92,8 +93,6 @@ impl DslBuilder { glob: bool, include_file_paths: Option, ) -> PolarsResult { - let paths = init_paths(paths); - let options = FileScanOptions { with_columns: None, cache, @@ -106,7 +105,7 @@ impl DslBuilder { include_file_paths, }; Ok(DslPlan::Scan { - paths, + sources: Arc::new(Mutex::new(sources)), file_info: Arc::new(RwLock::new(None)), hive_parts: None, predicate: None, @@ -127,7 +126,7 @@ impl DslBuilder { #[cfg(feature = "ipc")] #[allow(clippy::too_many_arguments)] pub fn scan_ipc( - paths: Arc>, + sources: DslScanSources, options: IpcScanOptions, n_rows: Option, cache: bool, @@ -137,10 +136,8 @@ impl DslBuilder { hive_options: HiveOptions, include_file_paths: Option, ) -> PolarsResult { - let paths = init_paths(paths); - Ok(DslPlan::Scan { - paths, + sources: Arc::new(Mutex::new(sources)), file_info: Arc::new(RwLock::new(None)), hive_parts: None, file_options: FileScanOptions { @@ -167,15 +164,13 @@ impl DslBuilder { #[allow(clippy::too_many_arguments)] #[cfg(feature = "csv")] pub fn scan_csv( - paths: Arc>, + sources: DslScanSources, read_options: CsvReadOptions, cache: bool, cloud_options: Option, glob: bool, include_file_paths: Option, ) -> PolarsResult { - let paths = init_paths(paths); - // This gets partially moved by FileScanOptions let read_options_clone = read_options.clone(); @@ -195,7 +190,7 @@ impl DslBuilder { include_file_paths, }; Ok(DslPlan::Scan { - paths, + sources: Arc::new(Mutex::new(sources)), file_info: Arc::new(RwLock::new(None)), hive_parts: None, file_options: options, @@ -403,6 +398,7 @@ impl DslBuilder { input_right: Arc::new(other), left_on, right_on, + predicates: Default::default(), options, } .into() @@ -463,9 +459,3 @@ impl DslBuilder { .into() } } - -/// Initialize paths as non-expanded. 
-#[cfg(any(feature = "csv", feature = "ipc", feature = "parquet"))] -fn init_paths(paths: Arc>) -> Arc>, bool)>> { - Arc::new(Mutex::new((paths, false))) -} diff --git a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs index 85d09edef4f7..a908378e6f5c 100644 --- a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs +++ b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs @@ -52,6 +52,7 @@ macro_rules! failed_here { format!("'{}' failed", stringify!($($t)*)).into() } } +pub(super) use {failed_here, failed_input, failed_input_args}; pub fn to_alp( lp: DslPlan, @@ -65,7 +66,7 @@ pub fn to_alp( opt_flags.contains(OptFlags::TYPE_COERCION), ); - let mut ctxt = ConversionContext { + let mut ctxt = DslConversionContext { expr_arena, lp_arena, conversion_optimizer, @@ -75,39 +76,45 @@ pub fn to_alp( to_alp_impl(lp, &mut ctxt) } -struct ConversionContext<'a> { - expr_arena: &'a mut Arena, - lp_arena: &'a mut Arena, - conversion_optimizer: ConversionOptimizer, - opt_flags: &'a mut OptFlags, +pub(super) struct DslConversionContext<'a> { + pub(super) expr_arena: &'a mut Arena, + pub(super) lp_arena: &'a mut Arena, + pub(super) conversion_optimizer: ConversionOptimizer, + pub(super) opt_flags: &'a mut OptFlags, +} + +pub(super) fn run_conversion( + lp: IR, + ctxt: &mut DslConversionContext, + name: &str, +) -> PolarsResult { + let lp_node = ctxt.lp_arena.add(lp); + ctxt.conversion_optimizer + .coerce_types(ctxt.expr_arena, ctxt.lp_arena, lp_node) + .map_err(|e| e.context(format!("'{name}' failed").into()))?; + + Ok(lp_node) } /// converts LogicalPlan to IR /// it adds expressions & lps to the respective arenas as it traverses the plan /// finally it returns the top node of the logical plan #[recursive] -pub fn to_alp_impl(lp: DslPlan, ctxt: &mut ConversionContext) -> PolarsResult { +pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult { let owned = Arc::unwrap_or_clone; - fn 
run_conversion(lp: IR, ctxt: &mut ConversionContext, name: &str) -> PolarsResult { - let lp_node = ctxt.lp_arena.add(lp); - ctxt.conversion_optimizer - .coerce_types(ctxt.expr_arena, ctxt.lp_arena, lp_node) - .map_err(|e| e.context(format!("'{name}' failed").into()))?; - - Ok(lp_node) - } - let v = match lp { DslPlan::Scan { - paths, + sources, file_info, hive_parts, predicate, mut file_options, mut scan_type, } => { - let paths = expand_scan_paths(paths, &mut scan_type, &mut file_options)?; + let mut sources_lock = sources.lock().unwrap(); + sources_lock.expand_paths(&mut scan_type, &mut file_options)?; + let sources = sources_lock.sources.clone(); let file_info_read = file_info.read().unwrap(); @@ -133,9 +140,12 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut ConversionContext) -> PolarsResult { - let (file_info, md) = - scans::parquet_file_info(&paths, &file_options, cloud_options.as_ref()) - .map_err(|e| e.context(failed_here!(parquet scan)))?; + let (file_info, md) = scans::parquet_file_info( + &sources, + &file_options, + cloud_options.as_ref(), + ) + .map_err(|e| e.context(failed_here!(parquet scan)))?; *metadata = md; file_info }, @@ -146,7 +156,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut ConversionContext) -> PolarsResult { let (file_info, md) = - scans::ipc_file_info(&paths, &file_options, cloud_options.as_ref()) + scans::ipc_file_info(&sources, &file_options, cloud_options.as_ref()) .map_err(|e| e.context(failed_here!(ipc scan)))?; *metadata = Some(md); file_info @@ -155,16 +165,19 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut ConversionContext) -> PolarsResult { - scans::csv_file_info(&paths, &file_options, options, cloud_options.as_ref()) - .map_err(|e| e.context(failed_here!(csv scan)))? 
- }, + } => scans::csv_file_info( + &sources, + &file_options, + options, + cloud_options.as_ref(), + ) + .map_err(|e| e.context(failed_here!(csv scan)))?, #[cfg(feature = "json")] FileScan::NDJson { options, cloud_options, } => scans::ndjson_file_info( - &paths, + &sources, &file_options, options, cloud_options.as_ref(), @@ -180,11 +193,15 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut ConversionContext) -> PolarsResult PolarsResult PolarsResult { - if matches!(options.args.how, JoinType::Cross) { - polars_ensure!(left_on.len() + right_on.len() == 0, InvalidOperation: "a 'cross' join doesn't expect any join keys"); - } else { - let mut turn_off_coalesce = false; - for e in left_on.iter().chain(right_on.iter()) { - if has_expr(e, |e| matches!(e, Expr::Alias(_, _))) { - polars_bail!( - ComputeError: - "'alias' is not allowed in a join key, use 'with_columns' first", - ) - } - // Any expression that is not a simple column expression will turn of coalescing. - turn_off_coalesce |= has_expr(e, |e| !matches!(e, Expr::Column(_))); - } - if turn_off_coalesce { - let options = Arc::make_mut(&mut options); - if matches!(options.args.coalesce, JoinCoalesce::CoalesceColumns) { - polars_warn!("coalescing join requested but not all join keys are column references, turning off key coalescing"); - } - options.args.coalesce = JoinCoalesce::KeepColumns; - } - - options.args.validation.is_valid_join(&options.args.how)?; - - polars_ensure!( - left_on.len() == right_on.len(), - ComputeError: - format!( - "the number of columns given as join key (left: {}, right:{}) should be equal", - left_on.len(), - right_on.len() - ) - ); - } - - let input_left = to_alp_impl(owned(input_left), ctxt) - .map_err(|e| e.context(failed_input!(join left)))?; - let input_right = to_alp_impl(owned(input_right), ctxt) - .map_err(|e| e.context(failed_input!(join, right)))?; - - let schema_left = ctxt.lp_arena.get(input_left).schema(ctxt.lp_arena); - let schema_right = 
ctxt.lp_arena.get(input_right).schema(ctxt.lp_arena); - - let schema = - det_join_schema(&schema_left, &schema_right, &left_on, &right_on, &options) - .map_err(|e| e.context(failed_here!(join schema resolving)))?; - - let left_on = to_expr_irs_ignore_alias(left_on, ctxt.expr_arena)?; - let right_on = to_expr_irs_ignore_alias(right_on, ctxt.expr_arena)?; - let mut joined_on = PlHashSet::new(); - for (l, r) in left_on.iter().zip(right_on.iter()) { - polars_ensure!( - joined_on.insert((l.output_name(), r.output_name())), - InvalidOperation: "joining with repeated key names; already joined on {} and {}", - l.output_name(), - r.output_name() - ) - } - drop(joined_on); - - ctxt.conversion_optimizer - .fill_scratch(&left_on, ctxt.expr_arena); - ctxt.conversion_optimizer - .fill_scratch(&right_on, ctxt.expr_arena); - - // Every expression must be elementwise so that we are - // guaranteed the keys for a join are all the same length. - let all_elementwise = - |aexprs: &[ExprIR]| all_streamable(aexprs, &*ctxt.expr_arena, Context::Default); - polars_ensure!( - all_elementwise(&left_on) && all_elementwise(&right_on), - InvalidOperation: "All join key expressions must be elementwise." - ); - let lp = IR::Join { - input_left, - input_right, - schema, + return join::resolve_join( + Either::Left(input_left), + Either::Left(input_right), left_on, right_on, + predicates, options, - }; - return run_conversion(lp, ctxt, "join"); + ctxt, + ) }, DslPlan::HStack { input, @@ -870,47 +815,48 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut ConversionContext) -> PolarsResult>, bool)>>, - scan_type: &mut FileScan, - file_options: &mut FileScanOptions, -) -> PolarsResult>> { - #[allow(unused_mut)] - let mut lock = paths.lock().unwrap(); +impl DslScanSources { + /// Expand scan paths if they were not already expanded. 
+ pub fn expand_paths( + &mut self, + scan_type: &mut FileScan, + file_options: &mut FileScanOptions, + ) -> PolarsResult<()> { + if self.is_expanded { + return Ok(()); + } - // Return if paths are already expanded - if lock.1 { - return Ok(lock.0.clone()); - } + let ScanSources::Paths(paths) = &self.sources else { + self.is_expanded = true; + return Ok(()); + }; - { - let paths_expanded = match &scan_type { + let expanded_sources = match &scan_type { #[cfg(feature = "parquet")] FileScan::Parquet { cloud_options, .. } => { - expand_scan_paths_with_hive_update(&lock.0, file_options, cloud_options)? + expand_scan_paths_with_hive_update(paths, file_options, cloud_options)? }, #[cfg(feature = "ipc")] FileScan::Ipc { cloud_options, .. } => { - expand_scan_paths_with_hive_update(&lock.0, file_options, cloud_options)? + expand_scan_paths_with_hive_update(paths, file_options, cloud_options)? }, #[cfg(feature = "csv")] FileScan::Csv { cloud_options, .. } => { - expand_paths(&lock.0, file_options.glob, cloud_options.as_ref())? + expand_paths(paths, file_options.glob, cloud_options.as_ref())? }, #[cfg(feature = "json")] FileScan::NDJson { cloud_options, .. } => { - expand_paths(&lock.0, file_options.glob, cloud_options.as_ref())? + expand_paths(paths, file_options.glob, cloud_options.as_ref())? }, FileScan::Anonymous { .. } => unreachable!(), // Invariant: Anonymous scans are already expanded. 
}; #[allow(unreachable_code)] { - *lock = (paths_expanded, true); + self.sources = ScanSources::Paths(expanded_sources); + self.is_expanded = true; - Ok(lock.0.clone()) + Ok(()) } } } @@ -921,7 +867,7 @@ fn expand_scan_paths_with_hive_update( paths: &[PathBuf], file_options: &mut FileScanOptions, cloud_options: &Option, -) -> PolarsResult>> { +) -> PolarsResult> { let hive_enabled = file_options.hive_options.enabled; let (expanded_paths, hive_start_idx) = expand_paths_hive( paths, @@ -1122,27 +1068,26 @@ pub(crate) fn maybe_init_projection_excluding_hive( // Update `with_columns` with a projection so that hive columns aren't loaded from the // file let hive_parts = hive_parts?; - let hive_schema = hive_parts.schema(); - let (first_hive_name, _) = hive_schema.get_at_index(0)?; - - let names = match reader_schema { - Either::Left(ref v) => { - let names = v.get_names_owned(); - names.contains(first_hive_name).then_some(names) - }, - Either::Right(ref v) => v - .contains(first_hive_name.as_str()) - .then(|| v.get_names_owned()), - }; - - let names = names?; - - Some( - names - .into_iter() - .filter(|x| !hive_schema.contains(x)) - .collect::>(), - ) + match &reader_schema { + Either::Left(reader_schema) => hive_schema + .iter_names() + .any(|x| reader_schema.contains(x)) + .then(|| { + reader_schema + .iter_names_cloned() + .filter(|x| !hive_schema.contains(x)) + .collect::>() + }), + Either::Right(reader_schema) => hive_schema + .iter_names() + .any(|x| reader_schema.contains(x)) + .then(|| { + reader_schema + .iter_names_cloned() + .filter(|x| !hive_schema.contains(x)) + .collect::>() + }), + } } diff --git a/crates/polars-plan/src/plans/conversion/join.rs b/crates/polars-plan/src/plans/conversion/join.rs new file mode 100644 index 000000000000..e7199b2c13d2 --- /dev/null +++ b/crates/polars-plan/src/plans/conversion/join.rs @@ -0,0 +1,473 @@ +use arrow::legacy::error::PolarsResult; +use either::Either; +use polars_core::error::feature_gated; + +use super::*; +use 
crate::dsl::Expr; +#[cfg(feature = "iejoin")] +use crate::plans::AExpr; + +fn check_join_keys(keys: &[Expr]) -> PolarsResult<()> { + for e in keys { + if has_expr(e, |e| matches!(e, Expr::Alias(_, _))) { + polars_bail!( + InvalidOperation: + "'alias' is not allowed in a join key, use 'with_columns' first", + ) + } + } + Ok(()) +} +pub fn resolve_join( + input_left: Either, Node>, + input_right: Either, Node>, + left_on: Vec, + right_on: Vec, + predicates: Vec, + mut options: Arc, + ctxt: &mut DslConversionContext, +) -> PolarsResult { + if !predicates.is_empty() { + feature_gated!("iejoin", { + debug_assert!(left_on.is_empty() && right_on.is_empty()); + return resolve_join_where( + input_left.unwrap_left(), + input_right.unwrap_left(), + predicates, + options, + ctxt, + ); + }) + } + + let owned = Arc::unwrap_or_clone; + if matches!(options.args.how, JoinType::Cross) { + polars_ensure!(left_on.len() + right_on.len() == 0, InvalidOperation: "a 'cross' join doesn't expect any join keys"); + } else { + polars_ensure!(left_on.len() + right_on.len() > 0, InvalidOperation: "expected join keys/predicates"); + check_join_keys(&left_on)?; + check_join_keys(&right_on)?; + + let mut turn_off_coalesce = false; + for e in left_on.iter().chain(right_on.iter()) { + // Any expression that is not a simple column expression will turn off coalescing. 
+ turn_off_coalesce |= has_expr(e, |e| !matches!(e, Expr::Column(_))); + } + if turn_off_coalesce { + let options = Arc::make_mut(&mut options); + if matches!(options.args.coalesce, JoinCoalesce::CoalesceColumns) { + polars_warn!("coalescing join requested but not all join keys are column references, turning off key coalescing"); + } + options.args.coalesce = JoinCoalesce::KeepColumns; + } + + options.args.validation.is_valid_join(&options.args.how)?; + + polars_ensure!( + left_on.len() == right_on.len(), + InvalidOperation: + format!( + "the number of columns given as join key (left: {}, right:{}) should be equal", + left_on.len(), + right_on.len() + ) + ); + } + + let input_left = input_left.map_right(Ok).right_or_else(|input| { + to_alp_impl(owned(input), ctxt).map_err(|e| e.context(failed_input!(join left))) + })?; + let input_right = input_right.map_right(Ok).right_or_else(|input| { + to_alp_impl(owned(input), ctxt).map_err(|e| e.context(failed_input!(join right))) + })?; + + let schema_left = ctxt.lp_arena.get(input_left).schema(ctxt.lp_arena); + let schema_right = ctxt.lp_arena.get(input_right).schema(ctxt.lp_arena); + + let schema = det_join_schema(&schema_left, &schema_right, &left_on, &right_on, &options) + .map_err(|e| e.context(failed_here!(join schema resolving)))?; + + let left_on = to_expr_irs_ignore_alias(left_on, ctxt.expr_arena)?; + let right_on = to_expr_irs_ignore_alias(right_on, ctxt.expr_arena)?; + let mut joined_on = PlHashSet::new(); + + #[cfg(feature = "iejoin")] + let check = !matches!(options.args.how, JoinType::IEJoin(_)); + #[cfg(not(feature = "iejoin"))] + let check = true; + if check { + for (l, r) in left_on.iter().zip(right_on.iter()) { + polars_ensure!( + joined_on.insert((l.output_name(), r.output_name())), + InvalidOperation: "joining with repeated key names; already joined on {} and {}", + l.output_name(), + r.output_name() + ) + } + } + drop(joined_on); + + ctxt.conversion_optimizer + .fill_scratch(&left_on, ctxt.expr_arena); + 
ctxt.conversion_optimizer
        .fill_scratch(&right_on, ctxt.expr_arena);

    // Every expression must be elementwise so that we are
    // guaranteed the keys for a join are all the same length.
    let all_elementwise =
        |aexprs: &[ExprIR]| all_streamable(aexprs, &*ctxt.expr_arena, Context::Default);
    polars_ensure!(
        all_elementwise(&left_on) && all_elementwise(&right_on),
        InvalidOperation: "All join key expressions must be elementwise."
    );
    let lp = IR::Join {
        input_left,
        input_right,
        schema,
        left_on,
        right_on,
        options,
    };
    run_conversion(lp, ctxt, "join")
}

/// Maps each inequality operator onto the general [`Operator`] variant of the same name.
#[cfg(feature = "iejoin")]
impl From<InequalityOperator> for Operator {
    fn from(value: InequalityOperator) -> Self {
        match value {
            InequalityOperator::LtEq => Operator::LtEq,
            InequalityOperator::Lt => Operator::Lt,
            InequalityOperator::GtEq => Operator::GtEq,
            InequalityOperator::Gt => Operator::Gt,
        }
    }
}

/// Converts a predicate-based join (`join_where`) into IR.
///
/// Every predicate must be a binary expression containing exactly one comparison.
/// The predicates are partitioned into equality predicates, inequality predicates
/// (at most two are kept as IEJoin candidates) and remaining predicates. A primary
/// join is then chosen:
///
/// * at least one equality predicate: a regular join on the equality keys,
/// * otherwise at least two inequality predicates: an IEJoin on the first two,
/// * otherwise: a cross join.
///
/// Every predicate that is not consumed by the primary join is stacked on top of the
/// join node as an `IR::Filter`.
///
/// # Returns
/// The node of the last filter, or the join node itself when no filter was needed.
///
/// # Errors
/// Returns `InvalidOperation` when a predicate is not a single binary comparison or
/// when its column names cannot be attributed unambiguously to one input, and
/// `ColumnNotFound` when a right-hand column of a filter predicate is missing from the
/// right input schema. Errors from converting the inputs or from `resolve_join` are
/// propagated.
#[cfg(feature = "iejoin")]
fn resolve_join_where(
    input_left: Arc<DslPlan>,
    input_right: Arc<DslPlan>,
    predicates: Vec<Expr>,
    mut options: Arc<JoinOptions>,
    ctxt: &mut DslConversionContext,
) -> PolarsResult<Node> {
    check_join_keys(&predicates)?;
    for e in &predicates {
        // Count the comparison operators anywhere in the expression tree.
        let n_binary_comparisons = e
            .into_iter()
            .filter(|e| matches!(e, Expr::BinaryExpr { op, .. } if op.is_comparison()))
            .count();
        polars_ensure!(n_binary_comparisons == 1, InvalidOperation: "only 1 binary comparison allowed as join condition");
    }
    let input_left = to_alp_impl(Arc::unwrap_or_clone(input_left), ctxt)
        .map_err(|e| e.context(failed_input!(join left)))?;
    // Fixed: the right input previously reported its failure as `join left`.
    let input_right = to_alp_impl(Arc::unwrap_or_clone(input_right), ctxt)
        .map_err(|e| e.context(failed_input!(join right)))?;

    let schema_left = ctxt.lp_arena.get(input_left).schema(ctxt.lp_arena);
    // Owned because it is still needed after `ctxt` has been mutably borrowed again.
    let schema_right = ctxt
        .lp_arena
        .get(input_right)
        .schema(ctxt.lp_arena)
        .into_owned();

    let owned = |e: Arc<Expr>| (*e).clone();

    // We do a few things
    // First we partition to:
    // - IEjoin supported inequality predicates
    // - equality predicates
    // - remaining predicates
    // And then decide to which join we dispatch.
    // The remaining predicates will be applied as filter.

    // What makes things a bit complicated is that duplicate join names
    // are referred to in the query with the name post-join, but on joins
    // we refer to the names pre-join (e.g. without suffix). So there is some
    // bookkeeping.
    //
    // - First we determine which side of the binary expression refers to the left and right table
    // and make sure that lhs of the binary expr, maps to the lhs of the join tables and vice versa.
    // Next we ensure the suffixes are removed when we partition.
    //
    // If a predicate has to be applied as post-join filter, we put the suffixes back if needed.
    let mut ie_left_on = vec![];
    let mut ie_right_on = vec![];
    let mut ie_op = vec![];

    let mut eq_left_on = vec![];
    let mut eq_right_on = vec![];

    let mut remaining_preds = vec![];

    /// Returns the IEJoin operator for `<`, `<=`, `>` and `>=`, `None` for anything else.
    fn to_inequality_operator(op: &Operator) -> Option<InequalityOperator> {
        match op {
            Operator::Lt => Some(InequalityOperator::Lt),
            Operator::LtEq => Some(InequalityOperator::LtEq),
            Operator::Gt => Some(InequalityOperator::Gt),
            Operator::GtEq => Some(InequalityOperator::GtEq),
            _ => None,
        }
    }

    /// Replaces every `Expr::Column(old)` inside `e` with `Expr::Column(new)`.
    fn rename_expr(e: Expr, old: &str, new: &str) -> Expr {
        e.map_expr(|e| match e {
            Expr::Column(name) if name.as_str() == old => Expr::Column(new.into()),
            e => e,
        })
    }

    /// Orients the comparison so that the returned left expression only refers to the
    /// left schema (swapping the operands and the operator when needed) and strips the
    /// join suffix from right-hand column names that exist in both schemas.
    fn determine_order_and_pre_join_names(
        left: Expr,
        op: Operator,
        right: Expr,
        schema_left: &Schema,
        schema_right: &Schema,
        suffix: &str,
    ) -> PolarsResult<(Expr, Operator, Expr)> {
        let left_names = expr_to_leaf_column_names_iter(&left).collect::<PlHashSet<_>>();
        let right_names = expr_to_leaf_column_names_iter(&right).collect::<PlHashSet<_>>();

        // All left should be in the left schema.
        let (left_names, right_names, left, op, mut right) =
            if !left_names.iter().all(|n| schema_left.contains(n)) {
                // If all right names are in left schema -> swap
                if right_names.iter().all(|n| schema_left.contains(n)) {
                    (right_names, left_names, right, op.swap_operands(), left)
                } else {
                    polars_bail!(InvalidOperation: "got ambiguous column names in 'join_where'")
                }
            } else {
                (left_names, right_names, left, op, right)
            };
        for name in &left_names {
            polars_ensure!(!right_names.contains(name.as_str()), InvalidOperation: "got ambiguous column names in 'join_where'\n\n\
            Note that you should refer to the column names as they are post-join operation.")
        }

        // Now we know left belongs to the left schema, rhs suffixes are dealt with.
        for post_join_name in right_names {
            if let Some(pre_join_name) = post_join_name.strip_suffix(suffix) {
                // Name is both sides, so a suffix will be added by the join.
                // We rename
                if schema_right.contains(pre_join_name) && schema_left.contains(pre_join_name) {
                    right = rename_expr(right, &post_join_name, pre_join_name);
                }
            }
        }
        Ok((left, op, right))
    }

    // Make it a binary comparison and ensure the columns refer to post join names.
    fn to_binary_post_join(
        l: Expr,
        op: Operator,
        mut r: Expr,
        schema_right: &Schema,
        suffix: &str,
    ) -> Expr {
        let names = expr_to_leaf_column_names_iter(&r).collect::<Vec<_>>();
        for pre_join_name in &names {
            if !schema_right.contains(pre_join_name) {
                let post_join_name = _join_suffix_name(pre_join_name, suffix);
                r = rename_expr(r, pre_join_name, post_join_name.as_str());
            }
        }

        Expr::BinaryExpr {
            left: Arc::from(l),
            op,
            right: Arc::from(r),
        }
    }

    let suffix = options.args.suffix().clone();
    for pred in predicates {
        // `pred` is owned and not used afterwards, so it is destructured without cloning.
        let Expr::BinaryExpr { left, op, right } = pred else {
            polars_bail!(InvalidOperation: "can only join on binary expressions")
        };
        polars_ensure!(op.is_comparison(), InvalidOperation: "expected comparison in join predicate");
        let (left, op, right) = determine_order_and_pre_join_names(
            owned(left),
            op,
            owned(right),
            &schema_left,
            &schema_right,
            &suffix,
        )?;

        if let Some(ie_op_) = to_inequality_operator(&op) {
            // We already have an IEjoin or an Inner join, push to remaining
            if ie_op.len() >= 2 || !eq_right_on.is_empty() {
                remaining_preds.push(to_binary_post_join(left, op, right, &schema_right, &suffix))
            } else {
                ie_left_on.push(left);
                ie_right_on.push(right);
                ie_op.push(ie_op_)
            }
        } else if matches!(op, Operator::Eq) {
            eq_left_on.push(left);
            eq_right_on.push(right);
        } else {
            remaining_preds.push(to_binary_post_join(left, op, right, &schema_right, &suffix));
        }
    }

    // Now choose a primary join and do the remaining predicates as filters
    // Add the ie predicates to the remaining predicates buffer so that they will be executed in the
    // filter node.
    fn ie_predicates_to_remaining(
        remaining_preds: &mut Vec<Expr>,
        ie_left_on: Vec<Expr>,
        ie_right_on: Vec<Expr>,
        ie_op: Vec<InequalityOperator>,
        schema_right: &Schema,
        suffix: &str,
    ) {
        remaining_preds.extend(
            ie_left_on
                .into_iter()
                .zip(ie_op)
                .zip(ie_right_on)
                .map(|((l, op), r)| to_binary_post_join(l, op.into(), r, schema_right, suffix)),
        );
    }

    let join_node = if !eq_left_on.is_empty() {
        // We found one or more equality predicates. Go into a default equi join
        // as those are cheapest on avg.
        let join_node = resolve_join(
            Either::Right(input_left),
            Either::Right(input_right),
            eq_left_on,
            eq_right_on,
            vec![],
            options.clone(),
            ctxt,
        )?;

        ie_predicates_to_remaining(
            &mut remaining_preds,
            ie_left_on,
            ie_right_on,
            ie_op,
            &schema_right,
            &suffix,
        );
        join_node
    }
    // TODO! once we support single IEjoin predicates, we must add a branch for the single ie_pred case.
    else if ie_right_on.len() >= 2 {
        // Do an IEjoin.
        let opts = Arc::make_mut(&mut options);
        opts.args.how = JoinType::IEJoin(IEJoinOptions {
            operator1: ie_op[0],
            operator2: ie_op[1],
        });

        let join_node = resolve_join(
            Either::Right(input_left),
            Either::Right(input_right),
            ie_left_on[..2].to_vec(),
            ie_right_on[..2].to_vec(),
            vec![],
            options.clone(),
            ctxt,
        )?;

        // The surplus ie-predicates will be added to the remaining predicates so that
        // they will be applied in a filter node.
        // NOTE: the partitioning above currently stores at most two IEJoin candidates, so
        // this loop is a safeguard for when that cap is lifted.
        while ie_right_on.len() > 2 {
            // Invariant: they all have equal length, so we can pop and unwrap all while len > 2.
            // The first 2 predicates are used in the IEJoin itself.
            // Fixed: the left operand must come from `ie_left_on` and the right operand
            // from `ie_right_on`; they were previously swapped.
            let l = ie_left_on.pop().unwrap();
            let r = ie_right_on.pop().unwrap();
            let op = ie_op.pop().unwrap();

            remaining_preds.push(to_binary_post_join(l, op.into(), r, &schema_right, &suffix))
        }
        join_node
    } else {
        // No predicates found that are supported in a fast algorithm.
        // Do a cross join and follow up with filters.
        let opts = Arc::make_mut(&mut options);
        opts.args.how = JoinType::Cross;

        let join_node = resolve_join(
            Either::Right(input_left),
            Either::Right(input_right),
            vec![],
            vec![],
            vec![],
            options.clone(),
            ctxt,
        )?;
        // TODO: This can be removed once we support the single IEjoin.
        ie_predicates_to_remaining(
            &mut remaining_preds,
            ie_left_on,
            ie_right_on,
            ie_op,
            &schema_right,
            &suffix,
        );
        join_node
    };

    // Re-read the input schemas from the join node that was actually built.
    let IR::Join {
        input_left,
        input_right,
        ..
    } = ctxt.lp_arena.get(join_node)
    else {
        unreachable!()
    };
    let schema_right = ctxt
        .lp_arena
        .get(*input_right)
        .schema(ctxt.lp_arena)
        .into_owned();

    let schema_left = ctxt
        .lp_arena
        .get(*input_left)
        .schema(ctxt.lp_arena)
        .into_owned();

    let mut last_node = join_node;

    // Ensure that the predicates use the proper suffix
    for e in remaining_preds {
        let predicate = to_expr_ir_ignore_alias(e, ctxt.expr_arena)?;
        let AExpr::BinaryExpr { mut right, .. } = *ctxt.expr_arena.get(predicate.node()) else {
            unreachable!()
        };

        let original_right = right;

        for name in aexpr_to_leaf_names(right, ctxt.expr_arena) {
            polars_ensure!(schema_right.contains(name.as_str()), ColumnNotFound: "could not find column {name} in the right table during join operation");
            // A right-hand column that also exists on the left gets the join suffix.
            if schema_left.contains(name.as_str()) {
                let new_name = _join_suffix_name(name.as_str(), suffix.as_str());

                right = rename_matching_aexpr_leaf_names(
                    right,
                    ctxt.expr_arena,
                    name.as_str(),
                    new_name,
                );
            }
        }
        // Put the renamed expression in the slot the predicate refers to.
        ctxt.expr_arena.swap(right, original_right);

        // Chain the filters: each one takes the previous node as input.
        let ir = IR::Filter {
            input: last_node,
            predicate,
        };
        last_node = ctxt.lp_arena.add(ir);
    }
    Ok(last_node)
}
diff --git a/crates/polars-plan/src/plans/conversion/mod.rs b/crates/polars-plan/src/plans/conversion/mod.rs index e07f8bc2848e..b9ed8711a438 100644 --- a/crates/polars-plan/src/plans/conversion/mod.rs +++ b/crates/polars-plan/src/plans/conversion/mod.rs @@ -21,6 +21,7 @@ use polars_core::prelude::*;
use polars_utils::vec::ConvertVec; use recursive::recursive; mod functions; +mod join; pub(crate) mod type_coercion; pub(crate) use expr_expansion::{expand_selectors, is_regex_projection, prepare_projection}; @@ -49,7 +50,7 @@ impl IR { }; match lp { IR::Scan { - paths, + sources, file_info, hive_parts, predicate, @@ -57,7 +58,10 @@ impl IR { output_schema: _, file_options: options, } => DslPlan::Scan { - paths: Arc::new(Mutex::new((paths, true))), + sources: Arc::new(Mutex::new(DslScanSources { + sources, + is_expanded: true, + })), file_info: Arc::new(RwLock::new(Some(file_info))), hive_parts, predicate: predicate.map(|e| e.to_expr(expr_arena)), @@ -215,6 +219,7 @@ impl IR { DslPlan::Join { input_left: Arc::new(i_l), input_right: Arc::new(i_r), + predicates: Default::default(), left_on, right_on, options, diff --git a/crates/polars-plan/src/plans/conversion/scans.rs b/crates/polars-plan/src/plans/conversion/scans.rs index 9b2636430622..9fd419f90f63 100644 --- a/crates/polars-plan/src/plans/conversion/scans.rs +++ b/crates/polars-plan/src/plans/conversion/scans.rs @@ -1,5 +1,3 @@ -use std::path::PathBuf; - use either::Either; use polars_io::path_utils::is_cloud_url; #[cfg(feature = "cloud")] @@ -9,13 +7,6 @@ use polars_io::RowIndex; use super::*; -fn get_first_path(paths: &[PathBuf]) -> PolarsResult<&PathBuf> { - // Use first path to get schema. 
- paths - .first() - .ok_or_else(|| polars_err!(ComputeError: "expected at least 1 path")) -} - #[cfg(any(feature = "parquet", feature = "ipc"))] fn prepare_output_schema(mut schema: Schema, row_index: Option<&RowIndex>) -> SchemaRef { if let Some(rc) = row_index { @@ -38,48 +29,47 @@ fn prepare_schemas(mut schema: Schema, row_index: Option<&RowIndex>) -> (SchemaR #[cfg(feature = "parquet")] pub(super) fn parquet_file_info( - paths: &[PathBuf], + sources: &ScanSources, file_options: &FileScanOptions, #[allow(unused)] cloud_options: Option<&polars_io::cloud::CloudOptions>, -) -> PolarsResult<(FileInfo, Option)> { - let path = get_first_path(paths)?; - - let (schema, reader_schema, num_rows, metadata) = if is_cloud_url(path) { - #[cfg(not(feature = "cloud"))] - panic!("One or more of the cloud storage features ('aws', 'gcp', ...) must be enabled."); - - #[cfg(feature = "cloud")] - { - let uri = path.to_string_lossy(); - get_runtime().block_on(async { - let mut reader = ParquetAsyncReader::from_uri(&uri, cloud_options, None).await?; - let reader_schema = reader.schema().await?; - let num_rows = reader.num_rows().await?; - let metadata = reader.get_metadata().await?.clone(); - - let schema = prepare_output_schema( - Schema::from_arrow_schema(reader_schema.as_ref()), - file_options.row_index.as_ref(), - ); - PolarsResult::Ok((schema, reader_schema, Some(num_rows), Some(metadata))) - })? +) -> PolarsResult<(FileInfo, Option)> { + use polars_core::error::feature_gated; + + let (reader_schema, num_rows, metadata) = { + if sources.is_cloud_url() { + let first_path = &sources.as_paths().unwrap()[0]; + feature_gated!("cloud", { + let uri = first_path.to_string_lossy(); + get_runtime().block_on(async { + let mut reader = + ParquetAsyncReader::from_uri(&uri, cloud_options, None).await?; + + PolarsResult::Ok(( + reader.schema().await?, + Some(reader.num_rows().await?), + Some(reader.get_metadata().await?.clone()), + )) + })? 
+ }) + } else { + let first_source = sources + .first() + .ok_or_else(|| polars_err!(ComputeError: "expected at least 1 source"))?; + let memslice = first_source.to_memslice()?; + let mut reader = ParquetReader::new(std::io::Cursor::new(memslice)); + ( + reader.schema()?, + Some(reader.num_rows()?), + Some(reader.get_metadata()?.clone()), + ) } - } else { - let file = polars_utils::open_file(path)?; - let mut reader = ParquetReader::new(file); - let reader_schema = reader.schema()?; - let schema = prepare_output_schema( - Schema::from_arrow_schema(reader_schema.as_ref()), - file_options.row_index.as_ref(), - ); - ( - schema, - reader_schema, - Some(reader.num_rows()?), - Some(reader.get_metadata()?.clone()), - ) }; + let schema = prepare_output_schema( + Schema::from_arrow_schema(reader_schema.as_ref()), + file_options.row_index.as_ref(), + ); + let file_info = FileInfo::new( schema, Some(Either::Left(reader_schema)), @@ -92,31 +82,42 @@ pub(super) fn parquet_file_info( // TODO! return metadata arced #[cfg(feature = "ipc")] pub(super) fn ipc_file_info( - paths: &[PathBuf], + sources: &ScanSources, file_options: &FileScanOptions, cloud_options: Option<&polars_io::cloud::CloudOptions>, ) -> PolarsResult<(FileInfo, arrow::io::ipc::read::FileMetadata)> { - let path = get_first_path(paths)?; - - let metadata = if is_cloud_url(path) { - #[cfg(not(feature = "cloud"))] - panic!("One or more of the cloud storage features ('aws', 'gcp', ...) must be enabled."); - - #[cfg(feature = "cloud")] - { - let uri = path.to_string_lossy(); - get_runtime().block_on(async { - polars_io::ipc::IpcReaderAsync::from_uri(&uri, cloud_options) - .await? - .metadata() - .await - })? - } - } else { - arrow::io::ipc::read::read_file_metadata(&mut std::io::BufReader::new( - polars_utils::open_file(path)?, - ))? 
+ use polars_core::error::feature_gated; + + let Some(first) = sources.first() else { + polars_bail!(ComputeError: "expected at least 1 source"); + }; + + let metadata = match first { + ScanSourceRef::Path(path) => { + if is_cloud_url(path) { + feature_gated!("cloud", { + let uri = path.to_string_lossy(); + get_runtime().block_on(async { + polars_io::ipc::IpcReaderAsync::from_uri(&uri, cloud_options) + .await? + .metadata() + .await + })? + }) + } else { + arrow::io::ipc::read::read_file_metadata(&mut std::io::BufReader::new( + polars_utils::open_file(path)?, + ))? + } + }, + ScanSourceRef::File(file) => { + arrow::io::ipc::read::read_file_metadata(&mut std::io::BufReader::new(file))? + }, + ScanSourceRef::Buffer(buff) => { + arrow::io::ipc::read::read_file_metadata(&mut std::io::Cursor::new(buff))? + }, }; + let file_info = FileInfo::new( prepare_output_schema( Schema::from_arrow_schema(metadata.schema.as_ref()), @@ -131,115 +132,94 @@ pub(super) fn ipc_file_info( #[cfg(feature = "csv")] pub(super) fn csv_file_info( - paths: &[PathBuf], + sources: &ScanSources, file_options: &FileScanOptions, csv_options: &mut CsvReadOptions, cloud_options: Option<&polars_io::cloud::CloudOptions>, ) -> PolarsResult { use std::io::{Read, Seek}; + use polars_core::error::feature_gated; use polars_core::{config, POOL}; use polars_io::csv::read::schema_inference::SchemaInferenceResult; use polars_io::utils::get_reader_bytes; use rayon::iter::{IntoParallelIterator, ParallelIterator}; + polars_ensure!(!sources.is_empty(), ComputeError: "expected at least 1 source"); + // TODO: // * See if we can do better than scanning all files if there is a row limit // * See if we can do this without downloading the entire file // prints the error message if paths is empty. 
- let first_path = get_first_path(paths)?; - let run_async = is_cloud_url(first_path) || config::force_async(); + let run_async = sources.is_cloud_url() || (sources.is_paths() && config::force_async()); let cache_entries = { - #[cfg(feature = "cloud")] - { - if run_async { + if run_async { + feature_gated!("cloud", { Some(polars_io::file_cache::init_entries_from_uri_list( - paths + sources + .as_paths() + .unwrap() .iter() .map(|path| Arc::from(path.to_str().unwrap())) .collect::>() .as_slice(), cloud_options, )?) - } else { - None - } - } - #[cfg(not(feature = "cloud"))] - { - if run_async { - panic!("required feature `cloud` is not enabled") - } + }) + } else { + None } }; let infer_schema_func = |i| { - let file = if run_async { - #[cfg(feature = "cloud")] - { - let entry: &Arc = - &cache_entries.as_ref().unwrap()[i]; - entry.try_open_check_latest()? - } - #[cfg(not(feature = "cloud"))] - { - panic!("required feature `cloud` is not enabled") - } - } else { - let p: &PathBuf = &paths[i]; - polars_utils::open_file(p.as_ref())? - }; - - let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; + let source = sources.at(i); + let memslice = source.to_memslice_possibly_async(run_async, cache_entries.as_ref(), i)?; let owned = &mut vec![]; - - let mut curs = std::io::Cursor::new(maybe_decompress_bytes(mmap.as_ref(), owned)?); - - if curs.read(&mut [0; 4])? < 2 && csv_options.raise_if_empty { + let mut reader = std::io::Cursor::new(maybe_decompress_bytes(&memslice, owned)?); + if reader.read(&mut [0; 4])? < 2 && csv_options.raise_if_empty { polars_bail!(NoData: "empty CSV") } - curs.rewind()?; + reader.rewind()?; - let reader_bytes = get_reader_bytes(&mut curs).expect("could not mmap file"); + let reader_bytes = get_reader_bytes(&mut reader).expect("could not mmap file"); // this needs a way to estimated bytes/rows. 
- let si_result = - SchemaInferenceResult::try_from_reader_bytes_and_options(&reader_bytes, csv_options)?; - - Ok(si_result) + SchemaInferenceResult::try_from_reader_bytes_and_options(&reader_bytes, csv_options) }; let merge_func = |a: PolarsResult, - b: PolarsResult| match (a, b) { - (Err(e), _) | (_, Err(e)) => Err(e), - (Ok(a), Ok(b)) => { - let merged_schema = if csv_options.schema.is_some() { - csv_options.schema.clone().unwrap() - } else { - let schema_a = a.get_inferred_schema(); - let schema_b = b.get_inferred_schema(); - - match (schema_a.is_empty(), schema_b.is_empty()) { - (true, _) => schema_b, - (_, true) => schema_a, - _ => { - let mut s = Arc::unwrap_or_clone(schema_a); - s.to_supertype(&schema_b)?; - Arc::new(s) - }, - } - }; - - Ok(a.with_inferred_schema(merged_schema)) - }, + b: PolarsResult| { + match (a, b) { + (Err(e), _) | (_, Err(e)) => Err(e), + (Ok(a), Ok(b)) => { + let merged_schema = if csv_options.schema.is_some() { + csv_options.schema.clone().unwrap() + } else { + let schema_a = a.get_inferred_schema(); + let schema_b = b.get_inferred_schema(); + + match (schema_a.is_empty(), schema_b.is_empty()) { + (true, _) => schema_b, + (_, true) => schema_a, + _ => { + let mut s = Arc::unwrap_or_clone(schema_a); + s.to_supertype(&schema_b)?; + Arc::new(s) + }, + } + }; + + Ok(a.with_inferred_schema(merged_schema)) + }, + } }; let si_results = POOL.join( || infer_schema_func(0), || { - (1..paths.len()) + (1..sources.len()) .into_par_iter() .map(infer_schema_func) .reduce(|| Ok(Default::default()), merge_func) @@ -276,58 +256,40 @@ pub(super) fn csv_file_info( #[cfg(feature = "json")] pub(super) fn ndjson_file_info( - paths: &[PathBuf], + sources: &ScanSources, file_options: &FileScanOptions, ndjson_options: &mut NDJsonReadOptions, cloud_options: Option<&polars_io::cloud::CloudOptions>, ) -> PolarsResult { use polars_core::config; + use polars_core::error::feature_gated; - let run_async = !paths.is_empty() && is_cloud_url(&paths[0]) || 
config::force_async(); + let Some(first) = sources.first() else { + polars_bail!(ComputeError: "expected at least 1 source"); + }; + + let run_async = sources.is_cloud_url() || (sources.is_paths() && config::force_async()); let cache_entries = { - #[cfg(feature = "cloud")] - { - if run_async { + if run_async { + feature_gated!("cloud", { Some(polars_io::file_cache::init_entries_from_uri_list( - paths + sources + .as_paths() + .unwrap() .iter() .map(|path| Arc::from(path.to_str().unwrap())) .collect::>() .as_slice(), cloud_options, )?) - } else { - None - } - } - #[cfg(not(feature = "cloud"))] - { - if run_async { - panic!("required feature `cloud` is not enabled") - } - } - }; - - let first_path = get_first_path(paths)?; - - let f = if run_async { - #[cfg(feature = "cloud")] - { - cache_entries.unwrap()[0].try_open_check_latest()? - } - #[cfg(not(feature = "cloud"))] - { - panic!("required feature `cloud` is not enabled") + }) + } else { + None } - } else { - polars_utils::open_file(first_path)? 
}; let owned = &mut vec![]; - let mmap = unsafe { memmap::Mmap::map(&f).unwrap() }; - - let mut reader = std::io::BufReader::new(maybe_decompress_bytes(mmap.as_ref(), owned)?); let (mut reader_schema, schema) = if let Some(schema) = ndjson_options.schema.take() { if file_options.row_index.is_none() { @@ -339,8 +301,12 @@ pub(super) fn ndjson_file_info( ) } } else { + let memslice = first.to_memslice_possibly_async(run_async, cache_entries.as_ref(), 0)?; + let mut reader = std::io::Cursor::new(maybe_decompress_bytes(&memslice, owned)?); + let schema = polars_io::ndjson::infer_schema(&mut reader, ndjson_options.infer_schema_length)?; + prepare_schemas(schema, file_options.row_index.as_ref()) }; diff --git a/crates/polars-plan/src/plans/file_scan.rs b/crates/polars-plan/src/plans/file_scan.rs index 73ae85d93646..e868b98d2799 100644 --- a/crates/polars-plan/src/plans/file_scan.rs +++ b/crates/polars-plan/src/plans/file_scan.rs @@ -5,7 +5,7 @@ use polars_io::csv::read::CsvReadOptions; #[cfg(feature = "ipc")] use polars_io::ipc::IpcScanOptions; #[cfg(feature = "parquet")] -use polars_io::parquet::metadata::FileMetaDataRef; +use polars_io::parquet::metadata::FileMetadataRef; #[cfg(feature = "parquet")] use polars_io::parquet::read::ParquetOptions; @@ -24,7 +24,7 @@ pub enum FileScan { options: ParquetOptions, cloud_options: Option, #[cfg_attr(feature = "serde", serde(skip))] - metadata: Option, + metadata: Option, }, #[cfg(feature = "ipc")] Ipc { diff --git a/crates/polars-plan/src/plans/functions/count.rs b/crates/polars-plan/src/plans/functions/count.rs index bd68db61a06c..7375ff47ff31 100644 --- a/crates/polars-plan/src/plans/functions/count.rs +++ b/crates/polars-plan/src/plans/functions/count.rs @@ -1,11 +1,18 @@ #[cfg(feature = "ipc")] use arrow::io::ipc::read::get_row_count as count_rows_ipc_sync; +#[cfg(any( + feature = "parquet", + feature = "ipc", + feature = "json", + feature = "csv" +))] +use polars_core::error::feature_gated; #[cfg(any(feature = "parquet", 
feature = "json"))] use polars_io::cloud::CloudOptions; #[cfg(feature = "csv")] -use polars_io::csv::read::count_rows as count_rows_csv; -#[cfg(any(feature = "parquet", feature = "ipc", feature = "json"))] -use polars_io::is_cloud_url; +use polars_io::csv::read::{ + count_rows as count_rows_csv, count_rows_from_slice as count_rows_csv_from_slice, +}; #[cfg(all(feature = "parquet", feature = "cloud"))] use polars_io::parquet::read::ParquetAsyncReader; #[cfg(feature = "parquet")] @@ -18,7 +25,11 @@ use polars_io::SerReader; use super::*; #[allow(unused_variables)] -pub fn count_rows(paths: &Arc>, scan_type: &FileScan) -> PolarsResult { +pub fn count_rows( + sources: &ScanSources, + scan_type: &FileScan, + alias: Option, +) -> PolarsResult { #[cfg(not(any( feature = "parquet", feature = "ipc", @@ -41,26 +52,10 @@ pub fn count_rows(paths: &Arc>, scan_type: &FileScan) -> PolarsResu FileScan::Csv { options, cloud_options, - } => { - let parse_options = options.get_parse_options(); - let n_rows: PolarsResult = paths - .iter() - .map(|path| { - count_rows_csv( - path, - parse_options.separator, - parse_options.quote_char, - parse_options.comment_prefix.as_ref(), - parse_options.eol_char, - options.has_header, - ) - }) - .sum(); - n_rows - }, + } => count_all_rows_csv(sources, options), #[cfg(feature = "parquet")] FileScan::Parquet { cloud_options, .. 
} => { - count_rows_parquet(paths, cloud_options.as_ref()) + count_rows_parquet(sources, cloud_options.as_ref()) }, #[cfg(feature = "ipc")] FileScan::Ipc { @@ -68,7 +63,7 @@ pub fn count_rows(paths: &Arc>, scan_type: &FileScan) -> PolarsResu cloud_options, metadata, } => count_rows_ipc( - paths, + sources, #[cfg(feature = "cloud")] cloud_options.as_ref(), metadata.as_ref(), @@ -77,7 +72,7 @@ pub fn count_rows(paths: &Arc>, scan_type: &FileScan) -> PolarsResu FileScan::NDJson { options, cloud_options, - } => count_rows_ndjson(paths, cloud_options.as_ref()), + } => count_rows_ndjson(sources, cloud_options.as_ref()), FileScan::Anonymous { .. } => { unreachable!() }, @@ -86,37 +81,67 @@ pub fn count_rows(paths: &Arc>, scan_type: &FileScan) -> PolarsResu let count: IdxSize = count.try_into().map_err( |_| polars_err!(ComputeError: "count of {} exceeded maximum row size", count), )?; - DataFrame::new(vec![Series::new( - PlSmallStr::from_static(crate::constants::LEN), - [count], - )]) + let column_name = alias.unwrap_or(PlSmallStr::from_static(crate::constants::LEN)); + DataFrame::new(vec![Series::new(column_name, [count])]) } } + +#[cfg(feature = "csv")] +fn count_all_rows_csv( + sources: &ScanSources, + options: &polars_io::prelude::CsvReadOptions, +) -> PolarsResult { + let parse_options = options.get_parse_options(); + + sources + .iter() + .map(|source| match source { + ScanSourceRef::Path(path) => count_rows_csv( + path, + parse_options.separator, + parse_options.quote_char, + parse_options.comment_prefix.as_ref(), + parse_options.eol_char, + options.has_header, + ), + _ => { + let memslice = source.to_memslice()?; + + count_rows_csv_from_slice( + &memslice[..], + parse_options.separator, + parse_options.quote_char, + parse_options.comment_prefix.as_ref(), + parse_options.eol_char, + options.has_header, + ) + }, + }) + .sum() +} + #[cfg(feature = "parquet")] pub(super) fn count_rows_parquet( - paths: &Arc>, + sources: &ScanSources, #[allow(unused)] cloud_options: 
Option<&CloudOptions>, ) -> PolarsResult { - if paths.is_empty() { + if sources.is_empty() { return Ok(0); }; - let is_cloud = is_cloud_url(paths.first().unwrap().as_path()); + let is_cloud = sources.is_cloud_url(); if is_cloud { - #[cfg(not(feature = "cloud"))] - panic!("One or more of the cloud storage features ('aws', 'gcp', ...) must be enabled."); - - #[cfg(feature = "cloud")] - { - get_runtime().block_on(count_rows_cloud_parquet(paths, cloud_options)) - } + feature_gated!("cloud", { + get_runtime().block_on(count_rows_cloud_parquet( + sources.as_paths().unwrap(), + cloud_options, + )) + }) } else { - paths + sources .iter() - .map(|path| { - let file = polars_utils::open_file(path)?; - let mut reader = ParquetReader::new(file); - reader.num_rows() + .map(|source| { + ParquetReader::new(std::io::Cursor::new(source.to_memslice()?)).num_rows() }) .sum::>() } @@ -124,7 +149,7 @@ pub(super) fn count_rows_parquet( #[cfg(all(feature = "parquet", feature = "async"))] async fn count_rows_cloud_parquet( - paths: &Arc>, + paths: &[std::path::PathBuf], cloud_options: Option<&CloudOptions>, ) -> PolarsResult { let collection = paths.iter().map(|path| { @@ -141,37 +166,37 @@ async fn count_rows_cloud_parquet( #[cfg(feature = "ipc")] pub(super) fn count_rows_ipc( - paths: &Arc>, + sources: &ScanSources, #[cfg(feature = "cloud")] cloud_options: Option<&CloudOptions>, metadata: Option<&arrow::io::ipc::read::FileMetadata>, ) -> PolarsResult { - if paths.is_empty() { + if sources.is_empty() { return Ok(0); }; - let is_cloud = is_cloud_url(paths.first().unwrap().as_path()); + let is_cloud = sources.is_cloud_url(); if is_cloud { - #[cfg(not(feature = "cloud"))] - panic!("One or more of the cloud storage features ('aws', 'gcp', ...) 
must be enabled."); - - #[cfg(feature = "cloud")] - { - get_runtime().block_on(count_rows_cloud_ipc(paths, cloud_options, metadata)) - } + feature_gated!("cloud", { + get_runtime().block_on(count_rows_cloud_ipc( + sources.as_paths().unwrap(), + cloud_options, + metadata, + )) + }) } else { - paths + sources .iter() - .map(|path| { - let mut reader = polars_utils::open_file(path)?; - count_rows_ipc_sync(&mut reader).map(|v| v as usize) + .map(|source| { + let memslice = source.to_memslice()?; + count_rows_ipc_sync(&mut std::io::Cursor::new(memslice)).map(|v| v as usize) }) - .sum() + .sum::>() } } #[cfg(all(feature = "ipc", feature = "async"))] async fn count_rows_cloud_ipc( - paths: &Arc>, + paths: &[std::path::PathBuf], cloud_options: Option<&CloudOptions>, metadata: Option<&arrow::io::ipc::read::FileMetadata>, ) -> PolarsResult { @@ -190,60 +215,47 @@ async fn count_rows_cloud_ipc( #[cfg(feature = "json")] pub(super) fn count_rows_ndjson( - paths: &Arc>, + sources: &ScanSources, cloud_options: Option<&CloudOptions>, ) -> PolarsResult { use polars_core::config; use polars_io::utils::maybe_decompress_bytes; - let run_async = !paths.is_empty() && is_cloud_url(&paths[0]) || config::force_async(); + if sources.is_empty() { + return Ok(0); + } + + let is_cloud_url = sources.is_cloud_url(); + let run_async = is_cloud_url || (sources.is_paths() && config::force_async()); let cache_entries = { - #[cfg(feature = "cloud")] - { - if run_async { + if run_async { + feature_gated!("cloud", { Some(polars_io::file_cache::init_entries_from_uri_list( - paths + sources + .as_paths() + .unwrap() .iter() .map(|path| Arc::from(path.to_str().unwrap())) .collect::>() .as_slice(), cloud_options, )?) 
- } else { - None - } - } - #[cfg(not(feature = "cloud"))] - { - if run_async { - panic!("required feature `cloud` is not enabled") - } + }) + } else { + None } }; - (0..paths.len()) - .map(|i| { - let f = if run_async { - #[cfg(feature = "cloud")] - { - let entry: &Arc = - &cache_entries.as_ref().unwrap()[0]; - entry.try_open_check_latest()? - } - #[cfg(not(feature = "cloud"))] - { - panic!("required feature `cloud` is not enabled") - } - } else { - polars_utils::open_file(&paths[i])? - }; - - let mmap = unsafe { memmap::Mmap::map(&f).unwrap() }; - let owned = &mut vec![]; + (0..sources.len()) + .map(|i| { + let memslice = sources + .at(i) + .to_memslice_possibly_async(run_async, cache_entries.as_ref(), i)?; + let owned = &mut vec![]; let reader = polars_io::ndjson::core::JsonLineReader::new(std::io::Cursor::new( - maybe_decompress_bytes(mmap.as_ref(), owned)?, + maybe_decompress_bytes(&memslice[..], owned)?, )); reader.count() }) diff --git a/crates/polars-plan/src/plans/functions/mod.rs b/crates/polars-plan/src/plans/functions/mod.rs index b0e5bb444689..61cce46de9af 100644 --- a/crates/polars-plan/src/plans/functions/mod.rs +++ b/crates/polars-plan/src/plans/functions/mod.rs @@ -10,10 +10,10 @@ mod schema; use std::borrow::Cow; use std::fmt::{Debug, Display, Formatter}; use std::hash::{Hash, Hasher}; -use std::path::PathBuf; use std::sync::{Arc, Mutex}; pub use dsl::*; +use polars_core::error::feature_gated; use polars_core::prelude::*; use polars_utils::pl_str::PlSmallStr; #[cfg(feature = "serde")] @@ -24,6 +24,7 @@ use strum_macros::IntoStaticStr; use crate::dsl::python_udf::PythonFunction; #[cfg(feature = "merge_sorted")] use crate::plans::functions::merge_sorted::merge_sorted; +use crate::plans::ir::ScanSourcesDisplay; use crate::prelude::*; #[cfg_attr(feature = "ir_serde", derive(Serialize, Deserialize))] @@ -45,7 +46,7 @@ pub enum FunctionIR { fmt_str: PlSmallStr, }, FastCount { - paths: Arc>, + sources: ScanSources, scan_type: FileScan, alias: Option, }, @@ 
-104,9 +105,14 @@ impl PartialEq for FunctionIR { use FunctionIR::*; match (self, other) { (Rechunk, Rechunk) => true, - (FastCount { paths: paths_l, .. }, FastCount { paths: paths_r, .. }) => { - paths_l == paths_r - }, + ( + FastCount { + sources: srcs_l, .. + }, + FastCount { + sources: srcs_r, .. + }, + ) => srcs_l == srcs_r, ( Rename { existing: existing_l, @@ -138,11 +144,11 @@ impl Hash for FunctionIR { FunctionIR::OpaquePython { .. } => {}, FunctionIR::Opaque { fmt_str, .. } => fmt_str.hash(state), FunctionIR::FastCount { - paths, + sources, scan_type, alias, } => { - paths.hash(state); + sources.hash(state); scan_type.hash(state); alias.hash(state); }, @@ -261,8 +267,10 @@ impl FunctionIR { .. }) => python_udf::call_python_udf(function, df, *validate_output, schema.as_deref()), FastCount { - paths, scan_type, .. - } => count::count_rows(paths, scan_type), + sources, + scan_type, + alias, + } => count::count_rows(sources, scan_type, alias.clone()), Rechunk => { df.as_single_chunk_par(); Ok(df) @@ -270,14 +278,7 @@ impl FunctionIR { #[cfg(feature = "merge_sorted")] MergeSorted { column } => merge_sorted(&df, column.as_ref()), Unnest { columns: _columns } => { - #[cfg(feature = "dtype-struct")] - { - df.unnest(_columns.iter().cloned()) - } - #[cfg(not(feature = "dtype-struct"))] - { - panic!("activate feature 'dtype-struct'") - } + feature_gated!("dtype-struct", df.unnest(_columns.iter().cloned())) }, Pipeline { function, .. 
} => { // we use a global string cache here as streaming chunks all have different rev maps @@ -346,6 +347,21 @@ impl Display for FunctionIR { write!(f, "STREAMING") } }, + FastCount { + sources, + scan_type, + alias, + } => { + let scan_type: &str = scan_type.into(); + let default_column_name = PlSmallStr::from_static(crate::constants::LEN); + let alias = alias.as_ref().unwrap_or(&default_column_name); + + write!( + f, + "FAST COUNT ({scan_type}) {} as \"{alias}\"", + ScanSourcesDisplay(sources) + ) + }, v => { let s: &str = v.into(); write!(f, "{s}") diff --git a/crates/polars-plan/src/plans/hive.rs b/crates/polars-plan/src/plans/hive.rs index 3fc7531ea2b3..a711aeb11848 100644 --- a/crates/polars-plan/src/plans/hive.rs +++ b/crates/polars-plan/src/plans/hive.rs @@ -57,6 +57,8 @@ impl HivePartitions { } } +/// Note: Returned hive partitions are ordered by their position in the `reader_schema` +/// /// # Safety /// `hive_start_idx <= [min path length]` pub fn hive_partitions_from_paths( @@ -198,10 +200,11 @@ pub fn hive_partitions_from_paths( } let mut hive_partitions = Vec::with_capacity(paths.len()); - let buffers = buffers + let mut buffers = buffers .into_iter() .map(|x| x.into_series()) .collect::>>()?; + buffers.sort_by_key(|s| reader_schema.index_of(s.name()).unwrap_or(usize::MAX)); #[allow(clippy::needless_range_loop)] for i in 0..paths.len() { diff --git a/crates/polars-plan/src/plans/ir/dot.rs b/crates/polars-plan/src/plans/ir/dot.rs index 69e3a69733c5..51050f2fa877 100644 --- a/crates/polars-plan/src/plans/ir/dot.rs +++ b/crates/polars-plan/src/plans/ir/dot.rs @@ -247,7 +247,7 @@ impl<'a> IRDotDisplay<'a> { })?; }, Scan { - paths, + sources, file_info, hive_parts: _, predicate, @@ -256,7 +256,7 @@ impl<'a> IRDotDisplay<'a> { output_schema: _, } => { let name: &str = scan_type.into(); - let path = PathsDisplay(paths.as_ref()); + let path = ScanSourcesDisplay(sources); let with_columns = options.with_columns.as_ref().map(|cols| cols.as_ref()); let 
with_columns = NumColumns(with_columns); let total_columns = @@ -343,10 +343,37 @@ impl<'a> IRDotDisplay<'a> { // A few utility structures for formatting pub struct PathsDisplay<'a>(pub &'a [PathBuf]); +pub struct ScanSourcesDisplay<'a>(pub &'a ScanSources); struct NumColumns<'a>(Option<&'a [PlSmallStr]>); struct NumColumnsSchema<'a>(Option<&'a Schema>); struct OptionExprIRDisplay<'a>(Option>); +impl fmt::Display for ScanSourceRef<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ScanSourceRef::Path(path) => path.display().fmt(f), + ScanSourceRef::File(_) => f.write_str("open-file"), + ScanSourceRef::Buffer(buff) => write!(f, "{} in-mem bytes", buff.len()), + } + } +} + +impl fmt::Display for ScanSourcesDisplay<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.0.len() { + 0 => write!(f, "[]"), + 1 => write!(f, "[{}]", self.0.at(0)), + 2 => write!(f, "[{}, {}]", self.0.at(0), self.0.at(1)), + _ => write!( + f, + "[{}, ... {} other sources]", + self.0.at(0), + self.0.len() - 1, + ), + } + } +} + impl fmt::Display for PathsDisplay<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self.0.len() { @@ -356,7 +383,7 @@ impl fmt::Display for PathsDisplay<'_> { _ => write!( f, "[{}, ... 
{} other files]", - self.0[0].to_string_lossy(), + self.0[0].display(), self.0.len() - 1, ), } diff --git a/crates/polars-plan/src/plans/ir/format.rs b/crates/polars-plan/src/plans/ir/format.rs index 60699be85095..76de9f3beb24 100644 --- a/crates/polars-plan/src/plans/ir/format.rs +++ b/crates/polars-plan/src/plans/ir/format.rs @@ -1,14 +1,13 @@ use std::borrow::Cow; use std::fmt; use std::fmt::{Display, Formatter}; -use std::path::PathBuf; use polars_core::datatypes::AnyValue; use polars_core::schema::Schema; use polars_io::RowIndex; use recursive::recursive; -use super::ir::dot::PathsDisplay; +use self::ir::dot::ScanSourcesDisplay; use crate::prelude::*; pub struct IRDisplay<'a> { @@ -56,7 +55,7 @@ impl AsExpr for ExprIR { fn write_scan( f: &mut Formatter, name: &str, - path: &[PathBuf], + sources: &ScanSources, indent: usize, n_columns: i64, total_columns: usize, @@ -64,7 +63,12 @@ fn write_scan( slice: Option<(i64, usize)>, row_index: Option<&RowIndex>, ) -> fmt::Result { - write!(f, "{:indent$}{name} SCAN {}", "", PathsDisplay(path))?; + write!( + f, + "{:indent$}{name} SCAN {}", + "", + ScanSourcesDisplay(sources) + )?; let total_columns = total_columns - usize::from(row_index.is_some()); if n_columns > 0 { @@ -171,7 +175,7 @@ impl<'a> IRDisplay<'a> { write_scan( f, "PYTHON", - &[], + &ScanSources::default(), indent, n_columns, total_columns, @@ -221,7 +225,7 @@ impl<'a> IRDisplay<'a> { self.with_root(*input)._format(f, sub_indent) }, Scan { - paths, + sources, file_info, predicate, scan_type, @@ -239,7 +243,7 @@ impl<'a> IRDisplay<'a> { write_scan( f, scan_type.into(), - paths, + sources, indent, n_columns, file_info.schema.len(), diff --git a/crates/polars-plan/src/plans/ir/inputs.rs b/crates/polars-plan/src/plans/ir/inputs.rs index b00c91cddae4..2a7c14e300de 100644 --- a/crates/polars-plan/src/plans/ir/inputs.rs +++ b/crates/polars-plan/src/plans/ir/inputs.rs @@ -101,7 +101,7 @@ impl IR { options: *options, }, Scan { - paths, + sources, file_info, 
hive_parts, output_schema, @@ -114,7 +114,7 @@ impl IR { new_predicate = exprs.pop() } Scan { - paths: paths.clone(), + sources: sources.clone(), file_info: file_info.clone(), hive_parts: hive_parts.clone(), output_schema: output_schema.clone(), diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index 443726affad0..a9eb45b6406f 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -1,19 +1,20 @@ mod dot; mod format; mod inputs; +mod scan_sources; mod schema; pub(crate) mod tree_format; use std::borrow::Cow; use std::fmt; -use std::path::PathBuf; -pub use dot::{EscapeLabel, IRDotDisplay, PathsDisplay}; +pub use dot::{EscapeLabel, IRDotDisplay, PathsDisplay, ScanSourcesDisplay}; pub use format::{ExprIRDisplay, IRDisplay}; use hive::HivePartitions; use polars_core::prelude::*; use polars_utils::idx_vec::UnitVec; use polars_utils::unitvec; +pub use scan_sources::{ScanSourceIter, ScanSourceRef, ScanSources}; #[cfg(feature = "ir_serde")] use serde::{Deserialize, Serialize}; @@ -52,7 +53,7 @@ pub enum IR { predicate: ExprIR, }, Scan { - paths: Arc>, + sources: ScanSources, file_info: FileInfo, hive_parts: Option>>, predicate: Option, diff --git a/crates/polars-plan/src/plans/ir/scan_sources.rs b/crates/polars-plan/src/plans/ir/scan_sources.rs new file mode 100644 index 000000000000..08d8cad0bf49 --- /dev/null +++ b/crates/polars-plan/src/plans/ir/scan_sources.rs @@ -0,0 +1,282 @@ +use std::fs::File; +use std::path::{Path, PathBuf}; +use std::sync::Arc; + +use polars_core::error::{feature_gated, PolarsResult}; +use polars_io::cloud::CloudOptions; +#[cfg(feature = "cloud")] +use polars_io::utils::byte_source::{DynByteSource, DynByteSourceBuilder}; +use polars_utils::mmap::MemSlice; +use polars_utils::pl_str::PlSmallStr; + +use super::DslScanSources; + +/// Set of sources to scan from +/// +/// This is can either be a list of paths to files, opened files or in-memory buffers. 
Mixing of +/// buffers is not currently possible. +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +#[derive(Debug, Clone)] +pub enum ScanSources { + Paths(Arc<[PathBuf]>), + + #[cfg_attr(feature = "serde", serde(skip))] + Files(Arc<[File]>), + #[cfg_attr(feature = "serde", serde(skip))] + Buffers(Arc<[bytes::Bytes]>), +} + +/// A reference to a single item in [`ScanSources`] +#[derive(Debug, Clone, Copy)] +pub enum ScanSourceRef<'a> { + Path(&'a Path), + File(&'a File), + Buffer(&'a bytes::Bytes), +} + +/// An iterator for [`ScanSources`] +pub struct ScanSourceIter<'a> { + sources: &'a ScanSources, + offset: usize, +} + +impl Default for ScanSources { + fn default() -> Self { + Self::Buffers(Arc::default()) + } +} + +impl std::hash::Hash for ScanSources { + fn hash(&self, state: &mut H) { + std::mem::discriminant(self).hash(state); + + // @NOTE: This is a bit crazy + // + // We don't really want to hash the file descriptors or the whole buffers so for now we + // just settle with the fact that the memory behind Arc's does not really move. Therefore, + // we can just hash the pointer. + match self { + Self::Paths(paths) => paths.hash(state), + Self::Files(files) => files.as_ptr().hash(state), + Self::Buffers(buffers) => buffers.as_ptr().hash(state), + } + } +} + +impl PartialEq for ScanSources { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (ScanSources::Paths(l), ScanSources::Paths(r)) => l == r, + (ScanSources::Files(l), ScanSources::Files(r)) => std::ptr::eq(l.as_ptr(), r.as_ptr()), + (ScanSources::Buffers(l), ScanSources::Buffers(r)) => { + std::ptr::eq(l.as_ptr(), r.as_ptr()) + }, + _ => false, + } + } +} + +impl Eq for ScanSources {} + +impl ScanSources { + pub fn iter(&self) -> ScanSourceIter { + ScanSourceIter { + sources: self, + offset: 0, + } + } + + pub fn to_dsl(self, is_expanded: bool) -> DslScanSources { + DslScanSources { + sources: self, + is_expanded, + } + } + + /// Are the sources all paths? 
+ pub fn is_paths(&self) -> bool { + matches!(self, Self::Paths(_)) + } + + /// Try cast the scan sources to [`ScanSources::Paths`] + pub fn as_paths(&self) -> Option<&[PathBuf]> { + match self { + Self::Paths(paths) => Some(paths.as_ref()), + Self::Files(_) | Self::Buffers(_) => None, + } + } + + /// Try cast the scan sources to [`ScanSources::Paths`] with a clone + pub fn into_paths(&self) -> Option> { + match self { + Self::Paths(paths) => Some(paths.clone()), + Self::Files(_) | Self::Buffers(_) => None, + } + } + + /// Try get the first path in the scan sources + pub fn first_path(&self) -> Option<&Path> { + match self { + Self::Paths(paths) => paths.first().map(|p| p.as_path()), + Self::Files(_) | Self::Buffers(_) => None, + } + } + + /// Is the first path a cloud URL? + pub fn is_cloud_url(&self) -> bool { + self.first_path().is_some_and(polars_io::is_cloud_url) + } + + pub fn len(&self) -> usize { + match self { + Self::Paths(s) => s.len(), + Self::Files(s) => s.len(), + Self::Buffers(s) => s.len(), + } + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + pub fn first(&self) -> Option { + self.get(0) + } + + /// Turn the [`ScanSources`] into some kind of identifier + pub fn id(&self) -> PlSmallStr { + if self.is_empty() { + return PlSmallStr::from_static("EMPTY"); + } + + match self { + Self::Paths(paths) => { + PlSmallStr::from_str(paths.first().unwrap().to_string_lossy().as_ref()) + }, + Self::Files(_) => PlSmallStr::from_static("OPEN_FILES"), + Self::Buffers(_) => PlSmallStr::from_static("IN_MEMORY"), + } + } + + /// Get the scan source at specific address + pub fn get(&self, idx: usize) -> Option { + match self { + Self::Paths(paths) => paths.get(idx).map(|p| ScanSourceRef::Path(p)), + Self::Files(files) => files.get(idx).map(ScanSourceRef::File), + Self::Buffers(buffers) => buffers.get(idx).map(ScanSourceRef::Buffer), + } + } + + /// Get the scan source at specific address + /// + /// # Panics + /// + /// If the `idx` is out of range. 
+ #[track_caller] + pub fn at(&self, idx: usize) -> ScanSourceRef { + self.get(idx).unwrap() + } +} + +impl<'a> ScanSourceRef<'a> { + /// Get the name for `include_paths` + pub fn to_include_path_name(&self) -> &str { + match self { + Self::Path(path) => path.to_str().unwrap(), + Self::File(_) => "open-file", + Self::Buffer(_) => "in-mem", + } + } + + /// Turn the scan source into a memory slice + pub fn to_memslice(&self) -> PolarsResult { + self.to_memslice_possibly_async(false, None, 0) + } + + pub fn to_memslice_async_latest(&self, run_async: bool) -> PolarsResult { + match self { + ScanSourceRef::Path(path) => { + let file = if run_async { + feature_gated!("cloud", { + polars_io::file_cache::FILE_CACHE + .get_entry(path.to_str().unwrap()) + // Safety: This was initialized by schema inference. + .unwrap() + .try_open_assume_latest()? + }) + } else { + polars_utils::open_file(path)? + }; + + MemSlice::from_file(&file) + }, + ScanSourceRef::File(file) => MemSlice::from_file(file), + ScanSourceRef::Buffer(buff) => Ok(MemSlice::from_bytes((*buff).clone())), + } + } + + pub fn to_memslice_possibly_async( + &self, + run_async: bool, + #[cfg(feature = "cloud")] cache_entries: Option< + &Vec>, + >, + #[cfg(not(feature = "cloud"))] cache_entries: Option<&()>, + index: usize, + ) -> PolarsResult { + match self { + Self::Path(path) => { + let file = if run_async { + feature_gated!("cloud", { + cache_entries.unwrap()[index].try_open_check_latest()? + }) + } else { + polars_utils::open_file(path)? 
+ }; + + MemSlice::from_file(&file) + }, + Self::File(file) => MemSlice::from_file(file), + Self::Buffer(buff) => Ok(MemSlice::from_bytes((*buff).clone())), + } + } + + #[cfg(feature = "cloud")] + pub async fn to_dyn_byte_source( + &self, + builder: &DynByteSourceBuilder, + cloud_options: Option<&CloudOptions>, + ) -> PolarsResult { + match self { + Self::Path(path) => { + builder + .try_build_from_path(path.to_str().unwrap(), cloud_options) + .await + }, + Self::File(file) => Ok(DynByteSource::from(MemSlice::from_file(file)?)), + Self::Buffer(buff) => Ok(DynByteSource::from(MemSlice::from_bytes((*buff).clone()))), + } + } +} + +impl<'a> Iterator for ScanSourceIter<'a> { + type Item = ScanSourceRef<'a>; + + fn next(&mut self) -> Option { + let item = match self.sources { + ScanSources::Paths(paths) => ScanSourceRef::Path(paths.get(self.offset)?), + ScanSources::Files(files) => ScanSourceRef::File(files.get(self.offset)?), + ScanSources::Buffers(buffers) => ScanSourceRef::Buffer(buffers.get(self.offset)?), + }; + + self.offset += 1; + Some(item) + } + + fn size_hint(&self) -> (usize, Option) { + let len = self.sources.len() - self.offset; + (len, Some(len)) + } +} + +impl<'a> ExactSizeIterator for ScanSourceIter<'a> {} diff --git a/crates/polars-plan/src/plans/mod.rs b/crates/polars-plan/src/plans/mod.rs index d225683a0d3f..92eeb783bf76 100644 --- a/crates/polars-plan/src/plans/mod.rs +++ b/crates/polars-plan/src/plans/mod.rs @@ -1,6 +1,5 @@ use std::fmt; use std::fmt::Debug; -use std::path::PathBuf; use std::sync::{Arc, Mutex, RwLock}; use hive::HivePartitions; @@ -59,6 +58,13 @@ pub enum Context { Default, } +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Clone)] +pub struct DslScanSources { + pub sources: ScanSources, + pub is_expanded: bool, +} + // https://stackoverflow.com/questions/1031076/what-are-projection-and-selection #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum DslPlan { @@ -76,7 +82,7 @@ pub enum 
DslPlan { cache_hits: u32, }, Scan { - paths: Arc>, bool)>>, + sources: Arc>, // Option as this is mostly materialized on the IR phase. // During conversion we update the value in the DSL as well // This is to cater to use cases where parts of a `LazyFrame` @@ -117,8 +123,11 @@ pub enum DslPlan { Join { input_left: Arc, input_right: Arc, + // Invariant: left_on and right_on are equal length. left_on: Vec, right_on: Vec, + // Invariant: Either left_on/right_on or predicates is set (non-empty). + predicates: Vec, options: Arc, }, /// Adding columns to the table without a Join @@ -190,11 +199,11 @@ impl Clone for DslPlan { Self::PythonScan { options } => Self::PythonScan { options: options.clone() }, Self::Filter { input, predicate } => Self::Filter { input: input.clone(), predicate: predicate.clone() }, Self::Cache { input, id, cache_hits } => Self::Cache { input: input.clone(), id: id.clone(), cache_hits: cache_hits.clone() }, - Self::Scan { paths, file_info, hive_parts, predicate, file_options, scan_type } => Self::Scan { paths: paths.clone(), file_info: file_info.clone(), hive_parts: hive_parts.clone(), predicate: predicate.clone(), file_options: file_options.clone(), scan_type: scan_type.clone() }, + Self::Scan { sources, file_info, hive_parts, predicate, file_options, scan_type } => Self::Scan { sources: sources.clone(), file_info: file_info.clone(), hive_parts: hive_parts.clone(), predicate: predicate.clone(), file_options: file_options.clone(), scan_type: scan_type.clone() }, Self::DataFrameScan { df, schema, output_schema, filter: selection } => Self::DataFrameScan { df: df.clone(), schema: schema.clone(), output_schema: output_schema.clone(), filter: selection.clone() }, Self::Select { expr, input, options } => Self::Select { expr: expr.clone(), input: input.clone(), options: options.clone() }, Self::GroupBy { input, keys, aggs, apply, maintain_order, options } => Self::GroupBy { input: input.clone(), keys: keys.clone(), aggs: aggs.clone(), apply: 
apply.clone(), maintain_order: maintain_order.clone(), options: options.clone() }, - Self::Join { input_left, input_right, left_on, right_on, options } => Self::Join { input_left: input_left.clone(), input_right: input_right.clone(), left_on: left_on.clone(), right_on: right_on.clone(), options: options.clone() }, + Self::Join { input_left, input_right, left_on, right_on, predicates, options } => Self::Join { input_left: input_left.clone(), input_right: input_right.clone(), left_on: left_on.clone(), right_on: right_on.clone(), options: options.clone(), predicates: predicates.clone() }, Self::HStack { input, exprs, options } => Self::HStack { input: input.clone(), exprs: exprs.clone(), options: options.clone() }, Self::Distinct { input, options } => Self::Distinct { input: input.clone(), options: options.clone() }, Self::Sort {input,by_column, slice, sort_options } => Self::Sort { input: input.clone(), by_column: by_column.clone(), slice: slice.clone(), sort_options: sort_options.clone() }, diff --git a/crates/polars-plan/src/plans/optimizer/cache_states.rs b/crates/polars-plan/src/plans/optimizer/cache_states.rs index 4a2a2d87e75b..da13d047d43f 100644 --- a/crates/polars-plan/src/plans/optimizer/cache_states.rs +++ b/crates/polars-plan/src/plans/optimizer/cache_states.rs @@ -15,7 +15,7 @@ fn get_upper_projections( // During projection pushdown all accumulated. match parent { SimpleProjection { columns, .. 
} => { - let iter = columns.iter_names().cloned(); + let iter = columns.iter_names_cloned(); names_scratch.extend(iter); *found_required_columns = true; false @@ -264,7 +264,7 @@ pub(super) fn set_cache_states( // all columns if !found_required_columns { let schema = lp.schema(lp_arena); - v.names_union.extend(schema.iter_names().cloned()); + v.names_union.extend(schema.iter_names_cloned()); } } frame.cache_id = Some(*id); diff --git a/crates/polars-plan/src/plans/optimizer/count_star.rs b/crates/polars-plan/src/plans/optimizer/count_star.rs index 32a95cc3ede3..1f20c83f6a87 100644 --- a/crates/polars-plan/src/plans/optimizer/count_star.rs +++ b/crates/polars-plan/src/plans/optimizer/count_star.rs @@ -32,7 +32,7 @@ impl OptimizationRule for CountStar { let alp = IR::MapFunction { input: placeholder_node, function: FunctionIR::FastCount { - paths: count_star_expr.paths, + sources: count_star_expr.sources, scan_type: count_star_expr.scan_type, alias: count_star_expr.alias, }, @@ -49,7 +49,7 @@ struct CountStarExpr { // Top node of the projection to replace node: Node, // Paths to the input files - paths: Arc>, + sources: ScanSources, // File Type scan_type: FileScan, // Column Alias @@ -66,12 +66,34 @@ fn visit_logical_plan_for_scan_paths( ) -> Option { match lp_arena.get(node) { IR::Union { inputs, .. 
} => { + enum MutableSources { + Paths(Vec), + Buffers(Vec), + } + let mut scan_type: Option = None; - let mut paths = Vec::with_capacity(inputs.len()); + let mut sources = None; for input in inputs { match visit_logical_plan_for_scan_paths(*input, lp_arena, expr_arena, true) { Some(expr) => { - paths.extend(expr.paths.iter().cloned()); + match (expr.sources, &mut sources) { + ( + ScanSources::Paths(paths), + Some(MutableSources::Paths(ref mut mutable_paths)), + ) => mutable_paths.extend_from_slice(&paths[..]), + (ScanSources::Paths(paths), None) => { + sources = Some(MutableSources::Paths(paths.to_vec())) + }, + ( + ScanSources::Buffers(buffers), + Some(MutableSources::Buffers(ref mut mutable_buffers)), + ) => mutable_buffers.extend_from_slice(&buffers[..]), + (ScanSources::Buffers(buffers), None) => { + sources = Some(MutableSources::Buffers(buffers.to_vec())) + }, + _ => return None, + } + match &scan_type { None => scan_type = Some(expr.scan_type), Some(scan_type) => { @@ -88,16 +110,20 @@ fn visit_logical_plan_for_scan_paths( } } Some(CountStarExpr { - paths: paths.into(), + sources: match sources { + Some(MutableSources::Paths(paths)) => ScanSources::Paths(paths.into()), + Some(MutableSources::Buffers(buffers)) => ScanSources::Buffers(buffers.into()), + None => ScanSources::default(), + }, scan_type: scan_type.unwrap(), node, alias: None, }) }, IR::Scan { - scan_type, paths, .. + scan_type, sources, .. } if !matches!(scan_type, FileScan::Anonymous { .. 
}) => Some(CountStarExpr { - paths: paths.clone(), + sources: sources.clone(), scan_type: scan_type.clone(), node, alias: None, diff --git a/crates/polars-plan/src/plans/optimizer/mod.rs b/crates/polars-plan/src/plans/optimizer/mod.rs index 4215347f2e7d..34dc6dca9a29 100644 --- a/crates/polars-plan/src/plans/optimizer/mod.rs +++ b/crates/polars-plan/src/plans/optimizer/mod.rs @@ -18,7 +18,6 @@ mod join_utils; mod predicate_pushdown; mod projection_pushdown; mod simplify_expr; -mod simplify_functions; mod slice_pushdown_expr; mod slice_pushdown_lp; mod stack_opt; diff --git a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs index 1def3d375958..7cb0753e5a6d 100644 --- a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs +++ b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs @@ -325,7 +325,7 @@ impl<'a> PredicatePushDown<'a> { Ok(lp) }, Scan { - mut paths, + mut sources, file_info, hive_parts: mut scan_hive_parts, ref predicate, @@ -366,6 +366,9 @@ impl<'a> PredicatePushDown<'a> { if let (Some(hive_parts), Some(predicate)) = (&scan_hive_parts, &predicate) { if let Some(io_expr) = self.expr_eval.unwrap()(predicate, expr_arena) { if let Some(stats_evaluator) = io_expr.as_stats_evaluator() { + let paths = sources.as_paths().ok_or_else(|| { + polars_err!(nyi = "Hive partitioning of in-memory buffers") + })?; let mut new_paths = Vec::with_capacity(paths.len()); let mut new_hive_parts = Vec::with_capacity(paths.len()); @@ -400,7 +403,7 @@ impl<'a> PredicatePushDown<'a> { filter: None, }); } else { - paths = Arc::from(new_paths); + sources = ScanSources::Paths(new_paths.into()); scan_hive_parts = Some(Arc::from(new_hive_parts)); } } @@ -422,7 +425,7 @@ impl<'a> PredicatePushDown<'a> { let lp = if do_optimization { Scan { - paths, + sources, file_info, hive_parts, predicate, @@ -432,7 +435,7 @@ impl<'a> PredicatePushDown<'a> { } } else { let lp = Scan { - paths, + 
sources, file_info, hive_parts, predicate: None, diff --git a/crates/polars-plan/src/plans/optimizer/projection_pushdown/mod.rs b/crates/polars-plan/src/plans/optimizer/projection_pushdown/mod.rs index e5e2fb94ccde..61c86e789d95 100644 --- a/crates/polars-plan/src/plans/optimizer/projection_pushdown/mod.rs +++ b/crates/polars-plan/src/plans/optimizer/projection_pushdown/mod.rs @@ -346,7 +346,7 @@ impl ProjectionPushDown { expr_arena, ), SimpleProjection { columns, input, .. } => { - let exprs = names_to_expr_irs(columns.iter_names().cloned(), expr_arena); + let exprs = names_to_expr_irs(columns.iter_names_cloned(), expr_arena); process_projection( self, input, @@ -398,7 +398,7 @@ impl ProjectionPushDown { Ok(PythonScan { options }) }, Scan { - paths, + sources, mut file_info, mut hive_parts, scan_type, @@ -510,7 +510,7 @@ impl ProjectionPushDown { } }; let lp = Scan { - paths, + sources, file_info, hive_parts, output_schema, diff --git a/crates/polars-plan/src/plans/optimizer/simplify_expr.rs b/crates/polars-plan/src/plans/optimizer/simplify_expr/mod.rs similarity index 97% rename from crates/polars-plan/src/plans/optimizer/simplify_expr.rs rename to crates/polars-plan/src/plans/optimizer/simplify_expr/mod.rs index aec74543a5dc..1df68a0adcfa 100644 --- a/crates/polars-plan/src/plans/optimizer/simplify_expr.rs +++ b/crates/polars-plan/src/plans/optimizer/simplify_expr/mod.rs @@ -1,8 +1,24 @@ +mod simplify_functions; + use polars_utils::floor_divmod::FloorDivMod; use polars_utils::total_ord::ToTotalOrd; +use simplify_functions::optimize_functions; use crate::plans::*; -use crate::prelude::optimizer::simplify_functions::optimize_functions; + +fn new_null_count(input: &[ExprIR]) -> AExpr { + AExpr::Function { + input: input.to_vec(), + function: FunctionExpr::NullCount, + options: FunctionOptions { + collect_groups: ApplyOptions::GroupWise, + fmt_str: "", + cast_to_supertypes: None, + check_lengths: UnsafeBool::default(), + flags: FunctionFlags::ALLOW_GROUP_AWARE | 
FunctionFlags::RETURNS_SCALAR, + }, + } +} macro_rules! eval_binary_same_type { ($lhs:expr, $rhs:expr, |$l: ident, $r: ident| $ret: expr) => {{ @@ -457,7 +473,7 @@ impl OptimizationRule for SimplifyExprRule { match expr_arena.get(drop_nulls_input_node) { AExpr::Column(_) => Some(AExpr::BinaryExpr { op: Operator::Minus, - right: expr_arena.add(AExpr::new_null_count(input)), + right: expr_arena.add(new_null_count(input)), left: expr_arena.add(AExpr::Agg(IRAggExpr::Count( drop_nulls_input_node, true, @@ -481,7 +497,7 @@ impl OptimizationRule for SimplifyExprRule { input, function: FunctionExpr::Boolean(BooleanFunction::IsNull), options: _, - } => Some(AExpr::new_null_count(input)), + } => Some(new_null_count(input)), AExpr::Function { input, function: FunctionExpr::Boolean(BooleanFunction::IsNotNull), @@ -494,7 +510,7 @@ impl OptimizationRule for SimplifyExprRule { match expr_arena.get(is_not_null_input_node) { AExpr::Column(_) => Some(AExpr::BinaryExpr { op: Operator::Minus, - right: expr_arena.add(AExpr::new_null_count(input)), + right: expr_arena.add(new_null_count(input)), left: expr_arena.add(AExpr::Agg(IRAggExpr::Count( is_not_null_input_node, true, diff --git a/crates/polars-plan/src/plans/optimizer/simplify_functions.rs b/crates/polars-plan/src/plans/optimizer/simplify_expr/simplify_functions.rs similarity index 97% rename from crates/polars-plan/src/plans/optimizer/simplify_functions.rs rename to crates/polars-plan/src/plans/optimizer/simplify_expr/simplify_functions.rs index 3d9518193276..2b5493c62e6b 100644 --- a/crates/polars-plan/src/plans/optimizer/simplify_functions.rs +++ b/crates/polars-plan/src/plans/optimizer/simplify_expr/simplify_functions.rs @@ -18,7 +18,7 @@ pub(super) fn optimize_functions( function: FunctionExpr::Boolean(BooleanFunction::IsNull), options: _, } => Some(AExpr::BinaryExpr { - left: expr_arena.add(AExpr::new_null_count(input)), + left: expr_arena.add(new_null_count(input)), op: Operator::Gt, right: 
expr_arena.add(AExpr::Literal(LiteralValue::new_idxsize(0))), }), @@ -34,7 +34,7 @@ pub(super) fn optimize_functions( match expr_arena.get(is_not_null_input_node) { AExpr::Column(_) => Some(AExpr::BinaryExpr { op: Operator::Lt, - left: expr_arena.add(AExpr::new_null_count(input)), + left: expr_arena.add(new_null_count(input)), right: expr_arena.add(AExpr::Agg(IRAggExpr::Count( is_not_null_input_node, true, @@ -66,7 +66,7 @@ pub(super) fn optimize_functions( match expr_arena.get(is_null_input_node) { AExpr::Column(_) => Some(AExpr::BinaryExpr { op: Operator::Eq, - right: expr_arena.add(AExpr::new_null_count(input)), + right: expr_arena.add(new_null_count(input)), left: expr_arena .add(AExpr::Agg(IRAggExpr::Count(is_null_input_node, true))), }), @@ -81,7 +81,7 @@ pub(super) fn optimize_functions( function: FunctionExpr::Boolean(BooleanFunction::IsNotNull), options: _, } => Some(AExpr::BinaryExpr { - left: expr_arena.add(AExpr::new_null_count(input)), + left: expr_arena.add(new_null_count(input)), op: Operator::Eq, right: expr_arena.add(AExpr::Literal(LiteralValue::new_idxsize(0))), }), diff --git a/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs b/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs index f62bd9ee197d..b656795f53d2 100644 --- a/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs +++ b/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs @@ -165,7 +165,7 @@ impl SlicePushDown { } #[cfg(feature = "csv")] (Scan { - paths, + sources, file_info, hive_parts, output_schema, @@ -176,7 +176,7 @@ impl SlicePushDown { file_options.slice = Some((0, state.offset as usize + state.len as usize)); let lp = Scan { - paths, + sources, file_info, hive_parts, output_schema, @@ -189,7 +189,7 @@ impl SlicePushDown { }, #[cfg(feature = "parquet")] (Scan { - paths, + sources, file_info, hive_parts, output_schema, @@ -200,7 +200,7 @@ impl SlicePushDown { file_options.slice = Some((state.offset, state.len as usize)); let lp = Scan { - paths, 
+ sources, file_info, hive_parts, output_schema, @@ -213,7 +213,7 @@ impl SlicePushDown { }, // TODO! we currently skip slice pushdown if there is a predicate. (Scan { - paths, + sources, file_info, hive_parts, output_schema, @@ -224,7 +224,7 @@ impl SlicePushDown { options.slice = Some((0, state.len as usize)); let lp = Scan { - paths, + sources, file_info, hive_parts, output_schema, diff --git a/crates/polars-plan/src/plans/visitor/hash.rs b/crates/polars-plan/src/plans/visitor/hash.rs index 80c251108297..7087122802ea 100644 --- a/crates/polars-plan/src/plans/visitor/hash.rs +++ b/crates/polars-plan/src/plans/visitor/hash.rs @@ -74,7 +74,7 @@ impl Hash for HashableEqLP<'_> { predicate.traverse_and_hash(self.expr_arena, state); }, IR::Scan { - paths, + sources, file_info: _, hive_parts: _, predicate, @@ -84,7 +84,7 @@ impl Hash for HashableEqLP<'_> { } => { // We don't have to traverse the schema, hive partitions etc. as they are derivative from the paths. scan_type.hash(state); - paths.hash(state); + sources.hash(state); hash_option_expr(predicate, self.expr_arena, state); file_options.hash(state); }, @@ -254,7 +254,7 @@ impl HashableEqLP<'_> { ) => expr_ir_eq(l, r, self.expr_arena), ( IR::Scan { - paths: pl, + sources: pl, file_info: _, hive_parts: _, predicate: pred_l, @@ -263,7 +263,7 @@ impl HashableEqLP<'_> { file_options: ol, }, IR::Scan { - paths: pr, + sources: pr, file_info: _, hive_parts: _, predicate: pred_r, @@ -272,7 +272,7 @@ impl HashableEqLP<'_> { file_options: or, }, ) => { - pl == pr + pl.as_paths() == pr.as_paths() && stl == str && ol == or && opt_expr_ir_eq(pred_l, pred_r, self.expr_arena) diff --git a/crates/polars-python/Cargo.toml b/crates/polars-python/Cargo.toml index 03178d684e34..9ed35648c89f 100644 --- a/crates/polars-python/Cargo.toml +++ b/crates/polars-python/Cargo.toml @@ -25,6 +25,7 @@ polars-stream = { workspace = true } ahash = { workspace = true } arboard = { workspace = true, optional = true } bytemuck = { workspace = true } 
+bytes = { workspace = true } ciborium = { workspace = true } either = { workspace = true } itoa = { workspace = true } @@ -121,6 +122,7 @@ json = ["polars/serde", "serde_json", "polars/json", "polars-utils/serde"] trigonometry = ["polars/trigonometry"] sign = ["polars/sign"] asof_join = ["polars/asof_join"] +iejoin = ["polars/iejoin"] cross_join = ["polars/cross_join"] pct_change = ["polars/pct_change"] repeat_by = ["polars/repeat_by"] diff --git a/crates/polars-python/src/conversion/mod.rs b/crates/polars-python/src/conversion/mod.rs index 8d5c96f3b58c..fd8e97cb7adc 100644 --- a/crates/polars-python/src/conversion/mod.rs +++ b/crates/polars-python/src/conversion/mod.rs @@ -2,7 +2,9 @@ pub(crate) mod any_value; pub(crate) mod chunked_array; mod datetime; use std::fmt::{Display, Formatter}; +use std::fs::File; use std::hash::{Hash, Hasher}; +use std::path::PathBuf; #[cfg(feature = "object")] use polars::chunked_array::object::PolarsObjectSafe; @@ -19,6 +21,7 @@ use polars_core::utils::materialize_dyn_int; use polars_lazy::prelude::*; #[cfg(feature = "parquet")] use polars_parquet::write::StatisticsOptions; +use polars_plan::plans::ScanSources; use polars_utils::pl_str::PlSmallStr; use polars_utils::total_ord::{TotalEq, TotalHash}; use pyo3::basic::CompareOp; @@ -29,6 +32,7 @@ use pyo3::pybacked::PyBackedStr; use pyo3::types::{PyDict, PyList, PySequence}; use crate::error::PyPolarsErr; +use crate::file::{get_python_scan_source_input, PythonScanSourceInput}; #[cfg(feature = "object")] use crate::object::OBJECT_NAME; use crate::prelude::*; @@ -528,6 +532,68 @@ impl<'py> FromPyObject<'py> for Wrap { } } +impl<'py> FromPyObject<'py> for Wrap { + fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult { + let list = ob.downcast::()?.to_owned(); + + if list.is_empty() { + return Ok(Wrap(ScanSources::default())); + } + + enum MutableSources { + Paths(Vec), + Files(Vec), + Buffers(Vec), + } + + let num_items = list.len(); + let mut iter = list + .into_iter() + .map(|val| 
get_python_scan_source_input(val.unbind(), false)); + + let Some(first) = iter.next() else { + return Ok(Wrap(ScanSources::default())); + }; + + let mut sources = match first? { + PythonScanSourceInput::Path(path) => { + let mut sources = Vec::with_capacity(num_items); + sources.push(path); + MutableSources::Paths(sources) + }, + PythonScanSourceInput::File(file) => { + let mut sources = Vec::with_capacity(num_items); + sources.push(file); + MutableSources::Files(sources) + }, + PythonScanSourceInput::Buffer(buffer) => { + let mut sources = Vec::with_capacity(num_items); + sources.push(buffer); + MutableSources::Buffers(sources) + }, + }; + + for source in iter { + match (&mut sources, source?) { + (MutableSources::Paths(v), PythonScanSourceInput::Path(p)) => v.push(p), + (MutableSources::Files(v), PythonScanSourceInput::File(f)) => v.push(f), + (MutableSources::Buffers(v), PythonScanSourceInput::Buffer(f)) => v.push(f), + _ => { + return Err(PyTypeError::new_err( + "Cannot combine in-memory bytes, paths and files for scan sources", + )) + }, + } + } + + Ok(Wrap(match sources { + MutableSources::Paths(i) => ScanSources::Paths(i.into()), + MutableSources::Files(i) => ScanSources::Files(i.into()), + MutableSources::Buffers(i) => ScanSources::Buffers(i.into()), + })) + } +} + impl IntoPy for Wrap<&Schema> { fn into_py(self, py: Python<'_>) -> PyObject { let dict = PyDict::new_bound(py); diff --git a/crates/polars-python/src/dataframe/io.rs b/crates/polars-python/src/dataframe/io.rs index 12707e93dd85..dbdf91ddff09 100644 --- a/crates/polars-python/src/dataframe/io.rs +++ b/crates/polars-python/src/dataframe/io.rs @@ -4,11 +4,11 @@ use std::sync::Arc; #[cfg(feature = "avro")] use polars::io::avro::AvroCompression; -use polars::io::mmap::ensure_not_mapped; use polars::io::RowIndex; use polars::prelude::*; #[cfg(feature = "parquet")] use polars_parquet::arrow::write::StatisticsOptions; +use polars_utils::mmap::ensure_not_mapped; use pyo3::prelude::*; use 
pyo3::pybacked::PyBackedStr; diff --git a/crates/polars-python/src/file.rs b/crates/polars-python/src/file.rs index bcdfd7ff6ee7..33d084c5130c 100644 --- a/crates/polars-python/src/file.rs +++ b/crates/polars-python/src/file.rs @@ -12,7 +12,7 @@ use polars::io::mmap::MmapBytesReader; use polars_error::{polars_err, polars_warn}; use pyo3::exceptions::PyTypeError; use pyo3::prelude::*; -use pyo3::types::{PyBytes, PyString}; +use pyo3::types::{PyBytes, PyString, PyStringMethods}; use crate::error::PyPolarsErr; use crate::prelude::resolve_homedir; @@ -31,6 +31,10 @@ impl PyFileLikeObject { PyFileLikeObject { inner: object } } + pub fn as_bytes(&self) -> bytes::Bytes { + self.as_file_buffer().into_inner().into() + } + pub fn as_buffer(&self) -> std::io::Cursor> { let data = self.as_file_buffer().into_inner(); std::io::Cursor::new(data) @@ -43,11 +47,19 @@ impl PyFileLikeObject { .call_method_bound(py, "read", (), None) .expect("no read method found"); - let bytes: &Bound<'_, PyBytes> = bytes - .downcast_bound(py) - .expect("Expecting to be able to downcast into bytes from read result."); + if let Ok(bytes) = bytes.downcast_bound::(py) { + return bytes.as_bytes().to_vec(); + } - bytes.as_bytes().to_vec() + if let Ok(bytes) = bytes.downcast_bound::(py) { + return bytes + .to_cow() + .expect("PyString is not valid UTF-8") + .into_owned() + .into_bytes(); + } + + panic!("Expecting to be able to downcast into bytes from read result."); }); Cursor::new(buf) @@ -191,7 +203,127 @@ impl EitherRustPythonFile { } } -fn get_either_file_and_path( +pub enum PythonScanSourceInput { + Buffer(bytes::Bytes), + Path(PathBuf), + File(File), +} + +pub fn get_python_scan_source_input( + py_f: PyObject, + write: bool, +) -> PyResult { + Python::with_gil(|py| { + let py_f = py_f.into_bound(py); + + // If the pyobject is a `bytes` class + if let Ok(bytes) = py_f.downcast::() { + return Ok(PythonScanSourceInput::Buffer( + bytes::Bytes::copy_from_slice(bytes.as_bytes()), + )); + } + + if let 
Ok(s) = py_f.extract::>() { + let file_path = std::path::Path::new(&*s); + let file_path = resolve_homedir(file_path); + Ok(PythonScanSourceInput::Path(file_path)) + } else { + let io = py.import_bound("io").unwrap(); + let is_utf8_encoding = |py_f: &Bound| -> PyResult { + let encoding = py_f.getattr("encoding")?; + let encoding = encoding.extract::>()?; + Ok(encoding.eq_ignore_ascii_case("utf-8") || encoding.eq_ignore_ascii_case("utf8")) + }; + + #[cfg(target_family = "unix")] + if let Some(fd) = (py_f.is_exact_instance(&io.getattr("FileIO").unwrap()) + || (py_f.is_exact_instance(&io.getattr("BufferedReader").unwrap()) + || py_f.is_exact_instance(&io.getattr("BufferedWriter").unwrap()) + || py_f.is_exact_instance(&io.getattr("BufferedRandom").unwrap()) + || py_f.is_exact_instance(&io.getattr("BufferedRWPair").unwrap()) + || (py_f.is_exact_instance(&io.getattr("TextIOWrapper").unwrap()) + && is_utf8_encoding(&py_f)?)) + && if write { + // invalidate read buffer + py_f.call_method0("flush").is_ok() + } else { + // flush write buffer + py_f.call_method1("seek", (0, 1)).is_ok() + }) + .then(|| { + py_f.getattr("fileno") + .and_then(|fileno| fileno.call0()) + .and_then(|fileno| fileno.extract::()) + .ok() + }) + .flatten() + .map(|fileno| unsafe { + // `File::from_raw_fd()` takes the ownership of the file descriptor. + // When the File is dropped, it closes the file descriptor. + // This is undesired - the Python file object will become invalid. + // Therefore, we duplicate the file descriptor here. + // Closing the duplicated file descriptor will not close + // the original file descriptor; + // and the status, e.g. stream position, is still shared with + // the original file descriptor. + // We use `F_DUPFD_CLOEXEC` here instead of `dup()` + // because it also sets the `O_CLOEXEC` flag on the duplicated file descriptor, + // which `dup()` clears. 
+ // `open()` in both Rust and Python automatically set `O_CLOEXEC` flag; + // it prevents leaking file descriptors across processes, + // and we want to be consistent with them. + // `F_DUPFD_CLOEXEC` is defined in POSIX.1-2008 + // and is present on all alive UNIX(-like) systems. + libc::fcntl(fileno, libc::F_DUPFD_CLOEXEC, 0) + }) + .filter(|fileno| *fileno != -1) + .map(|fileno| fileno as RawFd) + { + return Ok(PythonScanSourceInput::File(unsafe { + File::from_raw_fd(fd) + })); + } + + // BytesIO / StringIO is relatively fast, and some code relies on it. + if !py_f.is_exact_instance(&io.getattr("BytesIO").unwrap()) + && !py_f.is_exact_instance(&io.getattr("StringIO").unwrap()) + { + polars_warn!("Polars found a filename. \ + Ensure you pass a path to the file instead of a python file object when possible for best \ + performance."); + } + // Unwrap TextIOWrapper + // Allow subclasses to allow things like pytest.capture.CaptureIO + let py_f = if py_f + .is_instance(&io.getattr("TextIOWrapper").unwrap()) + .unwrap_or_default() + { + if !is_utf8_encoding(&py_f)? { + return Err(PyPolarsErr::from( + polars_err!(InvalidOperation: "file encoding is not UTF-8"), + ) + .into()); + } + // XXX: we have to clear buffer here. + // Is there a better solution? + if write { + py_f.call_method0("flush")?; + } else { + py_f.call_method1("seek", (0, 1))?; + } + py_f.getattr("buffer")? + } else { + py_f + }; + PyFileLikeObject::ensure_requirements(&py_f, !write, write, !write)?; + Ok(PythonScanSourceInput::Buffer( + PyFileLikeObject::new(py_f.to_object(py)).as_bytes(), + )) + } + }) +} + +fn get_either_buffer_or_path( py_f: PyObject, write: bool, ) -> PyResult<(EitherRustPythonFile, Option)> { @@ -265,8 +397,10 @@ fn get_either_file_and_path( )); } - // BytesIO is relatively fast, and some code relies on it. - if !py_f.is_exact_instance(&io.getattr("BytesIO").unwrap()) { + // BytesIO / StringIO is relatively fast, and some code relies on it. 
+ if !py_f.is_exact_instance(&io.getattr("BytesIO").unwrap()) + && !py_f.is_exact_instance(&io.getattr("StringIO").unwrap()) + { polars_warn!("Polars found a filename. \ Ensure you pass a path to the file instead of a python file object when possible for best \ performance."); @@ -305,7 +439,7 @@ fn get_either_file_and_path( /// # Arguments /// * `write` - open for writing; will truncate existing file and create new file if not. pub fn get_either_file(py_f: PyObject, write: bool) -> PyResult { - Ok(get_either_file_and_path(py_f, write)?.0) + Ok(get_either_buffer_or_path(py_f, write)?.0) } pub fn get_file_like(f: PyObject, truncate: bool) -> PyResult> { @@ -342,7 +476,7 @@ pub fn get_mmap_bytes_reader_and_path<'a>( } // string so read file else { - match get_either_file_and_path(py_f.to_object(py_f.py()), false)? { + match get_either_buffer_or_path(py_f.to_object(py_f.py()), false)? { (EitherRustPythonFile::Rust(f), path) => Ok((Box::new(f), path)), (EitherRustPythonFile::Py(f), path) => Ok((Box::new(f), path)), } diff --git a/crates/polars-python/src/functions/lazy.rs b/crates/polars-python/src/functions/lazy.rs index c0f9d0f7152a..108aaf2121b1 100644 --- a/crates/polars-python/src/functions/lazy.rs +++ b/crates/polars-python/src/functions/lazy.rs @@ -436,18 +436,31 @@ pub fn lit(value: &Bound<'_, PyAny>, allow_object: bool) -> PyResult { ) { let av = py_object_to_any_value(value, true)?; Ok(Expr::Literal(LiteralValue::try_from(av).unwrap()).into()) - } else if allow_object { - let s = Python::with_gil(|py| { - PySeries::new_object(py, "", vec![ObjectValue::from(value.into_py(py))], false).series - }); - Ok(dsl::lit(s).into()) } else { - Err(PyTypeError::new_err(format!( - "cannot create expression literal for value of type {}: {}\ - \n\nHint: Pass `allow_object=True` to accept any value and create a literal of type Object.", - value.get_type().qualname()?, - value.repr()? - ))) + Python::with_gil(|py| { + // One final attempt before erroring. 
Do we have a date/datetime subclass? + // E.g. pd.Timestamp, or Freezegun. + let datetime_module = PyModule::import_bound(py, "datetime")?; + let datetime_class = datetime_module.getattr("datetime")?; + let date_class = datetime_module.getattr("date")?; + if value.is_instance(&datetime_class)? || value.is_instance(&date_class)? { + let av = py_object_to_any_value(value, true)?; + Ok(Expr::Literal(LiteralValue::try_from(av).unwrap()).into()) + } else if allow_object { + let s = Python::with_gil(|py| { + PySeries::new_object(py, "", vec![ObjectValue::from(value.into_py(py))], false) + .series + }); + Ok(dsl::lit(s).into()) + } else { + Err(PyTypeError::new_err(format!( + "cannot create expression literal for value of type {}: {}\ + \n\nHint: Pass `allow_object=True` to accept any value and create a literal of type Object.", + value.get_type().qualname()?, + value.repr()? + ))) + } + }) } } diff --git a/crates/polars-python/src/lazyframe/general.rs b/crates/polars-python/src/lazyframe/general.rs index 83e5da4f12dc..86bcd3c2566b 100644 --- a/crates/polars-python/src/lazyframe/general.rs +++ b/crates/polars-python/src/lazyframe/general.rs @@ -7,7 +7,7 @@ use polars::time::*; use polars_core::prelude::*; #[cfg(feature = "parquet")] use polars_parquet::arrow::write::StatisticsOptions; -use pyo3::exceptions::PyValueError; +use polars_plan::plans::ScanSources; use pyo3::prelude::*; use pyo3::pybacked::PyBackedStr; use pyo3::types::{PyDict, PyList}; @@ -20,6 +20,19 @@ use crate::lazyframe::visit::NodeTraverser; use crate::prelude::*; use crate::{PyDataFrame, PyExpr, PyLazyGroupBy}; +fn pyobject_to_first_path_and_scan_sources( + obj: PyObject, +) -> PyResult<(Option, ScanSources)> { + use crate::file::{get_python_scan_source_input, PythonScanSourceInput}; + Ok(match get_python_scan_source_input(obj, false)? 
{ + PythonScanSourceInput::Path(path) => { + (Some(path.clone()), ScanSources::Paths([path].into())) + }, + PythonScanSourceInput::File(file) => (None, ScanSources::Files([file].into())), + PythonScanSourceInput::Buffer(buff) => (None, ScanSources::Buffers([buff].into())), + }) +} + #[pymethods] #[allow(clippy::should_implement_trait)] impl PyLazyFrame { @@ -27,12 +40,12 @@ impl PyLazyFrame { #[cfg(feature = "json")] #[allow(clippy::too_many_arguments)] #[pyo3(signature = ( - path, paths, infer_schema_length, schema, schema_overrides, batch_size, n_rows, low_memory, rechunk, + source, sources, infer_schema_length, schema, schema_overrides, batch_size, n_rows, low_memory, rechunk, row_index, ignore_errors, include_file_paths, cloud_options, retries, file_cache_ttl ))] fn new_from_ndjson( - path: Option, - paths: Vec, + source: Option, + sources: Wrap, infer_schema_length: Option, schema: Option>, schema_overrides: Option>, @@ -52,37 +65,27 @@ impl PyLazyFrame { offset, }); - #[cfg(feature = "cloud")] - let cloud_options = { - let first_path = if let Some(path) = &path { - path - } else { - paths - .first() - .ok_or_else(|| PyValueError::new_err("expected a path argument"))? - }; + let sources = sources.0; + let (first_path, sources) = match source { + None => (sources.first_path().map(|p| p.to_path_buf()), sources), + Some(source) => pyobject_to_first_path_and_scan_sources(source)?, + }; - let first_path_url = first_path.to_string_lossy(); + let mut r = LazyJsonLineReader::new_with_sources(sources); - let mut cloud_options = if let Some(opts) = cloud_options { - parse_cloud_options(&first_path_url, opts)? - } else { - parse_cloud_options(&first_path_url, vec![])? 
- }; + #[cfg(feature = "cloud")] + if let Some(first_path) = first_path { + let first_path_url = first_path.to_string_lossy(); + let mut cloud_options = + parse_cloud_options(&first_path_url, cloud_options.unwrap_or_default())?; cloud_options = cloud_options.with_max_retries(retries); if let Some(file_cache_ttl) = file_cache_ttl { cloud_options.file_cache_ttl = file_cache_ttl; } - Some(cloud_options) - }; - - let r = if let Some(path) = &path { - LazyJsonLineReader::new(path) - } else { - LazyJsonLineReader::new_paths(paths.into()) + r = r.with_cloud_options(Some(cloud_options)); }; let lf = r @@ -96,7 +99,6 @@ impl PyLazyFrame { .with_row_index(row_index) .with_ignore_errors(ignore_errors) .with_include_file_paths(include_file_paths.map(|x| x.into())) - .with_cloud_options(cloud_options) .finish() .map_err(PyPolarsErr::from)?; @@ -105,7 +107,7 @@ impl PyLazyFrame { #[staticmethod] #[cfg(feature = "csv")] - #[pyo3(signature = (path, paths, separator, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, + #[pyo3(signature = (source, sources, separator, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, low_memory, comment_prefix, quote_char, null_values, missing_utf8_is_empty_string, infer_schema_length, with_schema_modify, rechunk, skip_rows_after_header, encoding, row_index, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines, decimal_comma, glob, schema, @@ -113,8 +115,8 @@ impl PyLazyFrame { ) )] fn new_from_csv( - path: Option, - paths: Vec, + source: Option, + sources: Wrap, separator: &str, has_header: bool, ignore_errors: bool, @@ -161,38 +163,26 @@ impl PyLazyFrame { .collect::() }); - #[cfg(feature = "cloud")] - let cloud_options = { - let first_path = if let Some(path) = &path { - path - } else { - paths - .first() - .ok_or_else(|| PyValueError::new_err("expected a path argument"))? 
- }; - - let first_path_url = first_path.to_string_lossy(); + let sources = sources.0; + let (first_path, sources) = match source { + None => (sources.first_path().map(|p| p.to_path_buf()), sources), + Some(source) => pyobject_to_first_path_and_scan_sources(source)?, + }; - let mut cloud_options = if let Some(opts) = cloud_options { - parse_cloud_options(&first_path_url, opts)? - } else { - parse_cloud_options(&first_path_url, vec![])? - }; + let mut r = LazyCsvReader::new_with_sources(sources); - cloud_options = cloud_options.with_max_retries(retries); + #[cfg(feature = "cloud")] + if let Some(first_path) = first_path { + let first_path_url = first_path.to_string_lossy(); + let mut cloud_options = + parse_cloud_options(&first_path_url, cloud_options.unwrap_or_default())?; if let Some(file_cache_ttl) = file_cache_ttl { cloud_options.file_cache_ttl = file_cache_ttl; } - - Some(cloud_options) - }; - - let r = if let Some(path) = path.as_ref() { - LazyCsvReader::new(path) - } else { - LazyCsvReader::new_paths(paths.into()) - }; + cloud_options = cloud_options.with_max_retries(retries); + r = r.with_cloud_options(Some(cloud_options)); + } let mut r = r .with_infer_schema_length(infer_schema_length) @@ -219,7 +209,6 @@ impl PyLazyFrame { .with_decimal_comma(decimal_comma) .with_glob(glob) .with_raise_if_empty(raise_if_empty) - .with_cloud_options(cloud_options) .with_include_file_paths(include_file_paths.map(|x| x.into())); if let Some(lambda) = with_schema_modify { @@ -250,12 +239,12 @@ impl PyLazyFrame { #[cfg(feature = "parquet")] #[staticmethod] - #[pyo3(signature = (path, paths, n_rows, cache, parallel, rechunk, row_index, + #[pyo3(signature = (source, sources, n_rows, cache, parallel, rechunk, row_index, low_memory, cloud_options, use_statistics, hive_partitioning, hive_schema, try_parse_hive_dates, retries, glob, include_file_paths) )] fn new_from_parquet( - path: Option, - paths: Vec, + source: Option, + sources: Wrap, n_rows: Option, cache: bool, parallel: 
Wrap, @@ -274,33 +263,11 @@ impl PyLazyFrame { let parallel = parallel.0; let hive_schema = hive_schema.map(|s| Arc::new(s.0)); - let first_path = if let Some(path) = &path { - path - } else { - paths - .first() - .ok_or_else(|| PyValueError::new_err("expected a path argument"))? - }; - - #[cfg(feature = "cloud")] - let cloud_options = { - let first_path_url = first_path.to_string_lossy(); - - let mut cloud_options = if let Some(opts) = cloud_options { - parse_cloud_options(&first_path_url, opts)? - } else { - parse_cloud_options(&first_path_url, vec![])? - }; - - cloud_options = cloud_options.with_max_retries(retries); - - Some(cloud_options) - }; - let row_index = row_index.map(|(name, offset)| RowIndex { name: name.into(), offset, }); + let hive_options = HiveOptions { enabled: hive_partitioning, hive_start_idx: 0, @@ -308,40 +275,49 @@ impl PyLazyFrame { try_parse_dates: try_parse_hive_dates, }; - let args = ScanArgsParquet { + let mut args = ScanArgsParquet { n_rows, cache, parallel, rechunk, row_index, low_memory, - cloud_options, + cloud_options: None, use_statistics, hive_options, glob, include_file_paths: include_file_paths.map(|x| x.into()), }; - let lf = if path.is_some() { - LazyFrame::scan_parquet(first_path, args) - } else { - LazyFrame::scan_parquet_files(Arc::from(paths), args) + let sources = sources.0; + let (first_path, sources) = match source { + None => (sources.first_path().map(|p| p.to_path_buf()), sources), + Some(source) => pyobject_to_first_path_and_scan_sources(source)?, + }; + + #[cfg(feature = "cloud")] + if let Some(first_path) = first_path { + let first_path_url = first_path.to_string_lossy(); + let cloud_options = + parse_cloud_options(&first_path_url, cloud_options.unwrap_or_default())?; + args.cloud_options = Some(cloud_options.with_max_retries(retries)); } - .map_err(PyPolarsErr::from)?; + + let lf = LazyFrame::scan_parquet_sources(sources, args).map_err(PyPolarsErr::from)?; + Ok(lf.into()) } #[cfg(feature = "ipc")] 
#[staticmethod] - #[pyo3(signature = (path, paths, n_rows, cache, rechunk, row_index, memory_map, cloud_options, hive_partitioning, hive_schema, try_parse_hive_dates, retries, file_cache_ttl, include_file_paths))] + #[pyo3(signature = (source, sources, n_rows, cache, rechunk, row_index, cloud_options, hive_partitioning, hive_schema, try_parse_hive_dates, retries, file_cache_ttl, include_file_paths))] fn new_from_ipc( - path: Option, - paths: Vec, + source: Option, + sources: Wrap, n_rows: Option, cache: bool, rechunk: bool, row_index: Option<(String, IdxSize)>, - memory_map: bool, cloud_options: Option>, hive_partitioning: Option, hive_schema: Option>, @@ -355,33 +331,6 @@ impl PyLazyFrame { offset, }); - #[cfg(feature = "cloud")] - let cloud_options = { - let first_path = if let Some(path) = &path { - path - } else { - paths - .first() - .ok_or_else(|| PyValueError::new_err("expected a path argument"))? - }; - - let first_path_url = first_path.to_string_lossy(); - - let mut cloud_options = if let Some(opts) = cloud_options { - parse_cloud_options(&first_path_url, opts)? - } else { - parse_cloud_options(&first_path_url, vec![])? 
- }; - - cloud_options = cloud_options.with_max_retries(retries); - - if let Some(file_cache_ttl) = file_cache_ttl { - cloud_options.file_cache_ttl = file_cache_ttl; - } - - Some(cloud_options) - }; - let hive_options = HiveOptions { enabled: hive_partitioning, hive_start_idx: 0, @@ -389,24 +338,36 @@ impl PyLazyFrame { try_parse_dates: try_parse_hive_dates, }; - let args = ScanArgsIpc { + let mut args = ScanArgsIpc { n_rows, cache, rechunk, row_index, - memory_map, #[cfg(feature = "cloud")] - cloud_options, + cloud_options: None, hive_options, include_file_paths: include_file_paths.map(|x| x.into()), }; - let lf = if let Some(path) = &path { - LazyFrame::scan_ipc(path, args) - } else { - LazyFrame::scan_ipc_files(paths.into(), args) + let sources = sources.0; + let (first_path, sources) = match source { + None => (sources.first_path().map(|p| p.to_path_buf()), sources), + Some(source) => pyobject_to_first_path_and_scan_sources(source)?, + }; + + #[cfg(feature = "cloud")] + if let Some(first_path) = first_path { + let first_path_url = first_path.to_string_lossy(); + + let mut cloud_options = + parse_cloud_options(&first_path_url, cloud_options.unwrap_or_default())?; + if let Some(file_cache_ttl) = file_cache_ttl { + cloud_options.file_cache_ttl = file_cache_ttl; + } + args.cloud_options = Some(cloud_options.with_max_retries(retries)); } - .map_err(PyPolarsErr::from)?; + + let lf = LazyFrame::scan_ipc_sources(sources, args).map_err(PyPolarsErr::from)?; Ok(lf.into()) } @@ -488,10 +449,14 @@ impl PyLazyFrame { .with_simplify_expr(simplify_expression) .with_slice_pushdown(slice_pushdown) .with_cluster_with_columns(cluster_with_columns) - .with_streaming(streaming) ._with_eager(_eager) .with_projection_pushdown(projection_pushdown); + #[cfg(feature = "streaming")] + { + ldf = ldf.with_streaming(streaming); + } + #[cfg(feature = "new_streaming")] { ldf = ldf.with_new_streaming(new_streaming); @@ -969,6 +934,20 @@ impl PyLazyFrame { .into()) } + fn join_where(&self, 
other: Self, predicates: Vec, suffix: String) -> PyResult { + let ldf = self.ldf.clone(); + let other = other.ldf; + + let predicates = predicates.to_exprs(); + + Ok(ldf + .join_builder() + .with(other) + .suffix(suffix) + .join_where(predicates) + .into()) + } + fn with_columns(&mut self, exprs: Vec) -> Self { let ldf = self.ldf.clone(); ldf.with_columns(exprs.to_exprs()).into() diff --git a/crates/polars-python/src/lazyframe/visitor/nodes.rs b/crates/polars-python/src/lazyframe/visitor/nodes.rs index 1bea0ce71865..d8dbb71281bc 100644 --- a/crates/polars-python/src/lazyframe/visitor/nodes.rs +++ b/crates/polars-python/src/lazyframe/visitor/nodes.rs @@ -317,7 +317,7 @@ pub(crate) fn into_py(py: Python<'_>, plan: &IR) -> PyResult { )) }, IR::Scan { - paths, + sources, file_info: _, hive_parts: _, predicate, @@ -325,7 +325,10 @@ pub(crate) fn into_py(py: Python<'_>, plan: &IR) -> PyResult { scan_type, file_options, } => Scan { - paths: paths.to_object(py), + paths: sources + .into_paths() + .ok_or_else(|| PyNotImplementedError::new_err("scan with BytesIO"))? 
+ .to_object(py), // TODO: file info file_info: py.None(), predicate: predicate.as_ref().map(|e| e.into()), @@ -478,6 +481,8 @@ pub(crate) fn into_py(py: Python<'_>, plan: &IR) -> PyResult { JoinType::Cross => "cross", JoinType::Semi => "leftsemi", JoinType::Anti => "leftanti", + #[cfg(feature = "iejoin")] + JoinType::IEJoin(_) => return Err(PyNotImplementedError::new_err("IEJoin")), }, options.args.join_nulls, options.args.slice, @@ -595,7 +600,7 @@ pub(crate) fn into_py(py: Python<'_>, plan: &IR) -> PyResult { offset, } => ("row_index", name.to_string(), offset.unwrap_or(0)).to_object(py), FunctionIR::FastCount { - paths: _, + sources: _, scan_type: _, alias: _, } => return Err(PyNotImplementedError::new_err("function count")), diff --git a/crates/polars-schema/src/schema.rs b/crates/polars-schema/src/schema.rs index b927a88f3b63..3f03bdffde24 100644 --- a/crates/polars-schema/src/schema.rs +++ b/crates/polars-schema/src/schema.rs @@ -14,10 +14,7 @@ pub struct Schema { impl Eq for Schema {} -impl Schema -where - D: Clone + Default, -{ +impl Schema { pub fn with_capacity(capacity: usize) -> Self { let fields = PlIndexMap::with_capacity(capacity); Self { fields } @@ -59,42 +56,6 @@ where self.fields.insert(key, value) } - /// Create a new schema from this one, inserting a field with `name` and `dtype` at the given `index`. - /// - /// If a field named `name` already exists, it is updated with the new dtype. Regardless, the field named `name` is - /// always moved to the given index. Valid indices range from `0` (front of the schema) to `self.len()` (after the - /// end of the schema). - /// - /// For a mutating version that doesn't clone, see [`insert_at_index`][Self::insert_at_index]. - /// - /// Runtime: **O(m * n)** where `m` is the (average) length of the field names and `n` is the number of fields in - /// the schema. This method clones every field in the schema. 
- /// - /// Returns: `Ok(new_schema)` if `index <= self.len()`, else `Err(PolarsError)` - pub fn new_inserting_at_index( - &self, - index: usize, - name: PlSmallStr, - field: D, - ) -> PolarsResult { - polars_ensure!( - index <= self.len(), - OutOfBounds: - "index {} is out of bounds for schema with length {} (the max index allowed is self.len())", - index, - self.len() - ); - - let mut new = Self::default(); - let mut iter = self.fields.iter().filter_map(|(fld_name, dtype)| { - (fld_name != &name).then_some((fld_name.clone(), dtype.clone())) - }); - new.fields.extend(iter.by_ref().take(index)); - new.fields.insert(name.clone(), field); - new.fields.extend(iter); - Ok(new) - } - /// Insert a field with `name` and `dtype` at the given `index` into this schema. /// /// If a field named `name` already exists, it is updated with the new dtype. Regardless, the field named `name` is @@ -275,21 +236,6 @@ where self.fields.extend(other.fields) } - /// Merge borrowed `other` into `self`. - /// - /// Merging logic: - /// - Fields that occur in `self` but not `other` are unmodified - /// - Fields that occur in `other` but not `self` are appended, in order, to the end of `self` - /// - Fields that occur in both `self` and `other` are updated with the dtype from `other`, but keep their original - /// index - pub fn merge_from_ref(&mut self, other: &Self) { - self.fields.extend( - other - .iter() - .map(|(column, field)| (column.clone(), field.clone())), - ) - } - /// Iterates over the `(&name, &dtype)` pairs in this schema. /// /// For an owned version, use [`iter_fields`][Self::iter_fields], which clones the data to iterate owned `Field`s @@ -306,6 +252,10 @@ where self.fields.iter().map(|(name, _dtype)| name) } + pub fn iter_names_cloned(&self) -> impl '_ + ExactSizeIterator { + self.iter_names().cloned() + } + /// Iterates over references to the dtypes in this schema. 
pub fn iter_values(&self) -> impl '_ + ExactSizeIterator { self.fields.iter().map(|(_name, dtype)| dtype) @@ -324,6 +274,74 @@ where self.fields.get_index_of(name) } + pub fn try_index_of(&self, name: &str) -> PolarsResult { + let Some(i) = self.fields.get_index_of(name) else { + polars_bail!( + ColumnNotFound: + "unable to find column {:?}; valid columns: {:?}", + name, self.iter_names().collect::>(), + ) + }; + + Ok(i) + } +} + +impl Schema +where + D: Clone + Default, +{ + /// Create a new schema from this one, inserting a field with `name` and `dtype` at the given `index`. + /// + /// If a field named `name` already exists, it is updated with the new dtype. Regardless, the field named `name` is + /// always moved to the given index. Valid indices range from `0` (front of the schema) to `self.len()` (after the + /// end of the schema). + /// + /// For a mutating version that doesn't clone, see [`insert_at_index`][Self::insert_at_index]. + /// + /// Runtime: **O(m * n)** where `m` is the (average) length of the field names and `n` is the number of fields in + /// the schema. This method clones every field in the schema. + /// + /// Returns: `Ok(new_schema)` if `index <= self.len()`, else `Err(PolarsError)` + pub fn new_inserting_at_index( + &self, + index: usize, + name: PlSmallStr, + field: D, + ) -> PolarsResult { + polars_ensure!( + index <= self.len(), + OutOfBounds: + "index {} is out of bounds for schema with length {} (the max index allowed is self.len())", + index, + self.len() + ); + + let mut new = Self::default(); + let mut iter = self.fields.iter().filter_map(|(fld_name, dtype)| { + (fld_name != &name).then_some((fld_name.clone(), dtype.clone())) + }); + new.fields.extend(iter.by_ref().take(index)); + new.fields.insert(name.clone(), field); + new.fields.extend(iter); + Ok(new) + } + + /// Merge borrowed `other` into `self`. 
+ /// + /// Merging logic: + /// - Fields that occur in `self` but not `other` are unmodified + /// - Fields that occur in `other` but not `self` are appended, in order, to the end of `self` + /// - Fields that occur in both `self` and `other` are updated with the dtype from `other`, but keep their original + /// index + pub fn merge_from_ref(&mut self, other: &Self) { + self.fields.extend( + other + .iter() + .map(|(column, field)| (column.clone(), field.clone())), + ) + } + /// Generates another schema with just the specified columns selected from this one. pub fn try_project(&self, columns: I) -> PolarsResult where @@ -421,7 +439,6 @@ impl From> for Schema { impl FromIterator for Schema where F: Into<(PlSmallStr, D)>, - D: Clone, { fn from_iter>(iter: I) -> Self { let fields = PlIndexMap::from_iter(iter.into_iter().map(|x| x.into())); diff --git a/crates/polars-sql/src/context.rs b/crates/polars-sql/src/context.rs index 58120da0002c..23ffb25070fa 100644 --- a/crates/polars-sql/src/context.rs +++ b/crates/polars-sql/src/context.rs @@ -847,8 +847,28 @@ impl SQLContext { expr: &Option, ) -> PolarsResult { if let Some(expr) = expr { - let schema = Some(self.get_frame_schema(&mut lf)?); - let mut filter_expression = parse_sql_expr(expr, self, schema.as_deref())?; + let schema = self.get_frame_schema(&mut lf)?; + + // shortcut filter evaluation if given expression is just TRUE or FALSE + let (all_true, all_false) = match expr { + SQLExpr::Value(SQLValue::Boolean(b)) => (*b, !*b), + SQLExpr::BinaryOp { left, op, right } => match (&**left, &**right, op) { + (SQLExpr::Value(a), SQLExpr::Value(b), BinaryOperator::Eq) => (a == b, a != b), + (SQLExpr::Value(a), SQLExpr::Value(b), BinaryOperator::NotEq) => { + (a != b, a == b) + }, + _ => (false, false), + }, + _ => (false, false), + }; + if all_true { + return Ok(lf); + } else if all_false { + return Ok(DataFrame::empty_with_schema(schema.as_ref()).lazy()); + } + + // ...otherwise parse and apply the filter as normal + let 
mut filter_expression = parse_sql_expr(expr, self, Some(schema).as_deref())?; if filter_expression.clone().meta().has_multiple_outputs() { filter_expression = all_horizontal([filter_expression])?; } diff --git a/crates/polars-sql/src/function_registry.rs b/crates/polars-sql/src/function_registry.rs index c85f8307af73..aa693025b072 100644 --- a/crates/polars-sql/src/function_registry.rs +++ b/crates/polars-sql/src/function_registry.rs @@ -1,4 +1,4 @@ -//! This module defines the function registry and user defined functions. +//! This module defines a FunctionRegistry for supported SQL functions and UDFs. use polars_error::{polars_bail, PolarsResult}; use polars_plan::prelude::udf::UserDefinedFunction; diff --git a/crates/polars-sql/src/keywords.rs b/crates/polars-sql/src/keywords.rs index 1442a91cd89f..990bc046aa5b 100644 --- a/crates/polars-sql/src/keywords.rs +++ b/crates/polars-sql/src/keywords.rs @@ -1,10 +1,8 @@ -//! Keywords that are supported by Polars SQL -//! -//! This is useful for syntax highlighting +//! Keywords that are supported by the Polars SQL interface. //! //! This module defines: -//! - all Polars SQL keywords [`all_keywords`] -//! - all of polars SQL functions [`all_functions`] +//! - all recognised Polars SQL keywords [`all_keywords`] +//! - all recognised Polars SQL functions [`all_functions`] use crate::functions::PolarsSQLFunctions; use crate::table_functions::PolarsTableFunctions; diff --git a/crates/polars-sql/src/lib.rs b/crates/polars-sql/src/lib.rs index a811a4cfad9b..528f21eafaf2 100644 --- a/crates/polars-sql/src/lib.rs +++ b/crates/polars-sql/src/lib.rs @@ -7,6 +7,7 @@ mod functions; pub mod keywords; mod sql_expr; mod table_functions; +mod types; pub use context::SQLContext; pub use sql_expr::sql_expr; diff --git a/crates/polars-sql/src/sql_expr.rs b/crates/polars-sql/src/sql_expr.rs index 6d18a75c42fe..148a7fe5735e 100644 --- a/crates/polars-sql/src/sql_expr.rs +++ b/crates/polars-sql/src/sql_expr.rs @@ -1,3 +1,11 @@ +//! 
Expressions that are supported by the Polars SQL interface. +//! +//! This is useful for syntax highlighting +//! +//! This module defines: +//! - all Polars SQL keywords [`all_keywords`] +//! - all of polars SQL functions [`all_functions`] + use std::fmt::Display; use std::ops::Div; @@ -9,216 +17,39 @@ use polars_plan::prelude::LiteralValue::Null; use polars_time::Duration; use rand::distributions::Alphanumeric; use rand::{thread_rng, Rng}; -use regex::{Regex, RegexBuilder}; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; -#[cfg(feature = "dtype-decimal")] -use sqlparser::ast::ExactNumberInfo; use sqlparser::ast::{ - ArrayElemTypeDef, BinaryOperator as SQLBinaryOperator, BinaryOperator, CastFormat, CastKind, + BinaryOperator as SQLBinaryOperator, BinaryOperator, CastFormat, CastKind, DataType as SQLDataType, DateTimeField, Expr as SQLExpr, Function as SQLFunction, Ident, - Interval, ObjectName, Query as Subquery, SelectItem, Subscript, TimezoneInfo, TrimWhereField, + Interval, Query as Subquery, SelectItem, Subscript, TimezoneInfo, TrimWhereField, UnaryOperator, Value as SQLValue, }; use sqlparser::dialect::GenericDialect; use sqlparser::parser::{Parser, ParserOptions}; use crate::functions::SQLFunctionVisitor; +use crate::types::{ + bitstring_to_bytes_literal, is_iso_date, is_iso_datetime, is_iso_time, map_sql_dtype_to_polars, +}; use crate::SQLContext; -static DATETIME_LITERAL_RE: std::sync::OnceLock = std::sync::OnceLock::new(); -static DATE_LITERAL_RE: std::sync::OnceLock = std::sync::OnceLock::new(); -static TIME_LITERAL_RE: std::sync::OnceLock = std::sync::OnceLock::new(); - -fn is_iso_datetime(value: &str) -> bool { - let dtm_regex = DATETIME_LITERAL_RE.get_or_init(|| { - RegexBuilder::new( - r"^\d{4}-[01]\d-[0-3]\d[ T](?:[01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9](\.\d{1,9})?$", - ) - .build() - .unwrap() - }); - dtm_regex.is_match(value) -} - -fn is_iso_date(value: &str) -> bool { - let dt_regex = DATE_LITERAL_RE.get_or_init(|| { - 
RegexBuilder::new(r"^\d{4}-[01]\d-[0-3]\d$") - .build() - .unwrap() - }); - dt_regex.is_match(value) -} - -fn is_iso_time(value: &str) -> bool { - let tm_regex = TIME_LITERAL_RE.get_or_init(|| { - RegexBuilder::new(r"^(?:[01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9](\.\d{1,9})?$") - .build() - .unwrap() - }); - tm_regex.is_match(value) -} - #[inline] #[cold] #[must_use] +/// Convert a Display-able error to PolarsError::SQLInterface pub fn to_sql_interface_err(err: impl Display) -> PolarsError { PolarsError::SQLInterface(err.to_string().into()) } -fn timeunit_from_precision(prec: &Option) -> PolarsResult { - Ok(match prec { - None => TimeUnit::Microseconds, - Some(n) if (1u64..=3u64).contains(n) => TimeUnit::Milliseconds, - Some(n) if (4u64..=6u64).contains(n) => TimeUnit::Microseconds, - Some(n) if (7u64..=9u64).contains(n) => TimeUnit::Nanoseconds, - Some(n) => { - polars_bail!(SQLSyntax: "invalid temporal type precision (expected 1-9, found {})", n) - }, - }) -} - -pub(crate) fn map_sql_polars_datatype(dtype: &SQLDataType) -> PolarsResult { - Ok(match dtype { - // --------------------------------- - // array/list - // --------------------------------- - SQLDataType::Array(ArrayElemTypeDef::AngleBracket(inner_type)) - | SQLDataType::Array(ArrayElemTypeDef::SquareBracket(inner_type, _)) => { - DataType::List(Box::new(map_sql_polars_datatype(inner_type)?)) - }, - - // --------------------------------- - // binary - // --------------------------------- - SQLDataType::Bytea - | SQLDataType::Bytes(_) - | SQLDataType::Binary(_) - | SQLDataType::Blob(_) - | SQLDataType::Varbinary(_) => DataType::Binary, - - // --------------------------------- - // boolean - // --------------------------------- - SQLDataType::Boolean | SQLDataType::Bool => DataType::Boolean, - - // --------------------------------- - // signed integer - // --------------------------------- - SQLDataType::Int(_) | SQLDataType::Integer(_) => DataType::Int32, - SQLDataType::Int2(_) | SQLDataType::SmallInt(_) => 
DataType::Int16, - SQLDataType::Int4(_) | SQLDataType::MediumInt(_) => DataType::Int32, - SQLDataType::Int8(_) | SQLDataType::BigInt(_) => DataType::Int64, - SQLDataType::TinyInt(_) => DataType::Int8, - - // --------------------------------- - // unsigned integer: the following do not map to PostgreSQL types/syntax, but - // are enabled for wider compatibility (eg: "CAST(col AS BIGINT UNSIGNED)"). - // --------------------------------- - SQLDataType::UnsignedTinyInt(_) => DataType::UInt8, // see also: "custom" types below - SQLDataType::UnsignedInt(_) | SQLDataType::UnsignedInteger(_) => DataType::UInt32, - SQLDataType::UnsignedInt2(_) | SQLDataType::UnsignedSmallInt(_) => DataType::UInt16, - SQLDataType::UnsignedInt4(_) | SQLDataType::UnsignedMediumInt(_) => DataType::UInt32, - SQLDataType::UnsignedInt8(_) | SQLDataType::UnsignedBigInt(_) | SQLDataType::UInt8 => { - DataType::UInt64 - }, - - // --------------------------------- - // float - // --------------------------------- - SQLDataType::Double | SQLDataType::DoublePrecision | SQLDataType::Float8 => { - DataType::Float64 - }, - SQLDataType::Float(n_bytes) => match n_bytes { - Some(n) if (1u64..=24u64).contains(n) => DataType::Float32, - Some(n) if (25u64..=53u64).contains(n) => DataType::Float64, - Some(n) => { - polars_bail!(SQLSyntax: "unsupported `float` size (expected a value between 1 and 53, found {})", n) - }, - None => DataType::Float64, - }, - SQLDataType::Float4 | SQLDataType::Real => DataType::Float32, - - // --------------------------------- - // decimal - // --------------------------------- - #[cfg(feature = "dtype-decimal")] - SQLDataType::Dec(info) | SQLDataType::Decimal(info) | SQLDataType::Numeric(info) => { - match *info { - ExactNumberInfo::PrecisionAndScale(p, s) => { - DataType::Decimal(Some(p as usize), Some(s as usize)) - }, - ExactNumberInfo::Precision(p) => DataType::Decimal(Some(p as usize), Some(0)), - ExactNumberInfo::None => DataType::Decimal(Some(38), Some(9)), - } - }, - - // 
--------------------------------- - // temporal - // --------------------------------- - SQLDataType::Date => DataType::Date, - SQLDataType::Interval => DataType::Duration(TimeUnit::Microseconds), - SQLDataType::Time(_, tz) => match tz { - TimezoneInfo::None => DataType::Time, - _ => { - polars_bail!(SQLInterface: "`time` with timezone is not supported; found tz={}", tz) - }, - }, - SQLDataType::Datetime(prec) => DataType::Datetime(timeunit_from_precision(prec)?, None), - SQLDataType::Timestamp(prec, tz) => match tz { - TimezoneInfo::None => DataType::Datetime(timeunit_from_precision(prec)?, None), - _ => { - polars_bail!(SQLInterface: "`timestamp` with timezone is not (yet) supported") - }, - }, - - // --------------------------------- - // string - // --------------------------------- - SQLDataType::Char(_) - | SQLDataType::CharVarying(_) - | SQLDataType::Character(_) - | SQLDataType::CharacterVarying(_) - | SQLDataType::Clob(_) - | SQLDataType::String(_) - | SQLDataType::Text - | SQLDataType::Uuid - | SQLDataType::Varchar(_) => DataType::String, - - // --------------------------------- - // custom - // --------------------------------- - SQLDataType::Custom(ObjectName(idents), _) => match idents.as_slice() { - [Ident { value, .. }] => match value.to_lowercase().as_str() { - // these integer types are not supported by the PostgreSQL core distribution, - // but they ARE available via `pguint` (https://github.com/petere/pguint), an - // extension maintained by one of the PostgreSQL core developers. 
- "uint1" => DataType::UInt8, - "uint2" => DataType::UInt16, - "uint4" | "uint" => DataType::UInt32, - "uint8" => DataType::UInt64, - // `pguint` also provides a 1 byte (8bit) integer type alias - "int1" => DataType::Int8, - _ => { - polars_bail!(SQLInterface: "datatype {:?} is not currently supported", value) - }, - }, - _ => { - polars_bail!(SQLInterface: "datatype {:?} is not currently supported", idents) - }, - }, - _ => { - polars_bail!(SQLInterface: "datatype {:?} is not currently supported", dtype) - }, - }) -} - #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Clone, Copy, PartialEq, Debug, Eq, Hash)] +/// Categorises the type of (allowed) subquery constraint pub enum SubqueryRestriction { - // SingleValue, + /// Subquery must return a single column SingleColumn, // SingleRow, + // SingleValue, // Any } @@ -889,7 +720,7 @@ impl SQLExprVisitor<'_> { if dtype == &SQLDataType::JSON { return Ok(expr.str().json_decode(None, None)); } - let polars_type = map_sql_polars_datatype(dtype)?; + let polars_type = map_sql_dtype_to_polars(dtype)?; Ok(match cast_kind { CastKind::Cast | CastKind::DoubleColon => expr.strict_cast(polars_type), CastKind::TryCast | CastKind::SafeCast => expr.cast(polars_type), @@ -1319,24 +1150,6 @@ pub(crate) fn adjust_one_indexed_param(idx: Expr, null_if_zero: bool) -> Expr { } } -fn bitstring_to_bytes_literal(b: &String) -> PolarsResult { - let n_bits = b.len(); - if !b.chars().all(|c| c == '0' || c == '1') || n_bits > 64 { - polars_bail!( - SQLSyntax: - "bit string literal should contain only 0s and 1s and have length <= 64; found '{}' with length {}", b, n_bits - ) - } - let s = b.as_str(); - Ok(lit(match n_bits { - 0 => b"".to_vec(), - 1..=8 => u8::from_str_radix(s, 2).unwrap().to_be_bytes().to_vec(), - 9..=16 => u16::from_str_radix(s, 2).unwrap().to_be_bytes().to_vec(), - 17..=32 => u32::from_str_radix(s, 2).unwrap().to_be_bytes().to_vec(), - _ => u64::from_str_radix(s, 2).unwrap().to_be_bytes().to_vec(), - })) -} 
- pub(crate) fn resolve_compound_identifier( ctx: &mut SQLContext, idents: &[Ident], diff --git a/crates/polars-sql/src/types.rs b/crates/polars-sql/src/types.rs new file mode 100644 index 000000000000..800ead8c233e --- /dev/null +++ b/crates/polars-sql/src/types.rs @@ -0,0 +1,208 @@ +//! This module supports mapping SQL datatypes to Polars datatypes. +//! +//! It also provides utility functions for working with SQL datatypes. +use polars_core::datatypes::{DataType, TimeUnit}; +use polars_core::export::regex::{Regex, RegexBuilder}; +use polars_error::{polars_bail, PolarsResult}; +use polars_plan::dsl::{lit, Expr}; +use sqlparser::ast::{ + ArrayElemTypeDef, DataType as SQLDataType, ExactNumberInfo, Ident, ObjectName, TimezoneInfo, +}; + +static DATETIME_LITERAL_RE: std::sync::OnceLock = std::sync::OnceLock::new(); +static DATE_LITERAL_RE: std::sync::OnceLock = std::sync::OnceLock::new(); +static TIME_LITERAL_RE: std::sync::OnceLock = std::sync::OnceLock::new(); + +pub fn bitstring_to_bytes_literal(b: &String) -> PolarsResult { + let n_bits = b.len(); + if !b.chars().all(|c| c == '0' || c == '1') || n_bits > 64 { + polars_bail!( + SQLSyntax: + "bit string literal should contain only 0s and 1s and have length <= 64; found '{}' with length {}", b, n_bits + ) + } + let s = b.as_str(); + Ok(lit(match n_bits { + 0 => b"".to_vec(), + 1..=8 => u8::from_str_radix(s, 2).unwrap().to_be_bytes().to_vec(), + 9..=16 => u16::from_str_radix(s, 2).unwrap().to_be_bytes().to_vec(), + 17..=32 => u32::from_str_radix(s, 2).unwrap().to_be_bytes().to_vec(), + _ => u64::from_str_radix(s, 2).unwrap().to_be_bytes().to_vec(), + })) +} + +pub fn is_iso_datetime(value: &str) -> bool { + let dtm_regex = DATETIME_LITERAL_RE.get_or_init(|| { + RegexBuilder::new( + r"^\d{4}-[01]\d-[0-3]\d[ T](?:[01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9](\.\d{1,9})?$", + ) + .build() + .unwrap() + }); + dtm_regex.is_match(value) +} + +pub fn is_iso_date(value: &str) -> bool { + let dt_regex = 
DATE_LITERAL_RE.get_or_init(|| { + RegexBuilder::new(r"^\d{4}-[01]\d-[0-3]\d$") + .build() + .unwrap() + }); + dt_regex.is_match(value) +} + +pub fn is_iso_time(value: &str) -> bool { + let tm_regex = TIME_LITERAL_RE.get_or_init(|| { + RegexBuilder::new(r"^(?:[01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9](\.\d{1,9})?$") + .build() + .unwrap() + }); + tm_regex.is_match(value) +} + +fn timeunit_from_precision(prec: &Option) -> PolarsResult { + Ok(match prec { + None => TimeUnit::Microseconds, + Some(n) if (1u64..=3u64).contains(n) => TimeUnit::Milliseconds, + Some(n) if (4u64..=6u64).contains(n) => TimeUnit::Microseconds, + Some(n) if (7u64..=9u64).contains(n) => TimeUnit::Nanoseconds, + Some(n) => { + polars_bail!(SQLSyntax: "invalid temporal type precision (expected 1-9, found {})", n) + }, + }) +} + +pub(crate) fn map_sql_dtype_to_polars(dtype: &SQLDataType) -> PolarsResult { + Ok(match dtype { + // --------------------------------- + // array/list + // --------------------------------- + SQLDataType::Array(ArrayElemTypeDef::AngleBracket(inner_type)) + | SQLDataType::Array(ArrayElemTypeDef::SquareBracket(inner_type, _)) => { + DataType::List(Box::new(map_sql_dtype_to_polars(inner_type)?)) + }, + + // --------------------------------- + // binary + // --------------------------------- + SQLDataType::Bytea + | SQLDataType::Bytes(_) + | SQLDataType::Binary(_) + | SQLDataType::Blob(_) + | SQLDataType::Varbinary(_) => DataType::Binary, + + // --------------------------------- + // boolean + // --------------------------------- + SQLDataType::Boolean | SQLDataType::Bool => DataType::Boolean, + + // --------------------------------- + // signed integer + // --------------------------------- + SQLDataType::Int(_) | SQLDataType::Integer(_) => DataType::Int32, + SQLDataType::Int2(_) | SQLDataType::SmallInt(_) => DataType::Int16, + SQLDataType::Int4(_) | SQLDataType::MediumInt(_) => DataType::Int32, + SQLDataType::Int8(_) | SQLDataType::BigInt(_) => DataType::Int64, + 
SQLDataType::TinyInt(_) => DataType::Int8, + + // --------------------------------- + // unsigned integer: the following do not map to PostgreSQL types/syntax, but + // are enabled for wider compatibility (eg: "CAST(col AS BIGINT UNSIGNED)"). + // --------------------------------- + SQLDataType::UnsignedTinyInt(_) => DataType::UInt8, // see also: "custom" types below + SQLDataType::UnsignedInt(_) | SQLDataType::UnsignedInteger(_) => DataType::UInt32, + SQLDataType::UnsignedInt2(_) | SQLDataType::UnsignedSmallInt(_) => DataType::UInt16, + SQLDataType::UnsignedInt4(_) | SQLDataType::UnsignedMediumInt(_) => DataType::UInt32, + SQLDataType::UnsignedInt8(_) | SQLDataType::UnsignedBigInt(_) | SQLDataType::UInt8 => { + DataType::UInt64 + }, + + // --------------------------------- + // float + // --------------------------------- + SQLDataType::Double | SQLDataType::DoublePrecision | SQLDataType::Float8 => { + DataType::Float64 + }, + SQLDataType::Float(n_bytes) => match n_bytes { + Some(n) if (1u64..=24u64).contains(n) => DataType::Float32, + Some(n) if (25u64..=53u64).contains(n) => DataType::Float64, + Some(n) => { + polars_bail!(SQLSyntax: "unsupported `float` size (expected a value between 1 and 53, found {})", n) + }, + None => DataType::Float64, + }, + SQLDataType::Float4 | SQLDataType::Real => DataType::Float32, + + // --------------------------------- + // decimal + // --------------------------------- + #[cfg(feature = "dtype-decimal")] + SQLDataType::Dec(info) | SQLDataType::Decimal(info) | SQLDataType::Numeric(info) => { + match *info { + ExactNumberInfo::PrecisionAndScale(p, s) => { + DataType::Decimal(Some(p as usize), Some(s as usize)) + }, + ExactNumberInfo::Precision(p) => DataType::Decimal(Some(p as usize), Some(0)), + ExactNumberInfo::None => DataType::Decimal(Some(38), Some(9)), + } + }, + + // --------------------------------- + // temporal + // --------------------------------- + SQLDataType::Date => DataType::Date, + SQLDataType::Interval => 
DataType::Duration(TimeUnit::Microseconds), + SQLDataType::Time(_, tz) => match tz { + TimezoneInfo::None => DataType::Time, + _ => { + polars_bail!(SQLInterface: "`time` with timezone is not supported; found tz={}", tz) + }, + }, + SQLDataType::Datetime(prec) => DataType::Datetime(timeunit_from_precision(prec)?, None), + SQLDataType::Timestamp(prec, tz) => match tz { + TimezoneInfo::None => DataType::Datetime(timeunit_from_precision(prec)?, None), + _ => { + polars_bail!(SQLInterface: "`timestamp` with timezone is not (yet) supported") + }, + }, + + // --------------------------------- + // string + // --------------------------------- + SQLDataType::Char(_) + | SQLDataType::CharVarying(_) + | SQLDataType::Character(_) + | SQLDataType::CharacterVarying(_) + | SQLDataType::Clob(_) + | SQLDataType::String(_) + | SQLDataType::Text + | SQLDataType::Uuid + | SQLDataType::Varchar(_) => DataType::String, + + // --------------------------------- + // custom + // --------------------------------- + SQLDataType::Custom(ObjectName(idents), _) => match idents.as_slice() { + [Ident { value, .. }] => match value.to_lowercase().as_str() { + // these integer types are not supported by the PostgreSQL core distribution, + // but they ARE available via `pguint` (https://github.com/petere/pguint), an + // extension maintained by one of the PostgreSQL core developers. 
+ "uint1" => DataType::UInt8, + "uint2" => DataType::UInt16, + "uint4" | "uint" => DataType::UInt32, + "uint8" => DataType::UInt64, + // `pguint` also provides a 1 byte (8bit) integer type alias + "int1" => DataType::Int8, + _ => { + polars_bail!(SQLInterface: "datatype {:?} is not currently supported", value) + }, + }, + _ => { + polars_bail!(SQLInterface: "datatype {:?} is not currently supported", idents) + }, + }, + _ => { + polars_bail!(SQLInterface: "datatype {:?} is not currently supported", dtype) + }, + }) +} diff --git a/crates/polars-stream/src/nodes/parquet_source.rs b/crates/polars-stream/src/nodes/parquet_source.rs deleted file mode 100644 index 398780f594e4..000000000000 --- a/crates/polars-stream/src/nodes/parquet_source.rs +++ /dev/null @@ -1,1897 +0,0 @@ -use std::future::Future; -use std::path::PathBuf; -use std::sync::atomic::AtomicBool; -use std::sync::Arc; - -use futures::stream::FuturesUnordered; -use futures::StreamExt; -use polars_core::config; -use polars_core::frame::DataFrame; -use polars_core::prelude::{ - ArrowSchema, ChunkFull, DataType, IdxCa, InitHashMaps, PlHashMap, StringChunked, -}; -use polars_core::series::{IntoSeries, IsSorted, Series}; -use polars_core::utils::operation_exceeded_idxsize_msg; -use polars_error::{polars_bail, polars_err, PolarsResult}; -use polars_expr::prelude::PhysicalExpr; -use polars_io::cloud::CloudOptions; -use polars_io::predicates::PhysicalIoExpr; -use polars_io::prelude::{FileMetaData, ParquetOptions}; -use polars_io::utils::byte_source::{ - ByteSource, DynByteSource, DynByteSourceBuilder, MemSliceByteSource, -}; -use polars_io::utils::slice::SplitSlicePosition; -use polars_io::{is_cloud_url, RowIndex}; -use polars_parquet::read::RowGroupMetaData; -use polars_plan::plans::hive::HivePartitions; -use polars_plan::plans::FileInfo; -use polars_plan::prelude::FileScanOptions; -use polars_utils::aliases::PlHashSet; -use polars_utils::mmap::MemSlice; -use polars_utils::pl_str::PlSmallStr; -use 
polars_utils::slice::GetSaferUnchecked; -use polars_utils::IdxSize; - -use super::{MorselSeq, TaskPriority}; -use crate::async_executor::{self}; -use crate::async_primitives::connector::connector; -use crate::async_primitives::wait_group::{WaitGroup, WaitToken}; -use crate::morsel::get_ideal_morsel_size; -use crate::utils::task_handles_ext; - -type AsyncTaskData = Option<( - Vec>, - async_executor::AbortOnDropHandle>, -)>; - -#[allow(clippy::type_complexity)] -pub struct ParquetSourceNode { - paths: Arc>, - file_info: FileInfo, - hive_parts: Option>>, - predicate: Option>, - options: ParquetOptions, - cloud_options: Option, - file_options: FileScanOptions, - // Run-time vars - config: Config, - verbose: bool, - physical_predicate: Option>, - projected_arrow_fields: Arc<[polars_core::prelude::ArrowField]>, - byte_source_builder: DynByteSourceBuilder, - memory_prefetch_func: fn(&[u8]) -> (), - // This permit blocks execution until the first morsel is requested. - morsel_stream_starter: Option>, - // This is behind a Mutex so that we can call `shutdown()` asynchronously. 
- async_task_data: Arc>, - row_group_decoder: Option>, - is_finished: Arc, -} - -#[allow(clippy::too_many_arguments)] -impl ParquetSourceNode { - pub fn new( - paths: Arc>, - file_info: FileInfo, - hive_parts: Option>>, - predicate: Option>, - options: ParquetOptions, - cloud_options: Option, - file_options: FileScanOptions, - ) -> Self { - let verbose = config::verbose(); - - let byte_source_builder = - if is_cloud_url(paths[0].to_str().unwrap()) || config::force_async() { - DynByteSourceBuilder::ObjectStore - } else { - DynByteSourceBuilder::Mmap - }; - let memory_prefetch_func = get_memory_prefetch_func(verbose); - - Self { - paths, - file_info, - hive_parts, - predicate, - options, - cloud_options, - file_options, - - config: Config { - // Initialized later - num_pipelines: 0, - metadata_prefetch_size: 0, - metadata_decode_ahead_size: 0, - row_group_prefetch_size: 0, - }, - verbose, - physical_predicate: None, - projected_arrow_fields: Arc::new([]), - byte_source_builder, - memory_prefetch_func, - - morsel_stream_starter: None, - async_task_data: Arc::new(tokio::sync::Mutex::new(None)), - row_group_decoder: None, - is_finished: Arc::new(AtomicBool::new(false)), - } - } -} - -mod compute_node_impl { - - use std::sync::Arc; - - use polars_expr::prelude::phys_expr_to_io_expr; - - use super::super::compute_node_prelude::*; - use super::{Config, ParquetSourceNode}; - use crate::morsel::SourceToken; - - impl ComputeNode for ParquetSourceNode { - fn name(&self) -> &str { - "parquet_source" - } - - fn initialize(&mut self, num_pipelines: usize) { - self.config = { - let metadata_prefetch_size = polars_core::config::get_file_prefetch_size(); - // Limit metadata decode to the number of threads. 
- let metadata_decode_ahead_size = - (metadata_prefetch_size / 2).min(1 + num_pipelines).max(1); - let row_group_prefetch_size = polars_core::config::get_rg_prefetch_size(); - - Config { - num_pipelines, - metadata_prefetch_size, - metadata_decode_ahead_size, - row_group_prefetch_size, - } - }; - - if self.verbose { - eprintln!("[ParquetSource]: {:?}", &self.config); - } - - self.init_projected_arrow_fields(); - self.physical_predicate = self.predicate.clone().map(phys_expr_to_io_expr); - - let (raw_morsel_receivers, morsel_stream_task_handle) = self.init_raw_morsel_stream(); - - self.async_task_data - .try_lock() - .unwrap() - .replace((raw_morsel_receivers, morsel_stream_task_handle)); - - let row_group_decoder = self.init_row_group_decoder(); - self.row_group_decoder = Some(Arc::new(row_group_decoder)); - } - - fn update_state( - &mut self, - recv: &mut [PortState], - send: &mut [PortState], - ) -> PolarsResult<()> { - use std::sync::atomic::Ordering; - - assert!(recv.is_empty()); - assert_eq!(send.len(), 1); - - if self.is_finished.load(Ordering::Relaxed) { - send[0] = PortState::Done; - assert!( - self.async_task_data.try_lock().unwrap().is_none(), - "should have already been shut down" - ); - } else if send[0] == PortState::Done { - { - // Early shutdown - our port state was set to `Done` by the downstream nodes. 
- self.shutdown_in_background(); - }; - self.is_finished.store(true, Ordering::Relaxed); - } else { - send[0] = PortState::Ready - } - - Ok(()) - } - - fn spawn<'env, 's>( - &'env mut self, - scope: &'s TaskScope<'s, 'env>, - recv: &mut [Option>], - send: &mut [Option>], - _state: &'s ExecutionState, - join_handles: &mut Vec>>, - ) { - use std::sync::atomic::Ordering; - - assert!(recv.is_empty()); - assert_eq!(send.len(), 1); - assert!(!self.is_finished.load(Ordering::Relaxed)); - - let morsel_senders = send[0].take().unwrap().parallel(); - - let mut async_task_data_guard = self.async_task_data.try_lock().unwrap(); - let (raw_morsel_receivers, _) = async_task_data_guard.as_mut().unwrap(); - - assert_eq!(raw_morsel_receivers.len(), morsel_senders.len()); - - if let Some(v) = self.morsel_stream_starter.take() { - v.send(()).unwrap(); - } - let is_finished = self.is_finished.clone(); - - let task_handles = raw_morsel_receivers - .drain(..) - .zip(morsel_senders) - .map(|(mut raw_morsel_rx, mut morsel_tx)| { - let is_finished = is_finished.clone(); - - scope.spawn_task(TaskPriority::Low, async move { - let source_token = SourceToken::new(); - loop { - let Ok((df, morsel_seq, wait_token)) = raw_morsel_rx.recv().await - else { - is_finished.store(true, Ordering::Relaxed); - break; - }; - - let mut morsel = Morsel::new(df, morsel_seq, source_token.clone()); - morsel.set_consume_token(wait_token); - - if morsel_tx.send(morsel).await.is_err() { - break; - } - - if source_token.stop_requested() { - break; - } - } - - raw_morsel_rx - }) - }) - .collect::>(); - - drop(async_task_data_guard); - - let async_task_data = self.async_task_data.clone(); - - join_handles.push(scope.spawn_task(TaskPriority::Low, async move { - { - let mut async_task_data_guard = async_task_data.try_lock().unwrap(); - let (raw_morsel_receivers, _) = async_task_data_guard.as_mut().unwrap(); - - for handle in task_handles { - raw_morsel_receivers.push(handle.await); - } - } - - if 
self.is_finished.load(Ordering::Relaxed) { - self.shutdown().await?; - } - - Ok(()) - })) - } - } -} - -impl ParquetSourceNode { - /// # Panics - /// Panics if called more than once. - async fn shutdown_impl( - async_task_data: Arc>, - verbose: bool, - ) -> PolarsResult<()> { - if verbose { - eprintln!("[ParquetSource]: Shutting down"); - } - - let (mut raw_morsel_receivers, morsel_stream_task_handle) = - async_task_data.try_lock().unwrap().take().unwrap(); - - raw_morsel_receivers.clear(); - // Join on the producer handle to catch errors/panics. - // Safety - // * We dropped the receivers on the line above - // * This function is only called once. - morsel_stream_task_handle.await - } - - fn shutdown(&self) -> impl Future> { - if self.verbose { - eprintln!("[ParquetSource]: Shutdown via `shutdown()`"); - } - Self::shutdown_impl(self.async_task_data.clone(), self.verbose) - } - - /// Spawns a task to shut down the source node to avoid blocking the current thread. This is - /// usually called when data is no longer needed from the source node, as such it does not - /// propagate any (non-critical) errors. If on the other hand the source node does not provide - /// more data when requested, then it is more suitable to call [`Self::shutdown`], as it returns - /// a result that can be used to distinguish between whether the data stream stopped due to an - /// error or EOF. - fn shutdown_in_background(&self) { - if self.verbose { - eprintln!("[ParquetSource]: Shutdown via `shutdown_in_background()`"); - } - let async_task_data = self.async_task_data.clone(); - polars_io::pl_async::get_runtime() - .spawn(Self::shutdown_impl(async_task_data, self.verbose)); - } - - /// Constructs the task that provides a morsel stream. 
- #[allow(clippy::type_complexity)] - fn init_raw_morsel_stream( - &mut self, - ) -> ( - Vec>, - async_executor::AbortOnDropHandle>, - ) { - let verbose = self.verbose; - - let use_statistics = self.options.use_statistics; - - let (mut raw_morsel_senders, raw_morsel_receivers): (Vec<_>, Vec<_>) = - (0..self.config.num_pipelines).map(|_| connector()).unzip(); - - if let Some((_, 0)) = self.file_options.slice { - return ( - raw_morsel_receivers, - async_executor::AbortOnDropHandle::new(async_executor::spawn( - TaskPriority::Low, - std::future::ready(Ok(())), - )), - ); - } - - let reader_schema = self - .file_info - .reader_schema - .as_ref() - .unwrap() - .as_ref() - .unwrap_left() - .clone(); - - let (normalized_slice_oneshot_rx, metadata_rx, metadata_task_handle) = - self.init_metadata_fetcher(); - - let num_pipelines = self.config.num_pipelines; - let row_group_prefetch_size = self.config.row_group_prefetch_size; - let projection = self.file_options.with_columns.clone(); - assert_eq!(self.physical_predicate.is_some(), self.predicate.is_some()); - let predicate = self.physical_predicate.clone(); - let memory_prefetch_func = self.memory_prefetch_func; - - let mut row_group_data_fetcher = RowGroupDataFetcher { - metadata_rx, - use_statistics, - verbose, - reader_schema, - projection, - predicate, - slice_range: None, // Initialized later - memory_prefetch_func, - current_path_index: 0, - current_byte_source: Default::default(), - current_row_groups: Default::default(), - current_row_group_idx: 0, - current_max_row_group_height: 0, - current_row_offset: 0, - current_shared_file_state: Default::default(), - }; - - let row_group_decoder = self.init_row_group_decoder(); - let row_group_decoder = Arc::new(row_group_decoder); - - // Processes row group metadata and spawns I/O tasks to fetch row group data. 
This is - // currently spawned onto the CPU runtime as it does not directly make any async I/O calls, - // but instead it potentially performs predicate/slice evaluation on metadata. If we observe - // that under heavy CPU load scenarios the I/O throughput drops due to this task not being - // scheduled we can change it to be a high priority task. - let morsel_stream_task_handle = async_executor::spawn(TaskPriority::Low, async move { - let slice_range = { - let Ok(slice) = normalized_slice_oneshot_rx.await else { - // If we are here then the producer probably errored. - drop(row_group_data_fetcher); - return metadata_task_handle.await.unwrap(); - }; - - slice.map(|(offset, len)| offset..offset + len) - }; - - row_group_data_fetcher.slice_range = slice_range; - - // Pins a wait group to a channel index. - struct IndexedWaitGroup { - index: usize, - wait_group: WaitGroup, - } - - impl IndexedWaitGroup { - async fn wait(self) -> Self { - self.wait_group.wait().await; - self - } - } - - // Ensure proper backpressure by only polling the buffered iterator when a wait group - // is free. - let mut wait_groups = (0..num_pipelines) - .map(|index| { - let wait_group = WaitGroup::default(); - { - let _prime_this_wait_group = wait_group.token(); - } - IndexedWaitGroup { - index, - wait_group: WaitGroup::default(), - } - .wait() - }) - .collect::>(); - - let mut df_stream = row_group_data_fetcher - .into_stream() - .map(|x| async { - match x { - Ok(handle) => handle.await, - Err(e) => Err(e), - } - }) - .buffered(row_group_prefetch_size) - .map(|x| async { - let row_group_decoder = row_group_decoder.clone(); - - match x { - Ok(row_group_data) => { - async_executor::spawn(TaskPriority::Low, async move { - row_group_decoder.row_group_data_to_df(row_group_data).await - }) - .await - }, - Err(e) => Err(e), - } - }) - .buffered( - // Because we are using an ordered buffer, we may suffer from head-of-line blocking, - // so we add a small amount of buffer. 
- num_pipelines + 4, - ); - - let morsel_seq_ref = &mut MorselSeq::default(); - let mut dfs = vec![].into_iter(); - - 'main: loop { - let Some(mut indexed_wait_group) = wait_groups.next().await else { - break; - }; - - if dfs.len() == 0 { - let Some(v) = df_stream.next().await else { - break; - }; - - let v = v?; - assert!(!v.is_empty()); - - dfs = v.into_iter(); - } - - let mut df = dfs.next().unwrap(); - let morsel_seq = *morsel_seq_ref; - *morsel_seq_ref = morsel_seq.successor(); - - loop { - use crate::async_primitives::connector::SendError; - - let channel_index = indexed_wait_group.index; - let wait_token = indexed_wait_group.wait_group.token(); - - match raw_morsel_senders[channel_index].try_send((df, morsel_seq, wait_token)) { - Ok(_) => { - wait_groups.push(indexed_wait_group.wait()); - break; - }, - Err(SendError::Closed(v)) => { - // The channel assigned to this wait group has been closed, so we will not - // add it back to the list of wait groups, and we will try to send this - // across another channel. - df = v.0 - }, - Err(SendError::Full(_)) => unreachable!(), - } - - let Some(v) = wait_groups.next().await else { - // All channels have closed - break 'main; - }; - - indexed_wait_group = v; - } - } - - // Join on the producer handle to catch errors/panics. - drop(df_stream); - metadata_task_handle.await.unwrap() - }); - - let morsel_stream_task_handle = - async_executor::AbortOnDropHandle::new(morsel_stream_task_handle); - - (raw_morsel_receivers, morsel_stream_task_handle) - } - - /// Constructs the task that fetches file metadata. - /// Note: This must be called AFTER `self.projected_arrow_fields` has been initialized. - /// - /// TODO: During IR conversion the metadata of the first file is already downloaded - see if - /// we can find a way to re-use it. 
- #[allow(clippy::type_complexity)] - fn init_metadata_fetcher( - &mut self, - ) -> ( - tokio::sync::oneshot::Receiver>, - crate::async_primitives::connector::Receiver<( - usize, - usize, - Arc, - FileMetaData, - usize, - )>, - task_handles_ext::AbortOnDropHandle>, - ) { - let verbose = self.verbose; - let io_runtime = polars_io::pl_async::get_runtime(); - - assert!( - !self.projected_arrow_fields.is_empty() - || self.file_options.with_columns.as_deref() == Some(&[]) - ); - let projected_arrow_fields = self.projected_arrow_fields.clone(); - let needs_max_row_group_height_calc = - self.file_options.include_file_paths.is_some() || self.hive_parts.is_some(); - - let (normalized_slice_oneshot_tx, normalized_slice_oneshot_rx) = - tokio::sync::oneshot::channel(); - let (mut metadata_tx, metadata_rx) = connector(); - - let byte_source_builder = self.byte_source_builder.clone(); - - if self.verbose { - eprintln!( - "[ParquetSource]: Byte source builder: {:?}", - &byte_source_builder - ); - } - - let fetch_metadata_bytes_for_path_index = { - let paths = &self.paths; - let cloud_options = Arc::new(self.cloud_options.clone()); - - let paths = paths.clone(); - let cloud_options = cloud_options.clone(); - let byte_source_builder = byte_source_builder.clone(); - - move |path_idx: usize| { - let paths = paths.clone(); - let cloud_options = cloud_options.clone(); - let byte_source_builder = byte_source_builder.clone(); - - let handle = io_runtime.spawn(async move { - let mut byte_source = Arc::new( - byte_source_builder - .try_build_from_path( - paths[path_idx].to_str().unwrap(), - cloud_options.as_ref().as_ref(), - ) - .await?, - ); - let (metadata_bytes, maybe_full_bytes) = - read_parquet_metadata_bytes(byte_source.as_ref(), verbose).await?; - - if let Some(v) = maybe_full_bytes { - if !matches!(byte_source.as_ref(), DynByteSource::MemSlice(_)) { - if verbose { - eprintln!( - "[ParquetSource]: Parquet file was fully fetched during \ - metadata read ({} bytes).", - v.len(), - ); 
- } - - byte_source = Arc::new(DynByteSource::from(MemSliceByteSource(v))) - } - } - - PolarsResult::Ok((path_idx, byte_source, metadata_bytes)) - }); - - let handle = task_handles_ext::AbortOnDropHandle(handle); - - std::future::ready(handle) - } - }; - - let process_metadata_bytes = { - move |handle: task_handles_ext::AbortOnDropHandle< - PolarsResult<(usize, Arc, MemSlice)>, - >| { - let projected_arrow_fields = projected_arrow_fields.clone(); - // Run on CPU runtime - metadata deserialization is expensive, especially - // for very wide tables. - let handle = async_executor::spawn(TaskPriority::Low, async move { - let (path_index, byte_source, metadata_bytes) = handle.await.unwrap()?; - - let metadata = polars_parquet::parquet::read::deserialize_metadata( - metadata_bytes.as_ref(), - metadata_bytes.len() * 2 + 1024, - )?; - - ensure_metadata_has_projected_fields( - projected_arrow_fields.as_ref(), - &metadata, - )?; - - let file_max_row_group_height = if needs_max_row_group_height_calc { - metadata - .row_groups - .iter() - .map(|x| x.num_rows()) - .max() - .unwrap_or(0) - } else { - 0 - }; - - PolarsResult::Ok((path_index, byte_source, metadata, file_max_row_group_height)) - }); - - async_executor::AbortOnDropHandle::new(handle) - } - }; - - let metadata_prefetch_size = self.config.metadata_prefetch_size; - let metadata_decode_ahead_size = self.config.metadata_decode_ahead_size; - - let (start_tx, start_rx) = tokio::sync::oneshot::channel(); - self.morsel_stream_starter = Some(start_tx); - - let metadata_task_handle = if self - .file_options - .slice - .map(|(offset, _)| offset >= 0) - .unwrap_or(true) - { - normalized_slice_oneshot_tx - .send( - self.file_options - .slice - .map(|(offset, len)| (offset as usize, len)), - ) - .unwrap(); - - // Safety: `offset + len` does not overflow. 
- let slice_range = self - .file_options - .slice - .map(|(offset, len)| offset as usize..offset as usize + len); - - let mut metadata_stream = futures::stream::iter(0..self.paths.len()) - .map(fetch_metadata_bytes_for_path_index) - .buffered(metadata_prefetch_size) - .map(process_metadata_bytes) - .buffered(metadata_decode_ahead_size); - - let paths = self.paths.clone(); - - // We need to be able to both stop early as well as skip values, which is easier to do - // using a custom task instead of futures::stream - io_runtime.spawn(async move { - let current_row_offset_ref = &mut 0usize; - let current_path_index_ref = &mut 0usize; - - if start_rx.await.is_err() { - return Ok(()); - } - - if verbose { - eprintln!("[ParquetSource]: Starting data fetch") - } - - loop { - let current_path_index = *current_path_index_ref; - *current_path_index_ref += 1; - - let Some(v) = metadata_stream.next().await else { - break; - }; - - let (path_index, byte_source, metadata, file_max_row_group_height) = v - .map_err(|err| { - err.wrap_msg(|msg| { - format!( - "error at path (index: {}, path: {}): {}", - current_path_index, - paths[current_path_index].to_str().unwrap(), - msg - ) - }) - })?; - - assert_eq!(path_index, current_path_index); - - let current_row_offset = *current_row_offset_ref; - *current_row_offset_ref = current_row_offset.saturating_add(metadata.num_rows); - - if let Some(slice_range) = slice_range.clone() { - match SplitSlicePosition::split_slice_at_file( - current_row_offset, - metadata.num_rows, - slice_range, - ) { - SplitSlicePosition::Before => { - if verbose { - eprintln!( - "[ParquetSource]: Slice pushdown: \ - Skipped file at index {} ({} rows)", - current_path_index, metadata.num_rows - ); - } - continue; - }, - SplitSlicePosition::After => unreachable!(), - SplitSlicePosition::Overlapping(..) 
=> {}, - }; - }; - - if metadata_tx - .send(( - path_index, - current_row_offset, - byte_source, - metadata, - file_max_row_group_height, - )) - .await - .is_err() - { - break; - } - - if let Some(slice_range) = slice_range.as_ref() { - if *current_row_offset_ref >= slice_range.end { - if verbose { - eprintln!( - "[ParquetSource]: Slice pushdown: \ - Stopped reading at file at index {} \ - (remaining {} files will not be read)", - current_path_index, - paths.len() - current_path_index - 1, - ); - } - break; - } - }; - } - - Ok(()) - }) - } else { - // Walk the files in reverse to translate the slice into a positive offset. - let slice = self.file_options.slice.unwrap(); - let slice_start_as_n_from_end = -slice.0 as usize; - - let mut metadata_stream = futures::stream::iter((0..self.paths.len()).rev()) - .map(fetch_metadata_bytes_for_path_index) - .buffered(metadata_prefetch_size) - .map(process_metadata_bytes) - .buffered(metadata_decode_ahead_size); - - // Note: - // * We want to wait until the first morsel is requested before starting this - let init_negative_slice_and_metadata = async move { - let mut processed_metadata_rev = vec![]; - let mut cum_rows = 0; - - while let Some(v) = metadata_stream.next().await { - let v = v?; - let (_, _, metadata, _) = &v; - cum_rows += metadata.num_rows; - processed_metadata_rev.push(v); - - if cum_rows >= slice_start_as_n_from_end { - break; - } - } - - let (start, len) = if slice_start_as_n_from_end > cum_rows { - // We need to trim the slice, e.g. SLICE[offset: -100, len: 75] on a file of 50 - // rows should only give the first 25 rows. 
- let first_file_position = slice_start_as_n_from_end - cum_rows; - (0, slice.1.saturating_sub(first_file_position)) - } else { - (cum_rows - slice_start_as_n_from_end, slice.1) - }; - - if len == 0 { - processed_metadata_rev.clear(); - } - - normalized_slice_oneshot_tx - .send(Some((start, len))) - .unwrap(); - - let slice_range = start..(start + len); - - PolarsResult::Ok((slice_range, processed_metadata_rev, cum_rows)) - }; - - let path_count = self.paths.len(); - - io_runtime.spawn(async move { - if start_rx.await.is_err() { - return Ok(()); - } - - if verbose { - eprintln!("[ParquetSource]: Starting data fetch (negative slice)") - } - - let (slice_range, processed_metadata_rev, cum_rows) = - async_executor::AbortOnDropHandle::new(async_executor::spawn( - TaskPriority::Low, - init_negative_slice_and_metadata, - )) - .await?; - - if verbose { - if let Some((path_index, ..)) = processed_metadata_rev.last() { - eprintln!( - "[ParquetSource]: Slice pushdown: Negatively-offsetted slice {:?} \ - begins at file index {}, translated to {:?}", - slice, path_index, slice_range - ); - } else { - eprintln!( - "[ParquetSource]: Slice pushdown: Negatively-offsetted slice {:?} \ - skipped all files ({} files containing {} rows)", - slice, path_count, cum_rows - ) - } - } - - let metadata_iter = processed_metadata_rev.into_iter().rev(); - let current_row_offset_ref = &mut 0usize; - - for (current_path_index, byte_source, metadata, file_max_row_group_height) in - metadata_iter - { - let current_row_offset = *current_row_offset_ref; - *current_row_offset_ref = current_row_offset.saturating_add(metadata.num_rows); - - assert!(matches!( - SplitSlicePosition::split_slice_at_file( - current_row_offset, - metadata.num_rows, - slice_range.clone(), - ), - SplitSlicePosition::Overlapping(..) 
- )); - - if metadata_tx - .send(( - current_path_index, - current_row_offset, - byte_source, - metadata, - file_max_row_group_height, - )) - .await - .is_err() - { - break; - } - - if *current_row_offset_ref >= slice_range.end { - if verbose { - eprintln!( - "[ParquetSource]: Slice pushdown: \ - Stopped reading at file at index {} \ - (remaining {} files will not be read)", - current_path_index, - path_count - current_path_index - 1, - ); - } - break; - } - } - - Ok(()) - }) - }; - - let metadata_task_handle = task_handles_ext::AbortOnDropHandle(metadata_task_handle); - - ( - normalized_slice_oneshot_rx, - metadata_rx, - metadata_task_handle, - ) - } - - /// Creates a `RowGroupDecoder` that turns `RowGroupData` into DataFrames. - /// This must be called AFTER the following have been initialized: - /// * `self.projected_arrow_fields` - /// * `self.physical_predicate` - fn init_row_group_decoder(&self) -> RowGroupDecoder { - assert!( - !self.projected_arrow_fields.is_empty() - || self.file_options.with_columns.as_deref() == Some(&[]) - ); - assert_eq!(self.predicate.is_some(), self.physical_predicate.is_some()); - - let paths = self.paths.clone(); - let hive_partitions = self.hive_parts.clone(); - let hive_partitions_width = hive_partitions - .as_deref() - .map(|x| x[0].get_statistics().column_stats().len()) - .unwrap_or(0); - let include_file_paths = self.file_options.include_file_paths.clone(); - let projected_arrow_fields = self.projected_arrow_fields.clone(); - let row_index = self.file_options.row_index.clone(); - let physical_predicate = self.physical_predicate.clone(); - let ideal_morsel_size = get_ideal_morsel_size(); - - RowGroupDecoder { - paths, - hive_partitions, - hive_partitions_width, - include_file_paths, - projected_arrow_fields, - row_index, - physical_predicate, - ideal_morsel_size, - } - } - - fn init_projected_arrow_fields(&mut self) { - let reader_schema = self - .file_info - .reader_schema - .as_ref() - .unwrap() - .as_ref() - .unwrap_left() - 
.clone(); - - self.projected_arrow_fields = - if let Some(columns) = self.file_options.with_columns.as_deref() { - columns - .iter() - .map(|x| reader_schema.get(x).unwrap().clone()) - .collect() - } else { - reader_schema.iter_values().cloned().collect() - }; - - if self.verbose { - eprintln!( - "[ParquetSource]: {} columns to be projected from {} files", - self.projected_arrow_fields.len(), - self.paths.len(), - ); - } - } -} - -#[derive(Debug)] -struct Config { - num_pipelines: usize, - /// Number of files to pre-fetch metadata for concurrently - metadata_prefetch_size: usize, - /// Number of files to decode metadata for in parallel in advance - metadata_decode_ahead_size: usize, - /// Number of row groups to pre-fetch concurrently, this can be across files - row_group_prefetch_size: usize, -} - -/// Represents byte-data that can be transformed into a DataFrame after some computation. -struct RowGroupData { - byte_source: FetchedBytes, - path_index: usize, - row_offset: usize, - slice: Option<(usize, usize)>, - file_max_row_group_height: usize, - row_group_metadata: RowGroupMetaData, - shared_file_state: Arc>, -} - -struct RowGroupDataFetcher { - metadata_rx: crate::async_primitives::connector::Receiver<( - usize, - usize, - Arc, - FileMetaData, - usize, - )>, - use_statistics: bool, - verbose: bool, - reader_schema: Arc, - projection: Option>, - predicate: Option>, - slice_range: Option>, - memory_prefetch_func: fn(&[u8]) -> (), - current_path_index: usize, - current_byte_source: Arc, - current_row_groups: std::vec::IntoIter, - current_row_group_idx: usize, - current_max_row_group_height: usize, - current_row_offset: usize, - current_shared_file_state: Arc>, -} - -fn read_this_row_group( - rg_md: &RowGroupMetaData, - predicate: Option<&dyn PhysicalIoExpr>, - reader_schema: &ArrowSchema, -) -> PolarsResult { - let Some(pred) = predicate else { - return Ok(true); - }; - use polars_io::prelude::_internal::*; - // TODO! - // Optimize this. 
Now we partition the predicate columns twice. (later on reading as well) - // I think we must add metadata context where we can cache and amortize the partitioning. - let mut part_md = PartitionedColumnChunkMD::new(rg_md); - let live = pred.live_variables(); - part_md.set_partitions( - live.as_ref() - .map(|vars| vars.iter().map(|s| s.as_ref()).collect::>()) - .as_ref(), - ); - read_this_row_group(Some(pred), &part_md, reader_schema) -} - -impl RowGroupDataFetcher { - fn into_stream(self) -> RowGroupDataStream { - RowGroupDataStream::new(self) - } - - async fn init_next_file_state(&mut self) -> bool { - let Ok((path_index, row_offset, byte_source, metadata, file_max_row_group_height)) = - self.metadata_rx.recv().await - else { - return false; - }; - - self.current_path_index = path_index; - self.current_byte_source = byte_source; - self.current_max_row_group_height = file_max_row_group_height; - // The metadata task also sends a row offset to start counting from as it may skip files - // during slice pushdown. 
- self.current_row_offset = row_offset; - self.current_row_group_idx = 0; - self.current_row_groups = metadata.row_groups.into_iter(); - self.current_shared_file_state = Default::default(); - - true - } - - async fn next( - &mut self, - ) -> Option>>> { - 'main: loop { - for row_group_metadata in self.current_row_groups.by_ref() { - let current_row_offset = self.current_row_offset; - let current_row_group_idx = self.current_row_group_idx; - - let num_rows = row_group_metadata.num_rows(); - - self.current_row_offset = current_row_offset.saturating_add(num_rows); - self.current_row_group_idx += 1; - - if self.use_statistics - && !match read_this_row_group( - &row_group_metadata, - self.predicate.as_deref(), - self.reader_schema.as_ref(), - ) { - Ok(v) => v, - Err(e) => return Some(Err(e)), - } - { - if self.verbose { - eprintln!( - "[ParquetSource]: Predicate pushdown: \ - Skipped row group {} in file {} ({} rows)", - current_row_group_idx, self.current_path_index, num_rows - ); - } - continue; - } - - if num_rows > IdxSize::MAX as usize { - let msg = operation_exceeded_idxsize_msg( - format!("number of rows in row group ({})", num_rows).as_str(), - ); - return Some(Err(polars_err!(ComputeError: msg))); - } - - let slice = if let Some(slice_range) = self.slice_range.clone() { - let (offset, len) = match SplitSlicePosition::split_slice_at_file( - current_row_offset, - num_rows, - slice_range, - ) { - SplitSlicePosition::Before => { - if self.verbose { - eprintln!( - "[ParquetSource]: Slice pushdown: \ - Skipped row group {} in file {} ({} rows)", - current_row_group_idx, self.current_path_index, num_rows - ); - } - continue; - }, - SplitSlicePosition::After => { - if self.verbose { - eprintln!( - "[ParquetSource]: Slice pushdown: \ - Stop at row group {} in file {} \ - (remaining {} row groups will not be read)", - current_row_group_idx, - self.current_path_index, - self.current_row_groups.len(), - ); - }; - break 'main; - }, - SplitSlicePosition::Overlapping(offset, 
len) => (offset, len), - }; - - Some((offset, len)) - } else { - None - }; - - let current_byte_source = self.current_byte_source.clone(); - let projection = self.projection.clone(); - let current_shared_file_state = self.current_shared_file_state.clone(); - let memory_prefetch_func = self.memory_prefetch_func; - let io_runtime = polars_io::pl_async::get_runtime(); - let current_path_index = self.current_path_index; - let current_max_row_group_height = self.current_max_row_group_height; - - // Push calculation of byte ranges to a task to run in parallel, as it can be - // expensive for very wide tables and projections. - let handle = async_executor::spawn(TaskPriority::Low, async move { - let byte_source = if let DynByteSource::MemSlice(mem_slice) = - current_byte_source.as_ref() - { - // Skip byte range calculation for `no_prefetch`. - if memory_prefetch_func as usize != mem_prefetch_funcs::no_prefetch as usize - { - let slice = mem_slice.0.as_ref(); - - if let Some(columns) = projection.as_ref() { - for range in get_row_group_byte_ranges_for_projection( - &row_group_metadata, - columns.as_ref(), - ) { - memory_prefetch_func(unsafe { - slice.get_unchecked_release(range) - }) - } - } else { - let mut iter = get_row_group_byte_ranges(&row_group_metadata); - let first = iter.next().unwrap(); - let range = - iter.fold(first, |l, r| l.start.min(r.start)..l.end.max(r.end)); - - memory_prefetch_func(unsafe { slice.get_unchecked_release(range) }) - }; - } - - // We have a mmapped or in-memory slice representing the entire - // file that can be sliced directly, so we can skip the byte-range - // calculations and HashMap allocation. 
- let mem_slice = mem_slice.0.clone(); - FetchedBytes::MemSlice { - offset: 0, - mem_slice, - } - } else if let Some(columns) = projection.as_ref() { - let ranges = get_row_group_byte_ranges_for_projection( - &row_group_metadata, - columns.as_ref(), - ) - .collect::>(); - - let bytes = { - let ranges_2 = ranges.clone(); - task_handles_ext::AbortOnDropHandle(io_runtime.spawn(async move { - current_byte_source.get_ranges(ranges_2.as_ref()).await - })) - .await - .unwrap()? - }; - - assert_eq!(bytes.len(), ranges.len()); - - let mut bytes_map = PlHashMap::with_capacity(ranges.len()); - - for (range, bytes) in ranges.iter().zip(bytes) { - memory_prefetch_func(bytes.as_ref()); - let v = bytes_map.insert(range.start, bytes); - debug_assert!(v.is_none(), "duplicate range start {}", range.start); - } - - FetchedBytes::BytesMap(bytes_map) - } else { - // We have a dedicated code-path for a full projection that performs a - // single range request for the entire row group. During testing this - // provided much higher throughput from cloud than making multiple range - // request with `get_ranges()`. - let mut iter = get_row_group_byte_ranges(&row_group_metadata); - let mut ranges = Vec::with_capacity(iter.len()); - let first = iter.next().unwrap(); - ranges.push(first.clone()); - let full_range = iter.fold(first, |l, r| { - ranges.push(r.clone()); - l.start.min(r.start)..l.end.max(r.end) - }); - - let mem_slice = { - let full_range_2 = full_range.clone(); - task_handles_ext::AbortOnDropHandle(io_runtime.spawn(async move { - current_byte_source.get_range(full_range_2).await - })) - .await - .unwrap()? 
- }; - - FetchedBytes::MemSlice { - offset: full_range.start, - mem_slice, - } - }; - - PolarsResult::Ok(RowGroupData { - byte_source, - path_index: current_path_index, - row_offset: current_row_offset, - slice, - file_max_row_group_height: current_max_row_group_height, - row_group_metadata, - shared_file_state: current_shared_file_state.clone(), - }) - }); - - let handle = async_executor::AbortOnDropHandle::new(handle); - return Some(Ok(handle)); - } - - // Initialize state to the next file. - if !self.init_next_file_state().await { - break; - } - } - - None - } -} - -enum FetchedBytes { - MemSlice { mem_slice: MemSlice, offset: usize }, - BytesMap(PlHashMap), -} - -impl FetchedBytes { - fn get_range(&self, range: std::ops::Range) -> MemSlice { - match self { - Self::MemSlice { mem_slice, offset } => { - let offset = *offset; - debug_assert!(range.start >= offset); - mem_slice.slice(range.start - offset..range.end - offset) - }, - Self::BytesMap(v) => { - let v = v.get(&range.start).unwrap(); - debug_assert_eq!(v.len(), range.len()); - v.clone() - }, - } - } -} - -#[rustfmt::skip] -type RowGroupDataStreamFut = std::pin::Pin , - Option < - PolarsResult < - async_executor::AbortOnDropHandle < - PolarsResult < - RowGroupData > > > > - ) - > + Send ->>; - -struct RowGroupDataStream { - current_future: RowGroupDataStreamFut, -} - -impl RowGroupDataStream { - fn new(row_group_data_fetcher: RowGroupDataFetcher) -> Self { - // [`RowGroupDataFetcher`] is a big struct, so we Box it once here to avoid boxing it on - // every `next()` call. 
- let current_future = Self::call_next_owned(Box::new(row_group_data_fetcher)); - Self { current_future } - } - - fn call_next_owned( - mut row_group_data_fetcher: Box, - ) -> RowGroupDataStreamFut { - Box::pin(async move { - let out = row_group_data_fetcher.next().await; - (row_group_data_fetcher, out) - }) - } -} - -impl futures::stream::Stream for RowGroupDataStream { - type Item = PolarsResult>>; - - fn poll_next( - mut self: std::pin::Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - ) -> std::task::Poll> { - use std::pin::Pin; - use std::task::Poll; - - match Pin::new(&mut self.current_future.as_mut()).poll(cx) { - Poll::Ready((row_group_data_fetcher, out)) => { - if out.is_some() { - self.current_future = Self::call_next_owned(row_group_data_fetcher); - } - - Poll::Ready(out) - }, - Poll::Pending => Poll::Pending, - } - } -} - -/// State shared across row groups for a single file. -struct SharedFileState { - path_index: usize, - hive_series: Vec, - file_path_series: Option, -} - -/// Turns row group data into DataFrames. -struct RowGroupDecoder { - paths: Arc>, - hive_partitions: Option>>, - hive_partitions_width: usize, - include_file_paths: Option, - projected_arrow_fields: Arc<[polars_core::prelude::ArrowField]>, - row_index: Option, - physical_predicate: Option>, - ideal_morsel_size: usize, -} - -impl RowGroupDecoder { - async fn row_group_data_to_df( - &self, - row_group_data: RowGroupData, - ) -> PolarsResult> { - let row_group_data = Arc::new(row_group_data); - - let out_width = self.row_index.is_some() as usize - + self.projected_arrow_fields.len() - + self.hive_partitions_width - + self.include_file_paths.is_some() as usize; - - let mut out_columns = Vec::with_capacity(out_width); - - if self.row_index.is_some() { - // Add a placeholder so that we don't have to shift the entire vec - // later. 
- out_columns.push(Series::default()); - } - - let slice_range = row_group_data - .slice - .map(|(offset, len)| offset..offset + len) - .unwrap_or(0..row_group_data.row_group_metadata.num_rows()); - - let projected_arrow_fields = &self.projected_arrow_fields; - let projected_arrow_fields = projected_arrow_fields.clone(); - - let row_group_data_2 = row_group_data.clone(); - let slice_range_2 = slice_range.clone(); - - // Minimum number of values to amortize the overhead of spawning tasks. - // This value is arbitrarily chosen. - const VALUES_PER_THREAD: usize = 16_777_216; - let n_rows = row_group_data.row_group_metadata.num_rows(); - let cols_per_task = 1 + VALUES_PER_THREAD / n_rows; - - let decode_fut_iter = (0..self.projected_arrow_fields.len()) - .step_by(cols_per_task) - .map(move |offset| { - let row_group_data = row_group_data_2.clone(); - let slice_range = slice_range_2.clone(); - let projected_arrow_fields = projected_arrow_fields.clone(); - - async move { - (offset - ..offset - .saturating_add(cols_per_task) - .min(projected_arrow_fields.len())) - .map(|i| { - let arrow_field = projected_arrow_fields[i].clone(); - - let columns_to_deserialize = row_group_data - .row_group_metadata - .columns() - .iter() - .filter(|col_md| { - col_md.descriptor().path_in_schema[0] == arrow_field.name - }) - .map(|col_md| { - let (offset, len) = col_md.byte_range(); - let offset = offset as usize; - let len = len as usize; - - ( - col_md, - row_group_data.byte_source.get_range(offset..offset + len), - ) - }) - .collect::>(); - - assert!( - slice_range.end <= row_group_data.row_group_metadata.num_rows() - ); - - let array = polars_io::prelude::_internal::to_deserializer( - columns_to_deserialize, - arrow_field.clone(), - Some(polars_parquet::read::Filter::Range(slice_range.clone())), - )?; - - let series = Series::try_from((&arrow_field, array))?; - - // TODO: Also load in the metadata. 
- - PolarsResult::Ok(series) - }) - .collect::>>() - } - }); - - if decode_fut_iter.len() > 1 { - for handle in decode_fut_iter.map(|fut| { - async_executor::AbortOnDropHandle::new(async_executor::spawn( - TaskPriority::Low, - fut, - )) - }) { - out_columns.extend(handle.await?); - } - } else { - for fut in decode_fut_iter { - out_columns.extend(fut.await?); - } - } - - let projection_height = if self.projected_arrow_fields.is_empty() { - slice_range.len() - } else { - debug_assert!(out_columns.len() > self.row_index.is_some() as usize); - out_columns.last().unwrap().len() - }; - - if let Some(RowIndex { name, offset }) = self.row_index.as_ref() { - let Some(offset) = (|| { - let offset = offset - .checked_add((row_group_data.row_offset + slice_range.start) as IdxSize)?; - offset.checked_add(projection_height as IdxSize)?; - - Some(offset) - })() else { - let msg = format!( - "adding a row index column with offset {} overflows at {} rows", - offset, - row_group_data.row_offset + slice_range.end - ); - polars_bail!(ComputeError: msg) - }; - - // The DataFrame can be empty at this point if no columns were projected from the file, - // so we create the row index column manually instead of using `df.with_row_index` to - // ensure it has the correct number of rows. 
- let mut ca = IdxCa::from_vec( - name.clone(), - (offset..offset + projection_height as IdxSize).collect(), - ); - ca.set_sorted_flag(IsSorted::Ascending); - - out_columns[0] = ca.into_series(); - } - - let shared_file_state = row_group_data - .shared_file_state - .get_or_init(|| async { - let path_index = row_group_data.path_index; - - let hive_series = if let Some(hp) = self.hive_partitions.as_deref() { - let mut v = hp[path_index].materialize_partition_columns(); - for s in v.iter_mut() { - *s = s.new_from_index(0, row_group_data.file_max_row_group_height); - } - v - } else { - vec![] - }; - - let file_path_series = self.include_file_paths.clone().map(|file_path_col| { - StringChunked::full( - file_path_col, - self.paths[path_index].to_str().unwrap(), - row_group_data.file_max_row_group_height, - ) - .into_series() - }); - - SharedFileState { - path_index, - hive_series, - file_path_series, - } - }) - .await; - - assert_eq!(shared_file_state.path_index, row_group_data.path_index); - - for s in &shared_file_state.hive_series { - debug_assert!(s.len() >= projection_height); - out_columns.push(s.slice(0, projection_height)); - } - - if let Some(file_path_series) = &shared_file_state.file_path_series { - debug_assert!(file_path_series.len() >= projection_height); - out_columns.push(file_path_series.slice(0, projection_height)); - } - - let df = unsafe { DataFrame::new_no_checks(out_columns) }; - - // Re-calculate: A slice may have been applied. - let cols_per_task = 1 + VALUES_PER_THREAD / df.height(); - - let df = if let Some(predicate) = self.physical_predicate.as_deref() { - let mask = predicate.evaluate_io(&df)?; - let mask = mask.bool().unwrap(); - - if cols_per_task <= df.width() { - df._filter_seq(mask)? 
- } else { - let mask = mask.clone(); - let cols = Arc::new(df.take_columns()); - let mut out_cols = Vec::with_capacity(cols.len()); - - for handle in (0..cols.len()) - .step_by(cols_per_task) - .map(move |offset| { - let cols = cols.clone(); - let mask = mask.clone(); - async move { - cols[offset..offset.saturating_add(cols_per_task).min(cols.len())] - .iter() - .map(|s| s.filter(&mask)) - .collect::>>() - } - }) - .map(|fut| { - async_executor::AbortOnDropHandle::new(async_executor::spawn( - TaskPriority::Low, - fut, - )) - }) - { - out_cols.extend(handle.await?); - } - - unsafe { DataFrame::new_no_checks(out_cols) } - } - } else { - df - }; - - assert_eq!(df.width(), out_width); - - let n_morsels = if df.height() > 3 * self.ideal_morsel_size / 2 { - // num_rows > (1.5 * ideal_morsel_size) - (df.height() / self.ideal_morsel_size).max(2) - } else { - 1 - } as u64; - - if n_morsels == 1 { - return Ok(vec![df]); - } - - let rows_per_morsel = 1 + df.height() / n_morsels as usize; - - let out = (0..i64::try_from(df.height()).unwrap()) - .step_by(rows_per_morsel) - .map(|offset| df.slice(offset, rows_per_morsel)) - .collect::>(); - - Ok(out) - } -} - -/// Read the metadata bytes of a parquet file, does not decode the bytes. If during metadata fetch -/// the bytes of the entire file are loaded, it is returned in the second return value. 
-async fn read_parquet_metadata_bytes( - byte_source: &DynByteSource, - verbose: bool, -) -> PolarsResult<(MemSlice, Option)> { - use polars_parquet::parquet::error::ParquetError; - use polars_parquet::parquet::PARQUET_MAGIC; - - const FOOTER_HEADER_SIZE: usize = polars_parquet::parquet::FOOTER_SIZE as usize; - - let file_size = byte_source.get_size().await?; - - if file_size < FOOTER_HEADER_SIZE { - return Err(ParquetError::OutOfSpec(format!( - "file size ({}) is less than minimum size required to store parquet footer ({})", - file_size, FOOTER_HEADER_SIZE - )) - .into()); - } - - let estimated_metadata_size = if let DynByteSource::MemSlice(_) = byte_source { - // Mmapped or in-memory, reads are free. - file_size - } else { - (file_size / 2048).clamp(16_384, 131_072).min(file_size) - }; - - let bytes = byte_source - .get_range((file_size - estimated_metadata_size)..file_size) - .await?; - - let footer_header_bytes = bytes.slice((bytes.len() - FOOTER_HEADER_SIZE)..bytes.len()); - - let (v, remaining) = footer_header_bytes.split_at(4); - let footer_size = i32::from_le_bytes(v.try_into().unwrap()); - - if remaining != PARQUET_MAGIC { - return Err(ParquetError::OutOfSpec(format!( - r#"expected parquet magic bytes "{}" in footer, got "{}" instead"#, - std::str::from_utf8(&PARQUET_MAGIC).unwrap(), - String::from_utf8_lossy(remaining) - )) - .into()); - } - - if footer_size < 0 { - return Err(ParquetError::OutOfSpec(format!( - "expected positive footer size, got {} instead", - footer_size - )) - .into()); - } - - let footer_size = footer_size as usize + FOOTER_HEADER_SIZE; - - if file_size < footer_size { - return Err(ParquetError::OutOfSpec(format!( - "file size ({}) is less than the indicated footer size ({})", - file_size, footer_size - )) - .into()); - } - - if bytes.len() < footer_size { - debug_assert!(!matches!(byte_source, DynByteSource::MemSlice(_))); - if verbose { - eprintln!( - "[ParquetSource]: Extra {} bytes need to be fetched for metadata \ - (initial 
estimate = {}, actual size = {})", - footer_size - estimated_metadata_size, - bytes.len(), - footer_size, - ); - } - - let mut out = Vec::with_capacity(footer_size); - let offset = file_size - footer_size; - let len = footer_size - bytes.len(); - let delta_bytes = byte_source.get_range(offset..(offset + len)).await?; - - debug_assert!(out.capacity() >= delta_bytes.len() + bytes.len()); - - out.extend_from_slice(&delta_bytes); - out.extend_from_slice(&bytes); - - Ok((MemSlice::from_vec(out), None)) - } else { - if verbose && !matches!(byte_source, DynByteSource::MemSlice(_)) { - eprintln!( - "[ParquetSource]: Fetched all bytes for metadata on first try \ - (initial estimate = {}, actual size = {}, excess = {})", - bytes.len(), - footer_size, - estimated_metadata_size - footer_size, - ); - } - - let metadata_bytes = bytes.slice((bytes.len() - footer_size)..bytes.len()); - - if bytes.len() == file_size { - Ok((metadata_bytes, Some(bytes))) - } else { - debug_assert!(!matches!(byte_source, DynByteSource::MemSlice(_))); - let metadata_bytes = if bytes.len() - footer_size >= bytes.len() { - // Re-allocate to drop the excess bytes - MemSlice::from_vec(metadata_bytes.to_vec()) - } else { - metadata_bytes - }; - - Ok((metadata_bytes, None)) - } - } -} - -fn get_row_group_byte_ranges( - row_group_metadata: &RowGroupMetaData, -) -> impl ExactSizeIterator> + '_ { - let row_group_columns = row_group_metadata.columns(); - - row_group_columns.iter().map(|rg_col_metadata| { - let (offset, len) = rg_col_metadata.byte_range(); - (offset as usize)..(offset + len) as usize - }) -} - -/// TODO: This is quadratic - incorporate https://github.com/pola-rs/polars/pull/18327 that is -/// merged. 
-fn get_row_group_byte_ranges_for_projection<'a>( - row_group_metadata: &'a RowGroupMetaData, - columns: &'a [PlSmallStr], -) -> impl Iterator> + 'a { - let row_group_columns = row_group_metadata.columns(); - - row_group_columns.iter().filter_map(move |rg_col_metadata| { - for col_name in columns { - if rg_col_metadata.descriptor().path_in_schema[0] == col_name { - let (offset, len) = rg_col_metadata.byte_range(); - let range = (offset as usize)..((offset + len) as usize); - return Some(range); - } - } - None - }) -} - -/// Ensures that a parquet file has all the necessary columns for a projection with the correct -/// dtype. There are no ordering requirements and extra columns are permitted. -fn ensure_metadata_has_projected_fields( - projected_fields: &[polars_core::prelude::ArrowField], - metadata: &FileMetaData, -) -> PolarsResult<()> { - let schema = polars_parquet::arrow::read::infer_schema(metadata)?; - - // Note: We convert to Polars-native dtypes for timezone normalization. - let mut schema = schema - .into_iter_values() - .map(|x| { - let dtype = DataType::from_arrow(&x.dtype, true); - (x.name, dtype) - }) - .collect::>(); - - for field in projected_fields { - let Some(dtype) = schema.remove(&field.name) else { - polars_bail!(SchemaMismatch: "did not find column: {}", field.name) - }; - - let expected_dtype = DataType::from_arrow(&field.dtype, true); - - if dtype != expected_dtype { - polars_bail!(SchemaMismatch: "data type mismatch for column {}: found: {}, expected: {}", - &field.name, dtype, expected_dtype - ) - } - } - - Ok(()) -} - -fn get_memory_prefetch_func(verbose: bool) -> fn(&[u8]) -> () { - let memory_prefetch_func = match std::env::var("POLARS_MEMORY_PREFETCH").ok().as_deref() { - None => { - // Sequential advice was observed to provide speedups on Linux. 
- // ref https://github.com/pola-rs/polars/pull/18152#discussion_r1721701965 - #[cfg(target_os = "linux")] - { - mem_prefetch_funcs::madvise_sequential - } - #[cfg(not(target_os = "linux"))] - { - mem_prefetch_funcs::no_prefetch - } - }, - Some("no_prefetch") => mem_prefetch_funcs::no_prefetch, - Some("prefetch_l2") => mem_prefetch_funcs::prefetch_l2, - Some("madvise_sequential") => { - #[cfg(target_family = "unix")] - { - mem_prefetch_funcs::madvise_sequential - } - #[cfg(not(target_family = "unix"))] - { - panic!("POLARS_MEMORY_PREFETCH=madvise_sequential is not supported by this system"); - } - }, - Some("madvise_willneed") => { - #[cfg(target_family = "unix")] - { - mem_prefetch_funcs::madvise_willneed - } - #[cfg(not(target_family = "unix"))] - { - panic!("POLARS_MEMORY_PREFETCH=madvise_willneed is not supported by this system"); - } - }, - Some("madvise_populate_read") => { - #[cfg(target_os = "linux")] - { - mem_prefetch_funcs::madvise_populate_read - } - #[cfg(not(target_os = "linux"))] - { - panic!( - "POLARS_MEMORY_PREFETCH=madvise_populate_read is not supported by this system" - ); - } - }, - Some(v) => panic!("invalid value for POLARS_MEMORY_PREFETCH: {}", v), - }; - - if verbose { - let func_name = match memory_prefetch_func as usize { - v if v == mem_prefetch_funcs::no_prefetch as usize => "no_prefetch", - v if v == mem_prefetch_funcs::prefetch_l2 as usize => "prefetch_l2", - v if v == mem_prefetch_funcs::madvise_sequential as usize => "madvise_sequential", - v if v == mem_prefetch_funcs::madvise_willneed as usize => "madvise_willneed", - v if v == mem_prefetch_funcs::madvise_populate_read as usize => "madvise_populate_read", - _ => unreachable!(), - }; - - eprintln!("[ParquetSource] Memory prefetch function: {}", func_name); - } - - memory_prefetch_func -} - -mod mem_prefetch_funcs { - pub use polars_utils::mem::{ - madvise_populate_read, madvise_sequential, madvise_willneed, prefetch_l2, - }; - - pub fn no_prefetch(_: &[u8]) {} -} diff --git 
a/crates/polars-stream/src/nodes/parquet_source/init.rs b/crates/polars-stream/src/nodes/parquet_source/init.rs new file mode 100644 index 000000000000..661ea4b84825 --- /dev/null +++ b/crates/polars-stream/src/nodes/parquet_source/init.rs @@ -0,0 +1,328 @@ +use std::future::Future; +use std::sync::Arc; + +use futures::stream::FuturesUnordered; +use futures::StreamExt; +use polars_core::frame::DataFrame; +use polars_error::PolarsResult; + +use super::row_group_data_fetch::RowGroupDataFetcher; +use super::row_group_decode::RowGroupDecoder; +use super::{AsyncTaskData, ParquetSourceNode}; +use crate::async_executor; +use crate::async_primitives::connector::connector; +use crate::async_primitives::wait_group::{WaitGroup, WaitToken}; +use crate::morsel::get_ideal_morsel_size; +use crate::nodes::{MorselSeq, TaskPriority}; + +impl ParquetSourceNode { + /// # Panics + /// Panics if called more than once. + async fn shutdown_impl( + async_task_data: Arc>, + verbose: bool, + ) -> PolarsResult<()> { + if verbose { + eprintln!("[ParquetSource]: Shutting down"); + } + + let (mut raw_morsel_receivers, morsel_stream_task_handle) = + async_task_data.try_lock().unwrap().take().unwrap(); + + raw_morsel_receivers.clear(); + // Join on the producer handle to catch errors/panics. + // Safety + // * We dropped the receivers on the line above + // * This function is only called once. + morsel_stream_task_handle.await + } + + pub(super) fn shutdown(&self) -> impl Future> { + if self.verbose { + eprintln!("[ParquetSource]: Shutdown via `shutdown()`"); + } + Self::shutdown_impl(self.async_task_data.clone(), self.verbose) + } + + /// Spawns a task to shut down the source node to avoid blocking the current thread. This is + /// usually called when data is no longer needed from the source node, as such it does not + /// propagate any (non-critical) errors. 
If on the other hand the source node does not provide + /// more data when requested, then it is more suitable to call [`Self::shutdown`], as it returns + /// a result that can be used to distinguish between whether the data stream stopped due to an + /// error or EOF. + pub(super) fn shutdown_in_background(&self) { + if self.verbose { + eprintln!("[ParquetSource]: Shutdown via `shutdown_in_background()`"); + } + let async_task_data = self.async_task_data.clone(); + polars_io::pl_async::get_runtime() + .spawn(Self::shutdown_impl(async_task_data, self.verbose)); + } + + /// Constructs the task that provides a morsel stream. + #[allow(clippy::type_complexity)] + pub(super) fn init_raw_morsel_stream( + &mut self, + ) -> ( + Vec>, + async_executor::AbortOnDropHandle>, + ) { + let verbose = self.verbose; + + let use_statistics = self.options.use_statistics; + + let (mut raw_morsel_senders, raw_morsel_receivers): (Vec<_>, Vec<_>) = + (0..self.config.num_pipelines).map(|_| connector()).unzip(); + + if let Some((_, 0)) = self.file_options.slice { + return ( + raw_morsel_receivers, + async_executor::AbortOnDropHandle::new(async_executor::spawn( + TaskPriority::Low, + std::future::ready(Ok(())), + )), + ); + } + + let reader_schema = self + .file_info + .reader_schema + .as_ref() + .unwrap() + .as_ref() + .unwrap_left() + .clone(); + + let (normalized_slice_oneshot_rx, metadata_rx, metadata_task_handle) = + self.init_metadata_fetcher(); + + let num_pipelines = self.config.num_pipelines; + let row_group_prefetch_size = self.config.row_group_prefetch_size; + let projection = self.file_options.with_columns.clone(); + assert_eq!(self.physical_predicate.is_some(), self.predicate.is_some()); + let predicate = self.physical_predicate.clone(); + let memory_prefetch_func = self.memory_prefetch_func; + + let mut row_group_data_fetcher = RowGroupDataFetcher { + metadata_rx, + use_statistics, + verbose, + reader_schema, + projection, + predicate, + slice_range: None, // Initialized 
later + memory_prefetch_func, + current_path_index: 0, + current_byte_source: Default::default(), + current_row_groups: Default::default(), + current_row_group_idx: 0, + current_max_row_group_height: 0, + current_row_offset: 0, + current_shared_file_state: Default::default(), + }; + + let row_group_decoder = self.init_row_group_decoder(); + let row_group_decoder = Arc::new(row_group_decoder); + + // Processes row group metadata and spawns I/O tasks to fetch row group data. This is + // currently spawned onto the CPU runtime as it does not directly make any async I/O calls, + // but instead it potentially performs predicate/slice evaluation on metadata. If we observe + // that under heavy CPU load scenarios the I/O throughput drops due to this task not being + // scheduled we can change it to be a high priority task. + let morsel_stream_task_handle = async_executor::spawn(TaskPriority::Low, async move { + let slice_range = { + let Ok(slice) = normalized_slice_oneshot_rx.await else { + // If we are here then the producer probably errored. + drop(row_group_data_fetcher); + return metadata_task_handle.await.unwrap(); + }; + + slice.map(|(offset, len)| offset..offset + len) + }; + + row_group_data_fetcher.slice_range = slice_range; + + // Pins a wait group to a channel index. + struct IndexedWaitGroup { + index: usize, + wait_group: WaitGroup, + } + + impl IndexedWaitGroup { + async fn wait(self) -> Self { + self.wait_group.wait().await; + self + } + } + + // Ensure proper backpressure by only polling the buffered iterator when a wait group + // is free. 
+ let mut wait_groups = (0..num_pipelines) + .map(|index| { + let wait_group = WaitGroup::default(); + { + let _prime_this_wait_group = wait_group.token(); + } + IndexedWaitGroup { + index, + wait_group: WaitGroup::default(), + } + .wait() + }) + .collect::>(); + + let mut df_stream = row_group_data_fetcher + .into_stream() + .map(|x| async { + match x { + Ok(handle) => handle.await, + Err(e) => Err(e), + } + }) + .buffered(row_group_prefetch_size) + .map(|x| async { + let row_group_decoder = row_group_decoder.clone(); + + match x { + Ok(row_group_data) => { + async_executor::spawn(TaskPriority::Low, async move { + row_group_decoder.row_group_data_to_df(row_group_data).await + }) + .await + }, + Err(e) => Err(e), + } + }) + .buffered( + // Because we are using an ordered buffer, we may suffer from head-of-line blocking, + // so we add a small amount of buffer. + num_pipelines + 4, + ); + + let morsel_seq_ref = &mut MorselSeq::default(); + let mut dfs = vec![].into_iter(); + + 'main: loop { + let Some(mut indexed_wait_group) = wait_groups.next().await else { + break; + }; + + if dfs.len() == 0 { + let Some(v) = df_stream.next().await else { + break; + }; + + let v = v?; + assert!(!v.is_empty()); + + dfs = v.into_iter(); + } + + let mut df = dfs.next().unwrap(); + let morsel_seq = *morsel_seq_ref; + *morsel_seq_ref = morsel_seq.successor(); + + loop { + use crate::async_primitives::connector::SendError; + + let channel_index = indexed_wait_group.index; + let wait_token = indexed_wait_group.wait_group.token(); + + match raw_morsel_senders[channel_index].try_send((df, morsel_seq, wait_token)) { + Ok(_) => { + wait_groups.push(indexed_wait_group.wait()); + break; + }, + Err(SendError::Closed(v)) => { + // The channel assigned to this wait group has been closed, so we will not + // add it back to the list of wait groups, and we will try to send this + // across another channel. 
+ df = v.0 + }, + Err(SendError::Full(_)) => unreachable!(), + } + + let Some(v) = wait_groups.next().await else { + // All channels have closed + break 'main; + }; + + indexed_wait_group = v; + } + } + + // Join on the producer handle to catch errors/panics. + drop(df_stream); + metadata_task_handle.await.unwrap() + }); + + let morsel_stream_task_handle = + async_executor::AbortOnDropHandle::new(morsel_stream_task_handle); + + (raw_morsel_receivers, morsel_stream_task_handle) + } + + /// Creates a `RowGroupDecoder` that turns `RowGroupData` into DataFrames. + /// This must be called AFTER the following have been initialized: + /// * `self.projected_arrow_fields` + /// * `self.physical_predicate` + pub(super) fn init_row_group_decoder(&self) -> RowGroupDecoder { + assert!( + !self.projected_arrow_fields.is_empty() + || self.file_options.with_columns.as_deref() == Some(&[]) + ); + assert_eq!(self.predicate.is_some(), self.physical_predicate.is_some()); + + let scan_sources = self.scan_sources.clone(); + let hive_partitions = self.hive_parts.clone(); + let hive_partitions_width = hive_partitions + .as_deref() + .map(|x| x[0].get_statistics().column_stats().len()) + .unwrap_or(0); + let include_file_paths = self.file_options.include_file_paths.clone(); + let projected_arrow_fields = self.projected_arrow_fields.clone(); + let row_index = self.file_options.row_index.clone(); + let physical_predicate = self.physical_predicate.clone(); + let ideal_morsel_size = get_ideal_morsel_size(); + + RowGroupDecoder { + scan_sources, + hive_partitions, + hive_partitions_width, + include_file_paths, + projected_arrow_fields, + row_index, + physical_predicate, + ideal_morsel_size, + } + } + + pub(super) fn init_projected_arrow_fields(&mut self) { + let reader_schema = self + .file_info + .reader_schema + .as_ref() + .unwrap() + .as_ref() + .unwrap_left() + .clone(); + + self.projected_arrow_fields = + if let Some(columns) = self.file_options.with_columns.as_deref() { + columns + 
.iter() + .map(|x| reader_schema.get(x).unwrap().clone()) + .collect() + } else { + reader_schema.iter_values().cloned().collect() + }; + + if self.verbose { + eprintln!( + "[ParquetSource]: {} columns to be projected from {} files", + self.projected_arrow_fields.len(), + self.scan_sources.len(), + ); + } + } +} diff --git a/crates/polars-stream/src/nodes/parquet_source/mem_prefetch_funcs.rs b/crates/polars-stream/src/nodes/parquet_source/mem_prefetch_funcs.rs new file mode 100644 index 000000000000..a8a356551ff6 --- /dev/null +++ b/crates/polars-stream/src/nodes/parquet_source/mem_prefetch_funcs.rs @@ -0,0 +1,71 @@ +pub(super) use polars_utils::mem::{ + madvise_populate_read, madvise_sequential, madvise_willneed, prefetch_l2, +}; +pub(super) fn no_prefetch(_: &[u8]) {} + +pub(super) fn get_memory_prefetch_func(verbose: bool) -> fn(&[u8]) -> () { + let memory_prefetch_func = match std::env::var("POLARS_MEMORY_PREFETCH").ok().as_deref() { + None => { + // Sequential advice was observed to provide speedups on Linux. 
+ // ref https://github.com/pola-rs/polars/pull/18152#discussion_r1721701965 + #[cfg(target_os = "linux")] + { + madvise_sequential + } + #[cfg(not(target_os = "linux"))] + { + no_prefetch + } + }, + Some("no_prefetch") => no_prefetch, + Some("prefetch_l2") => prefetch_l2, + Some("madvise_sequential") => { + #[cfg(target_family = "unix")] + { + madvise_sequential + } + #[cfg(not(target_family = "unix"))] + { + panic!("POLARS_MEMORY_PREFETCH=madvise_sequential is not supported by this system"); + } + }, + Some("madvise_willneed") => { + #[cfg(target_family = "unix")] + { + madvise_willneed + } + #[cfg(not(target_family = "unix"))] + { + panic!("POLARS_MEMORY_PREFETCH=madvise_willneed is not supported by this system"); + } + }, + Some("madvise_populate_read") => { + #[cfg(target_os = "linux")] + { + madvise_populate_read + } + #[cfg(not(target_os = "linux"))] + { + panic!( + "POLARS_MEMORY_PREFETCH=madvise_populate_read is not supported by this system" + ); + } + }, + Some(v) => panic!("invalid value for POLARS_MEMORY_PREFETCH: {}", v), + }; + + if verbose { + let func_name = match memory_prefetch_func as usize { + v if v == no_prefetch as usize => "no_prefetch", + v if v == prefetch_l2 as usize => "prefetch_l2", + v if v == madvise_sequential as usize => "madvise_sequential", + v if v == madvise_willneed as usize => "madvise_willneed", + v if v == madvise_populate_read as usize => "madvise_populate_read", + _ => unreachable!(), + }; + + eprintln!("[ParquetSource] Memory prefetch function: {}", func_name); + } + + memory_prefetch_func +} diff --git a/crates/polars-stream/src/nodes/parquet_source/metadata_fetch.rs b/crates/polars-stream/src/nodes/parquet_source/metadata_fetch.rs new file mode 100644 index 000000000000..5f3281145083 --- /dev/null +++ b/crates/polars-stream/src/nodes/parquet_source/metadata_fetch.rs @@ -0,0 +1,430 @@ +use std::sync::Arc; + +use futures::StreamExt; +use polars_error::PolarsResult; +use polars_io::prelude::FileMetadata; +use 
polars_io::utils::byte_source::{DynByteSource, MemSliceByteSource}; +use polars_io::utils::slice::SplitSlicePosition; +use polars_utils::mmap::MemSlice; +use polars_utils::pl_str::PlSmallStr; + +use super::metadata_utils::{ensure_metadata_has_projected_fields, read_parquet_metadata_bytes}; +use super::ParquetSourceNode; +use crate::async_executor; +use crate::async_primitives::connector::connector; +use crate::nodes::TaskPriority; +use crate::utils::task_handles_ext; + +impl ParquetSourceNode { + /// Constructs the task that fetches file metadata. + /// Note: This must be called AFTER `self.projected_arrow_fields` has been initialized. + #[allow(clippy::type_complexity)] + pub(super) fn init_metadata_fetcher( + &mut self, + ) -> ( + tokio::sync::oneshot::Receiver>, + crate::async_primitives::connector::Receiver<( + usize, + usize, + Arc, + FileMetadata, + usize, + )>, + task_handles_ext::AbortOnDropHandle>, + ) { + let verbose = self.verbose; + let io_runtime = polars_io::pl_async::get_runtime(); + + assert!( + !self.projected_arrow_fields.is_empty() + || self.file_options.with_columns.as_deref() == Some(&[]) + ); + let projected_arrow_fields = self.projected_arrow_fields.clone(); + let needs_max_row_group_height_calc = + self.file_options.include_file_paths.is_some() || self.hive_parts.is_some(); + + let (normalized_slice_oneshot_tx, normalized_slice_oneshot_rx) = + tokio::sync::oneshot::channel(); + let (mut metadata_tx, metadata_rx) = connector(); + + let byte_source_builder = self.byte_source_builder.clone(); + + if self.verbose { + eprintln!( + "[ParquetSource]: Byte source builder: {:?}", + &byte_source_builder + ); + } + + let fetch_metadata_bytes_for_path_index = { + let scan_sources = &self.scan_sources; + let cloud_options = Arc::new(self.cloud_options.clone()); + + let scan_sources = scan_sources.clone(); + let cloud_options = cloud_options.clone(); + let byte_source_builder = byte_source_builder.clone(); + + move |path_idx: usize| { + let scan_sources = 
scan_sources.clone(); + let cloud_options = cloud_options.clone(); + let byte_source_builder = byte_source_builder.clone(); + + let handle = io_runtime.spawn(async move { + let mut byte_source = Arc::new( + scan_sources + .get(path_idx) + .unwrap() + .to_dyn_byte_source( + &byte_source_builder, + cloud_options.as_ref().as_ref(), + ) + .await?, + ); + + if path_idx == 0 { + let metadata_bytes = MemSlice::EMPTY; + return Ok((0, byte_source, metadata_bytes)); + } + + let (metadata_bytes, maybe_full_bytes) = + read_parquet_metadata_bytes(byte_source.as_ref(), verbose).await?; + + if let Some(v) = maybe_full_bytes { + if !matches!(byte_source.as_ref(), DynByteSource::MemSlice(_)) { + if verbose { + eprintln!( + "[ParquetSource]: Parquet file was fully fetched during \ + metadata read ({} bytes).", + v.len(), + ); + } + + byte_source = Arc::new(DynByteSource::from(MemSliceByteSource(v))) + } + } + + PolarsResult::Ok((path_idx, byte_source, metadata_bytes)) + }); + + let handle = task_handles_ext::AbortOnDropHandle(handle); + + std::future::ready(handle) + } + }; + + let first_metadata = self.first_metadata.clone(); + + let process_metadata_bytes = { + move |handle: task_handles_ext::AbortOnDropHandle< + PolarsResult<(usize, Arc, MemSlice)>, + >| { + let projected_arrow_fields = projected_arrow_fields.clone(); + let first_metadata = first_metadata.clone(); + // Run on CPU runtime - metadata deserialization is expensive, especially + // for very wide tables. + let handle = async_executor::spawn(TaskPriority::Low, async move { + let (path_index, byte_source, metadata_bytes) = handle.await.unwrap()?; + + let metadata = if path_index == 0 { + Arc::unwrap_or_clone(first_metadata) + } else { + polars_parquet::parquet::read::deserialize_metadata( + metadata_bytes.as_ref(), + metadata_bytes.len() * 2 + 1024, + )? 
+ }; + + ensure_metadata_has_projected_fields( + projected_arrow_fields.as_ref(), + &metadata, + )?; + + let file_max_row_group_height = if needs_max_row_group_height_calc { + metadata + .row_groups + .iter() + .map(|x| x.num_rows()) + .max() + .unwrap_or(0) + } else { + 0 + }; + + PolarsResult::Ok((path_index, byte_source, metadata, file_max_row_group_height)) + }); + + async_executor::AbortOnDropHandle::new(handle) + } + }; + + let metadata_prefetch_size = self.config.metadata_prefetch_size; + let metadata_decode_ahead_size = self.config.metadata_decode_ahead_size; + + let (start_tx, start_rx) = tokio::sync::oneshot::channel(); + self.morsel_stream_starter = Some(start_tx); + + let metadata_task_handle = if self + .file_options + .slice + .map(|(offset, _)| offset >= 0) + .unwrap_or(true) + { + normalized_slice_oneshot_tx + .send( + self.file_options + .slice + .map(|(offset, len)| (offset as usize, len)), + ) + .unwrap(); + + // Safety: `offset + len` does not overflow. + let slice_range = self + .file_options + .slice + .map(|(offset, len)| offset as usize..offset as usize + len); + + let mut metadata_stream = futures::stream::iter(0..self.scan_sources.len()) + .map(fetch_metadata_bytes_for_path_index) + .buffered(metadata_prefetch_size) + .map(process_metadata_bytes) + .buffered(metadata_decode_ahead_size); + + let scan_sources = self.scan_sources.clone(); + + // We need to be able to both stop early as well as skip values, which is easier to do + // using a custom task instead of futures::stream + io_runtime.spawn(async move { + let current_row_offset_ref = &mut 0usize; + let current_path_index_ref = &mut 0usize; + + if start_rx.await.is_err() { + return Ok(()); + } + + if verbose { + eprintln!("[ParquetSource]: Starting data fetch") + } + + loop { + let current_path_index = *current_path_index_ref; + *current_path_index_ref += 1; + + let Some(v) = metadata_stream.next().await else { + break; + }; + + let (path_index, byte_source, metadata, 
file_max_row_group_height) = v + .map_err(|err| { + err.wrap_msg(|msg| { + format!( + "error at path (index: {}, path: {:?}): {}", + current_path_index, + scan_sources + .get(current_path_index) + .map(|x| PlSmallStr::from_str(x.to_include_path_name())), + msg + ) + }) + })?; + + assert_eq!(path_index, current_path_index); + + let current_row_offset = *current_row_offset_ref; + *current_row_offset_ref = current_row_offset.saturating_add(metadata.num_rows); + + if let Some(slice_range) = slice_range.clone() { + match SplitSlicePosition::split_slice_at_file( + current_row_offset, + metadata.num_rows, + slice_range, + ) { + SplitSlicePosition::Before => { + if verbose { + eprintln!( + "[ParquetSource]: Slice pushdown: \ + Skipped file at index {} ({} rows)", + current_path_index, metadata.num_rows + ); + } + continue; + }, + SplitSlicePosition::After => unreachable!(), + SplitSlicePosition::Overlapping(..) => {}, + }; + }; + + if metadata_tx + .send(( + path_index, + current_row_offset, + byte_source, + metadata, + file_max_row_group_height, + )) + .await + .is_err() + { + break; + } + + if let Some(slice_range) = slice_range.as_ref() { + if *current_row_offset_ref >= slice_range.end { + if verbose { + eprintln!( + "[ParquetSource]: Slice pushdown: \ + Stopped reading at file at index {} \ + (remaining {} files will not be read)", + current_path_index, + scan_sources.len() - current_path_index - 1, + ); + } + break; + } + }; + } + + Ok(()) + }) + } else { + // Walk the files in reverse to translate the slice into a positive offset. 
+ let slice = self.file_options.slice.unwrap(); + let slice_start_as_n_from_end = -slice.0 as usize; + + let mut metadata_stream = futures::stream::iter((0..self.scan_sources.len()).rev()) + .map(fetch_metadata_bytes_for_path_index) + .buffered(metadata_prefetch_size) + .map(process_metadata_bytes) + .buffered(metadata_decode_ahead_size); + + // Note: + // * We want to wait until the first morsel is requested before starting this + let init_negative_slice_and_metadata = async move { + let mut processed_metadata_rev = vec![]; + let mut cum_rows = 0; + + while let Some(v) = metadata_stream.next().await { + let v = v?; + let (_, _, metadata, _) = &v; + cum_rows += metadata.num_rows; + processed_metadata_rev.push(v); + + if cum_rows >= slice_start_as_n_from_end { + break; + } + } + + let (start, len) = if slice_start_as_n_from_end > cum_rows { + // We need to trim the slice, e.g. SLICE[offset: -100, len: 75] on a file of 50 + // rows should only give the first 25 rows. + let first_file_position = slice_start_as_n_from_end - cum_rows; + (0, slice.1.saturating_sub(first_file_position)) + } else { + (cum_rows - slice_start_as_n_from_end, slice.1) + }; + + if len == 0 { + processed_metadata_rev.clear(); + } + + normalized_slice_oneshot_tx + .send(Some((start, len))) + .unwrap(); + + let slice_range = start..(start + len); + + PolarsResult::Ok((slice_range, processed_metadata_rev, cum_rows)) + }; + + let path_count = self.scan_sources.len(); + + io_runtime.spawn(async move { + if start_rx.await.is_err() { + return Ok(()); + } + + if verbose { + eprintln!("[ParquetSource]: Starting data fetch (negative slice)") + } + + let (slice_range, processed_metadata_rev, cum_rows) = + async_executor::AbortOnDropHandle::new(async_executor::spawn( + TaskPriority::Low, + init_negative_slice_and_metadata, + )) + .await?; + + if verbose { + if let Some((path_index, ..)) = processed_metadata_rev.last() { + eprintln!( + "[ParquetSource]: Slice pushdown: Negatively-offsetted slice {:?} \ + 
begins at file index {}, translated to {:?}", + slice, path_index, slice_range + ); + } else { + eprintln!( + "[ParquetSource]: Slice pushdown: Negatively-offsetted slice {:?} \ + skipped all files ({} files containing {} rows)", + slice, path_count, cum_rows + ) + } + } + + let metadata_iter = processed_metadata_rev.into_iter().rev(); + let current_row_offset_ref = &mut 0usize; + + for (current_path_index, byte_source, metadata, file_max_row_group_height) in + metadata_iter + { + let current_row_offset = *current_row_offset_ref; + *current_row_offset_ref = current_row_offset.saturating_add(metadata.num_rows); + + assert!(matches!( + SplitSlicePosition::split_slice_at_file( + current_row_offset, + metadata.num_rows, + slice_range.clone(), + ), + SplitSlicePosition::Overlapping(..) + )); + + if metadata_tx + .send(( + current_path_index, + current_row_offset, + byte_source, + metadata, + file_max_row_group_height, + )) + .await + .is_err() + { + break; + } + + if *current_row_offset_ref >= slice_range.end { + if verbose { + eprintln!( + "[ParquetSource]: Slice pushdown: \ + Stopped reading at file at index {} \ + (remaining {} files will not be read)", + current_path_index, + path_count - current_path_index - 1, + ); + } + break; + } + } + + Ok(()) + }) + }; + + let metadata_task_handle = task_handles_ext::AbortOnDropHandle(metadata_task_handle); + + ( + normalized_slice_oneshot_rx, + metadata_rx, + metadata_task_handle, + ) + } +} diff --git a/crates/polars-stream/src/nodes/parquet_source/metadata_utils.rs b/crates/polars-stream/src/nodes/parquet_source/metadata_utils.rs new file mode 100644 index 000000000000..7c848b07b750 --- /dev/null +++ b/crates/polars-stream/src/nodes/parquet_source/metadata_utils.rs @@ -0,0 +1,156 @@ +use polars_core::prelude::{DataType, PlHashMap}; +use polars_error::{polars_bail, PolarsResult}; +use polars_io::prelude::FileMetadata; +use polars_io::utils::byte_source::{ByteSource, DynByteSource}; +use polars_utils::mmap::MemSlice; +use 
polars_utils::pl_str::PlSmallStr; + +/// Read the metadata bytes of a parquet file, does not decode the bytes. If during metadata fetch +/// the bytes of the entire file are loaded, it is returned in the second return value. +pub(super) async fn read_parquet_metadata_bytes( + byte_source: &DynByteSource, + verbose: bool, +) -> PolarsResult<(MemSlice, Option)> { + use polars_parquet::parquet::error::ParquetError; + use polars_parquet::parquet::PARQUET_MAGIC; + + const FOOTER_HEADER_SIZE: usize = polars_parquet::parquet::FOOTER_SIZE as usize; + + let file_size = byte_source.get_size().await?; + + if file_size < FOOTER_HEADER_SIZE { + return Err(ParquetError::OutOfSpec(format!( + "file size ({}) is less than minimum size required to store parquet footer ({})", + file_size, FOOTER_HEADER_SIZE + )) + .into()); + } + + let estimated_metadata_size = if let DynByteSource::MemSlice(_) = byte_source { + // Mmapped or in-memory, reads are free. + file_size + } else { + (file_size / 2048).clamp(16_384, 131_072).min(file_size) + }; + + let bytes = byte_source + .get_range((file_size - estimated_metadata_size)..file_size) + .await?; + + let footer_header_bytes = bytes.slice((bytes.len() - FOOTER_HEADER_SIZE)..bytes.len()); + + let (v, remaining) = footer_header_bytes.split_at(4); + let footer_size = i32::from_le_bytes(v.try_into().unwrap()); + + if remaining != PARQUET_MAGIC { + return Err(ParquetError::OutOfSpec(format!( + r#"expected parquet magic bytes "{}" in footer, got "{}" instead"#, + std::str::from_utf8(&PARQUET_MAGIC).unwrap(), + String::from_utf8_lossy(remaining) + )) + .into()); + } + + if footer_size < 0 { + return Err(ParquetError::OutOfSpec(format!( + "expected positive footer size, got {} instead", + footer_size + )) + .into()); + } + + let footer_size = footer_size as usize + FOOTER_HEADER_SIZE; + + if file_size < footer_size { + return Err(ParquetError::OutOfSpec(format!( + "file size ({}) is less than the indicated footer size ({})", + file_size, footer_size + 
)) + .into()); + } + + if bytes.len() < footer_size { + debug_assert!(!matches!(byte_source, DynByteSource::MemSlice(_))); + if verbose { + eprintln!( + "[ParquetSource]: Extra {} bytes need to be fetched for metadata \ + (initial estimate = {}, actual size = {})", + footer_size - estimated_metadata_size, + bytes.len(), + footer_size, + ); + } + + let mut out = Vec::with_capacity(footer_size); + let offset = file_size - footer_size; + let len = footer_size - bytes.len(); + let delta_bytes = byte_source.get_range(offset..(offset + len)).await?; + + debug_assert!(out.capacity() >= delta_bytes.len() + bytes.len()); + + out.extend_from_slice(&delta_bytes); + out.extend_from_slice(&bytes); + + Ok((MemSlice::from_vec(out), None)) + } else { + if verbose && !matches!(byte_source, DynByteSource::MemSlice(_)) { + eprintln!( + "[ParquetSource]: Fetched all bytes for metadata on first try \ + (initial estimate = {}, actual size = {}, excess = {})", + bytes.len(), + footer_size, + estimated_metadata_size - footer_size, + ); + } + + let metadata_bytes = bytes.slice((bytes.len() - footer_size)..bytes.len()); + + if bytes.len() == file_size { + Ok((metadata_bytes, Some(bytes))) + } else { + debug_assert!(!matches!(byte_source, DynByteSource::MemSlice(_))); + let metadata_bytes = if bytes.len() - footer_size >= bytes.len() { + // Re-allocate to drop the excess bytes + MemSlice::from_vec(metadata_bytes.to_vec()) + } else { + metadata_bytes + }; + + Ok((metadata_bytes, None)) + } + } +} + +/// Ensures that a parquet file has all the necessary columns for a projection with the correct +/// dtype. There are no ordering requirements and extra columns are permitted. +pub(super) fn ensure_metadata_has_projected_fields( + projected_fields: &[polars_core::prelude::ArrowField], + metadata: &FileMetadata, +) -> PolarsResult<()> { + let schema = polars_parquet::arrow::read::infer_schema(metadata)?; + + // Note: We convert to Polars-native dtypes for timezone normalization. 
+ let mut schema = schema + .into_iter_values() + .map(|x| { + let dtype = DataType::from_arrow(&x.dtype, true); + (x.name, dtype) + }) + .collect::>(); + + for field in projected_fields { + let Some(dtype) = schema.remove(&field.name) else { + polars_bail!(SchemaMismatch: "did not find column: {}", field.name) + }; + + let expected_dtype = DataType::from_arrow(&field.dtype, true); + + if dtype != expected_dtype { + polars_bail!(SchemaMismatch: "data type mismatch for column {}: found: {}, expected: {}", + &field.name, dtype, expected_dtype + ) + } + } + + Ok(()) +} diff --git a/crates/polars-stream/src/nodes/parquet_source/mod.rs b/crates/polars-stream/src/nodes/parquet_source/mod.rs new file mode 100644 index 000000000000..10df7ef0e3bf --- /dev/null +++ b/crates/polars-stream/src/nodes/parquet_source/mod.rs @@ -0,0 +1,266 @@ +use std::sync::atomic::AtomicBool; +use std::sync::Arc; + +use mem_prefetch_funcs::get_memory_prefetch_func; +use polars_core::config; +use polars_core::frame::DataFrame; +use polars_error::PolarsResult; +use polars_expr::prelude::{phys_expr_to_io_expr, PhysicalExpr}; +use polars_io::cloud::CloudOptions; +use polars_io::predicates::PhysicalIoExpr; +use polars_io::prelude::{FileMetadata, ParquetOptions}; +use polars_io::utils::byte_source::DynByteSourceBuilder; +use polars_plan::plans::hive::HivePartitions; +use polars_plan::plans::{FileInfo, ScanSources}; +use polars_plan::prelude::FileScanOptions; +use row_group_decode::RowGroupDecoder; + +use super::compute_node_prelude::*; +use super::{MorselSeq, TaskPriority}; +use crate::async_executor::{self}; +use crate::async_primitives::wait_group::WaitToken; +use crate::morsel::SourceToken; + +mod init; +mod mem_prefetch_funcs; +mod metadata_fetch; +mod metadata_utils; +mod row_group_data_fetch; +mod row_group_decode; + +type AsyncTaskData = Option<( + Vec>, + async_executor::AbortOnDropHandle>, +)>; + +#[allow(clippy::type_complexity)] +pub struct ParquetSourceNode { + scan_sources: ScanSources, + 
file_info: FileInfo, + hive_parts: Option>>, + predicate: Option>, + options: ParquetOptions, + cloud_options: Option, + file_options: FileScanOptions, + first_metadata: Arc, + // Run-time vars + config: Config, + verbose: bool, + physical_predicate: Option>, + projected_arrow_fields: Arc<[polars_core::prelude::ArrowField]>, + byte_source_builder: DynByteSourceBuilder, + memory_prefetch_func: fn(&[u8]) -> (), + // This permit blocks execution until the first morsel is requested. + morsel_stream_starter: Option>, + // This is behind a Mutex so that we can call `shutdown()` asynchronously. + async_task_data: Arc>, + row_group_decoder: Option>, + is_finished: Arc, +} + +#[derive(Debug)] +struct Config { + num_pipelines: usize, + /// Number of files to pre-fetch metadata for concurrently + metadata_prefetch_size: usize, + /// Number of files to decode metadata for in parallel in advance + metadata_decode_ahead_size: usize, + /// Number of row groups to pre-fetch concurrently, this can be across files + row_group_prefetch_size: usize, +} + +#[allow(clippy::too_many_arguments)] +impl ParquetSourceNode { + pub fn new( + scan_sources: ScanSources, + file_info: FileInfo, + hive_parts: Option>>, + predicate: Option>, + options: ParquetOptions, + cloud_options: Option, + file_options: FileScanOptions, + first_metadata: Arc, + ) -> Self { + let verbose = config::verbose(); + + let byte_source_builder = if scan_sources.is_cloud_url() || config::force_async() { + DynByteSourceBuilder::ObjectStore + } else { + DynByteSourceBuilder::Mmap + }; + let memory_prefetch_func = get_memory_prefetch_func(verbose); + + Self { + scan_sources, + file_info, + hive_parts, + predicate, + options, + cloud_options, + file_options, + first_metadata, + + config: Config { + // Initialized later + num_pipelines: 0, + metadata_prefetch_size: 0, + metadata_decode_ahead_size: 0, + row_group_prefetch_size: 0, + }, + verbose, + physical_predicate: None, + projected_arrow_fields: Arc::new([]), + 
byte_source_builder, + memory_prefetch_func, + + morsel_stream_starter: None, + async_task_data: Arc::new(tokio::sync::Mutex::new(None)), + row_group_decoder: None, + is_finished: Arc::new(AtomicBool::new(false)), + } + } +} + +impl ComputeNode for ParquetSourceNode { + fn name(&self) -> &str { + "parquet_source" + } + + fn initialize(&mut self, num_pipelines: usize) { + self.config = { + let metadata_prefetch_size = polars_core::config::get_file_prefetch_size(); + // Limit metadata decode to the number of threads. + let metadata_decode_ahead_size = + (metadata_prefetch_size / 2).min(1 + num_pipelines).max(1); + let row_group_prefetch_size = polars_core::config::get_rg_prefetch_size(); + + Config { + num_pipelines, + metadata_prefetch_size, + metadata_decode_ahead_size, + row_group_prefetch_size, + } + }; + + if self.verbose { + eprintln!("[ParquetSource]: {:?}", &self.config); + } + + self.init_projected_arrow_fields(); + self.physical_predicate = self.predicate.clone().map(phys_expr_to_io_expr); + + let (raw_morsel_receivers, morsel_stream_task_handle) = self.init_raw_morsel_stream(); + + self.async_task_data + .try_lock() + .unwrap() + .replace((raw_morsel_receivers, morsel_stream_task_handle)); + + let row_group_decoder = self.init_row_group_decoder(); + self.row_group_decoder = Some(Arc::new(row_group_decoder)); + } + + fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) -> PolarsResult<()> { + use std::sync::atomic::Ordering; + + assert!(recv.is_empty()); + assert_eq!(send.len(), 1); + + if self.is_finished.load(Ordering::Relaxed) { + send[0] = PortState::Done; + assert!( + self.async_task_data.try_lock().unwrap().is_none(), + "should have already been shut down" + ); + } else if send[0] == PortState::Done { + { + // Early shutdown - our port state was set to `Done` by the downstream nodes. 
+ self.shutdown_in_background(); + }; + self.is_finished.store(true, Ordering::Relaxed); + } else { + send[0] = PortState::Ready + } + + Ok(()) + } + + fn spawn<'env, 's>( + &'env mut self, + scope: &'s TaskScope<'s, 'env>, + recv: &mut [Option>], + send: &mut [Option>], + _state: &'s ExecutionState, + join_handles: &mut Vec>>, + ) { + use std::sync::atomic::Ordering; + + assert!(recv.is_empty()); + assert_eq!(send.len(), 1); + assert!(!self.is_finished.load(Ordering::Relaxed)); + + let morsel_senders = send[0].take().unwrap().parallel(); + + let mut async_task_data_guard = self.async_task_data.try_lock().unwrap(); + let (raw_morsel_receivers, _) = async_task_data_guard.as_mut().unwrap(); + + assert_eq!(raw_morsel_receivers.len(), morsel_senders.len()); + + if let Some(v) = self.morsel_stream_starter.take() { + v.send(()).unwrap(); + } + let is_finished = self.is_finished.clone(); + + let task_handles = raw_morsel_receivers + .drain(..) + .zip(morsel_senders) + .map(|(mut raw_morsel_rx, mut morsel_tx)| { + let is_finished = is_finished.clone(); + + scope.spawn_task(TaskPriority::Low, async move { + let source_token = SourceToken::new(); + loop { + let Ok((df, morsel_seq, wait_token)) = raw_morsel_rx.recv().await else { + is_finished.store(true, Ordering::Relaxed); + break; + }; + + let mut morsel = Morsel::new(df, morsel_seq, source_token.clone()); + morsel.set_consume_token(wait_token); + + if morsel_tx.send(morsel).await.is_err() { + break; + } + + if source_token.stop_requested() { + break; + } + } + + raw_morsel_rx + }) + }) + .collect::>(); + + drop(async_task_data_guard); + + let async_task_data = self.async_task_data.clone(); + + join_handles.push(scope.spawn_task(TaskPriority::Low, async move { + { + let mut async_task_data_guard = async_task_data.try_lock().unwrap(); + let (raw_morsel_receivers, _) = async_task_data_guard.as_mut().unwrap(); + + for handle in task_handles { + raw_morsel_receivers.push(handle.await); + } + } + + if 
self.is_finished.load(Ordering::Relaxed) { + self.shutdown().await?; + } + + Ok(()) + })) + } +} diff --git a/crates/polars-stream/src/nodes/parquet_source/row_group_data_fetch.rs b/crates/polars-stream/src/nodes/parquet_source/row_group_data_fetch.rs new file mode 100644 index 000000000000..773a5a9e3625 --- /dev/null +++ b/crates/polars-stream/src/nodes/parquet_source/row_group_data_fetch.rs @@ -0,0 +1,374 @@ +use std::future::Future; +use std::sync::Arc; + +use polars_core::prelude::{ArrowSchema, InitHashMaps, PlHashMap}; +use polars_core::utils::operation_exceeded_idxsize_msg; +use polars_error::{polars_err, PolarsResult}; +use polars_io::predicates::PhysicalIoExpr; +use polars_io::prelude::FileMetadata; +use polars_io::prelude::_internal::read_this_row_group; +use polars_io::utils::byte_source::{ByteSource, DynByteSource}; +use polars_io::utils::slice::SplitSlicePosition; +use polars_parquet::read::RowGroupMetadata; +use polars_utils::mmap::MemSlice; +use polars_utils::pl_str::PlSmallStr; +use polars_utils::slice::GetSaferUnchecked; +use polars_utils::IdxSize; + +use super::mem_prefetch_funcs; +use super::row_group_decode::SharedFileState; +use crate::async_executor; +use crate::nodes::TaskPriority; +use crate::utils::task_handles_ext; + +/// Represents byte-data that can be transformed into a DataFrame after some computation. 
+pub(super) struct RowGroupData { + pub(super) byte_source: FetchedBytes, + pub(super) path_index: usize, + pub(super) row_offset: usize, + pub(super) slice: Option<(usize, usize)>, + pub(super) file_max_row_group_height: usize, + pub(super) row_group_metadata: RowGroupMetadata, + pub(super) shared_file_state: Arc>, +} + +pub(super) struct RowGroupDataFetcher { + pub(super) metadata_rx: crate::async_primitives::connector::Receiver<( + usize, + usize, + Arc, + FileMetadata, + usize, + )>, + pub(super) use_statistics: bool, + pub(super) verbose: bool, + pub(super) reader_schema: Arc, + pub(super) projection: Option>, + pub(super) predicate: Option>, + pub(super) slice_range: Option>, + pub(super) memory_prefetch_func: fn(&[u8]) -> (), + pub(super) current_path_index: usize, + pub(super) current_byte_source: Arc, + pub(super) current_row_groups: std::vec::IntoIter, + pub(super) current_row_group_idx: usize, + pub(super) current_max_row_group_height: usize, + pub(super) current_row_offset: usize, + pub(super) current_shared_file_state: Arc>, +} + +impl RowGroupDataFetcher { + pub(super) fn into_stream(self) -> RowGroupDataStream { + RowGroupDataStream::new(self) + } + + pub(super) async fn init_next_file_state(&mut self) -> bool { + let Ok((path_index, row_offset, byte_source, metadata, file_max_row_group_height)) = + self.metadata_rx.recv().await + else { + return false; + }; + + self.current_path_index = path_index; + self.current_byte_source = byte_source; + self.current_max_row_group_height = file_max_row_group_height; + // The metadata task also sends a row offset to start counting from as it may skip files + // during slice pushdown. 
+ self.current_row_offset = row_offset; + self.current_row_group_idx = 0; + self.current_row_groups = metadata.row_groups.into_iter(); + self.current_shared_file_state = Default::default(); + + true + } + + pub(super) async fn next( + &mut self, + ) -> Option>>> { + 'main: loop { + for row_group_metadata in self.current_row_groups.by_ref() { + let current_row_offset = self.current_row_offset; + let current_row_group_idx = self.current_row_group_idx; + + let num_rows = row_group_metadata.num_rows(); + + self.current_row_offset = current_row_offset.saturating_add(num_rows); + self.current_row_group_idx += 1; + + if self.use_statistics + && !match read_this_row_group( + self.predicate.as_deref(), + &row_group_metadata, + self.reader_schema.as_ref(), + ) { + Ok(v) => v, + Err(e) => return Some(Err(e)), + } + { + if self.verbose { + eprintln!( + "[ParquetSource]: Predicate pushdown: \ + Skipped row group {} in file {} ({} rows)", + current_row_group_idx, self.current_path_index, num_rows + ); + } + continue; + } + + if num_rows > IdxSize::MAX as usize { + let msg = operation_exceeded_idxsize_msg( + format!("number of rows in row group ({})", num_rows).as_str(), + ); + return Some(Err(polars_err!(ComputeError: msg))); + } + + let slice = if let Some(slice_range) = self.slice_range.clone() { + let (offset, len) = match SplitSlicePosition::split_slice_at_file( + current_row_offset, + num_rows, + slice_range, + ) { + SplitSlicePosition::Before => { + if self.verbose { + eprintln!( + "[ParquetSource]: Slice pushdown: \ + Skipped row group {} in file {} ({} rows)", + current_row_group_idx, self.current_path_index, num_rows + ); + } + continue; + }, + SplitSlicePosition::After => { + if self.verbose { + eprintln!( + "[ParquetSource]: Slice pushdown: \ + Stop at row group {} in file {} \ + (remaining {} row groups will not be read)", + current_row_group_idx, + self.current_path_index, + self.current_row_groups.len(), + ); + }; + break 'main; + }, + 
SplitSlicePosition::Overlapping(offset, len) => (offset, len), + }; + + Some((offset, len)) + } else { + None + }; + + let current_byte_source = self.current_byte_source.clone(); + let projection = self.projection.clone(); + let current_shared_file_state = self.current_shared_file_state.clone(); + let memory_prefetch_func = self.memory_prefetch_func; + let io_runtime = polars_io::pl_async::get_runtime(); + let current_path_index = self.current_path_index; + let current_max_row_group_height = self.current_max_row_group_height; + + // Push calculation of byte ranges to a task to run in parallel, as it can be + // expensive for very wide tables and projections. + let handle = async_executor::spawn(TaskPriority::Low, async move { + let byte_source = if let DynByteSource::MemSlice(mem_slice) = + current_byte_source.as_ref() + { + // Skip byte range calculation for `no_prefetch`. + if memory_prefetch_func as usize != mem_prefetch_funcs::no_prefetch as usize + { + let slice = mem_slice.0.as_ref(); + + if let Some(columns) = projection.as_ref() { + for range in get_row_group_byte_ranges_for_projection( + &row_group_metadata, + columns.as_ref(), + ) { + memory_prefetch_func(unsafe { + slice.get_unchecked_release(range) + }) + } + } else { + let range = row_group_metadata.full_byte_range(); + let range = range.start as usize..range.end as usize; + + memory_prefetch_func(unsafe { slice.get_unchecked_release(range) }) + }; + } + + // We have a mmapped or in-memory slice representing the entire + // file that can be sliced directly, so we can skip the byte-range + // calculations and HashMap allocation. 
+ let mem_slice = mem_slice.0.clone(); + FetchedBytes::MemSlice { + offset: 0, + mem_slice, + } + } else if let Some(columns) = projection.as_ref() { + let ranges = get_row_group_byte_ranges_for_projection( + &row_group_metadata, + columns.as_ref(), + ) + .collect::>(); + + let bytes = { + let ranges_2 = ranges.clone(); + task_handles_ext::AbortOnDropHandle(io_runtime.spawn(async move { + current_byte_source.get_ranges(ranges_2.as_ref()).await + })) + .await + .unwrap()? + }; + + assert_eq!(bytes.len(), ranges.len()); + + let mut bytes_map = PlHashMap::with_capacity(ranges.len()); + + for (range, bytes) in ranges.iter().zip(bytes) { + memory_prefetch_func(bytes.as_ref()); + let v = bytes_map.insert(range.start, bytes); + debug_assert!(v.is_none(), "duplicate range start {}", range.start); + } + + FetchedBytes::BytesMap(bytes_map) + } else { + // We have a dedicated code-path for a full projection that performs a + // single range request for the entire row group. During testing this + // provided much higher throughput from cloud than making multiple range + // request with `get_ranges()`. + let full_range = row_group_metadata.full_byte_range(); + let full_range = full_range.start as usize..full_range.end as usize; + + let mem_slice = { + let full_range_2 = full_range.clone(); + task_handles_ext::AbortOnDropHandle(io_runtime.spawn(async move { + current_byte_source.get_range(full_range_2).await + })) + .await + .unwrap()? + }; + + FetchedBytes::MemSlice { + offset: full_range.start, + mem_slice, + } + }; + + PolarsResult::Ok(RowGroupData { + byte_source, + path_index: current_path_index, + row_offset: current_row_offset, + slice, + file_max_row_group_height: current_max_row_group_height, + row_group_metadata, + shared_file_state: current_shared_file_state.clone(), + }) + }); + + let handle = async_executor::AbortOnDropHandle::new(handle); + return Some(Ok(handle)); + } + + // Initialize state to the next file. 
+ if !self.init_next_file_state().await { + break; + } + } + + None + } +} + +pub(super) enum FetchedBytes { + MemSlice { mem_slice: MemSlice, offset: usize }, + BytesMap(PlHashMap), +} + +impl FetchedBytes { + pub(super) fn get_range(&self, range: std::ops::Range) -> MemSlice { + match self { + Self::MemSlice { mem_slice, offset } => { + let offset = *offset; + debug_assert!(range.start >= offset); + mem_slice.slice(range.start - offset..range.end - offset) + }, + Self::BytesMap(v) => { + let v = v.get(&range.start).unwrap(); + debug_assert_eq!(v.len(), range.len()); + v.clone() + }, + } + } +} + +#[rustfmt::skip] +type RowGroupDataStreamFut = std::pin::Pin , + Option < + PolarsResult < + async_executor::AbortOnDropHandle < + PolarsResult < + RowGroupData > > > > + ) + > + Send +>>; + +pub(super) struct RowGroupDataStream { + current_future: RowGroupDataStreamFut, +} + +impl RowGroupDataStream { + fn new(row_group_data_fetcher: RowGroupDataFetcher) -> Self { + // [`RowGroupDataFetcher`] is a big struct, so we Box it once here to avoid boxing it on + // every `next()` call. 
+ let current_future = Self::call_next_owned(Box::new(row_group_data_fetcher)); + Self { current_future } + } + + fn call_next_owned( + mut row_group_data_fetcher: Box, + ) -> RowGroupDataStreamFut { + Box::pin(async move { + let out = row_group_data_fetcher.next().await; + (row_group_data_fetcher, out) + }) + } +} + +impl futures::stream::Stream for RowGroupDataStream { + type Item = PolarsResult>>; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + use std::pin::Pin; + use std::task::Poll; + + match Pin::new(&mut self.current_future.as_mut()).poll(cx) { + Poll::Ready((row_group_data_fetcher, out)) => { + if out.is_some() { + self.current_future = Self::call_next_owned(row_group_data_fetcher); + } + + Poll::Ready(out) + }, + Poll::Pending => Poll::Pending, + } + } +} + +fn get_row_group_byte_ranges_for_projection<'a>( + row_group_metadata: &'a RowGroupMetadata, + columns: &'a [PlSmallStr], +) -> impl Iterator> + 'a { + columns.iter().flat_map(|col_name| { + row_group_metadata + .columns_under_root_iter(col_name) + .map(|col| { + let byte_range = col.byte_range(); + byte_range.start as usize..byte_range.end as usize + }) + }) +} diff --git a/crates/polars-stream/src/nodes/parquet_source/row_group_decode.rs b/crates/polars-stream/src/nodes/parquet_source/row_group_decode.rs new file mode 100644 index 000000000000..b3249e60057c --- /dev/null +++ b/crates/polars-stream/src/nodes/parquet_source/row_group_decode.rs @@ -0,0 +1,287 @@ +use std::sync::Arc; + +use polars_core::frame::DataFrame; +use polars_core::prelude::{ChunkFull, IdxCa, StringChunked}; +use polars_core::series::{IntoSeries, IsSorted, Series}; +use polars_error::{polars_bail, PolarsResult}; +use polars_io::predicates::PhysicalIoExpr; +use polars_io::RowIndex; +use polars_plan::plans::hive::HivePartitions; +use polars_plan::plans::ScanSources; +use polars_utils::pl_str::PlSmallStr; +use polars_utils::IdxSize; + +use 
super::row_group_data_fetch::RowGroupData; +use crate::async_executor; +use crate::nodes::TaskPriority; + +/// Turns row group data into DataFrames. +pub(super) struct RowGroupDecoder { + pub(super) scan_sources: ScanSources, + pub(super) hive_partitions: Option>>, + pub(super) hive_partitions_width: usize, + pub(super) include_file_paths: Option, + pub(super) projected_arrow_fields: Arc<[polars_core::prelude::ArrowField]>, + pub(super) row_index: Option, + pub(super) physical_predicate: Option>, + pub(super) ideal_morsel_size: usize, +} + +impl RowGroupDecoder { + pub(super) async fn row_group_data_to_df( + &self, + row_group_data: RowGroupData, + ) -> PolarsResult> { + let row_group_data = Arc::new(row_group_data); + + let out_width = self.row_index.is_some() as usize + + self.projected_arrow_fields.len() + + self.hive_partitions_width + + self.include_file_paths.is_some() as usize; + + let mut out_columns = Vec::with_capacity(out_width); + + if self.row_index.is_some() { + // Add a placeholder so that we don't have to shift the entire vec + // later. + out_columns.push(Series::default()); + } + + let slice_range = row_group_data + .slice + .map(|(offset, len)| offset..offset + len) + .unwrap_or(0..row_group_data.row_group_metadata.num_rows()); + + let projected_arrow_fields = &self.projected_arrow_fields; + let projected_arrow_fields = projected_arrow_fields.clone(); + + let row_group_data_2 = row_group_data.clone(); + let slice_range_2 = slice_range.clone(); + + // Minimum number of values to amortize the overhead of spawning tasks. + // This value is arbitrarily chosen. 
+ const VALUES_PER_THREAD: usize = 16_777_216; + let n_rows = row_group_data.row_group_metadata.num_rows(); + let cols_per_task = 1 + VALUES_PER_THREAD / n_rows; + + let decode_fut_iter = (0..self.projected_arrow_fields.len()) + .step_by(cols_per_task) + .map(move |offset| { + let row_group_data = row_group_data_2.clone(); + let slice_range = slice_range_2.clone(); + let projected_arrow_fields = projected_arrow_fields.clone(); + + async move { + (offset + ..offset + .saturating_add(cols_per_task) + .min(projected_arrow_fields.len())) + .map(|i| { + let arrow_field = projected_arrow_fields[i].clone(); + + let columns_to_deserialize = row_group_data + .row_group_metadata + .columns_under_root_iter(&arrow_field.name) + .map(|col_md| { + let byte_range = col_md.byte_range(); + + ( + col_md, + row_group_data.byte_source.get_range( + byte_range.start as usize..byte_range.end as usize, + ), + ) + }) + .collect::>(); + + assert!( + slice_range.end <= row_group_data.row_group_metadata.num_rows() + ); + + let array = polars_io::prelude::_internal::to_deserializer( + columns_to_deserialize, + arrow_field.clone(), + Some(polars_parquet::read::Filter::Range(slice_range.clone())), + )?; + + let series = Series::try_from((&arrow_field, array))?; + + // TODO: Also load in the metadata. 
+ + PolarsResult::Ok(series) + }) + .collect::>>() + } + }); + + if decode_fut_iter.len() > 1 { + for handle in decode_fut_iter.map(|fut| { + async_executor::AbortOnDropHandle::new(async_executor::spawn( + TaskPriority::Low, + fut, + )) + }) { + out_columns.extend(handle.await?); + } + } else { + for fut in decode_fut_iter { + out_columns.extend(fut.await?); + } + } + + let projection_height = if self.projected_arrow_fields.is_empty() { + slice_range.len() + } else { + debug_assert!(out_columns.len() > self.row_index.is_some() as usize); + out_columns.last().unwrap().len() + }; + + if let Some(RowIndex { name, offset }) = self.row_index.as_ref() { + let Some(offset) = (|| { + let offset = offset + .checked_add((row_group_data.row_offset + slice_range.start) as IdxSize)?; + offset.checked_add(projection_height as IdxSize)?; + + Some(offset) + })() else { + let msg = format!( + "adding a row index column with offset {} overflows at {} rows", + offset, + row_group_data.row_offset + slice_range.end + ); + polars_bail!(ComputeError: msg) + }; + + // The DataFrame can be empty at this point if no columns were projected from the file, + // so we create the row index column manually instead of using `df.with_row_index` to + // ensure it has the correct number of rows. 
+ let mut ca = IdxCa::from_vec( + name.clone(), + (offset..offset + projection_height as IdxSize).collect(), + ); + ca.set_sorted_flag(IsSorted::Ascending); + + out_columns[0] = ca.into_series(); + } + + let shared_file_state = row_group_data + .shared_file_state + .get_or_init(|| async { + let path_index = row_group_data.path_index; + + let hive_series = if let Some(hp) = self.hive_partitions.as_deref() { + let mut v = hp[path_index].materialize_partition_columns(); + for s in v.iter_mut() { + *s = s.new_from_index(0, row_group_data.file_max_row_group_height); + } + v + } else { + vec![] + }; + + let file_path_series = self.include_file_paths.clone().map(|file_path_col| { + StringChunked::full( + file_path_col, + self.scan_sources + .get(path_index) + .unwrap() + .to_include_path_name(), + row_group_data.file_max_row_group_height, + ) + .into_series() + }); + + SharedFileState { + path_index, + hive_series, + file_path_series, + } + }) + .await; + + assert_eq!(shared_file_state.path_index, row_group_data.path_index); + + for s in &shared_file_state.hive_series { + debug_assert!(s.len() >= projection_height); + out_columns.push(s.slice(0, projection_height)); + } + + if let Some(file_path_series) = &shared_file_state.file_path_series { + debug_assert!(file_path_series.len() >= projection_height); + out_columns.push(file_path_series.slice(0, projection_height)); + } + + let df = unsafe { DataFrame::new_no_checks(out_columns) }; + + // Re-calculate: A slice may have been applied. + let cols_per_task = 1 + VALUES_PER_THREAD / df.height(); + + let df = if let Some(predicate) = self.physical_predicate.as_deref() { + let mask = predicate.evaluate_io(&df)?; + let mask = mask.bool().unwrap(); + + if cols_per_task <= df.width() { + df._filter_seq(mask)? 
+ } else { + let mask = mask.clone(); + let cols = Arc::new(df.take_columns()); + let mut out_cols = Vec::with_capacity(cols.len()); + + for handle in (0..cols.len()) + .step_by(cols_per_task) + .map(move |offset| { + let cols = cols.clone(); + let mask = mask.clone(); + async move { + cols[offset..offset.saturating_add(cols_per_task).min(cols.len())] + .iter() + .map(|s| s.filter(&mask)) + .collect::>>() + } + }) + .map(|fut| { + async_executor::AbortOnDropHandle::new(async_executor::spawn( + TaskPriority::Low, + fut, + )) + }) + { + out_cols.extend(handle.await?); + } + + unsafe { DataFrame::new_no_checks(out_cols) } + } + } else { + df + }; + + assert_eq!(df.width(), out_width); + + let n_morsels = if df.height() > 3 * self.ideal_morsel_size / 2 { + // num_rows > (1.5 * ideal_morsel_size) + (df.height() / self.ideal_morsel_size).max(2) + } else { + 1 + } as u64; + + if n_morsels == 1 { + return Ok(vec![df]); + } + + let rows_per_morsel = 1 + df.height() / n_morsels as usize; + + let out = (0..i64::try_from(df.height()).unwrap()) + .step_by(rows_per_morsel) + .map(|offset| df.slice(offset, rows_per_morsel)) + .collect::>(); + + Ok(out) + } +} + +/// State shared across row groups for a single file. +pub(super) struct SharedFileState { + path_index: usize, + hive_series: Vec, + file_path_series: Option, +} diff --git a/crates/polars-stream/src/nodes/reduce.rs b/crates/polars-stream/src/nodes/reduce.rs index 2ce9ee2c9464..f6de3bd1124a 100644 --- a/crates/polars-stream/src/nodes/reduce.rs +++ b/crates/polars-stream/src/nodes/reduce.rs @@ -59,9 +59,8 @@ impl ReduceNode { scope.spawn_task(TaskPriority::High, async move { while let Ok(morsel) = recv.recv().await { for (reducer, selector) in local_reducers.iter_mut().zip(selectors) { - // TODO: don't convert to physical representation here. 
let input = selector.evaluate(morsel.df(), state).await?; - reducer.update(&input.to_physical_repr())?; + reducer.update(&input)?; } } diff --git a/crates/polars-stream/src/physical_plan/fmt.rs b/crates/polars-stream/src/physical_plan/fmt.rs index 8a3e7a1b8ac4..7d15337389a8 100644 --- a/crates/polars-stream/src/physical_plan/fmt.rs +++ b/crates/polars-stream/src/physical_plan/fmt.rs @@ -1,7 +1,7 @@ use std::fmt::Write; use polars_plan::plans::expr_ir::ExprIR; -use polars_plan::plans::{AExpr, EscapeLabel, FileScan, PathsDisplay}; +use polars_plan::plans::{AExpr, EscapeLabel, FileScan, ScanSourcesDisplay}; use polars_utils::arena::Arena; use polars_utils::itertools::Itertools; use slotmap::{Key, SecondaryMap, SlotMap}; @@ -107,7 +107,7 @@ fn visualize_plan_rec( }, PhysNodeKind::Multiplexer { input } => ("multiplexer".to_string(), from_ref(input)), PhysNodeKind::FileScan { - paths, + scan_sources, file_info, hive_parts, output_schema: _, @@ -127,9 +127,9 @@ fn visualize_plan_rec( let mut f = EscapeLabel(&mut out); { - let paths_display = PathsDisplay(paths.as_ref()); + let disp = ScanSourcesDisplay(scan_sources); - write!(f, "\npaths: {}", paths_display).unwrap(); + write!(f, "\npaths: {}", disp).unwrap(); } { diff --git a/crates/polars-stream/src/physical_plan/lower_ir.rs b/crates/polars-stream/src/physical_plan/lower_ir.rs index b9693e6c3c56..beec7a57e358 100644 --- a/crates/polars-stream/src/physical_plan/lower_ir.rs +++ b/crates/polars-stream/src/physical_plan/lower_ir.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use polars_core::prelude::{InitHashMaps, PlHashMap, PlIndexMap}; -use polars_core::schema::{IndexOfSchema, Schema}; +use polars_core::schema::Schema; use polars_error::PolarsResult; use polars_plan::plans::expr_ir::{ExprIR, OutputName}; use polars_plan::plans::{AExpr, IR}; @@ -26,7 +26,7 @@ pub fn lower_ir( let output_schema = IR::schema_with_cache(node, ir_arena, schema_cache); let node_kind = match ir_node { IR::SimpleProjection { input, columns } => { - let 
columns = columns.get_names_owned(); + let columns = columns.iter_names_cloned().collect::>(); let phys_input = lower_ir( *input, ir_arena, @@ -200,7 +200,7 @@ pub fn lower_ir( let phys_input = phys_sm.insert(PhysNode::new(schema, node_kind)); node_kind = PhysNodeKind::SimpleProjection { input: phys_input, - columns: projection_schema.get_names_owned(), + columns: projection_schema.iter_names_cloned().collect::>(), }; schema = projection_schema.clone(); } @@ -331,7 +331,7 @@ pub fn lower_ir( v @ IR::Scan { .. } => { let IR::Scan { - paths, + sources: scan_sources, file_info, hive_parts, output_schema, @@ -344,7 +344,7 @@ pub fn lower_ir( }; PhysNodeKind::FileScan { - paths, + scan_sources, file_info, hive_parts, output_schema, @@ -354,7 +354,14 @@ pub fn lower_ir( } }, - _ => todo!(), + IR::PythonScan { .. } => todo!(), + IR::Reduce { .. } => todo!(), + IR::Cache { .. } => todo!(), + IR::GroupBy { .. } => todo!(), + IR::Join { .. } => todo!(), + IR::Distinct { .. } => todo!(), + IR::ExtContext { .. 
} => todo!(), + IR::Invalid => unreachable!(), }; Ok(phys_sm.insert(PhysNode::new(output_schema, node_kind))) diff --git a/crates/polars-stream/src/physical_plan/mod.rs b/crates/polars-stream/src/physical_plan/mod.rs index 99103343565a..e4ba35ce767e 100644 --- a/crates/polars-stream/src/physical_plan/mod.rs +++ b/crates/polars-stream/src/physical_plan/mod.rs @@ -1,4 +1,3 @@ -use std::path::PathBuf; use std::sync::Arc; use polars_core::frame::DataFrame; @@ -6,7 +5,7 @@ use polars_core::prelude::{InitHashMaps, PlHashMap, SortMultipleOptions}; use polars_core::schema::{Schema, SchemaRef}; use polars_error::PolarsResult; use polars_plan::plans::hive::HivePartitions; -use polars_plan::plans::{AExpr, DataFrameUdf, FileInfo, FileScan, IR}; +use polars_plan::plans::{AExpr, DataFrameUdf, FileInfo, FileScan, ScanSources, IR}; use polars_plan::prelude::expr_ir::ExprIR; mod fmt; @@ -119,7 +118,7 @@ pub enum PhysNodeKind { }, FileScan { - paths: Arc>, + scan_sources: ScanSources, file_info: FileInfo, hive_parts: Option>>, predicate: Option, diff --git a/crates/polars-stream/src/physical_plan/to_graph.rs b/crates/polars-stream/src/physical_plan/to_graph.rs index d0bd342b0f65..e5cbf86b0351 100644 --- a/crates/polars-stream/src/physical_plan/to_graph.rs +++ b/crates/polars-stream/src/physical_plan/to_graph.rs @@ -256,7 +256,7 @@ fn to_graph_rec<'a>( v @ FileScan { .. 
} => { let FileScan { - paths, + scan_sources, file_info, hive_parts, output_schema, @@ -293,18 +293,19 @@ fn to_graph_rec<'a>( FileScan::Parquet { options, cloud_options, - metadata: _, + metadata: first_metadata, } => { if std::env::var("POLARS_DISABLE_PARQUET_SOURCE").as_deref() != Ok("1") { ctx.graph.add_node( nodes::parquet_source::ParquetSourceNode::new( - paths, + scan_sources, file_info, hive_parts, predicate, options, cloud_options, file_options, + first_metadata.unwrap(), ), [], ) diff --git a/crates/polars-stream/src/utils/late_materialized_df.rs b/crates/polars-stream/src/utils/late_materialized_df.rs index 2173598d5369..b18c5cea0657 100644 --- a/crates/polars-stream/src/utils/late_materialized_df.rs +++ b/crates/polars-stream/src/utils/late_materialized_df.rs @@ -4,7 +4,7 @@ use parking_lot::Mutex; use polars_core::frame::DataFrame; use polars_core::schema::Schema; use polars_error::PolarsResult; -use polars_plan::plans::{AnonymousScan, AnonymousScanArgs, FileInfo, FileScan, IR}; +use polars_plan::plans::{AnonymousScan, AnonymousScanArgs, FileInfo, FileScan, ScanSources, IR}; use polars_plan::prelude::{AnonymousScanOptions, FileScanOptions}; /// Used to insert a dataframe into in-memory-engine query plan after the query @@ -25,7 +25,7 @@ impl LateMaterializedDataFrame { fmt_str: "LateMaterializedDataFrame", }); IR::Scan { - paths: Arc::new(vec![]), + sources: ScanSources::Paths(Arc::default()), file_info: FileInfo::new(schema, None, (None, usize::MAX)), hive_parts: None, predicate: None, diff --git a/crates/polars-utils/src/binary_search.rs b/crates/polars-utils/src/binary_search.rs index b24aa3e33877..5cabb5fab654 100644 --- a/crates/polars-utils/src/binary_search.rs +++ b/crates/polars-utils/src/binary_search.rs @@ -1,3 +1,8 @@ +use std::cmp::Ordering; +use std::cmp::Ordering::{Greater, Less}; + +use crate::slice::GetSaferUnchecked; + /// Find the index of the first element of `arr` that is greater /// or equal to `val`. 
/// Assumes that `arr` is sorted. @@ -23,3 +28,66 @@ where Err(x) => x, } } + +// https://en.wikipedia.org/wiki/Exponential_search +// Use if you expect matches to be close by. Otherwise use binary search. +pub trait ExponentialSearch { + fn exponential_search_by(&self, f: F) -> Result + where + F: FnMut(&T) -> Ordering; + + fn partition_point_exponential

(&self, mut pred: P) -> usize + where + P: FnMut(&T) -> bool, + { + self.exponential_search_by(|x| if pred(x) { Less } else { Greater }) + .unwrap_or_else(|i| i) + } +} + +impl ExponentialSearch for &[T] { + fn exponential_search_by(&self, mut f: F) -> Result + where + F: FnMut(&T) -> Ordering, + { + if self.is_empty() { + return Err(0); + } + + let mut bound = 1; + + while bound < self.len() { + // SAFETY + // Bound is always >=0 and < len. + let cmp = f(unsafe { self.get_unchecked_release(bound) }); + + if cmp == Greater { + break; + } + bound *= 2 + } + let end_bound = std::cmp::min(self.len(), bound); + // SAFETY: + // We checked the end bound and previous bound was within slice as per the `while` condition. + let prev_bound = bound / 2; + + let slice = unsafe { self.get_unchecked_release(prev_bound..end_bound) }; + + match slice.binary_search_by(f) { + Ok(i) => Ok(i + prev_bound), + Err(i) => Err(i + prev_bound), + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_partition_point() { + let v = [1, 2, 3, 3, 5, 6, 7]; + let i = v.as_slice().partition_point_exponential(|&x| x < 5); + assert_eq!(i, 4); + } +} diff --git a/crates/polars-utils/src/mmap.rs b/crates/polars-utils/src/mmap.rs index d8db6d0ae671..cd33ab85438a 100644 --- a/crates/polars-utils/src/mmap.rs +++ b/crates/polars-utils/src/mmap.rs @@ -1,14 +1,16 @@ +use std::fs::File; use std::io; -use std::sync::Arc; pub use memmap::Mmap; mod private { + use std::fs::File; use std::ops::Deref; use std::sync::Arc; - pub use memmap::Mmap; + use polars_error::PolarsResult; + use super::MMapSemaphore; use crate::mem::prefetch_l2; /// A read-only reference to a slice of memory that can potentially be memory-mapped. 
@@ -34,7 +36,7 @@ mod private { #[allow(unused)] enum MemSliceInner { Bytes(bytes::Bytes), - Mmap(Arc), + Mmap(Arc), } impl Deref for MemSlice { @@ -46,6 +48,13 @@ mod private { } } + impl AsRef<[u8]> for MemSlice { + #[inline(always)] + fn as_ref(&self) -> &[u8] { + self.slice + } + } + impl Default for MemSlice { fn default() -> Self { Self::from_bytes(bytes::Bytes::new()) @@ -53,6 +62,8 @@ mod private { } impl MemSlice { + pub const EMPTY: Self = Self::from_static(&[]); + /// Copy the contents into a new owned `Vec` #[inline(always)] pub fn to_vec(self) -> Vec { @@ -75,7 +86,7 @@ mod private { } #[inline] - pub fn from_mmap(mmap: Arc) -> Self { + pub fn from_mmap(mmap: Arc) -> Self { Self { slice: unsafe { std::mem::transmute::<&[u8], &'static [u8]>(mmap.as_ref().as_ref()) @@ -84,10 +95,17 @@ mod private { } } + #[inline] + pub fn from_file(file: &File) -> PolarsResult { + let mmap = MMapSemaphore::new_from_file(file)?; + Ok(Self::from_mmap(Arc::new(mmap))) + } + /// Construct a `MemSlice` that simply wraps around a `&[u8]`. #[inline] - pub fn from_slice(slice: &'static [u8]) -> Self { - Self::from_bytes(bytes::Bytes::from_static(slice)) + pub const fn from_static(slice: &'static [u8]) -> Self { + let inner = MemSliceInner::Bytes(bytes::Bytes::from_static(slice)); + Self { slice, inner } } /// Attempt to prefetch the memory belonging to to this [`MemSlice`] @@ -108,6 +126,8 @@ mod private { } } +use memmap::MmapOptions; +use polars_error::{polars_bail, PolarsResult}; pub use private::MemSlice; /// A cursor over a [`MemSlice`]. @@ -149,16 +169,11 @@ impl MemReader { Self::new(MemSlice::from_bytes(bytes)) } - #[inline(always)] - pub fn from_mmap(mmap: Arc) -> Self { - Self::new(MemSlice::from_mmap(mmap)) - } - // Construct a `MemSlice` that simply wraps around a `&[u8]`. The caller must ensure the /// slice outlives the returned `MemSlice`. 
#[inline] pub fn from_slice(slice: &'static [u8]) -> Self { - Self::new(MemSlice::from_slice(slice)) + Self::new(MemSlice::from_static(slice)) } #[inline(always)] @@ -224,8 +239,91 @@ impl io::Seek for MemReader { } } -mod tests { +// Keep track of memory mapped files so we don't write to them while reading +// Use a btree as it uses less memory than a hashmap and this thing never shrinks. +// Write handle in Windows is exclusive, so this is only necessary in Unix. +#[cfg(target_family = "unix")] +static MEMORY_MAPPED_FILES: once_cell::sync::Lazy< + std::sync::Mutex>, +> = once_cell::sync::Lazy::new(|| std::sync::Mutex::new(Default::default())); + +#[derive(Debug)] +pub struct MMapSemaphore { + #[cfg(target_family = "unix")] + key: (u64, u64), + mmap: Mmap, +} + +impl MMapSemaphore { + pub fn new_from_file_with_options( + file: &File, + options: MmapOptions, + ) -> PolarsResult { + let mmap = unsafe { options.map(file) }?; + + #[cfg(target_family = "unix")] + { + use std::os::unix::fs::MetadataExt; + let metadata = file.metadata()?; + + let mut guard = MEMORY_MAPPED_FILES.lock().unwrap(); + let key = (metadata.dev(), metadata.ino()); + match guard.entry(key) { + std::collections::btree_map::Entry::Occupied(mut e) => *e.get_mut() += 1, + std::collections::btree_map::Entry::Vacant(e) => _ = e.insert(1), + } + Ok(Self { key, mmap }) + } + + #[cfg(not(target_family = "unix"))] + Ok(Self { mmap }) + } + pub fn new_from_file(file: &File) -> PolarsResult { + Self::new_from_file_with_options(file, MmapOptions::default()) + } + + pub fn as_ptr(&self) -> *const u8 { + self.mmap.as_ptr() + } +} + +impl AsRef<[u8]> for MMapSemaphore { + #[inline] + fn as_ref(&self) -> &[u8] { + self.mmap.as_ref() + } +} + +#[cfg(target_family = "unix")] +impl Drop for MMapSemaphore { + fn drop(&mut self) { + let mut guard = MEMORY_MAPPED_FILES.lock().unwrap(); + if let std::collections::btree_map::Entry::Occupied(mut e) = guard.entry(self.key) { + let v = e.get_mut(); + *v -= 1; + + if *v == 0 
{ + e.remove_entry(); + } + } + } +} + +pub fn ensure_not_mapped(#[allow(unused)] file: &File) -> PolarsResult<()> { + #[cfg(target_family = "unix")] + { + use std::os::unix::fs::MetadataExt; + let guard = MEMORY_MAPPED_FILES.lock().unwrap(); + let metadata = file.metadata()?; + if guard.contains_key(&(metadata.dev(), metadata.ino())) { + polars_bail!(ComputeError: "cannot write to file: already memory mapped"); + } + } + Ok(()) +} + +mod tests { #[test] fn test_mem_slice_zero_copy() { use std::sync::Arc; @@ -264,9 +362,11 @@ mod tests { } { + use crate::mmap::MMapSemaphore; + let path = "../../examples/datasets/foods1.csv"; let file = std::fs::File::open(path).unwrap(); - let mmap = unsafe { memmap::Mmap::map(&file) }.unwrap(); + let mmap = MMapSemaphore::new_from_file(&file).unwrap(); let ptr = mmap.as_ptr(); let mem_slice = MemSlice::from_mmap(Arc::new(mmap)); @@ -280,8 +380,9 @@ mod tests { let slice = vec.as_slice(); let ptr = slice.as_ptr(); - let mem_slice = - MemSlice::from_slice(unsafe { std::mem::transmute::<&[u8], &'static [u8]>(slice) }); + let mem_slice = MemSlice::from_static(unsafe { + std::mem::transmute::<&[u8], &'static [u8]>(slice) + }); let ptr_out = mem_slice.as_ptr(); assert_eq!(ptr_out, ptr); @@ -296,8 +397,9 @@ mod tests { let vec = vec![1u8, 2, 3, 4, 5]; let slice = vec.as_slice(); - let mem_slice = - MemSlice::from_slice(unsafe { std::mem::transmute::<&[u8], &'static [u8]>(slice) }); + let mem_slice = MemSlice::from_static(unsafe { + std::mem::transmute::<&[u8], &'static [u8]>(slice) + }); let out = &*mem_slice.slice(3..5); assert_eq!(out, &slice[3..5]); diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index a27907484369..b858dbc36678 100644 --- a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -130,6 +130,7 @@ approx_unique = ["polars-lazy?/approx_unique", "polars-ops/approx_unique"] arg_where = ["polars-lazy?/arg_where"] array_any_all = ["polars-lazy?/array_any_all", "dtype-array"] asof_join = 
["polars-lazy?/asof_join", "polars-ops/asof_join"] +iejoin = ["polars-lazy?/iejoin"] binary_encoding = ["polars-ops/binary_encoding", "polars-lazy?/binary_encoding", "polars-sql?/binary_encoding"] business = ["polars-lazy?/business", "polars-ops/business"] checked_arithmetic = ["polars-core/checked_arithmetic"] diff --git a/crates/polars/tests/it/io/csv.rs b/crates/polars/tests/it/io/csv.rs index 604ef2782959..992754436c60 100644 --- a/crates/polars/tests/it/io/csv.rs +++ b/crates/polars/tests/it/io/csv.rs @@ -520,7 +520,7 @@ fn test_empty_bytes_to_dataframe() { let result = CsvReadOptions::default() .with_has_header(false) - .with_columns(Some(schema.iter_names().cloned().collect())) + .with_columns(Some(schema.iter_names_cloned().collect())) .with_schema(Some(Arc::new(schema))) .into_reader_with_file_handle(file) .finish(); diff --git a/crates/polars/tests/it/io/parquet/read/file.rs b/crates/polars/tests/it/io/parquet/read/file.rs index 5007dcdf0755..d2be2c5402d9 100644 --- a/crates/polars/tests/it/io/parquet/read/file.rs +++ b/crates/polars/tests/it/io/parquet/read/file.rs @@ -4,7 +4,7 @@ use arrow::array::Array; use arrow::datatypes::ArrowSchema; use arrow::record_batch::RecordBatchT; use polars_error::PolarsResult; -use polars_parquet::read::{Filter, RowGroupMetaData}; +use polars_parquet::read::{Filter, RowGroupMetadata}; use super::row_group::{read_columns_many, RowGroupDeserializer}; @@ -25,7 +25,7 @@ impl FileReader { /// Returns a new [`FileReader`]. 
pub fn new( reader: R, - row_groups: Vec, + row_groups: Vec, schema: ArrowSchema, limit: Option, ) -> Self { @@ -104,7 +104,7 @@ impl Iterator for FileReader { pub struct RowGroupReader { reader: R, schema: ArrowSchema, - row_groups: std::vec::IntoIter, + row_groups: std::vec::IntoIter, remaining_rows: usize, } @@ -113,7 +113,7 @@ impl RowGroupReader { pub fn new( reader: R, schema: ArrowSchema, - row_groups: Vec, + row_groups: Vec, limit: Option, ) -> Self { Self { diff --git a/crates/polars/tests/it/io/parquet/read/mod.rs b/crates/polars/tests/it/io/parquet/read/mod.rs index 73625107685f..c4ba7d5e418e 100644 --- a/crates/polars/tests/it/io/parquet/read/mod.rs +++ b/crates/polars/tests/it/io/parquet/read/mod.rs @@ -17,11 +17,9 @@ use std::fs::File; use dictionary::DecodedDictPage; use polars_parquet::parquet::encoding::hybrid_rle::HybridRleDecoder; use polars_parquet::parquet::error::{ParquetError, ParquetResult}; -use polars_parquet::parquet::metadata::ColumnChunkMetaData; +use polars_parquet::parquet::metadata::ColumnChunkMetadata; use polars_parquet::parquet::page::DataPage; -use polars_parquet::parquet::read::{ - get_column_iterator, get_field_columns, read_metadata, BasicDecompressor, -}; +use polars_parquet::parquet::read::{get_column_iterator, read_metadata, BasicDecompressor}; use polars_parquet::parquet::schema::types::{GroupConvertedType, ParquetType}; use polars_parquet::parquet::schema::Repetition; use polars_parquet::parquet::types::int96_to_i64_ns; @@ -143,9 +141,9 @@ pub fn page_to_array(page: &DataPage, dict: Option<&DecodedDictPage>) -> Parquet /// Reads columns into an [`Array`]. /// This is CPU-intensive: decompress, decode and de-serialize. 
-pub fn columns_to_array(mut columns: I, field: &ParquetType) -> ParquetResult +pub fn columns_to_array<'a, I>(mut columns: I, field: &ParquetType) -> ParquetResult where - I: Iterator>, + I: Iterator>, { let mut validity = vec![]; let mut has_filled = false; @@ -205,7 +203,8 @@ pub fn read_column( usize::MAX, ); - let mut statistics = get_field_columns(metadata.row_groups[row_group].columns(), field.name()) + let mut statistics = metadata.row_groups[row_group] + .columns_under_root_iter(field.name()) .map(|column_meta| column_meta.statistics().transpose()) .collect::>>()?; diff --git a/crates/polars/tests/it/io/parquet/read/row_group.rs b/crates/polars/tests/it/io/parquet/read/row_group.rs index 54c8d17c0076..6d567a120c92 100644 --- a/crates/polars/tests/it/io/parquet/read/row_group.rs +++ b/crates/polars/tests/it/io/parquet/read/row_group.rs @@ -6,9 +6,9 @@ use arrow::record_batch::RecordBatchT; use polars::prelude::ArrowSchema; use polars_error::PolarsResult; use polars_parquet::arrow::read::{column_iter_to_arrays, Filter}; -use polars_parquet::parquet::metadata::ColumnChunkMetaData; -use polars_parquet::parquet::read::{get_field_columns, BasicDecompressor, PageReader}; -use polars_parquet::read::RowGroupMetaData; +use polars_parquet::parquet::metadata::ColumnChunkMetadata; +use polars_parquet::parquet::read::{BasicDecompressor, PageReader}; +use polars_parquet::read::RowGroupMetadata; use polars_utils::mmap::MemReader; /// An [`Iterator`] of [`RecordBatchT`] that (dynamically) adapts a vector of iterators of [`Array`] into @@ -70,23 +70,25 @@ impl Iterator for RowGroupDeserializer { /// the field (one for non-nested types) pub fn read_columns<'a, R: Read + Seek>( reader: &mut R, - columns: &'a [ColumnChunkMetaData], + row_group_metadata: &'a RowGroupMetadata, field_name: &'a str, -) -> PolarsResult)>> { - get_field_columns(columns, field_name) +) -> PolarsResult)>> { + row_group_metadata + .columns_under_root_iter(field_name) .map(|meta| 
_read_single_column(reader, meta)) .collect() } fn _read_single_column<'a, R>( reader: &mut R, - meta: &'a ColumnChunkMetaData, -) -> PolarsResult<(&'a ColumnChunkMetaData, Vec)> + meta: &'a ColumnChunkMetadata, +) -> PolarsResult<(&'a ColumnChunkMetadata, Vec)> where R: Read + Seek, { - let (start, length) = meta.byte_range(); - reader.seek(std::io::SeekFrom::Start(start))?; + let byte_range = meta.byte_range(); + let length = byte_range.end - byte_range.start; + reader.seek(std::io::SeekFrom::Start(byte_range.start))?; let mut chunk = vec![]; chunk.try_reserve(length as usize)?; @@ -97,7 +99,7 @@ where /// Converts a vector of columns associated with the parquet field whose name is [`Field`] /// to an iterator of [`Array`], [`ArrayIter`] of chunk size `chunk_size`. pub fn to_deserializer( - columns: Vec<(&ColumnChunkMetaData, Vec)>, + columns: Vec<(&ColumnChunkMetadata, Vec)>, field: Field, filter: Option, ) -> PolarsResult> { @@ -133,7 +135,7 @@ pub fn to_deserializer( /// and convert them to [`ArrayIter`] via [`to_deserializer`]. pub fn read_columns_many( reader: &mut R, - row_group: &RowGroupMetaData, + row_group: &RowGroupMetadata, fields: &ArrowSchema, filter: Option, ) -> PolarsResult>> { @@ -141,7 +143,7 @@ pub fn read_columns_many( // This operation is IO-bounded `O(C)` where C is the number of columns in the row group let field_columns = fields .iter_values() - .map(|field| read_columns(reader, row_group.columns(), &field.name)) + .map(|field| read_columns(reader, row_group, &field.name)) .collect::>>()?; field_columns diff --git a/crates/polars/tests/it/io/parquet/write/mod.rs b/crates/polars/tests/it/io/parquet/write/mod.rs index 02715030fb14..4403277a0552 100644 --- a/crates/polars/tests/it/io/parquet/write/mod.rs +++ b/crates/polars/tests/it/io/parquet/write/mod.rs @@ -213,7 +213,11 @@ fn basic() -> ParquetResult<()> { // validated against an equivalent array produced by pyarrow. 
let expected = 51; assert_eq!( - metadata.row_groups[0].columns()[0].uncompressed_size(), + metadata.row_groups[0] + .columns_under_root_iter("col") + .next() + .unwrap() + .uncompressed_size(), expected ); diff --git a/docs/user-guide/misc/multiprocessing.md b/docs/user-guide/misc/multiprocessing.md index 4973da8c0155..d46a96a52bc5 100644 --- a/docs/user-guide/misc/multiprocessing.md +++ b/docs/user-guide/misc/multiprocessing.md @@ -52,7 +52,6 @@ Consider the example below, which is a slightly modified example posted on the [ {{code_block('user-guide/misc/multiprocess','example1',[])}} Using `fork` as the method, instead of `spawn`, will cause a dead lock. -Please note: Polars will not even start and raise the error on multiprocessing method being set wrong, but if the check had not been there, the deadlock would exist. The fork method is equivalent to calling `os.fork()`, which is a system call as defined in [the POSIX standard](https://pubs.opengroup.org/onlinepubs/9699919799/functions/fork.html): diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index a42c643516ea..1147cbdde89a 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -9,7 +9,7 @@ crate-type = ["cdylib"] [dependencies] libc = { workspace = true } -polars-python = { workspace = true, features = ["pymethods"] } +polars-python = { workspace = true, features = ["pymethods", "iejoin"] } pyo3 = { workspace = true, features = ["abi3-py38", "chrono", "extension-module", "multiple-pymethods"] } [build-dependencies] diff --git a/py-polars/docs/source/reference/config.rst b/py-polars/docs/source/reference/config.rst index 289656b2a313..edc606a31e60 100644 --- a/py-polars/docs/source/reference/config.rst +++ b/py-polars/docs/source/reference/config.rst @@ -20,10 +20,10 @@ Config options Config.set_tbl_cell_alignment Config.set_tbl_cell_numeric_alignment Config.set_tbl_cols - Config.set_tbl_column_dtype_inline + Config.set_tbl_column_data_type_inline Config.set_tbl_dataframe_shape_below 
Config.set_tbl_formatting - Config.set_tbl_hide_column_dtypes + Config.set_tbl_hide_column_data_types Config.set_tbl_hide_column_names Config.set_tbl_hide_dataframe_shape Config.set_tbl_hide_dtype_separator diff --git a/py-polars/docs/source/reference/dataframe/modify_select.rst b/py-polars/docs/source/reference/dataframe/modify_select.rst index 4d9e62556533..b3a3d024ebd2 100644 --- a/py-polars/docs/source/reference/dataframe/modify_select.rst +++ b/py-polars/docs/source/reference/dataframe/modify_select.rst @@ -35,6 +35,7 @@ Manipulation/selection DataFrame.iter_slices DataFrame.join DataFrame.join_asof + DataFrame.join_where DataFrame.limit DataFrame.melt DataFrame.merge_sorted diff --git a/py-polars/docs/source/reference/lazyframe/modify_select.rst b/py-polars/docs/source/reference/lazyframe/modify_select.rst index 925591ed8649..f26a600966d2 100644 --- a/py-polars/docs/source/reference/lazyframe/modify_select.rst +++ b/py-polars/docs/source/reference/lazyframe/modify_select.rst @@ -26,6 +26,7 @@ Manipulation/selection LazyFrame.interpolate LazyFrame.join LazyFrame.join_asof + LazyFrame.join_where LazyFrame.last LazyFrame.limit LazyFrame.melt diff --git a/py-polars/polars/_utils/various.py b/py-polars/polars/_utils/various.py index 014e601de8e2..f82bbec0d785 100644 --- a/py-polars/polars/_utils/various.py +++ b/py-polars/polars/_utils/various.py @@ -84,6 +84,24 @@ def _is_iterable_of(val: Iterable[object], eltype: type | tuple[type, ...]) -> b return all(isinstance(x, eltype) for x in val) +def is_path_or_str_sequence( + val: object, *, allow_str: bool = False, include_series: bool = False +) -> TypeGuard[Sequence[str | Path]]: + """ + Check that `val` is a sequence of strings or paths. + + Note that a single string is a sequence of strings by definition, use + `allow_str=False` to return False on a single string. 
+ """ + if allow_str is False and isinstance(val, str): + return False + elif _check_for_numpy(val) and isinstance(val, np.ndarray): + return np.issubdtype(val.dtype, np.str_) + elif include_series and isinstance(val, pl.Series): + return val.dtype == pl.String + return isinstance(val, Sequence) and _is_iterable_of(val, (Path, str)) + + def is_bool_sequence( val: object, *, include_series: bool = False ) -> TypeGuard[Sequence[bool]]: diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 3d5d974f6e90..2d8710b9f431 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -2857,10 +2857,20 @@ def write_csv( if not null_value: null_value = None + def write_csv_to_string() -> str: + with BytesIO() as buf: + self.write_csv(buf) + csv_bytes = buf.getvalue() + return csv_bytes.decode("utf8") + should_return_buffer = False if file is None: buffer = file = BytesIO() should_return_buffer = True + elif isinstance(file, StringIO): + csv_str = write_csv_to_string() + file.write(csv_str) + return None elif isinstance(file, (str, os.PathLike)): file = normalize_filepath(file) @@ -7085,6 +7095,85 @@ def join( .collect(_eager=True) ) + @unstable() + def join_where( + self, + other: DataFrame, + *predicates: Expr | Iterable[Expr], + suffix: str = "_right", + ) -> DataFrame: + """ + Perform a join based on one or multiple equality predicates. + + .. warning:: + This functionality is experimental. It may be + changed at any point without it being considered a breaking change. + + A row from this table may be included in zero or multiple rows in the result, + and the relative order of rows may differ between the input and output tables. + + Parameters + ---------- + other + DataFrame to join with. + *predicates + (In)Equality condition to join the two table on. + When a column name occurs in both tables, the proper suffix must + be applied in the predicate. 
+ suffix + Suffix to append to columns with a duplicate name. + + Examples + -------- + >>> east = pl.DataFrame( + ... { + ... "id": [100, 101, 102], + ... "dur": [120, 140, 160], + ... "rev": [12, 14, 16], + ... "cores": [2, 8, 4], + ... } + ... ) + >>> west = pl.DataFrame( + ... { + ... "t_id": [404, 498, 676, 742], + ... "time": [90, 130, 150, 170], + ... "cost": [9, 13, 15, 16], + ... "cores": [4, 2, 1, 4], + ... } + ... ) + >>> east.join_where( + ... west, + ... pl.col("dur") < pl.col("time"), + ... pl.col("rev") < pl.col("cost"), + ... ) + shape: (5, 8) + β”Œβ”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ id ┆ dur ┆ rev ┆ cores ┆ t_id ┆ time ┆ cost ┆ cores_right β”‚ + β”‚ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- β”‚ + β”‚ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 β”‚ + β•žβ•β•β•β•β•β•ͺ═════β•ͺ═════β•ͺ═══════β•ͺ══════β•ͺ══════β•ͺ══════β•ͺ═════════════║ + β”‚ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 498 ┆ 130 ┆ 13 ┆ 2 β”‚ + β”‚ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 676 ┆ 150 ┆ 15 ┆ 1 β”‚ + β”‚ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 742 ┆ 170 ┆ 16 ┆ 4 β”‚ + β”‚ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 676 ┆ 150 ┆ 15 ┆ 1 β”‚ + β”‚ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 742 ┆ 170 ┆ 16 ┆ 4 β”‚ + β””β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + + """ + if not isinstance(other, DataFrame): + msg = f"expected `other` join table to be a DataFrame, got {type(other).__name__!r}" + raise TypeError(msg) + + return ( + self.lazy() + .join_where( + other.lazy(), + *predicates, + suffix=suffix, + ) + .collect(_eager=True) + ) + def map_rows( self, function: Callable[[tuple[Any, ...]], Any], diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index 312866a76fe8..e24297aa93ee 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ 
-10247,7 +10247,12 @@ def replace( old, new, default=default, return_dtype=return_dtype ) - if new is no_default and isinstance(old, Mapping): + if new is no_default: + if not isinstance(old, Mapping): + msg = ( + "`new` argument is required if `old` argument is not a Mapping type" + ) + raise TypeError(msg) new = pl.Series(old.values()) old = pl.Series(old.keys()) else: @@ -10257,7 +10262,7 @@ def replace( new = pl.Series(new) old = parse_into_expression(old, str_as_lit=True) # type: ignore[arg-type] - new = parse_into_expression(new, str_as_lit=True) # type: ignore[arg-type] + new = parse_into_expression(new, str_as_lit=True) result = self._from_pyexpr(self._pyexpr.replace(old, new)) @@ -10438,7 +10443,12 @@ def replace_strict( β”‚ 3 ┆ 1.0 ┆ 10.0 β”‚ β””β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ """ # noqa: W505 - if new is no_default and isinstance(old, Mapping): + if new is no_default: + if not isinstance(old, Mapping): + msg = ( + "`new` argument is required if `old` argument is not a Mapping type" + ) + raise TypeError(msg) new = pl.Series(old.values()) old = pl.Series(old.keys()) diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 3a27911d716e..ceba49391560 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -11,6 +11,7 @@ from polars._utils.deprecation import deprecate_renamed_parameter from polars._utils.various import ( _process_null_values, + is_path_or_str_sequence, is_str_sequence, normalize_filepath, ) @@ -443,6 +444,8 @@ def read_csv( # * The `storage_options` configuration keys are different between # fsspec and object_store (would require a breaking change) ): + source = normalize_filepath(v, check_not_directory=False) + if schema_overrides_is_list: msg = "passing a list to `schema_overrides` is unsupported for hf:// paths" raise ValueError(msg) @@ -451,7 +454,7 @@ def read_csv( raise ValueError(msg) lf = _scan_csv_impl( - source, # 
type: ignore[arg-type] + source, has_header=has_header, separator=separator, comment_prefix=comment_prefix, @@ -984,7 +987,16 @@ def read_csv_batched( @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_csv( - source: str | Path | list[str] | list[Path], + source: str + | Path + | IO[str] + | IO[bytes] + | bytes + | list[str] + | list[Path] + | list[IO[str]] + | list[IO[bytes]] + | list[bytes], *, has_header: bool = True, separator: str = ",", @@ -1232,7 +1244,7 @@ def with_column_names(cols: list[str]) -> list[str]: if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - else: + elif is_path_or_str_sequence(source, allow_str=False): source = [ normalize_filepath(source, check_not_directory=False) for source in source ] @@ -1276,7 +1288,15 @@ def with_column_names(cols: list[str]) -> list[str]: def _scan_csv_impl( - source: str | list[str] | list[Path], + source: str + | IO[str] + | IO[bytes] + | bytes + | list[str] + | list[Path] + | list[IO[str]] + | list[IO[bytes]] + | list[bytes], *, has_header: bool = True, separator: str = ",", @@ -1329,8 +1349,8 @@ def _scan_csv_impl( storage_options = None pylf = PyLazyFrame.new_from_csv( - path=source, - paths=sources, + source, + sources, separator=separator, has_header=has_header, ignore_errors=ignore_errors, diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index 4443c31d513f..43fbc8136de2 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -9,6 +9,7 @@ import polars.functions as F from polars._utils.deprecation import deprecate_renamed_parameter from polars._utils.various import ( + is_path_or_str_sequence, is_str_sequence, normalize_filepath, ) @@ -111,9 +112,8 @@ def read_ipc( raise ValueError(msg) lf = scan_ipc( - source, # type: ignore[arg-type] + source, 
n_rows=n_rows, - memory_map=memory_map, storage_options=storage_options, row_index_name=row_index_name, row_index_offset=row_index_offset, @@ -188,7 +188,6 @@ def _read_ipc_impl( rechunk=rechunk, row_index_name=row_index_name, row_index_offset=row_index_offset, - memory_map=memory_map, ) if columns is None: df = scan.collect() @@ -346,7 +345,14 @@ def read_ipc_schema(source: str | Path | IO[bytes] | bytes) -> dict[str, DataTyp @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_ipc( - source: str | Path | list[str] | list[Path], + source: str + | Path + | IO[bytes] + | bytes + | list[str] + | list[Path] + | list[IO[bytes]] + | list[bytes], *, n_rows: int | None = None, cache: bool = True, @@ -426,15 +432,23 @@ def scan_ipc( include_file_paths Include the path of the source file(s) as a column with this name. """ + sources: list[str] | list[Path] | list[IO[bytes]] | list[bytes] = [] if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - sources = [] - else: - sources = [ - normalize_filepath(source, check_not_directory=False) for source in source - ] + elif isinstance(source, list): + if is_path_or_str_sequence(source): + sources = [ + normalize_filepath(source, check_not_directory=False) + for source in source + ] + else: + sources = source + source = None # type: ignore[assignment] + # Memory Mapping is now a no-op + _ = memory_map + pylf = PyLazyFrame.new_from_ipc( source, sources, @@ -442,7 +456,6 @@ def scan_ipc( cache, rechunk, parse_row_index_args(row_index_name, row_index_offset), - memory_map=memory_map, cloud_options=storage_options, retries=retries, file_cache_ttl=file_cache_ttl, diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index e8eccca53ccd..cd9ea92bf3c0 100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -3,10 +3,10 @@ 
import contextlib from io import BytesIO, StringIO from pathlib import Path -from typing import TYPE_CHECKING, Any, Sequence +from typing import IO, TYPE_CHECKING, Any, Sequence from polars._utils.deprecation import deprecate_renamed_parameter -from polars._utils.various import normalize_filepath +from polars._utils.various import is_path_or_str_sequence, normalize_filepath from polars._utils.wrap import wrap_df, wrap_ldf from polars.datatypes import N_INFER_DEFAULT from polars.io._utils import parse_row_index_args @@ -145,7 +145,7 @@ def read_ndjson( return df return scan_ndjson( - source, # type: ignore[arg-type] + source, schema=schema, schema_overrides=schema_overrides, infer_schema_length=infer_schema_length, @@ -166,7 +166,16 @@ def read_ndjson( @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_ndjson( - source: str | Path | list[str] | list[Path], + source: str + | Path + | IO[str] + | IO[bytes] + | bytes + | list[str] + | list[Path] + | list[IO[str]] + | list[IO[bytes]] + | bytes, *, schema: SchemaDefinition | None = None, schema_overrides: SchemaDefinition | None = None, @@ -247,14 +256,20 @@ def scan_ndjson( include_file_paths Include the path of the source file(s) as a column with this name. 
""" + sources: list[str] | list[Path] | list[IO[str]] | list[IO[bytes]] = [] if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - sources = [] - else: - sources = [ - normalize_filepath(source, check_not_directory=False) for source in source - ] + elif isinstance(source, list): + if is_path_or_str_sequence(source): + sources = [ + normalize_filepath(source, check_not_directory=False) + for source in source + ] + else: + sources = source + source = None # type: ignore[assignment] + if infer_schema_length == 0: msg = "'infer_schema_length' should be positive" raise ValueError(msg) @@ -266,8 +281,8 @@ def scan_ndjson( storage_options = None pylf = PyLazyFrame.new_from_ndjson( - path=source, - paths=sources, + source, + sources, infer_schema_length=infer_schema_length, schema=schema, schema_overrides=schema_overrides, diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 90b6137c4924..bc434b05cc2d 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -3,26 +3,27 @@ import contextlib import io from pathlib import Path -from typing import IO, TYPE_CHECKING, Any, Sequence +from typing import IO, TYPE_CHECKING, Any import polars.functions as F +from polars import concat as plconcat from polars._utils.deprecation import deprecate_renamed_parameter from polars._utils.unstable import issue_unstable_warning from polars._utils.various import ( is_int_sequence, + is_path_or_str_sequence, normalize_filepath, ) -from polars._utils.wrap import wrap_df, wrap_ldf +from polars._utils.wrap import wrap_ldf from polars.convert import from_arrow from polars.dependencies import import_optional from polars.io._utils import ( - parse_columns_arg, parse_row_index_args, prepare_file_arg, ) with contextlib.suppress(ImportError): - from polars.polars import PyDataFrame, PyLazyFrame + from polars.polars import PyLazyFrame from polars.polars import 
read_parquet_schema as _read_parquet_schema if TYPE_CHECKING: @@ -33,7 +34,14 @@ @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def read_parquet( - source: str | Path | list[str] | list[Path] | IO[bytes] | bytes, + source: str + | Path + | IO[bytes] + | bytes + | list[str] + | list[Path] + | list[IO[bytes]] + | list[bytes], *, columns: list[int] | list[str] | None = None, n_rows: int | None = None, @@ -166,18 +174,11 @@ def read_parquet( ) # Read file and bytes inputs using `read_parquet` - elif isinstance(source, (io.IOBase, bytes)): - return _read_parquet_binary( - source, - columns=columns, - n_rows=n_rows, - parallel=parallel, - row_index_name=row_index_name, - row_index_offset=row_index_offset, - low_memory=low_memory, - use_statistics=use_statistics, - rechunk=rechunk, - ) + if isinstance(source, bytes): + source = io.BytesIO(source) + elif isinstance(source, list) and len(source) > 0 and isinstance(source[0], bytes): + assert all(isinstance(s, bytes) for s in source) + source = [io.BytesIO(s) for s in source] # type: ignore[arg-type, assignment] # For other inputs, defer to `scan_parquet` lf = scan_parquet( @@ -209,7 +210,14 @@ def read_parquet( def _read_parquet_with_pyarrow( - source: str | Path | list[str] | list[Path] | IO[bytes] | bytes, + source: str + | Path + | IO[bytes] + | bytes + | list[str] + | list[Path] + | list[IO[bytes]] + | list[bytes], *, columns: list[int] | list[str] | None = None, storage_options: dict[str, Any] | None = None, @@ -224,48 +232,35 @@ def _read_parquet_with_pyarrow( ) pyarrow_options = pyarrow_options or {} - with prepare_file_arg( - source, # type: ignore[arg-type] - use_pyarrow=True, - storage_options=storage_options, - ) as source_prep: - pa_table = pyarrow_parquet.read_table( - source_prep, - memory_map=memory_map, - columns=columns, - **pyarrow_options, - ) - return from_arrow(pa_table, 
rechunk=rechunk) # type: ignore[return-value] - + sources: list[str | Path | IO[bytes] | bytes | list[str] | list[Path]] = [] + if isinstance(source, list): + if len(source) > 0 and isinstance(source[0], (bytes, io.IOBase)): + sources = source # type: ignore[assignment] + else: + sources = [source] # type: ignore[list-item] + else: + sources = [source] -def _read_parquet_binary( - source: IO[bytes] | bytes, - *, - columns: Sequence[int] | Sequence[str] | None = None, - n_rows: int | None = None, - row_index_name: str | None = None, - row_index_offset: int = 0, - parallel: ParallelStrategy = "auto", - use_statistics: bool = True, - rechunk: bool = False, - low_memory: bool = False, -) -> DataFrame: - projection, columns = parse_columns_arg(columns) - row_index = parse_row_index_args(row_index_name, row_index_offset) + results: list[DataFrame] = [] + for source in sources: + with prepare_file_arg( + source, # type: ignore[arg-type] + use_pyarrow=True, + storage_options=storage_options, + ) as source_prep: + pa_table = pyarrow_parquet.read_table( + source_prep, + memory_map=memory_map, + columns=columns, + **pyarrow_options, + ) + result = from_arrow(pa_table, rechunk=rechunk) + results.append(result) # type: ignore[arg-type] - with prepare_file_arg(source) as source_prep: - pydf = PyDataFrame.read_parquet( - source_prep, - columns=columns, - projection=projection, - n_rows=n_rows, - row_index=row_index, - parallel=parallel, - use_statistics=use_statistics, - rechunk=rechunk, - low_memory=low_memory, - ) - return wrap_df(pydf) + if len(results) == 1: + return results[0] + else: + return plconcat(results) def read_parquet_schema(source: str | Path | IO[bytes] | bytes) -> dict[str, DataType]: @@ -295,7 +290,7 @@ def read_parquet_schema(source: str | Path | IO[bytes] | bytes) -> dict[str, Dat @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def 
scan_parquet( - source: str | Path | list[str] | list[Path], + source: str | Path | IO[bytes] | list[str] | list[Path] | list[IO[bytes]], *, n_rows: int | None = None, row_index_name: str | None = None, @@ -422,13 +417,13 @@ def scan_parquet( if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - else: + elif is_path_or_str_sequence(source): source = [ normalize_filepath(source, check_not_directory=False) for source in source ] return _scan_parquet_impl( - source, + source, # type: ignore[arg-type] n_rows=n_rows, cache=cache, parallel=parallel, @@ -448,7 +443,7 @@ def scan_parquet( def _scan_parquet_impl( - source: str | list[str] | list[Path], + source: str | list[str] | list[Path] | IO[str] | IO[bytes], *, n_rows: int | None = None, cache: bool = True, diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index dead2681da0b..ec329898441a 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -4561,6 +4561,85 @@ def join( ) ) + @unstable() + def join_where( + self, + other: LazyFrame, + *predicates: Expr | Iterable[Expr], + suffix: str = "_right", + ) -> LazyFrame: + """ + Perform a join based on one or multiple (in)equality predicates. + + A row from this table may be included in zero or multiple rows in the result, + and the relative order of rows may differ between the input and output tables. + + .. warning:: + This functionality is experimental. It may be + changed at any point without it being considered a breaking change. + + Parameters + ---------- + other + DataFrame to join with. + *predicates + (In)Equality condition to join the two table on. + When a column name occurs in both tables, the proper suffix must + be applied in the predicate. + suffix + Suffix to append to columns with a duplicate name. + + Examples + -------- + >>> east = pl.LazyFrame( + ... { + ... "id": [100, 101, 102], + ... "dur": [120, 140, 160], + ... 
"rev": [12, 14, 16], + ... "cores": [2, 8, 4], + ... } + ... ) + >>> west = pl.LazyFrame( + ... { + ... "t_id": [404, 498, 676, 742], + ... "time": [90, 130, 150, 170], + ... "cost": [9, 13, 15, 16], + ... "cores": [4, 2, 1, 4], + ... } + ... ) + >>> east.join_where( + ... west, + ... pl.col("dur") < pl.col("time"), + ... pl.col("rev") < pl.col("cost"), + ... ).collect() + shape: (5, 8) + ┌─────┬─────┬─────┬───────┬──────┬──────┬──────┬─────────────┐ + │ id ┆ dur ┆ rev ┆ cores ┆ t_id ┆ time ┆ cost ┆ cores_right │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═══════╪══════╪══════╪══════╪═════════════╡ + │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 498 ┆ 130 ┆ 13 ┆ 2 │ + │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │ + │ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │ + │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │ + │ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │ + └─────┴─────┴─────┴───────┴──────┴──────┴──────┴─────────────┘ + + """ + if not isinstance(other, LazyFrame): + msg = f"expected `other` join table to be a LazyFrame, not a {type(other).__name__!r}" + raise TypeError(msg) + + pyexprs = parse_into_list_of_expressions(*predicates) + + return self._from_pyldf( + self._ldf.join_where( + other._ldf, + pyexprs, + suffix, + ) + ) + def with_columns( self, *exprs: IntoExpr | Iterable[IntoExpr], diff --git a/py-polars/tests/benchmark/test_join_where.py b/py-polars/tests/benchmark/test_join_where.py new file mode 100644 index 000000000000..d0bfd7d15b6d --- /dev/null +++ b/py-polars/tests/benchmark/test_join_where.py @@ -0,0 +1,73 @@ +"""Benchmark tests for join_where with inequality conditions.""" + +from __future__ import 
annotations + +import numpy as np +import pytest + +import polars as pl + +pytestmark = pytest.mark.benchmark() + + +def test_strict_inequalities(east_west: tuple[pl.DataFrame, pl.DataFrame]) -> None: + east, west = east_west + result = ( + east.lazy() + .join_where( + west.lazy(), + [pl.col("dur") < pl.col("time"), pl.col("rev") > pl.col("cost")], + ) + .collect() + ) + + assert len(result) > 0 + + +def test_non_strict_inequalities(east_west: tuple[pl.DataFrame, pl.DataFrame]) -> None: + east, west = east_west + result = ( + east.lazy() + .join_where( + west.lazy(), + [pl.col("dur") <= pl.col("time"), pl.col("rev") >= pl.col("cost")], + ) + .collect() + ) + + assert len(result) > 0 + + +@pytest.fixture(scope="module") +def east_west() -> tuple[pl.DataFrame, pl.DataFrame]: + num_rows_left, num_rows_right = 50_000, 5_000 + rng = np.random.default_rng(42) + + # Generate two separate datasets where revenue/cost are linearly related to + # duration/time, but add some noise to the west table so that there are some + # rows where the cost for the same or greater time will be less than the east table. 
+ east_dur = rng.integers(1_000, 50_000, num_rows_left) + east_rev = (east_dur * 0.123).astype(np.int32) + west_time = rng.integers(1_000, 50_000, num_rows_right) + west_cost = west_time * 0.123 + west_cost += rng.normal(0.0, 1.0, num_rows_right) + west_cost = west_cost.astype(np.int32) + + east = pl.DataFrame( + { + "id": np.arange(0, num_rows_left), + "dur": east_dur, + "rev": east_rev, + "cores": rng.integers(1, 10, num_rows_left), + } + ) + west = pl.DataFrame( + { + "t_id": np.arange(0, num_rows_right), + "time": west_time, + "cost": west_cost, + "cores": rng.integers(1, 10, num_rows_right), + } + ) + + return east, west diff --git a/py-polars/tests/unit/datatypes/test_list.py b/py-polars/tests/unit/datatypes/test_list.py index 4607cfa89426..8c5502d698fd 100644 --- a/py-polars/tests/unit/datatypes/test_list.py +++ b/py-polars/tests/unit/datatypes/test_list.py @@ -114,20 +114,16 @@ def test_cast_inner() -> None: def test_list_empty_group_by_result_3521() -> None: - # Create a left relation where the join column contains a null value - left = pl.DataFrame().with_columns( - pl.lit(1).alias("group_by_column"), - pl.lit(None).cast(pl.Int32).alias("join_column"), + # Create a left relation where the join column contains a null value. + left = pl.DataFrame( + {"group_by_column": [1], "join_column": [None]}, + schema_overrides={"join_column": pl.Int64}, ) - # Create a right relation where there is a column to count distinct on - right = pl.DataFrame().with_columns( - pl.lit(1).alias("join_column"), - pl.lit(1).alias("n_unique_column"), - ) + # Create a right relation where there is a column to count distinct on. + right = pl.DataFrame({"join_column": [1], "n_unique_column": [1]}) - # Calculate n_unique after dropping nulls - # This will panic on polars version 0.13.38 and 0.13.39 + # Calculate n_unique after dropping nulls. 
result = ( left.join(right, on="join_column", how="left") .group_by("group_by_column") diff --git a/py-polars/tests/unit/datatypes/test_object.py b/py-polars/tests/unit/datatypes/test_object.py index 803e7933b8ab..8db373d3f58a 100644 --- a/py-polars/tests/unit/datatypes/test_object.py +++ b/py-polars/tests/unit/datatypes/test_object.py @@ -7,6 +7,7 @@ import polars as pl from polars.exceptions import ComputeError +from polars.testing import assert_series_equal def test_series_init_instantiated_object() -> None: @@ -190,3 +191,11 @@ def test_raise_list_object() -> None: # We don't want to support this. Unsafe enough as it is already. with pytest.raises(ValueError): pl.Series([[object()]], dtype=pl.List(pl.Object())) + + +def test_object_null_slice() -> None: + s = pl.Series("x", [1, None, 42], dtype=pl.Object) + assert_series_equal(s.is_null(), pl.Series("x", [False, True, False])) + assert_series_equal(s.slice(0, 2).is_null(), pl.Series("x", [False, True])) + assert_series_equal(s.slice(1, 1).is_null(), pl.Series("x", [True])) + assert_series_equal(s.slice(2, 1).is_null(), pl.Series("x", [False])) diff --git a/py-polars/tests/unit/datatypes/test_struct.py b/py-polars/tests/unit/datatypes/test_struct.py index 49a223f76fd4..6489a83e5a6b 100644 --- a/py-polars/tests/unit/datatypes/test_struct.py +++ b/py-polars/tests/unit/datatypes/test_struct.py @@ -265,6 +265,7 @@ def test_from_dicts_struct() -> None: ] +@pytest.mark.may_fail_auto_streaming def test_list_to_struct() -> None: df = pl.DataFrame({"a": [[1, 2, 3], [1, 2]]}) assert df.select([pl.col("a").list.to_struct()]).to_series().to_list() == [ diff --git a/py-polars/tests/unit/datatypes/test_temporal.py b/py-polars/tests/unit/datatypes/test_temporal.py index ea1798fe7114..e0c9f6498c65 100644 --- a/py-polars/tests/unit/datatypes/test_temporal.py +++ b/py-polars/tests/unit/datatypes/test_temporal.py @@ -1399,12 +1399,12 @@ def test_replace_time_zone_sortedness_expressions( from_tz: str | None, expected_sortedness: 
bool, ambiguous: str ) -> None: df = ( - pl.Series("ts", [1603584000000000, 1603587600000000]) + pl.Series("ts", [1603584000000000, 1603584060000000, 1603587600000000]) .cast(pl.Datetime("us", from_tz)) .sort() .to_frame() ) - df = df.with_columns(ambiguous=pl.Series([ambiguous] * 2)) + df = df.with_columns(ambiguous=pl.Series([ambiguous] * 3)) assert df["ts"].flags["SORTED_ASC"] result = df.select( pl.col("ts").dt.replace_time_zone("UTC", ambiguous=pl.col("ambiguous")) diff --git a/py-polars/tests/unit/functions/range/test_date_range.py b/py-polars/tests/unit/functions/range/test_date_range.py index 0c15adae778a..a881d30c1e41 100644 --- a/py-polars/tests/unit/functions/range/test_date_range.py +++ b/py-polars/tests/unit/functions/range/test_date_range.py @@ -310,3 +310,17 @@ def test_date_ranges_datetime_input() -> None: "literal", [[date(2022, 1, 1), date(2022, 1, 2), date(2022, 1, 3)]] ) assert_series_equal(result, expected) + + +def test_date_range_with_subclass_18470_18447() -> None: + class MyAmazingDate(date): + pass + + class MyAmazingDatetime(datetime): + pass + + result = pl.datetime_range( + MyAmazingDate(2020, 1, 1), MyAmazingDatetime(2020, 1, 2), eager=True + ) + expected = pl.Series("literal", [datetime(2020, 1, 1), datetime(2020, 1, 2)]) + assert_series_equal(result, expected) diff --git a/py-polars/tests/unit/functions/test_lit.py b/py-polars/tests/unit/functions/test_lit.py index 430a626d9b62..1f13ba122825 100644 --- a/py-polars/tests/unit/functions/test_lit.py +++ b/py-polars/tests/unit/functions/test_lit.py @@ -1,7 +1,7 @@ from __future__ import annotations import enum -from datetime import datetime, timedelta +from datetime import date, datetime, timedelta from decimal import Decimal from typing import TYPE_CHECKING, Any @@ -195,3 +195,27 @@ def test_lit_decimal_parametric(s: pl.Series) -> None: assert df.dtypes[0] == pl.Decimal(None, scale) assert result == value + + +def test_lit_datetime_subclass_w_allow_object() -> None: + class 
MyAmazingDate(date): + pass + + class MyAmazingDatetime(datetime): + pass + + result = pl.select( + a=pl.lit(MyAmazingDatetime(2020, 1, 1)), + b=pl.lit(MyAmazingDate(2020, 1, 1)), + c=pl.lit(MyAmazingDatetime(2020, 1, 1), allow_object=True), + d=pl.lit(MyAmazingDate(2020, 1, 1), allow_object=True), + ) + expected = pl.DataFrame( + { + "a": [datetime(2020, 1, 1)], + "b": [date(2020, 1, 1)], + "c": [datetime(2020, 1, 1)], + "d": [date(2020, 1, 1)], + } + ) + assert_frame_equal(result, expected) diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index eab89d3b7855..fcacedead1d4 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -953,6 +953,7 @@ def test_write_csv_separator() -> None: df.write_csv(f, separator="\t") f.seek(0) assert f.read() == b"a\tb\n1\t1\n2\t2\n3\t3\n" + f.seek(0) assert_frame_equal(df, pl.read_csv(f, separator="\t")) @@ -962,6 +963,7 @@ def test_write_csv_line_terminator() -> None: df.write_csv(f, line_terminator="\r\n") f.seek(0) assert f.read() == b"a,b\r\n1,1\r\n2,2\r\n3,3\r\n" + f.seek(0) assert_frame_equal(df, pl.read_csv(f, eol_char="\n")) @@ -996,6 +998,7 @@ def test_quoting_round_trip() -> None: } ) df.write_csv(f) + f.seek(0) read_df = pl.read_csv(f) assert_frame_equal(read_df, df) @@ -1183,6 +1186,7 @@ def test_csv_write_escape_headers() -> None: out = io.BytesIO() df1.write_csv(out) + out.seek(0) df2 = pl.read_csv(out) assert_frame_equal(df1, df2) assert df2.schema == {"c,o,l,u,m,n": pl.Int64} @@ -2279,4 +2283,5 @@ def test_read_csv_cast_unparsable_later( ) -> None: f = io.BytesIO() df.write_csv(f) + f.seek(0) assert df.equals(pl.read_csv(f, schema={"x": dtype})) diff --git a/py-polars/tests/unit/io/test_delta.py b/py-polars/tests/unit/io/test_delta.py index 6cb487f4abb0..e7874dbbba6f 100644 --- a/py-polars/tests/unit/io/test_delta.py +++ b/py-polars/tests/unit/io/test_delta.py @@ -460,21 +460,25 @@ def test_write_delta_with_merge(tmp_path: Path) -> None: df = 
pl.DataFrame({"a": [1, 2, 3]}) df.write_delta(tmp_path) - - merger = df.write_delta( - tmp_path, - mode="merge", - delta_merge_options={ - "predicate": "s.a = t.a", - "source_alias": "s", - "target_alias": "t", - }, - ) - - assert isinstance(merger, TableMerger) - assert merger.predicate == "s.a = t.a" - assert merger.source_alias == "s" - assert merger.target_alias == "t" + try: + merger = df.write_delta( + tmp_path, + mode="merge", + delta_merge_options={ + "predicate": "s.a = t.a", + "source_alias": "s", + "target_alias": "t", + }, + ) + assert isinstance(merger, TableMerger) + assert merger.predicate == "s.a = t.a" + assert merger.source_alias == "s" + assert merger.target_alias == "t" + except AttributeError as err: + import deltalake + + msg = f"dl ver {deltalake.__version__}, {dir(merger)}" + raise ValueError(msg) from err merger.when_matched_delete(predicate="t.a > 2").execute() diff --git a/py-polars/tests/unit/io/test_hive.py b/py-polars/tests/unit/io/test_hive.py index ad285b82f3b3..a01a2ef6e59d 100644 --- a/py-polars/tests/unit/io/test_hive.py +++ b/py-polars/tests/unit/io/test_hive.py @@ -554,6 +554,42 @@ def assert_with_projections(lf: pl.LazyFrame, df: pl.DataFrame) -> None: ) assert_with_projections(lf, rhs) + # partial cols in file + partial_path = tmp_path / "a=1/b=2/partial_data.bin" + df = pl.DataFrame( + {"x": 1, "b": 2, "y": 1}, + schema={"x": pl.Int32, "b": pl.Int16, "y": pl.Int32}, + ) + write_func(df, partial_path) + + rhs = rhs.select( + pl.col("x").cast(pl.Int32), + pl.col("b").cast(pl.Int16), + pl.col("y").cast(pl.Int32), + pl.col("a").cast(pl.Int64), + ) + + lf = scan_func(partial_path, hive_partitioning=True) # type: ignore[call-arg] + assert_frame_equal(lf.collect(projection_pushdown=projection_pushdown), rhs) + assert_with_projections(lf, rhs) + + lf = scan_func( # type: ignore[call-arg] + partial_path, + hive_schema={"a": pl.String, "b": pl.String}, + hive_partitioning=True, + ) + rhs = rhs.select( + pl.col("x").cast(pl.Int32), + 
pl.col("b").cast(pl.String), + pl.col("y").cast(pl.Int32), + pl.col("a").cast(pl.String), + ) + assert_frame_equal( + lf.collect(projection_pushdown=projection_pushdown), + rhs, + ) + assert_with_projections(lf, rhs) + @pytest.mark.write_disk def test_hive_partition_dates(tmp_path: Path) -> None: diff --git a/py-polars/tests/unit/io/test_ipc.py b/py-polars/tests/unit/io/test_ipc.py index 18e19f4ec885..dd60d0ae209c 100644 --- a/py-polars/tests/unit/io/test_ipc.py +++ b/py-polars/tests/unit/io/test_ipc.py @@ -1,8 +1,6 @@ from __future__ import annotations import io -import os -import re from decimal import Decimal from typing import TYPE_CHECKING, Any @@ -10,7 +8,6 @@ import pytest import polars as pl -from polars.exceptions import ComputeError from polars.interchange.protocol import CompatLevel from polars.testing import assert_frame_equal @@ -44,11 +41,13 @@ def test_from_to_buffer( ) -> None: # use an ad-hoc buffer (file=None) buf1 = write_ipc(df, stream, None, compression=compression) + buf1.seek(0) read_df = read_ipc(stream, buf1, use_pyarrow=False) assert_frame_equal(df, read_df, categorical_as_str=True) # explicitly supply an existing buffer buf2 = io.BytesIO() + buf2.seek(0) write_ipc(df, stream, buf2, compression=compression) buf2.seek(0) read_df = read_ipc(stream, buf2, use_pyarrow=False) @@ -245,6 +244,7 @@ def test_list_nested_enum() -> None: df = pl.DataFrame(pl.Series("list_cat", [["a", "b", "c", None]], dtype=dtype)) buffer = io.BytesIO() df.write_ipc(buffer, compat_level=CompatLevel.newest()) + buffer.seek(0) df = pl.read_ipc(buffer) assert df.get_column("list_cat").dtype == dtype @@ -258,6 +258,7 @@ def test_struct_nested_enum() -> None: ) buffer = io.BytesIO() df.write_ipc(buffer, compat_level=CompatLevel.newest()) + buffer.seek(0) df = pl.read_ipc(buffer) assert df.get_column("struct_cat").dtype == dtype @@ -339,29 +340,3 @@ def test_ipc_decimal_15920( path = f"{tmp_path}/data" df.write_ipc(path) assert_frame_equal(pl.read_ipc(path), df) - - 
-@pytest.mark.write_disk -def test_ipc_raise_on_writing_mmap(tmp_path: Path) -> None: - p = tmp_path / "foo.ipc" - df = pl.DataFrame({"foo": [1, 2, 3]}) - # first write is allowed - df.write_ipc(p) - - # now open as memory mapped - df = pl.read_ipc(p, memory_map=True) - - if os.name == "nt": - # In Windows, it's the duty of the system to ensure exclusive access - with pytest.raises( - OSError, - match=re.escape( - "The requested operation cannot be performed on a file with a user-mapped section open. (os error 1224)" - ), - ): - df.write_ipc(p) - else: - with pytest.raises( - ComputeError, match="cannot write to file: already memory mapped" - ): - df.write_ipc(p) diff --git a/py-polars/tests/unit/io/test_json.py b/py-polars/tests/unit/io/test_json.py index 4bce4ee4e0ce..ff56c390c207 100644 --- a/py-polars/tests/unit/io/test_json.py +++ b/py-polars/tests/unit/io/test_json.py @@ -385,3 +385,13 @@ def test_empty_json() -> None: df = pl.read_json(b'{"j":{}}') assert df.dtypes == [pl.Struct([])] assert df.shape == (0, 1) + + +def test_empty_list_json() -> None: + df = pl.read_json(io.StringIO("[]")) + assert df.shape == (0, 0) + assert isinstance(df, pl.DataFrame) + + df = pl.read_json(b"[]") + assert df.shape == (0, 0) + assert isinstance(df, pl.DataFrame) diff --git a/py-polars/tests/unit/io/test_lazy_count_star.py b/py-polars/tests/unit/io/test_lazy_count_star.py index 7b988bed75c7..a2c03596dd15 100644 --- a/py-polars/tests/unit/io/test_lazy_count_star.py +++ b/py-polars/tests/unit/io/test_lazy_count_star.py @@ -23,7 +23,7 @@ def test_count_csv(io_files_path: Path, path: str, n_rows: int) -> None: expected = pl.DataFrame(pl.Series("len", [n_rows], dtype=pl.UInt32)) # Check if we are using our fast count star - assert "FAST_COUNT" in lf.explain() + assert "FAST COUNT" in lf.explain() assert_frame_equal(lf.collect(), expected) @@ -42,7 +42,7 @@ def test_commented_csv() -> None: expected = pl.DataFrame(pl.Series("len", [2], dtype=pl.UInt32)) lf = pl.scan_csv(csv_a.name, 
comment_prefix="#").select(pl.len()) - assert "FAST_COUNT" in lf.explain() + assert "FAST COUNT" in lf.explain() assert_frame_equal(lf.collect(), expected) @@ -55,7 +55,7 @@ def test_count_parquet(io_files_path: Path, pattern: str, n_rows: int) -> None: expected = pl.DataFrame(pl.Series("len", [n_rows], dtype=pl.UInt32)) # Check if we are using our fast count star - assert "FAST_COUNT" in lf.explain() + assert "FAST COUNT" in lf.explain() assert_frame_equal(lf.collect(), expected) @@ -68,7 +68,7 @@ def test_count_ipc(io_files_path: Path, path: str, n_rows: int) -> None: expected = pl.DataFrame(pl.Series("len", [n_rows], dtype=pl.UInt32)) # Check if we are using our fast count star - assert "FAST_COUNT" in lf.explain() + assert "FAST COUNT" in lf.explain() assert_frame_equal(lf.collect(), expected) @@ -81,7 +81,7 @@ def test_count_ndjson(io_files_path: Path, path: str, n_rows: int) -> None: expected = pl.DataFrame(pl.Series("len", [n_rows], dtype=pl.UInt32)) # Check if we are using our fast count star - assert "FAST_COUNT" in lf.explain() + assert "FAST COUNT" in lf.explain() assert_frame_equal(lf.collect(), expected) diff --git a/py-polars/tests/unit/io/test_lazy_parquet.py b/py-polars/tests/unit/io/test_lazy_parquet.py index 68f7431354b5..e21844003375 100644 --- a/py-polars/tests/unit/io/test_lazy_parquet.py +++ b/py-polars/tests/unit/io/test_lazy_parquet.py @@ -504,11 +504,15 @@ def trim_to_metadata(path: str | Path) -> None: assert pl.read_parquet_schema(paths[0]) == dfs[0].schema # * Attempting to read any data will error with pytest.raises(ComputeError): - pl.scan_parquet(paths[0]).collect() + pl.scan_parquet(paths[0]).collect(streaming=streaming) df = dfs[1] - assert_frame_equal(pl.scan_parquet(paths).slice(1, 1).collect(), df) - assert_frame_equal(pl.scan_parquet(paths[1:]).head(1).collect(), df) + assert_frame_equal( + pl.scan_parquet(paths).slice(1, 1).collect(streaming=streaming), df + ) + assert_frame_equal( + 
pl.scan_parquet(paths[1:]).head(1).collect(streaming=streaming), df + ) # Negative slice unsupported in streaming if not streaming: diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index 9ec82b991f39..3da465561bd1 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -3,7 +3,7 @@ import io from datetime import datetime, time, timezone from decimal import Decimal -from typing import TYPE_CHECKING, Any, Literal, cast +from typing import IO, TYPE_CHECKING, Any, Literal, cast import fsspec import numpy as np @@ -12,7 +12,7 @@ import pyarrow.dataset as ds import pyarrow.parquet as pq import pytest -from hypothesis import HealthCheck, given, settings +from hypothesis import given from hypothesis import strategies as st import polars as pl @@ -34,12 +34,12 @@ def test_round_trip(df: pl.DataFrame) -> None: assert_frame_equal(pl.read_parquet(f), df) -def test_scan_round_trip(tmp_path: Path, df: pl.DataFrame) -> None: - tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - +def test_scan_round_trip(df: pl.DataFrame) -> None: + f = io.BytesIO() df.write_parquet(f) + f.seek(0) assert_frame_equal(pl.scan_parquet(f).collect(), df) + f.seek(0) assert_frame_equal(pl.scan_parquet(f).head().collect(), df.head()) @@ -685,6 +685,19 @@ def test_write_parquet_with_null_col(tmp_path: Path) -> None: assert_frame_equal(out, df) +@pytest.mark.write_disk +def test_scan_parquet_binary_buffered_reader(tmp_path: Path) -> None: + tmp_path.mkdir(exist_ok=True) + + df = pl.DataFrame({"a": [1, 2, 3]}) + file_path = tmp_path / "test.parquet" + df.write_parquet(file_path) + + with file_path.open("rb") as f: + out = pl.scan_parquet(f).collect() + assert_frame_equal(out, df) + + @pytest.mark.write_disk def test_read_parquet_binary_buffered_reader(tmp_path: Path) -> None: tmp_path.mkdir(exist_ok=True) @@ -919,8 +932,7 @@ def test_parquet_array_dtype_nulls() -> None: ), ], ) -@pytest.mark.write_disk 
-def test_complex_types(tmp_path: Path, series: list[Any], dtype: pl.DataType) -> None: +def test_complex_types(series: list[Any], dtype: pl.DataType) -> None: xs = pl.Series(series, dtype=dtype) df = pl.DataFrame({"x": xs}) @@ -981,20 +993,18 @@ def test_read_parquet_only_loads_selected_columns_15098( @pytest.mark.release -@pytest.mark.write_disk -def test_max_statistic_parquet_writer(tmp_path: Path) -> None: +def test_max_statistic_parquet_writer() -> None: # this hits the maximal page size # so the row group will be split into multiple pages # the page statistics need to be correctly reduced # for this query to make sense n = 150_000 - tmp_path.mkdir(exist_ok=True) - # int64 is important to hit the page size df = pl.int_range(0, n, eager=True, dtype=pl.Int64).alias("int").to_frame() - f = tmp_path / "tmp.parquet" + f = io.BytesIO() df.write_parquet(f, statistics=True, use_pyarrow=False, row_group_size=n) + f.seek(0) result = pl.scan_parquet(f).filter(pl.col("int") > n - 3).collect() expected = pl.DataFrame({"int": [149998, 149999]}) assert_frame_equal(result, expected) @@ -1088,14 +1098,11 @@ def test_hybrid_rle() -> None: ) ) @pytest.mark.slow -@pytest.mark.write_disk -@settings(suppress_health_check=[HealthCheck.function_scoped_fixture]) -def test_roundtrip_parametric(df: pl.DataFrame, tmp_path: Path) -> None: - # delete if exists - path = tmp_path / "data.parquet" - - df.write_parquet(path) - result = pl.read_parquet(path) +def test_roundtrip_parametric(df: pl.DataFrame) -> None: + f = io.BytesIO() + df.write_parquet(f) + f.seek(0) + result = pl.read_parquet(f) assert_frame_equal(df, result) @@ -1207,18 +1214,14 @@ def test_read_byte_stream_split_arrays( assert_frame_equal(read, df) -@pytest.mark.write_disk -def test_parquet_nested_null_array_17795(tmp_path: Path) -> None: - filename = tmp_path / "nested_null.parquet" - - pl.DataFrame([{"struct": {"field": None}}]).write_parquet(filename) - pq.read_table(filename) - +def test_parquet_nested_null_array_17795() 
-> None: + f = io.BytesIO() + pl.DataFrame([{"struct": {"field": None}}]).write_parquet(f) + f.seek(0) + pq.read_table(f) -@pytest.mark.write_disk -def test_parquet_record_batches_pyarrow_fixed_size_list_16614(tmp_path: Path) -> None: - filename = tmp_path / "a.parquet" +def test_parquet_record_batches_pyarrow_fixed_size_list_16614() -> None: # @NOTE: # The minimum that I could get it to crash which was ~132000, but let's # just do 150000 to be sure. @@ -1228,27 +1231,28 @@ def test_parquet_record_batches_pyarrow_fixed_size_list_16614(tmp_path: Path) -> schema={"x": pl.Array(pl.Float32, 2)}, ) - x.write_parquet(filename) - b = pl.read_parquet(filename, use_pyarrow=True) + f = io.BytesIO() + x.write_parquet(f) + f.seek(0) + b = pl.read_parquet(f, use_pyarrow=True) assert b["x"].shape[0] == n assert_frame_equal(b, x) -@pytest.mark.write_disk -def test_parquet_list_element_field_name(tmp_path: Path) -> None: - filename = tmp_path / "list.parquet" - +def test_parquet_list_element_field_name() -> None: + f = io.BytesIO() ( pl.DataFrame( { "a": [[1, 2], [1, 1, 1]], }, schema={"a": pl.List(pl.Int64)}, - ).write_parquet(filename, use_pyarrow=False) + ).write_parquet(f, use_pyarrow=False) ) - schema_str = str(pq.read_schema(filename)) + f.seek(0) + schema_str = str(pq.read_schema(f)) assert "" in schema_str assert "child 0, element: int64" in schema_str @@ -1368,8 +1372,7 @@ def test_parquet_high_nested_null_17805( ) -@pytest.mark.write_disk -def test_struct_plain_encoded_statistics(tmp_path: Path) -> None: +def test_struct_plain_encoded_statistics() -> None: df = pl.DataFrame( { "a": [None, None, None, None, {"x": None, "y": 0}], @@ -1377,17 +1380,12 @@ def test_struct_plain_encoded_statistics(tmp_path: Path) -> None: schema={"a": pl.Struct({"x": pl.Int8, "y": pl.Int8})}, ) - test_scan_round_trip(tmp_path, df) + test_scan_round_trip(df) @given(df=dataframes(min_size=5, excluded_dtypes=[pl.Decimal, pl.Categorical])) -@settings( - max_examples=100, - deadline=None, - 
suppress_health_check=[HealthCheck.function_scoped_fixture], -) -def test_scan_round_trip_parametric(tmp_path: Path, df: pl.DataFrame) -> None: - test_scan_round_trip(tmp_path, df) +def test_scan_round_trip_parametric(df: pl.DataFrame) -> None: + test_scan_round_trip(df) def test_empty_rg_no_dict_page_18146() -> None: @@ -1532,13 +1530,7 @@ def test_delta_strings_encoding_roundtrip( r2=st.integers(min_value=0, max_value=1000), ) @pytest.mark.parametrize("parallel_st", ["auto", "prefiltered"]) -@settings( - deadline=None, - suppress_health_check=[HealthCheck.function_scoped_fixture], -) -@pytest.mark.write_disk def test_predicate_filtering( - tmp_path: Path, df: pl.DataFrame, first_op: str, second_op: None | tuple[str, str], @@ -1548,9 +1540,7 @@ def test_predicate_filtering( r2: int, parallel_st: Literal["auto", "prefiltered"], ) -> None: - tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - + f = io.BytesIO() df.write_parquet(f, row_group_size=5) cols = df.columns @@ -1566,6 +1556,7 @@ def test_predicate_filtering( (getattr(pl.col(r1s), second_op[1]))(pl.col(r2s)) ) + f.seek(0) result = pl.scan_parquet(f, parallel=parallel_st).filter(expr).collect() assert_frame_equal(result, df.filter(expr)) @@ -1581,33 +1572,24 @@ def test_predicate_filtering( offset=st.integers(0, 100), length=st.integers(0, 100), ) -@settings( - suppress_health_check=[HealthCheck.function_scoped_fixture], -) -@pytest.mark.write_disk -def test_slice_roundtrip( - df: pl.DataFrame, offset: int, length: int, tmp_path: Path -) -> None: - tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - +def test_slice_roundtrip(df: pl.DataFrame, offset: int, length: int) -> None: offset %= df.height + 1 length %= df.height - offset + 1 + f = io.BytesIO() df.write_parquet(f) + f.seek(0) scanned = pl.scan_parquet(f).slice(offset, length).collect() assert_frame_equal(scanned, df.slice(offset, length)) -@pytest.mark.write_disk -def test_struct_prefiltered(tmp_path: Path) -> None: - 
tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - +def test_struct_prefiltered() -> None: df = pl.DataFrame({"a": {"x": 1, "y": 2}}) + f = io.BytesIO() df.write_parquet(f) + f.seek(0) ( pl.scan_parquet(f, parallel="prefiltered") .filter(pl.col("a").struct.field("x") == 1) @@ -1641,19 +1623,17 @@ def test_struct_prefiltered(tmp_path: Path) -> None: ], ) @pytest.mark.parametrize("nullable", [False, True]) -@pytest.mark.write_disk def test_nested_skip_18303( data: tuple[list[dict[str, str] | list[str]], pa.DataType], nullable: bool, - tmp_path: Path, ) -> None: - tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - schema = pa.schema([pa.field("a", data[1], nullable=nullable)]) tb = pa.table({"a": data[0]}, schema=schema) + + f = io.BytesIO() pq.write_table(tb, f) + f.seek(0) scanned = pl.scan_parquet(f).slice(1, 1).collect() assert_frame_equal(scanned, pl.DataFrame(tb).slice(1, 1)) @@ -1697,20 +1677,12 @@ def test_nested_span_multiple_pages_18400() -> None: include_cols=[column("filter_col", pl.Boolean, allow_null=False)], ), ) -@pytest.mark.write_disk -@settings( - suppress_health_check=[HealthCheck.function_scoped_fixture], -) -def test_parametric_small_page_mask_filtering( - tmp_path: Path, - df: pl.DataFrame, -) -> None: - tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - +def test_parametric_small_page_mask_filtering(df: pl.DataFrame) -> None: + f = io.BytesIO() df.write_parquet(f, data_page_size=1024) expr = pl.col("filter_col") + f.seek(0) result = pl.scan_parquet(f, parallel="prefiltered").filter(expr).collect() assert_frame_equal(result, df.filter(expr)) @@ -1756,23 +1728,13 @@ def test_different_page_validity_across_pages(value: str | int | float | bool) - ], ), ) -@settings( - deadline=None, - suppress_health_check=[HealthCheck.function_scoped_fixture], -) -@pytest.mark.write_disk -def test_delta_length_byte_array_prefiltering( - tmp_path: Path, - df: pl.DataFrame, -) -> None: - tmp_path.mkdir(exist_ok=True) - f = 
tmp_path / "test.parquet" - +def test_delta_length_byte_array_prefiltering(df: pl.DataFrame) -> None: cols = df.columns encodings = {col: "DELTA_LENGTH_BYTE_ARRAY" for col in cols} encodings["filter_col"] = "PLAIN" + f = io.BytesIO() pq.write_table( df.to_arrow(), f, @@ -1780,6 +1742,7 @@ def test_delta_length_byte_array_prefiltering( column_encoding=encodings, ) + f.seek(0) expr = pl.col("filter_col") == 0 result = pl.scan_parquet(f, parallel="prefiltered").filter(expr).collect() assert_frame_equal(result, df.filter(expr)) @@ -1797,22 +1760,13 @@ def test_delta_length_byte_array_prefiltering( ], ), ) -@settings( - deadline=None, - suppress_health_check=[HealthCheck.function_scoped_fixture], -) -@pytest.mark.write_disk -def test_general_prefiltering( - tmp_path: Path, - df: pl.DataFrame, -) -> None: - tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - +def test_general_prefiltering(df: pl.DataFrame) -> None: + f = io.BytesIO() df.write_parquet(f) expr = pl.col("filter_col") == 0 + f.seek(0) result = pl.scan_parquet(f, parallel="prefiltered").filter(expr).collect() assert_frame_equal(result, df.filter(expr)) @@ -1827,22 +1781,13 @@ def test_general_prefiltering( include_cols=[column("filter_col", pl.Boolean, allow_null=False)], ), ) -@settings( - deadline=None, - suppress_health_check=[HealthCheck.function_scoped_fixture], -) -@pytest.mark.write_disk -def test_row_index_prefiltering( - tmp_path: Path, - df: pl.DataFrame, -) -> None: - tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - +def test_row_index_prefiltering(df: pl.DataFrame) -> None: + f = io.BytesIO() df.write_parquet(f) expr = pl.col("filter_col") + f.seek(0) result = ( pl.scan_parquet( f, row_index_name="ri", row_index_offset=42, parallel="prefiltered" @@ -1903,3 +1848,46 @@ def test_row_index_projection_pushdown_18463( df.select("index").slice(1, 1).collect(), df.collect().select("index").slice(1, 1), ) + + +def test_concat_multiple_inmem() -> None: + f = io.BytesIO() + g = 
io.BytesIO() + + df1 = pl.DataFrame( + { + "a": [1, 2, 3], + "b": ["xyz", "abc", "wow"], + } + ) + df2 = pl.DataFrame( + { + "a": [5, 6, 7], + "b": ["a", "few", "entries"], + } + ) + + dfs = pl.concat([df1, df2]) + + df1.write_parquet(f) + df2.write_parquet(g) + + f.seek(0) + g.seek(0) + + items: list[IO[bytes]] = [f, g] + assert_frame_equal(pl.read_parquet(items), dfs) + + f.seek(0) + g.seek(0) + + assert_frame_equal(pl.read_parquet(items, use_pyarrow=True), dfs) + + f.seek(0) + g.seek(0) + + fb = f.read() + gb = g.read() + + assert_frame_equal(pl.read_parquet([fb, gb]), dfs) + assert_frame_equal(pl.read_parquet([fb, gb], use_pyarrow=True), dfs) diff --git a/py-polars/tests/unit/io/test_scan.py b/py-polars/tests/unit/io/test_scan.py index 1bcc463bd2e7..d710954fcbfe 100644 --- a/py-polars/tests/unit/io/test_scan.py +++ b/py-polars/tests/unit/io/test_scan.py @@ -1,5 +1,6 @@ from __future__ import annotations +import io from dataclasses import dataclass from functools import partial from math import ceil @@ -575,6 +576,17 @@ def test_path_expansion_excludes_empty_files_17362(tmp_path: Path) -> None: assert_frame_equal(pl.scan_parquet(tmp_path / "*").collect(), df) +@pytest.mark.write_disk +def test_path_expansion_empty_directory_does_not_panic(tmp_path: Path) -> None: + tmp_path.mkdir(exist_ok=True) + + with pytest.raises(pl.exceptions.ComputeError): + pl.scan_parquet(tmp_path).collect() + + with pytest.raises(pl.exceptions.ComputeError): + pl.scan_parquet(tmp_path / "**/*").collect() + + @pytest.mark.write_disk def test_scan_single_dir_differing_file_extensions_raises_17436(tmp_path: Path) -> None: tmp_path.mkdir(exist_ok=True) @@ -690,3 +702,86 @@ def test_async_path_expansion_bracket_17629(tmp_path: Path) -> None: df.write_parquet(path) assert_frame_equal(pl.scan_parquet(tmp_path / "[d]ata.parquet").collect(), df) + + +@pytest.mark.parametrize( + "method", + ["parquet", "csv", "ipc", "ndjson"], +) +def test_scan_in_memory(method: str) -> None: + f = io.BytesIO() + 
df = pl.DataFrame( + { + "a": [1, 2, 3], + "b": ["x", "y", "z"], + } + ) + + (getattr(df, f"write_{method}"))(f) + + f.seek(0) + result = (getattr(pl, f"scan_{method}"))(f).collect() + assert_frame_equal(df, result) + + f.seek(0) + result = (getattr(pl, f"scan_{method}"))(f).slice(1, 2).collect() + assert_frame_equal(df.slice(1, 2), result) + + f.seek(0) + result = (getattr(pl, f"scan_{method}"))(f).slice(-1, 1).collect() + assert_frame_equal(df.slice(-1, 1), result) + + g = io.BytesIO() + (getattr(df, f"write_{method}"))(g) + + f.seek(0) + g.seek(0) + result = (getattr(pl, f"scan_{method}"))([f, g]).collect() + assert_frame_equal(df.vstack(df), result) + + f.seek(0) + g.seek(0) + result = (getattr(pl, f"scan_{method}"))([f, g]).slice(1, 2).collect() + assert_frame_equal(df.vstack(df).slice(1, 2), result) + + f.seek(0) + g.seek(0) + result = (getattr(pl, f"scan_{method}"))([f, g]).slice(-1, 1).collect() + assert_frame_equal(df.vstack(df).slice(-1, 1), result) + + +@pytest.mark.parametrize( + "method", + ["csv", "ndjson"], +) +def test_scan_stringio(method: str) -> None: + f = io.StringIO() + df = pl.DataFrame( + { + "a": [1, 2, 3], + "b": ["x", "y", "z"], + } + ) + + (getattr(df, f"write_{method}"))(f) + + f.seek(0) + result = (getattr(pl, f"scan_{method}"))(f).collect() + assert_frame_equal(df, result) + + g = io.StringIO() + (getattr(df, f"write_{method}"))(g) + + f.seek(0) + g.seek(0) + result = (getattr(pl, f"scan_{method}"))([f, g]).collect() + assert_frame_equal(df.vstack(df), result) + + +@pytest.mark.parametrize( + "method", + [pl.scan_parquet, pl.scan_csv, pl.scan_ipc, pl.scan_ndjson], +) +def test_empty_list(method: Callable[[list[str]], pl.LazyFrame]) -> None: + with pytest.raises(pl.exceptions.ComputeError, match="expected at least 1 source"): + _ = (method)([]).collect() diff --git a/py-polars/tests/unit/lazyframe/optimizations.py b/py-polars/tests/unit/lazyframe/optimizations.py index a44816fad0e6..2417edecdeb8 100644 --- 
a/py-polars/tests/unit/lazyframe/optimizations.py +++ b/py-polars/tests/unit/lazyframe/optimizations.py @@ -1,3 +1,5 @@ +import io + import polars as pl from polars.testing import assert_frame_equal @@ -27,3 +29,14 @@ def test_double_sort_maintain_order_18558() -> None: ) assert_frame_equal(lf.collect(), expect) + + +def test_fast_count_alias_18581() -> None: + f = io.BytesIO() + f.write(b"a,b,c\n1,2,3\n4,5,6") + f.flush() + f.seek(0) + + df = pl.scan_csv(f).select(pl.len().alias("weird_name")).collect() + + assert_frame_equal(pl.DataFrame({"weird_name": 2}), df) diff --git a/py-polars/tests/unit/operations/namespaces/list/test_list.py b/py-polars/tests/unit/operations/namespaces/list/test_list.py index 77ed41f5bba3..f306bbff5d7b 100644 --- a/py-polars/tests/unit/operations/namespaces/list/test_list.py +++ b/py-polars/tests/unit/operations/namespaces/list/test_list.py @@ -1,5 +1,6 @@ from __future__ import annotations +import re from datetime import date, datetime import numpy as np @@ -159,6 +160,76 @@ def test_list_categorical_get() -> None: ) +def test_list_gather_wrong_indices_list_type() -> None: + a = pl.Series("a", [[1, 2, 3], [4, 5], [6, 7, 8, 9]]) + expected = pl.Series("a", [[1, 2], [4], [6, 9]]) + + # int8 + indices_series = pl.Series("indices", [[0, 1], [0], [0, 3]], dtype=pl.List(pl.Int8)) + result = a.list.gather(indices=indices_series) + assert_series_equal(result, expected) + + # int16 + indices_series = pl.Series( + "indices", [[0, 1], [0], [0, 3]], dtype=pl.List(pl.Int16) + ) + result = a.list.gather(indices=indices_series) + assert_series_equal(result, expected) + + # int32 + indices_series = pl.Series( + "indices", [[0, 1], [0], [0, 3]], dtype=pl.List(pl.Int32) + ) + result = a.list.gather(indices=indices_series) + assert_series_equal(result, expected) + + # int64 + indices_series = pl.Series( + "indices", [[0, 1], [0], [0, 3]], dtype=pl.List(pl.Int64) + ) + result = a.list.gather(indices=indices_series) + assert_series_equal(result, expected) + 
+ # uint8 + indices_series = pl.Series( + "indices", [[0, 1], [0], [0, 3]], dtype=pl.List(pl.UInt8) + ) + result = a.list.gather(indices=indices_series) + assert_series_equal(result, expected) + + # uint16 + indices_series = pl.Series( + "indices", [[0, 1], [0], [0, 3]], dtype=pl.List(pl.UInt16) + ) + result = a.list.gather(indices=indices_series) + assert_series_equal(result, expected) + + # uint32 + indices_series = pl.Series( + "indices", [[0, 1], [0], [0, 3]], dtype=pl.List(pl.UInt32) + ) + result = a.list.gather(indices=indices_series) + assert_series_equal(result, expected) + + # uint64 + indices_series = pl.Series( + "indices", [[0, 1], [0], [0, 3]], dtype=pl.List(pl.UInt64) + ) + result = a.list.gather(indices=indices_series) + assert_series_equal(result, expected) + + df = pl.DataFrame( + { + "index": [["2"], ["2"], ["2"]], + "lists": [[3, 4, 5], [4, 5, 6], [7, 8, 9, 4]], + } + ) + with pytest.raises( + ComputeError, match=re.escape("cannot use dtype `list[str]` as an index") + ): + df.select(pl.col("lists").list.gather(pl.col("index"))) + + def test_contains() -> None: a = pl.Series("a", [[1, 2, 3], [2, 5], [6, 7, 8, 9]]) out = a.list.contains(2) diff --git a/py-polars/tests/unit/operations/namespaces/temporal/test_datetime.py b/py-polars/tests/unit/operations/namespaces/temporal/test_datetime.py index fb4ddee68146..a4fcfde344cc 100644 --- a/py-polars/tests/unit/operations/namespaces/temporal/test_datetime.py +++ b/py-polars/tests/unit/operations/namespaces/temporal/test_datetime.py @@ -138,15 +138,13 @@ def test_local_date_sortedness(time_zone: str | None, expected: bool) -> None: ser = (pl.Series([datetime(2022, 1, 1, 23)]).dt.replace_time_zone(time_zone)).sort() result = ser.dt.date() assert result.flags["SORTED_ASC"] - assert result.flags["SORTED_DESC"] is False # 2 elements - depends on time zone ser = ( pl.Series([datetime(2022, 1, 1, 23)] * 2).dt.replace_time_zone(time_zone) ).sort() result = ser.dt.date() - assert result.flags["SORTED_ASC"] == 
expected - assert result.flags["SORTED_DESC"] is False + assert result.flags["SORTED_ASC"] >= expected @pytest.mark.parametrize("time_zone", [None, "Asia/Kathmandu", "UTC"]) @@ -155,11 +153,16 @@ def test_local_time_sortedness(time_zone: str | None) -> None: ser = (pl.Series([datetime(2022, 1, 1, 23)]).dt.replace_time_zone(time_zone)).sort() result = ser.dt.time() assert result.flags["SORTED_ASC"] - assert not result.flags["SORTED_DESC"] - # two elements - not sorted + # three elements - not sorted ser = ( - pl.Series([datetime(2022, 1, 1, 23)] * 2).dt.replace_time_zone(time_zone) + pl.Series( + [ + datetime(2022, 1, 1, 23), + datetime(2022, 1, 2, 21), + datetime(2022, 1, 3, 22), + ] + ).dt.replace_time_zone(time_zone) ).sort() result = ser.dt.time() assert not result.flags["SORTED_ASC"] @@ -180,31 +183,34 @@ def test_local_time_before_epoch(time_unit: TimeUnit) -> None: ("time_zone", "offset", "expected"), [ (None, "1d", True), - ("Asia/Kathmandu", "1d", False), + ("Europe/London", "1d", False), ("UTC", "1d", True), (None, "1mo", True), - ("Asia/Kathmandu", "1mo", False), + ("Europe/London", "1mo", False), ("UTC", "1mo", True), (None, "1w", True), - ("Asia/Kathmandu", "1w", False), + ("Europe/London", "1w", False), ("UTC", "1w", True), (None, "1h", True), - ("Asia/Kathmandu", "1h", True), + ("Europe/London", "1h", True), ("UTC", "1h", True), ], ) def test_offset_by_sortedness( time_zone: str | None, offset: str, expected: bool ) -> None: - # create 2 values, as a single value is always sorted - ser = ( - pl.Series( - [datetime(2022, 1, 1, 22), datetime(2022, 1, 1, 22)] - ).dt.replace_time_zone(time_zone) + s = pl.datetime_range( + datetime(2020, 10, 25), + datetime(2020, 10, 25, 3), + "30m", + time_zone=time_zone, + eager=True, ).sort() - result = ser.dt.offset_by(offset) + assert s.flags["SORTED_ASC"] + assert not s.flags["SORTED_DESC"] + result = s.dt.offset_by(offset) assert result.flags["SORTED_ASC"] == expected - assert result.flags["SORTED_DESC"] is False + 
assert not result.flags["SORTED_DESC"] def test_dt_datetime_date_time_invalid() -> None: diff --git a/py-polars/tests/unit/operations/test_inequality_join.py b/py-polars/tests/unit/operations/test_inequality_join.py new file mode 100644 index 000000000000..cc4ea5c6bb02 --- /dev/null +++ b/py-polars/tests/unit/operations/test_inequality_join.py @@ -0,0 +1,484 @@ +from __future__ import annotations + +from datetime import datetime +from typing import TYPE_CHECKING, Any + +import hypothesis.strategies as st +import numpy as np +import pytest +from hypothesis import given + +import polars as pl +from polars.testing import assert_frame_equal + +if TYPE_CHECKING: + from hypothesis.strategies import DrawFn, SearchStrategy + + +@pytest.mark.parametrize( + ("pred_1", "pred_2"), + [ + (pl.col("time") > pl.col("time_right"), pl.col("cost") < pl.col("cost_right")), + (pl.col("time_right") < pl.col("time"), pl.col("cost_right") > pl.col("cost")), + ], +) +def test_self_join(pred_1: pl.Expr, pred_2: pl.Expr) -> None: + west = pl.DataFrame( + { + "t_id": [404, 498, 676, 742], + "time": [100, 140, 80, 90], + "cost": [6, 11, 10, 5], + "cores": [4, 2, 1, 4], + } + ) + + actual = west.join_where(west, pred_1, pred_2) + + expected = pl.DataFrame( + { + "t_id": [742, 404], + "time": [90, 100], + "cost": [5, 6], + "cores": [4, 4], + "t_id_right": [676, 676], + "time_right": [80, 80], + "cost_right": [10, 10], + "cores_right": [1, 1], + } + ) + assert_frame_equal(actual, expected, check_row_order=False, check_exact=True) + + +def test_basic_ie_join() -> None: + east = pl.DataFrame( + { + "id": [100, 101, 102], + "dur": [140, 100, 90], + "rev": [12, 12, 5], + "cores": [2, 8, 4], + } + ) + west = pl.DataFrame( + { + "t_id": [404, 498, 676, 742], + "time": [100, 140, 80, 90], + "cost": [6, 11, 10, 5], + "cores": [4, 2, 1, 4], + } + ) + + actual = east.join_where( + west, pl.col("dur") < pl.col("time"), pl.col("rev") > pl.col("cost") + ) + + expected = pl.DataFrame( + { + "id": [101], + 
"dur": [100], + "rev": [12], + "cores": [8], + "t_id": [498], + "time": [140], + "cost": [11], + "cores_right": [2], + } + ) + assert_frame_equal(actual, expected, check_row_order=False, check_exact=True) + + +@given( + offset=st.integers(-6, 5), + length=st.integers(0, 6), +) +def test_ie_join_with_slice(offset: int, length: int) -> None: + east = pl.DataFrame( + { + "id": [100, 101, 102], + "dur": [120, 140, 160], + "rev": [12, 14, 16], + "cores": [2, 8, 4], + } + ).lazy() + west = pl.DataFrame( + { + "t_id": [404, 498, 676, 742], + "time": [90, 130, 150, 170], + "cost": [9, 13, 15, 16], + "cores": [4, 2, 1, 4], + } + ).lazy() + + actual = ( + east.join_where( + west, pl.col("dur") < pl.col("time"), pl.col("rev") < pl.col("cost") + ) + .slice(offset, length) + .collect() + ) + + expected_full = pl.DataFrame( + { + "id": [101, 101, 100, 100, 100], + "dur": [140, 140, 120, 120, 120], + "rev": [14, 14, 12, 12, 12], + "cores": [8, 8, 2, 2, 2], + "t_id": [676, 742, 498, 676, 742], + "time": [150, 170, 130, 150, 170], + "cost": [15, 16, 13, 15, 16], + "cores_right": [1, 4, 2, 1, 4], + } + ) + # The ordering of the result is arbitrary, so we can + # only verify that each row of the slice is present in the full expected result. 
+ assert len(actual) == len(expected_full.slice(offset, length)) + + expected_rows = set(expected_full.iter_rows()) + for row in actual.iter_rows(): + assert row in expected_rows, f"{row} not in expected rows" + + +def test_ie_join_with_expressions() -> None: + east = pl.DataFrame( + { + "id": [100, 101, 102], + "dur": [70, 50, 45], + "rev": [12, 12, 5], + "cores": [2, 8, 4], + } + ) + west = pl.DataFrame( + { + "t_id": [404, 498, 676, 742], + "time": [100, 140, 80, 90], + "cost": [12, 22, 20, 10], + "cores": [4, 2, 1, 4], + } + ) + + actual = east.join_where( + west, + (pl.col("dur") * 2) < pl.col("time"), + pl.col("rev") > (pl.col("cost").cast(pl.Int32) // 2).cast(pl.Int64), + ) + + expected = pl.DataFrame( + { + "id": [101], + "dur": [50], + "rev": [12], + "cores": [8], + "t_id": [498], + "time": [140], + "cost": [22], + "cores_right": [2], + } + ) + assert_frame_equal(actual, expected, check_row_order=False, check_exact=True) + + +def test_join_where_predicates() -> None: + left = pl.DataFrame( + { + "id": [0, 1, 2, 3, 4, 5], + "group": [0, 0, 0, 1, 1, 1], + "time": [ + datetime(2024, 8, 26, 15, 34, 30), + datetime(2024, 8, 26, 15, 35, 30), + datetime(2024, 8, 26, 15, 36, 30), + datetime(2024, 8, 26, 15, 37, 30), + datetime(2024, 8, 26, 15, 38, 0), + datetime(2024, 8, 26, 15, 39, 0), + ], + } + ) + right = pl.DataFrame( + { + "id": [0, 1, 2], + "group": [0, 1, 1], + "start_time": [ + datetime(2024, 8, 26, 15, 34, 0), + datetime(2024, 8, 26, 15, 35, 0), + datetime(2024, 8, 26, 15, 38, 0), + ], + "end_time": [ + datetime(2024, 8, 26, 15, 36, 0), + datetime(2024, 8, 26, 15, 37, 0), + datetime(2024, 8, 26, 15, 39, 0), + ], + } + ) + + actual = left.join_where( + right, + pl.col("time") >= pl.col("start_time"), + pl.col("time") < pl.col("end_time"), + ).select("id", "id_right") + + expected = pl.DataFrame( + { + "id": [0, 1, 1, 2, 4], + "id_right": [0, 0, 1, 1, 2], + } + ) + assert_frame_equal(actual, expected, check_row_order=False, check_exact=True) + + q = ( + 
left.lazy() + .join_where( + right.lazy(), + pl.col("time") >= pl.col("start_time"), + pl.col("time") < pl.col("end_time"), + pl.col("group_right") == pl.col("group"), + ) + .select("id", "id_right", "group") + .sort("id") + ) + + explained = q.explain() + assert "INNER JOIN" in explained + assert "FILTER" in explained + actual = q.collect() + + expected = ( + left.join(right, how="cross") + .filter( + pl.col("time") >= pl.col("start_time"), + pl.col("time") < pl.col("end_time"), + pl.col("group") == pl.col("group_right"), + ) + .select("id", "id_right", "group") + .sort("id") + ) + assert_frame_equal(actual, expected, check_exact=True) + + q = ( + left.lazy() + .join_where( + right.lazy(), + pl.col("time") >= pl.col("start_time"), + pl.col("time") < pl.col("end_time"), + pl.col("group") != pl.col("group_right"), + ) + .select("id", "id_right", "group") + .sort("id") + ) + + explained = q.explain() + assert "IEJOIN" in explained + assert "FILTER" in explained + actual = q.collect() + + expected = ( + left.join(right, how="cross") + .filter( + pl.col("time") >= pl.col("start_time"), + pl.col("time") < pl.col("end_time"), + pl.col("group") != pl.col("group_right"), + ) + .select("id", "id_right", "group") + .sort("id") + ) + assert_frame_equal(actual, expected, check_exact=True) + + q = ( + left.lazy() + .join_where( + right.lazy(), + pl.col("group") != pl.col("group_right"), + ) + .select("id", "group", "group_right") + .sort("id") + .select("group", "group_right") + ) + + explained = q.explain() + assert "CROSS" in explained + assert "FILTER" in explained + actual = q.collect() + assert actual.to_dict(as_series=False) == { + "group": [0, 0, 0, 0, 0, 0, 1, 1, 1], + "group_right": [1, 1, 1, 1, 1, 1, 0, 0, 0], + } + + +def _inequality_expression(col1: str, op: str, col2: str) -> pl.Expr: + if op == "<": + return pl.col(col1) < pl.col(col2) + elif op == "<=": + return pl.col(col1) <= pl.col(col2) + elif op == ">": + return pl.col(col1) > pl.col(col2) + elif op == ">=": 
+ return pl.col(col1) >= pl.col(col2) + else: + message = f"Invalid operator '{op}'" + raise ValueError(message) + + +def operators() -> SearchStrategy[str]: + valid_operators = ["<", "<=", ">", ">="] + return st.sampled_from(valid_operators) + + +@st.composite +def east_df( + draw: DrawFn, with_nulls: bool = False, use_floats: bool = False +) -> pl.DataFrame: + height = draw(st.integers(min_value=0, max_value=20)) + + if use_floats: + dur_strategy: SearchStrategy[Any] = st.floats(allow_nan=True) + rev_strategy: SearchStrategy[Any] = st.floats(allow_nan=True) + dur_dtype: type[pl.DataType] = pl.Float32 + rev_dtype: type[pl.DataType] = pl.Float32 + else: + dur_strategy = st.integers(min_value=100, max_value=105) + rev_strategy = st.integers(min_value=9, max_value=13) + dur_dtype = pl.Int64 + rev_dtype = pl.Int64 + + if with_nulls: + dur_strategy = dur_strategy | st.none() + rev_strategy = rev_strategy | st.none() + + cores_strategy = st.integers(min_value=1, max_value=10) + + ids = np.arange(0, height) + dur = draw(st.lists(dur_strategy, min_size=height, max_size=height)) + rev = draw(st.lists(rev_strategy, min_size=height, max_size=height)) + cores = draw(st.lists(cores_strategy, min_size=height, max_size=height)) + + return pl.DataFrame( + [ + pl.Series("id", ids, dtype=pl.Int64), + pl.Series("dur", dur, dtype=dur_dtype), + pl.Series("rev", rev, dtype=rev_dtype), + pl.Series("cores", cores, dtype=pl.Int64), + ] + ) + + +@st.composite +def west_df( + draw: DrawFn, with_nulls: bool = False, use_floats: bool = False +) -> pl.DataFrame: + height = draw(st.integers(min_value=0, max_value=20)) + + if use_floats: + time_strategy: SearchStrategy[Any] = st.floats(allow_nan=True) + cost_strategy: SearchStrategy[Any] = st.floats(allow_nan=True) + time_dtype: type[pl.DataType] = pl.Float32 + cost_dtype: type[pl.DataType] = pl.Float32 + else: + time_strategy = st.integers(min_value=100, max_value=105) + cost_strategy = st.integers(min_value=9, max_value=13) + time_dtype = 
pl.Int64 + cost_dtype = pl.Int64 + + if with_nulls: + time_strategy = time_strategy | st.none() + cost_strategy = cost_strategy | st.none() + + cores_strategy = st.integers(min_value=1, max_value=10) + + t_id = np.arange(100, 100 + height) + time = draw(st.lists(time_strategy, min_size=height, max_size=height)) + cost = draw(st.lists(cost_strategy, min_size=height, max_size=height)) + cores = draw(st.lists(cores_strategy, min_size=height, max_size=height)) + + return pl.DataFrame( + [ + pl.Series("t_id", t_id, dtype=pl.Int64), + pl.Series("time", time, dtype=time_dtype), + pl.Series("cost", cost, dtype=cost_dtype), + pl.Series("cores", cores, dtype=pl.Int64), + ] + ) + + +@given( + east=east_df(), + west=west_df(), + op1=operators(), + op2=operators(), +) +def test_ie_join(east: pl.DataFrame, west: pl.DataFrame, op1: str, op2: str) -> None: + expr0 = _inequality_expression("dur", op1, "time") + expr1 = _inequality_expression("rev", op2, "cost") + + actual = east.join_where(west, expr0, expr1) + + expected = east.join(west, how="cross").filter(expr0 & expr1) + assert_frame_equal(actual, expected, check_row_order=False, check_exact=True) + + +@given( + east=east_df(with_nulls=True), + west=west_df(with_nulls=True), + op1=operators(), + op2=operators(), +) +def test_ie_join_with_nulls( + east: pl.DataFrame, west: pl.DataFrame, op1: str, op2: str +) -> None: + expr0 = _inequality_expression("dur", op1, "time") + expr1 = _inequality_expression("rev", op2, "cost") + + actual = east.join_where(west, expr0, expr1) + + expected = east.join(west, how="cross").filter(expr0 & expr1) + assert_frame_equal(actual, expected, check_row_order=False, check_exact=True) + + +@given( + east=east_df(use_floats=True), + west=west_df(use_floats=True), + op1=operators(), + op2=operators(), +) +def test_ie_join_with_floats( + east: pl.DataFrame, west: pl.DataFrame, op1: str, op2: str +) -> None: + expr0 = _inequality_expression("dur", op1, "time") + expr1 = _inequality_expression("rev", op2, 
"cost") + + actual = east.join_where(west, expr0, expr1) + + expected = east.join(west, how="cross").filter(expr0 & expr1) + assert_frame_equal(actual, expected, check_row_order=False, check_exact=True) + + +def test_raise_on_ambiguous_name() -> None: + df = pl.DataFrame({"id": [1, 2]}) + with pytest.raises(pl.exceptions.InvalidOperationError): + df.join_where(df, pl.col("id") >= pl.col("id")) + + +def test_raise_on_multiple_binary_comparisons() -> None: + df = pl.DataFrame({"id": [1, 2]}) + with pytest.raises(pl.exceptions.InvalidOperationError): + df.join_where( + df, (pl.col("id") < pl.col("id")) & (pl.col("id") >= pl.col("id")) + ) + + +def test_raise_invalid_input_join_where() -> None: + df = pl.DataFrame({"id": [1, 2]}) + with pytest.raises(pl.exceptions.InvalidOperationError): + df.join_where(df) + + +def test_ie_join_use_keys_multiple() -> None: + a = pl.LazyFrame({"a": [1, 2, 3], "x": [7, 2, 1]}) + b = pl.LazyFrame({"b": [2, 2, 2], "x": [7, 1, 3]}) + + assert a.join_where( + b, + pl.col.a >= pl.col.b, + pl.col.a <= pl.col.b, + ).collect().sort("x_right").to_dict(as_series=False) == { + "a": [2, 2, 2], + "x": [2, 2, 2], + "b": [2, 2, 2], + "x_right": [1, 3, 7], + } diff --git a/py-polars/tests/unit/operations/test_replace.py b/py-polars/tests/unit/operations/test_replace.py index 03d1feb2681c..81edb16a6d49 100644 --- a/py-polars/tests/unit/operations/test_replace.py +++ b/py-polars/tests/unit/operations/test_replace.py @@ -281,3 +281,12 @@ def test_replace_default_deprecated() -> None: result = s.replace(1, 10, default=None) expected = pl.Series([10, None, None], dtype=pl.Int32) assert_series_equal(result, expected) + + +def test_replace_single_argument_not_mapping() -> None: + df = pl.DataFrame({"a": ["a", "b", "c"]}) + with pytest.raises( + TypeError, + match="`new` argument is required if `old` argument is not a Mapping type", + ): + df.select(pl.col("a").replace("b")) diff --git a/py-polars/tests/unit/operations/test_replace_strict.py 
b/py-polars/tests/unit/operations/test_replace_strict.py index d72f0c7968d6..14f99585e64e 100644 --- a/py-polars/tests/unit/operations/test_replace_strict.py +++ b/py-polars/tests/unit/operations/test_replace_strict.py @@ -398,3 +398,12 @@ def test_replace_strict_cat_cat( s = pl.Series("s", ["a", "b"], dtype=dt) s_replaced = s.replace_strict(old, new, default=pl.lit("OTHER", dtype=dt)) # type: ignore[arg-type] assert_series_equal(s_replaced, expected.fill_null("OTHER")) + + +def test_replace_strict_single_argument_not_mapping() -> None: + df = pl.DataFrame({"a": ["b", "b", "b"]}) + with pytest.raises( + TypeError, + match="`new` argument is required if `old` argument is not a Mapping type", + ): + df.select(pl.col("a").replace_strict("b")) diff --git a/py-polars/tests/unit/operations/test_slice.py b/py-polars/tests/unit/operations/test_slice.py index 692fcb5634dc..94dc1e3283ff 100644 --- a/py-polars/tests/unit/operations/test_slice.py +++ b/py-polars/tests/unit/operations/test_slice.py @@ -273,3 +273,18 @@ def test_group_by_slice_all_keys() -> None: gb = df.group_by(["a", "b", "c"], maintain_order=True) assert_frame_equal(gb.tail(1), gb.head(1)) + + +def test_slice_first_in_agg_18551() -> None: + df = pl.DataFrame({"id": [1, 1, 2], "name": ["A", "B", "C"], "value": [31, 21, 32]}) + + assert df.group_by("id", maintain_order=True).agg( + sort_by=pl.col("name").sort_by("value"), + x=pl.col("name").sort_by("value").slice(0, 1).first(), + y=pl.col("name").sort_by("value").slice(1, 1).first(), + ).to_dict(as_series=False) == { + "id": [1, 2], + "sort_by": [["B", "A"], ["C"]], + "x": ["B", "C"], + "y": ["A", None], + } diff --git a/py-polars/tests/unit/sql/test_miscellaneous.py b/py-polars/tests/unit/sql/test_miscellaneous.py index 77aa60e08af8..95ba8461bebe 100644 --- a/py-polars/tests/unit/sql/test_miscellaneous.py +++ b/py-polars/tests/unit/sql/test_miscellaneous.py @@ -2,6 +2,7 @@ from datetime import date from pathlib import Path +from typing import TYPE_CHECKING, 
Any import pytest @@ -9,6 +10,9 @@ from polars.exceptions import SQLInterfaceError, SQLSyntaxError from polars.testing import assert_frame_equal +if TYPE_CHECKING: + from polars.datatypes import DataType + @pytest.fixture def foods_ipc_path() -> Path: @@ -53,6 +57,28 @@ def test_any_all() -> None: } +@pytest.mark.parametrize( + ("data", "schema"), + [ + ({"x": [1, 2, 3, 4]}, None), + ({"x": [9, 8, 7, 6]}, {"x": pl.Int8}), + ({"x": ["aa", "bb"]}, {"x": pl.Struct}), + ({"x": [None, None], "y": [None, None]}, {"x": pl.Date, "y": pl.Float64}), + ], +) +def test_boolean_where_clauses( + data: dict[str, Any], schema: dict[str, DataType] | None +) -> None: + df = pl.DataFrame(data=data, schema=schema) + empty_df = df.clear() + + for true in ("TRUE", "1=1", "2 == 2", "'xx' = 'xx'", "TRUE AND 1=1"): + assert_frame_equal(df, df.sql(f"SELECT * FROM self WHERE {true}")) + + for false in ("false", "1!=1", "2 != 2", "'xx' != 'xx'", "FALSE OR 1!=1"): + assert_frame_equal(empty_df, df.sql(f"SELECT * FROM self WHERE {false}")) + + def test_count() -> None: df = pl.DataFrame( { diff --git a/py-polars/tests/unit/streaming/test_streaming_io.py b/py-polars/tests/unit/streaming/test_streaming_io.py index ff526d609a0a..0cbf0d90e4ba 100644 --- a/py-polars/tests/unit/streaming/test_streaming_io.py +++ b/py-polars/tests/unit/streaming/test_streaming_io.py @@ -1,5 +1,6 @@ from __future__ import annotations +import io from typing import TYPE_CHECKING, Any from unittest.mock import patch @@ -294,3 +295,26 @@ def test_streaming_empty_parquet_16523(tmp_path: Path) -> None: q = pl.scan_parquet(file_path) q2 = pl.LazyFrame({"a": [1]}, schema={"a": pl.Int32}) assert q.join(q2, on="a").collect(streaming=True).shape == (0, 1) + + +@pytest.mark.parametrize( + "method", + ["parquet", "csv"], +) +def test_nyi_scan_in_memory(method: str) -> None: + f = io.BytesIO() + df = pl.DataFrame( + { + "a": [1, 2, 3], + "b": ["x", "y", "z"], + } + ) + + (getattr(df, f"write_{method}"))(f) + + f.seek(0) + with 
pytest.raises( + pl.exceptions.ComputeError, + match="not yet implemented: Streaming scanning of in-memory buffers", + ): + (getattr(pl, f"scan_{method}"))(f).collect(streaming=True) diff --git a/py-polars/tests/unit/test_errors.py b/py-polars/tests/unit/test_errors.py index 2087387b1a8a..07b98d9d8111 100644 --- a/py-polars/tests/unit/test_errors.py +++ b/py-polars/tests/unit/test_errors.py @@ -349,7 +349,7 @@ def test_arr_eval_named_cols() -> None: def test_alias_in_join_keys() -> None: df = pl.DataFrame({"A": ["a", "b"], "B": [["a", "b"], ["c", "d"]]}) with pytest.raises( - ComputeError, + InvalidOperationError, match=r"'alias' is not allowed in a join key, use 'with_columns' first", ): df.join(df, on=pl.col("A").alias("foo"))