From 5c29bd73d4eda7446d29b9a6736e79ea1184e280 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Tue, 3 Sep 2024 15:08:26 +0200 Subject: [PATCH 01/27] feat: Allow BytesIO for Parquet scan --- crates/polars-io/src/path_utils/mod.rs | 6 +- crates/polars-lazy/src/lib.rs | 1 + crates/polars-lazy/src/scan/csv.rs | 91 +++-- .../polars-lazy/src/scan/file_list_reader.rs | 22 +- crates/polars-lazy/src/scan/ipc.rs | 19 +- crates/polars-lazy/src/scan/ndjson.rs | 22 +- crates/polars-lazy/src/scan/parquet.rs | 26 +- .../src/executors/scan/csv.rs | 23 +- .../src/executors/scan/ipc.rs | 30 +- .../src/executors/scan/ndjson.rs | 17 +- .../src/executors/scan/parquet.rs | 323 +++++++++--------- crates/polars-mem-engine/src/planner/lp.rs | 10 +- crates/polars-mem-engine/src/utils.rs | 4 +- .../polars-pipe/src/executors/sources/csv.rs | 13 +- .../src/executors/sources/parquet.rs | 12 +- crates/polars-pipe/src/pipeline/convert.rs | 8 +- crates/polars-plan/src/client/check.rs | 33 +- crates/polars-plan/src/plans/builder_dsl.rs | 31 +- .../src/plans/conversion/dsl_to_ir.rs | 134 +++++--- .../polars-plan/src/plans/conversion/mod.rs | 6 +- .../polars-plan/src/plans/conversion/scans.rs | 97 ++++-- crates/polars-plan/src/plans/ir/dot.rs | 3 +- crates/polars-plan/src/plans/ir/format.rs | 3 +- crates/polars-plan/src/plans/ir/inputs.rs | 4 +- crates/polars-plan/src/plans/ir/mod.rs | 176 +++++++++- crates/polars-plan/src/plans/mod.rs | 19 +- .../src/plans/optimizer/count_star.rs | 5 +- .../plans/optimizer/predicate_pushdown/mod.rs | 9 +- .../optimizer/projection_pushdown/mod.rs | 4 +- .../src/plans/optimizer/slice_pushdown_lp.rs | 12 +- crates/polars-plan/src/plans/visitor/hash.rs | 10 +- crates/polars-python/src/file.rs | 64 ++++ crates/polars-python/src/lazyframe/general.rs | 53 ++- .../src/lazyframe/visitor/nodes.rs | 4 +- .../polars-stream/src/nodes/parquet_source.rs | 6 +- .../src/physical_plan/lower_ir.rs | 4 +- crates/polars-stream/src/physical_plan/mod.rs | 2 +- .../src/utils/late_materialized_df.rs | 4 +- py-polars/polars/io/parquet/functions.py | 6 +- py-polars/tests/unit/io/test_parquet.py | 176 +++------- 40 files changed, 908 insertions(+), 584 deletions(-) diff --git a/crates/polars-io/src/path_utils/mod.rs b/crates/polars-io/src/path_utils/mod.rs index 5c4e48f7e6e4..d98034f6096c 100644 --- a/crates/polars-io/src/path_utils/mod.rs +++ b/crates/polars-io/src/path_utils/mod.rs @@ -88,7 +88,7 @@ pub fn expand_paths( paths: &[PathBuf], glob: bool, #[allow(unused_variables)] cloud_options: Option<&CloudOptions>, -) -> PolarsResult>> { +) -> PolarsResult> { expand_paths_hive(paths, glob, cloud_options, false).map(|x| x.0) } @@ -129,7 +129,7 @@ pub fn expand_paths_hive( glob: bool, #[allow(unused_variables)] cloud_options: Option<&CloudOptions>, check_directory_level: bool, -) -> PolarsResult<(Arc>, usize)> { +) -> PolarsResult<(Arc<[PathBuf]>, usize)> { let Some(first_path) = paths.first() else { return Ok((vec![].into(), 0)); }; @@ -361,7 +361,7 @@ pub fn expand_paths_hive( out_paths }; - Ok((Arc::new(out_paths), hive_idx_tracker.idx)) + Ok((out_paths.into(), hive_idx_tracker.idx)) } /// Ignores errors from `std::fs::create_dir_all` if the directory exists. 
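A minimal usage sketch (not part of the diff) of the behaviour this patch series works toward: scanning Parquet directly from an in-memory buffer on the Python side. It assumes `pl.scan_parquet` ends up accepting a `BytesIO` object once the py-polars changes further down in this patch land; treat it as illustrative rather than the final API.

import io

import polars as pl

# Write a small frame to an in-memory Parquet buffer.
df = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
buf = io.BytesIO()
df.write_parquet(buf)
buf.seek(0)

# Scan the buffer lazily instead of a file path (what this change enables),
# then run an ordinary lazy query against it.
out = pl.scan_parquet(buf).filter(pl.col("a") > 1).collect()
print(out)

On the Rust side this corresponds to the new `ScanSource::Buffer` variant introduced below, which the scan builders carry instead of a plain `Arc<[PathBuf]>`.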
diff --git a/crates/polars-lazy/src/lib.rs b/crates/polars-lazy/src/lib.rs index 024f2a26bffb..005a09186ba2 100644 --- a/crates/polars-lazy/src/lib.rs +++ b/crates/polars-lazy/src/lib.rs @@ -206,6 +206,7 @@ pub mod dsl; pub mod frame; pub mod physical_plan; pub mod prelude; + mod scan; #[cfg(test)] mod tests; diff --git a/crates/polars-lazy/src/scan/csv.rs b/crates/polars-lazy/src/scan/csv.rs index 54e9c77e2480..676c34b6a71e 100644 --- a/crates/polars-lazy/src/scan/csv.rs +++ b/crates/polars-lazy/src/scan/csv.rs @@ -5,6 +5,7 @@ use polars_io::cloud::CloudOptions; use polars_io::csv::read::{ infer_file_schema, CommentPrefix, CsvEncoding, CsvParseOptions, CsvReadOptions, NullValues, }; +use polars_io::mmap::ReaderBytes; use polars_io::path_utils::expand_paths; use polars_io::utils::get_reader_bytes; use polars_io::RowIndex; @@ -14,7 +15,7 @@ use crate::prelude::*; #[derive(Clone)] #[cfg(feature = "csv")] pub struct LazyCsvReader { - paths: Arc>, + source: ScanSource, glob: bool, cache: bool, read_options: CsvReadOptions, @@ -30,13 +31,13 @@ impl LazyCsvReader { self } - pub fn new_paths(paths: Arc>) -> Self { + pub fn new_paths(paths: Arc<[PathBuf]>) -> Self { Self::new("").with_paths(paths) } pub fn new(path: impl AsRef) -> Self { LazyCsvReader { - paths: Arc::new(vec![path.as_ref().to_path_buf()]), + source: ScanSource::Files([path.as_ref().to_path_buf()].into()), glob: true, cache: true, read_options: Default::default(), @@ -219,38 +220,54 @@ impl LazyCsvReader { where F: Fn(Schema) -> PolarsResult, { - // TODO: Path expansion should happen when converting to the IR - // https://github.com/pola-rs/polars/issues/17634 - let paths = expand_paths(self.paths(), self.glob(), self.cloud_options())?; + let mut n_threads = self.read_options.n_threads; + + let mut infer_schema = |reader_bytes: ReaderBytes| { + let skip_rows = self.read_options.skip_rows; + let parse_options = self.read_options.get_parse_options(); + + PolarsResult::Ok( + infer_file_schema( + &reader_bytes, + parse_options.separator, + self.read_options.infer_schema_length, + self.read_options.has_header, + // we set it to None and modify them after the schema is updated + None, + skip_rows, + self.read_options.skip_rows_after_header, + parse_options.comment_prefix.as_ref(), + parse_options.quote_char, + parse_options.eol_char, + None, + parse_options.try_parse_dates, + self.read_options.raise_if_empty, + &mut n_threads, + parse_options.decimal_comma, + )? + .0, + ) + }; - let Some(path) = paths.first() else { - polars_bail!(ComputeError: "no paths specified for this reader"); + let schema = match self.source.clone() { + ScanSource::Files(paths) => { + // TODO: Path expansion should happen when converting to the IR + // https://github.com/pola-rs/polars/issues/17634 + let paths = expand_paths(&paths[..], self.glob(), self.cloud_options())?; + + let Some(path) = paths.first() else { + polars_bail!(ComputeError: "no paths specified for this reader"); + }; + + let mut file = polars_utils::open_file(path)?; + infer_schema(get_reader_bytes(&mut file).expect("could not mmap file"))? 
+ }, + ScanSource::Buffer(buffer) => infer_schema( + get_reader_bytes(&mut std::io::Cursor::new(buffer)).expect("could not mmap file"), + )?, }; - let mut file = polars_utils::open_file(path)?; - - let reader_bytes = get_reader_bytes(&mut file).expect("could not mmap file"); - let skip_rows = self.read_options.skip_rows; - let parse_options = self.read_options.get_parse_options(); - - let (schema, _, _) = infer_file_schema( - &reader_bytes, - parse_options.separator, - self.read_options.infer_schema_length, - self.read_options.has_header, - // we set it to None and modify them after the schema is updated - None, - skip_rows, - self.read_options.skip_rows_after_header, - parse_options.comment_prefix.as_ref(), - parse_options.quote_char, - parse_options.eol_char, - None, - parse_options.try_parse_dates, - self.read_options.raise_if_empty, - &mut self.read_options.n_threads, - parse_options.decimal_comma, - )?; + self.read_options.n_threads = n_threads; let mut schema = f(schema)?; // the dtypes set may be for the new names, so update again @@ -273,7 +290,7 @@ impl LazyFileListReader for LazyCsvReader { /// Get the final [LazyFrame]. fn finish(self) -> PolarsResult { let mut lf: LazyFrame = DslBuilder::scan_csv( - self.paths, + self.source.to_dsl(false), self.read_options, self.cache, self.cloud_options, @@ -294,12 +311,12 @@ impl LazyFileListReader for LazyCsvReader { self.glob } - fn paths(&self) -> &[PathBuf] { - &self.paths + fn source(&self) -> &ScanSource { + &self.source } - fn with_paths(mut self, paths: Arc>) -> Self { - self.paths = paths; + fn with_source(mut self, source: ScanSource) -> Self { + self.source = source; self } diff --git a/crates/polars-lazy/src/scan/file_list_reader.rs b/crates/polars-lazy/src/scan/file_list_reader.rs index f7b91d427200..8992b8df5a65 100644 --- a/crates/polars-lazy/src/scan/file_list_reader.rs +++ b/crates/polars-lazy/src/scan/file_list_reader.rs @@ -1,4 +1,5 @@ use std::path::PathBuf; +use std::sync::Arc; use polars_core::prelude::*; use polars_io::cloud::CloudOptions; @@ -18,8 +19,11 @@ pub trait LazyFileListReader: Clone { return self.finish_no_glob(); } - let lfs = self - .paths() + let ScanSource::Files(paths) = self.source() else { + unreachable!("Should never be globbed"); + }; + + let lfs = paths .iter() .map(|path| { self.clone() @@ -27,7 +31,7 @@ pub trait LazyFileListReader: Clone { .with_n_rows(None) // Each individual reader should not apply a row index. .with_row_index(None) - .with_paths(Arc::new(vec![path.clone()])) + .with_paths([path.clone()].into()) .with_rechunk(false) .finish_no_glob() .map_err(|e| { @@ -40,7 +44,7 @@ pub trait LazyFileListReader: Clone { polars_ensure!( !lfs.is_empty(), - ComputeError: "no matching files found in {:?}", self.paths().iter().map(|x| x.to_str().unwrap()).collect::>() + ComputeError: "no matching files found in {:?}", paths.iter().map(|x| x.to_str().unwrap()).collect::>() ); let mut lf = self.concat_impl(lfs)?; @@ -79,11 +83,17 @@ pub trait LazyFileListReader: Clone { true } - fn paths(&self) -> &[PathBuf]; + fn source(&self) -> &ScanSource; + + /// Set paths of the scanned files. + #[must_use] + fn with_source(self, source: ScanSource) -> Self; /// Set paths of the scanned files. #[must_use] - fn with_paths(self, paths: Arc>) -> Self; + fn with_paths(self, paths: Arc<[PathBuf]>) -> Self { + self.with_source(ScanSource::Files(paths)) + } /// Configure the row limit. 
fn with_n_rows(self, n_rows: impl Into>) -> Self; diff --git a/crates/polars-lazy/src/scan/ipc.rs b/crates/polars-lazy/src/scan/ipc.rs index 9d981bc74c0e..af0b53ade823 100644 --- a/crates/polars-lazy/src/scan/ipc.rs +++ b/crates/polars-lazy/src/scan/ipc.rs @@ -37,21 +37,20 @@ impl Default for ScanArgsIpc { #[derive(Clone)] struct LazyIpcReader { args: ScanArgsIpc, - paths: Arc>, + source: ScanSource, } impl LazyIpcReader { fn new(args: ScanArgsIpc) -> Self { Self { args, - paths: Arc::new(vec![]), + source: ScanSource::default(), } } } impl LazyFileListReader for LazyIpcReader { fn finish(self) -> PolarsResult { - let paths = self.paths; let args = self.args; let options = IpcScanOptions { @@ -59,7 +58,7 @@ impl LazyFileListReader for LazyIpcReader { }; let mut lf: LazyFrame = DslBuilder::scan_ipc( - paths, + self.source.to_dsl(false), options, args.n_rows, args.cache, @@ -80,12 +79,12 @@ impl LazyFileListReader for LazyIpcReader { unreachable!() } - fn paths(&self) -> &[PathBuf] { - &self.paths + fn source(&self) -> &ScanSource { + &self.source } - fn with_paths(mut self, paths: Arc>) -> Self { - self.paths = paths; + fn with_source(mut self, source: ScanSource) -> Self { + self.source = source; self } @@ -126,11 +125,11 @@ impl LazyFrame { /// Create a LazyFrame directly from a ipc scan. pub fn scan_ipc(path: impl AsRef, args: ScanArgsIpc) -> PolarsResult { LazyIpcReader::new(args) - .with_paths(Arc::new(vec![path.as_ref().to_path_buf()])) + .with_paths([path.as_ref().to_path_buf()].into()) .finish() } - pub fn scan_ipc_files(paths: Arc>, args: ScanArgsIpc) -> PolarsResult { + pub fn scan_ipc_files(paths: Arc<[PathBuf]>, args: ScanArgsIpc) -> PolarsResult { LazyIpcReader::new(args).with_paths(paths).finish() } } diff --git a/crates/polars-lazy/src/scan/ndjson.rs b/crates/polars-lazy/src/scan/ndjson.rs index 0effd26d5497..9a1d071f8365 100644 --- a/crates/polars-lazy/src/scan/ndjson.rs +++ b/crates/polars-lazy/src/scan/ndjson.rs @@ -1,11 +1,11 @@ use std::num::NonZeroUsize; use std::path::{Path, PathBuf}; -use std::sync::{Arc, Mutex, RwLock}; +use std::sync::{Arc, RwLock}; use polars_core::prelude::*; use polars_io::cloud::CloudOptions; use polars_io::RowIndex; -use polars_plan::plans::{DslPlan, FileScan}; +use polars_plan::plans::{DslPlan, FileScan, ScanSource}; use polars_plan::prelude::{FileScanOptions, NDJsonReadOptions}; use crate::prelude::LazyFrame; @@ -13,7 +13,7 @@ use crate::scan::file_list_reader::LazyFileListReader; #[derive(Clone)] pub struct LazyJsonLineReader { - pub(crate) paths: Arc>, + pub(crate) source: ScanSource, pub(crate) batch_size: Option, pub(crate) low_memory: bool, pub(crate) rechunk: bool, @@ -28,13 +28,13 @@ pub struct LazyJsonLineReader { } impl LazyJsonLineReader { - pub fn new_paths(paths: Arc>) -> Self { + pub fn new_paths(paths: Arc<[PathBuf]>) -> Self { Self::new(PathBuf::new()).with_paths(paths) } pub fn new(path: impl AsRef) -> Self { LazyJsonLineReader { - paths: Arc::new(vec![path.as_ref().to_path_buf()]), + source: ScanSource::Files([path.as_ref().to_path_buf()].into()), batch_size: None, low_memory: false, rechunk: false, @@ -117,8 +117,6 @@ impl LazyJsonLineReader { impl LazyFileListReader for LazyJsonLineReader { fn finish(self) -> PolarsResult { - let paths = Arc::new(Mutex::new((self.paths, false))); - let file_options = FileScanOptions { slice: self.n_rows.map(|x| (0, x)), with_columns: None, @@ -147,7 +145,7 @@ impl LazyFileListReader for LazyJsonLineReader { }; Ok(LazyFrame::from(DslPlan::Scan { - paths, + sources: self.source.to_dsl(false), 
file_info: Arc::new(RwLock::new(None)), hive_parts: None, predicate: None, @@ -160,12 +158,12 @@ impl LazyFileListReader for LazyJsonLineReader { unreachable!(); } - fn paths(&self) -> &[PathBuf] { - &self.paths + fn source(&self) -> &ScanSource { + &self.source } - fn with_paths(mut self, paths: Arc>) -> Self { - self.paths = paths; + fn with_source(mut self, source: ScanSource) -> Self { + self.source = source; self } diff --git a/crates/polars-lazy/src/scan/parquet.rs b/crates/polars-lazy/src/scan/parquet.rs index e87e90e3330a..491ae3ee126c 100644 --- a/crates/polars-lazy/src/scan/parquet.rs +++ b/crates/polars-lazy/src/scan/parquet.rs @@ -44,14 +44,14 @@ impl Default for ScanArgsParquet { #[derive(Clone)] struct LazyParquetReader { args: ScanArgsParquet, - paths: Arc>, + source: ScanSource, } impl LazyParquetReader { fn new(args: ScanArgsParquet) -> Self { Self { args, - paths: Arc::new(vec![]), + source: ScanSource::default(), } } } @@ -62,7 +62,7 @@ impl LazyFileListReader for LazyParquetReader { let row_index = self.args.row_index; let mut lf: LazyFrame = DslBuilder::scan_parquet( - self.paths, + self.source.to_dsl(false), self.args.n_rows, self.args.cache, self.args.parallel, @@ -95,12 +95,12 @@ impl LazyFileListReader for LazyParquetReader { unreachable!(); } - fn paths(&self) -> &[PathBuf] { - &self.paths + fn source(&self) -> &ScanSource { + &self.source } - fn with_paths(mut self, paths: Arc>) -> Self { - self.paths = paths; + fn with_source(mut self, source: ScanSource) -> Self { + self.source = source; self } @@ -140,15 +140,17 @@ impl LazyFrame { /// Create a LazyFrame directly from a parquet scan. pub fn scan_parquet(path: impl AsRef, args: ScanArgsParquet) -> PolarsResult { LazyParquetReader::new(args) - .with_paths(Arc::new(vec![path.as_ref().to_path_buf()])) + .with_paths(vec![path.as_ref().to_path_buf()].into()) .finish() } /// Create a LazyFrame directly from a parquet scan. - pub fn scan_parquet_files( - paths: Arc>, - args: ScanArgsParquet, - ) -> PolarsResult { + pub fn scan_parquet_sourced(source: ScanSource, args: ScanArgsParquet) -> PolarsResult { + LazyParquetReader::new(args).with_source(source).finish() + } + + /// Create a LazyFrame directly from a parquet scan. + pub fn scan_parquet_files(paths: Arc<[PathBuf]>, args: ScanArgsParquet) -> PolarsResult { LazyParquetReader::new(args).with_paths(paths).finish() } } diff --git a/crates/polars-mem-engine/src/executors/scan/csv.rs b/crates/polars-mem-engine/src/executors/scan/csv.rs index 50ed974e128b..24e813329bcf 100644 --- a/crates/polars-mem-engine/src/executors/scan/csv.rs +++ b/crates/polars-mem-engine/src/executors/scan/csv.rs @@ -1,4 +1,3 @@ -use std::path::PathBuf; use std::sync::Arc; use polars_core::config; @@ -9,7 +8,7 @@ use polars_core::utils::{ use super::*; pub struct CsvExec { - pub paths: Arc>, + pub sources: ScanSource, pub file_info: FileInfo, pub options: CsvReadOptions, pub file_options: FileScanOptions, @@ -18,6 +17,7 @@ pub struct CsvExec { impl CsvExec { fn read(&self) -> PolarsResult { + let paths = self.sources.as_paths(); let with_columns = self .file_options .with_columns @@ -45,7 +45,7 @@ impl CsvExec { .with_row_index(None) .with_path::<&str>(None); - if self.paths.is_empty() { + if paths.is_empty() { let out = if let Some(schema) = options_base.schema { DataFrame::from_rows_and_schema(&[], schema.as_ref())? 
} else { @@ -56,7 +56,7 @@ impl CsvExec { let verbose = config::verbose(); let force_async = config::force_async(); - let run_async = force_async || is_cloud_url(self.paths.first().unwrap()); + let run_async = force_async || is_cloud_url(paths.first().unwrap()); if force_async && verbose { eprintln!("ASYNC READING FORCED"); @@ -64,7 +64,7 @@ impl CsvExec { let finish_read = |i: usize, options: CsvReadOptions, predicate: Option>| { - let path = &self.paths[i]; + let path = &paths[i]; let mut df = if run_async { #[cfg(feature = "cloud")] { @@ -123,14 +123,14 @@ impl CsvExec { } let mut n_rows_read = 0usize; - let mut out = Vec::with_capacity(self.paths.len()); + let mut out = Vec::with_capacity(paths.len()); // If we have n_rows or row_index then we need to count how many rows we read, so we need // to delay applying the predicate. let predicate_during_read = predicate .clone() .filter(|_| n_rows.is_none() && self.file_options.row_index.is_none()); - for i in 0..self.paths.len() { + for i in 0..paths.len() { let opts = options_base .clone() .with_row_index(self.file_options.row_index.clone().map(|mut ri| { @@ -178,7 +178,7 @@ impl CsvExec { "reached n_rows = {} at file {} / {}", n_rows.unwrap(), 1 + i, - self.paths.len() + paths.len() ) } break; @@ -203,10 +203,10 @@ impl CsvExec { let dfs = POOL.install(|| { let step = std::cmp::min(POOL.current_num_threads(), 128); - (0..self.paths.len()) + (0..paths.len()) .step_by(step) .map(|start| { - (start..std::cmp::min(start.saturating_add(step), self.paths.len())) + (start..std::cmp::min(start.saturating_add(step), paths.len())) .into_par_iter() .map(|i| finish_read(i, options_base.clone(), predicate.clone())) .collect::>>() @@ -234,9 +234,10 @@ impl CsvExec { impl Executor for CsvExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { + let paths = self.sources.as_paths(); let profile_name = if state.has_node_timer() { let mut ids = vec![PlSmallStr::from_str( - self.paths[0].to_string_lossy().as_ref(), + paths[0].to_string_lossy().as_ref(), )]; if self.predicate.is_some() { ids.push("predicate".into()) diff --git a/crates/polars-mem-engine/src/executors/scan/ipc.rs b/crates/polars-mem-engine/src/executors/scan/ipc.rs index 18d47c172bcd..b29e44a5e33c 100644 --- a/crates/polars-mem-engine/src/executors/scan/ipc.rs +++ b/crates/polars-mem-engine/src/executors/scan/ipc.rs @@ -1,4 +1,3 @@ -use std::path::PathBuf; use hive::HivePartitions; use polars_core::config; @@ -11,7 +10,7 @@ use rayon::prelude::*; use super::*; pub struct IpcExec { - pub(crate) paths: Arc>, + pub(crate) sources: ScanSource, pub(crate) file_info: FileInfo, pub(crate) predicate: Option>, pub(crate) options: IpcScanOptions, @@ -22,7 +21,8 @@ pub struct IpcExec { impl IpcExec { fn read(&mut self) -> PolarsResult { - let is_cloud = self.paths.iter().any(is_cloud_url); + let paths = self.sources.as_paths(); + let is_cloud = paths.iter().any(is_cloud_url); let force_async = config::force_async(); let mut out = if is_cloud || force_async { @@ -54,6 +54,7 @@ impl IpcExec { &mut self, path_idx_to_file: F, ) -> PolarsResult { + let paths = self.sources.as_paths(); if config::verbose() { eprintln!("executing ipc read sync with row_index = {:?}, n_rows = {:?}, predicate = {:?} for paths {:?}", self.file_options.row_index.as_ref(), @@ -62,7 +63,7 @@ impl IpcExec { x.1 }).as_ref(), self.predicate.is_some(), - self.paths + paths ); } @@ -86,13 +87,13 @@ impl IpcExec { .with_include_file_path(self.file_options.include_file_paths.as_ref().map(|x| { ( x.clone(), - 
Arc::from(self.paths[path_index].to_str().unwrap().to_string()), + Arc::from(paths[path_index].to_str().unwrap().to_string()), ) })) .memory_mapped( self.options .memory_map - .then(|| self.paths[path_index].clone()), + .then(|| paths[path_index].clone()), ) .finish() }; @@ -101,9 +102,9 @@ impl IpcExec { assert_eq!(x.0, 0); x.1 }) { - let mut out = Vec::with_capacity(self.paths.len()); + let mut out = Vec::with_capacity(paths.len()); - for i in 0..self.paths.len() { + for i in 0..paths.len() { let df = read_path(i, Some(n_rows))?; let df_height = df.height(); out.push(df); @@ -121,7 +122,7 @@ impl IpcExec { out } else { POOL.install(|| { - (0..self.paths.len()) + (0..paths.len()) .into_par_iter() .map(|i| read_path(i, None)) .collect::>>() @@ -157,7 +158,8 @@ impl IpcExec { } fn read_sync(&mut self) -> PolarsResult { - let paths = self.paths.clone(); + let paths = self.sources.into_paths(); + let paths = paths.clone(); self.read_impl(move |i| std::fs::File::open(&paths[i]).map_err(Into::into)) } @@ -167,9 +169,11 @@ impl IpcExec { // concurrently. use polars_io::file_cache::init_entries_from_uri_list; + let paths = self.sources.into_paths(); + tokio::task::block_in_place(|| { let cache_entries = init_entries_from_uri_list( - self.paths + paths .iter() .map(|x| Arc::from(x.to_str().unwrap())) .collect::>() @@ -184,9 +188,11 @@ impl IpcExec { impl Executor for IpcExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { + let paths = self.sources.as_paths(); + let profile_name = if state.has_node_timer() { let mut ids = vec![PlSmallStr::from_str( - self.paths[0].to_string_lossy().as_ref(), + paths[0].to_string_lossy().as_ref(), )]; if self.predicate.is_some() { ids.push("predicate".into()) diff --git a/crates/polars-mem-engine/src/executors/scan/ndjson.rs b/crates/polars-mem-engine/src/executors/scan/ndjson.rs index 680e5cbf3bed..68ad24ab837e 100644 --- a/crates/polars-mem-engine/src/executors/scan/ndjson.rs +++ b/crates/polars-mem-engine/src/executors/scan/ndjson.rs @@ -1,12 +1,10 @@ -use std::path::PathBuf; - use polars_core::config; use polars_core::utils::accumulate_dataframes_vertical; use super::*; pub struct JsonExec { - paths: Arc>, + sources: ScanSource, options: NDJsonReadOptions, file_scan_options: FileScanOptions, file_info: FileInfo, @@ -15,14 +13,14 @@ pub struct JsonExec { impl JsonExec { pub fn new( - paths: Arc>, + sources: ScanSource, options: NDJsonReadOptions, file_scan_options: FileScanOptions, file_info: FileInfo, predicate: Option>, ) -> Self { Self { - paths, + sources, options, file_scan_options, file_info, @@ -38,10 +36,11 @@ impl JsonExec { .unwrap() .as_ref() .unwrap_right(); + let paths = self.sources.as_paths(); let verbose = config::verbose(); let force_async = config::force_async(); - let run_async = force_async || is_cloud_url(self.paths.first().unwrap()); + let run_async = force_async || is_cloud_url(paths.first().unwrap()); if force_async && verbose { eprintln!("ASYNC READING FORCED"); @@ -66,8 +65,7 @@ impl JsonExec { return Ok(df); } - let dfs = self - .paths + let dfs = paths .iter() .map_while(|p| { if n_rows == Some(0) { @@ -149,8 +147,9 @@ impl JsonExec { impl Executor for JsonExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { + let paths = self.sources.as_paths(); let profile_name = if state.has_node_timer() { - let ids = vec![self.paths[0].to_string_lossy().clone()]; + let ids = vec![paths[0].to_string_lossy().clone()]; let name = comma_delimited("ndjson".to_string(), &ids); Cow::Owned(name) } else { diff 
--git a/crates/polars-mem-engine/src/executors/scan/parquet.rs b/crates/polars-mem-engine/src/executors/scan/parquet.rs index bd3d87ff8832..efed503ad511 100644 --- a/crates/polars-mem-engine/src/executors/scan/parquet.rs +++ b/crates/polars-mem-engine/src/executors/scan/parquet.rs @@ -1,10 +1,9 @@ -use std::path::PathBuf; - use hive::HivePartitions; use polars_core::config; #[cfg(feature = "cloud")] use polars_core::config::{get_file_prefetch_size, verbose}; use polars_core::utils::accumulate_dataframes_vertical; +use polars_error::feature_gated; use polars_io::cloud::CloudOptions; use polars_io::parquet::metadata::FileMetaDataRef; use polars_io::path_utils::is_cloud_url; @@ -14,7 +13,7 @@ use polars_io::RowIndex; use super::*; pub struct ParquetExec { - paths: Arc>, + sources: ScanSource, file_info: FileInfo, hive_parts: Option>>, predicate: Option>, @@ -29,7 +28,7 @@ pub struct ParquetExec { impl ParquetExec { #[allow(clippy::too_many_arguments)] pub(crate) fn new( - paths: Arc>, + sources: ScanSource, file_info: FileInfo, hive_parts: Option>>, predicate: Option>, @@ -39,7 +38,7 @@ impl ParquetExec { metadata: Option, ) -> Self { ParquetExec { - paths, + sources, file_info, hive_parts, predicate, @@ -52,7 +51,7 @@ impl ParquetExec { fn read_par(&mut self) -> PolarsResult> { let parallel = match self.options.parallel { - ParallelStrategy::Auto if self.paths.len() > POOL.current_num_threads() => { + ParallelStrategy::Auto if self.sources.num_sources() > POOL.current_num_threads() => { ParallelStrategy::RowGroups }, identity => identity, @@ -61,166 +60,159 @@ impl ParquetExec { let mut result = vec![]; let step = std::cmp::min(POOL.current_num_threads(), 128); - // Modified if we have a negative slice - let mut first_file = 0; - - // (offset, end) - let (slice_offset, slice_end) = if let Some(slice) = self.file_options.slice { - if slice.0 >= 0 { - (slice.0 as usize, slice.1.saturating_add(slice.0 as usize)) - } else { - // Walk the files in reverse until we find the first file, and then translate the - // slice into a positive-offset equivalent. - let slice_start_as_n_from_end = -slice.0 as usize; - let mut cum_rows = 0; - let chunk_size = 8; - POOL.install(|| { - for path_indexes in (0..self.paths.len()) - .rev() - .collect::>() - .chunks(chunk_size) - { - let row_counts = path_indexes - .into_par_iter() - .map(|i| { - ParquetReader::new(std::fs::File::open(&self.paths[*i])?).num_rows() - }) - .collect::>>()?; - - for (path_idx, rc) in path_indexes.iter().zip(row_counts) { - cum_rows += rc; - - if cum_rows >= slice_start_as_n_from_end { - first_file = *path_idx; - break; - } - } - - if first_file > 0 { - break; - } - } - - PolarsResult::Ok(()) - })?; - - let (start, len) = if slice_start_as_n_from_end > cum_rows { - // We need to trim the slice, e.g. SLICE[offset: -100, len: 75] on a file of 50 - // rows should only give the first 25 rows. 
- let first_file_position = slice_start_as_n_from_end - cum_rows; - (0, slice.1.saturating_sub(first_file_position)) - } else { - (cum_rows - slice_start_as_n_from_end, slice.1) - }; - - let end = start.saturating_add(len); - - (start, end) - } - } else { - (0, usize::MAX) + let slice_info = match self.file_options.slice { + None => ScanSourceSliceInfo { + item_slice: 0..usize::MAX, + source_slice: 0..self.sources.num_sources(), + }, + Some(slice) => self.sources.collect_slice_information( + slice, + |path| ParquetReader::new(std::fs::File::open(path)?).num_rows(), + |buff| ParquetReader::new(std::io::Cursor::new(buff)).num_rows(), + )?, }; - let mut current_offset = 0; - let base_row_index = self.file_options.row_index.take(); - // Limit no. of files at a time to prevent open file limits. - - for i in (first_file..self.paths.len()).step_by(step) { - let end = std::cmp::min(i.saturating_add(step), self.paths.len()); - let paths = &self.paths[i..end]; - let hive_parts = self.hive_parts.as_ref().map(|x| &x[i..end]); - - if current_offset >= slice_end && !result.is_empty() { - return Ok(result); - } - - // First initialize the readers, predicates and metadata. - // This will be used to determine the slices. That way we can actually read all the - // files in parallel even if we add row index columns or slices. - let iter = (0..paths.len()).into_par_iter().map(|i| { - let path = &paths[i]; - let hive_partitions = hive_parts.map(|x| x[i].materialize_partition_columns()); - - let file = std::fs::File::open(path)?; + match &self.sources { + ScanSource::Buffer(buffer) => { + let row_index = self.file_options.row_index.take(); let (projection, predicate) = prepare_scan_args( self.predicate.clone(), &mut self.file_options.with_columns.clone(), &mut self.file_info.schema.clone(), - base_row_index.is_some(), - hive_partitions.as_deref(), + row_index.is_some(), + None, ); - let mut reader = ParquetReader::new(file) + result = vec![ParquetReader::new(std::io::Cursor::new(buffer)) .read_parallel(parallel) .set_low_memory(self.options.low_memory) .use_statistics(self.options.use_statistics) .set_rechunk(false) - .with_hive_partition_columns(hive_partitions) - .with_include_file_path( - self.file_options - .include_file_paths - .as_ref() - .map(|x| (x.clone(), Arc::from(paths[i].to_str().unwrap()))), - ); - - reader - .num_rows() - .map(|num_rows| (reader, num_rows, predicate, projection)) - }); + .with_slice(Some((slice_info.item_slice.start, slice_info.item_slice.len()))) + .with_row_index(row_index) + .with_predicate(predicate.clone()) + .with_projection(projection.clone()) + .check_schema( + self.file_info + .reader_schema + .clone() + .unwrap() + .unwrap_left() + .as_ref(), + )? + .finish()?]; + }, + ScanSource::Files(paths) => { + let mut current_offset = 0; + let base_row_index = self.file_options.row_index.take(); + // Limit no. of files at a time to prevent open file limits. + + for i in slice_info.source_slice.step_by(step) { + let end = std::cmp::min(i.saturating_add(step), paths.len()); + let paths = &paths[i..end]; + let hive_parts = self.hive_parts.as_ref().map(|x| &x[i..end]); + + if current_offset >= slice_info.item_slice.end && !result.is_empty() { + return Ok(result); + } - // We do this in parallel because wide tables can take a long time deserializing metadata. - let readers_and_metadata = POOL.install(|| iter.collect::>>())?; + // First initialize the readers, predicates and metadata. + // This will be used to determine the slices. 
That way we can actually read all the + // files in parallel even if we add row index columns or slices. + let iter = (0..paths.len()).into_par_iter().map(|i| { + let path = &paths[i]; + let hive_partitions = + hive_parts.map(|x| x[i].materialize_partition_columns()); - let current_offset_ref = &mut current_offset; - let row_statistics = readers_and_metadata - .iter() - .map(|(_, num_rows, _, _)| { - let cum_rows = *current_offset_ref; - ( - cum_rows, - split_slice_at_file(current_offset_ref, *num_rows, slice_offset, slice_end), - ) - }) - .collect::>(); + let file = std::fs::File::open(path)?; + let (projection, predicate) = prepare_scan_args( + self.predicate.clone(), + &mut self.file_options.with_columns.clone(), + &mut self.file_info.schema.clone(), + base_row_index.is_some(), + hive_partitions.as_deref(), + ); - let out = POOL.install(|| { - readers_and_metadata - .into_par_iter() - .zip(row_statistics.into_par_iter()) - .map( - |((reader, _, predicate, projection), (cumulative_read, slice))| { - let row_index = base_row_index.as_ref().map(|rc| RowIndex { - name: rc.name.clone(), - offset: rc.offset + cumulative_read as IdxSize, - }); - - let df = reader - .with_slice(Some(slice)) - .with_row_index(row_index) - .with_predicate(predicate.clone()) - .with_projection(projection.clone()) - .check_schema( - self.file_info - .reader_schema - .clone() - .unwrap() - .unwrap_left() - .as_ref(), - )? - .finish()?; - - Ok(df) - }, - ) - .collect::>>() - })?; + let mut reader = ParquetReader::new(file) + .read_parallel(parallel) + .set_low_memory(self.options.low_memory) + .use_statistics(self.options.use_statistics) + .set_rechunk(false) + .with_hive_partition_columns(hive_partitions) + .with_include_file_path( + self.file_options + .include_file_paths + .as_ref() + .map(|x| (x.clone(), Arc::from(paths[i].to_str().unwrap()))), + ); - if result.is_empty() { - result = out; - } else { - result.extend_from_slice(&out) - } + reader + .num_rows() + .map(|num_rows| (reader, num_rows, predicate, projection)) + }); + + // We do this in parallel because wide tables can take a long time deserializing metadata. + let readers_and_metadata = + POOL.install(|| iter.collect::>>())?; + + let current_offset_ref = &mut current_offset; + let row_statistics = readers_and_metadata + .iter() + .map(|(_, num_rows, _, _)| { + let cum_rows = *current_offset_ref; + ( + cum_rows, + split_slice_at_file( + current_offset_ref, + *num_rows, + slice_info.item_slice.start, + slice_info.item_slice.end, + ), + ) + }) + .collect::>(); + + let out = POOL.install(|| { + readers_and_metadata + .into_par_iter() + .zip(row_statistics.into_par_iter()) + .map( + |((reader, _, predicate, projection), (cumulative_read, slice))| { + let row_index = base_row_index.as_ref().map(|rc| RowIndex { + name: rc.name.clone(), + offset: rc.offset + cumulative_read as IdxSize, + }); + + let df = reader + .with_slice(Some(slice)) + .with_row_index(row_index) + .with_predicate(predicate.clone()) + .with_projection(projection.clone()) + .check_schema( + self.file_info + .reader_schema + .clone() + .unwrap() + .unwrap_left() + .as_ref(), + )? 
+ .finish()?; + + Ok(df) + }, + ) + .collect::>>() + })?; + + if result.is_empty() { + result = out; + } else { + result.extend_from_slice(&out) + } + } + }, } + Ok(result) } @@ -231,6 +223,7 @@ impl ParquetExec { use polars_io::utils::slice::split_slice_at_file; let verbose = verbose(); + let paths = self.sources.into_paths(); let first_metadata = &self.metadata; let cloud_options = self.cloud_options.as_ref(); @@ -254,13 +247,13 @@ impl ParquetExec { let slice_start_as_n_from_end = -slice.0 as usize; let mut cum_rows = 0; - let paths = &self.paths; + let paths = &paths; let cloud_options = Arc::new(self.cloud_options.clone()); let paths = paths.clone(); let cloud_options = cloud_options.clone(); - let mut iter = stream::iter((0..self.paths.len()).rev().map(|i| { + let mut iter = stream::iter((0..paths.len()).rev().map(|i| { let paths = paths.clone(); let cloud_options = cloud_options.clone(); @@ -312,9 +305,9 @@ impl ParquetExec { let base_row_index = self.file_options.row_index.take(); let mut processed = 0; - for batch_start in (first_file_idx..self.paths.len()).step_by(batch_size) { - let end = std::cmp::min(batch_start.saturating_add(batch_size), self.paths.len()); - let paths = &self.paths[batch_start..end]; + for batch_start in (first_file_idx..paths.len()).step_by(batch_size) { + let end = std::cmp::min(batch_start.saturating_add(batch_size), paths.len()); + let paths = &paths[batch_start..end]; let hive_parts = self.hive_parts.as_ref().map(|x| &x[batch_start..end]); if current_offset >= slice_end && !result.is_empty() { @@ -325,7 +318,7 @@ impl ParquetExec { eprintln!( "querying metadata of {}/{} files...", processed, - self.paths.len() + paths.len() ); } @@ -371,7 +364,7 @@ impl ParquetExec { let include_file_paths = self.file_options.include_file_paths.as_ref(); if verbose { - eprintln!("reading of {}/{} file...", processed, self.paths.len()); + eprintln!("reading of {}/{} file...", processed, paths.len()); } let iter = readers_and_metadata @@ -447,23 +440,20 @@ impl ParquetExec { .and_then(|_| self.predicate.take()) .map(phys_expr_to_io_expr); - let is_cloud = is_cloud_url(self.paths.first().unwrap()); + let is_cloud = match &self.sources { + ScanSource::Files(paths) => is_cloud_url(paths.first().unwrap()), + ScanSource::Buffer(_) => false, + }; let force_async = config::force_async(); let out = if is_cloud || force_async { - #[cfg(not(feature = "cloud"))] - { - panic!("activate cloud feature") - } - - #[cfg(feature = "cloud")] - { + feature_gated!("cloud", { if force_async && config::verbose() { eprintln!("ASYNC READING FORCED"); } polars_io::pl_async::get_runtime().block_on_potential_spawn(self.read_async())? - } + }) } else { self.read_par()? 
}; @@ -482,7 +472,8 @@ impl ParquetExec { impl Executor for ParquetExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { let profile_name = if state.has_node_timer() { - let mut ids = vec![self.paths[0].to_string_lossy()]; + let paths = self.sources.as_paths(); + let mut ids = vec![paths[0].to_string_lossy()]; if self.predicate.is_some() { ids.push("predicate".into()) } diff --git a/crates/polars-mem-engine/src/planner/lp.rs b/crates/polars-mem-engine/src/planner/lp.rs index 523cd1e5c588..45487f7b7024 100644 --- a/crates/polars-mem-engine/src/planner/lp.rs +++ b/crates/polars-mem-engine/src/planner/lp.rs @@ -276,7 +276,7 @@ fn create_physical_plan_impl( }, #[allow(unused_variables)] Scan { - paths, + sources, file_info, hive_parts, output_schema, @@ -306,7 +306,7 @@ fn create_physical_plan_impl( match scan_type { #[cfg(feature = "csv")] FileScan::Csv { options, .. } => Ok(Box::new(executors::CsvExec { - paths, + sources, file_info, options, predicate, @@ -318,7 +318,7 @@ fn create_physical_plan_impl( cloud_options, metadata, } => Ok(Box::new(executors::IpcExec { - paths, + sources, file_info, predicate, options, @@ -332,7 +332,7 @@ fn create_physical_plan_impl( cloud_options, metadata, } => Ok(Box::new(executors::ParquetExec::new( - paths, + sources, file_info, hive_parts, predicate, @@ -343,7 +343,7 @@ fn create_physical_plan_impl( ))), #[cfg(feature = "json")] FileScan::NDJson { options, .. } => Ok(Box::new(executors::JsonExec::new( - paths, + sources, options, file_options, file_info, diff --git a/crates/polars-mem-engine/src/utils.rs b/crates/polars-mem-engine/src/utils.rs index cb04d599a7f0..b104da3c4e78 100644 --- a/crates/polars-mem-engine/src/utils.rs +++ b/crates/polars-mem-engine/src/utils.rs @@ -13,8 +13,8 @@ pub(crate) fn agg_source_paths( ) { lp_arena.iter(root_lp).for_each(|(_, lp)| { use IR::*; - if let Scan { paths, .. } = lp { - for path in paths.as_ref() { + if let Scan { sources, .. 
} = lp { + for path in sources.as_paths() { acc_paths.insert(path.clone()); } } diff --git a/crates/polars-pipe/src/executors/sources/csv.rs b/crates/polars-pipe/src/executors/sources/csv.rs index 2c34228bada6..5ca5551c506d 100644 --- a/crates/polars-pipe/src/executors/sources/csv.rs +++ b/crates/polars-pipe/src/executors/sources/csv.rs @@ -1,10 +1,10 @@ use std::fs::File; -use std::path::PathBuf; use polars_core::{config, POOL}; use polars_io::csv::read::{BatchedCsvReader, CsvReadOptions, CsvReader}; use polars_io::path_utils::is_cloud_url; use polars_plan::global::_set_n_rows_for_scan; +use polars_plan::plans::ScanSource; use polars_plan::prelude::FileScanOptions; use polars_utils::itertools::Itertools; @@ -20,7 +20,7 @@ pub(crate) struct CsvSource { batched_reader: Option>, reader: Option>, n_threads: usize, - paths: Arc>, + sources: ScanSource, options: Option, file_options: FileScanOptions, verbose: bool, @@ -36,6 +36,7 @@ impl CsvSource { // otherwise all files would be opened during construction of the pipeline // leading to Too many Open files error fn init_next_reader(&mut self) -> PolarsResult<()> { + let paths = self.sources.as_paths(); let file_options = self.file_options.clone(); let n_rows = file_options.slice.map(|x| { @@ -43,12 +44,12 @@ impl CsvSource { x.1 }); - if self.current_path_idx == self.paths.len() + if self.current_path_idx == paths.len() || (n_rows.is_some() && n_rows.unwrap() <= self.n_rows_read) { return Ok(()); } - let path = &self.paths[self.current_path_idx]; + let path = &paths[self.current_path_idx]; let force_async = config::force_async(); let run_async = force_async || is_cloud_url(path); @@ -140,7 +141,7 @@ impl CsvSource { } pub(crate) fn new( - paths: Arc>, + sources: ScanSource, schema: SchemaRef, options: CsvReadOptions, file_options: FileScanOptions, @@ -151,7 +152,7 @@ impl CsvSource { reader: None, batched_reader: None, n_threads: POOL.current_num_threads(), - paths, + sources, options: Some(options), file_options, verbose, diff --git a/crates/polars-pipe/src/executors/sources/parquet.rs b/crates/polars-pipe/src/executors/sources/parquet.rs index cd0cb58f3574..ab5abbade817 100644 --- a/crates/polars-pipe/src/executors/sources/parquet.rs +++ b/crates/polars-pipe/src/executors/sources/parquet.rs @@ -20,7 +20,7 @@ use polars_io::prelude::materialize_projection; use polars_io::prelude::ParquetAsyncReader; use polars_io::utils::slice::split_slice_at_file; use polars_io::SerReader; -use polars_plan::plans::FileInfo; +use polars_plan::plans::{FileInfo, ScanSource}; use polars_plan::prelude::hive::HivePartitions; use polars_plan::prelude::FileScanOptions; use polars_utils::itertools::Itertools; @@ -36,7 +36,7 @@ pub struct ParquetSource { processed_paths: usize, processed_rows: AtomicUsize, iter: Range, - paths: Arc>, + sources: ScanSource, options: ParquetOptions, file_options: FileScanOptions, #[allow(dead_code)] @@ -77,7 +77,8 @@ impl ParquetSource { usize, Option>, )> { - let path = &self.paths[index]; + let paths = self.sources.as_paths(); + let path = &paths[index]; let options = self.options; let file_options = self.file_options.clone(); let schema = self.file_info.schema.clone(); @@ -245,7 +246,7 @@ impl ParquetSource { #[allow(unused_variables)] #[allow(clippy::too_many_arguments)] pub(crate) fn new( - paths: Arc>, + sources: ScanSource, options: ParquetOptions, cloud_options: Option, metadata: Option, @@ -255,6 +256,7 @@ impl ParquetSource { verbose: bool, predicate: Option>, ) -> PolarsResult { + let paths = sources.as_paths(); let n_threads 
= POOL.current_num_threads(); let iter = 0..paths.len(); @@ -273,7 +275,7 @@ impl ParquetSource { options, file_options, iter, - paths, + sources, cloud_options, metadata, file_info, diff --git a/crates/polars-pipe/src/pipeline/convert.rs b/crates/polars-pipe/src/pipeline/convert.rs index 1e6f93eac9df..368fc91b17ef 100644 --- a/crates/polars-pipe/src/pipeline/convert.rs +++ b/crates/polars-pipe/src/pipeline/convert.rs @@ -74,7 +74,7 @@ where Ok(Box::new(sources::DataFrameSource::from_df(df)) as Box) }, Scan { - paths, + sources, file_info, hive_parts, file_options, @@ -82,6 +82,8 @@ where output_schema, scan_type, } => { + let paths = sources.into_paths(); + // Add predicate to operators. // Except for parquet, as that format can use statistics to prune file/row-groups. #[cfg(feature = "parquet")] @@ -102,7 +104,7 @@ where #[cfg(feature = "csv")] FileScan::Csv { options, .. } => { let src = sources::CsvSource::new( - paths, + sources, file_info.schema, options, file_options, @@ -144,7 +146,7 @@ where }) .transpose()?; let src = sources::ParquetSource::new( - paths, + sources, parquet_options, cloud_options, metadata, diff --git a/crates/polars-plan/src/client/check.rs b/crates/polars-plan/src/client/check.rs index a01addd9231d..e28e1906c8ea 100644 --- a/crates/polars-plan/src/client/check.rs +++ b/crates/polars-plan/src/client/check.rs @@ -2,7 +2,7 @@ use polars_core::error::{polars_err, PolarsResult}; use polars_io::path_utils::is_cloud_url; use crate::plans::options::SinkType; -use crate::plans::{DslPlan, FileScan}; +use crate::plans::{DslPlan, FileScan, DslScanSource}; /// Assert that the given [`DslPlan`] is eligible to be executed on Polars Cloud. pub(super) fn assert_cloud_eligible(dsl: &DslPlan) -> PolarsResult<()> { @@ -10,15 +10,30 @@ pub(super) fn assert_cloud_eligible(dsl: &DslPlan) -> PolarsResult<()> { match plan_node { #[cfg(feature = "python")] DslPlan::PythonScan { .. } => return ineligible_error("contains Python scan"), - DslPlan::Scan { paths, .. } - if paths.lock().unwrap().0.iter().any(|p| !is_cloud_url(p)) => - { - return ineligible_error("contains scan of local file system") - }, DslPlan::Scan { - scan_type: FileScan::Anonymous { .. }, - .. - } => return ineligible_error("contains anonymous scan"), + sources, scan_type, .. + } => { + match sources { + DslScanSource::File(file) => { + if file + .lock() + .unwrap() + .paths + .iter() + .any(|p| !is_cloud_url(p)) + { + return ineligible_error("contains scan of local file system"); + } + }, + DslScanSource::Buffer(_) => { + return ineligible_error("contains scan of in-memory buffer"); + }, + } + + if matches!(scan_type, FileScan::Anonymous { .. }) { + return ineligible_error("contains anonymous scan"); + } + }, DslPlan::Sink { payload, .. } => { if !matches!(payload, SinkType::Cloud { .. 
}) { return ineligible_error("contains sink to non-cloud location"); diff --git a/crates/polars-plan/src/plans/builder_dsl.rs b/crates/polars-plan/src/plans/builder_dsl.rs index 893dbeb00e6e..1170f95ec7a2 100644 --- a/crates/polars-plan/src/plans/builder_dsl.rs +++ b/crates/polars-plan/src/plans/builder_dsl.rs @@ -1,6 +1,4 @@ -#[cfg(any(feature = "csv", feature = "ipc", feature = "parquet"))] -use std::path::PathBuf; -use std::sync::{Arc, Mutex, RwLock}; +use std::sync::{Arc, RwLock}; use polars_core::prelude::*; #[cfg(any(feature = "parquet", feature = "ipc", feature = "csv"))] @@ -60,7 +58,7 @@ impl DslBuilder { }; Ok(DslPlan::Scan { - paths: Arc::new(Mutex::new((Arc::new(vec![]), true))), + sources: DslScanSource::Buffer(Arc::default()), file_info: Arc::new(RwLock::new(Some(file_info))), hive_parts: None, predicate: None, @@ -79,7 +77,7 @@ impl DslBuilder { #[cfg(feature = "parquet")] #[allow(clippy::too_many_arguments)] pub fn scan_parquet( - paths: Arc>, + source: DslScanSource, n_rows: Option, cache: bool, parallel: polars_io::parquet::read::ParallelStrategy, @@ -92,8 +90,6 @@ impl DslBuilder { glob: bool, include_file_paths: Option, ) -> PolarsResult { - let paths = init_paths(paths); - let options = FileScanOptions { with_columns: None, cache, @@ -106,7 +102,8 @@ impl DslBuilder { include_file_paths, }; Ok(DslPlan::Scan { - paths, + // @FIX: sources -> source + sources: source, file_info: Arc::new(RwLock::new(None)), hive_parts: None, predicate: None, @@ -127,7 +124,7 @@ impl DslBuilder { #[cfg(feature = "ipc")] #[allow(clippy::too_many_arguments)] pub fn scan_ipc( - paths: Arc>, + source: DslScanSource, options: IpcScanOptions, n_rows: Option, cache: bool, @@ -137,10 +134,8 @@ impl DslBuilder { hive_options: HiveOptions, include_file_paths: Option, ) -> PolarsResult { - let paths = init_paths(paths); - Ok(DslPlan::Scan { - paths, + sources: source, file_info: Arc::new(RwLock::new(None)), hive_parts: None, file_options: FileScanOptions { @@ -167,15 +162,13 @@ impl DslBuilder { #[allow(clippy::too_many_arguments)] #[cfg(feature = "csv")] pub fn scan_csv( - paths: Arc>, + source: DslScanSource, read_options: CsvReadOptions, cache: bool, cloud_options: Option, glob: bool, include_file_paths: Option, ) -> PolarsResult { - let paths = init_paths(paths); - // This gets partially moved by FileScanOptions let read_options_clone = read_options.clone(); @@ -195,7 +188,7 @@ impl DslBuilder { include_file_paths, }; Ok(DslPlan::Scan { - paths, + sources: source, file_info: Arc::new(RwLock::new(None)), hive_parts: None, file_options: options, @@ -464,9 +457,3 @@ impl DslBuilder { .into() } } - -/// Initialize paths as non-expanded. 
-#[cfg(any(feature = "csv", feature = "ipc", feature = "parquet"))] -fn init_paths(paths: Arc>) -> Arc>, bool)>> { - Arc::new(Mutex::new((paths, false))) -} diff --git a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs index a902b2da1e5d..825c5896097b 100644 --- a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs +++ b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs @@ -105,14 +105,21 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult let v = match lp { DslPlan::Scan { - paths, + mut sources, file_info, hive_parts, predicate, mut file_options, mut scan_type, } => { - let paths = expand_scan_paths(paths, &mut scan_type, &mut file_options)?; + sources.expand_paths(&mut scan_type, &mut file_options)?; + + let source = match sources { + DslScanSource::File(paths) => { + ScanSource::Files(paths.as_ref().lock().unwrap().paths.clone()) + }, + DslScanSource::Buffer(buf) => ScanSource::Buffer(buf), + }; let file_info_read = file_info.read().unwrap(); @@ -139,7 +146,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult .. } => { let (file_info, md) = - scans::parquet_file_info(&paths, &file_options, cloud_options.as_ref()) + scans::parquet_file_info(&source, &file_options, cloud_options.as_ref()) .map_err(|e| e.context(failed_here!(parquet scan)))?; *metadata = md; file_info @@ -150,9 +157,12 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult metadata, .. } => { - let (file_info, md) = - scans::ipc_file_info(&paths, &file_options, cloud_options.as_ref()) - .map_err(|e| e.context(failed_here!(ipc scan)))?; + let (file_info, md) = scans::ipc_file_info( + source.as_paths(), + &file_options, + cloud_options.as_ref(), + ) + .map_err(|e| e.context(failed_here!(ipc scan)))?; *metadata = Some(md); file_info }, @@ -160,16 +170,19 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult FileScan::Csv { options, cloud_options, - } => { - scans::csv_file_info(&paths, &file_options, options, cloud_options.as_ref()) - .map_err(|e| e.context(failed_here!(csv scan)))? - }, + } => scans::csv_file_info( + source.as_paths(), + &file_options, + options, + cloud_options.as_ref(), + ) + .map_err(|e| e.context(failed_here!(csv scan)))?, #[cfg(feature = "json")] FileScan::NDJson { options, cloud_options, } => scans::ndjson_file_info( - &paths, + source.as_paths(), &file_options, options, cloud_options.as_ref(), @@ -189,7 +202,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult let mut owned = None; hive_partitions_from_paths( - paths.as_ref(), + source.as_paths().as_ref(), file_options.hive_options.hive_start_idx, file_options.hive_options.schema.clone(), match resolved_file_info.reader_schema.as_ref().unwrap() { @@ -263,7 +276,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult } IR::Scan { - paths, + sources: source, file_info: resolved_file_info, hive_parts, output_schema: None, @@ -803,47 +816,64 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult Ok(ctxt.lp_arena.add(v)) } -/// Expand scan paths if they were not already expanded. 
-#[allow(unused_variables)] -fn expand_scan_paths( - paths: Arc>, bool)>>, - scan_type: &mut FileScan, - file_options: &mut FileScanOptions, -) -> PolarsResult>> { - #[allow(unused_mut)] - let mut lock = paths.lock().unwrap(); - - // Return if paths are already expanded - if lock.1 { - return Ok(lock.0.clone()); - } +impl DslScanSource { + /// Expand scan paths if they were not already expanded. + pub fn expand_paths( + &mut self, + scan_type: &mut FileScan, + file_options: &mut FileScanOptions, + ) -> PolarsResult<()> { + match self { + DslScanSource::File(source) => { + #[allow(unused_mut)] + let mut lock = source.lock().unwrap(); + + // Return if paths are already expanded + if lock.is_expanded { + return Ok(()); + } - { - let paths_expanded = match &scan_type { - #[cfg(feature = "parquet")] - FileScan::Parquet { cloud_options, .. } => { - expand_scan_paths_with_hive_update(&lock.0, file_options, cloud_options)? - }, - #[cfg(feature = "ipc")] - FileScan::Ipc { cloud_options, .. } => { - expand_scan_paths_with_hive_update(&lock.0, file_options, cloud_options)? - }, - #[cfg(feature = "csv")] - FileScan::Csv { cloud_options, .. } => { - expand_paths(&lock.0, file_options.glob, cloud_options.as_ref())? - }, - #[cfg(feature = "json")] - FileScan::NDJson { cloud_options, .. } => { - expand_paths(&lock.0, file_options.glob, cloud_options.as_ref())? - }, - FileScan::Anonymous { .. } => unreachable!(), // Invariant: Anonymous scans are already expanded. - }; + { + let paths_expanded = match &scan_type { + #[cfg(feature = "parquet")] + FileScan::Parquet { cloud_options, .. } => { + expand_scan_paths_with_hive_update( + &lock.paths[..], + file_options, + cloud_options, + )? + }, + #[cfg(feature = "ipc")] + FileScan::Ipc { cloud_options, .. } => expand_scan_paths_with_hive_update( + &lock.paths[..], + file_options, + cloud_options, + )?, + #[cfg(feature = "csv")] + FileScan::Csv { cloud_options, .. } => expand_paths( + &lock.paths[..], + file_options.glob, + cloud_options.as_ref(), + )?, + #[cfg(feature = "json")] + FileScan::NDJson { cloud_options, .. } => expand_paths( + &lock.paths[..], + file_options.glob, + cloud_options.as_ref(), + )?, + FileScan::Anonymous { .. } => unreachable!(), // Invariant: Anonymous scans are already expanded. 
+ }; - #[allow(unreachable_code)] - { - *lock = (paths_expanded, true); + #[allow(unreachable_code)] + { + lock.paths = paths_expanded; + lock.is_expanded = true; - Ok(lock.0.clone()) + Ok(()) + } + } + }, + DslScanSource::Buffer(_) => Ok(()), } } } @@ -854,7 +884,7 @@ fn expand_scan_paths_with_hive_update( paths: &[PathBuf], file_options: &mut FileScanOptions, cloud_options: &Option, -) -> PolarsResult>> { +) -> PolarsResult> { let hive_enabled = file_options.hive_options.enabled; let (expanded_paths, hive_start_idx) = expand_paths_hive( paths, diff --git a/crates/polars-plan/src/plans/conversion/mod.rs b/crates/polars-plan/src/plans/conversion/mod.rs index 89167a124534..3e8f8748e618 100644 --- a/crates/polars-plan/src/plans/conversion/mod.rs +++ b/crates/polars-plan/src/plans/conversion/mod.rs @@ -12,7 +12,7 @@ mod ir_to_dsl; mod scans; mod stack_opt; -use std::sync::{Arc, Mutex, RwLock}; +use std::sync::{Arc, RwLock}; pub use dsl_to_ir::*; pub use expr_to_ir::*; @@ -50,7 +50,7 @@ impl IR { }; match lp { IR::Scan { - paths, + sources, file_info, hive_parts, predicate, @@ -58,7 +58,7 @@ impl IR { output_schema: _, file_options: options, } => DslPlan::Scan { - paths: Arc::new(Mutex::new((paths, true))), + sources: sources.into(), file_info: Arc::new(RwLock::new(Some(file_info))), hive_parts, predicate: predicate.map(|e| e.to_expr(expr_arena)), diff --git a/crates/polars-plan/src/plans/conversion/scans.rs b/crates/polars-plan/src/plans/conversion/scans.rs index 9b2636430622..82c953e2ffa2 100644 --- a/crates/polars-plan/src/plans/conversion/scans.rs +++ b/crates/polars-plan/src/plans/conversion/scans.rs @@ -1,4 +1,5 @@ use std::path::PathBuf; +use std::sync::{Arc, Mutex}; use either::Either; use polars_io::path_utils::is_cloud_url; @@ -16,6 +17,18 @@ fn get_first_path(paths: &[PathBuf]) -> PolarsResult<&PathBuf> { .ok_or_else(|| polars_err!(ComputeError: "expected at least 1 path")) } +impl From for DslScanSource { + fn from(value: ScanSource) -> Self { + match value { + ScanSource::Files(paths) => DslScanSource::File(Arc::new(Mutex::new(ScanFileSource { + paths, + is_expanded: true, + }))), + ScanSource::Buffer(buffer) => DslScanSource::Buffer(buffer), + } + } +} + #[cfg(any(feature = "parquet", feature = "ipc"))] fn prepare_output_schema(mut schema: Schema, row_index: Option<&RowIndex>) -> SchemaRef { if let Some(rc) = row_index { @@ -38,46 +51,64 @@ fn prepare_schemas(mut schema: Schema, row_index: Option<&RowIndex>) -> (SchemaR #[cfg(feature = "parquet")] pub(super) fn parquet_file_info( - paths: &[PathBuf], + source: &ScanSource, file_options: &FileScanOptions, #[allow(unused)] cloud_options: Option<&polars_io::cloud::CloudOptions>, ) -> PolarsResult<(FileInfo, Option)> { - let path = get_first_path(paths)?; - - let (schema, reader_schema, num_rows, metadata) = if is_cloud_url(path) { - #[cfg(not(feature = "cloud"))] - panic!("One or more of the cloud storage features ('aws', 'gcp', ...) 
must be enabled."); - - #[cfg(feature = "cloud")] - { - let uri = path.to_string_lossy(); - get_runtime().block_on(async { - let mut reader = ParquetAsyncReader::from_uri(&uri, cloud_options, None).await?; - let reader_schema = reader.schema().await?; - let num_rows = reader.num_rows().await?; - let metadata = reader.get_metadata().await?.clone(); - + let (schema, reader_schema, num_rows, metadata) = match source { + ScanSource::Files(paths) => { + let path = get_first_path(paths)?; + if is_cloud_url(path) { + #[cfg(not(feature = "cloud"))] + panic!("One or more of the cloud storage features ('aws', 'gcp', ...) must be enabled."); + + #[cfg(feature = "cloud")] + { + let uri = path.to_string_lossy(); + get_runtime().block_on(async { + let mut reader = + ParquetAsyncReader::from_uri(&uri, cloud_options, None).await?; + let reader_schema = reader.schema().await?; + let num_rows = reader.num_rows().await?; + let metadata = reader.get_metadata().await?.clone(); + + let schema = prepare_output_schema( + Schema::from_arrow_schema(reader_schema.as_ref()), + file_options.row_index.as_ref(), + ); + PolarsResult::Ok((schema, reader_schema, Some(num_rows), Some(metadata))) + })? + } + } else { + let file = polars_utils::open_file(path)?; + let mut reader = ParquetReader::new(file); + let reader_schema = reader.schema()?; let schema = prepare_output_schema( Schema::from_arrow_schema(reader_schema.as_ref()), file_options.row_index.as_ref(), ); - PolarsResult::Ok((schema, reader_schema, Some(num_rows), Some(metadata))) - })? - } - } else { - let file = polars_utils::open_file(path)?; - let mut reader = ParquetReader::new(file); - let reader_schema = reader.schema()?; - let schema = prepare_output_schema( - Schema::from_arrow_schema(reader_schema.as_ref()), - file_options.row_index.as_ref(), - ); - ( - schema, - reader_schema, - Some(reader.num_rows()?), - Some(reader.get_metadata()?.clone()), - ) + ( + schema, + reader_schema, + Some(reader.num_rows()?), + Some(reader.get_metadata()?.clone()), + ) + } + }, + ScanSource::Buffer(buffer) => { + let mut reader = ParquetReader::new(std::io::Cursor::new(buffer)); + let reader_schema = reader.schema()?; + let schema = prepare_output_schema( + Schema::from_arrow_schema(reader_schema.as_ref()), + file_options.row_index.as_ref(), + ); + ( + schema, + reader_schema, + Some(reader.num_rows()?), + Some(reader.get_metadata()?.clone()), + ) + }, }; let file_info = FileInfo::new( diff --git a/crates/polars-plan/src/plans/ir/dot.rs b/crates/polars-plan/src/plans/ir/dot.rs index 69e3a69733c5..c3b8f2e94874 100644 --- a/crates/polars-plan/src/plans/ir/dot.rs +++ b/crates/polars-plan/src/plans/ir/dot.rs @@ -247,7 +247,7 @@ impl<'a> IRDotDisplay<'a> { })?; }, Scan { - paths, + sources, file_info, hive_parts: _, predicate, @@ -255,6 +255,7 @@ impl<'a> IRDotDisplay<'a> { file_options: options, output_schema: _, } => { + let paths = sources.as_paths(); let name: &str = scan_type.into(); let path = PathsDisplay(paths.as_ref()); let with_columns = options.with_columns.as_ref().map(|cols| cols.as_ref()); diff --git a/crates/polars-plan/src/plans/ir/format.rs b/crates/polars-plan/src/plans/ir/format.rs index 60699be85095..6c1c37b78671 100644 --- a/crates/polars-plan/src/plans/ir/format.rs +++ b/crates/polars-plan/src/plans/ir/format.rs @@ -221,13 +221,14 @@ impl<'a> IRDisplay<'a> { self.with_root(*input)._format(f, sub_indent) }, Scan { - paths, + sources, file_info, predicate, scan_type, file_options, .. 
} => { + let paths = sources.as_paths(); let n_columns = file_options .with_columns .as_ref() diff --git a/crates/polars-plan/src/plans/ir/inputs.rs b/crates/polars-plan/src/plans/ir/inputs.rs index b00c91cddae4..2a7c14e300de 100644 --- a/crates/polars-plan/src/plans/ir/inputs.rs +++ b/crates/polars-plan/src/plans/ir/inputs.rs @@ -101,7 +101,7 @@ impl IR { options: *options, }, Scan { - paths, + sources, file_info, hive_parts, output_schema, @@ -114,7 +114,7 @@ impl IR { new_predicate = exprs.pop() } Scan { - paths: paths.clone(), + sources: sources.clone(), file_info: file_info.clone(), hive_parts: hive_parts.clone(), output_schema: output_schema.clone(), diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index 443726affad0..7062514f7689 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -6,12 +6,14 @@ pub(crate) mod tree_format; use std::borrow::Cow; use std::fmt; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; +use std::sync::Mutex; pub use dot::{EscapeLabel, IRDotDisplay, PathsDisplay}; pub use format::{ExprIRDisplay, IRDisplay}; use hive::HivePartitions; use polars_core::prelude::*; +use polars_core::POOL; use polars_utils::idx_vec::UnitVec; use polars_utils::unitvec; #[cfg(feature = "ir_serde")] @@ -33,6 +35,176 @@ pub struct IRPlanRef<'a> { pub expr_arena: &'a Arena, } +#[cfg_attr(feature = "ir_serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone, Hash)] +pub enum ScanSource { + Files(Arc<[PathBuf]>), + #[cfg_attr(feature = "ir_serde", serde(skip))] + Buffer(Arc<[u8]>), +} + +impl Default for ScanSource { + fn default() -> Self { + Self::Files(Arc::default()) + } +} + +pub struct ScanSourceSliceInfo { + pub item_slice: std::ops::Range, + pub source_slice: std::ops::Range, +} + +impl ScanSource { + pub fn as_paths(&self) -> &[PathBuf] { + match self { + ScanSource::Files(paths) => paths, + ScanSource::Buffer(_) => unimplemented!(), + } + } + + pub fn into_paths(&self) -> Arc<[PathBuf]> { + match self { + ScanSource::Files(paths) => paths.clone(), + ScanSource::Buffer(_) => unimplemented!(), + } + } + + pub fn to_dsl(self, is_expanded: bool) -> DslScanSource { + match self { + ScanSource::Files(paths) => { + DslScanSource::File(Arc::new(Mutex::new(ScanFileSource { paths, is_expanded }))) + }, + ScanSource::Buffer(buffer) => DslScanSource::Buffer(buffer), + } + } + + pub fn num_sources(&self) -> usize { + match self { + ScanSource::Files(paths) => paths.len(), + ScanSource::Buffer(_) => 1, + } + } + + pub fn is_cloud_url(&self) -> PolarsResult { + match self { + ScanSource::Files(paths) => { + Ok(polars_io::is_cloud_url(paths.first().ok_or_else( + || polars_err!(ComputeError: "expected at least 1 path"), + )?)) + }, + ScanSource::Buffer(_) => Ok(false), + } + } + + /// Normalize the slice and collect information as to what rows and parts of the source are + /// used in this slice. + pub fn collect_slice_information( + &self, + slice: (i64, usize), + path_to_num_rows: impl Fn(&Path) -> PolarsResult + Send + Sync, + buffer_to_num_rows: impl Fn(&[u8]) -> PolarsResult + Send + Sync, + ) -> PolarsResult { + fn slice_to_start_end( + offset: i64, + length: usize, + num_rows: usize, + ) -> std::ops::Range { + if offset < 0 { + let slice_start_as_n_from_end = -offset as usize; + let (start, len) = if slice_start_as_n_from_end > num_rows { + // We need to trim the slice, e.g. SLICE[offset: -100, len: 75] on a file of 50 + // rows should only give the first 25 rows. 
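+                // Working through that example: slice_start_as_n_from_end = 100 exceeds
+                // num_rows = 50, so start_position = 100 - 50 = 50 and the slice becomes
+                // (start, len) = (0, 75 - 50) = (0, 25), i.e. rows 0..25.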
+ let start_position = slice_start_as_n_from_end - num_rows; + (0, length.saturating_sub(start_position)) + } else { + (num_rows - slice_start_as_n_from_end, length) + }; + + let end = start.saturating_add(len); + + start..end + } else { + let offset = offset as usize; + offset.min(num_rows)..(offset + length).min(num_rows) + } + } + + let (offset, length) = slice; + + Ok(match self { + ScanSource::Files(paths) if paths.len() == 1 => { + let num_rows = path_to_num_rows(&paths[0])?; + ScanSourceSliceInfo { + item_slice: slice_to_start_end(offset, length, num_rows), + source_slice: 0..1, + } + }, + ScanSource::Files(paths) => { + use rayon::prelude::*; + + assert_ne!(paths.len(), 0); + + // Walk the files in reverse until we find the first file, and then translate the + // slice into a positive-offset equivalent. + const CHUNK_SIZE: usize = 8; + let mut row_counts = Vec::with_capacity(paths.len()); + + POOL.install(|| { + for idx_end in (0..paths.len()).step_by(CHUNK_SIZE) { + let idx_start = idx_end.saturating_sub(CHUNK_SIZE); + + row_counts.extend( + (idx_start..=idx_end) + .into_par_iter() + .map(|i| path_to_num_rows(&paths[i])) + .collect::>>()? + .into_iter() + .rev(), + ); + } + + PolarsResult::Ok(()) + })?; + + let num_rows = row_counts.iter().sum::(); + + let item_slice = slice_to_start_end(offset, length, num_rows); + + let mut source_start = paths.len() - 1; + let mut source_end = 0; + + let mut sum = 0; + for (i, row_count) in row_counts.iter().rev().enumerate() { + if sum < item_slice.end { + source_end = usize::max(source_end, i); + } + + sum += row_count; + + if sum >= item_slice.start { + source_start = usize::min(source_start, i); + } + } + + let source_slice = source_start..source_end + 1; + + ScanSourceSliceInfo { + item_slice, + source_slice, + } + }, + ScanSource::Buffer(buffer) => { + let num_rows = buffer_to_num_rows(buffer)?; + + ScanSourceSliceInfo { + item_slice: slice_to_start_end(offset, length, num_rows), + source_slice: 0..1, + } + }, + }) + } +} + /// [`IR`] is a representation of [`DslPlan`] with [`Node`]s which are allocated in an [`Arena`] /// In this IR the logical plan has access to the full dataset. #[derive(Clone, Debug, Default)] @@ -52,7 +224,7 @@ pub enum IR { predicate: ExprIR, }, Scan { - paths: Arc>, + sources: ScanSource, file_info: FileInfo, hive_parts: Option>>, predicate: Option, diff --git a/crates/polars-plan/src/plans/mod.rs b/crates/polars-plan/src/plans/mod.rs index cee1a3bb1045..9e2b4d56d6a4 100644 --- a/crates/polars-plan/src/plans/mod.rs +++ b/crates/polars-plan/src/plans/mod.rs @@ -59,6 +59,21 @@ pub enum Context { Default, } +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Clone)] +pub struct ScanFileSource { + pub paths: Arc<[PathBuf]>, + pub is_expanded: bool, +} + +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Clone)] +pub enum DslScanSource { + File(Arc>), + // @Q? Can we serde skip this? + Buffer(Arc<[u8]>), +} + // https://stackoverflow.com/questions/1031076/what-are-projection-and-selection #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum DslPlan { @@ -76,7 +91,7 @@ pub enum DslPlan { cache_hits: u32, }, Scan { - paths: Arc>, bool)>>, + sources: DslScanSource, // Option as this is mostly materialized on the IR phase. 
// During conversion we update the value in the DSL as well // This is to cater to use cases where parts of a `LazyFrame` @@ -193,7 +208,7 @@ impl Clone for DslPlan { Self::PythonScan { options } => Self::PythonScan { options: options.clone() }, Self::Filter { input, predicate } => Self::Filter { input: input.clone(), predicate: predicate.clone() }, Self::Cache { input, id, cache_hits } => Self::Cache { input: input.clone(), id: id.clone(), cache_hits: cache_hits.clone() }, - Self::Scan { paths, file_info, hive_parts, predicate, file_options, scan_type } => Self::Scan { paths: paths.clone(), file_info: file_info.clone(), hive_parts: hive_parts.clone(), predicate: predicate.clone(), file_options: file_options.clone(), scan_type: scan_type.clone() }, + Self::Scan { sources, file_info, hive_parts, predicate, file_options, scan_type } => Self::Scan { sources: sources.clone(), file_info: file_info.clone(), hive_parts: hive_parts.clone(), predicate: predicate.clone(), file_options: file_options.clone(), scan_type: scan_type.clone() }, Self::DataFrameScan { df, schema, output_schema, filter: selection } => Self::DataFrameScan { df: df.clone(), schema: schema.clone(), output_schema: output_schema.clone(), filter: selection.clone() }, Self::Select { expr, input, options } => Self::Select { expr: expr.clone(), input: input.clone(), options: options.clone() }, Self::GroupBy { input, keys, aggs, apply, maintain_order, options } => Self::GroupBy { input: input.clone(), keys: keys.clone(), aggs: aggs.clone(), apply: apply.clone(), maintain_order: maintain_order.clone(), options: options.clone() }, diff --git a/crates/polars-plan/src/plans/optimizer/count_star.rs b/crates/polars-plan/src/plans/optimizer/count_star.rs index 32a95cc3ede3..8565e066dcb4 100644 --- a/crates/polars-plan/src/plans/optimizer/count_star.rs +++ b/crates/polars-plan/src/plans/optimizer/count_star.rs @@ -95,9 +95,10 @@ fn visit_logical_plan_for_scan_paths( }) }, IR::Scan { - scan_type, paths, .. + scan_type, sources, .. } if !matches!(scan_type, FileScan::Anonymous { .. 
}) => Some(CountStarExpr { - paths: paths.clone(), + // @FIX: Count Star Should probably just have a Arc Slice + paths: Arc::new(sources.as_paths().as_ref().to_vec()), scan_type: scan_type.clone(), node, alias: None, diff --git a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs index 1def3d375958..3b9e6c8d8ef9 100644 --- a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs +++ b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs @@ -325,7 +325,7 @@ impl<'a> PredicatePushDown<'a> { Ok(lp) }, Scan { - mut paths, + mut sources, file_info, hive_parts: mut scan_hive_parts, ref predicate, @@ -366,6 +366,7 @@ impl<'a> PredicatePushDown<'a> { if let (Some(hive_parts), Some(predicate)) = (&scan_hive_parts, &predicate) { if let Some(io_expr) = self.expr_eval.unwrap()(predicate, expr_arena) { if let Some(stats_evaluator) = io_expr.as_stats_evaluator() { + let paths = sources.as_paths(); let mut new_paths = Vec::with_capacity(paths.len()); let mut new_hive_parts = Vec::with_capacity(paths.len()); @@ -400,7 +401,7 @@ impl<'a> PredicatePushDown<'a> { filter: None, }); } else { - paths = Arc::from(new_paths); + sources = ScanSource::Files(new_paths.into()); scan_hive_parts = Some(Arc::from(new_hive_parts)); } } @@ -422,7 +423,7 @@ impl<'a> PredicatePushDown<'a> { let lp = if do_optimization { Scan { - paths, + sources, file_info, hive_parts, predicate, @@ -432,7 +433,7 @@ impl<'a> PredicatePushDown<'a> { } } else { let lp = Scan { - paths, + sources, file_info, hive_parts, predicate: None, diff --git a/crates/polars-plan/src/plans/optimizer/projection_pushdown/mod.rs b/crates/polars-plan/src/plans/optimizer/projection_pushdown/mod.rs index e5e2fb94ccde..20e0d0d28633 100644 --- a/crates/polars-plan/src/plans/optimizer/projection_pushdown/mod.rs +++ b/crates/polars-plan/src/plans/optimizer/projection_pushdown/mod.rs @@ -398,7 +398,7 @@ impl ProjectionPushDown { Ok(PythonScan { options }) }, Scan { - paths, + sources, mut file_info, mut hive_parts, scan_type, @@ -510,7 +510,7 @@ impl ProjectionPushDown { } }; let lp = Scan { - paths, + sources, file_info, hive_parts, output_schema, diff --git a/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs b/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs index f62bd9ee197d..b656795f53d2 100644 --- a/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs +++ b/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs @@ -165,7 +165,7 @@ impl SlicePushDown { } #[cfg(feature = "csv")] (Scan { - paths, + sources, file_info, hive_parts, output_schema, @@ -176,7 +176,7 @@ impl SlicePushDown { file_options.slice = Some((0, state.offset as usize + state.len as usize)); let lp = Scan { - paths, + sources, file_info, hive_parts, output_schema, @@ -189,7 +189,7 @@ impl SlicePushDown { }, #[cfg(feature = "parquet")] (Scan { - paths, + sources, file_info, hive_parts, output_schema, @@ -200,7 +200,7 @@ impl SlicePushDown { file_options.slice = Some((state.offset, state.len as usize)); let lp = Scan { - paths, + sources, file_info, hive_parts, output_schema, @@ -213,7 +213,7 @@ impl SlicePushDown { }, // TODO! we currently skip slice pushdown if there is a predicate. 
(Scan { - paths, + sources, file_info, hive_parts, output_schema, @@ -224,7 +224,7 @@ impl SlicePushDown { options.slice = Some((0, state.len as usize)); let lp = Scan { - paths, + sources, file_info, hive_parts, output_schema, diff --git a/crates/polars-plan/src/plans/visitor/hash.rs b/crates/polars-plan/src/plans/visitor/hash.rs index 80c251108297..7087122802ea 100644 --- a/crates/polars-plan/src/plans/visitor/hash.rs +++ b/crates/polars-plan/src/plans/visitor/hash.rs @@ -74,7 +74,7 @@ impl Hash for HashableEqLP<'_> { predicate.traverse_and_hash(self.expr_arena, state); }, IR::Scan { - paths, + sources, file_info: _, hive_parts: _, predicate, @@ -84,7 +84,7 @@ impl Hash for HashableEqLP<'_> { } => { // We don't have to traverse the schema, hive partitions etc. as they are derivative from the paths. scan_type.hash(state); - paths.hash(state); + sources.hash(state); hash_option_expr(predicate, self.expr_arena, state); file_options.hash(state); }, @@ -254,7 +254,7 @@ impl HashableEqLP<'_> { ) => expr_ir_eq(l, r, self.expr_arena), ( IR::Scan { - paths: pl, + sources: pl, file_info: _, hive_parts: _, predicate: pred_l, @@ -263,7 +263,7 @@ impl HashableEqLP<'_> { file_options: ol, }, IR::Scan { - paths: pr, + sources: pr, file_info: _, hive_parts: _, predicate: pred_r, @@ -272,7 +272,7 @@ impl HashableEqLP<'_> { file_options: or, }, ) => { - pl == pr + pl.as_paths() == pr.as_paths() && stl == str && ol == or && opt_expr_ir_eq(pred_l, pred_r, self.expr_arena) diff --git a/crates/polars-python/src/file.rs b/crates/polars-python/src/file.rs index bcdfd7ff6ee7..9443fd0b5213 100644 --- a/crates/polars-python/src/file.rs +++ b/crates/polars-python/src/file.rs @@ -7,6 +7,7 @@ use std::io::{Cursor, ErrorKind, Read, Seek, SeekFrom, Write}; #[cfg(target_family = "unix")] use std::os::fd::{FromRawFd, RawFd}; use std::path::PathBuf; +use std::sync::Arc; use polars::io::mmap::MmapBytesReader; use polars_error::{polars_err, polars_warn}; @@ -31,6 +32,10 @@ impl PyFileLikeObject { PyFileLikeObject { inner: object } } + pub fn as_arc(&self) -> Arc<[u8]> { + self.as_file_buffer().into_inner().into() + } + pub fn as_buffer(&self) -> std::io::Cursor> { let data = self.as_file_buffer().into_inner(); std::io::Cursor::new(data) @@ -191,6 +196,65 @@ impl EitherRustPythonFile { } } +pub enum EitherPythonFileOrPath { + Py(PyFileLikeObject), + Path(PathBuf), +} + +pub fn get_either_file_or_path( + py_f: PyObject, + write: bool, +) -> PyResult { + Python::with_gil(|py| { + let py_f = py_f.into_bound(py); + if let Ok(s) = py_f.extract::>() { + let file_path = std::path::Path::new(&*s); + let file_path = resolve_homedir(file_path); + Ok(EitherPythonFileOrPath::Path(file_path)) + } else { + let io = py.import_bound("io").unwrap(); + let is_utf8_encoding = |py_f: &Bound| -> PyResult { + let encoding = py_f.getattr("encoding")?; + let encoding = encoding.extract::>()?; + Ok(encoding.eq_ignore_ascii_case("utf-8") || encoding.eq_ignore_ascii_case("utf8")) + }; + + // BytesIO is relatively fast, and some code relies on it. + if !py_f.is_exact_instance(&io.getattr("BytesIO").unwrap()) { + polars_warn!("Polars found a filename. \ + Ensure you pass a path to the file instead of a python file object when possible for best \ + performance."); + } + // Unwrap TextIOWrapper + // Allow subclasses to allow things like pytest.capture.CaptureIO + let py_f = if py_f + .is_instance(&io.getattr("TextIOWrapper").unwrap()) + .unwrap_or_default() + { + if !is_utf8_encoding(&py_f)? 
{ + return Err(PyPolarsErr::from( + polars_err!(InvalidOperation: "file encoding is not UTF-8"), + ) + .into()); + } + // XXX: we have to clear buffer here. + // Is there a better solution? + if write { + py_f.call_method0("flush")?; + } else { + py_f.call_method1("seek", (0, 1))?; + } + py_f.getattr("buffer")? + } else { + py_f + }; + PyFileLikeObject::ensure_requirements(&py_f, !write, write, !write)?; + let f = PyFileLikeObject::new(py_f.to_object(py)); + Ok(EitherPythonFileOrPath::Py(f)) + } + }) +} + fn get_either_file_and_path( py_f: PyObject, write: bool, diff --git a/crates/polars-python/src/lazyframe/general.rs b/crates/polars-python/src/lazyframe/general.rs index 4cfd25258b61..d5f64c9f35ac 100644 --- a/crates/polars-python/src/lazyframe/general.rs +++ b/crates/polars-python/src/lazyframe/general.rs @@ -254,7 +254,7 @@ impl PyLazyFrame { low_memory, cloud_options, use_statistics, hive_partitioning, hive_schema, try_parse_hive_dates, retries, glob, include_file_paths) )] fn new_from_parquet( - path: Option, + path: Option, paths: Vec, n_rows: Option, cache: bool, @@ -271,15 +271,54 @@ impl PyLazyFrame { glob: bool, include_file_paths: Option, ) -> PyResult { + use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; + let parallel = parallel.0; let hive_schema = hive_schema.map(|s| Arc::new(s.0)); - let first_path = if let Some(path) = &path { - path - } else { - paths + use polars_plan::plans::ScanSource; + use EitherPythonFileOrPath as EF; + let use_first_path = path.is_some(); + let first_path = match path + .map(|py_f| get_either_file_or_path(py_f, false)) + .transpose()? + { + Some(EF::Path(path)) => path, + Some(EF::Py(f)) => { + let scan_source = ScanSource::Buffer(f.as_arc()); + + let row_index = row_index.map(|(name, offset)| RowIndex { + name: name.into(), + offset, + }); + + let args = ScanArgsParquet { + n_rows, + cache, + parallel, + rechunk, + row_index, + low_memory, + cloud_options: None, + use_statistics, + hive_options: HiveOptions { + enabled: hive_partitioning, + hive_start_idx: 0, + schema: hive_schema, + try_parse_dates: try_parse_hive_dates, + }, + glob, + include_file_paths: include_file_paths.map(|x| x.into()), + }; + + let lf = LazyFrame::scan_parquet_sourced(scan_source, args) + .map_err(PyPolarsErr::from)?; + return Ok(lf.into()); + }, + None => paths .first() - .ok_or_else(|| PyValueError::new_err("expected a path argument"))? 
+ .cloned() + .ok_or_else(|| PyValueError::new_err("expected a path argument"))?, }; #[cfg(feature = "cloud")] @@ -322,7 +361,7 @@ impl PyLazyFrame { include_file_paths: include_file_paths.map(|x| x.into()), }; - let lf = if path.is_some() { + let lf = if use_first_path { LazyFrame::scan_parquet(first_path, args) } else { LazyFrame::scan_parquet_files(Arc::from(paths), args) diff --git a/crates/polars-python/src/lazyframe/visitor/nodes.rs b/crates/polars-python/src/lazyframe/visitor/nodes.rs index 02960a1aad23..973a4ce432ef 100644 --- a/crates/polars-python/src/lazyframe/visitor/nodes.rs +++ b/crates/polars-python/src/lazyframe/visitor/nodes.rs @@ -317,7 +317,7 @@ pub(crate) fn into_py(py: Python<'_>, plan: &IR) -> PyResult { )) }, IR::Scan { - paths, + sources, file_info: _, hive_parts: _, predicate, @@ -325,7 +325,7 @@ pub(crate) fn into_py(py: Python<'_>, plan: &IR) -> PyResult { scan_type, file_options, } => Scan { - paths: paths.to_object(py), + paths: sources.into_paths().to_object(py), // TODO: file info file_info: py.None(), predicate: predicate.as_ref().map(|e| e.into()), diff --git a/crates/polars-stream/src/nodes/parquet_source.rs b/crates/polars-stream/src/nodes/parquet_source.rs index f167f12b6fdc..bf5d4262fed6 100644 --- a/crates/polars-stream/src/nodes/parquet_source.rs +++ b/crates/polars-stream/src/nodes/parquet_source.rs @@ -46,7 +46,7 @@ type AsyncTaskData = Option<( #[allow(clippy::type_complexity)] pub struct ParquetSourceNode { - paths: Arc>, + paths: Arc<[PathBuf]>, file_info: FileInfo, hive_parts: Option>>, predicate: Option>, @@ -71,7 +71,7 @@ pub struct ParquetSourceNode { #[allow(clippy::too_many_arguments)] impl ParquetSourceNode { pub fn new( - paths: Arc>, + paths: Arc<[PathBuf]>, file_info: FileInfo, hive_parts: Option>>, predicate: Option>, @@ -1355,7 +1355,7 @@ struct SharedFileState { /// Turns row group data into DataFrames. struct RowGroupDecoder { - paths: Arc>, + paths: Arc<[PathBuf]>, hive_partitions: Option>>, hive_partitions_width: usize, include_file_paths: Option, diff --git a/crates/polars-stream/src/physical_plan/lower_ir.rs b/crates/polars-stream/src/physical_plan/lower_ir.rs index b9693e6c3c56..d50d90afe52a 100644 --- a/crates/polars-stream/src/physical_plan/lower_ir.rs +++ b/crates/polars-stream/src/physical_plan/lower_ir.rs @@ -331,7 +331,7 @@ pub fn lower_ir( v @ IR::Scan { .. 
} => { let IR::Scan { - paths, + sources, file_info, hive_parts, output_schema, @@ -343,6 +343,8 @@ pub fn lower_ir( unreachable!(); }; + let paths = sources.into_paths(); + PhysNodeKind::FileScan { paths, file_info, diff --git a/crates/polars-stream/src/physical_plan/mod.rs b/crates/polars-stream/src/physical_plan/mod.rs index 99103343565a..d22a5f968900 100644 --- a/crates/polars-stream/src/physical_plan/mod.rs +++ b/crates/polars-stream/src/physical_plan/mod.rs @@ -119,7 +119,7 @@ pub enum PhysNodeKind { }, FileScan { - paths: Arc>, + paths: Arc<[PathBuf]>, file_info: FileInfo, hive_parts: Option>>, predicate: Option, diff --git a/crates/polars-stream/src/utils/late_materialized_df.rs b/crates/polars-stream/src/utils/late_materialized_df.rs index 2173598d5369..87fe97135aad 100644 --- a/crates/polars-stream/src/utils/late_materialized_df.rs +++ b/crates/polars-stream/src/utils/late_materialized_df.rs @@ -4,7 +4,7 @@ use parking_lot::Mutex; use polars_core::frame::DataFrame; use polars_core::schema::Schema; use polars_error::PolarsResult; -use polars_plan::plans::{AnonymousScan, AnonymousScanArgs, FileInfo, FileScan, IR}; +use polars_plan::plans::{AnonymousScan, AnonymousScanArgs, FileInfo, FileScan, ScanSource, IR}; use polars_plan::prelude::{AnonymousScanOptions, FileScanOptions}; /// Used to insert a dataframe into in-memory-engine query plan after the query @@ -25,7 +25,7 @@ impl LateMaterializedDataFrame { fmt_str: "LateMaterializedDataFrame", }); IR::Scan { - paths: Arc::new(vec![]), + sources: ScanSource::Files(Arc::default()), file_info: FileInfo::new(schema, None, (None, usize::MAX)), hive_parts: None, predicate: None, diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 90b6137c4924..0fc52142e5de 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -295,7 +295,7 @@ def read_parquet_schema(source: str | Path | IO[bytes] | bytes) -> dict[str, Dat @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_parquet( - source: str | Path | list[str] | list[Path], + source: str | Path | list[str] | list[Path] | io.BytesIO, *, n_rows: int | None = None, row_index_name: str | None = None, @@ -422,6 +422,8 @@ def scan_parquet( if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) + elif isinstance(source, io.BytesIO): + pass else: source = [ normalize_filepath(source, check_not_directory=False) for source in source @@ -448,7 +450,7 @@ def scan_parquet( def _scan_parquet_impl( - source: str | list[str] | list[Path], + source: str | list[str] | list[Path] | io.BytesIO, *, n_rows: int | None = None, cache: bool = True, diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index 9ec82b991f39..b46f21f3893e 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -34,12 +34,12 @@ def test_round_trip(df: pl.DataFrame) -> None: assert_frame_equal(pl.read_parquet(f), df) -def test_scan_round_trip(tmp_path: Path, df: pl.DataFrame) -> None: - tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - +def test_scan_round_trip(df: pl.DataFrame) -> None: + f = io.BytesIO() df.write_parquet(f) + f.seek(0) assert_frame_equal(pl.scan_parquet(f).collect(), df) + f.seek(0) assert_frame_equal(pl.scan_parquet(f).head().collect(), df.head()) @@ -919,8 +919,7 
@@ def test_parquet_array_dtype_nulls() -> None: ), ], ) -@pytest.mark.write_disk -def test_complex_types(tmp_path: Path, series: list[Any], dtype: pl.DataType) -> None: +def test_complex_types(series: list[Any], dtype: pl.DataType) -> None: xs = pl.Series(series, dtype=dtype) df = pl.DataFrame({"x": xs}) @@ -981,20 +980,18 @@ def test_read_parquet_only_loads_selected_columns_15098( @pytest.mark.release -@pytest.mark.write_disk -def test_max_statistic_parquet_writer(tmp_path: Path) -> None: +def test_max_statistic_parquet_writer() -> None: # this hits the maximal page size # so the row group will be split into multiple pages # the page statistics need to be correctly reduced # for this query to make sense n = 150_000 - tmp_path.mkdir(exist_ok=True) - # int64 is important to hit the page size df = pl.int_range(0, n, eager=True, dtype=pl.Int64).alias("int").to_frame() - f = tmp_path / "tmp.parquet" + f = io.BytesIO() df.write_parquet(f, statistics=True, use_pyarrow=False, row_group_size=n) + f.seek(0) result = pl.scan_parquet(f).filter(pl.col("int") > n - 3).collect() expected = pl.DataFrame({"int": [149998, 149999]}) assert_frame_equal(result, expected) @@ -1088,14 +1085,11 @@ def test_hybrid_rle() -> None: ) ) @pytest.mark.slow -@pytest.mark.write_disk -@settings(suppress_health_check=[HealthCheck.function_scoped_fixture]) -def test_roundtrip_parametric(df: pl.DataFrame, tmp_path: Path) -> None: - # delete if exists - path = tmp_path / "data.parquet" - - df.write_parquet(path) - result = pl.read_parquet(path) +def test_roundtrip_parametric(df: pl.DataFrame) -> None: + f = io.BytesIO() + df.write_parquet(f) + f.seek(0) + result = pl.read_parquet(f) assert_frame_equal(df, result) @@ -1207,18 +1201,14 @@ def test_read_byte_stream_split_arrays( assert_frame_equal(read, df) -@pytest.mark.write_disk -def test_parquet_nested_null_array_17795(tmp_path: Path) -> None: - filename = tmp_path / "nested_null.parquet" - - pl.DataFrame([{"struct": {"field": None}}]).write_parquet(filename) - pq.read_table(filename) - +def test_parquet_nested_null_array_17795() -> None: + f = io.BytesIO() + pl.DataFrame([{"struct": {"field": None}}]).write_parquet(f) + f.seek(0) + pq.read_table(f) -@pytest.mark.write_disk -def test_parquet_record_batches_pyarrow_fixed_size_list_16614(tmp_path: Path) -> None: - filename = tmp_path / "a.parquet" +def test_parquet_record_batches_pyarrow_fixed_size_list_16614() -> None: # @NOTE: # The minimum that I could get it to crash which was ~132000, but let's # just do 150000 to be sure. 
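The test updates in this file all lean on the same in-memory round trip that this patch enables: write the frame to a BytesIO, rewind it, and scan it directly. A minimal sketch of that usage (assuming a build that includes this change; the example frame is arbitrary):

    import io

    import polars as pl
    from polars.testing import assert_frame_equal

    df = pl.DataFrame({"a": [1, 2, 3]})

    f = io.BytesIO()
    df.write_parquet(f)
    f.seek(0)  # write_parquet leaves the cursor at the end; rewind before scanning

    assert_frame_equal(pl.scan_parquet(f).collect(), df)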
@@ -1228,27 +1218,28 @@ def test_parquet_record_batches_pyarrow_fixed_size_list_16614(tmp_path: Path) -> schema={"x": pl.Array(pl.Float32, 2)}, ) - x.write_parquet(filename) - b = pl.read_parquet(filename, use_pyarrow=True) + f = io.BytesIO() + x.write_parquet(f) + f.seek(0) + b = pl.read_parquet(f, use_pyarrow=True) assert b["x"].shape[0] == n assert_frame_equal(b, x) -@pytest.mark.write_disk -def test_parquet_list_element_field_name(tmp_path: Path) -> None: - filename = tmp_path / "list.parquet" - +def test_parquet_list_element_field_name() -> None: + f = io.BytesIO() ( pl.DataFrame( { "a": [[1, 2], [1, 1, 1]], }, schema={"a": pl.List(pl.Int64)}, - ).write_parquet(filename, use_pyarrow=False) + ).write_parquet(f, use_pyarrow=False) ) - schema_str = str(pq.read_schema(filename)) + f.seek(0) + schema_str = str(pq.read_schema(f)) assert "" in schema_str assert "child 0, element: int64" in schema_str @@ -1368,8 +1359,7 @@ def test_parquet_high_nested_null_17805( ) -@pytest.mark.write_disk -def test_struct_plain_encoded_statistics(tmp_path: Path) -> None: +def test_struct_plain_encoded_statistics() -> None: df = pl.DataFrame( { "a": [None, None, None, None, {"x": None, "y": 0}], @@ -1377,17 +1367,12 @@ def test_struct_plain_encoded_statistics(tmp_path: Path) -> None: schema={"a": pl.Struct({"x": pl.Int8, "y": pl.Int8})}, ) - test_scan_round_trip(tmp_path, df) + test_scan_round_trip(df) @given(df=dataframes(min_size=5, excluded_dtypes=[pl.Decimal, pl.Categorical])) -@settings( - max_examples=100, - deadline=None, - suppress_health_check=[HealthCheck.function_scoped_fixture], -) -def test_scan_round_trip_parametric(tmp_path: Path, df: pl.DataFrame) -> None: - test_scan_round_trip(tmp_path, df) +def test_scan_round_trip_parametric(df: pl.DataFrame) -> None: + test_scan_round_trip(df) def test_empty_rg_no_dict_page_18146() -> None: @@ -1532,13 +1517,7 @@ def test_delta_strings_encoding_roundtrip( r2=st.integers(min_value=0, max_value=1000), ) @pytest.mark.parametrize("parallel_st", ["auto", "prefiltered"]) -@settings( - deadline=None, - suppress_health_check=[HealthCheck.function_scoped_fixture], -) -@pytest.mark.write_disk def test_predicate_filtering( - tmp_path: Path, df: pl.DataFrame, first_op: str, second_op: None | tuple[str, str], @@ -1548,9 +1527,7 @@ def test_predicate_filtering( r2: int, parallel_st: Literal["auto", "prefiltered"], ) -> None: - tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - + f = io.BytesIO() df.write_parquet(f, row_group_size=5) cols = df.columns @@ -1566,6 +1543,7 @@ def test_predicate_filtering( (getattr(pl.col(r1s), second_op[1]))(pl.col(r2s)) ) + f.seek(0) result = pl.scan_parquet(f, parallel=parallel_st).filter(expr).collect() assert_frame_equal(result, df.filter(expr)) @@ -1581,33 +1559,26 @@ def test_predicate_filtering( offset=st.integers(0, 100), length=st.integers(0, 100), ) -@settings( - suppress_health_check=[HealthCheck.function_scoped_fixture], -) -@pytest.mark.write_disk def test_slice_roundtrip( - df: pl.DataFrame, offset: int, length: int, tmp_path: Path + df: pl.DataFrame, offset: int, length: int ) -> None: - tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - offset %= df.height + 1 length %= df.height - offset + 1 + f = io.BytesIO() df.write_parquet(f) + f.seek(0) scanned = pl.scan_parquet(f).slice(offset, length).collect() assert_frame_equal(scanned, df.slice(offset, length)) -@pytest.mark.write_disk -def test_struct_prefiltered(tmp_path: Path) -> None: - tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - 
+def test_struct_prefiltered() -> None: df = pl.DataFrame({"a": {"x": 1, "y": 2}}) + f = io.BytesIO() df.write_parquet(f) + f.seek(0) ( pl.scan_parquet(f, parallel="prefiltered") .filter(pl.col("a").struct.field("x") == 1) @@ -1641,19 +1612,17 @@ def test_struct_prefiltered(tmp_path: Path) -> None: ], ) @pytest.mark.parametrize("nullable", [False, True]) -@pytest.mark.write_disk def test_nested_skip_18303( data: tuple[list[dict[str, str] | list[str]], pa.DataType], nullable: bool, - tmp_path: Path, ) -> None: - tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - schema = pa.schema([pa.field("a", data[1], nullable=nullable)]) tb = pa.table({"a": data[0]}, schema=schema) + + f = io.BytesIO() pq.write_table(tb, f) + f.seek(0) scanned = pl.scan_parquet(f).slice(1, 1).collect() assert_frame_equal(scanned, pl.DataFrame(tb).slice(1, 1)) @@ -1697,20 +1666,12 @@ def test_nested_span_multiple_pages_18400() -> None: include_cols=[column("filter_col", pl.Boolean, allow_null=False)], ), ) -@pytest.mark.write_disk -@settings( - suppress_health_check=[HealthCheck.function_scoped_fixture], -) -def test_parametric_small_page_mask_filtering( - tmp_path: Path, - df: pl.DataFrame, -) -> None: - tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - +def test_parametric_small_page_mask_filtering(df: pl.DataFrame) -> None: + f = io.BytesIO() df.write_parquet(f, data_page_size=1024) expr = pl.col("filter_col") + f.seek(0) result = pl.scan_parquet(f, parallel="prefiltered").filter(expr).collect() assert_frame_equal(result, df.filter(expr)) @@ -1756,23 +1717,13 @@ def test_different_page_validity_across_pages(value: str | int | float | bool) - ], ), ) -@settings( - deadline=None, - suppress_health_check=[HealthCheck.function_scoped_fixture], -) -@pytest.mark.write_disk -def test_delta_length_byte_array_prefiltering( - tmp_path: Path, - df: pl.DataFrame, -) -> None: - tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - +def test_delta_length_byte_array_prefiltering(df: pl.DataFrame) -> None: cols = df.columns encodings = {col: "DELTA_LENGTH_BYTE_ARRAY" for col in cols} encodings["filter_col"] = "PLAIN" + f = io.BytesIO() pq.write_table( df.to_arrow(), f, @@ -1780,6 +1731,7 @@ def test_delta_length_byte_array_prefiltering( column_encoding=encodings, ) + f.seek(0) expr = pl.col("filter_col") == 0 result = pl.scan_parquet(f, parallel="prefiltered").filter(expr).collect() assert_frame_equal(result, df.filter(expr)) @@ -1797,22 +1749,13 @@ def test_delta_length_byte_array_prefiltering( ], ), ) -@settings( - deadline=None, - suppress_health_check=[HealthCheck.function_scoped_fixture], -) -@pytest.mark.write_disk -def test_general_prefiltering( - tmp_path: Path, - df: pl.DataFrame, -) -> None: - tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - +def test_general_prefiltering(df: pl.DataFrame) -> None: + f = io.BytesIO() df.write_parquet(f) expr = pl.col("filter_col") == 0 + f.seek(0) result = pl.scan_parquet(f, parallel="prefiltered").filter(expr).collect() assert_frame_equal(result, df.filter(expr)) @@ -1827,22 +1770,13 @@ def test_general_prefiltering( include_cols=[column("filter_col", pl.Boolean, allow_null=False)], ), ) -@settings( - deadline=None, - suppress_health_check=[HealthCheck.function_scoped_fixture], -) -@pytest.mark.write_disk -def test_row_index_prefiltering( - tmp_path: Path, - df: pl.DataFrame, -) -> None: - tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - +def test_row_index_prefiltering(df: pl.DataFrame) -> None: + f = io.BytesIO() 
df.write_parquet(f) expr = pl.col("filter_col") + f.seek(0) result = ( pl.scan_parquet( f, row_index_name="ri", row_index_offset=42, parallel="prefiltered" From e14b78ce05c811ff6b68964c3a591e0cae904400 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Tue, 3 Sep 2024 16:18:10 +0200 Subject: [PATCH 02/27] refactor: Add ScanSource to CountStar --- crates/polars-io/src/csv/read/mod.rs | 2 +- crates/polars-io/src/csv/read/parser.rs | 36 ++- .../polars-plan/src/plans/functions/count.rs | 215 +++++++++++------- crates/polars-plan/src/plans/functions/mod.rs | 15 +- crates/polars-plan/src/plans/ir/format.rs | 15 +- crates/polars-plan/src/plans/ir/mod.rs | 9 +- .../src/plans/optimizer/count_star.rs | 15 +- .../src/lazyframe/visitor/nodes.rs | 7 +- 8 files changed, 189 insertions(+), 125 deletions(-) diff --git a/crates/polars-io/src/csv/read/mod.rs b/crates/polars-io/src/csv/read/mod.rs index 969be1a58908..b9d48291f8ce 100644 --- a/crates/polars-io/src/csv/read/mod.rs +++ b/crates/polars-io/src/csv/read/mod.rs @@ -26,7 +26,7 @@ mod splitfields; mod utils; pub use options::{CommentPrefix, CsvEncoding, CsvParseOptions, CsvReadOptions, NullValues}; -pub use parser::count_rows; +pub use parser::{count_rows, count_rows_from_slice}; pub use read_impl::batched::{BatchedCsvReader, OwnedBatchedCsvReader}; pub use reader::CsvReader; pub use schema_inference::infer_file_schema; diff --git a/crates/polars-io/src/csv/read/parser.rs b/crates/polars-io/src/csv/read/parser.rs index 18e6ef5f3f6d..9d2852a02c82 100644 --- a/crates/polars-io/src/csv/read/parser.rs +++ b/crates/polars-io/src/csv/read/parser.rs @@ -54,12 +54,32 @@ pub fn count_rows( reader_bytes = &reader_bytes[1..]; } + count_rows_from_slice( + reader_bytes, + separator, + quote_char, + comment_prefix, + eol_char, + has_header, + ) +} + +/// Read the number of rows without parsing columns +/// useful for count(*) queries +pub fn count_rows_from_slice( + bytes: &[u8], + separator: u8, + quote_char: Option, + comment_prefix: Option<&CommentPrefix>, + eol_char: u8, + has_header: bool, +) -> PolarsResult { const MIN_ROWS_PER_THREAD: usize = 1024; let max_threads = POOL.current_num_threads(); // Determine if parallelism is beneficial and how many threads let n_threads = get_line_stats( - reader_bytes, + bytes, MIN_ROWS_PER_THREAD, eol_char, None, @@ -67,22 +87,16 @@ pub fn count_rows( quote_char, ) .map(|(mean, std)| { - let n_rows = (reader_bytes.len() as f32 / (mean - 0.01 * std)) as usize; + let n_rows = (bytes.len() as f32 / (mean - 0.01 * std)) as usize; (n_rows / MIN_ROWS_PER_THREAD).clamp(1, max_threads) }) .unwrap_or(1); - let file_chunks: Vec<(usize, usize)> = get_file_chunks( - reader_bytes, - n_threads, - None, - separator, - quote_char, - eol_char, - ); + let file_chunks: Vec<(usize, usize)> = + get_file_chunks(bytes, n_threads, None, separator, quote_char, eol_char); let iter = file_chunks.into_par_iter().map(|(start, stop)| { - let local_bytes = &reader_bytes[start..stop]; + let local_bytes = &bytes[start..stop]; let row_iterator = SplitLines::new(local_bytes, quote_char.unwrap_or(b'"'), eol_char); if comment_prefix.is_some() { Ok(row_iterator diff --git a/crates/polars-plan/src/plans/functions/count.rs b/crates/polars-plan/src/plans/functions/count.rs index bd68db61a06c..dca574e67808 100644 --- a/crates/polars-plan/src/plans/functions/count.rs +++ b/crates/polars-plan/src/plans/functions/count.rs @@ -3,9 +3,7 @@ use arrow::io::ipc::read::get_row_count as count_rows_ipc_sync; #[cfg(any(feature = "parquet", feature = "json"))] use 
polars_io::cloud::CloudOptions; #[cfg(feature = "csv")] -use polars_io::csv::read::count_rows as count_rows_csv; -#[cfg(any(feature = "parquet", feature = "ipc", feature = "json"))] -use polars_io::is_cloud_url; +use polars_io::csv::read::{count_rows as count_rows_csv, count_rows_from_slice as count_rows_csv_from_slice}; #[cfg(all(feature = "parquet", feature = "cloud"))] use polars_io::parquet::read::ParquetAsyncReader; #[cfg(feature = "parquet")] @@ -18,7 +16,7 @@ use polars_io::SerReader; use super::*; #[allow(unused_variables)] -pub fn count_rows(paths: &Arc>, scan_type: &FileScan) -> PolarsResult { +pub fn count_rows(sources: &Arc<[ScanSource]>, scan_type: &FileScan) -> PolarsResult { #[cfg(not(any( feature = "parquet", feature = "ipc", @@ -41,26 +39,10 @@ pub fn count_rows(paths: &Arc>, scan_type: &FileScan) -> PolarsResu FileScan::Csv { options, cloud_options, - } => { - let parse_options = options.get_parse_options(); - let n_rows: PolarsResult = paths - .iter() - .map(|path| { - count_rows_csv( - path, - parse_options.separator, - parse_options.quote_char, - parse_options.comment_prefix.as_ref(), - parse_options.eol_char, - options.has_header, - ) - }) - .sum(); - n_rows - }, + } => count_all_rows_csv(sources, options), #[cfg(feature = "parquet")] FileScan::Parquet { cloud_options, .. } => { - count_rows_parquet(paths, cloud_options.as_ref()) + count_rows_parquet(sources, cloud_options.as_ref()) }, #[cfg(feature = "ipc")] FileScan::Ipc { @@ -68,7 +50,7 @@ pub fn count_rows(paths: &Arc>, scan_type: &FileScan) -> PolarsResu cloud_options, metadata, } => count_rows_ipc( - paths, + sources, #[cfg(feature = "cloud")] cloud_options.as_ref(), metadata.as_ref(), @@ -77,7 +59,7 @@ pub fn count_rows(paths: &Arc>, scan_type: &FileScan) -> PolarsResu FileScan::NDJson { options, cloud_options, - } => count_rows_ndjson(paths, cloud_options.as_ref()), + } => count_rows_ndjson(sources, cloud_options.as_ref()), FileScan::Anonymous { .. 
} => { unreachable!() }, @@ -92,15 +74,51 @@ pub fn count_rows(paths: &Arc>, scan_type: &FileScan) -> PolarsResu )]) } } + +#[cfg(feature = "csv")] +fn count_all_rows_csv( + sources: &Arc<[ScanSource]>, + options: &polars_io::prelude::CsvReadOptions, +) -> PolarsResult { + let parse_options = options.get_parse_options(); + + sources + .iter() + .map(|source| match source { + ScanSource::Files(paths) => paths + .iter() + .map(|path| { + count_rows_csv( + path, + parse_options.separator, + parse_options.quote_char, + parse_options.comment_prefix.as_ref(), + parse_options.eol_char, + options.has_header, + ) + }) + .sum::>(), + ScanSource::Buffer(buf) => count_rows_csv_from_slice( + &buf[..], + parse_options.separator, + parse_options.quote_char, + parse_options.comment_prefix.as_ref(), + parse_options.eol_char, + options.has_header, + ), + }) + .sum() +} + #[cfg(feature = "parquet")] pub(super) fn count_rows_parquet( - paths: &Arc>, + sources: &Arc<[ScanSource]>, #[allow(unused)] cloud_options: Option<&CloudOptions>, ) -> PolarsResult { - if paths.is_empty() { + if sources.is_empty() { return Ok(0); }; - let is_cloud = is_cloud_url(paths.first().unwrap().as_path()); + let is_cloud = sources.first().unwrap().is_cloud_url()?; if is_cloud { #[cfg(not(feature = "cloud"))] @@ -108,15 +126,19 @@ pub(super) fn count_rows_parquet( #[cfg(feature = "cloud")] { - get_runtime().block_on(count_rows_cloud_parquet(paths, cloud_options)) + get_runtime().block_on(count_rows_cloud_parquet(sources, cloud_options)) } } else { - paths + sources .iter() - .map(|path| { - let file = polars_utils::open_file(path)?; - let mut reader = ParquetReader::new(file); - reader.num_rows() + .map(|source| match source { + ScanSource::Files(paths) => paths + .iter() + .map(|path| ParquetReader::new(polars_utils::open_file(path)?).num_rows()) + .sum::>(), + ScanSource::Buffer(buffer) => { + ParquetReader::new(std::io::Cursor::new(buffer)).num_rows() + }, }) .sum::>() } @@ -124,14 +146,17 @@ pub(super) fn count_rows_parquet( #[cfg(all(feature = "parquet", feature = "async"))] async fn count_rows_cloud_parquet( - paths: &Arc>, + sources: &Arc<[ScanSource]>, cloud_options: Option<&CloudOptions>, ) -> PolarsResult { - let collection = paths.iter().map(|path| { - with_concurrency_budget(1, || async { - let mut reader = - ParquetAsyncReader::from_uri(&path.to_string_lossy(), cloud_options, None).await?; - reader.num_rows().await + let collection = sources.iter().flat_map(|source| { + source.as_paths().iter().map(|path| { + with_concurrency_budget(1, || async { + let mut reader = + ParquetAsyncReader::from_uri(&path.to_string_lossy(), cloud_options, None) + .await?; + reader.num_rows().await + }) }) }); futures::future::try_join_all(collection) @@ -141,14 +166,14 @@ async fn count_rows_cloud_parquet( #[cfg(feature = "ipc")] pub(super) fn count_rows_ipc( - paths: &Arc>, + sources: &Arc<[ScanSource]>, #[cfg(feature = "cloud")] cloud_options: Option<&CloudOptions>, metadata: Option<&arrow::io::ipc::read::FileMetadata>, ) -> PolarsResult { - if paths.is_empty() { + if sources.is_empty() { return Ok(0); }; - let is_cloud = is_cloud_url(paths.first().unwrap().as_path()); + let is_cloud = sources.first().unwrap().is_cloud_url()?; if is_cloud { #[cfg(not(feature = "cloud"))] @@ -156,31 +181,41 @@ pub(super) fn count_rows_ipc( #[cfg(feature = "cloud")] { - get_runtime().block_on(count_rows_cloud_ipc(paths, cloud_options, metadata)) + get_runtime().block_on(count_rows_cloud_ipc(sources, cloud_options, metadata)) } } else { - paths + sources 
.iter() - .map(|path| { - let mut reader = polars_utils::open_file(path)?; - count_rows_ipc_sync(&mut reader).map(|v| v as usize) + .map(|source| match source { + ScanSource::Files(paths) => paths + .iter() + .map(|path| { + count_rows_ipc_sync(&mut polars_utils::open_file(path)?).map(|v| v as usize) + }) + .sum::>(), + ScanSource::Buffer(buffer) => { + count_rows_ipc_sync(&mut std::io::Cursor::new(buffer)).map(|v| v as usize) + }, }) - .sum() + .sum::>() } } #[cfg(all(feature = "ipc", feature = "async"))] async fn count_rows_cloud_ipc( - paths: &Arc>, + sources: &Arc<[ScanSource]>, cloud_options: Option<&CloudOptions>, metadata: Option<&arrow::io::ipc::read::FileMetadata>, ) -> PolarsResult { use polars_io::ipc::IpcReaderAsync; - let collection = paths.iter().map(|path| { - with_concurrency_budget(1, || async { - let reader = IpcReaderAsync::from_uri(&path.to_string_lossy(), cloud_options).await?; - reader.count_rows(metadata).await + let collection = sources.iter().flat_map(|source| { + source.as_paths().iter().map(|path| { + with_concurrency_budget(1, || async { + let reader = + IpcReaderAsync::from_uri(&path.to_string_lossy(), cloud_options).await?; + reader.count_rows(metadata).await + }) }) }); futures::future::try_join_all(collection) @@ -190,21 +225,23 @@ async fn count_rows_cloud_ipc( #[cfg(feature = "json")] pub(super) fn count_rows_ndjson( - paths: &Arc>, + sources: &Arc<[ScanSource]>, cloud_options: Option<&CloudOptions>, ) -> PolarsResult { use polars_core::config; + use polars_core::error::feature_gated; use polars_io::utils::maybe_decompress_bytes; - let run_async = !paths.is_empty() && is_cloud_url(&paths[0]) || config::force_async(); + let run_async = + !sources.is_empty() && sources.first().unwrap().is_cloud_url()? || config::force_async(); let cache_entries = { - #[cfg(feature = "cloud")] - { + feature_gated!("cloud", { if run_async { Some(polars_io::file_cache::init_entries_from_uri_list( - paths + sources .iter() + .flat_map(|source| source.as_paths()) .map(|path| Arc::from(path.to_str().unwrap())) .collect::>() .as_slice(), @@ -213,39 +250,43 @@ pub(super) fn count_rows_ndjson( } else { None } - } - #[cfg(not(feature = "cloud"))] - { - if run_async { - panic!("required feature `cloud` is not enabled") - } - } + }) }; - (0..paths.len()) - .map(|i| { - let f = if run_async { - #[cfg(feature = "cloud")] - { - let entry: &Arc = - &cache_entries.as_ref().unwrap()[0]; - entry.try_open_check_latest()? - } - #[cfg(not(feature = "cloud"))] - { - panic!("required feature `cloud` is not enabled") - } - } else { - polars_utils::open_file(&paths[i])? - }; + sources + .iter() + .map(|source| match source { + ScanSource::Files(paths) => paths + .iter() + .map(|path| { + let f = if run_async { + feature_gated!("cloud", { + let entry: &Arc = + &cache_entries.as_ref().unwrap()[0]; + entry.try_open_check_latest()? + }) + } else { + polars_utils::open_file(path)? 
+ }; - let mmap = unsafe { memmap::Mmap::map(&f).unwrap() }; - let owned = &mut vec![]; + let mmap = unsafe { memmap::Mmap::map(&f).unwrap() }; + let owned = &mut vec![]; - let reader = polars_io::ndjson::core::JsonLineReader::new(std::io::Cursor::new( - maybe_decompress_bytes(mmap.as_ref(), owned)?, - )); - reader.count() + let reader = polars_io::ndjson::core::JsonLineReader::new( + std::io::Cursor::new(maybe_decompress_bytes(mmap.as_ref(), owned)?), + ); + reader.count() + }) + .sum::>(), + ScanSource::Buffer(buffer) => { + polars_ensure!(!run_async, nyi = "BytesIO with force_async"); + + let owned = &mut vec![]; + let reader = polars_io::ndjson::core::JsonLineReader::new(std::io::Cursor::new( + maybe_decompress_bytes(buffer, owned)?, + )); + reader.count() + }, }) .sum() } diff --git a/crates/polars-plan/src/plans/functions/mod.rs b/crates/polars-plan/src/plans/functions/mod.rs index b0e5bb444689..37e01bcc654f 100644 --- a/crates/polars-plan/src/plans/functions/mod.rs +++ b/crates/polars-plan/src/plans/functions/mod.rs @@ -10,7 +10,6 @@ mod schema; use std::borrow::Cow; use std::fmt::{Debug, Display, Formatter}; use std::hash::{Hash, Hasher}; -use std::path::PathBuf; use std::sync::{Arc, Mutex}; pub use dsl::*; @@ -45,7 +44,7 @@ pub enum FunctionIR { fmt_str: PlSmallStr, }, FastCount { - paths: Arc>, + sources: Arc<[ScanSource]>, scan_type: FileScan, alias: Option, }, @@ -104,8 +103,8 @@ impl PartialEq for FunctionIR { use FunctionIR::*; match (self, other) { (Rechunk, Rechunk) => true, - (FastCount { paths: paths_l, .. }, FastCount { paths: paths_r, .. }) => { - paths_l == paths_r + (FastCount { sources: srcs_l, .. }, FastCount { sources: srcs_r, .. }) => { + srcs_l == srcs_r }, ( Rename { @@ -138,11 +137,11 @@ impl Hash for FunctionIR { FunctionIR::OpaquePython { .. } => {}, FunctionIR::Opaque { fmt_str, .. } => fmt_str.hash(state), FunctionIR::FastCount { - paths, + sources, scan_type, alias, } => { - paths.hash(state); + sources.hash(state); scan_type.hash(state); alias.hash(state); }, @@ -261,8 +260,8 @@ impl FunctionIR { .. }) => python_udf::call_python_udf(function, df, *validate_output, schema.as_deref()), FastCount { - paths, scan_type, .. - } => count::count_rows(paths, scan_type), + sources, scan_type, .. 
+ } => count::count_rows(sources, scan_type), Rechunk => { df.as_single_chunk_par(); Ok(df) diff --git a/crates/polars-plan/src/plans/ir/format.rs b/crates/polars-plan/src/plans/ir/format.rs index 6c1c37b78671..cc64daf67a30 100644 --- a/crates/polars-plan/src/plans/ir/format.rs +++ b/crates/polars-plan/src/plans/ir/format.rs @@ -1,7 +1,6 @@ use std::borrow::Cow; use std::fmt; use std::fmt::{Display, Formatter}; -use std::path::PathBuf; use polars_core::datatypes::AnyValue; use polars_core::schema::Schema; @@ -56,7 +55,7 @@ impl AsExpr for ExprIR { fn write_scan( f: &mut Formatter, name: &str, - path: &[PathBuf], + source: &ScanSource, indent: usize, n_columns: i64, total_columns: usize, @@ -64,7 +63,12 @@ fn write_scan( slice: Option<(i64, usize)>, row_index: Option<&RowIndex>, ) -> fmt::Result { - write!(f, "{:indent$}{name} SCAN {}", "", PathsDisplay(path))?; + write!(f, "{:indent$}{name} SCAN ", "")?; + + match source { + ScanSource::Files(paths) => write!(f, "{}", PathsDisplay(paths.as_ref()))?, + ScanSource::Buffer(_) => write!(f, "IN MEMORY BUFFER")?, + } let total_columns = total_columns - usize::from(row_index.is_some()); if n_columns > 0 { @@ -171,7 +175,7 @@ impl<'a> IRDisplay<'a> { write_scan( f, "PYTHON", - &[], + &ScanSource::default(), indent, n_columns, total_columns, @@ -228,7 +232,6 @@ impl<'a> IRDisplay<'a> { file_options, .. } => { - let paths = sources.as_paths(); let n_columns = file_options .with_columns .as_ref() @@ -240,7 +243,7 @@ impl<'a> IRDisplay<'a> { write_scan( f, scan_type.into(), - paths, + &sources, indent, n_columns, file_info.schema.len(), diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index 7062514f7689..52593e3d2b28 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -36,7 +36,7 @@ pub struct IRPlanRef<'a> { } #[cfg_attr(feature = "ir_serde", derive(Serialize, Deserialize))] -#[derive(Debug, Clone, Hash)] +#[derive(Debug, Clone, Hash, PartialEq, Eq)] pub enum ScanSource { Files(Arc<[PathBuf]>), #[cfg_attr(feature = "ir_serde", serde(skip))] @@ -62,6 +62,13 @@ impl ScanSource { } } + pub fn try_into_paths(&self) -> PolarsResult> { + match self { + ScanSource::Files(paths) => Ok(paths.clone()), + ScanSource::Buffer(_) => Err(polars_err!(nyi = "Unable to convert BytesIO scan into path")), + } + } + pub fn into_paths(&self) -> Arc<[PathBuf]> { match self { ScanSource::Files(paths) => paths.clone(), diff --git a/crates/polars-plan/src/plans/optimizer/count_star.rs b/crates/polars-plan/src/plans/optimizer/count_star.rs index 8565e066dcb4..d88956d2903f 100644 --- a/crates/polars-plan/src/plans/optimizer/count_star.rs +++ b/crates/polars-plan/src/plans/optimizer/count_star.rs @@ -1,5 +1,3 @@ -use std::path::PathBuf; - use super::*; pub(super) struct CountStar; @@ -32,7 +30,7 @@ impl OptimizationRule for CountStar { let alp = IR::MapFunction { input: placeholder_node, function: FunctionIR::FastCount { - paths: count_star_expr.paths, + sources: count_star_expr.sources, scan_type: count_star_expr.scan_type, alias: count_star_expr.alias, }, @@ -49,7 +47,7 @@ struct CountStarExpr { // Top node of the projection to replace node: Node, // Paths to the input files - paths: Arc>, + sources: Arc<[ScanSource]>, // File Type scan_type: FileScan, // Column Alias @@ -67,11 +65,11 @@ fn visit_logical_plan_for_scan_paths( match lp_arena.get(node) { IR::Union { inputs, .. 
} => { let mut scan_type: Option = None; - let mut paths = Vec::with_capacity(inputs.len()); + let mut sources = Vec::with_capacity(inputs.len()); for input in inputs { match visit_logical_plan_for_scan_paths(*input, lp_arena, expr_arena, true) { Some(expr) => { - paths.extend(expr.paths.iter().cloned()); + sources.extend(expr.sources.iter().cloned()); match &scan_type { None => scan_type = Some(expr.scan_type), Some(scan_type) => { @@ -88,7 +86,7 @@ fn visit_logical_plan_for_scan_paths( } } Some(CountStarExpr { - paths: paths.into(), + sources: sources.into(), scan_type: scan_type.unwrap(), node, alias: None, @@ -97,8 +95,7 @@ fn visit_logical_plan_for_scan_paths( IR::Scan { scan_type, sources, .. } if !matches!(scan_type, FileScan::Anonymous { .. }) => Some(CountStarExpr { - // @FIX: Count Star Should probably just have a Arc Slice - paths: Arc::new(sources.as_paths().as_ref().to_vec()), + sources: [sources.clone()].into(), scan_type: scan_type.clone(), node, alias: None, diff --git a/crates/polars-python/src/lazyframe/visitor/nodes.rs b/crates/polars-python/src/lazyframe/visitor/nodes.rs index 973a4ce432ef..3c31ff11b63a 100644 --- a/crates/polars-python/src/lazyframe/visitor/nodes.rs +++ b/crates/polars-python/src/lazyframe/visitor/nodes.rs @@ -325,7 +325,10 @@ pub(crate) fn into_py(py: Python<'_>, plan: &IR) -> PyResult { scan_type, file_options, } => Scan { - paths: sources.into_paths().to_object(py), + paths: sources + .try_into_paths() + .map_err(|_| PyNotImplementedError::new_err("scan with BytesIO"))? + .to_object(py), // TODO: file info file_info: py.None(), predicate: predicate.as_ref().map(|e| e.into()), @@ -596,7 +599,7 @@ pub(crate) fn into_py(py: Python<'_>, plan: &IR) -> PyResult { offset, } => ("row_index", name.to_string(), offset.unwrap_or(0)).to_object(py), FunctionIR::FastCount { - paths: _, + sources: _, scan_type: _, alias: _, } => return Err(PyNotImplementedError::new_err("function count")), From 3e72f51a936495db4cea5387cc71118101e1c095 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Tue, 3 Sep 2024 17:35:28 +0200 Subject: [PATCH 03/27] refactor: Add ScanSource to other scan functions --- crates/polars-lazy/src/scan/csv.rs | 4 + crates/polars-lazy/src/scan/ipc.rs | 4 + .../src/executors/scan/csv.rs | 56 ++--- .../src/executors/scan/ipc.rs | 11 +- .../src/executors/scan/parquet.rs | 26 +- crates/polars-mem-engine/src/planner/lp.rs | 2 +- crates/polars-plan/src/client/check.rs | 10 +- .../src/plans/conversion/dsl_to_ir.rs | 11 +- .../polars-plan/src/plans/conversion/mod.rs | 2 +- .../polars-plan/src/plans/conversion/scans.rs | 207 ++++++++-------- .../polars-plan/src/plans/functions/count.rs | 4 +- crates/polars-plan/src/plans/functions/mod.rs | 11 +- crates/polars-plan/src/plans/ir/mod.rs | 16 +- crates/polars-python/src/file.rs | 5 +- crates/polars-python/src/lazyframe/general.rs | 228 ++++++++---------- py-polars/polars/io/csv/functions.py | 6 +- py-polars/polars/io/ipc/functions.py | 5 +- py-polars/polars/io/ndjson.py | 6 +- py-polars/polars/io/parquet/functions.py | 8 +- 19 files changed, 309 insertions(+), 313 deletions(-) diff --git a/crates/polars-lazy/src/scan/csv.rs b/crates/polars-lazy/src/scan/csv.rs index 676c34b6a71e..e408681789c3 100644 --- a/crates/polars-lazy/src/scan/csv.rs +++ b/crates/polars-lazy/src/scan/csv.rs @@ -35,6 +35,10 @@ impl LazyCsvReader { Self::new("").with_paths(paths) } + pub fn new_sourced(source: ScanSource) -> Self { + Self::new("").with_source(source) + } + pub fn new(path: impl AsRef) -> Self { LazyCsvReader { source: 
ScanSource::Files([path.as_ref().to_path_buf()].into()), diff --git a/crates/polars-lazy/src/scan/ipc.rs b/crates/polars-lazy/src/scan/ipc.rs index af0b53ade823..18043a15717a 100644 --- a/crates/polars-lazy/src/scan/ipc.rs +++ b/crates/polars-lazy/src/scan/ipc.rs @@ -132,4 +132,8 @@ impl LazyFrame { pub fn scan_ipc_files(paths: Arc<[PathBuf]>, args: ScanArgsIpc) -> PolarsResult { LazyIpcReader::new(args).with_paths(paths).finish() } + + pub fn scan_ipc_sourced(source: ScanSource, args: ScanArgsIpc) -> PolarsResult { + LazyIpcReader::new(args).with_source(source).finish() + } } diff --git a/crates/polars-mem-engine/src/executors/scan/csv.rs b/crates/polars-mem-engine/src/executors/scan/csv.rs index 24e813329bcf..4ab5a034c584 100644 --- a/crates/polars-mem-engine/src/executors/scan/csv.rs +++ b/crates/polars-mem-engine/src/executors/scan/csv.rs @@ -4,11 +4,12 @@ use polars_core::config; use polars_core::utils::{ accumulate_dataframes_vertical, accumulate_dataframes_vertical_unchecked, }; +use polars_error::feature_gated; use super::*; pub struct CsvExec { - pub sources: ScanSource, + pub source: ScanSource, pub file_info: FileInfo, pub options: CsvReadOptions, pub file_options: FileScanOptions, @@ -17,7 +18,7 @@ pub struct CsvExec { impl CsvExec { fn read(&self) -> PolarsResult { - let paths = self.sources.as_paths(); + let paths = self.source.as_paths(); let with_columns = self .file_options .with_columns @@ -65,42 +66,28 @@ impl CsvExec { let finish_read = |i: usize, options: CsvReadOptions, predicate: Option>| { let path = &paths[i]; - let mut df = if run_async { - #[cfg(feature = "cloud")] - { - let file = polars_io::file_cache::FILE_CACHE + let file = if run_async { + feature_gated!("cloud", { + polars_io::file_cache::FILE_CACHE .get_entry(path.to_str().unwrap()) // Safety: This was initialized by schema inference. 
.unwrap() - .try_open_assume_latest()?; - let owned = &mut vec![]; - let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; - - options - .into_reader_with_file_handle(std::io::Cursor::new( - maybe_decompress_bytes(mmap.as_ref(), owned)?, - )) - ._with_predicate(predicate.clone()) - .finish() - } - #[cfg(not(feature = "cloud"))] - { - panic!("required feature `cloud` is not enabled") - } + .try_open_assume_latest() + }) } else { - let file = polars_utils::open_file(path)?; - let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; - let owned = &mut vec![]; - - options - .into_reader_with_file_handle(std::io::Cursor::new(maybe_decompress_bytes( - mmap.as_ref(), - owned, - )?)) - ._with_predicate(predicate.clone()) - .finish() + polars_utils::open_file(path) }?; + let owned = &mut vec![]; + let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; + let mut df = options + .into_reader_with_file_handle(std::io::Cursor::new(maybe_decompress_bytes( + mmap.as_ref(), + owned, + )?)) + ._with_predicate(predicate.clone()) + .finish()?; + if let Some(col) = &self.file_options.include_file_paths { let path = path.to_str().unwrap(); unsafe { @@ -234,11 +221,8 @@ impl CsvExec { impl Executor for CsvExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { - let paths = self.sources.as_paths(); let profile_name = if state.has_node_timer() { - let mut ids = vec![PlSmallStr::from_str( - paths[0].to_string_lossy().as_ref(), - )]; + let mut ids = vec![self.source.id()]; if self.predicate.is_some() { ids.push("predicate".into()) } diff --git a/crates/polars-mem-engine/src/executors/scan/ipc.rs b/crates/polars-mem-engine/src/executors/scan/ipc.rs index b29e44a5e33c..b9387cad5878 100644 --- a/crates/polars-mem-engine/src/executors/scan/ipc.rs +++ b/crates/polars-mem-engine/src/executors/scan/ipc.rs @@ -1,4 +1,3 @@ - use hive::HivePartitions; use polars_core::config; use polars_core::utils::accumulate_dataframes_vertical; @@ -90,11 +89,7 @@ impl IpcExec { Arc::from(paths[path_index].to_str().unwrap().to_string()), ) })) - .memory_mapped( - self.options - .memory_map - .then(|| paths[path_index].clone()), - ) + .memory_mapped(self.options.memory_map.then(|| paths[path_index].clone())) .finish() }; @@ -191,9 +186,7 @@ impl Executor for IpcExec { let paths = self.sources.as_paths(); let profile_name = if state.has_node_timer() { - let mut ids = vec![PlSmallStr::from_str( - paths[0].to_string_lossy().as_ref(), - )]; + let mut ids = vec![PlSmallStr::from_str(paths[0].to_string_lossy().as_ref())]; if self.predicate.is_some() { ids.push("predicate".into()) } diff --git a/crates/polars-mem-engine/src/executors/scan/parquet.rs b/crates/polars-mem-engine/src/executors/scan/parquet.rs index efed503ad511..ed740809fcd3 100644 --- a/crates/polars-mem-engine/src/executors/scan/parquet.rs +++ b/crates/polars-mem-engine/src/executors/scan/parquet.rs @@ -13,7 +13,7 @@ use polars_io::RowIndex; use super::*; pub struct ParquetExec { - sources: ScanSource, + source: ScanSource, file_info: FileInfo, hive_parts: Option>>, predicate: Option>, @@ -28,7 +28,7 @@ pub struct ParquetExec { impl ParquetExec { #[allow(clippy::too_many_arguments)] pub(crate) fn new( - sources: ScanSource, + source: ScanSource, file_info: FileInfo, hive_parts: Option>>, predicate: Option>, @@ -38,7 +38,7 @@ impl ParquetExec { metadata: Option, ) -> Self { ParquetExec { - sources, + source, file_info, hive_parts, predicate, @@ -51,7 +51,7 @@ impl ParquetExec { fn read_par(&mut self) -> PolarsResult> { let parallel = match 
self.options.parallel { - ParallelStrategy::Auto if self.sources.num_sources() > POOL.current_num_threads() => { + ParallelStrategy::Auto if self.source.num_sources() > POOL.current_num_threads() => { ParallelStrategy::RowGroups }, identity => identity, @@ -63,16 +63,16 @@ impl ParquetExec { let slice_info = match self.file_options.slice { None => ScanSourceSliceInfo { item_slice: 0..usize::MAX, - source_slice: 0..self.sources.num_sources(), + source_slice: 0..self.source.num_sources(), }, - Some(slice) => self.sources.collect_slice_information( + Some(slice) => self.source.collect_slice_information( slice, |path| ParquetReader::new(std::fs::File::open(path)?).num_rows(), |buff| ParquetReader::new(std::io::Cursor::new(buff)).num_rows(), )?, }; - match &self.sources { + match &self.source { ScanSource::Buffer(buffer) => { let row_index = self.file_options.row_index.take(); let (projection, predicate) = prepare_scan_args( @@ -88,7 +88,10 @@ impl ParquetExec { .set_low_memory(self.options.low_memory) .use_statistics(self.options.use_statistics) .set_rechunk(false) - .with_slice(Some((slice_info.item_slice.start, slice_info.item_slice.len()))) + .with_slice(Some(( + slice_info.item_slice.start, + slice_info.item_slice.len(), + ))) .with_row_index(row_index) .with_predicate(predicate.clone()) .with_projection(projection.clone()) @@ -223,7 +226,7 @@ impl ParquetExec { use polars_io::utils::slice::split_slice_at_file; let verbose = verbose(); - let paths = self.sources.into_paths(); + let paths = self.source.into_paths(); let first_metadata = &self.metadata; let cloud_options = self.cloud_options.as_ref(); @@ -440,7 +443,7 @@ impl ParquetExec { .and_then(|_| self.predicate.take()) .map(phys_expr_to_io_expr); - let is_cloud = match &self.sources { + let is_cloud = match &self.source { ScanSource::Files(paths) => is_cloud_url(paths.first().unwrap()), ScanSource::Buffer(_) => false, }; @@ -472,8 +475,7 @@ impl ParquetExec { impl Executor for ParquetExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { let profile_name = if state.has_node_timer() { - let paths = self.sources.as_paths(); - let mut ids = vec![paths[0].to_string_lossy()]; + let mut ids = vec![self.source.id()]; if self.predicate.is_some() { ids.push("predicate".into()) } diff --git a/crates/polars-mem-engine/src/planner/lp.rs b/crates/polars-mem-engine/src/planner/lp.rs index 45487f7b7024..1f161a34587a 100644 --- a/crates/polars-mem-engine/src/planner/lp.rs +++ b/crates/polars-mem-engine/src/planner/lp.rs @@ -306,7 +306,7 @@ fn create_physical_plan_impl( match scan_type { #[cfg(feature = "csv")] FileScan::Csv { options, .. } => Ok(Box::new(executors::CsvExec { - sources, + source: sources, file_info, options, predicate, diff --git a/crates/polars-plan/src/client/check.rs b/crates/polars-plan/src/client/check.rs index e28e1906c8ea..c7070d22ed0c 100644 --- a/crates/polars-plan/src/client/check.rs +++ b/crates/polars-plan/src/client/check.rs @@ -2,7 +2,7 @@ use polars_core::error::{polars_err, PolarsResult}; use polars_io::path_utils::is_cloud_url; use crate::plans::options::SinkType; -use crate::plans::{DslPlan, FileScan, DslScanSource}; +use crate::plans::{DslPlan, DslScanSource, FileScan}; /// Assert that the given [`DslPlan`] is eligible to be executed on Polars Cloud. 
pub(super) fn assert_cloud_eligible(dsl: &DslPlan) -> PolarsResult<()> { @@ -15,13 +15,7 @@ pub(super) fn assert_cloud_eligible(dsl: &DslPlan) -> PolarsResult<()> { } => { match sources { DslScanSource::File(file) => { - if file - .lock() - .unwrap() - .paths - .iter() - .any(|p| !is_cloud_url(p)) - { + if file.lock().unwrap().paths.iter().any(|p| !is_cloud_url(p)) { return ineligible_error("contains scan of local file system"); } }, diff --git a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs index 825c5896097b..72e75d2b3017 100644 --- a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs +++ b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs @@ -145,9 +145,12 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult metadata, .. } => { - let (file_info, md) = - scans::parquet_file_info(&source, &file_options, cloud_options.as_ref()) - .map_err(|e| e.context(failed_here!(parquet scan)))?; + let (file_info, md) = scans::parquet_file_info( + &source, + &file_options, + cloud_options.as_ref(), + ) + .map_err(|e| e.context(failed_here!(parquet scan)))?; *metadata = md; file_info }, @@ -171,7 +174,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult options, cloud_options, } => scans::csv_file_info( - source.as_paths(), + &source, &file_options, options, cloud_options.as_ref(), diff --git a/crates/polars-plan/src/plans/conversion/mod.rs b/crates/polars-plan/src/plans/conversion/mod.rs index 3e8f8748e618..9851a6d2c3ba 100644 --- a/crates/polars-plan/src/plans/conversion/mod.rs +++ b/crates/polars-plan/src/plans/conversion/mod.rs @@ -58,7 +58,7 @@ impl IR { output_schema: _, file_options: options, } => DslPlan::Scan { - sources: sources.into(), + sources: sources.to_dsl(true), file_info: Arc::new(RwLock::new(Some(file_info))), hive_parts, predicate: predicate.map(|e| e.to_expr(expr_arena)), diff --git a/crates/polars-plan/src/plans/conversion/scans.rs b/crates/polars-plan/src/plans/conversion/scans.rs index 82c953e2ffa2..1cc939417d60 100644 --- a/crates/polars-plan/src/plans/conversion/scans.rs +++ b/crates/polars-plan/src/plans/conversion/scans.rs @@ -1,5 +1,5 @@ use std::path::PathBuf; -use std::sync::{Arc, Mutex}; +use std::sync::Arc; use either::Either; use polars_io::path_utils::is_cloud_url; @@ -17,18 +17,6 @@ fn get_first_path(paths: &[PathBuf]) -> PolarsResult<&PathBuf> { .ok_or_else(|| polars_err!(ComputeError: "expected at least 1 path")) } -impl From for DslScanSource { - fn from(value: ScanSource) -> Self { - match value { - ScanSource::Files(paths) => DslScanSource::File(Arc::new(Mutex::new(ScanFileSource { - paths, - is_expanded: true, - }))), - ScanSource::Buffer(buffer) => DslScanSource::Buffer(buffer), - } - } -} - #[cfg(any(feature = "parquet", feature = "ipc"))] fn prepare_output_schema(mut schema: Schema, row_index: Option<&RowIndex>) -> SchemaRef { if let Some(rc) = row_index { @@ -162,13 +150,14 @@ pub(super) fn ipc_file_info( #[cfg(feature = "csv")] pub(super) fn csv_file_info( - paths: &[PathBuf], + source: &ScanSource, file_options: &FileScanOptions, csv_options: &mut CsvReadOptions, cloud_options: Option<&polars_io::cloud::CloudOptions>, ) -> PolarsResult { use std::io::{Read, Seek}; + use polars_core::error::feature_gated; use polars_core::{config, POOL}; use polars_io::csv::read::schema_inference::SchemaInferenceResult; use polars_io::utils::get_reader_bytes; @@ -179,105 +168,123 @@ pub(super) fn csv_file_info( // * See if we can do this without 
downloading the entire file // prints the error message if paths is empty. - let first_path = get_first_path(paths)?; - let run_async = is_cloud_url(first_path) || config::force_async(); - - let cache_entries = { - #[cfg(feature = "cloud")] - { - if run_async { - Some(polars_io::file_cache::init_entries_from_uri_list( - paths - .iter() - .map(|path| Arc::from(path.to_str().unwrap())) - .collect::>() - .as_slice(), - cloud_options, - )?) - } else { - None - } - } - #[cfg(not(feature = "cloud"))] - { - if run_async { - panic!("required feature `cloud` is not enabled") - } - } - }; - - let infer_schema_func = |i| { - let file = if run_async { - #[cfg(feature = "cloud")] - { - let entry: &Arc = - &cache_entries.as_ref().unwrap()[i]; - entry.try_open_check_latest()? - } - #[cfg(not(feature = "cloud"))] - { - panic!("required feature `cloud` is not enabled") - } - } else { - let p: &PathBuf = &paths[i]; - polars_utils::open_file(p.as_ref())? - }; - - let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; - let owned = &mut vec![]; + let run_async = source.is_cloud_url()? || config::force_async(); - let mut curs = std::io::Cursor::new(maybe_decompress_bytes(mmap.as_ref(), owned)?); + let si_result = match source { + ScanSource::Files(paths) => { + let cache_entries = { + feature_gated!("cloud", { + if run_async { + Some(polars_io::file_cache::init_entries_from_uri_list( + source + .as_paths() + .iter() + .flat_map(|p| p.iter()) + .map(|path| Arc::from(path.to_str().unwrap())) + .collect::>() + .as_slice(), + cloud_options, + )?) + } else { + None + } + }) + }; - if curs.read(&mut [0; 4])? < 2 && csv_options.raise_if_empty { - polars_bail!(NoData: "empty CSV") - } - curs.rewind()?; + let infer_schema_func = |i| { + let file = if run_async { + feature_gated!("cloud", { + let entry: &Arc = + &cache_entries.as_ref().unwrap()[i]; + entry.try_open_check_latest()? + }) + } else { + let p: &PathBuf = &paths[i]; + polars_utils::open_file(p.as_ref())? + }; + + let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; + let owned = &mut vec![]; + + let mut curs = std::io::Cursor::new(maybe_decompress_bytes(mmap.as_ref(), owned)?); + + if curs.read(&mut [0; 4])? < 2 && csv_options.raise_if_empty { + polars_bail!(NoData: "empty CSV") + } + curs.rewind()?; - let reader_bytes = get_reader_bytes(&mut curs).expect("could not mmap file"); + let reader_bytes = get_reader_bytes(&mut curs).expect("could not mmap file"); - // this needs a way to estimated bytes/rows. - let si_result = - SchemaInferenceResult::try_from_reader_bytes_and_options(&reader_bytes, csv_options)?; + // this needs a way to estimated bytes/rows. 
+ let si_result = SchemaInferenceResult::try_from_reader_bytes_and_options( + &reader_bytes, + csv_options, + )?; - Ok(si_result) - }; + Ok(si_result) + }; - let merge_func = |a: PolarsResult, - b: PolarsResult| match (a, b) { - (Err(e), _) | (_, Err(e)) => Err(e), - (Ok(a), Ok(b)) => { - let merged_schema = if csv_options.schema.is_some() { - csv_options.schema.clone().unwrap() - } else { - let schema_a = a.get_inferred_schema(); - let schema_b = b.get_inferred_schema(); - - match (schema_a.is_empty(), schema_b.is_empty()) { - (true, _) => schema_b, - (_, true) => schema_a, - _ => { - let mut s = Arc::unwrap_or_clone(schema_a); - s.to_supertype(&schema_b)?; - Arc::new(s) + let merge_func = |a: PolarsResult, + b: PolarsResult| { + match (a, b) { + (Err(e), _) | (_, Err(e)) => Err(e), + (Ok(a), Ok(b)) => { + let merged_schema = if csv_options.schema.is_some() { + csv_options.schema.clone().unwrap() + } else { + let schema_a = a.get_inferred_schema(); + let schema_b = b.get_inferred_schema(); + + match (schema_a.is_empty(), schema_b.is_empty()) { + (true, _) => schema_b, + (_, true) => schema_a, + _ => { + let mut s = Arc::unwrap_or_clone(schema_a); + s.to_supertype(&schema_b)?; + Arc::new(s) + }, + } + }; + + Ok(a.with_inferred_schema(merged_schema)) }, } }; - Ok(a.with_inferred_schema(merged_schema)) - }, - }; + let si_results = POOL.join( + || infer_schema_func(0), + || { + (1..paths.len()) + .into_par_iter() + .map(infer_schema_func) + .reduce(|| Ok(Default::default()), merge_func) + }, + ); - let si_results = POOL.join( - || infer_schema_func(0), - || { - (1..paths.len()) - .into_par_iter() - .map(infer_schema_func) - .reduce(|| Ok(Default::default()), merge_func) + merge_func(si_results.0, si_results.1)? }, - ); + ScanSource::Buffer(buffer) => { + polars_ensure!(!run_async, nyi = "BytesIO scan with async"); - let si_result = merge_func(si_results.0, si_results.1)?; + let owned = &mut vec![]; + let mut reader = std::io::Cursor::new(maybe_decompress_bytes(buffer, owned)?); + + if reader.read(&mut [0; 4])? < 2 && csv_options.raise_if_empty { + polars_bail!(NoData: "empty CSV") + } + reader.rewind()?; + + let reader_bytes = get_reader_bytes(&mut reader).expect("could not open file"); + + // this needs a way to estimated bytes/rows. 
+ let si_result = SchemaInferenceResult::try_from_reader_bytes_and_options( + &reader_bytes, + csv_options, + )?; + + si_result + }, + }; csv_options.update_with_inference_result(&si_result); diff --git a/crates/polars-plan/src/plans/functions/count.rs b/crates/polars-plan/src/plans/functions/count.rs index dca574e67808..f3120bad8dff 100644 --- a/crates/polars-plan/src/plans/functions/count.rs +++ b/crates/polars-plan/src/plans/functions/count.rs @@ -3,7 +3,9 @@ use arrow::io::ipc::read::get_row_count as count_rows_ipc_sync; #[cfg(any(feature = "parquet", feature = "json"))] use polars_io::cloud::CloudOptions; #[cfg(feature = "csv")] -use polars_io::csv::read::{count_rows as count_rows_csv, count_rows_from_slice as count_rows_csv_from_slice}; +use polars_io::csv::read::{ + count_rows as count_rows_csv, count_rows_from_slice as count_rows_csv_from_slice, +}; #[cfg(all(feature = "parquet", feature = "cloud"))] use polars_io::parquet::read::ParquetAsyncReader; #[cfg(feature = "parquet")] diff --git a/crates/polars-plan/src/plans/functions/mod.rs b/crates/polars-plan/src/plans/functions/mod.rs index 37e01bcc654f..468a85273ea4 100644 --- a/crates/polars-plan/src/plans/functions/mod.rs +++ b/crates/polars-plan/src/plans/functions/mod.rs @@ -103,9 +103,14 @@ impl PartialEq for FunctionIR { use FunctionIR::*; match (self, other) { (Rechunk, Rechunk) => true, - (FastCount { sources: srcs_l, .. }, FastCount { sources: srcs_r, .. }) => { - srcs_l == srcs_r - }, + ( + FastCount { + sources: srcs_l, .. + }, + FastCount { + sources: srcs_r, .. + }, + ) => srcs_l == srcs_r, ( Rename { existing: existing_l, diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index 52593e3d2b28..ff4e46e64dd8 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -15,7 +15,7 @@ use hive::HivePartitions; use polars_core::prelude::*; use polars_core::POOL; use polars_utils::idx_vec::UnitVec; -use polars_utils::unitvec; +use polars_utils::{format_pl_smallstr, unitvec}; #[cfg(feature = "ir_serde")] use serde::{Deserialize, Serialize}; @@ -65,7 +65,9 @@ impl ScanSource { pub fn try_into_paths(&self) -> PolarsResult> { match self { ScanSource::Files(paths) => Ok(paths.clone()), - ScanSource::Buffer(_) => Err(polars_err!(nyi = "Unable to convert BytesIO scan into path")), + ScanSource::Buffer(_) => Err(polars_err!( + nyi = "Unable to convert BytesIO scan into path" + )), } } @@ -103,8 +105,16 @@ impl ScanSource { } } + pub fn id(&self) -> PlSmallStr { + match self { + ScanSource::Files(paths) if paths.is_empty() => PlSmallStr::from_static("EMPTY"), + ScanSource::Files(paths) => PlSmallStr::from_str(paths[0].to_string_lossy().as_ref()), + ScanSource::Buffer(_) => PlSmallStr::from_static("IN_MEMORY"), + } + } + /// Normalize the slice and collect information as to what rows and parts of the source are - /// used in this slice. + /// used in this slice. 
pub fn collect_slice_information( &self, slice: (i64, usize), diff --git a/crates/polars-python/src/file.rs b/crates/polars-python/src/file.rs index 9443fd0b5213..6225ee5427f7 100644 --- a/crates/polars-python/src/file.rs +++ b/crates/polars-python/src/file.rs @@ -201,10 +201,7 @@ pub enum EitherPythonFileOrPath { Path(PathBuf), } -pub fn get_either_file_or_path( - py_f: PyObject, - write: bool, -) -> PyResult { +pub fn get_either_file_or_path(py_f: PyObject, write: bool) -> PyResult { Python::with_gil(|py| { let py_f = py_f.into_bound(py); if let Ok(s) = py_f.extract::>() { diff --git a/crates/polars-python/src/lazyframe/general.rs b/crates/polars-python/src/lazyframe/general.rs index d5f64c9f35ac..2e2ce702f5bd 100644 --- a/crates/polars-python/src/lazyframe/general.rs +++ b/crates/polars-python/src/lazyframe/general.rs @@ -113,7 +113,7 @@ impl PyLazyFrame { ) )] fn new_from_csv( - path: Option, + path: Option, paths: Vec, separator: &str, has_header: bool, @@ -145,6 +145,10 @@ impl PyLazyFrame { file_cache_ttl: Option, include_file_paths: Option, ) -> PyResult { + use std::path::Path; + + use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; + let null_values = null_values.map(|w| w.0); let quote_char = quote_char.map(|s| s.as_bytes()[0]); let separator = separator.as_bytes()[0]; @@ -161,38 +165,43 @@ impl PyLazyFrame { .collect::() }); - #[cfg(feature = "cloud")] - let cloud_options = { - let first_path = if let Some(path) = &path { - path - } else { - paths - .first() - .ok_or_else(|| PyValueError::new_err("expected a path argument"))? - }; + use polars_plan::plans::ScanSource; + use EitherPythonFileOrPath as EF; + let (first_path, mut r) = match path + .map(|py_f| get_either_file_or_path(py_f, false)) + .transpose()? + { + Some(EF::Path(path)) => { + let reader = LazyCsvReader::new(>::as_ref(&path)); + (Some(path), reader) + }, + Some(EF::Py(f)) => ( + None, + LazyCsvReader::new_sourced(ScanSource::Buffer(f.as_arc())), + ), + None => ( + Some( + paths + .first() + .cloned() + .ok_or_else(|| PyValueError::new_err("expected a path argument"))?, + ), + LazyCsvReader::new_paths(paths.into()), + ), + }; + #[cfg(feature = "cloud")] + if let Some(first_path) = first_path { let first_path_url = first_path.to_string_lossy(); - let mut cloud_options = if let Some(opts) = cloud_options { - parse_cloud_options(&first_path_url, opts)? - } else { - parse_cloud_options(&first_path_url, vec![])? 
- }; - - cloud_options = cloud_options.with_max_retries(retries); - + let mut cloud_options = + parse_cloud_options(&first_path_url, cloud_options.unwrap_or_default())?; if let Some(file_cache_ttl) = file_cache_ttl { cloud_options.file_cache_ttl = file_cache_ttl; } - - Some(cloud_options) - }; - - let r = if let Some(path) = path.as_ref() { - LazyCsvReader::new(path) - } else { - LazyCsvReader::new_paths(paths.into()) - }; + cloud_options = cloud_options.with_max_retries(retries); + r = r.with_cloud_options(Some(cloud_options)); + } let mut r = r .with_infer_schema_length(infer_schema_length) @@ -219,7 +228,6 @@ impl PyLazyFrame { .with_decimal_comma(decimal_comma) .with_glob(glob) .with_raise_if_empty(raise_if_empty) - .with_cloud_options(cloud_options) .with_include_file_paths(include_file_paths.map(|x| x.into())); if let Some(lambda) = with_schema_modify { @@ -276,70 +284,11 @@ impl PyLazyFrame { let parallel = parallel.0; let hive_schema = hive_schema.map(|s| Arc::new(s.0)); - use polars_plan::plans::ScanSource; - use EitherPythonFileOrPath as EF; - let use_first_path = path.is_some(); - let first_path = match path - .map(|py_f| get_either_file_or_path(py_f, false)) - .transpose()? - { - Some(EF::Path(path)) => path, - Some(EF::Py(f)) => { - let scan_source = ScanSource::Buffer(f.as_arc()); - - let row_index = row_index.map(|(name, offset)| RowIndex { - name: name.into(), - offset, - }); - - let args = ScanArgsParquet { - n_rows, - cache, - parallel, - rechunk, - row_index, - low_memory, - cloud_options: None, - use_statistics, - hive_options: HiveOptions { - enabled: hive_partitioning, - hive_start_idx: 0, - schema: hive_schema, - try_parse_dates: try_parse_hive_dates, - }, - glob, - include_file_paths: include_file_paths.map(|x| x.into()), - }; - - let lf = LazyFrame::scan_parquet_sourced(scan_source, args) - .map_err(PyPolarsErr::from)?; - return Ok(lf.into()); - }, - None => paths - .first() - .cloned() - .ok_or_else(|| PyValueError::new_err("expected a path argument"))?, - }; - - #[cfg(feature = "cloud")] - let cloud_options = { - let first_path_url = first_path.to_string_lossy(); - - let mut cloud_options = if let Some(opts) = cloud_options { - parse_cloud_options(&first_path_url, opts)? - } else { - parse_cloud_options(&first_path_url, vec![])? - }; - - cloud_options = cloud_options.with_max_retries(retries); - - Some(cloud_options) - }; - let row_index = row_index.map(|(name, offset)| RowIndex { name: name.into(), offset, }); + let hive_options = HiveOptions { enabled: hive_partitioning, hive_start_idx: 0, @@ -347,20 +296,48 @@ impl PyLazyFrame { try_parse_dates: try_parse_hive_dates, }; - let args = ScanArgsParquet { + let mut args = ScanArgsParquet { n_rows, cache, parallel, rechunk, row_index, low_memory, - cloud_options, + cloud_options: None, use_statistics, hive_options, glob, include_file_paths: include_file_paths.map(|x| x.into()), }; + use polars_plan::plans::ScanSource; + use EitherPythonFileOrPath as EF; + let use_first_path = path.is_some(); + let first_path = match path + .map(|py_f| get_either_file_or_path(py_f, false)) + .transpose()? 
+ { + Some(EF::Path(path)) => path, + Some(EF::Py(f)) => { + return LazyFrame::scan_parquet_sourced(ScanSource::Buffer(f.as_arc()), args) + .map(Self::from) + .map_err(PyPolarsErr::from) + .map_err(From::from); + }, + None => paths + .first() + .cloned() + .ok_or_else(|| PyValueError::new_err("expected a path argument"))?, + }; + + #[cfg(feature = "cloud")] + { + let first_path_url = first_path.to_string_lossy(); + let cloud_options = + parse_cloud_options(&first_path_url, cloud_options.unwrap_or_default())?; + args.cloud_options = Some(cloud_options.with_max_retries(retries)); + } + let lf = if use_first_path { LazyFrame::scan_parquet(first_path, args) } else { @@ -374,7 +351,7 @@ impl PyLazyFrame { #[staticmethod] #[pyo3(signature = (path, paths, n_rows, cache, rechunk, row_index, memory_map, cloud_options, hive_partitioning, hive_schema, try_parse_hive_dates, retries, file_cache_ttl, include_file_paths))] fn new_from_ipc( - path: Option, + path: Option, paths: Vec, n_rows: Option, cache: bool, @@ -389,38 +366,13 @@ impl PyLazyFrame { file_cache_ttl: Option, include_file_paths: Option, ) -> PyResult { + use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; + let row_index = row_index.map(|(name, offset)| RowIndex { name: name.into(), offset, }); - #[cfg(feature = "cloud")] - let cloud_options = { - let first_path = if let Some(path) = &path { - path - } else { - paths - .first() - .ok_or_else(|| PyValueError::new_err("expected a path argument"))? - }; - - let first_path_url = first_path.to_string_lossy(); - - let mut cloud_options = if let Some(opts) = cloud_options { - parse_cloud_options(&first_path_url, opts)? - } else { - parse_cloud_options(&first_path_url, vec![])? - }; - - cloud_options = cloud_options.with_max_retries(retries); - - if let Some(file_cache_ttl) = file_cache_ttl { - cloud_options.file_cache_ttl = file_cache_ttl; - } - - Some(cloud_options) - }; - let hive_options = HiveOptions { enabled: hive_partitioning, hive_start_idx: 0, @@ -428,20 +380,52 @@ impl PyLazyFrame { try_parse_dates: try_parse_hive_dates, }; - let args = ScanArgsIpc { + let mut args = ScanArgsIpc { n_rows, cache, rechunk, row_index, memory_map, #[cfg(feature = "cloud")] - cloud_options, + cloud_options: None, hive_options, include_file_paths: include_file_paths.map(|x| x.into()), }; - let lf = if let Some(path) = &path { - LazyFrame::scan_ipc(path, args) + use polars_plan::plans::ScanSource; + use EitherPythonFileOrPath as EF; + let use_first_path = path.is_some(); + let first_path = match path + .map(|py_f| get_either_file_or_path(py_f, false)) + .transpose()? 
+ { + Some(EF::Path(path)) => path, + Some(EF::Py(f)) => { + return LazyFrame::scan_ipc_sourced(ScanSource::Buffer(f.as_arc()), args) + .map(Self::from) + .map_err(PyPolarsErr::from) + .map_err(From::from); + }, + None => paths + .first() + .cloned() + .ok_or_else(|| PyValueError::new_err("expected a path argument"))?, + }; + + #[cfg(feature = "cloud")] + { + let first_path_url = first_path.to_string_lossy(); + + let mut cloud_options = + parse_cloud_options(&first_path_url, cloud_options.unwrap_or_default())?; + if let Some(file_cache_ttl) = file_cache_ttl { + cloud_options.file_cache_ttl = file_cache_ttl; + } + args.cloud_options = Some(cloud_options.with_max_retries(retries)); + } + + let lf = if use_first_path { + LazyFrame::scan_ipc(first_path, args) } else { LazyFrame::scan_ipc_files(paths.into(), args) } diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 3a27911d716e..b7b5c4764845 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -984,7 +984,7 @@ def read_csv_batched( @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_csv( - source: str | Path | list[str] | list[Path], + source: str | Path | list[str] | list[Path] | IO[str] | IO[bytes], *, has_header: bool = True, separator: str = ",", @@ -1232,6 +1232,8 @@ def with_column_names(cols: list[str]) -> list[str]: if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) + elif isinstance(source, (IO, BytesIO)): + pass else: source = [ normalize_filepath(source, check_not_directory=False) for source in source @@ -1276,7 +1278,7 @@ def with_column_names(cols: list[str]) -> list[str]: def _scan_csv_impl( - source: str | list[str] | list[Path], + source: str | list[str] | list[Path] | IO[str] | IO[bytes], *, has_header: bool = True, separator: str = ",", diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index 4443c31d513f..8f3c21bdf286 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -2,6 +2,7 @@ import contextlib import os +from io import BytesIO from pathlib import Path from typing import IO, TYPE_CHECKING, Any, Sequence @@ -346,7 +347,7 @@ def read_ipc_schema(source: str | Path | IO[bytes] | bytes) -> dict[str, DataTyp @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_ipc( - source: str | Path | list[str] | list[Path], + source: str | Path | list[str] | list[Path] | IO[str] | IO[bytes], *, n_rows: int | None = None, cache: bool = True, @@ -429,6 +430,8 @@ def scan_ipc( if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) sources = [] + elif isinstance(source, (IO, BytesIO)): + sources = [] else: sources = [ normalize_filepath(source, check_not_directory=False) for source in source diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index e8eccca53ccd..5482ccc52c42 100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -3,7 +3,7 @@ import contextlib from io import BytesIO, StringIO from pathlib import Path -from typing import TYPE_CHECKING, Any, Sequence +from typing import IO, TYPE_CHECKING, Any, Sequence from polars._utils.deprecation import deprecate_renamed_parameter from 
polars._utils.various import normalize_filepath @@ -166,7 +166,7 @@ def read_ndjson( @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_ndjson( - source: str | Path | list[str] | list[Path], + source: str | Path | list[str] | list[Path] | IO[str] | IO[bytes], *, schema: SchemaDefinition | None = None, schema_overrides: SchemaDefinition | None = None, @@ -250,6 +250,8 @@ def scan_ndjson( if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) sources = [] + elif isinstance(source, (IO, BytesIO)): + sources = [] else: sources = [ normalize_filepath(source, check_not_directory=False) for source in source diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 0fc52142e5de..ef01b24955b0 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -295,7 +295,7 @@ def read_parquet_schema(source: str | Path | IO[bytes] | bytes) -> dict[str, Dat @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_parquet( - source: str | Path | list[str] | list[Path] | io.BytesIO, + source: str | Path | list[str] | list[Path] | IO[str] | IO[bytes], *, n_rows: int | None = None, row_index_name: str | None = None, @@ -422,8 +422,8 @@ def scan_parquet( if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - elif isinstance(source, io.BytesIO): - pass + elif isinstance(source, (IO, BytesIO)): + sources = [] else: source = [ normalize_filepath(source, check_not_directory=False) for source in source @@ -450,7 +450,7 @@ def scan_parquet( def _scan_parquet_impl( - source: str | list[str] | list[Path] | io.BytesIO, + source: str | list[str] | list[Path] | IO[str] | IO[bytes], *, n_rows: int | None = None, cache: bool = True, From 5fb9ffa31b8b956cd2000b9dd03d14cee747cdb8 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 4 Sep 2024 16:05:31 +0200 Subject: [PATCH 04/27] working for all scans --- crates/polars-io/src/csv/read/parser.rs | 14 +- crates/polars-lazy/src/scan/csv.rs | 35 +- .../polars-lazy/src/scan/file_list_reader.rs | 10 +- crates/polars-lazy/src/scan/ipc.rs | 18 +- crates/polars-lazy/src/scan/ndjson.rs | 34 +- crates/polars-lazy/src/scan/parquet.rs | 18 +- .../src/executors/scan/csv.rs | 78 ++-- .../src/executors/scan/ipc.rs | 103 +++-- .../src/executors/scan/ndjson.rs | 128 ++++--- .../src/executors/scan/parquet.rs | 239 +++++------- crates/polars-mem-engine/src/planner/lp.rs | 2 +- .../polars-pipe/src/executors/sources/csv.rs | 6 +- .../src/executors/sources/parquet.rs | 6 +- crates/polars-plan/src/client/check.rs | 11 +- crates/polars-plan/src/plans/builder_dsl.rs | 20 +- .../src/plans/conversion/dsl_to_ir.rs | 114 +++--- .../polars-plan/src/plans/conversion/mod.rs | 7 +- .../polars-plan/src/plans/conversion/scans.rs | 359 +++++++++--------- .../polars-plan/src/plans/functions/count.rs | 166 ++++---- crates/polars-plan/src/plans/functions/mod.rs | 12 +- crates/polars-plan/src/plans/ir/dot.rs | 31 +- crates/polars-plan/src/plans/ir/format.rs | 18 +- crates/polars-plan/src/plans/ir/mod.rs | 253 +++++++----- crates/polars-plan/src/plans/mod.rs | 15 +- .../src/plans/optimizer/count_star.rs | 41 +- .../plans/optimizer/predicate_pushdown/mod.rs | 2 +- 
crates/polars-python/src/lazyframe/general.rs | 88 +++-- .../src/lazyframe/visitor/nodes.rs | 2 +- .../src/utils/late_materialized_df.rs | 4 +- py-polars/polars/io/parquet/functions.py | 2 +- 30 files changed, 965 insertions(+), 871 deletions(-) diff --git a/crates/polars-io/src/csv/read/parser.rs b/crates/polars-io/src/csv/read/parser.rs index 9d2852a02c82..ccda4805792b 100644 --- a/crates/polars-io/src/csv/read/parser.rs +++ b/crates/polars-io/src/csv/read/parser.rs @@ -1,9 +1,10 @@ -use std::path::PathBuf; +use std::path::Path; use memchr::memchr2_iter; use num_traits::Pow; use polars_core::prelude::*; use polars_core::{config, POOL}; +use polars_error::feature_gated; use polars_utils::index::Bounded; use polars_utils::slice::GetSaferUnchecked; use rayon::prelude::*; @@ -18,7 +19,7 @@ use crate::utils::maybe_decompress_bytes; /// Read the number of rows without parsing columns /// useful for count(*) queries pub fn count_rows( - path: &PathBuf, + path: &Path, separator: u8, quote_char: Option, comment_prefix: Option<&CommentPrefix>, @@ -26,18 +27,13 @@ pub fn count_rows( has_header: bool, ) -> PolarsResult { let file = if is_cloud_url(path) || config::force_async() { - #[cfg(feature = "cloud")] - { + feature_gated!("cloud", { crate::file_cache::FILE_CACHE .get_entry(path.to_str().unwrap()) // Safety: This was initialized by schema inference. .unwrap() .try_open_assume_latest()? - } - #[cfg(not(feature = "cloud"))] - { - panic!("required feature `cloud` is not enabled") - } + }) } else { polars_utils::open_file(path)? }; diff --git a/crates/polars-lazy/src/scan/csv.rs b/crates/polars-lazy/src/scan/csv.rs index e408681789c3..a8687aba3b8b 100644 --- a/crates/polars-lazy/src/scan/csv.rs +++ b/crates/polars-lazy/src/scan/csv.rs @@ -15,7 +15,7 @@ use crate::prelude::*; #[derive(Clone)] #[cfg(feature = "csv")] pub struct LazyCsvReader { - source: ScanSource, + sources: ScanSources, glob: bool, cache: bool, read_options: CsvReadOptions, @@ -35,13 +35,13 @@ impl LazyCsvReader { Self::new("").with_paths(paths) } - pub fn new_sourced(source: ScanSource) -> Self { - Self::new("").with_source(source) + pub fn new_sourced(sources: ScanSources) -> Self { + Self::new("").with_sources(sources) } pub fn new(path: impl AsRef) -> Self { LazyCsvReader { - source: ScanSource::Files([path.as_ref().to_path_buf()].into()), + sources: ScanSources::Files([path.as_ref().to_path_buf()].into()), glob: true, cache: true, read_options: Default::default(), @@ -253,8 +253,8 @@ impl LazyCsvReader { ) }; - let schema = match self.source.clone() { - ScanSource::Files(paths) => { + let schema = match self.sources.clone() { + ScanSources::Files(paths) => { // TODO: Path expansion should happen when converting to the IR // https://github.com/pola-rs/polars/issues/17634 let paths = expand_paths(&paths[..], self.glob(), self.cloud_options())?; @@ -266,9 +266,16 @@ impl LazyCsvReader { let mut file = polars_utils::open_file(path)?; infer_schema(get_reader_bytes(&mut file).expect("could not mmap file"))? }, - ScanSource::Buffer(buffer) => infer_schema( - get_reader_bytes(&mut std::io::Cursor::new(buffer)).expect("could not mmap file"), - )?, + ScanSources::Buffers(buffers) => { + let Some(buffer) = buffers.first() else { + polars_bail!(ComputeError: "no buffers specified for this reader"); + }; + + infer_schema( + get_reader_bytes(&mut std::io::Cursor::new(buffer)) + .expect("could not mmap file"), + )? 
+ }, }; self.read_options.n_threads = n_threads; @@ -294,7 +301,7 @@ impl LazyFileListReader for LazyCsvReader { /// Get the final [LazyFrame]. fn finish(self) -> PolarsResult { let mut lf: LazyFrame = DslBuilder::scan_csv( - self.source.to_dsl(false), + self.sources.to_dsl(false), self.read_options, self.cache, self.cloud_options, @@ -315,12 +322,12 @@ impl LazyFileListReader for LazyCsvReader { self.glob } - fn source(&self) -> &ScanSource { - &self.source + fn sources(&self) -> &ScanSources { + &self.sources } - fn with_source(mut self, source: ScanSource) -> Self { - self.source = source; + fn with_sources(mut self, sources: ScanSources) -> Self { + self.sources = sources; self } diff --git a/crates/polars-lazy/src/scan/file_list_reader.rs b/crates/polars-lazy/src/scan/file_list_reader.rs index 8992b8df5a65..b25cec6eda3b 100644 --- a/crates/polars-lazy/src/scan/file_list_reader.rs +++ b/crates/polars-lazy/src/scan/file_list_reader.rs @@ -19,8 +19,8 @@ pub trait LazyFileListReader: Clone { return self.finish_no_glob(); } - let ScanSource::Files(paths) = self.source() else { - unreachable!("Should never be globbed"); + let ScanSources::Files(paths) = self.sources() else { + unreachable!("in-memory buffers should never be globbed"); }; let lfs = paths @@ -83,16 +83,16 @@ pub trait LazyFileListReader: Clone { true } - fn source(&self) -> &ScanSource; + fn sources(&self) -> &ScanSources; /// Set paths of the scanned files. #[must_use] - fn with_source(self, source: ScanSource) -> Self; + fn with_sources(self, source: ScanSources) -> Self; /// Set paths of the scanned files. #[must_use] fn with_paths(self, paths: Arc<[PathBuf]>) -> Self { - self.with_source(ScanSource::Files(paths)) + self.with_sources(ScanSources::Files(paths)) } /// Configure the row limit. 
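The ScanSources plumbing above is what ultimately lets the Python-level scan functions accept in-memory file objects in addition to paths. As a rough sketch of the intended user-facing behaviour — assuming the py-polars changes later in this patch, where the scan_csv signature is widened to IO[str] | IO[bytes] — and not part of the patch itself:

    import io
    import polars as pl

    # CSV data held entirely in memory; no file path is involved.
    buf = io.BytesIO(b"a,b\n1,x\n2,y\n")
    lf = pl.scan_csv(buf)
    assert lf.collect().shape == (2, 2)
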
diff --git a/crates/polars-lazy/src/scan/ipc.rs b/crates/polars-lazy/src/scan/ipc.rs index 18043a15717a..fa11ef8e4455 100644 --- a/crates/polars-lazy/src/scan/ipc.rs +++ b/crates/polars-lazy/src/scan/ipc.rs @@ -37,14 +37,14 @@ impl Default for ScanArgsIpc { #[derive(Clone)] struct LazyIpcReader { args: ScanArgsIpc, - source: ScanSource, + sources: ScanSources, } impl LazyIpcReader { fn new(args: ScanArgsIpc) -> Self { Self { args, - source: ScanSource::default(), + sources: ScanSources::default(), } } } @@ -58,7 +58,7 @@ impl LazyFileListReader for LazyIpcReader { }; let mut lf: LazyFrame = DslBuilder::scan_ipc( - self.source.to_dsl(false), + self.sources.to_dsl(false), options, args.n_rows, args.cache, @@ -79,12 +79,12 @@ impl LazyFileListReader for LazyIpcReader { unreachable!() } - fn source(&self) -> &ScanSource { - &self.source + fn sources(&self) -> &ScanSources { + &self.sources } - fn with_source(mut self, source: ScanSource) -> Self { - self.source = source; + fn with_sources(mut self, sources: ScanSources) -> Self { + self.sources = sources; self } @@ -133,7 +133,7 @@ impl LazyFrame { LazyIpcReader::new(args).with_paths(paths).finish() } - pub fn scan_ipc_sourced(source: ScanSource, args: ScanArgsIpc) -> PolarsResult { - LazyIpcReader::new(args).with_source(source).finish() + pub fn scan_ipc_sourced(sources: ScanSources, args: ScanArgsIpc) -> PolarsResult { + LazyIpcReader::new(args).with_sources(sources).finish() } } diff --git a/crates/polars-lazy/src/scan/ndjson.rs b/crates/polars-lazy/src/scan/ndjson.rs index 9a1d071f8365..8d71d9a585a2 100644 --- a/crates/polars-lazy/src/scan/ndjson.rs +++ b/crates/polars-lazy/src/scan/ndjson.rs @@ -1,11 +1,11 @@ use std::num::NonZeroUsize; use std::path::{Path, PathBuf}; -use std::sync::{Arc, RwLock}; +use std::sync::{Arc, Mutex, RwLock}; use polars_core::prelude::*; use polars_io::cloud::CloudOptions; -use polars_io::RowIndex; -use polars_plan::plans::{DslPlan, FileScan, ScanSource}; +use polars_io::{HiveOptions, RowIndex}; +use polars_plan::plans::{DslPlan, FileScan, ScanSources}; use polars_plan::prelude::{FileScanOptions, NDJsonReadOptions}; use crate::prelude::LazyFrame; @@ -13,7 +13,7 @@ use crate::scan::file_list_reader::LazyFileListReader; #[derive(Clone)] pub struct LazyJsonLineReader { - pub(crate) source: ScanSource, + pub(crate) sources: ScanSources, pub(crate) batch_size: Option, pub(crate) low_memory: bool, pub(crate) rechunk: bool, @@ -29,12 +29,12 @@ pub struct LazyJsonLineReader { impl LazyJsonLineReader { pub fn new_paths(paths: Arc<[PathBuf]>) -> Self { - Self::new(PathBuf::new()).with_paths(paths) + Self::new_sourced(ScanSources::Files(paths)) } - pub fn new(path: impl AsRef) -> Self { + pub fn new_sourced(sources: ScanSources) -> Self { LazyJsonLineReader { - source: ScanSource::Files([path.as_ref().to_path_buf()].into()), + sources, batch_size: None, low_memory: false, rechunk: false, @@ -48,6 +48,10 @@ impl LazyJsonLineReader { cloud_options: None, } } + + pub fn new(path: impl AsRef) -> Self { + Self::new_sourced(ScanSources::Files([path.as_ref().to_path_buf()].into())) + } /// Add a row index column. 
#[must_use] pub fn with_row_index(mut self, row_index: Option) -> Self { @@ -124,7 +128,11 @@ impl LazyFileListReader for LazyJsonLineReader { row_index: self.row_index, rechunk: self.rechunk, file_counter: 0, - hive_options: Default::default(), + hive_options: { + let mut options = HiveOptions::default(); + options.enabled = Some(false); + options + }, glob: true, include_file_paths: self.include_file_paths, }; @@ -145,7 +153,7 @@ impl LazyFileListReader for LazyJsonLineReader { }; Ok(LazyFrame::from(DslPlan::Scan { - sources: self.source.to_dsl(false), + sources: Arc::new(Mutex::new(self.sources.to_dsl(false))), file_info: Arc::new(RwLock::new(None)), hive_parts: None, predicate: None, @@ -158,12 +166,12 @@ impl LazyFileListReader for LazyJsonLineReader { unreachable!(); } - fn source(&self) -> &ScanSource { - &self.source + fn sources(&self) -> &ScanSources { + &self.sources } - fn with_source(mut self, source: ScanSource) -> Self { - self.source = source; + fn with_sources(mut self, sources: ScanSources) -> Self { + self.sources = sources; self } diff --git a/crates/polars-lazy/src/scan/parquet.rs b/crates/polars-lazy/src/scan/parquet.rs index 491ae3ee126c..c198ccf690c1 100644 --- a/crates/polars-lazy/src/scan/parquet.rs +++ b/crates/polars-lazy/src/scan/parquet.rs @@ -44,14 +44,14 @@ impl Default for ScanArgsParquet { #[derive(Clone)] struct LazyParquetReader { args: ScanArgsParquet, - source: ScanSource, + sources: ScanSources, } impl LazyParquetReader { fn new(args: ScanArgsParquet) -> Self { Self { args, - source: ScanSource::default(), + sources: ScanSources::default(), } } } @@ -62,7 +62,7 @@ impl LazyFileListReader for LazyParquetReader { let row_index = self.args.row_index; let mut lf: LazyFrame = DslBuilder::scan_parquet( - self.source.to_dsl(false), + self.sources.to_dsl(false), self.args.n_rows, self.args.cache, self.args.parallel, @@ -95,12 +95,12 @@ impl LazyFileListReader for LazyParquetReader { unreachable!(); } - fn source(&self) -> &ScanSource { - &self.source + fn sources(&self) -> &ScanSources { + &self.sources } - fn with_source(mut self, source: ScanSource) -> Self { - self.source = source; + fn with_sources(mut self, sources: ScanSources) -> Self { + self.sources = sources; self } @@ -145,8 +145,8 @@ impl LazyFrame { } /// Create a LazyFrame directly from a parquet scan. - pub fn scan_parquet_sourced(source: ScanSource, args: ScanArgsParquet) -> PolarsResult { - LazyParquetReader::new(args).with_source(source).finish() + pub fn scan_parquet_sourced(sources: ScanSources, args: ScanArgsParquet) -> PolarsResult { + LazyParquetReader::new(args).with_sources(sources).finish() } /// Create a LazyFrame directly from a parquet scan. 
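scan_parquet_sourced and scan_ipc_sourced above are the Rust entry points used when the Python layer hands over a buffer rather than a path. A minimal illustrative sketch of what this is meant to enable once the corresponding py-polars signature changes are applied (again, not part of the patch):

    import io
    import polars as pl

    df = pl.DataFrame({"a": [1, 2, 3]})
    buf = io.BytesIO()
    df.write_parquet(buf)
    buf.seek(0)

    # scan_parquet can now read from the in-memory buffer directly.
    assert pl.scan_parquet(buf).collect().equals(df)
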
diff --git a/crates/polars-mem-engine/src/executors/scan/csv.rs b/crates/polars-mem-engine/src/executors/scan/csv.rs index 4ab5a034c584..b06386cdfa03 100644 --- a/crates/polars-mem-engine/src/executors/scan/csv.rs +++ b/crates/polars-mem-engine/src/executors/scan/csv.rs @@ -9,7 +9,7 @@ use polars_error::feature_gated; use super::*; pub struct CsvExec { - pub source: ScanSource, + pub sources: ScanSources, pub file_info: FileInfo, pub options: CsvReadOptions, pub file_options: FileScanOptions, @@ -18,7 +18,6 @@ pub struct CsvExec { impl CsvExec { fn read(&self) -> PolarsResult { - let paths = self.source.as_paths(); let with_columns = self .file_options .with_columns @@ -46,7 +45,7 @@ impl CsvExec { .with_row_index(None) .with_path::<&str>(None); - if paths.is_empty() { + if self.sources.is_empty() { let out = if let Some(schema) = options_base.schema { DataFrame::from_rows_and_schema(&[], schema.as_ref())? } else { @@ -57,7 +56,7 @@ impl CsvExec { let verbose = config::verbose(); let force_async = config::force_async(); - let run_async = force_async || is_cloud_url(paths.first().unwrap()); + let run_async = force_async || self.sources.is_cloud_url(); if force_async && verbose { eprintln!("ASYNC READING FORCED"); @@ -65,34 +64,45 @@ impl CsvExec { let finish_read = |i: usize, options: CsvReadOptions, predicate: Option>| { - let path = &paths[i]; - let file = if run_async { - feature_gated!("cloud", { - polars_io::file_cache::FILE_CACHE - .get_entry(path.to_str().unwrap()) - // Safety: This was initialized by schema inference. - .unwrap() - .try_open_assume_latest() - }) - } else { - polars_utils::open_file(path) - }?; - + let source = self.sources.at(i); let owned = &mut vec![]; - let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; - let mut df = options - .into_reader_with_file_handle(std::io::Cursor::new(maybe_decompress_bytes( - mmap.as_ref(), - owned, - )?)) - ._with_predicate(predicate.clone()) - .finish()?; + + let mut df = match source { + ScanSourceRef::File(path) => { + let file = if run_async { + feature_gated!("cloud", { + polars_io::file_cache::FILE_CACHE + .get_entry(path.to_str().unwrap()) + // Safety: This was initialized by schema inference. + .unwrap() + .try_open_assume_latest() + }) + } else { + polars_utils::open_file(path) + }?; + + let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; + options + .into_reader_with_file_handle(std::io::Cursor::new( + maybe_decompress_bytes(mmap.as_ref(), owned)?, + )) + ._with_predicate(predicate.clone()) + .finish()? + }, + ScanSourceRef::Buffer(buffer) => options + .into_reader_with_file_handle(std::io::Cursor::new(maybe_decompress_bytes( + buffer, owned, + )?)) + ._with_predicate(predicate.clone()) + .finish()?, + }; if let Some(col) = &self.file_options.include_file_paths { - let path = path.to_str().unwrap(); + let name = source.to_file_path(); + unsafe { df.with_column_unchecked( - StringChunked::full(col.clone(), path, df.height()).into_series(), + StringChunked::full(col.clone(), name, df.height()).into_series(), ) }; } @@ -110,14 +120,14 @@ impl CsvExec { } let mut n_rows_read = 0usize; - let mut out = Vec::with_capacity(paths.len()); + let mut out = Vec::with_capacity(self.sources.len()); // If we have n_rows or row_index then we need to count how many rows we read, so we need // to delay applying the predicate. 
let predicate_during_read = predicate .clone() .filter(|_| n_rows.is_none() && self.file_options.row_index.is_none()); - for i in 0..paths.len() { + for i in 0..self.sources.len() { let opts = options_base .clone() .with_row_index(self.file_options.row_index.clone().map(|mut ri| { @@ -162,10 +172,10 @@ impl CsvExec { if n_rows.is_some() && n_rows_read == n_rows.unwrap() { if verbose { eprintln!( - "reached n_rows = {} at file {} / {}", + "reached n_rows = {} at source {} / {}", n_rows.unwrap(), 1 + i, - paths.len() + self.sources.len() ) } break; @@ -190,10 +200,10 @@ impl CsvExec { let dfs = POOL.install(|| { let step = std::cmp::min(POOL.current_num_threads(), 128); - (0..paths.len()) + (0..self.sources.len()) .step_by(step) .map(|start| { - (start..std::cmp::min(start.saturating_add(step), paths.len())) + (start..std::cmp::min(start.saturating_add(step), self.sources.len())) .into_par_iter() .map(|i| finish_read(i, options_base.clone(), predicate.clone())) .collect::>>() @@ -222,7 +232,7 @@ impl CsvExec { impl Executor for CsvExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { let profile_name = if state.has_node_timer() { - let mut ids = vec![self.source.id()]; + let mut ids = vec![self.sources.id()]; if self.predicate.is_some() { ids.push("predicate".into()) } diff --git a/crates/polars-mem-engine/src/executors/scan/ipc.rs b/crates/polars-mem-engine/src/executors/scan/ipc.rs index b9387cad5878..ae1e3bcf30f2 100644 --- a/crates/polars-mem-engine/src/executors/scan/ipc.rs +++ b/crates/polars-mem-engine/src/executors/scan/ipc.rs @@ -1,6 +1,7 @@ use hive::HivePartitions; use polars_core::config; use polars_core::utils::accumulate_dataframes_vertical; +use polars_error::feature_gated; use polars_io::cloud::CloudOptions; use polars_io::path_utils::is_cloud_url; use polars_io::predicates::apply_predicate; @@ -9,7 +10,7 @@ use rayon::prelude::*; use super::*; pub struct IpcExec { - pub(crate) sources: ScanSource, + pub(crate) sources: ScanSources, pub(crate) file_info: FileInfo, pub(crate) predicate: Option>, pub(crate) options: IpcScanOptions, @@ -20,24 +21,20 @@ pub struct IpcExec { impl IpcExec { fn read(&mut self) -> PolarsResult { - let paths = self.sources.as_paths(); - let is_cloud = paths.iter().any(is_cloud_url); + let is_cloud = match &self.sources { + ScanSources::Files(paths) => paths.iter().any(is_cloud_url), + ScanSources::Buffers(_) => false, + }; let force_async = config::force_async(); let mut out = if is_cloud || force_async { - #[cfg(not(feature = "cloud"))] - { - panic!("activate cloud feature") - } - - #[cfg(feature = "cloud")] - { + feature_gated!("cloud", { if force_async && config::verbose() { eprintln!("ASYNC READING FORCED"); } polars_io::pl_async::get_runtime().block_on_potential_spawn(self.read_async())? - } + }) } else { self.read_sync()? 
}; @@ -49,11 +46,10 @@ impl IpcExec { Ok(out) } - fn read_impl PolarsResult + Send + Sync>( + fn read_impl( &mut self, - path_idx_to_file: F, + idx_to_cached_file: impl Fn(usize) -> Option> + Send + Sync, ) -> PolarsResult { - let paths = self.sources.as_paths(); if config::verbose() { eprintln!("executing ipc read sync with row_index = {:?}, n_rows = {:?}, predicate = {:?} for paths {:?}", self.file_options.row_index.as_ref(), @@ -62,7 +58,7 @@ impl IpcExec { x.1 }).as_ref(), self.predicate.is_some(), - paths + self.sources, ); } @@ -73,33 +69,60 @@ impl IpcExec { self.file_options.row_index.is_some(), ); - let read_path = |path_index: usize, n_rows: Option| { - IpcReader::new(path_idx_to_file(path_index)?) - .with_n_rows(n_rows) - .with_row_index(self.file_options.row_index.clone()) - .with_projection(projection.clone()) - .with_hive_partition_columns( - self.hive_parts - .as_ref() - .map(|x| x[path_index].materialize_partition_columns()), - ) - .with_include_file_path(self.file_options.include_file_paths.as_ref().map(|x| { - ( - x.clone(), - Arc::from(paths[path_index].to_str().unwrap().to_string()), + let read_path = |index: usize, n_rows: Option| { + let source = self.sources.at(index); + + match source { + ScanSourceRef::File(path) => { + let file = match idx_to_cached_file(index) { + None => std::fs::File::open(path)?, + Some(f) => f?, + }; + + IpcReader::new(file) + .with_n_rows(n_rows) + .with_row_index(self.file_options.row_index.clone()) + .with_projection(projection.clone()) + .with_hive_partition_columns( + self.hive_parts + .as_ref() + .map(|x| x[index].materialize_partition_columns()), + ) + .with_include_file_path( + self.file_options + .include_file_paths + .as_ref() + .map(|x| (x.clone(), Arc::from(source.to_file_path()))), + ) + .memory_mapped(self.options.memory_map.then(|| path.to_path_buf())) + .finish() + }, + ScanSourceRef::Buffer(buff) => IpcReader::new(std::io::Cursor::new(buff)) + .with_n_rows(n_rows) + .with_row_index(self.file_options.row_index.clone()) + .with_projection(projection.clone()) + .with_hive_partition_columns( + self.hive_parts + .as_ref() + .map(|x| x[index].materialize_partition_columns()), ) - })) - .memory_mapped(self.options.memory_map.then(|| paths[path_index].clone())) - .finish() + .with_include_file_path( + self.file_options + .include_file_paths + .as_ref() + .map(|x| (x.clone(), Arc::from(source.to_file_path()))), + ) + .finish(), + } }; let mut dfs = if let Some(mut n_rows) = self.file_options.slice.map(|x| { assert_eq!(x.0, 0); x.1 }) { - let mut out = Vec::with_capacity(paths.len()); + let mut out = Vec::with_capacity(self.sources.len()); - for i in 0..paths.len() { + for i in 0..self.sources.len() { let df = read_path(i, Some(n_rows))?; let df_height = df.height(); out.push(df); @@ -117,7 +140,7 @@ impl IpcExec { out } else { POOL.install(|| { - (0..paths.len()) + (0..self.sources.len()) .into_par_iter() .map(|i| read_path(i, None)) .collect::>>() @@ -153,9 +176,7 @@ impl IpcExec { } fn read_sync(&mut self) -> PolarsResult { - let paths = self.sources.into_paths(); - let paths = paths.clone(); - self.read_impl(move |i| std::fs::File::open(&paths[i]).map_err(Into::into)) + self.read_impl(|_| None) } #[cfg(feature = "cloud")] @@ -176,17 +197,15 @@ impl IpcExec { self.cloud_options.as_ref(), )?; - self.read_impl(move |i| cache_entries[i].try_open_check_latest()) + self.read_impl(|i| Some(cache_entries[i].try_open_check_latest())) }) } } impl Executor for IpcExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { - let 
paths = self.sources.as_paths(); - let profile_name = if state.has_node_timer() { - let mut ids = vec![PlSmallStr::from_str(paths[0].to_string_lossy().as_ref())]; + let mut ids = vec![self.sources.id()]; if self.predicate.is_some() { ids.push("predicate".into()) } diff --git a/crates/polars-mem-engine/src/executors/scan/ndjson.rs b/crates/polars-mem-engine/src/executors/scan/ndjson.rs index 68ad24ab837e..27aab29fd0c1 100644 --- a/crates/polars-mem-engine/src/executors/scan/ndjson.rs +++ b/crates/polars-mem-engine/src/executors/scan/ndjson.rs @@ -1,10 +1,11 @@ use polars_core::config; use polars_core::utils::accumulate_dataframes_vertical; +use polars_error::feature_gated; use super::*; pub struct JsonExec { - sources: ScanSource, + sources: ScanSources, options: NDJsonReadOptions, file_scan_options: FileScanOptions, file_info: FileInfo, @@ -13,7 +14,7 @@ pub struct JsonExec { impl JsonExec { pub fn new( - sources: ScanSource, + sources: ScanSources, options: NDJsonReadOptions, file_scan_options: FileScanOptions, file_info: FileInfo, @@ -36,11 +37,10 @@ impl JsonExec { .unwrap() .as_ref() .unwrap_right(); - let paths = self.sources.as_paths(); let verbose = config::verbose(); let force_async = config::force_async(); - let run_async = force_async || is_cloud_url(paths.first().unwrap()); + let run_async = force_async || self.sources.is_cloud_url(); if force_async && verbose { eprintln!("ASYNC READING FORCED"); @@ -65,59 +65,80 @@ impl JsonExec { return Ok(df); } - let dfs = paths + let dfs = self + .sources .iter() - .map_while(|p| { + .map_while(|source| { if n_rows == Some(0) { return None; } - let file = if run_async { - #[cfg(feature = "cloud")] - { - match polars_io::file_cache::FILE_CACHE - .get_entry(p.to_str().unwrap()) - // Safety: This was initialized by schema inference. - .unwrap() - .try_open_assume_latest() - { - Ok(v) => v, - Err(e) => return Some(Err(e)), - } - } - #[cfg(not(feature = "cloud"))] - { - panic!("required feature `cloud` is not enabled") - } - } else { - match polars_utils::open_file(p.as_ref()) { - Ok(v) => v, - Err(e) => return Some(Err(e)), - } - }; - - let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; - let owned = &mut vec![]; - let curs = - std::io::Cursor::new(match maybe_decompress_bytes(mmap.as_ref(), owned) { - Ok(v) => v, - Err(e) => return Some(Err(e)), - }); - let reader = JsonLineReader::new(curs); - let row_index = self.file_scan_options.row_index.as_mut(); - let df = reader - .with_schema(schema.clone()) - .with_rechunk(self.file_scan_options.rechunk) - .with_chunk_size(Some(self.options.chunk_size)) - .with_row_index(row_index) - .with_predicate(self.predicate.clone().map(phys_expr_to_io_expr)) - .with_projection(self.file_scan_options.with_columns.clone()) - .low_memory(self.options.low_memory) - .with_n_rows(n_rows) - .with_ignore_errors(self.options.ignore_errors) - .finish(); + let owned = &mut vec![]; + let df = match source { + ScanSourceRef::File(path) => { + let file = if run_async { + feature_gated!("cloud", { + match polars_io::file_cache::FILE_CACHE + .get_entry(path.to_str().unwrap()) + // Safety: This was initialized by schema inference. 
+ .unwrap() + .try_open_assume_latest() + { + Ok(v) => v, + Err(e) => return Some(Err(e)), + } + }) + } else { + match polars_utils::open_file(path) { + Ok(v) => v, + Err(e) => return Some(Err(e)), + } + }; + + let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; + let curs = std::io::Cursor::new( + match maybe_decompress_bytes(mmap.as_ref(), owned) { + Ok(v) => v, + Err(e) => return Some(Err(e)), + }, + ); + let reader = JsonLineReader::new(curs); + + reader + .with_schema(schema.clone()) + .with_rechunk(self.file_scan_options.rechunk) + .with_chunk_size(Some(self.options.chunk_size)) + .with_row_index(row_index) + .with_predicate(self.predicate.clone().map(phys_expr_to_io_expr)) + .with_projection(self.file_scan_options.with_columns.clone()) + .low_memory(self.options.low_memory) + .with_n_rows(n_rows) + .with_ignore_errors(self.options.ignore_errors) + .finish() + }, + ScanSourceRef::Buffer(buff) => { + let curs = + std::io::Cursor::new(match maybe_decompress_bytes(buff, owned) { + Ok(v) => v, + Err(e) => return Some(Err(e)), + }); + let reader = JsonLineReader::new(curs); + + reader + .with_schema(schema.clone()) + .with_rechunk(self.file_scan_options.rechunk) + .with_chunk_size(Some(self.options.chunk_size)) + .with_row_index(row_index) + .with_predicate(self.predicate.clone().map(phys_expr_to_io_expr)) + .with_projection(self.file_scan_options.with_columns.clone()) + .low_memory(self.options.low_memory) + .with_n_rows(n_rows) + .with_ignore_errors(self.options.ignore_errors) + .finish() + }, + }; let mut df = match df { Ok(df) => df, @@ -129,10 +150,10 @@ impl JsonExec { } if let Some(col) = &self.file_scan_options.include_file_paths { - let path = p.to_str().unwrap(); + let name = source.to_file_path(); unsafe { df.with_column_unchecked( - StringChunked::full(col.clone(), path, df.height()).into_series(), + StringChunked::full(col.clone(), name, df.height()).into_series(), ) }; } @@ -147,9 +168,8 @@ impl JsonExec { impl Executor for JsonExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { - let paths = self.sources.as_paths(); let profile_name = if state.has_node_timer() { - let ids = vec![paths[0].to_string_lossy().clone()]; + let ids = vec![self.sources.id()]; let name = comma_delimited("ndjson".to_string(), &ids); Cow::Owned(name) } else { diff --git a/crates/polars-mem-engine/src/executors/scan/parquet.rs b/crates/polars-mem-engine/src/executors/scan/parquet.rs index ed740809fcd3..bb47eb458a49 100644 --- a/crates/polars-mem-engine/src/executors/scan/parquet.rs +++ b/crates/polars-mem-engine/src/executors/scan/parquet.rs @@ -6,14 +6,13 @@ use polars_core::utils::accumulate_dataframes_vertical; use polars_error::feature_gated; use polars_io::cloud::CloudOptions; use polars_io::parquet::metadata::FileMetaDataRef; -use polars_io::path_utils::is_cloud_url; use polars_io::utils::slice::split_slice_at_file; use polars_io::RowIndex; use super::*; pub struct ParquetExec { - source: ScanSource, + sources: ScanSources, file_info: FileInfo, hive_parts: Option>>, predicate: Option>, @@ -28,7 +27,7 @@ pub struct ParquetExec { impl ParquetExec { #[allow(clippy::too_many_arguments)] pub(crate) fn new( - source: ScanSource, + sources: ScanSources, file_info: FileInfo, hive_parts: Option>>, predicate: Option>, @@ -38,7 +37,7 @@ impl ParquetExec { metadata: Option, ) -> Self { ParquetExec { - source, + sources, file_info, hive_parts, predicate, @@ -51,7 +50,7 @@ impl ParquetExec { fn read_par(&mut self) -> PolarsResult> { let parallel = match self.options.parallel { - 
ParallelStrategy::Auto if self.source.num_sources() > POOL.current_num_threads() => { + ParallelStrategy::Auto if self.sources.len() > POOL.current_num_threads() => { ParallelStrategy::RowGroups }, identity => identity, @@ -63,78 +62,53 @@ impl ParquetExec { let slice_info = match self.file_options.slice { None => ScanSourceSliceInfo { item_slice: 0..usize::MAX, - source_slice: 0..self.source.num_sources(), + source_slice: 0..self.sources.len(), + }, + Some(slice) => { + self.sources + .collect_slice_information(slice, |source| match source { + ScanSourceRef::File(path) => { + ParquetReader::new(std::fs::File::open(path)?).num_rows() + }, + ScanSourceRef::Buffer(buff) => { + ParquetReader::new(std::io::Cursor::new(buff)).num_rows() + }, + })? }, - Some(slice) => self.source.collect_slice_information( - slice, - |path| ParquetReader::new(std::fs::File::open(path)?).num_rows(), - |buff| ParquetReader::new(std::io::Cursor::new(buff)).num_rows(), - )?, }; - match &self.source { - ScanSource::Buffer(buffer) => { - let row_index = self.file_options.row_index.take(); + let mut current_offset = 0; + let base_row_index = self.file_options.row_index.take(); + // Limit no. of files at a time to prevent open file limits. + + let paths = self.sources.as_paths(); + + for i in slice_info.source_slice.step_by(step) { + let end = std::cmp::min(i.saturating_add(step), paths.len()); + let hive_parts = self.hive_parts.as_ref().map(|x| &x[i..end]); + + if current_offset >= slice_info.item_slice.end && !result.is_empty() { + return Ok(result); + } + + // First initialize the readers, predicates and metadata. + // This will be used to determine the slices. That way we can actually read all the + // files in parallel even if we add row index columns or slices. + let iter = (0..self.sources.len()).into_par_iter().map(|i| { + let source = self.sources.at(i); + let hive_partitions = hive_parts.map(|x| x[i].materialize_partition_columns()); + let (projection, predicate) = prepare_scan_args( self.predicate.clone(), &mut self.file_options.with_columns.clone(), &mut self.file_info.schema.clone(), - row_index.is_some(), - None, + base_row_index.is_some(), + hive_partitions.as_deref(), ); - result = vec![ParquetReader::new(std::io::Cursor::new(buffer)) - .read_parallel(parallel) - .set_low_memory(self.options.low_memory) - .use_statistics(self.options.use_statistics) - .set_rechunk(false) - .with_slice(Some(( - slice_info.item_slice.start, - slice_info.item_slice.len(), - ))) - .with_row_index(row_index) - .with_predicate(predicate.clone()) - .with_projection(projection.clone()) - .check_schema( - self.file_info - .reader_schema - .clone() - .unwrap() - .unwrap_left() - .as_ref(), - )? - .finish()?]; - }, - ScanSource::Files(paths) => { - let mut current_offset = 0; - let base_row_index = self.file_options.row_index.take(); - // Limit no. of files at a time to prevent open file limits. - - for i in slice_info.source_slice.step_by(step) { - let end = std::cmp::min(i.saturating_add(step), paths.len()); - let paths = &paths[i..end]; - let hive_parts = self.hive_parts.as_ref().map(|x| &x[i..end]); - - if current_offset >= slice_info.item_slice.end && !result.is_empty() { - return Ok(result); - } - - // First initialize the readers, predicates and metadata. - // This will be used to determine the slices. That way we can actually read all the - // files in parallel even if we add row index columns or slices. 
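For orientation, the slice handling referred to in the comment above boils down to this: once every source's row count is known, the single global slice is translated into one (offset, length) pair per source, after which all sources can be read fully in parallel. A rough standalone sketch of that translation follows; the helper name `per_source_slices` is hypothetical and not part of this patch, which does the equivalent work per file via `split_slice_at_file`.

fn per_source_slices(row_counts: &[usize], start: usize, end: usize) -> Vec<(usize, usize)> {
    // Running row offset of the current source within the concatenated result.
    let mut offset = 0;
    row_counts
        .iter()
        .map(|&num_rows| {
            // Clamp the global [start, end) row range into this source's local range.
            let local_start = start.saturating_sub(offset).min(num_rows);
            let local_end = end.saturating_sub(offset).min(num_rows);
            offset += num_rows;
            (local_start, local_end - local_start)
        })
        .collect()
}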
- let iter = (0..paths.len()).into_par_iter().map(|i| { - let path = &paths[i]; - let hive_partitions = - hive_parts.map(|x| x[i].materialize_partition_columns()); - + match source { + ScanSourceRef::File(path) => { let file = std::fs::File::open(path)?; - let (projection, predicate) = prepare_scan_args( - self.predicate.clone(), - &mut self.file_options.with_columns.clone(), - &mut self.file_info.schema.clone(), - base_row_index.is_some(), - hive_partitions.as_deref(), - ); let mut reader = ParquetReader::new(file) .read_parallel(parallel) @@ -152,68 +126,68 @@ impl ParquetExec { reader .num_rows() .map(|num_rows| (reader, num_rows, predicate, projection)) - }); - - // We do this in parallel because wide tables can take a long time deserializing metadata. - let readers_and_metadata = - POOL.install(|| iter.collect::>>())?; - - let current_offset_ref = &mut current_offset; - let row_statistics = readers_and_metadata - .iter() - .map(|(_, num_rows, _, _)| { - let cum_rows = *current_offset_ref; - ( - cum_rows, - split_slice_at_file( - current_offset_ref, - *num_rows, - slice_info.item_slice.start, - slice_info.item_slice.end, - ), - ) - }) - .collect::>(); - - let out = POOL.install(|| { - readers_and_metadata - .into_par_iter() - .zip(row_statistics.into_par_iter()) - .map( - |((reader, _, predicate, projection), (cumulative_read, slice))| { - let row_index = base_row_index.as_ref().map(|rc| RowIndex { - name: rc.name.clone(), - offset: rc.offset + cumulative_read as IdxSize, - }); - - let df = reader - .with_slice(Some(slice)) - .with_row_index(row_index) - .with_predicate(predicate.clone()) - .with_projection(projection.clone()) - .check_schema( - self.file_info - .reader_schema - .clone() - .unwrap() - .unwrap_left() - .as_ref(), - )? - .finish()?; - - Ok(df) - }, - ) - .collect::>>() - })?; - - if result.is_empty() { - result = out; - } else { - result.extend_from_slice(&out) - } + }, + ScanSourceRef::Buffer(_) => todo!(), } - }, + }); + + // We do this in parallel because wide tables can take a long time deserializing metadata. + let readers_and_metadata = POOL.install(|| iter.collect::>>())?; + + let current_offset_ref = &mut current_offset; + let row_statistics = readers_and_metadata + .iter() + .map(|(_, num_rows, _, _)| { + let cum_rows = *current_offset_ref; + ( + cum_rows, + split_slice_at_file( + current_offset_ref, + *num_rows, + slice_info.item_slice.start, + slice_info.item_slice.end, + ), + ) + }) + .collect::>(); + + let out = POOL.install(|| { + readers_and_metadata + .into_par_iter() + .zip(row_statistics.into_par_iter()) + .map( + |((reader, _, predicate, projection), (cumulative_read, slice))| { + let row_index = base_row_index.as_ref().map(|rc| RowIndex { + name: rc.name.clone(), + offset: rc.offset + cumulative_read as IdxSize, + }); + + let df = reader + .with_slice(Some(slice)) + .with_row_index(row_index) + .with_predicate(predicate.clone()) + .with_projection(projection.clone()) + .check_schema( + self.file_info + .reader_schema + .clone() + .unwrap() + .unwrap_left() + .as_ref(), + )? 
+ .finish()?; + + Ok(df) + }, + ) + .collect::>>() + })?; + + if result.is_empty() { + result = out; + } else { + result.extend_from_slice(&out) + } } Ok(result) @@ -226,7 +200,7 @@ impl ParquetExec { use polars_io::utils::slice::split_slice_at_file; let verbose = verbose(); - let paths = self.source.into_paths(); + let paths = self.sources.into_paths(); let first_metadata = &self.metadata; let cloud_options = self.cloud_options.as_ref(); @@ -443,10 +417,7 @@ impl ParquetExec { .and_then(|_| self.predicate.take()) .map(phys_expr_to_io_expr); - let is_cloud = match &self.source { - ScanSource::Files(paths) => is_cloud_url(paths.first().unwrap()), - ScanSource::Buffer(_) => false, - }; + let is_cloud = self.sources.is_cloud_url(); let force_async = config::force_async(); let out = if is_cloud || force_async { @@ -475,7 +446,7 @@ impl ParquetExec { impl Executor for ParquetExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { let profile_name = if state.has_node_timer() { - let mut ids = vec![self.source.id()]; + let mut ids = vec![self.sources.id()]; if self.predicate.is_some() { ids.push("predicate".into()) } diff --git a/crates/polars-mem-engine/src/planner/lp.rs b/crates/polars-mem-engine/src/planner/lp.rs index 1f161a34587a..45487f7b7024 100644 --- a/crates/polars-mem-engine/src/planner/lp.rs +++ b/crates/polars-mem-engine/src/planner/lp.rs @@ -306,7 +306,7 @@ fn create_physical_plan_impl( match scan_type { #[cfg(feature = "csv")] FileScan::Csv { options, .. } => Ok(Box::new(executors::CsvExec { - source: sources, + sources, file_info, options, predicate, diff --git a/crates/polars-pipe/src/executors/sources/csv.rs b/crates/polars-pipe/src/executors/sources/csv.rs index 5ca5551c506d..673848e67d77 100644 --- a/crates/polars-pipe/src/executors/sources/csv.rs +++ b/crates/polars-pipe/src/executors/sources/csv.rs @@ -4,7 +4,7 @@ use polars_core::{config, POOL}; use polars_io::csv::read::{BatchedCsvReader, CsvReadOptions, CsvReader}; use polars_io::path_utils::is_cloud_url; use polars_plan::global::_set_n_rows_for_scan; -use polars_plan::plans::ScanSource; +use polars_plan::plans::ScanSources; use polars_plan::prelude::FileScanOptions; use polars_utils::itertools::Itertools; @@ -20,7 +20,7 @@ pub(crate) struct CsvSource { batched_reader: Option>, reader: Option>, n_threads: usize, - sources: ScanSource, + sources: ScanSources, options: Option, file_options: FileScanOptions, verbose: bool, @@ -141,7 +141,7 @@ impl CsvSource { } pub(crate) fn new( - sources: ScanSource, + sources: ScanSources, schema: SchemaRef, options: CsvReadOptions, file_options: FileScanOptions, diff --git a/crates/polars-pipe/src/executors/sources/parquet.rs b/crates/polars-pipe/src/executors/sources/parquet.rs index ab5abbade817..e91eb2ec1bba 100644 --- a/crates/polars-pipe/src/executors/sources/parquet.rs +++ b/crates/polars-pipe/src/executors/sources/parquet.rs @@ -20,7 +20,7 @@ use polars_io::prelude::materialize_projection; use polars_io::prelude::ParquetAsyncReader; use polars_io::utils::slice::split_slice_at_file; use polars_io::SerReader; -use polars_plan::plans::{FileInfo, ScanSource}; +use polars_plan::plans::{FileInfo, ScanSources}; use polars_plan::prelude::hive::HivePartitions; use polars_plan::prelude::FileScanOptions; use polars_utils::itertools::Itertools; @@ -36,7 +36,7 @@ pub struct ParquetSource { processed_paths: usize, processed_rows: AtomicUsize, iter: Range, - sources: ScanSource, + sources: ScanSources, options: ParquetOptions, file_options: FileScanOptions, #[allow(dead_code)] 
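The executors above are all moved onto the same ScanSources abstraction that this patch introduces further down: a collection that is either a set of paths or a set of in-memory buffers, iterated as ScanSourceRef values. Below is a minimal usage sketch of that abstraction; the import path and re-exports are assumptions, and error handling is omitted.

use std::path::PathBuf;
use std::sync::Arc;

use polars_plan::plans::{ScanSourceRef, ScanSources};

fn describe(sources: &ScanSources) {
    for source in sources.iter() {
        match source {
            // A source is either a path on disk ...
            ScanSourceRef::File(path) => println!("file source: {}", path.display()),
            // ... or raw bytes held in memory (e.g. coming from Python's BytesIO).
            ScanSourceRef::Buffer(bytes) => println!("in-memory source: {} bytes", bytes.len()),
        }
    }
}

fn main() {
    let from_paths = ScanSources::Files([PathBuf::from("data.parquet")].into());
    let bytes: Arc<[u8]> = Arc::from(b"not a real parquet file".as_slice());
    let from_buffers = ScanSources::Buffers([bytes].into());
    describe(&from_paths);
    describe(&from_buffers);
}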
@@ -246,7 +246,7 @@ impl ParquetSource { #[allow(unused_variables)] #[allow(clippy::too_many_arguments)] pub(crate) fn new( - sources: ScanSource, + sources: ScanSources, options: ParquetOptions, cloud_options: Option, metadata: Option, diff --git a/crates/polars-plan/src/client/check.rs b/crates/polars-plan/src/client/check.rs index c7070d22ed0c..1f5562bb4670 100644 --- a/crates/polars-plan/src/client/check.rs +++ b/crates/polars-plan/src/client/check.rs @@ -2,7 +2,7 @@ use polars_core::error::{polars_err, PolarsResult}; use polars_io::path_utils::is_cloud_url; use crate::plans::options::SinkType; -use crate::plans::{DslPlan, DslScanSource, FileScan}; +use crate::plans::{DslPlan, FileScan, ScanSources}; /// Assert that the given [`DslPlan`] is eligible to be executed on Polars Cloud. pub(super) fn assert_cloud_eligible(dsl: &DslPlan) -> PolarsResult<()> { @@ -13,13 +13,14 @@ pub(super) fn assert_cloud_eligible(dsl: &DslPlan) -> PolarsResult<()> { DslPlan::Scan { sources, scan_type, .. } => { - match sources { - DslScanSource::File(file) => { - if file.lock().unwrap().paths.iter().any(|p| !is_cloud_url(p)) { + let sources_lock = sources.lock().unwrap(); + match &sources_lock.sources { + ScanSources::Files(paths) => { + if paths.iter().any(|p| !is_cloud_url(p)) { return ineligible_error("contains scan of local file system"); } }, - DslScanSource::Buffer(_) => { + ScanSources::Buffers(_) => { return ineligible_error("contains scan of in-memory buffer"); }, } diff --git a/crates/polars-plan/src/plans/builder_dsl.rs b/crates/polars-plan/src/plans/builder_dsl.rs index 1170f95ec7a2..7efa55417509 100644 --- a/crates/polars-plan/src/plans/builder_dsl.rs +++ b/crates/polars-plan/src/plans/builder_dsl.rs @@ -1,4 +1,4 @@ -use std::sync::{Arc, RwLock}; +use std::sync::{Arc, Mutex, RwLock}; use polars_core::prelude::*; #[cfg(any(feature = "parquet", feature = "ipc", feature = "csv"))] @@ -58,7 +58,10 @@ impl DslBuilder { }; Ok(DslPlan::Scan { - sources: DslScanSource::Buffer(Arc::default()), + sources: Arc::new(Mutex::new(DslScanSources { + sources: ScanSources::Buffers(Arc::default()), + is_expanded: true, + })), file_info: Arc::new(RwLock::new(Some(file_info))), hive_parts: None, predicate: None, @@ -77,7 +80,7 @@ impl DslBuilder { #[cfg(feature = "parquet")] #[allow(clippy::too_many_arguments)] pub fn scan_parquet( - source: DslScanSource, + sources: DslScanSources, n_rows: Option, cache: bool, parallel: polars_io::parquet::read::ParallelStrategy, @@ -102,8 +105,7 @@ impl DslBuilder { include_file_paths, }; Ok(DslPlan::Scan { - // @FIX: sources -> source - sources: source, + sources: Arc::new(Mutex::new(sources)), file_info: Arc::new(RwLock::new(None)), hive_parts: None, predicate: None, @@ -124,7 +126,7 @@ impl DslBuilder { #[cfg(feature = "ipc")] #[allow(clippy::too_many_arguments)] pub fn scan_ipc( - source: DslScanSource, + sources: DslScanSources, options: IpcScanOptions, n_rows: Option, cache: bool, @@ -135,7 +137,7 @@ impl DslBuilder { include_file_paths: Option, ) -> PolarsResult { Ok(DslPlan::Scan { - sources: source, + sources: Arc::new(Mutex::new(sources)), file_info: Arc::new(RwLock::new(None)), hive_parts: None, file_options: FileScanOptions { @@ -162,7 +164,7 @@ impl DslBuilder { #[allow(clippy::too_many_arguments)] #[cfg(feature = "csv")] pub fn scan_csv( - source: DslScanSource, + sources: DslScanSources, read_options: CsvReadOptions, cache: bool, cloud_options: Option, @@ -188,7 +190,7 @@ impl DslBuilder { include_file_paths, }; Ok(DslPlan::Scan { - sources: source, + sources: 
Arc::new(Mutex::new(sources)), file_info: Arc::new(RwLock::new(None)), hive_parts: None, file_options: options, diff --git a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs index 72e75d2b3017..7966d6ff688e 100644 --- a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs +++ b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs @@ -105,21 +105,16 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult let v = match lp { DslPlan::Scan { - mut sources, + sources, file_info, hive_parts, predicate, mut file_options, mut scan_type, } => { - sources.expand_paths(&mut scan_type, &mut file_options)?; - - let source = match sources { - DslScanSource::File(paths) => { - ScanSource::Files(paths.as_ref().lock().unwrap().paths.clone()) - }, - DslScanSource::Buffer(buf) => ScanSource::Buffer(buf), - }; + let mut sources_lock = sources.lock().unwrap(); + sources_lock.expand_paths(&mut scan_type, &mut file_options)?; + let sources = sources_lock.sources.clone(); let file_info_read = file_info.read().unwrap(); @@ -146,7 +141,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult .. } => { let (file_info, md) = scans::parquet_file_info( - &source, + &sources, &file_options, cloud_options.as_ref(), ) @@ -160,12 +155,9 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult metadata, .. } => { - let (file_info, md) = scans::ipc_file_info( - source.as_paths(), - &file_options, - cloud_options.as_ref(), - ) - .map_err(|e| e.context(failed_here!(ipc scan)))?; + let (file_info, md) = + scans::ipc_file_info(&sources, &file_options, cloud_options.as_ref()) + .map_err(|e| e.context(failed_here!(ipc scan)))?; *metadata = Some(md); file_info }, @@ -174,7 +166,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult options, cloud_options, } => scans::csv_file_info( - &source, + &sources, &file_options, options, cloud_options.as_ref(), @@ -185,7 +177,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult options, cloud_options, } => scans::ndjson_file_info( - source.as_paths(), + &sources, &file_options, options, cloud_options.as_ref(), @@ -205,7 +197,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult let mut owned = None; hive_partitions_from_paths( - source.as_paths().as_ref(), + &sources.as_paths(), file_options.hive_options.hive_start_idx, file_options.hive_options.schema.clone(), match resolved_file_info.reader_schema.as_ref().unwrap() { @@ -279,7 +271,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult } IR::Scan { - sources: source, + sources, file_info: resolved_file_info, hive_parts, output_schema: None, @@ -819,64 +811,48 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult Ok(ctxt.lp_arena.add(v)) } -impl DslScanSource { +impl DslScanSources { /// Expand scan paths if they were not already expanded. pub fn expand_paths( &mut self, scan_type: &mut FileScan, file_options: &mut FileScanOptions, ) -> PolarsResult<()> { - match self { - DslScanSource::File(source) => { - #[allow(unused_mut)] - let mut lock = source.lock().unwrap(); - - // Return if paths are already expanded - if lock.is_expanded { - return Ok(()); - } - - { - let paths_expanded = match &scan_type { - #[cfg(feature = "parquet")] - FileScan::Parquet { cloud_options, .. 
} => { - expand_scan_paths_with_hive_update( - &lock.paths[..], - file_options, - cloud_options, - )? - }, - #[cfg(feature = "ipc")] - FileScan::Ipc { cloud_options, .. } => expand_scan_paths_with_hive_update( - &lock.paths[..], - file_options, - cloud_options, - )?, - #[cfg(feature = "csv")] - FileScan::Csv { cloud_options, .. } => expand_paths( - &lock.paths[..], - file_options.glob, - cloud_options.as_ref(), - )?, - #[cfg(feature = "json")] - FileScan::NDJson { cloud_options, .. } => expand_paths( - &lock.paths[..], - file_options.glob, - cloud_options.as_ref(), - )?, - FileScan::Anonymous { .. } => unreachable!(), // Invariant: Anonymous scans are already expanded. - }; + if self.is_expanded { + return Ok(()); + } - #[allow(unreachable_code)] - { - lock.paths = paths_expanded; - lock.is_expanded = true; + let ScanSources::Files(paths) = &self.sources else { + self.is_expanded = true; + return Ok(()); + }; - Ok(()) - } - } + let expanded_sources = match &scan_type { + #[cfg(feature = "parquet")] + FileScan::Parquet { cloud_options, .. } => { + expand_scan_paths_with_hive_update(&paths, file_options, cloud_options)? }, - DslScanSource::Buffer(_) => Ok(()), + #[cfg(feature = "ipc")] + FileScan::Ipc { cloud_options, .. } => { + expand_scan_paths_with_hive_update(&paths, file_options, cloud_options)? + }, + #[cfg(feature = "csv")] + FileScan::Csv { cloud_options, .. } => { + expand_paths(&paths, file_options.glob, cloud_options.as_ref())? + }, + #[cfg(feature = "json")] + FileScan::NDJson { cloud_options, .. } => { + expand_paths(&paths, file_options.glob, cloud_options.as_ref())? + }, + FileScan::Anonymous { .. } => unreachable!(), // Invariant: Anonymous scans are already expanded. + }; + + #[allow(unreachable_code)] + { + self.sources = ScanSources::Files(expanded_sources); + self.is_expanded = true; + + Ok(()) } } } diff --git a/crates/polars-plan/src/plans/conversion/mod.rs b/crates/polars-plan/src/plans/conversion/mod.rs index 9851a6d2c3ba..b9ed8711a438 100644 --- a/crates/polars-plan/src/plans/conversion/mod.rs +++ b/crates/polars-plan/src/plans/conversion/mod.rs @@ -12,7 +12,7 @@ mod ir_to_dsl; mod scans; mod stack_opt; -use std::sync::{Arc, RwLock}; +use std::sync::{Arc, Mutex, RwLock}; pub use dsl_to_ir::*; pub use expr_to_ir::*; @@ -58,7 +58,10 @@ impl IR { output_schema: _, file_options: options, } => DslPlan::Scan { - sources: sources.to_dsl(true), + sources: Arc::new(Mutex::new(DslScanSources { + sources, + is_expanded: true, + })), file_info: Arc::new(RwLock::new(Some(file_info))), hive_parts, predicate: predicate.map(|e| e.to_expr(expr_arena)), diff --git a/crates/polars-plan/src/plans/conversion/scans.rs b/crates/polars-plan/src/plans/conversion/scans.rs index 1cc939417d60..e234521c0a51 100644 --- a/crates/polars-plan/src/plans/conversion/scans.rs +++ b/crates/polars-plan/src/plans/conversion/scans.rs @@ -10,13 +10,6 @@ use polars_io::RowIndex; use super::*; -fn get_first_path(paths: &[PathBuf]) -> PolarsResult<&PathBuf> { - // Use first path to get schema. 
- paths - .first() - .ok_or_else(|| polars_err!(ComputeError: "expected at least 1 path")) -} - #[cfg(any(feature = "parquet", feature = "ipc"))] fn prepare_output_schema(mut schema: Schema, row_index: Option<&RowIndex>) -> SchemaRef { if let Some(rc) = row_index { @@ -39,66 +32,57 @@ fn prepare_schemas(mut schema: Schema, row_index: Option<&RowIndex>) -> (SchemaR #[cfg(feature = "parquet")] pub(super) fn parquet_file_info( - source: &ScanSource, + sources: &ScanSources, file_options: &FileScanOptions, #[allow(unused)] cloud_options: Option<&polars_io::cloud::CloudOptions>, ) -> PolarsResult<(FileInfo, Option)> { - let (schema, reader_schema, num_rows, metadata) = match source { - ScanSource::Files(paths) => { - let path = get_first_path(paths)?; - if is_cloud_url(path) { - #[cfg(not(feature = "cloud"))] - panic!("One or more of the cloud storage features ('aws', 'gcp', ...) must be enabled."); + use polars_core::error::feature_gated; + + let first_source = sources + .first() + .ok_or_else(|| polars_err!(ComputeError: "expected at least 1 source"))?; - #[cfg(feature = "cloud")] - { + let (reader_schema, num_rows, metadata) = match first_source { + ScanSourceRef::File(path) => { + if is_cloud_url(path) { + feature_gated!("cloud", { let uri = path.to_string_lossy(); get_runtime().block_on(async { let mut reader = ParquetAsyncReader::from_uri(&uri, cloud_options, None).await?; - let reader_schema = reader.schema().await?; - let num_rows = reader.num_rows().await?; - let metadata = reader.get_metadata().await?.clone(); - - let schema = prepare_output_schema( - Schema::from_arrow_schema(reader_schema.as_ref()), - file_options.row_index.as_ref(), - ); - PolarsResult::Ok((schema, reader_schema, Some(num_rows), Some(metadata))) + + PolarsResult::Ok(( + reader.schema().await?, + Some(reader.num_rows().await?), + Some(reader.get_metadata().await?.clone()), + )) })? - } + }) } else { let file = polars_utils::open_file(path)?; let mut reader = ParquetReader::new(file); - let reader_schema = reader.schema()?; - let schema = prepare_output_schema( - Schema::from_arrow_schema(reader_schema.as_ref()), - file_options.row_index.as_ref(), - ); ( - schema, - reader_schema, + reader.schema()?, Some(reader.num_rows()?), Some(reader.get_metadata()?.clone()), ) } }, - ScanSource::Buffer(buffer) => { + ScanSourceRef::Buffer(buffer) => { let mut reader = ParquetReader::new(std::io::Cursor::new(buffer)); - let reader_schema = reader.schema()?; - let schema = prepare_output_schema( - Schema::from_arrow_schema(reader_schema.as_ref()), - file_options.row_index.as_ref(), - ); ( - schema, - reader_schema, + reader.schema()?, Some(reader.num_rows()?), Some(reader.get_metadata()?.clone()), ) }, }; + let schema = prepare_output_schema( + Schema::from_arrow_schema(reader_schema.as_ref()), + file_options.row_index.as_ref(), + ); + let file_info = FileInfo::new( schema, Some(Either::Left(reader_schema)), @@ -111,31 +95,39 @@ pub(super) fn parquet_file_info( // TODO! return metadata arced #[cfg(feature = "ipc")] pub(super) fn ipc_file_info( - paths: &[PathBuf], + sources: &ScanSources, file_options: &FileScanOptions, cloud_options: Option<&polars_io::cloud::CloudOptions>, ) -> PolarsResult<(FileInfo, arrow::io::ipc::read::FileMetadata)> { - let path = get_first_path(paths)?; - - let metadata = if is_cloud_url(path) { - #[cfg(not(feature = "cloud"))] - panic!("One or more of the cloud storage features ('aws', 'gcp', ...) 
must be enabled."); - - #[cfg(feature = "cloud")] - { - let uri = path.to_string_lossy(); - get_runtime().block_on(async { - polars_io::ipc::IpcReaderAsync::from_uri(&uri, cloud_options) - .await? - .metadata() - .await - })? - } - } else { - arrow::io::ipc::read::read_file_metadata(&mut std::io::BufReader::new( - polars_utils::open_file(path)?, - ))? + use polars_core::error::feature_gated; + + let Some(first) = sources.first() else { + polars_bail!(ComputeError: "expected at least 1 source"); }; + + let metadata = match first { + ScanSourceRef::File(path) => { + if is_cloud_url(path) { + feature_gated!("cloud", { + let uri = path.to_string_lossy(); + get_runtime().block_on(async { + polars_io::ipc::IpcReaderAsync::from_uri(&uri, cloud_options) + .await? + .metadata() + .await + })? + }) + } else { + arrow::io::ipc::read::read_file_metadata(&mut std::io::BufReader::new( + polars_utils::open_file(path)?, + ))? + } + }, + ScanSourceRef::Buffer(buff) => { + arrow::io::ipc::read::read_file_metadata(&mut std::io::Cursor::new(buff))? + }, + }; + let file_info = FileInfo::new( prepare_output_schema( Schema::from_arrow_schema(metadata.schema.as_ref()), @@ -150,7 +142,7 @@ pub(super) fn ipc_file_info( #[cfg(feature = "csv")] pub(super) fn csv_file_info( - source: &ScanSource, + sources: &ScanSources, file_options: &FileScanOptions, csv_options: &mut CsvReadOptions, cloud_options: Option<&polars_io::cloud::CloudOptions>, @@ -168,30 +160,31 @@ pub(super) fn csv_file_info( // * See if we can do this without downloading the entire file // prints the error message if paths is empty. - let run_async = source.is_cloud_url()? || config::force_async(); + let run_async = sources.is_cloud_url() || config::force_async(); - let si_result = match source { - ScanSource::Files(paths) => { - let cache_entries = { - feature_gated!("cloud", { - if run_async { - Some(polars_io::file_cache::init_entries_from_uri_list( - source - .as_paths() - .iter() - .flat_map(|p| p.iter()) - .map(|path| Arc::from(path.to_str().unwrap())) - .collect::>() - .as_slice(), - cloud_options, - )?) - } else { - None - } - }) - }; + let cache_entries = { + feature_gated!("cloud", { + if run_async { + Some(polars_io::file_cache::init_entries_from_uri_list( + sources + .as_paths() + .iter() + .map(|path| Arc::from(path.to_str().unwrap())) + .collect::>() + .as_slice(), + cloud_options, + )?) + } else { + None + } + }) + }; - let infer_schema_func = |i| { + let infer_schema_func = |i| { + let source = sources.at(i); + let owned = &mut vec![]; + match source { + ScanSourceRef::File(path) => { let file = if run_async { feature_gated!("cloud", { let entry: &Arc = @@ -199,92 +192,77 @@ pub(super) fn csv_file_info( entry.try_open_check_latest()? }) } else { - let p: &PathBuf = &paths[i]; - polars_utils::open_file(p.as_ref())? + polars_utils::open_file(path)? }; let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; - let owned = &mut vec![]; + let mut reader = + std::io::Cursor::new(maybe_decompress_bytes(mmap.as_ref(), owned)?); - let mut curs = std::io::Cursor::new(maybe_decompress_bytes(mmap.as_ref(), owned)?); - - if curs.read(&mut [0; 4])? < 2 && csv_options.raise_if_empty { + if reader.read(&mut [0; 4])? < 2 && csv_options.raise_if_empty { polars_bail!(NoData: "empty CSV") } - curs.rewind()?; + reader.rewind()?; - let reader_bytes = get_reader_bytes(&mut curs).expect("could not mmap file"); + let reader_bytes = get_reader_bytes(&mut reader).expect("could not mmap file"); // this needs a way to estimated bytes/rows. 
- let si_result = SchemaInferenceResult::try_from_reader_bytes_and_options( - &reader_bytes, - csv_options, - )?; - - Ok(si_result) - }; - - let merge_func = |a: PolarsResult, - b: PolarsResult| { - match (a, b) { - (Err(e), _) | (_, Err(e)) => Err(e), - (Ok(a), Ok(b)) => { - let merged_schema = if csv_options.schema.is_some() { - csv_options.schema.clone().unwrap() - } else { - let schema_a = a.get_inferred_schema(); - let schema_b = b.get_inferred_schema(); - - match (schema_a.is_empty(), schema_b.is_empty()) { - (true, _) => schema_b, - (_, true) => schema_a, - _ => { - let mut s = Arc::unwrap_or_clone(schema_a); - s.to_supertype(&schema_b)?; - Arc::new(s) - }, - } - }; - - Ok(a.with_inferred_schema(merged_schema)) - }, + SchemaInferenceResult::try_from_reader_bytes_and_options(&reader_bytes, csv_options) + }, + ScanSourceRef::Buffer(buffer) => { + let mut reader = std::io::Cursor::new(maybe_decompress_bytes(buffer, owned)?); + + if reader.read(&mut [0; 4])? < 2 && csv_options.raise_if_empty { + polars_bail!(NoData: "empty CSV") } - }; - - let si_results = POOL.join( - || infer_schema_func(0), - || { - (1..paths.len()) - .into_par_iter() - .map(infer_schema_func) - .reduce(|| Ok(Default::default()), merge_func) - }, - ); - - merge_func(si_results.0, si_results.1)? - }, - ScanSource::Buffer(buffer) => { - polars_ensure!(!run_async, nyi = "BytesIO scan with async"); + reader.rewind()?; - let owned = &mut vec![]; - let mut reader = std::io::Cursor::new(maybe_decompress_bytes(buffer, owned)?); + let reader_bytes = get_reader_bytes(&mut reader).expect("could not open file"); - if reader.read(&mut [0; 4])? < 2 && csv_options.raise_if_empty { - polars_bail!(NoData: "empty CSV") - } - reader.rewind()?; + // this needs a way to estimated bytes/rows. + SchemaInferenceResult::try_from_reader_bytes_and_options(&reader_bytes, csv_options) + }, + } + }; - let reader_bytes = get_reader_bytes(&mut reader).expect("could not open file"); + let merge_func = |a: PolarsResult, + b: PolarsResult| { + match (a, b) { + (Err(e), _) | (_, Err(e)) => Err(e), + (Ok(a), Ok(b)) => { + let merged_schema = if csv_options.schema.is_some() { + csv_options.schema.clone().unwrap() + } else { + let schema_a = a.get_inferred_schema(); + let schema_b = b.get_inferred_schema(); + + match (schema_a.is_empty(), schema_b.is_empty()) { + (true, _) => schema_b, + (_, true) => schema_a, + _ => { + let mut s = Arc::unwrap_or_clone(schema_a); + s.to_supertype(&schema_b)?; + Arc::new(s) + }, + } + }; - // this needs a way to estimated bytes/rows. 
- let si_result = SchemaInferenceResult::try_from_reader_bytes_and_options( - &reader_bytes, - csv_options, - )?; + Ok(a.with_inferred_schema(merged_schema)) + }, + } + }; - si_result + let si_results = POOL.join( + || infer_schema_func(0), + || { + (1..sources.len()) + .into_par_iter() + .map(infer_schema_func) + .reduce(|| Ok(Default::default()), merge_func) }, - }; + ); + + let si_result = merge_func(si_results.0, si_results.1)?; csv_options.update_with_inference_result(&si_result); @@ -314,58 +292,39 @@ pub(super) fn csv_file_info( #[cfg(feature = "json")] pub(super) fn ndjson_file_info( - paths: &[PathBuf], + sources: &ScanSources, file_options: &FileScanOptions, ndjson_options: &mut NDJsonReadOptions, cloud_options: Option<&polars_io::cloud::CloudOptions>, ) -> PolarsResult { use polars_core::config; + use polars_core::error::feature_gated; + + let Some(first) = sources.first() else { + polars_bail!(ComputeError: "expected at least 1 source"); + }; - let run_async = !paths.is_empty() && is_cloud_url(&paths[0]) || config::force_async(); + let run_async = sources.is_cloud_url() || config::force_async(); let cache_entries = { - #[cfg(feature = "cloud")] - { - if run_async { + if run_async { + feature_gated!("cloud", { Some(polars_io::file_cache::init_entries_from_uri_list( - paths + sources + .as_paths() .iter() .map(|path| Arc::from(path.to_str().unwrap())) .collect::>() .as_slice(), cloud_options, )?) - } else { - None - } - } - #[cfg(not(feature = "cloud"))] - { - if run_async { - panic!("required feature `cloud` is not enabled") - } - } - }; - - let first_path = get_first_path(paths)?; - - let f = if run_async { - #[cfg(feature = "cloud")] - { - cache_entries.unwrap()[0].try_open_check_latest()? - } - #[cfg(not(feature = "cloud"))] - { - panic!("required feature `cloud` is not enabled") + }) + } else { + None } - } else { - polars_utils::open_file(first_path)? }; let owned = &mut vec![]; - let mmap = unsafe { memmap::Mmap::map(&f).unwrap() }; - - let mut reader = std::io::BufReader::new(maybe_decompress_bytes(mmap.as_ref(), owned)?); let (mut reader_schema, schema) = if let Some(schema) = ndjson_options.schema.take() { if file_options.row_index.is_none() { @@ -377,8 +336,28 @@ pub(super) fn ndjson_file_info( ) } } else { - let schema = - polars_io::ndjson::infer_schema(&mut reader, ndjson_options.infer_schema_length)?; + let schema = match first { + ScanSourceRef::File(path) => { + let f = if run_async { + feature_gated!("cloud", { + cache_entries.unwrap()[0].try_open_check_latest()? + }) + } else { + polars_utils::open_file(path)? + }; + + let mmap = unsafe { memmap::Mmap::map(&f).unwrap() }; + let mut reader = + std::io::Cursor::new(maybe_decompress_bytes(mmap.as_ref(), owned)?); + + polars_io::ndjson::infer_schema(&mut reader, ndjson_options.infer_schema_length)? + }, + ScanSourceRef::Buffer(buff) => { + let mut reader = std::io::Cursor::new(maybe_decompress_bytes(buff, owned)?); + polars_io::ndjson::infer_schema(&mut reader, ndjson_options.infer_schema_length)? 
+ }, + }; + prepare_schemas(schema, file_options.row_index.as_ref()) }; diff --git a/crates/polars-plan/src/plans/functions/count.rs b/crates/polars-plan/src/plans/functions/count.rs index f3120bad8dff..f8d344217e70 100644 --- a/crates/polars-plan/src/plans/functions/count.rs +++ b/crates/polars-plan/src/plans/functions/count.rs @@ -1,5 +1,12 @@ #[cfg(feature = "ipc")] use arrow::io::ipc::read::get_row_count as count_rows_ipc_sync; +#[cfg(any( + feature = "parquet", + feature = "ipc", + feature = "json", + feature = "csv" +))] +use polars_core::error::feature_gated; #[cfg(any(feature = "parquet", feature = "json"))] use polars_io::cloud::CloudOptions; #[cfg(feature = "csv")] @@ -18,7 +25,7 @@ use polars_io::SerReader; use super::*; #[allow(unused_variables)] -pub fn count_rows(sources: &Arc<[ScanSource]>, scan_type: &FileScan) -> PolarsResult { +pub fn count_rows(sources: &ScanSources, scan_type: &FileScan) -> PolarsResult { #[cfg(not(any( feature = "parquet", feature = "ipc", @@ -79,7 +86,7 @@ pub fn count_rows(sources: &Arc<[ScanSource]>, scan_type: &FileScan) -> PolarsRe #[cfg(feature = "csv")] fn count_all_rows_csv( - sources: &Arc<[ScanSource]>, + sources: &ScanSources, options: &polars_io::prelude::CsvReadOptions, ) -> PolarsResult { let parse_options = options.get_parse_options(); @@ -87,20 +94,15 @@ fn count_all_rows_csv( sources .iter() .map(|source| match source { - ScanSource::Files(paths) => paths - .iter() - .map(|path| { - count_rows_csv( - path, - parse_options.separator, - parse_options.quote_char, - parse_options.comment_prefix.as_ref(), - parse_options.eol_char, - options.has_header, - ) - }) - .sum::>(), - ScanSource::Buffer(buf) => count_rows_csv_from_slice( + ScanSourceRef::File(path) => count_rows_csv( + path, + parse_options.separator, + parse_options.quote_char, + parse_options.comment_prefix.as_ref(), + parse_options.eol_char, + options.has_header, + ), + ScanSourceRef::Buffer(buf) => count_rows_csv_from_slice( &buf[..], parse_options.separator, parse_options.quote_char, @@ -114,31 +116,26 @@ fn count_all_rows_csv( #[cfg(feature = "parquet")] pub(super) fn count_rows_parquet( - sources: &Arc<[ScanSource]>, + sources: &ScanSources, #[allow(unused)] cloud_options: Option<&CloudOptions>, ) -> PolarsResult { if sources.is_empty() { return Ok(0); }; - let is_cloud = sources.first().unwrap().is_cloud_url()?; + let is_cloud = sources.is_cloud_url(); if is_cloud { - #[cfg(not(feature = "cloud"))] - panic!("One or more of the cloud storage features ('aws', 'gcp', ...) 
must be enabled."); - - #[cfg(feature = "cloud")] - { - get_runtime().block_on(count_rows_cloud_parquet(sources, cloud_options)) - } + feature_gated!("cloud", { + get_runtime().block_on(count_rows_cloud_parquet(sources.as_paths(), cloud_options)) + }) } else { sources .iter() .map(|source| match source { - ScanSource::Files(paths) => paths - .iter() - .map(|path| ParquetReader::new(polars_utils::open_file(path)?).num_rows()) - .sum::>(), - ScanSource::Buffer(buffer) => { + ScanSourceRef::File(path) => { + ParquetReader::new(polars_utils::open_file(path)?).num_rows() + }, + ScanSourceRef::Buffer(buffer) => { ParquetReader::new(std::io::Cursor::new(buffer)).num_rows() }, }) @@ -148,17 +145,14 @@ pub(super) fn count_rows_parquet( #[cfg(all(feature = "parquet", feature = "async"))] async fn count_rows_cloud_parquet( - sources: &Arc<[ScanSource]>, + paths: &[std::path::PathBuf], cloud_options: Option<&CloudOptions>, ) -> PolarsResult { - let collection = sources.iter().flat_map(|source| { - source.as_paths().iter().map(|path| { - with_concurrency_budget(1, || async { - let mut reader = - ParquetAsyncReader::from_uri(&path.to_string_lossy(), cloud_options, None) - .await?; - reader.num_rows().await - }) + let collection = paths.iter().map(|path| { + with_concurrency_budget(1, || async { + let mut reader = + ParquetAsyncReader::from_uri(&path.to_string_lossy(), cloud_options, None).await?; + reader.num_rows().await }) }); futures::future::try_join_all(collection) @@ -168,34 +162,31 @@ async fn count_rows_cloud_parquet( #[cfg(feature = "ipc")] pub(super) fn count_rows_ipc( - sources: &Arc<[ScanSource]>, + sources: &ScanSources, #[cfg(feature = "cloud")] cloud_options: Option<&CloudOptions>, metadata: Option<&arrow::io::ipc::read::FileMetadata>, ) -> PolarsResult { if sources.is_empty() { return Ok(0); }; - let is_cloud = sources.first().unwrap().is_cloud_url()?; + let is_cloud = sources.is_cloud_url(); if is_cloud { - #[cfg(not(feature = "cloud"))] - panic!("One or more of the cloud storage features ('aws', 'gcp', ...) 
must be enabled."); - - #[cfg(feature = "cloud")] - { - get_runtime().block_on(count_rows_cloud_ipc(sources, cloud_options, metadata)) - } + feature_gated!("cloud", { + get_runtime().block_on(count_rows_cloud_ipc( + sources.as_paths(), + cloud_options, + metadata, + )) + }) } else { sources .iter() .map(|source| match source { - ScanSource::Files(paths) => paths - .iter() - .map(|path| { - count_rows_ipc_sync(&mut polars_utils::open_file(path)?).map(|v| v as usize) - }) - .sum::>(), - ScanSource::Buffer(buffer) => { + ScanSourceRef::File(path) => { + count_rows_ipc_sync(&mut polars_utils::open_file(path)?).map(|v| v as usize) + }, + ScanSourceRef::Buffer(buffer) => { count_rows_ipc_sync(&mut std::io::Cursor::new(buffer)).map(|v| v as usize) }, }) @@ -205,19 +196,16 @@ pub(super) fn count_rows_ipc( #[cfg(all(feature = "ipc", feature = "async"))] async fn count_rows_cloud_ipc( - sources: &Arc<[ScanSource]>, + paths: &[std::path::PathBuf], cloud_options: Option<&CloudOptions>, metadata: Option<&arrow::io::ipc::read::FileMetadata>, ) -> PolarsResult { use polars_io::ipc::IpcReaderAsync; - let collection = sources.iter().flat_map(|source| { - source.as_paths().iter().map(|path| { - with_concurrency_budget(1, || async { - let reader = - IpcReaderAsync::from_uri(&path.to_string_lossy(), cloud_options).await?; - reader.count_rows(metadata).await - }) + let collection = paths.iter().map(|path| { + with_concurrency_budget(1, || async { + let reader = IpcReaderAsync::from_uri(&path.to_string_lossy(), cloud_options).await?; + reader.count_rows(metadata).await }) }); futures::future::try_join_all(collection) @@ -227,23 +215,26 @@ async fn count_rows_cloud_ipc( #[cfg(feature = "json")] pub(super) fn count_rows_ndjson( - sources: &Arc<[ScanSource]>, + sources: &ScanSources, cloud_options: Option<&CloudOptions>, ) -> PolarsResult { use polars_core::config; - use polars_core::error::feature_gated; use polars_io::utils::maybe_decompress_bytes; - let run_async = - !sources.is_empty() && sources.first().unwrap().is_cloud_url()? || config::force_async(); + if sources.is_empty() { + return Ok(0); + } + + let is_cloud_url = sources.is_cloud_url(); + let run_async = is_cloud_url || config::force_async(); let cache_entries = { feature_gated!("cloud", { if run_async { Some(polars_io::file_cache::init_entries_from_uri_list( sources + .as_paths() .iter() - .flat_map(|source| source.as_paths()) .map(|path| Arc::from(path.to_str().unwrap())) .collect::>() .as_slice(), @@ -258,29 +249,26 @@ pub(super) fn count_rows_ndjson( sources .iter() .map(|source| match source { - ScanSource::Files(paths) => paths - .iter() - .map(|path| { - let f = if run_async { - feature_gated!("cloud", { - let entry: &Arc = - &cache_entries.as_ref().unwrap()[0]; - entry.try_open_check_latest()? - }) - } else { - polars_utils::open_file(path)? - }; + ScanSourceRef::File(path) => { + let f = if run_async { + feature_gated!("cloud", { + let entry: &Arc = + &cache_entries.as_ref().unwrap()[0]; + entry.try_open_check_latest()? + }) + } else { + polars_utils::open_file(path)? 
+ }; - let mmap = unsafe { memmap::Mmap::map(&f).unwrap() }; - let owned = &mut vec![]; + let mmap = unsafe { memmap::Mmap::map(&f).unwrap() }; + let owned = &mut vec![]; - let reader = polars_io::ndjson::core::JsonLineReader::new( - std::io::Cursor::new(maybe_decompress_bytes(mmap.as_ref(), owned)?), - ); - reader.count() - }) - .sum::>(), - ScanSource::Buffer(buffer) => { + let reader = polars_io::ndjson::core::JsonLineReader::new(std::io::Cursor::new( + maybe_decompress_bytes(mmap.as_ref(), owned)?, + )); + reader.count() + }, + ScanSourceRef::Buffer(buffer) => { polars_ensure!(!run_async, nyi = "BytesIO with force_async"); let owned = &mut vec![]; diff --git a/crates/polars-plan/src/plans/functions/mod.rs b/crates/polars-plan/src/plans/functions/mod.rs index 468a85273ea4..e453acae6855 100644 --- a/crates/polars-plan/src/plans/functions/mod.rs +++ b/crates/polars-plan/src/plans/functions/mod.rs @@ -13,6 +13,7 @@ use std::hash::{Hash, Hasher}; use std::sync::{Arc, Mutex}; pub use dsl::*; +use polars_core::error::feature_gated; use polars_core::prelude::*; use polars_utils::pl_str::PlSmallStr; #[cfg(feature = "serde")] @@ -44,7 +45,7 @@ pub enum FunctionIR { fmt_str: PlSmallStr, }, FastCount { - sources: Arc<[ScanSource]>, + sources: ScanSources, scan_type: FileScan, alias: Option, }, @@ -274,14 +275,7 @@ impl FunctionIR { #[cfg(feature = "merge_sorted")] MergeSorted { column } => merge_sorted(&df, column.as_ref()), Unnest { columns: _columns } => { - #[cfg(feature = "dtype-struct")] - { - df.unnest(_columns.iter().cloned()) - } - #[cfg(not(feature = "dtype-struct"))] - { - panic!("activate feature 'dtype-struct'") - } + feature_gated!("dtype-struct", df.unnest(_columns.iter().cloned())) }, Pipeline { function, .. } => { // we use a global string cache here as streaming chunks all have different rev maps diff --git a/crates/polars-plan/src/plans/ir/dot.rs b/crates/polars-plan/src/plans/ir/dot.rs index c3b8f2e94874..3ece8966a857 100644 --- a/crates/polars-plan/src/plans/ir/dot.rs +++ b/crates/polars-plan/src/plans/ir/dot.rs @@ -255,9 +255,8 @@ impl<'a> IRDotDisplay<'a> { file_options: options, output_schema: _, } => { - let paths = sources.as_paths(); let name: &str = scan_type.into(); - let path = PathsDisplay(paths.as_ref()); + let path = ScanSourcesDisplay(sources); let with_columns = options.with_columns.as_ref().map(|cols| cols.as_ref()); let with_columns = NumColumns(with_columns); let total_columns = @@ -344,10 +343,36 @@ impl<'a> IRDotDisplay<'a> { // A few utility structures for formatting pub struct PathsDisplay<'a>(pub &'a [PathBuf]); +pub struct ScanSourcesDisplay<'a>(pub &'a ScanSources); struct NumColumns<'a>(Option<&'a [PlSmallStr]>); struct NumColumnsSchema<'a>(Option<&'a Schema>); struct OptionExprIRDisplay<'a>(Option>); +impl fmt::Display for ScanSourceRef<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ScanSourceRef::File(path) => path.display().fmt(f), + ScanSourceRef::Buffer(buff) => write!(f, "{} in-mem bytes", buff.len()), + } + } +} + +impl fmt::Display for ScanSourcesDisplay<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.0.len() { + 0 => write!(f, "[]"), + 1 => write!(f, "[{}]", self.0.at(0)), + 2 => write!(f, "[{}, {}]", self.0.at(0), self.0.at(1)), + _ => write!( + f, + "[{}, ... 
{} other sources]", + self.0.at(0), + self.0.len() - 1, + ), + } + } +} + impl fmt::Display for PathsDisplay<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self.0.len() { @@ -357,7 +382,7 @@ impl fmt::Display for PathsDisplay<'_> { _ => write!( f, "[{}, ... {} other files]", - self.0[0].to_string_lossy(), + self.0[0].display(), self.0.len() - 1, ), } diff --git a/crates/polars-plan/src/plans/ir/format.rs b/crates/polars-plan/src/plans/ir/format.rs index cc64daf67a30..a69eb5203359 100644 --- a/crates/polars-plan/src/plans/ir/format.rs +++ b/crates/polars-plan/src/plans/ir/format.rs @@ -7,7 +7,7 @@ use polars_core::schema::Schema; use polars_io::RowIndex; use recursive::recursive; -use super::ir::dot::PathsDisplay; +use self::ir::dot::ScanSourcesDisplay; use crate::prelude::*; pub struct IRDisplay<'a> { @@ -55,7 +55,7 @@ impl AsExpr for ExprIR { fn write_scan( f: &mut Formatter, name: &str, - source: &ScanSource, + sources: &ScanSources, indent: usize, n_columns: i64, total_columns: usize, @@ -63,12 +63,12 @@ fn write_scan( slice: Option<(i64, usize)>, row_index: Option<&RowIndex>, ) -> fmt::Result { - write!(f, "{:indent$}{name} SCAN ", "")?; - - match source { - ScanSource::Files(paths) => write!(f, "{}", PathsDisplay(paths.as_ref()))?, - ScanSource::Buffer(_) => write!(f, "IN MEMORY BUFFER")?, - } + write!( + f, + "{:indent$}{name} SCAN {}", + "", + ScanSourcesDisplay(sources) + )?; let total_columns = total_columns - usize::from(row_index.is_some()); if n_columns > 0 { @@ -175,7 +175,7 @@ impl<'a> IRDisplay<'a> { write_scan( f, "PYTHON", - &ScanSource::default(), + &ScanSources::default(), indent, n_columns, total_columns, diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index ff4e46e64dd8..db73f7a13528 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -7,7 +7,6 @@ pub(crate) mod tree_format; use std::borrow::Cow; use std::fmt; use std::path::{Path, PathBuf}; -use std::sync::Mutex; pub use dot::{EscapeLabel, IRDotDisplay, PathsDisplay}; pub use format::{ExprIRDisplay, IRDisplay}; @@ -15,7 +14,7 @@ use hive::HivePartitions; use polars_core::prelude::*; use polars_core::POOL; use polars_utils::idx_vec::UnitVec; -use polars_utils::{format_pl_smallstr, unitvec}; +use polars_utils::unitvec; #[cfg(feature = "ir_serde")] use serde::{Deserialize, Serialize}; @@ -35,18 +34,17 @@ pub struct IRPlanRef<'a> { pub expr_arena: &'a Arena, } -#[cfg_attr(feature = "ir_serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[derive(Debug, Clone, Hash, PartialEq, Eq)] -pub enum ScanSource { +pub enum ScanSources { Files(Arc<[PathBuf]>), - #[cfg_attr(feature = "ir_serde", serde(skip))] - Buffer(Arc<[u8]>), + Buffers(Arc<[Arc<[u8]>]>), } -impl Default for ScanSource { - fn default() -> Self { - Self::Files(Arc::default()) - } +#[derive(Debug, Clone, Copy)] +pub enum ScanSourceRef<'a> { + File(&'a Path), + Buffer(&'a [u8]), } pub struct ScanSourceSliceInfo { @@ -54,62 +52,88 @@ pub struct ScanSourceSliceInfo { pub source_slice: std::ops::Range, } -impl ScanSource { +impl Default for ScanSources { + fn default() -> Self { + Self::Buffers(Arc::default()) + } +} + +impl<'a> ScanSourceRef<'a> { + pub fn to_file_path(&self) -> &str { + match self { + ScanSourceRef::File(path) => path.to_str().unwrap(), + ScanSourceRef::Buffer(_) => "in-mem", + } + } +} + +impl ScanSources { + pub fn iter(&self) -> ScanSourceIter { + ScanSourceIter { + 
sources: self, + offset: 0, + } + } pub fn as_paths(&self) -> &[PathBuf] { match self { - ScanSource::Files(paths) => paths, - ScanSource::Buffer(_) => unimplemented!(), + Self::Files(paths) => &paths, + Self::Buffers(_) => unimplemented!(), } } - pub fn try_into_paths(&self) -> PolarsResult> { + pub fn try_into_paths(&self) -> Option> { match self { - ScanSource::Files(paths) => Ok(paths.clone()), - ScanSource::Buffer(_) => Err(polars_err!( - nyi = "Unable to convert BytesIO scan into path" - )), + Self::Files(paths) => Some(paths.clone()), + Self::Buffers(_) => None, } } pub fn into_paths(&self) -> Arc<[PathBuf]> { match self { - ScanSource::Files(paths) => paths.clone(), - ScanSource::Buffer(_) => unimplemented!(), + Self::Files(paths) => paths.clone(), + Self::Buffers(_) => unimplemented!(), } } - pub fn to_dsl(self, is_expanded: bool) -> DslScanSource { - match self { - ScanSource::Files(paths) => { - DslScanSource::File(Arc::new(Mutex::new(ScanFileSource { paths, is_expanded }))) - }, - ScanSource::Buffer(buffer) => DslScanSource::Buffer(buffer), + pub fn to_dsl(self, is_expanded: bool) -> DslScanSources { + DslScanSources { + sources: self, + is_expanded, } } - pub fn num_sources(&self) -> usize { + pub fn is_cloud_url(&self) -> bool { match self { - ScanSource::Files(paths) => paths.len(), - ScanSource::Buffer(_) => 1, + Self::Files(paths) => paths.first().map_or(false, |p| polars_io::is_cloud_url(p)), + Self::Buffers(_) => false, } } - pub fn is_cloud_url(&self) -> PolarsResult { + pub fn len(&self) -> usize { match self { - ScanSource::Files(paths) => { - Ok(polars_io::is_cloud_url(paths.first().ok_or_else( - || polars_err!(ComputeError: "expected at least 1 path"), - )?)) - }, - ScanSource::Buffer(_) => Ok(false), + Self::Files(s) => s.len(), + Self::Buffers(s) => s.len(), } } + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + pub fn first(&self) -> Option { + self.get(0) + } + pub fn id(&self) -> PlSmallStr { + if self.is_empty() { + return PlSmallStr::from_static("EMPTY"); + } + match self { - ScanSource::Files(paths) if paths.is_empty() => PlSmallStr::from_static("EMPTY"), - ScanSource::Files(paths) => PlSmallStr::from_str(paths[0].to_string_lossy().as_ref()), - ScanSource::Buffer(_) => PlSmallStr::from_static("IN_MEMORY"), + Self::Files(paths) => { + PlSmallStr::from_str(paths.first().unwrap().to_string_lossy().as_ref()) + }, + Self::Buffers(_) => PlSmallStr::from_static("IN_MEMORY"), } } @@ -118,8 +142,7 @@ impl ScanSource { pub fn collect_slice_information( &self, slice: (i64, usize), - path_to_num_rows: impl Fn(&Path) -> PolarsResult + Send + Sync, - buffer_to_num_rows: impl Fn(&[u8]) -> PolarsResult + Send + Sync, + map_to_num_rows: impl Fn(ScanSourceRef) -> PolarsResult + Send + Sync, ) -> PolarsResult { fn slice_to_start_end( offset: i64, @@ -148,80 +171,114 @@ impl ScanSource { let (offset, length) = slice; - Ok(match self { - ScanSource::Files(paths) if paths.len() == 1 => { - let num_rows = path_to_num_rows(&paths[0])?; - ScanSourceSliceInfo { - item_slice: slice_to_start_end(offset, length, num_rows), - source_slice: 0..1, + if self.is_empty() { + return Ok(ScanSourceSliceInfo { + item_slice: 0..0, + source_slice: 0..0, + }); + } + + if self.len() == 1 { + let num_rows = map_to_num_rows(self.get(0).unwrap())?; + let item_slice = slice_to_start_end(offset, length, num_rows); + let source_slice = if item_slice.is_empty() { 0..0 } else { 0..1 }; + + Ok(ScanSourceSliceInfo { + item_slice, + source_slice, + }) + } else { + use rayon::prelude::*; + + // Walk 
the files in reverse until we find the first file, and then translate the + // slice into a positive-offset equivalent. + const CHUNK_SIZE: usize = 8; + let mut row_counts = Vec::with_capacity(self.len()); + + POOL.install(|| { + for idx_end in (0..self.len()).step_by(CHUNK_SIZE) { + let idx_start = idx_end.saturating_sub(CHUNK_SIZE); + + row_counts.extend( + (idx_start..=idx_end) + .into_par_iter() + .map(|i| map_to_num_rows(self.at(i))) + .collect::>>()? + .into_iter() + .rev(), + ); } - }, - ScanSource::Files(paths) => { - use rayon::prelude::*; - assert_ne!(paths.len(), 0); + PolarsResult::Ok(()) + })?; - // Walk the files in reverse until we find the first file, and then translate the - // slice into a positive-offset equivalent. - const CHUNK_SIZE: usize = 8; - let mut row_counts = Vec::with_capacity(paths.len()); + let num_rows = row_counts.iter().sum::(); - POOL.install(|| { - for idx_end in (0..paths.len()).step_by(CHUNK_SIZE) { - let idx_start = idx_end.saturating_sub(CHUNK_SIZE); + let item_slice = slice_to_start_end(offset, length, num_rows); - row_counts.extend( - (idx_start..=idx_end) - .into_par_iter() - .map(|i| path_to_num_rows(&paths[i])) - .collect::>>()? - .into_iter() - .rev(), - ); - } + let mut source_start = self.len() - 1; + let mut source_end = 0; - PolarsResult::Ok(()) - })?; + let mut sum = 0; + for (i, row_count) in row_counts.iter().rev().enumerate() { + if sum < item_slice.end { + source_end = usize::max(source_end, i); + } - let num_rows = row_counts.iter().sum::(); + sum += row_count; - let item_slice = slice_to_start_end(offset, length, num_rows); + if sum >= item_slice.start { + source_start = usize::min(source_start, i); + } + } - let mut source_start = paths.len() - 1; - let mut source_end = 0; + let source_slice = source_start..source_end + 1; - let mut sum = 0; - for (i, row_count) in row_counts.iter().rev().enumerate() { - if sum < item_slice.end { - source_end = usize::max(source_end, i); - } + Ok(ScanSourceSliceInfo { + item_slice, + source_slice, + }) + } + } - sum += row_count; + pub fn get(&self, idx: usize) -> Option { + match self { + ScanSources::Files(paths) => paths.get(idx).map(|p| ScanSourceRef::File(p)), + ScanSources::Buffers(buffers) => buffers.get(idx).map(|b| ScanSourceRef::Buffer(b)), + } + } - if sum >= item_slice.start { - source_start = usize::min(source_start, i); - } - } + pub fn at(&self, idx: usize) -> ScanSourceRef { + self.get(idx).unwrap() + } +} - let source_slice = source_start..source_end + 1; +pub struct ScanSourceIter<'a> { + sources: &'a ScanSources, + offset: usize, +} - ScanSourceSliceInfo { - item_slice, - source_slice, - } - }, - ScanSource::Buffer(buffer) => { - let num_rows = buffer_to_num_rows(buffer)?; +impl<'a> Iterator for ScanSourceIter<'a> { + type Item = ScanSourceRef<'a>; - ScanSourceSliceInfo { - item_slice: slice_to_start_end(offset, length, num_rows), - source_slice: 0..1, - } - }, - }) + fn next(&mut self) -> Option { + let item = match self.sources { + ScanSources::Files(paths) => ScanSourceRef::File(paths.get(self.offset)?), + ScanSources::Buffers(buffers) => ScanSourceRef::Buffer(buffers.get(self.offset)?), + }; + + self.offset += 1; + Some(item) + } + + fn size_hint(&self) -> (usize, Option) { + let len = self.sources.len() - self.offset; + (len, Some(len)) } } +impl<'a> ExactSizeIterator for ScanSourceIter<'a> {} + /// [`IR`] is a representation of [`DslPlan`] with [`Node`]s which are allocated in an [`Arena`] /// In this IR the logical plan has access to the full dataset. 
#[derive(Clone, Debug, Default)] @@ -241,7 +298,7 @@ pub enum IR { predicate: ExprIR, }, Scan { - sources: ScanSource, + sources: ScanSources, file_info: FileInfo, hive_parts: Option>>, predicate: Option, diff --git a/crates/polars-plan/src/plans/mod.rs b/crates/polars-plan/src/plans/mod.rs index 9e2b4d56d6a4..92eeb783bf76 100644 --- a/crates/polars-plan/src/plans/mod.rs +++ b/crates/polars-plan/src/plans/mod.rs @@ -1,6 +1,5 @@ use std::fmt; use std::fmt::Debug; -use std::path::PathBuf; use std::sync::{Arc, Mutex, RwLock}; use hive::HivePartitions; @@ -61,19 +60,11 @@ pub enum Context { #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Clone)] -pub struct ScanFileSource { - pub paths: Arc<[PathBuf]>, +pub struct DslScanSources { + pub sources: ScanSources, pub is_expanded: bool, } -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -#[derive(Clone)] -pub enum DslScanSource { - File(Arc>), - // @Q? Can we serde skip this? - Buffer(Arc<[u8]>), -} - // https://stackoverflow.com/questions/1031076/what-are-projection-and-selection #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum DslPlan { @@ -91,7 +82,7 @@ pub enum DslPlan { cache_hits: u32, }, Scan { - sources: DslScanSource, + sources: Arc>, // Option as this is mostly materialized on the IR phase. // During conversion we update the value in the DSL as well // This is to cater to use cases where parts of a `LazyFrame` diff --git a/crates/polars-plan/src/plans/optimizer/count_star.rs b/crates/polars-plan/src/plans/optimizer/count_star.rs index d88956d2903f..c8570b0f908f 100644 --- a/crates/polars-plan/src/plans/optimizer/count_star.rs +++ b/crates/polars-plan/src/plans/optimizer/count_star.rs @@ -1,3 +1,5 @@ +use std::path::PathBuf; + use super::*; pub(super) struct CountStar; @@ -47,7 +49,7 @@ struct CountStarExpr { // Top node of the projection to replace node: Node, // Paths to the input files - sources: Arc<[ScanSource]>, + sources: ScanSources, // File Type scan_type: FileScan, // Column Alias @@ -64,12 +66,37 @@ fn visit_logical_plan_for_scan_paths( ) -> Option { match lp_arena.get(node) { IR::Union { inputs, .. 
} => { + enum MutableSources { + Files(Vec), + Buffers(Vec>), + } + let mut scan_type: Option = None; - let mut sources = Vec::with_capacity(inputs.len()); + let mut sources = None; for input in inputs { match visit_logical_plan_for_scan_paths(*input, lp_arena, expr_arena, true) { Some(expr) => { - sources.extend(expr.sources.iter().cloned()); + match expr.sources { + ScanSources::Files(paths) => match sources { + Some(MutableSources::Files(ref mut files)) => { + files.extend_from_slice(&paths[..]) + }, + Some(MutableSources::Buffers(_)) => { + todo!("Mixing in memory buffers and paths in count star opt") + }, + None => sources = Some(MutableSources::Files(paths.to_vec())), + }, + ScanSources::Buffers(bs) => match sources { + Some(MutableSources::Files(_)) => { + todo!("Mixing in memory buffers and paths in count star opt") + }, + Some(MutableSources::Buffers(ref mut buffers)) => { + buffers.extend_from_slice(&bs[..]) + }, + None => sources = Some(MutableSources::Buffers(bs.to_vec())), + }, + } + match &scan_type { None => scan_type = Some(expr.scan_type), Some(scan_type) => { @@ -86,7 +113,11 @@ fn visit_logical_plan_for_scan_paths( } } Some(CountStarExpr { - sources: sources.into(), + sources: match sources { + Some(MutableSources::Files(files)) => ScanSources::Files(files.into()), + Some(MutableSources::Buffers(buffers)) => ScanSources::Buffers(buffers.into()), + None => ScanSources::default(), + }, scan_type: scan_type.unwrap(), node, alias: None, @@ -95,7 +126,7 @@ fn visit_logical_plan_for_scan_paths( IR::Scan { scan_type, sources, .. } if !matches!(scan_type, FileScan::Anonymous { .. }) => Some(CountStarExpr { - sources: [sources.clone()].into(), + sources: sources.clone(), scan_type: scan_type.clone(), node, alias: None, diff --git a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs index 3b9e6c8d8ef9..d5aefb2a16d7 100644 --- a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs +++ b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs @@ -401,7 +401,7 @@ impl<'a> PredicatePushDown<'a> { filter: None, }); } else { - sources = ScanSource::Files(new_paths.into()); + sources = ScanSources::Files(new_paths.into()); scan_hive_parts = Some(Arc::from(new_hive_parts)); } } diff --git a/crates/polars-python/src/lazyframe/general.rs b/crates/polars-python/src/lazyframe/general.rs index 2e2ce702f5bd..40c69b260c40 100644 --- a/crates/polars-python/src/lazyframe/general.rs +++ b/crates/polars-python/src/lazyframe/general.rs @@ -31,7 +31,7 @@ impl PyLazyFrame { row_index, ignore_errors, include_file_paths, cloud_options, retries, file_cache_ttl ))] fn new_from_ndjson( - path: Option, + path: Option, paths: Vec, infer_schema_length: Option, schema: Option>, @@ -52,37 +52,48 @@ impl PyLazyFrame { offset, }); - #[cfg(feature = "cloud")] - let cloud_options = { - let first_path = if let Some(path) = &path { - path - } else { - paths - .first() - .ok_or_else(|| PyValueError::new_err("expected a path argument"))? - }; + use std::path::Path; - let first_path_url = first_path.to_string_lossy(); + use polars_plan::plans::ScanSources; + use EitherPythonFileOrPath as EF; - let mut cloud_options = if let Some(opts) = cloud_options { - parse_cloud_options(&first_path_url, opts)? - } else { - parse_cloud_options(&first_path_url, vec![])? 
- }; + use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; + let (first_path, mut r) = match path + .map(|py_f| get_either_file_or_path(py_f, false)) + .transpose()? + { + Some(EF::Path(path)) => { + let reader = LazyJsonLineReader::new(>::as_ref(&path)); + (Some(path), reader) + }, + Some(EF::Py(f)) => ( + None, + LazyJsonLineReader::new_sourced(ScanSources::Buffers([f.as_arc()].into())), + ), + None => ( + Some( + paths + .first() + .cloned() + .ok_or_else(|| PyValueError::new_err("expected a path argument"))?, + ), + LazyJsonLineReader::new_paths(paths.into()), + ), + }; + #[cfg(feature = "cloud")] + if let Some(first_path) = first_path { + let first_path_url = first_path.to_string_lossy(); + + let mut cloud_options = + parse_cloud_options(&first_path_url, cloud_options.unwrap_or_default())?; cloud_options = cloud_options.with_max_retries(retries); if let Some(file_cache_ttl) = file_cache_ttl { cloud_options.file_cache_ttl = file_cache_ttl; } - Some(cloud_options) - }; - - let r = if let Some(path) = &path { - LazyJsonLineReader::new(path) - } else { - LazyJsonLineReader::new_paths(paths.into()) + r = r.with_cloud_options(Some(cloud_options)); }; let lf = r @@ -96,7 +107,6 @@ impl PyLazyFrame { .with_row_index(row_index) .with_ignore_errors(ignore_errors) .with_include_file_paths(include_file_paths.map(|x| x.into())) - .with_cloud_options(cloud_options) .finish() .map_err(PyPolarsErr::from)?; @@ -165,7 +175,7 @@ impl PyLazyFrame { .collect::() }); - use polars_plan::plans::ScanSource; + use polars_plan::plans::ScanSources; use EitherPythonFileOrPath as EF; let (first_path, mut r) = match path .map(|py_f| get_either_file_or_path(py_f, false)) @@ -177,7 +187,7 @@ impl PyLazyFrame { }, Some(EF::Py(f)) => ( None, - LazyCsvReader::new_sourced(ScanSource::Buffer(f.as_arc())), + LazyCsvReader::new_sourced(ScanSources::Buffers([f.as_arc()].into())), ), None => ( Some( @@ -310,7 +320,7 @@ impl PyLazyFrame { include_file_paths: include_file_paths.map(|x| x.into()), }; - use polars_plan::plans::ScanSource; + use polars_plan::plans::ScanSources; use EitherPythonFileOrPath as EF; let use_first_path = path.is_some(); let first_path = match path @@ -319,10 +329,13 @@ impl PyLazyFrame { { Some(EF::Path(path)) => path, Some(EF::Py(f)) => { - return LazyFrame::scan_parquet_sourced(ScanSource::Buffer(f.as_arc()), args) - .map(Self::from) - .map_err(PyPolarsErr::from) - .map_err(From::from); + return LazyFrame::scan_parquet_sourced( + ScanSources::Buffers([f.as_arc()].into()), + args, + ) + .map(Self::from) + .map_err(PyPolarsErr::from) + .map_err(From::from); }, None => paths .first() @@ -392,7 +405,7 @@ impl PyLazyFrame { include_file_paths: include_file_paths.map(|x| x.into()), }; - use polars_plan::plans::ScanSource; + use polars_plan::plans::ScanSources; use EitherPythonFileOrPath as EF; let use_first_path = path.is_some(); let first_path = match path @@ -401,10 +414,13 @@ impl PyLazyFrame { { Some(EF::Path(path)) => path, Some(EF::Py(f)) => { - return LazyFrame::scan_ipc_sourced(ScanSource::Buffer(f.as_arc()), args) - .map(Self::from) - .map_err(PyPolarsErr::from) - .map_err(From::from); + return LazyFrame::scan_ipc_sourced( + ScanSources::Buffers([f.as_arc()].into()), + args, + ) + .map(Self::from) + .map_err(PyPolarsErr::from) + .map_err(From::from); }, None => paths .first() diff --git a/crates/polars-python/src/lazyframe/visitor/nodes.rs b/crates/polars-python/src/lazyframe/visitor/nodes.rs index 3c31ff11b63a..37a51e4d481d 100644 --- 
a/crates/polars-python/src/lazyframe/visitor/nodes.rs +++ b/crates/polars-python/src/lazyframe/visitor/nodes.rs @@ -327,7 +327,7 @@ pub(crate) fn into_py(py: Python<'_>, plan: &IR) -> PyResult { } => Scan { paths: sources .try_into_paths() - .map_err(|_| PyNotImplementedError::new_err("scan with BytesIO"))? + .ok_or_else(|| PyNotImplementedError::new_err("scan with BytesIO"))? .to_object(py), // TODO: file info file_info: py.None(), diff --git a/crates/polars-stream/src/utils/late_materialized_df.rs b/crates/polars-stream/src/utils/late_materialized_df.rs index 87fe97135aad..9e7322167f7f 100644 --- a/crates/polars-stream/src/utils/late_materialized_df.rs +++ b/crates/polars-stream/src/utils/late_materialized_df.rs @@ -4,7 +4,7 @@ use parking_lot::Mutex; use polars_core::frame::DataFrame; use polars_core::schema::Schema; use polars_error::PolarsResult; -use polars_plan::plans::{AnonymousScan, AnonymousScanArgs, FileInfo, FileScan, ScanSource, IR}; +use polars_plan::plans::{AnonymousScan, AnonymousScanArgs, FileInfo, FileScan, ScanSources, IR}; use polars_plan::prelude::{AnonymousScanOptions, FileScanOptions}; /// Used to insert a dataframe into in-memory-engine query plan after the query @@ -25,7 +25,7 @@ impl LateMaterializedDataFrame { fmt_str: "LateMaterializedDataFrame", }); IR::Scan { - sources: ScanSource::Files(Arc::default()), + sources: ScanSources::Files(Arc::default()), file_info: FileInfo::new(schema, None, (None, usize::MAX)), hive_parts: None, predicate: None, diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index ef01b24955b0..253af8042b84 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -422,7 +422,7 @@ def scan_parquet( if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - elif isinstance(source, (IO, BytesIO)): + elif isinstance(source, (IO, io.BytesIO)): sources = [] else: source = [ From 87aef7121c348bc249e85176ec0d3dba664527b3 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 4 Sep 2024 18:01:24 +0200 Subject: [PATCH 05/27] almost completely working :) --- Cargo.lock | 2 + .../src/executors/scan/csv.rs | 23 +-- .../src/executors/scan/ndjson.rs | 65 +++---- .../src/executors/scan/parquet.rs | 46 ++--- crates/polars-plan/Cargo.toml | 1 + .../polars-plan/src/plans/conversion/scans.rs | 74 ++------ crates/polars-plan/src/plans/ir/mod.rs | 38 +++- .../src/plans/optimizer/count_star.rs | 2 +- crates/polars-python/Cargo.toml | 1 + crates/polars-python/src/conversion/mod.rs | 57 ++++++ crates/polars-python/src/file.rs | 9 +- crates/polars-python/src/lazyframe/general.rs | 177 +++++------------- crates/polars-utils/src/mmap.rs | 7 + py-polars/polars/io/csv/functions.py | 6 +- py-polars/polars/io/ipc/functions.py | 7 +- py-polars/polars/io/ndjson.py | 11 +- py-polars/polars/io/parquet/functions.py | 6 +- py-polars/tests/unit/io/test_scan.py | 45 +++++ 18 files changed, 292 insertions(+), 285 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 332550f89f89..5a0e38a53d31 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3367,6 +3367,7 @@ dependencies = [ "ahash", "bitflags", "bytemuck", + "bytes", "chrono", "chrono-tz", "ciborium", @@ -3403,6 +3404,7 @@ dependencies = [ "ahash", "arboard", "bytemuck", + "bytes", "ciborium", "either", "itoa", diff --git a/crates/polars-mem-engine/src/executors/scan/csv.rs b/crates/polars-mem-engine/src/executors/scan/csv.rs index b06386cdfa03..7a2ac0d34950 100644 --- 
a/crates/polars-mem-engine/src/executors/scan/csv.rs +++ b/crates/polars-mem-engine/src/executors/scan/csv.rs @@ -5,6 +5,7 @@ use polars_core::utils::{ accumulate_dataframes_vertical, accumulate_dataframes_vertical_unchecked, }; use polars_error::feature_gated; +use polars_utils::mmap::MemSlice; use super::*; @@ -67,7 +68,7 @@ impl CsvExec { let source = self.sources.at(i); let owned = &mut vec![]; - let mut df = match source { + let memslice = match source { ScanSourceRef::File(path) => { let file = if run_async { feature_gated!("cloud", { @@ -82,21 +83,17 @@ impl CsvExec { }?; let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; - options - .into_reader_with_file_handle(std::io::Cursor::new( - maybe_decompress_bytes(mmap.as_ref(), owned)?, - )) - ._with_predicate(predicate.clone()) - .finish()? + MemSlice::from_mmap(Arc::new(mmap)) }, - ScanSourceRef::Buffer(buffer) => options - .into_reader_with_file_handle(std::io::Cursor::new(maybe_decompress_bytes( - buffer, owned, - )?)) - ._with_predicate(predicate.clone()) - .finish()?, + ScanSourceRef::Buffer(buffer) => MemSlice::from_bytes(buffer.clone()), }; + let reader = std::io::Cursor::new(maybe_decompress_bytes(&memslice, owned)?); + let mut df = options + .into_reader_with_file_handle(reader) + ._with_predicate(predicate.clone()) + .finish()?; + if let Some(col) = &self.file_options.include_file_paths { let name = source.to_file_path(); diff --git a/crates/polars-mem-engine/src/executors/scan/ndjson.rs b/crates/polars-mem-engine/src/executors/scan/ndjson.rs index 27aab29fd0c1..b37f76ee826d 100644 --- a/crates/polars-mem-engine/src/executors/scan/ndjson.rs +++ b/crates/polars-mem-engine/src/executors/scan/ndjson.rs @@ -1,6 +1,7 @@ use polars_core::config; use polars_core::utils::accumulate_dataframes_vertical; use polars_error::feature_gated; +use polars_utils::mmap::MemSlice; use super::*; @@ -75,8 +76,7 @@ impl JsonExec { let row_index = self.file_scan_options.row_index.as_mut(); - let owned = &mut vec![]; - let df = match source { + let memslice = match source { ScanSourceRef::File(path) => { let file = if run_async { feature_gated!("cloud", { @@ -97,49 +97,30 @@ impl JsonExec { } }; - let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; - let curs = std::io::Cursor::new( - match maybe_decompress_bytes(mmap.as_ref(), owned) { - Ok(v) => v, - Err(e) => return Some(Err(e)), - }, - ); - let reader = JsonLineReader::new(curs); - - reader - .with_schema(schema.clone()) - .with_rechunk(self.file_scan_options.rechunk) - .with_chunk_size(Some(self.options.chunk_size)) - .with_row_index(row_index) - .with_predicate(self.predicate.clone().map(phys_expr_to_io_expr)) - .with_projection(self.file_scan_options.with_columns.clone()) - .low_memory(self.options.low_memory) - .with_n_rows(n_rows) - .with_ignore_errors(self.options.ignore_errors) - .finish() - }, - ScanSourceRef::Buffer(buff) => { - let curs = - std::io::Cursor::new(match maybe_decompress_bytes(buff, owned) { - Ok(v) => v, - Err(e) => return Some(Err(e)), - }); - let reader = JsonLineReader::new(curs); - - reader - .with_schema(schema.clone()) - .with_rechunk(self.file_scan_options.rechunk) - .with_chunk_size(Some(self.options.chunk_size)) - .with_row_index(row_index) - .with_predicate(self.predicate.clone().map(phys_expr_to_io_expr)) - .with_projection(self.file_scan_options.with_columns.clone()) - .low_memory(self.options.low_memory) - .with_n_rows(n_rows) - .with_ignore_errors(self.options.ignore_errors) - .finish() + MemSlice::from_mmap(Arc::new(unsafe { 
memmap::Mmap::map(&file).unwrap() })) }, + ScanSourceRef::Buffer(buff) => MemSlice::from_bytes(buff.clone()), }; + let owned = &mut vec![]; + let curs = std::io::Cursor::new(match maybe_decompress_bytes(&memslice, owned) { + Ok(v) => v, + Err(e) => return Some(Err(e)), + }); + let reader = JsonLineReader::new(curs); + + let df = reader + .with_schema(schema.clone()) + .with_rechunk(self.file_scan_options.rechunk) + .with_chunk_size(Some(self.options.chunk_size)) + .with_row_index(row_index) + .with_predicate(self.predicate.clone().map(phys_expr_to_io_expr)) + .with_projection(self.file_scan_options.with_columns.clone()) + .low_memory(self.options.low_memory) + .with_n_rows(n_rows) + .with_ignore_errors(self.options.ignore_errors) + .finish(); + let mut df = match df { Ok(df) => df, Err(e) => return Some(Err(e)), diff --git a/crates/polars-mem-engine/src/executors/scan/parquet.rs b/crates/polars-mem-engine/src/executors/scan/parquet.rs index bb47eb458a49..509ea7ba8c55 100644 --- a/crates/polars-mem-engine/src/executors/scan/parquet.rs +++ b/crates/polars-mem-engine/src/executors/scan/parquet.rs @@ -8,6 +8,7 @@ use polars_io::cloud::CloudOptions; use polars_io::parquet::metadata::FileMetaDataRef; use polars_io::utils::slice::split_slice_at_file; use polars_io::RowIndex; +use polars_utils::mmap::MemSlice; use super::*; @@ -81,10 +82,8 @@ impl ParquetExec { let base_row_index = self.file_options.row_index.take(); // Limit no. of files at a time to prevent open file limits. - let paths = self.sources.as_paths(); - for i in slice_info.source_slice.step_by(step) { - let end = std::cmp::min(i.saturating_add(step), paths.len()); + let end = std::cmp::min(i.saturating_add(step), self.sources.len()); let hive_parts = self.hive_parts.as_ref().map(|x| &x[i..end]); if current_offset >= slice_info.item_slice.end && !result.is_empty() { @@ -106,29 +105,30 @@ impl ParquetExec { hive_partitions.as_deref(), ); - match source { + let memslice = match source { ScanSourceRef::File(path) => { let file = std::fs::File::open(path)?; - - let mut reader = ParquetReader::new(file) - .read_parallel(parallel) - .set_low_memory(self.options.low_memory) - .use_statistics(self.options.use_statistics) - .set_rechunk(false) - .with_hive_partition_columns(hive_partitions) - .with_include_file_path( - self.file_options - .include_file_paths - .as_ref() - .map(|x| (x.clone(), Arc::from(paths[i].to_str().unwrap()))), - ); - - reader - .num_rows() - .map(|num_rows| (reader, num_rows, predicate, projection)) + MemSlice::from_mmap(Arc::new(unsafe { memmap::Mmap::map(&file).unwrap() })) }, - ScanSourceRef::Buffer(_) => todo!(), - } + ScanSourceRef::Buffer(buff) => MemSlice::from_bytes(buff.clone()), + }; + + let mut reader = ParquetReader::new(std::io::Cursor::new(memslice)) + .read_parallel(parallel) + .set_low_memory(self.options.low_memory) + .use_statistics(self.options.use_statistics) + .set_rechunk(false) + .with_hive_partition_columns(hive_partitions) + .with_include_file_path( + self.file_options + .include_file_paths + .as_ref() + .map(|x| (x.clone(), Arc::from(source.to_file_path()))), + ); + + reader + .num_rows() + .map(|num_rows| (reader, num_rows, predicate, projection)) }); // We do this in parallel because wide tables can take a long time deserializing metadata. 
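The executor hunks above (csv.rs, ndjson.rs, parquet.rs) all converge on the same pattern: a file-backed source is memory-mapped while an in-memory source is used as-is, and both are wrapped in a `MemSlice` that a `std::io::Cursor` can read, so the reader setup and decompression path only exist once. The following is a minimal, self-contained sketch of that pattern; the `Source` enum and `source_to_memslice` helper are hypothetical names for illustration and are not part of the patch, while `MemSlice::from_mmap` / `MemSlice::from_bytes` are the calls used in the hunks above.

```rust
use std::sync::Arc;

use polars_error::PolarsResult;
use polars_utils::mmap::MemSlice;

// Hypothetical stand-in for `ScanSourceRef`: either a path on disk or an
// in-memory `bytes::Bytes` buffer (e.g. coming from a Python BytesIO).
enum Source<'a> {
    File(&'a std::path::Path),
    Buffer(&'a bytes::Bytes),
}

// Sketch of the shared conversion used by the executors above: both variants
// end up as a `MemSlice`, so downstream code can read them uniformly through
// `std::io::Cursor<MemSlice>`.
fn source_to_memslice(source: Source<'_>) -> PolarsResult<MemSlice> {
    match source {
        Source::File(path) => {
            let file = std::fs::File::open(path)?;
            // SAFETY: same invariant as in the executors; the mapped file must
            // not be truncated while the mapping is alive.
            let mmap = unsafe { memmap::Mmap::map(&file)? };
            Ok(MemSlice::from_mmap(Arc::new(mmap)))
        },
        Source::Buffer(bytes) => Ok(MemSlice::from_bytes(bytes.clone())),
    }
}
```

The payoff of this shape is visible in the diffs above: the per-format readers no longer branch on file vs. buffer, which is what makes the later `to_memslice` helper on `ScanSourceRef` possible.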
diff --git a/crates/polars-plan/Cargo.toml b/crates/polars-plan/Cargo.toml index b37b9b445f10..dd33428c8398 100644 --- a/crates/polars-plan/Cargo.toml +++ b/crates/polars-plan/Cargo.toml @@ -26,6 +26,7 @@ ahash = { workspace = true } arrow = { workspace = true } bitflags = { workspace = true } bytemuck = { workspace = true } +bytes = { workspace = true } chrono = { workspace = true, optional = true } chrono-tz = { workspace = true, optional = true } ciborium = { workspace = true, optional = true } diff --git a/crates/polars-plan/src/plans/conversion/scans.rs b/crates/polars-plan/src/plans/conversion/scans.rs index e234521c0a51..50da756f9a38 100644 --- a/crates/polars-plan/src/plans/conversion/scans.rs +++ b/crates/polars-plan/src/plans/conversion/scans.rs @@ -1,6 +1,3 @@ -use std::path::PathBuf; -use std::sync::Arc; - use either::Either; use polars_io::path_utils::is_cloud_url; #[cfg(feature = "cloud")] @@ -182,47 +179,18 @@ pub(super) fn csv_file_info( let infer_schema_func = |i| { let source = sources.at(i); + let memslice = source.to_memslice(run_async, cache_entries.as_ref(), i)?; let owned = &mut vec![]; - match source { - ScanSourceRef::File(path) => { - let file = if run_async { - feature_gated!("cloud", { - let entry: &Arc = - &cache_entries.as_ref().unwrap()[i]; - entry.try_open_check_latest()? - }) - } else { - polars_utils::open_file(path)? - }; - - let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; - let mut reader = - std::io::Cursor::new(maybe_decompress_bytes(mmap.as_ref(), owned)?); - - if reader.read(&mut [0; 4])? < 2 && csv_options.raise_if_empty { - polars_bail!(NoData: "empty CSV") - } - reader.rewind()?; - - let reader_bytes = get_reader_bytes(&mut reader).expect("could not mmap file"); - - // this needs a way to estimated bytes/rows. - SchemaInferenceResult::try_from_reader_bytes_and_options(&reader_bytes, csv_options) - }, - ScanSourceRef::Buffer(buffer) => { - let mut reader = std::io::Cursor::new(maybe_decompress_bytes(buffer, owned)?); - - if reader.read(&mut [0; 4])? < 2 && csv_options.raise_if_empty { - polars_bail!(NoData: "empty CSV") - } - reader.rewind()?; + let mut reader = std::io::Cursor::new(maybe_decompress_bytes(&memslice, owned)?); + if reader.read(&mut [0; 4])? < 2 && csv_options.raise_if_empty { + polars_bail!(NoData: "empty CSV") + } + reader.rewind()?; - let reader_bytes = get_reader_bytes(&mut reader).expect("could not open file"); + let reader_bytes = get_reader_bytes(&mut reader).expect("could not mmap file"); - // this needs a way to estimated bytes/rows. - SchemaInferenceResult::try_from_reader_bytes_and_options(&reader_bytes, csv_options) - }, - } + // this needs a way to estimated bytes/rows. + SchemaInferenceResult::try_from_reader_bytes_and_options(&reader_bytes, csv_options) }; let merge_func = |a: PolarsResult, @@ -336,27 +304,11 @@ pub(super) fn ndjson_file_info( ) } } else { - let schema = match first { - ScanSourceRef::File(path) => { - let f = if run_async { - feature_gated!("cloud", { - cache_entries.unwrap()[0].try_open_check_latest()? - }) - } else { - polars_utils::open_file(path)? - }; - - let mmap = unsafe { memmap::Mmap::map(&f).unwrap() }; - let mut reader = - std::io::Cursor::new(maybe_decompress_bytes(mmap.as_ref(), owned)?); + let memslice = first.to_memslice(run_async, cache_entries.as_ref(), 0)?; + let mut reader = std::io::Cursor::new(maybe_decompress_bytes(&memslice, owned)?); - polars_io::ndjson::infer_schema(&mut reader, ndjson_options.infer_schema_length)? 
- }, - ScanSourceRef::Buffer(buff) => { - let mut reader = std::io::Cursor::new(maybe_decompress_bytes(buff, owned)?); - polars_io::ndjson::infer_schema(&mut reader, ndjson_options.infer_schema_length)? - }, - }; + let schema = + polars_io::ndjson::infer_schema(&mut reader, ndjson_options.infer_schema_length)?; prepare_schemas(schema, file_options.row_index.as_ref()) }; diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index db73f7a13528..95a7a5aaf374 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -11,9 +11,12 @@ use std::path::{Path, PathBuf}; pub use dot::{EscapeLabel, IRDotDisplay, PathsDisplay}; pub use format::{ExprIRDisplay, IRDisplay}; use hive::HivePartitions; +use polars_core::error::feature_gated; use polars_core::prelude::*; use polars_core::POOL; +use polars_io::file_cache::FileCacheEntry; use polars_utils::idx_vec::UnitVec; +use polars_utils::mmap::MemSlice; use polars_utils::unitvec; #[cfg(feature = "ir_serde")] use serde::{Deserialize, Serialize}; @@ -38,13 +41,14 @@ pub struct IRPlanRef<'a> { #[derive(Debug, Clone, Hash, PartialEq, Eq)] pub enum ScanSources { Files(Arc<[PathBuf]>), - Buffers(Arc<[Arc<[u8]>]>), + #[cfg_attr(feature = "serde", serde(skip))] + Buffers(Arc<[bytes::Bytes]>), } #[derive(Debug, Clone, Copy)] pub enum ScanSourceRef<'a> { File(&'a Path), - Buffer(&'a [u8]), + Buffer(&'a bytes::Bytes), } pub struct ScanSourceSliceInfo { @@ -65,6 +69,29 @@ impl<'a> ScanSourceRef<'a> { ScanSourceRef::Buffer(_) => "in-mem", } } + + pub fn to_memslice( + &self, + run_async: bool, + cache_entries: Option<&Vec>>, + index: usize, + ) -> PolarsResult { + match self { + Self::File(path) => { + let f = if run_async { + feature_gated!("cloud", { + cache_entries.unwrap()[index].try_open_check_latest()? + }) + } else { + polars_utils::open_file(path)? + }; + + let mmap = unsafe { memmap::Mmap::map(&f)? }; + Ok(MemSlice::from_mmap(Arc::new(mmap))) + }, + Self::Buffer(buff) => Ok(MemSlice::from_bytes((*buff).clone())), + } + } } impl ScanSources { @@ -95,6 +122,13 @@ impl ScanSources { } } + pub fn first_path(&self) -> Option<&Path> { + match self { + ScanSources::Files(paths) => paths.first().map(|p| p.as_path()), + ScanSources::Buffers(_) => None, + } + } + pub fn to_dsl(self, is_expanded: bool) -> DslScanSources { DslScanSources { sources: self, diff --git a/crates/polars-plan/src/plans/optimizer/count_star.rs b/crates/polars-plan/src/plans/optimizer/count_star.rs index c8570b0f908f..02c8b94a033c 100644 --- a/crates/polars-plan/src/plans/optimizer/count_star.rs +++ b/crates/polars-plan/src/plans/optimizer/count_star.rs @@ -68,7 +68,7 @@ fn visit_logical_plan_for_scan_paths( IR::Union { inputs, .. 
} => { enum MutableSources { Files(Vec), - Buffers(Vec>), + Buffers(Vec), } let mut scan_type: Option = None; diff --git a/crates/polars-python/Cargo.toml b/crates/polars-python/Cargo.toml index 03178d684e34..b93d34a678e5 100644 --- a/crates/polars-python/Cargo.toml +++ b/crates/polars-python/Cargo.toml @@ -25,6 +25,7 @@ polars-stream = { workspace = true } ahash = { workspace = true } arboard = { workspace = true, optional = true } bytemuck = { workspace = true } +bytes = { workspace = true } ciborium = { workspace = true } either = { workspace = true } itoa = { workspace = true } diff --git a/crates/polars-python/src/conversion/mod.rs b/crates/polars-python/src/conversion/mod.rs index 8d5c96f3b58c..886b6f744552 100644 --- a/crates/polars-python/src/conversion/mod.rs +++ b/crates/polars-python/src/conversion/mod.rs @@ -3,6 +3,7 @@ pub(crate) mod chunked_array; mod datetime; use std::fmt::{Display, Formatter}; use std::hash::{Hash, Hasher}; +use std::path::PathBuf; #[cfg(feature = "object")] use polars::chunked_array::object::PolarsObjectSafe; @@ -19,6 +20,7 @@ use polars_core::utils::materialize_dyn_int; use polars_lazy::prelude::*; #[cfg(feature = "parquet")] use polars_parquet::write::StatisticsOptions; +use polars_plan::plans::ScanSources; use polars_utils::pl_str::PlSmallStr; use polars_utils::total_ord::{TotalEq, TotalHash}; use pyo3::basic::CompareOp; @@ -29,6 +31,7 @@ use pyo3::pybacked::PyBackedStr; use pyo3::types::{PyDict, PyList, PySequence}; use crate::error::PyPolarsErr; +use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; #[cfg(feature = "object")] use crate::object::OBJECT_NAME; use crate::prelude::*; @@ -528,6 +531,60 @@ impl<'py> FromPyObject<'py> for Wrap { } } +impl<'py> FromPyObject<'py> for Wrap { + fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult { + let list = ob.downcast::()?.to_owned(); + + if list.is_empty() { + return Ok(Wrap(ScanSources::default())); + } + + enum MutableSources { + Files(Vec), + Buffers(Vec), + } + + let num_items = list.len(); + let mut iter = list + .into_iter() + .map(|val| get_either_file_or_path(val.unbind(), false)); + + let Some(first) = iter.next() else { + return Ok(Wrap(ScanSources::default())); + }; + + let mut sources = match first? { + EitherPythonFileOrPath::Py(f) => { + let mut sources = Vec::with_capacity(num_items); + sources.push(f.as_bytes()); + MutableSources::Buffers(sources) + }, + EitherPythonFileOrPath::Path(path) => { + let mut sources = Vec::with_capacity(num_items); + sources.push(path); + MutableSources::Files(sources) + }, + }; + + for source in iter { + match (&mut sources, source?) 
{ + (MutableSources::Files(v), EitherPythonFileOrPath::Path(p)) => v.push(p), + (MutableSources::Buffers(v), EitherPythonFileOrPath::Py(f)) => v.push(f.as_bytes()), + _ => { + return Err(PyTypeError::new_err( + "Cannot combine in-memory bytes and paths for scan sources", + )) + }, + } + } + + Ok(Wrap(match sources { + MutableSources::Files(i) => ScanSources::Files(i.into()), + MutableSources::Buffers(i) => ScanSources::Buffers(i.into()), + })) + } +} + impl IntoPy for Wrap<&Schema> { fn into_py(self, py: Python<'_>) -> PyObject { let dict = PyDict::new_bound(py); diff --git a/crates/polars-python/src/file.rs b/crates/polars-python/src/file.rs index 6225ee5427f7..3d180e2bedf3 100644 --- a/crates/polars-python/src/file.rs +++ b/crates/polars-python/src/file.rs @@ -7,7 +7,6 @@ use std::io::{Cursor, ErrorKind, Read, Seek, SeekFrom, Write}; #[cfg(target_family = "unix")] use std::os::fd::{FromRawFd, RawFd}; use std::path::PathBuf; -use std::sync::Arc; use polars::io::mmap::MmapBytesReader; use polars_error::{polars_err, polars_warn}; @@ -32,7 +31,7 @@ impl PyFileLikeObject { PyFileLikeObject { inner: object } } - pub fn as_arc(&self) -> Arc<[u8]> { + pub fn as_bytes(&self) -> bytes::Bytes { self.as_file_buffer().into_inner().into() } @@ -252,7 +251,7 @@ pub fn get_either_file_or_path(py_f: PyObject, write: bool) -> PyResult PyResult<(EitherRustPythonFile, Option)> { @@ -366,7 +365,7 @@ fn get_either_file_and_path( /// # Arguments /// * `write` - open for writing; will truncate existing file and create new file if not. pub fn get_either_file(py_f: PyObject, write: bool) -> PyResult { - Ok(get_either_file_and_path(py_f, write)?.0) + Ok(get_either_buffer_or_path(py_f, write)?.0) } pub fn get_file_like(f: PyObject, truncate: bool) -> PyResult> { @@ -403,7 +402,7 @@ pub fn get_mmap_bytes_reader_and_path<'a>( } // string so read file else { - match get_either_file_and_path(py_f.to_object(py_f.py()), false)? { + match get_either_buffer_or_path(py_f.to_object(py_f.py()), false)? { (EitherRustPythonFile::Rust(f), path) => Ok((Box::new(f), path)), (EitherRustPythonFile::Py(f), path) => Ok((Box::new(f), path)), } diff --git a/crates/polars-python/src/lazyframe/general.rs b/crates/polars-python/src/lazyframe/general.rs index 40c69b260c40..30206d0c088b 100644 --- a/crates/polars-python/src/lazyframe/general.rs +++ b/crates/polars-python/src/lazyframe/general.rs @@ -7,7 +7,7 @@ use polars::time::*; use polars_core::prelude::*; #[cfg(feature = "parquet")] use polars_parquet::arrow::write::StatisticsOptions; -use pyo3::exceptions::PyValueError; +use polars_plan::plans::ScanSources; use pyo3::prelude::*; use pyo3::pybacked::PyBackedStr; use pyo3::types::{PyDict, PyList}; @@ -20,6 +20,18 @@ use crate::lazyframe::visit::NodeTraverser; use crate::prelude::*; use crate::{PyDataFrame, PyExpr, PyLazyGroupBy}; +fn pyobject_to_first_path_and_scan_sources( + obj: PyObject, +) -> PyResult<(Option, ScanSources)> { + use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; + Ok(match get_either_file_or_path(obj, false)? 
{ + EitherPythonFileOrPath::Path(path) => { + (Some(path.clone()), ScanSources::Files([path].into())) + }, + EitherPythonFileOrPath::Py(f) => (None, ScanSources::Buffers([f.as_bytes()].into())), + }) +} + #[pymethods] #[allow(clippy::should_implement_trait)] impl PyLazyFrame { @@ -27,12 +39,12 @@ impl PyLazyFrame { #[cfg(feature = "json")] #[allow(clippy::too_many_arguments)] #[pyo3(signature = ( - path, paths, infer_schema_length, schema, schema_overrides, batch_size, n_rows, low_memory, rechunk, + source, sources, infer_schema_length, schema, schema_overrides, batch_size, n_rows, low_memory, rechunk, row_index, ignore_errors, include_file_paths, cloud_options, retries, file_cache_ttl ))] fn new_from_ndjson( - path: Option, - paths: Vec, + source: Option, + sources: Wrap, infer_schema_length: Option, schema: Option>, schema_overrides: Option>, @@ -52,35 +64,14 @@ impl PyLazyFrame { offset, }); - use std::path::Path; - - use polars_plan::plans::ScanSources; - use EitherPythonFileOrPath as EF; - - use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; - let (first_path, mut r) = match path - .map(|py_f| get_either_file_or_path(py_f, false)) - .transpose()? - { - Some(EF::Path(path)) => { - let reader = LazyJsonLineReader::new(>::as_ref(&path)); - (Some(path), reader) - }, - Some(EF::Py(f)) => ( - None, - LazyJsonLineReader::new_sourced(ScanSources::Buffers([f.as_arc()].into())), - ), - None => ( - Some( - paths - .first() - .cloned() - .ok_or_else(|| PyValueError::new_err("expected a path argument"))?, - ), - LazyJsonLineReader::new_paths(paths.into()), - ), + let sources = sources.0; + let (first_path, sources) = match source { + None => (sources.first_path().map(|p| p.to_path_buf()), sources), + Some(source) => pyobject_to_first_path_and_scan_sources(source)?, }; + let mut r = LazyJsonLineReader::new_sourced(sources); + #[cfg(feature = "cloud")] if let Some(first_path) = first_path { let first_path_url = first_path.to_string_lossy(); @@ -115,7 +106,7 @@ impl PyLazyFrame { #[staticmethod] #[cfg(feature = "csv")] - #[pyo3(signature = (path, paths, separator, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, + #[pyo3(signature = (source, sources, separator, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, low_memory, comment_prefix, quote_char, null_values, missing_utf8_is_empty_string, infer_schema_length, with_schema_modify, rechunk, skip_rows_after_header, encoding, row_index, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines, decimal_comma, glob, schema, @@ -123,8 +114,8 @@ impl PyLazyFrame { ) )] fn new_from_csv( - path: Option, - paths: Vec, + source: Option, + sources: Wrap, separator: &str, has_header: bool, ignore_errors: bool, @@ -155,10 +146,6 @@ impl PyLazyFrame { file_cache_ttl: Option, include_file_paths: Option, ) -> PyResult { - use std::path::Path; - - use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; - let null_values = null_values.map(|w| w.0); let quote_char = quote_char.map(|s| s.as_bytes()[0]); let separator = separator.as_bytes()[0]; @@ -175,31 +162,14 @@ impl PyLazyFrame { .collect::() }); - use polars_plan::plans::ScanSources; - use EitherPythonFileOrPath as EF; - let (first_path, mut r) = match path - .map(|py_f| get_either_file_or_path(py_f, false)) - .transpose()? 
- { - Some(EF::Path(path)) => { - let reader = LazyCsvReader::new(>::as_ref(&path)); - (Some(path), reader) - }, - Some(EF::Py(f)) => ( - None, - LazyCsvReader::new_sourced(ScanSources::Buffers([f.as_arc()].into())), - ), - None => ( - Some( - paths - .first() - .cloned() - .ok_or_else(|| PyValueError::new_err("expected a path argument"))?, - ), - LazyCsvReader::new_paths(paths.into()), - ), + let sources = sources.0; + let (first_path, sources) = match source { + None => (sources.first_path().map(|p| p.to_path_buf()), sources), + Some(source) => pyobject_to_first_path_and_scan_sources(source)?, }; + let mut r = LazyCsvReader::new_sourced(sources); + #[cfg(feature = "cloud")] if let Some(first_path) = first_path { let first_path_url = first_path.to_string_lossy(); @@ -268,12 +238,12 @@ impl PyLazyFrame { #[cfg(feature = "parquet")] #[staticmethod] - #[pyo3(signature = (path, paths, n_rows, cache, parallel, rechunk, row_index, + #[pyo3(signature = (source, sources, n_rows, cache, parallel, rechunk, row_index, low_memory, cloud_options, use_statistics, hive_partitioning, hive_schema, try_parse_hive_dates, retries, glob, include_file_paths) )] fn new_from_parquet( - path: Option, - paths: Vec, + source: Option, + sources: Wrap, n_rows: Option, cache: bool, parallel: Wrap, @@ -289,8 +259,6 @@ impl PyLazyFrame { glob: bool, include_file_paths: Option, ) -> PyResult { - use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; - let parallel = parallel.0; let hive_schema = hive_schema.map(|s| Arc::new(s.0)); @@ -320,52 +288,31 @@ impl PyLazyFrame { include_file_paths: include_file_paths.map(|x| x.into()), }; - use polars_plan::plans::ScanSources; - use EitherPythonFileOrPath as EF; - let use_first_path = path.is_some(); - let first_path = match path - .map(|py_f| get_either_file_or_path(py_f, false)) - .transpose()? 
- { - Some(EF::Path(path)) => path, - Some(EF::Py(f)) => { - return LazyFrame::scan_parquet_sourced( - ScanSources::Buffers([f.as_arc()].into()), - args, - ) - .map(Self::from) - .map_err(PyPolarsErr::from) - .map_err(From::from); - }, - None => paths - .first() - .cloned() - .ok_or_else(|| PyValueError::new_err("expected a path argument"))?, + let sources = sources.0; + let (first_path, sources) = match source { + None => (sources.first_path().map(|p| p.to_path_buf()), sources), + Some(source) => pyobject_to_first_path_and_scan_sources(source)?, }; #[cfg(feature = "cloud")] - { + if let Some(first_path) = first_path { let first_path_url = first_path.to_string_lossy(); let cloud_options = parse_cloud_options(&first_path_url, cloud_options.unwrap_or_default())?; args.cloud_options = Some(cloud_options.with_max_retries(retries)); } - let lf = if use_first_path { - LazyFrame::scan_parquet(first_path, args) - } else { - LazyFrame::scan_parquet_files(Arc::from(paths), args) - } - .map_err(PyPolarsErr::from)?; + let lf = LazyFrame::scan_parquet_sourced(sources, args).map_err(PyPolarsErr::from)?; + Ok(lf.into()) } #[cfg(feature = "ipc")] #[staticmethod] - #[pyo3(signature = (path, paths, n_rows, cache, rechunk, row_index, memory_map, cloud_options, hive_partitioning, hive_schema, try_parse_hive_dates, retries, file_cache_ttl, include_file_paths))] + #[pyo3(signature = (source, sources, n_rows, cache, rechunk, row_index, memory_map, cloud_options, hive_partitioning, hive_schema, try_parse_hive_dates, retries, file_cache_ttl, include_file_paths))] fn new_from_ipc( - path: Option, - paths: Vec, + source: Option, + sources: Wrap, n_rows: Option, cache: bool, rechunk: bool, @@ -379,8 +326,6 @@ impl PyLazyFrame { file_cache_ttl: Option, include_file_paths: Option, ) -> PyResult { - use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; - let row_index = row_index.map(|(name, offset)| RowIndex { name: name.into(), offset, @@ -405,31 +350,14 @@ impl PyLazyFrame { include_file_paths: include_file_paths.map(|x| x.into()), }; - use polars_plan::plans::ScanSources; - use EitherPythonFileOrPath as EF; - let use_first_path = path.is_some(); - let first_path = match path - .map(|py_f| get_either_file_or_path(py_f, false)) - .transpose()? 
- { - Some(EF::Path(path)) => path, - Some(EF::Py(f)) => { - return LazyFrame::scan_ipc_sourced( - ScanSources::Buffers([f.as_arc()].into()), - args, - ) - .map(Self::from) - .map_err(PyPolarsErr::from) - .map_err(From::from); - }, - None => paths - .first() - .cloned() - .ok_or_else(|| PyValueError::new_err("expected a path argument"))?, + let sources = sources.0; + let (first_path, sources) = match source { + None => (sources.first_path().map(|p| p.to_path_buf()), sources), + Some(source) => pyobject_to_first_path_and_scan_sources(source)?, }; #[cfg(feature = "cloud")] - { + if let Some(first_path) = first_path { let first_path_url = first_path.to_string_lossy(); let mut cloud_options = @@ -440,12 +368,7 @@ impl PyLazyFrame { args.cloud_options = Some(cloud_options.with_max_retries(retries)); } - let lf = if use_first_path { - LazyFrame::scan_ipc(first_path, args) - } else { - LazyFrame::scan_ipc_files(paths.into(), args) - } - .map_err(PyPolarsErr::from)?; + let lf = LazyFrame::scan_ipc_sourced(sources, args).map_err(PyPolarsErr::from)?; Ok(lf.into()) } diff --git a/crates/polars-utils/src/mmap.rs b/crates/polars-utils/src/mmap.rs index d8db6d0ae671..c753525b43ee 100644 --- a/crates/polars-utils/src/mmap.rs +++ b/crates/polars-utils/src/mmap.rs @@ -46,6 +46,13 @@ mod private { } } + impl AsRef<[u8]> for MemSlice { + #[inline(always)] + fn as_ref(&self) -> &[u8] { + self.slice + } + } + impl Default for MemSlice { fn default() -> Self { Self::from_bytes(bytes::Bytes::new()) diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index b7b5c4764845..77cd73e0aa5f 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -1234,6 +1234,8 @@ def with_column_names(cols: list[str]) -> list[str]: source = normalize_filepath(source, check_not_directory=False) elif isinstance(source, (IO, BytesIO)): pass + elif isinstance(source, list) and isinstance(source[0], BytesIO): + pass else: source = [ normalize_filepath(source, check_not_directory=False) for source in source @@ -1331,8 +1333,8 @@ def _scan_csv_impl( storage_options = None pylf = PyLazyFrame.new_from_csv( - path=source, - paths=sources, + source, + sources, separator=separator, has_header=has_header, ignore_errors=ignore_errors, diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index 8f3c21bdf286..8c0138df2a36 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -347,7 +347,7 @@ def read_ipc_schema(source: str | Path | IO[bytes] | bytes) -> dict[str, DataTyp @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_ipc( - source: str | Path | list[str] | list[Path] | IO[str] | IO[bytes], + source: str | Path | IO[bytes] | list[str] | list[Path] | list[IO[bytes]], *, n_rows: int | None = None, cache: bool = True, @@ -430,8 +430,11 @@ def scan_ipc( if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) sources = [] - elif isinstance(source, (IO, BytesIO)): + elif isinstance(source, BytesIO): sources = [] + elif isinstance(source, list) and isinstance(source[0], BytesIO): + sources = source + source = None # type: ignore[assignment] else: sources = [ normalize_filepath(source, check_not_directory=False) for source in source diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index 5482ccc52c42..63032b5dc688 
100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -166,7 +166,7 @@ def read_ndjson( @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_ndjson( - source: str | Path | list[str] | list[Path] | IO[str] | IO[bytes], + source: str | Path | IO[str] | IO[bytes] | list[str] | list[Path] | list[IO[str]] | list[IO[bytes]], *, schema: SchemaDefinition | None = None, schema_overrides: SchemaDefinition | None = None, @@ -250,8 +250,11 @@ def scan_ndjson( if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) sources = [] - elif isinstance(source, (IO, BytesIO)): + elif isinstance(source, BytesIO): sources = [] + elif isinstance(source, list) and isinstance(source[0], BytesIO): + sources = source + source = None # type: ignore[assignment] else: sources = [ normalize_filepath(source, check_not_directory=False) for source in source @@ -268,8 +271,8 @@ def scan_ndjson( storage_options = None pylf = PyLazyFrame.new_from_ndjson( - path=source, - paths=sources, + source, + sources, infer_schema_length=infer_schema_length, schema=schema, schema_overrides=schema_overrides, diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 253af8042b84..583b8fddf326 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -295,7 +295,7 @@ def read_parquet_schema(source: str | Path | IO[bytes] | bytes) -> dict[str, Dat @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_parquet( - source: str | Path | list[str] | list[Path] | IO[str] | IO[bytes], + source: str | Path | IO[bytes] | list[str] | list[Path] | list[IO[bytes]], *, n_rows: int | None = None, row_index_name: str | None = None, @@ -422,8 +422,8 @@ def scan_parquet( if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - elif isinstance(source, (IO, io.BytesIO)): - sources = [] + elif isinstance(source, io.BytesIO) or (isinstance(source, list) and isinstance(source[0], io.BytesIO)): + pass else: source = [ normalize_filepath(source, check_not_directory=False) for source in source diff --git a/py-polars/tests/unit/io/test_scan.py b/py-polars/tests/unit/io/test_scan.py index 1bcc463bd2e7..cb33344a1fce 100644 --- a/py-polars/tests/unit/io/test_scan.py +++ b/py-polars/tests/unit/io/test_scan.py @@ -9,6 +9,7 @@ import pytest import polars as pl +import io from polars.testing.asserts.frame import assert_frame_equal if TYPE_CHECKING: @@ -690,3 +691,47 @@ def test_async_path_expansion_bracket_17629(tmp_path: Path) -> None: df.write_parquet(path) assert_frame_equal(pl.scan_parquet(tmp_path / "[d]ata.parquet").collect(), df) + + +@pytest.mark.parametrize( + "method", + ["parquet", "csv", "ipc", "ndjson"], +) +def test_scan_in_memory(method: str) -> None: + f = io.BytesIO() + df = pl.DataFrame({ + 'a': [1, 2, 3], + 'b': ['x', 'y', 'z'], + }) + + (getattr(df, f'write_{method}'))(f) + + f.seek(0) + result = (getattr(pl, f'scan_{method}'))(f).collect() + assert_frame_equal(df, result) + + f.seek(0) + result = (getattr(pl, f'scan_{method}'))(f).slice(1, 2).collect() + assert_frame_equal(df.slice(1, 2), result) + + f.seek(0) + result = (getattr(pl, f'scan_{method}'))(f).slice(-1, 1).collect() + 
assert_frame_equal(df.slice(-1, 1), result) + + g = io.BytesIO() + (getattr(df, f'write_{method}'))(g) + + f.seek(0) + g.seek(0) + result = (getattr(pl, f'scan_{method}'))([f, g]).collect() + assert_frame_equal(df.vstack(df), result) + + f.seek(0) + g.seek(0) + result = (getattr(pl, f'scan_{method}'))([f, g]).slice(1, 2).collect() + assert_frame_equal(df.vstack(df).slice(1, 2), result) + + f.seek(0) + g.seek(0) + result = (getattr(pl, f'scan_{method}'))([f, g]).slice(-1, 1).collect() + assert_frame_equal(df.vstack(df).slice(-1, 1), result) From 278e1a7269b3427a684fd7785f2bd12869a5bdbc Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Thu, 5 Sep 2024 16:41:26 +0200 Subject: [PATCH 06/27] no more failing tests --- crates/polars-lazy/src/scan/ndjson.rs | 9 +- .../src/executors/scan/parquet.rs | 99 ++++++++++++---- .../src/parquet/encoding/uleb128.rs | 1 + .../src/plans/conversion/dsl_to_ir.rs | 10 +- crates/polars-plan/src/plans/ir/format.rs | 2 +- crates/polars-plan/src/plans/ir/mod.rs | 111 +----------------- py-polars/polars/io/csv/functions.py | 8 +- py-polars/polars/io/ndjson.py | 9 +- py-polars/polars/io/parquet/functions.py | 4 +- py-polars/tests/unit/io/test_parquet.py | 6 +- py-polars/tests/unit/io/test_scan.py | 28 +++-- 11 files changed, 122 insertions(+), 165 deletions(-) diff --git a/crates/polars-lazy/src/scan/ndjson.rs b/crates/polars-lazy/src/scan/ndjson.rs index 8d71d9a585a2..6d0492e170dc 100644 --- a/crates/polars-lazy/src/scan/ndjson.rs +++ b/crates/polars-lazy/src/scan/ndjson.rs @@ -128,10 +128,11 @@ impl LazyFileListReader for LazyJsonLineReader { row_index: self.row_index, rechunk: self.rechunk, file_counter: 0, - hive_options: { - let mut options = HiveOptions::default(); - options.enabled = Some(false); - options + hive_options: HiveOptions { + enabled: Some(false), + hive_start_idx: 0, + schema: None, + try_parse_dates: true, }, glob: true, include_file_paths: self.include_file_paths, diff --git a/crates/polars-mem-engine/src/executors/scan/parquet.rs b/crates/polars-mem-engine/src/executors/scan/parquet.rs index 509ea7ba8c55..7de67aff4284 100644 --- a/crates/polars-mem-engine/src/executors/scan/parquet.rs +++ b/crates/polars-mem-engine/src/executors/scan/parquet.rs @@ -60,40 +60,94 @@ impl ParquetExec { let mut result = vec![]; let step = std::cmp::min(POOL.current_num_threads(), 128); - let slice_info = match self.file_options.slice { - None => ScanSourceSliceInfo { - item_slice: 0..usize::MAX, - source_slice: 0..self.sources.len(), - }, - Some(slice) => { - self.sources - .collect_slice_information(slice, |source| match source { - ScanSourceRef::File(path) => { - ParquetReader::new(std::fs::File::open(path)?).num_rows() - }, - ScanSourceRef::Buffer(buff) => { - ParquetReader::new(std::io::Cursor::new(buff)).num_rows() - }, - })? - }, + // Modified if we have a negative slice + let mut first_source = 0; + + // (offset, end) + let (slice_offset, slice_end) = if let Some(slice) = self.file_options.slice { + if slice.0 >= 0 { + (slice.0 as usize, slice.1.saturating_add(slice.0 as usize)) + } else { + // Walk the files in reverse until we find the first file, and then translate the + // slice into a positive-offset equivalent. 
+ let slice_start_as_n_from_end = -slice.0 as usize; + let mut cum_rows = 0; + let chunk_size = 8; + POOL.install(|| { + for path_indexes in (0..self.sources.len()) + .rev() + .collect::>() + .chunks(chunk_size) + { + let row_counts = path_indexes + .into_par_iter() + .map(|&i| { + let memslice = match self.sources.at(i) { + ScanSourceRef::File(path) => { + let file = std::fs::File::open(path)?; + MemSlice::from_mmap(Arc::new(unsafe { + memmap::Mmap::map(&file).unwrap() + })) + }, + ScanSourceRef::Buffer(buff) => { + MemSlice::from_bytes(buff.clone()) + }, + }; + + ParquetReader::new(std::io::Cursor::new(memslice)).num_rows() + }) + .collect::>>()?; + + for (path_idx, rc) in path_indexes.iter().zip(row_counts) { + cum_rows += rc; + + if cum_rows >= slice_start_as_n_from_end { + first_source = *path_idx; + break; + } + } + + if first_source > 0 { + break; + } + } + + PolarsResult::Ok(()) + })?; + + let (start, len) = if slice_start_as_n_from_end > cum_rows { + // We need to trim the slice, e.g. SLICE[offset: -100, len: 75] on a file of 50 + // rows should only give the first 25 rows. + let first_file_position = slice_start_as_n_from_end - cum_rows; + (0, slice.1.saturating_sub(first_file_position)) + } else { + (cum_rows - slice_start_as_n_from_end, slice.1) + }; + + let end = start.saturating_add(len); + + (start, end) + } + } else { + (0, usize::MAX) }; let mut current_offset = 0; let base_row_index = self.file_options.row_index.take(); // Limit no. of files at a time to prevent open file limits. - for i in slice_info.source_slice.step_by(step) { + for i in (first_source..self.sources.len()).step_by(step) { let end = std::cmp::min(i.saturating_add(step), self.sources.len()); let hive_parts = self.hive_parts.as_ref().map(|x| &x[i..end]); - if current_offset >= slice_info.item_slice.end && !result.is_empty() { + if current_offset >= slice_end && !result.is_empty() { return Ok(result); } // First initialize the readers, predicates and metadata. // This will be used to determine the slices. That way we can actually read all the // files in parallel even if we add row index columns or slices. - let iter = (0..self.sources.len()).into_par_iter().map(|i| { + let iter = (i..end).into_par_iter().map(|i| { let source = self.sources.at(i); let hive_partitions = hive_parts.map(|x| x[i].materialize_partition_columns()); @@ -141,12 +195,7 @@ impl ParquetExec { let cum_rows = *current_offset_ref; ( cum_rows, - split_slice_at_file( - current_offset_ref, - *num_rows, - slice_info.item_slice.start, - slice_info.item_slice.end, - ), + split_slice_at_file(current_offset_ref, *num_rows, slice_offset, slice_end), ) }) .collect::>(); diff --git a/crates/polars-parquet/src/parquet/encoding/uleb128.rs b/crates/polars-parquet/src/parquet/encoding/uleb128.rs index 08459233961c..0740c9575a15 100644 --- a/crates/polars-parquet/src/parquet/encoding/uleb128.rs +++ b/crates/polars-parquet/src/parquet/encoding/uleb128.rs @@ -1,5 +1,6 @@ // Reads an uleb128 encoded integer with at most 56 bits (8 bytes with 7 bits worth of payload each). /// Returns the integer and the number of bytes that made up this integer. +/// /// If the returned length is bigger than 8 this means the integer required more than 8 bytes and the remaining bytes need to be read sequentially and combined with the return value. 
/// /// # Safety diff --git a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs index 7966d6ff688e..1bf06322f090 100644 --- a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs +++ b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs @@ -197,7 +197,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult let mut owned = None; hive_partitions_from_paths( - &sources.as_paths(), + sources.as_paths(), file_options.hive_options.hive_start_idx, file_options.hive_options.schema.clone(), match resolved_file_info.reader_schema.as_ref().unwrap() { @@ -830,19 +830,19 @@ impl DslScanSources { let expanded_sources = match &scan_type { #[cfg(feature = "parquet")] FileScan::Parquet { cloud_options, .. } => { - expand_scan_paths_with_hive_update(&paths, file_options, cloud_options)? + expand_scan_paths_with_hive_update(paths, file_options, cloud_options)? }, #[cfg(feature = "ipc")] FileScan::Ipc { cloud_options, .. } => { - expand_scan_paths_with_hive_update(&paths, file_options, cloud_options)? + expand_scan_paths_with_hive_update(paths, file_options, cloud_options)? }, #[cfg(feature = "csv")] FileScan::Csv { cloud_options, .. } => { - expand_paths(&paths, file_options.glob, cloud_options.as_ref())? + expand_paths(paths, file_options.glob, cloud_options.as_ref())? }, #[cfg(feature = "json")] FileScan::NDJson { cloud_options, .. } => { - expand_paths(&paths, file_options.glob, cloud_options.as_ref())? + expand_paths(paths, file_options.glob, cloud_options.as_ref())? }, FileScan::Anonymous { .. } => unreachable!(), // Invariant: Anonymous scans are already expanded. }; diff --git a/crates/polars-plan/src/plans/ir/format.rs b/crates/polars-plan/src/plans/ir/format.rs index a69eb5203359..76de9f3beb24 100644 --- a/crates/polars-plan/src/plans/ir/format.rs +++ b/crates/polars-plan/src/plans/ir/format.rs @@ -243,7 +243,7 @@ impl<'a> IRDisplay<'a> { write_scan( f, scan_type.into(), - &sources, + sources, indent, n_columns, file_info.schema.len(), diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index 95a7a5aaf374..328efce28be9 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -13,7 +13,6 @@ pub use format::{ExprIRDisplay, IRDisplay}; use hive::HivePartitions; use polars_core::error::feature_gated; use polars_core::prelude::*; -use polars_core::POOL; use polars_io::file_cache::FileCacheEntry; use polars_utils::idx_vec::UnitVec; use polars_utils::mmap::MemSlice; @@ -103,7 +102,7 @@ impl ScanSources { } pub fn as_paths(&self) -> &[PathBuf] { match self { - Self::Files(paths) => &paths, + Self::Files(paths) => paths, Self::Buffers(_) => unimplemented!(), } } @@ -138,7 +137,7 @@ impl ScanSources { pub fn is_cloud_url(&self) -> bool { match self { - Self::Files(paths) => paths.first().map_or(false, |p| polars_io::is_cloud_url(p)), + Self::Files(paths) => paths.first().map_or(false, polars_io::is_cloud_url), Self::Buffers(_) => false, } } @@ -171,114 +170,10 @@ impl ScanSources { } } - /// Normalize the slice and collect information as to what rows and parts of the source are - /// used in this slice. 
- pub fn collect_slice_information( - &self, - slice: (i64, usize), - map_to_num_rows: impl Fn(ScanSourceRef) -> PolarsResult + Send + Sync, - ) -> PolarsResult { - fn slice_to_start_end( - offset: i64, - length: usize, - num_rows: usize, - ) -> std::ops::Range { - if offset < 0 { - let slice_start_as_n_from_end = -offset as usize; - let (start, len) = if slice_start_as_n_from_end > num_rows { - // We need to trim the slice, e.g. SLICE[offset: -100, len: 75] on a file of 50 - // rows should only give the first 25 rows. - let start_position = slice_start_as_n_from_end - num_rows; - (0, length.saturating_sub(start_position)) - } else { - (num_rows - slice_start_as_n_from_end, length) - }; - - let end = start.saturating_add(len); - - start..end - } else { - let offset = offset as usize; - offset.min(num_rows)..(offset + length).min(num_rows) - } - } - - let (offset, length) = slice; - - if self.is_empty() { - return Ok(ScanSourceSliceInfo { - item_slice: 0..0, - source_slice: 0..0, - }); - } - - if self.len() == 1 { - let num_rows = map_to_num_rows(self.get(0).unwrap())?; - let item_slice = slice_to_start_end(offset, length, num_rows); - let source_slice = if item_slice.is_empty() { 0..0 } else { 0..1 }; - - Ok(ScanSourceSliceInfo { - item_slice, - source_slice, - }) - } else { - use rayon::prelude::*; - - // Walk the files in reverse until we find the first file, and then translate the - // slice into a positive-offset equivalent. - const CHUNK_SIZE: usize = 8; - let mut row_counts = Vec::with_capacity(self.len()); - - POOL.install(|| { - for idx_end in (0..self.len()).step_by(CHUNK_SIZE) { - let idx_start = idx_end.saturating_sub(CHUNK_SIZE); - - row_counts.extend( - (idx_start..=idx_end) - .into_par_iter() - .map(|i| map_to_num_rows(self.at(i))) - .collect::>>()? 
- .into_iter() - .rev(), - ); - } - - PolarsResult::Ok(()) - })?; - - let num_rows = row_counts.iter().sum::(); - - let item_slice = slice_to_start_end(offset, length, num_rows); - - let mut source_start = self.len() - 1; - let mut source_end = 0; - - let mut sum = 0; - for (i, row_count) in row_counts.iter().rev().enumerate() { - if sum < item_slice.end { - source_end = usize::max(source_end, i); - } - - sum += row_count; - - if sum >= item_slice.start { - source_start = usize::min(source_start, i); - } - } - - let source_slice = source_start..source_end + 1; - - Ok(ScanSourceSliceInfo { - item_slice, - source_slice, - }) - } - } - pub fn get(&self, idx: usize) -> Option { match self { ScanSources::Files(paths) => paths.get(idx).map(|p| ScanSourceRef::File(p)), - ScanSources::Buffers(buffers) => buffers.get(idx).map(|b| ScanSourceRef::Buffer(b)), + ScanSources::Buffers(buffers) => buffers.get(idx).map(ScanSourceRef::Buffer), } } diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 77cd73e0aa5f..257522831cd3 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -1232,9 +1232,11 @@ def with_column_names(cols: list[str]) -> list[str]: if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - elif isinstance(source, (IO, BytesIO)): - pass - elif isinstance(source, list) and isinstance(source[0], BytesIO): + elif ( + isinstance(source, (IO, BytesIO)) + or isinstance(source, list) + and isinstance(source[0], BytesIO) + ): pass else: source = [ diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index 63032b5dc688..166e990ba25d 100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -166,7 +166,14 @@ def read_ndjson( @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_ndjson( - source: str | Path | IO[str] | IO[bytes] | list[str] | list[Path] | list[IO[str]] | list[IO[bytes]], + source: str + | Path + | IO[str] + | IO[bytes] + | list[str] + | list[Path] + | list[IO[str]] + | list[IO[bytes]], *, schema: SchemaDefinition | None = None, schema_overrides: SchemaDefinition | None = None, diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 583b8fddf326..2eda346e7c26 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -422,7 +422,9 @@ def scan_parquet( if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - elif isinstance(source, io.BytesIO) or (isinstance(source, list) and isinstance(source[0], io.BytesIO)): + elif isinstance(source, io.BytesIO) or ( + isinstance(source, list) and isinstance(source[0], io.BytesIO) + ): pass else: source = [ diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index b46f21f3893e..f57d8bbf5b38 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -12,7 +12,7 @@ import pyarrow.dataset as ds import pyarrow.parquet as pq import pytest -from hypothesis import HealthCheck, given, settings +from hypothesis import given from hypothesis import strategies as st import polars as pl @@ -1559,9 +1559,7 @@ def test_predicate_filtering( offset=st.integers(0, 100), length=st.integers(0, 100), ) -def test_slice_roundtrip( - df: pl.DataFrame, offset: int, 
length: int -) -> None: +def test_slice_roundtrip(df: pl.DataFrame, offset: int, length: int) -> None: offset %= df.height + 1 length %= df.height - offset + 1 diff --git a/py-polars/tests/unit/io/test_scan.py b/py-polars/tests/unit/io/test_scan.py index cb33344a1fce..a254daaeaa12 100644 --- a/py-polars/tests/unit/io/test_scan.py +++ b/py-polars/tests/unit/io/test_scan.py @@ -1,5 +1,6 @@ from __future__ import annotations +import io from dataclasses import dataclass from functools import partial from math import ceil @@ -9,7 +10,6 @@ import pytest import polars as pl -import io from polars.testing.asserts.frame import assert_frame_equal if TYPE_CHECKING: @@ -699,39 +699,41 @@ def test_async_path_expansion_bracket_17629(tmp_path: Path) -> None: ) def test_scan_in_memory(method: str) -> None: f = io.BytesIO() - df = pl.DataFrame({ - 'a': [1, 2, 3], - 'b': ['x', 'y', 'z'], - }) + df = pl.DataFrame( + { + "a": [1, 2, 3], + "b": ["x", "y", "z"], + } + ) - (getattr(df, f'write_{method}'))(f) + (getattr(df, f"write_{method}"))(f) f.seek(0) - result = (getattr(pl, f'scan_{method}'))(f).collect() + result = (getattr(pl, f"scan_{method}"))(f).collect() assert_frame_equal(df, result) f.seek(0) - result = (getattr(pl, f'scan_{method}'))(f).slice(1, 2).collect() + result = (getattr(pl, f"scan_{method}"))(f).slice(1, 2).collect() assert_frame_equal(df.slice(1, 2), result) f.seek(0) - result = (getattr(pl, f'scan_{method}'))(f).slice(-1, 1).collect() + result = (getattr(pl, f"scan_{method}"))(f).slice(-1, 1).collect() assert_frame_equal(df.slice(-1, 1), result) g = io.BytesIO() - (getattr(df, f'write_{method}'))(g) + (getattr(df, f"write_{method}"))(g) f.seek(0) g.seek(0) - result = (getattr(pl, f'scan_{method}'))([f, g]).collect() + result = (getattr(pl, f"scan_{method}"))([f, g]).collect() assert_frame_equal(df.vstack(df), result) f.seek(0) g.seek(0) - result = (getattr(pl, f'scan_{method}'))([f, g]).slice(1, 2).collect() + result = (getattr(pl, f"scan_{method}"))([f, g]).slice(1, 2).collect() assert_frame_equal(df.vstack(df).slice(1, 2), result) f.seek(0) g.seek(0) - result = (getattr(pl, f'scan_{method}'))([f, g]).slice(-1, 1).collect() + result = (getattr(pl, f"scan_{method}"))([f, g]).slice(-1, 1).collect() assert_frame_equal(df.vstack(df).slice(-1, 1), result) From 88e76511f592067b22fe1cbdf29ef92141dbd5b2 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Thu, 5 Sep 2024 17:19:00 +0200 Subject: [PATCH 07/27] minor fixes --- crates/polars-io/src/ipc/ipc_file.rs | 1 + crates/polars-lazy/src/scan/csv.rs | 14 ++--- .../polars-lazy/src/scan/file_list_reader.rs | 3 +- crates/polars-lazy/src/scan/ipc.rs | 11 ++-- crates/polars-lazy/src/scan/ndjson.rs | 7 ++- crates/polars-lazy/src/scan/parquet.rs | 11 ++-- .../src/executors/scan/ipc.rs | 58 ++++++++----------- crates/polars-python/src/lazyframe/general.rs | 8 +-- py-polars/polars/io/csv/functions.py | 21 +++++-- py-polars/polars/io/ndjson.py | 4 +- 10 files changed, 72 insertions(+), 66 deletions(-) diff --git a/crates/polars-io/src/ipc/ipc_file.rs b/crates/polars-io/src/ipc/ipc_file.rs index aa6546c8dd5a..9347a453b426 100644 --- a/crates/polars-io/src/ipc/ipc_file.rs +++ b/crates/polars-io/src/ipc/ipc_file.rs @@ -52,6 +52,7 @@ use crate::RowIndex; #[derive(Clone, Debug, PartialEq, Hash)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct IpcScanOptions { + /// Not used anymore. 
pub memory_map: bool, } diff --git a/crates/polars-lazy/src/scan/csv.rs b/crates/polars-lazy/src/scan/csv.rs index a8687aba3b8b..83e34cff0fe5 100644 --- a/crates/polars-lazy/src/scan/csv.rs +++ b/crates/polars-lazy/src/scan/csv.rs @@ -32,16 +32,12 @@ impl LazyCsvReader { } pub fn new_paths(paths: Arc<[PathBuf]>) -> Self { - Self::new("").with_paths(paths) + Self::new_with_sources(ScanSources::Files(paths)) } - pub fn new_sourced(sources: ScanSources) -> Self { - Self::new("").with_sources(sources) - } - - pub fn new(path: impl AsRef) -> Self { + pub fn new_with_sources(sources: ScanSources) -> Self { LazyCsvReader { - sources: ScanSources::Files([path.as_ref().to_path_buf()].into()), + sources, glob: true, cache: true, read_options: Default::default(), @@ -50,6 +46,10 @@ impl LazyCsvReader { } } + pub fn new(path: impl AsRef) -> Self { + Self::new_with_sources(ScanSources::Files([path.as_ref().to_path_buf()].into())) + } + /// Skip this number of rows after the header location. #[must_use] pub fn with_skip_rows_after_header(mut self, offset: usize) -> Self { diff --git a/crates/polars-lazy/src/scan/file_list_reader.rs b/crates/polars-lazy/src/scan/file_list_reader.rs index b25cec6eda3b..2c8c9d86dd33 100644 --- a/crates/polars-lazy/src/scan/file_list_reader.rs +++ b/crates/polars-lazy/src/scan/file_list_reader.rs @@ -83,9 +83,10 @@ pub trait LazyFileListReader: Clone { true } + /// Get the sources for this reader. fn sources(&self) -> &ScanSources; - /// Set paths of the scanned files. + /// Set sources of the scanned files. #[must_use] fn with_sources(self, source: ScanSources) -> Self; diff --git a/crates/polars-lazy/src/scan/ipc.rs b/crates/polars-lazy/src/scan/ipc.rs index fa11ef8e4455..e70434e39d8d 100644 --- a/crates/polars-lazy/src/scan/ipc.rs +++ b/crates/polars-lazy/src/scan/ipc.rs @@ -124,16 +124,17 @@ impl LazyFileListReader for LazyIpcReader { impl LazyFrame { /// Create a LazyFrame directly from a ipc scan. pub fn scan_ipc(path: impl AsRef, args: ScanArgsIpc) -> PolarsResult { - LazyIpcReader::new(args) - .with_paths([path.as_ref().to_path_buf()].into()) - .finish() + Self::scan_ipc_sources( + ScanSources::Files([path.as_ref().to_path_buf()].into()), + args, + ) } pub fn scan_ipc_files(paths: Arc<[PathBuf]>, args: ScanArgsIpc) -> PolarsResult { - LazyIpcReader::new(args).with_paths(paths).finish() + Self::scan_ipc_sources(ScanSources::Files(paths), args) } - pub fn scan_ipc_sourced(sources: ScanSources, args: ScanArgsIpc) -> PolarsResult { + pub fn scan_ipc_sources(sources: ScanSources, args: ScanArgsIpc) -> PolarsResult { LazyIpcReader::new(args).with_sources(sources).finish() } } diff --git a/crates/polars-lazy/src/scan/ndjson.rs b/crates/polars-lazy/src/scan/ndjson.rs index 6d0492e170dc..195f8e0372a3 100644 --- a/crates/polars-lazy/src/scan/ndjson.rs +++ b/crates/polars-lazy/src/scan/ndjson.rs @@ -29,10 +29,10 @@ pub struct LazyJsonLineReader { impl LazyJsonLineReader { pub fn new_paths(paths: Arc<[PathBuf]>) -> Self { - Self::new_sourced(ScanSources::Files(paths)) + Self::new_with_sources(ScanSources::Files(paths)) } - pub fn new_sourced(sources: ScanSources) -> Self { + pub fn new_with_sources(sources: ScanSources) -> Self { LazyJsonLineReader { sources, batch_size: None, @@ -50,8 +50,9 @@ impl LazyJsonLineReader { } pub fn new(path: impl AsRef) -> Self { - Self::new_sourced(ScanSources::Files([path.as_ref().to_path_buf()].into())) + Self::new_with_sources(ScanSources::Files([path.as_ref().to_path_buf()].into())) } + /// Add a row index column. 
#[must_use] pub fn with_row_index(mut self, row_index: Option) -> Self { diff --git a/crates/polars-lazy/src/scan/parquet.rs b/crates/polars-lazy/src/scan/parquet.rs index c198ccf690c1..ff4f9a73ec78 100644 --- a/crates/polars-lazy/src/scan/parquet.rs +++ b/crates/polars-lazy/src/scan/parquet.rs @@ -139,18 +139,19 @@ impl LazyFileListReader for LazyParquetReader { impl LazyFrame { /// Create a LazyFrame directly from a parquet scan. pub fn scan_parquet(path: impl AsRef, args: ScanArgsParquet) -> PolarsResult { - LazyParquetReader::new(args) - .with_paths(vec![path.as_ref().to_path_buf()].into()) - .finish() + Self::scan_parquet_sources( + ScanSources::Files([path.as_ref().to_path_buf()].into()), + args, + ) } /// Create a LazyFrame directly from a parquet scan. - pub fn scan_parquet_sourced(sources: ScanSources, args: ScanArgsParquet) -> PolarsResult { + pub fn scan_parquet_sources(sources: ScanSources, args: ScanArgsParquet) -> PolarsResult { LazyParquetReader::new(args).with_sources(sources).finish() } /// Create a LazyFrame directly from a parquet scan. pub fn scan_parquet_files(paths: Arc<[PathBuf]>, args: ScanArgsParquet) -> PolarsResult { - LazyParquetReader::new(args).with_paths(paths).finish() + Self::scan_parquet_sources(ScanSources::Files(paths), args) } } diff --git a/crates/polars-mem-engine/src/executors/scan/ipc.rs b/crates/polars-mem-engine/src/executors/scan/ipc.rs index ae1e3bcf30f2..daa98e209126 100644 --- a/crates/polars-mem-engine/src/executors/scan/ipc.rs +++ b/crates/polars-mem-engine/src/executors/scan/ipc.rs @@ -5,6 +5,7 @@ use polars_error::feature_gated; use polars_io::cloud::CloudOptions; use polars_io::path_utils::is_cloud_url; use polars_io::predicates::apply_predicate; +use polars_utils::mmap::MemSlice; use rayon::prelude::*; use super::*; @@ -13,6 +14,7 @@ pub struct IpcExec { pub(crate) sources: ScanSources, pub(crate) file_info: FileInfo, pub(crate) predicate: Option>, + #[allow(dead_code)] pub(crate) options: IpcScanOptions, pub(crate) file_options: FileScanOptions, pub(crate) hive_parts: Option>>, @@ -72,48 +74,34 @@ impl IpcExec { let read_path = |index: usize, n_rows: Option| { let source = self.sources.at(index); - match source { + let memslice = match source { ScanSourceRef::File(path) => { let file = match idx_to_cached_file(index) { None => std::fs::File::open(path)?, Some(f) => f?, }; - IpcReader::new(file) - .with_n_rows(n_rows) - .with_row_index(self.file_options.row_index.clone()) - .with_projection(projection.clone()) - .with_hive_partition_columns( - self.hive_parts - .as_ref() - .map(|x| x[index].materialize_partition_columns()), - ) - .with_include_file_path( - self.file_options - .include_file_paths - .as_ref() - .map(|x| (x.clone(), Arc::from(source.to_file_path()))), - ) - .memory_mapped(self.options.memory_map.then(|| path.to_path_buf())) - .finish() + MemSlice::from_mmap(Arc::new(unsafe { memmap::Mmap::map(&file).unwrap() })) }, - ScanSourceRef::Buffer(buff) => IpcReader::new(std::io::Cursor::new(buff)) - .with_n_rows(n_rows) - .with_row_index(self.file_options.row_index.clone()) - .with_projection(projection.clone()) - .with_hive_partition_columns( - self.hive_parts - .as_ref() - .map(|x| x[index].materialize_partition_columns()), - ) - .with_include_file_path( - self.file_options - .include_file_paths - .as_ref() - .map(|x| (x.clone(), Arc::from(source.to_file_path()))), - ) - .finish(), - } + ScanSourceRef::Buffer(buff) => MemSlice::from_bytes(buff.clone()), + }; + + IpcReader::new(std::io::Cursor::new(memslice)) + 
.with_n_rows(n_rows) + .with_row_index(self.file_options.row_index.clone()) + .with_projection(projection.clone()) + .with_hive_partition_columns( + self.hive_parts + .as_ref() + .map(|x| x[index].materialize_partition_columns()), + ) + .with_include_file_path( + self.file_options + .include_file_paths + .as_ref() + .map(|x| (x.clone(), Arc::from(source.to_file_path()))), + ) + .finish() }; let mut dfs = if let Some(mut n_rows) = self.file_options.slice.map(|x| { diff --git a/crates/polars-python/src/lazyframe/general.rs b/crates/polars-python/src/lazyframe/general.rs index 30206d0c088b..e09d5cb7f309 100644 --- a/crates/polars-python/src/lazyframe/general.rs +++ b/crates/polars-python/src/lazyframe/general.rs @@ -70,7 +70,7 @@ impl PyLazyFrame { Some(source) => pyobject_to_first_path_and_scan_sources(source)?, }; - let mut r = LazyJsonLineReader::new_sourced(sources); + let mut r = LazyJsonLineReader::new_with_sources(sources); #[cfg(feature = "cloud")] if let Some(first_path) = first_path { @@ -168,7 +168,7 @@ impl PyLazyFrame { Some(source) => pyobject_to_first_path_and_scan_sources(source)?, }; - let mut r = LazyCsvReader::new_sourced(sources); + let mut r = LazyCsvReader::new_with_sources(sources); #[cfg(feature = "cloud")] if let Some(first_path) = first_path { @@ -302,7 +302,7 @@ impl PyLazyFrame { args.cloud_options = Some(cloud_options.with_max_retries(retries)); } - let lf = LazyFrame::scan_parquet_sourced(sources, args).map_err(PyPolarsErr::from)?; + let lf = LazyFrame::scan_parquet_sources(sources, args).map_err(PyPolarsErr::from)?; Ok(lf.into()) } @@ -368,7 +368,7 @@ impl PyLazyFrame { args.cloud_options = Some(cloud_options.with_max_retries(retries)); } - let lf = LazyFrame::scan_ipc_sourced(sources, args).map_err(PyPolarsErr::from)?; + let lf = LazyFrame::scan_ipc_sources(sources, args).map_err(PyPolarsErr::from)?; Ok(lf.into()) } diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 257522831cd3..23d3e86badc4 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -984,7 +984,14 @@ def read_csv_batched( @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_csv( - source: str | Path | list[str] | list[Path] | IO[str] | IO[bytes], + source: str + | Path + | IO[str] + | IO[bytes] + | list[str] + | list[Path] + | list[IO[str]] + | list[IO[bytes]], *, has_header: bool = True, separator: str = ",", @@ -1233,9 +1240,9 @@ def with_column_names(cols: list[str]) -> list[str]: if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) elif ( - isinstance(source, (IO, BytesIO)) + isinstance(source, (BytesIO, StringIO)) or isinstance(source, list) - and isinstance(source[0], BytesIO) + and isinstance(source[0], (BytesIO, StringIO)) ): pass else: @@ -1282,7 +1289,13 @@ def with_column_names(cols: list[str]) -> list[str]: def _scan_csv_impl( - source: str | list[str] | list[Path] | IO[str] | IO[bytes], + source: str + | IO[str] + | IO[bytes] + | list[str] + | list[Path] + | list[IO[str]] + | list[IO[bytes]], *, has_header: bool = True, separator: str = ",", diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index 166e990ba25d..a4d8f62e73b6 100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -257,9 +257,9 @@ def scan_ndjson( if isinstance(source, (str, Path)): source = normalize_filepath(source, 
check_not_directory=False) sources = [] - elif isinstance(source, BytesIO): + elif isinstance(source, (BytesIO, StringIO)): sources = [] - elif isinstance(source, list) and isinstance(source[0], BytesIO): + elif isinstance(source, list) and isinstance(source[0], (BytesIO, StringIO)): sources = source source = None # type: ignore[assignment] else: From 6026101f95c398623673499b050e2d0e1bacb5e7 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Thu, 5 Sep 2024 17:23:59 +0200 Subject: [PATCH 08/27] fix cfg --- crates/polars-plan/src/plans/ir/mod.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index 328efce28be9..cb4cd5f4b203 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -13,7 +13,6 @@ pub use format::{ExprIRDisplay, IRDisplay}; use hive::HivePartitions; use polars_core::error::feature_gated; use polars_core::prelude::*; -use polars_io::file_cache::FileCacheEntry; use polars_utils::idx_vec::UnitVec; use polars_utils::mmap::MemSlice; use polars_utils::unitvec; @@ -72,7 +71,7 @@ impl<'a> ScanSourceRef<'a> { pub fn to_memslice( &self, run_async: bool, - cache_entries: Option<&Vec>>, + cache_entries: Option<&Vec>>, index: usize, ) -> PolarsResult { match self { From ef8d0348180f1a23207e1c99e35c10cdf2d35ca3 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 6 Sep 2024 11:29:53 +0200 Subject: [PATCH 09/27] fix several issues --- .../src/executors/scan/ipc.rs | 5 +++- .../src/executors/scan/parquet.rs | 11 ++++++-- crates/polars-mem-engine/src/utils.rs | 28 +++++++++++-------- .../polars-pipe/src/executors/sources/csv.rs | 15 +++++----- .../src/executors/sources/parquet.rs | 9 ++++-- .../src/plans/conversion/dsl_to_ir.rs | 6 +++- .../polars-plan/src/plans/conversion/scans.rs | 18 ++++++++---- .../polars-plan/src/plans/functions/count.rs | 14 ++++++++-- crates/polars-plan/src/plans/ir/mod.rs | 21 ++++++-------- .../plans/optimizer/predicate_pushdown/mod.rs | 4 ++- .../src/lazyframe/visitor/nodes.rs | 2 +- .../src/physical_plan/lower_ir.rs | 6 ++-- py-polars/tests/unit/io/test_scan.py | 24 ++++++++++++++++ .../tests/unit/streaming/test_streaming_io.py | 24 ++++++++++++++++ 14 files changed, 137 insertions(+), 50 deletions(-) diff --git a/crates/polars-mem-engine/src/executors/scan/ipc.rs b/crates/polars-mem-engine/src/executors/scan/ipc.rs index daa98e209126..856cd8820ba4 100644 --- a/crates/polars-mem-engine/src/executors/scan/ipc.rs +++ b/crates/polars-mem-engine/src/executors/scan/ipc.rs @@ -173,7 +173,10 @@ impl IpcExec { // concurrently. 
use polars_io::file_cache::init_entries_from_uri_list; - let paths = self.sources.into_paths(); + let paths = self + .sources + .into_paths() + .ok_or_else(|| polars_err!(nyi = "Asynchronous scanning of in-memory buffers"))?; tokio::task::block_in_place(|| { let cache_entries = init_entries_from_uri_list( diff --git a/crates/polars-mem-engine/src/executors/scan/parquet.rs b/crates/polars-mem-engine/src/executors/scan/parquet.rs index 7de67aff4284..99581ad2c15d 100644 --- a/crates/polars-mem-engine/src/executors/scan/parquet.rs +++ b/crates/polars-mem-engine/src/executors/scan/parquet.rs @@ -138,7 +138,6 @@ impl ParquetExec { for i in (first_source..self.sources.len()).step_by(step) { let end = std::cmp::min(i.saturating_add(step), self.sources.len()); - let hive_parts = self.hive_parts.as_ref().map(|x| &x[i..end]); if current_offset >= slice_end && !result.is_empty() { return Ok(result); @@ -149,7 +148,10 @@ impl ParquetExec { // files in parallel even if we add row index columns or slices. let iter = (i..end).into_par_iter().map(|i| { let source = self.sources.at(i); - let hive_partitions = hive_parts.map(|x| x[i].materialize_partition_columns()); + let hive_partitions = self + .hive_parts + .as_ref() + .map(|x| x[i].materialize_partition_columns()); let (projection, predicate) = prepare_scan_args( self.predicate.clone(), @@ -249,7 +251,10 @@ impl ParquetExec { use polars_io::utils::slice::split_slice_at_file; let verbose = verbose(); - let paths = self.sources.into_paths(); + let paths = self + .sources + .into_paths() + .ok_or_else(|| polars_err!(nyi = "Asynchronous scanning of in-memory buffers"))?; let first_metadata = &self.metadata; let cloud_options = self.cloud_options.as_ref(); diff --git a/crates/polars-mem-engine/src/utils.rs b/crates/polars-mem-engine/src/utils.rs index b104da3c4e78..06941cbc128d 100644 --- a/crates/polars-mem-engine/src/utils.rs +++ b/crates/polars-mem-engine/src/utils.rs @@ -1,22 +1,28 @@ -use std::path::PathBuf; +use std::path::Path; pub(crate) use polars_plan::plans::ArenaLpIter; -use polars_plan::plans::IR; +use polars_plan::plans::{ScanSources, IR}; use polars_utils::aliases::PlHashSet; use polars_utils::arena::{Arena, Node}; /// Get a set of the data source paths in this LogicalPlan -pub(crate) fn agg_source_paths( +/// +/// # Notes +/// +/// - Scan sources with in-memory buffers are ignored. +pub(crate) fn agg_source_paths<'a>( root_lp: Node, - acc_paths: &mut PlHashSet, - lp_arena: &Arena, + acc_paths: &mut PlHashSet<&'a Path>, + lp_arena: &'a Arena, ) { - lp_arena.iter(root_lp).for_each(|(_, lp)| { - use IR::*; - if let Scan { sources, .. } = lp { - for path in sources.as_paths() { - acc_paths.insert(path.clone()); + for (_, lp) in lp_arena.iter(root_lp) { + if let IR::Scan { sources, .. 
} = lp { + match sources { + ScanSources::Files(paths) => acc_paths.extend(paths.iter().map(|p| p.as_path())), + ScanSources::Buffers(_) => { + // Ignore + }, } } - }) + } } diff --git a/crates/polars-pipe/src/executors/sources/csv.rs b/crates/polars-pipe/src/executors/sources/csv.rs index 673848e67d77..f3267ac1e90a 100644 --- a/crates/polars-pipe/src/executors/sources/csv.rs +++ b/crates/polars-pipe/src/executors/sources/csv.rs @@ -1,5 +1,6 @@ use std::fs::File; +use polars_core::error::feature_gated; use polars_core::{config, POOL}; use polars_io::csv::read::{BatchedCsvReader, CsvReadOptions, CsvReader}; use polars_io::path_utils::is_cloud_url; @@ -36,7 +37,10 @@ impl CsvSource { // otherwise all files would be opened during construction of the pipeline // leading to Too many Open files error fn init_next_reader(&mut self) -> PolarsResult<()> { - let paths = self.sources.as_paths(); + let paths = self + .sources + .as_paths() + .ok_or_else(|| polars_err!(nyi = "Streaming scanning of in-memory buffers"))?; let file_options = self.file_options.clone(); let n_rows = file_options.slice.map(|x| { @@ -105,8 +109,7 @@ impl CsvSource { .with_row_index(row_index); let reader: CsvReader = if run_async { - #[cfg(feature = "cloud")] - { + feature_gated!("cloud", { options.into_reader_with_file_handle( polars_io::file_cache::FILE_CACHE .get_entry(path.to_str().unwrap()) @@ -114,11 +117,7 @@ impl CsvSource { .unwrap() .try_open_assume_latest()?, ) - } - #[cfg(not(feature = "cloud"))] - { - panic!("required feature `cloud` is not enabled") - } + }) } else { options .with_path(Some(path)) diff --git a/crates/polars-pipe/src/executors/sources/parquet.rs b/crates/polars-pipe/src/executors/sources/parquet.rs index e91eb2ec1bba..8592021b2ff3 100644 --- a/crates/polars-pipe/src/executors/sources/parquet.rs +++ b/crates/polars-pipe/src/executors/sources/parquet.rs @@ -77,7 +77,10 @@ impl ParquetSource { usize, Option>, )> { - let paths = self.sources.as_paths(); + let paths = self + .sources + .as_paths() + .ok_or_else(|| polars_err!(nyi = "Streaming scanning of in-memory buffers"))?; let path = &paths[index]; let options = self.options; let file_options = self.file_options.clone(); @@ -256,7 +259,9 @@ impl ParquetSource { verbose: bool, predicate: Option>, ) -> PolarsResult { - let paths = sources.as_paths(); + let paths = sources + .as_paths() + .ok_or_else(|| polars_err!(nyi = "Streaming scanning of in-memory buffers"))?; let n_threads = POOL.current_num_threads(); let iter = 0..paths.len(); diff --git a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs index 1bf06322f090..437db91f1975 100644 --- a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs +++ b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs @@ -193,11 +193,15 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult } else if file_options.hive_options.enabled.unwrap_or(false) && resolved_file_info.reader_schema.is_some() { + let paths = sources + .as_paths() + .ok_or_else(|| polars_err!(nyi = "Hive-partitioning of in-memory buffers"))?; + #[allow(unused_assignments)] let mut owned = None; hive_partitions_from_paths( - sources.as_paths(), + paths, file_options.hive_options.hive_start_idx, file_options.hive_options.schema.clone(), match resolved_file_info.reader_schema.as_ref().unwrap() { diff --git a/crates/polars-plan/src/plans/conversion/scans.rs b/crates/polars-plan/src/plans/conversion/scans.rs index 50da756f9a38..ca5d85b7ba3e 100644 --- 
a/crates/polars-plan/src/plans/conversion/scans.rs +++ b/crates/polars-plan/src/plans/conversion/scans.rs @@ -160,21 +160,24 @@ pub(super) fn csv_file_info( let run_async = sources.is_cloud_url() || config::force_async(); let cache_entries = { - feature_gated!("cloud", { - if run_async { + if run_async { + feature_gated!("cloud", { Some(polars_io::file_cache::init_entries_from_uri_list( sources .as_paths() + .ok_or_else(|| { + polars_err!(nyi = "Asynchronous scanning of in-memory buffers") + })? .iter() .map(|path| Arc::from(path.to_str().unwrap())) .collect::>() .as_slice(), cloud_options, )?) - } else { - None - } - }) + }) + } else { + None + } }; let infer_schema_func = |i| { @@ -280,6 +283,9 @@ pub(super) fn ndjson_file_info( Some(polars_io::file_cache::init_entries_from_uri_list( sources .as_paths() + .ok_or_else(|| { + polars_err!(nyi = "Asynchronous scanning of in-memory buffers") + })? .iter() .map(|path| Arc::from(path.to_str().unwrap())) .collect::>() diff --git a/crates/polars-plan/src/plans/functions/count.rs b/crates/polars-plan/src/plans/functions/count.rs index f8d344217e70..64dc1615d8b7 100644 --- a/crates/polars-plan/src/plans/functions/count.rs +++ b/crates/polars-plan/src/plans/functions/count.rs @@ -126,7 +126,12 @@ pub(super) fn count_rows_parquet( if is_cloud { feature_gated!("cloud", { - get_runtime().block_on(count_rows_cloud_parquet(sources.as_paths(), cloud_options)) + get_runtime().block_on(count_rows_cloud_parquet( + sources.as_paths().ok_or_else(|| { + polars_err!(nyi = "Asynchronous scanning of in-memory buffers") + })?, + cloud_options, + )) }) } else { sources @@ -174,7 +179,9 @@ pub(super) fn count_rows_ipc( if is_cloud { feature_gated!("cloud", { get_runtime().block_on(count_rows_cloud_ipc( - sources.as_paths(), + sources.as_paths().ok_or_else(|| { + polars_err!(nyi = "Asynchronous scanning of in-memory buffers") + })?, cloud_options, metadata, )) @@ -234,6 +241,9 @@ pub(super) fn count_rows_ndjson( Some(polars_io::file_cache::init_entries_from_uri_list( sources .as_paths() + .ok_or_else(|| { + polars_err!(nyi = "Asynchronous scanning of in-memory buffers") + })? 
.iter() .map(|path| Arc::from(path.to_str().unwrap())) .collect::>() diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index cb4cd5f4b203..919a4e635e65 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -71,7 +71,10 @@ impl<'a> ScanSourceRef<'a> { pub fn to_memslice( &self, run_async: bool, - cache_entries: Option<&Vec>>, + #[cfg(feature = "cloud")] cache_entries: Option< + &Vec>, + >, + #[cfg(not(feature = "cloud"))] cache_entries: Option<&()>, index: usize, ) -> PolarsResult { match self { @@ -99,24 +102,18 @@ impl ScanSources { offset: 0, } } - pub fn as_paths(&self) -> &[PathBuf] { - match self { - Self::Files(paths) => paths, - Self::Buffers(_) => unimplemented!(), - } - } - pub fn try_into_paths(&self) -> Option> { + pub fn as_paths(&self) -> Option<&[PathBuf]> { match self { - Self::Files(paths) => Some(paths.clone()), + Self::Files(paths) => Some(paths.as_ref()), Self::Buffers(_) => None, } } - pub fn into_paths(&self) -> Arc<[PathBuf]> { + pub fn into_paths(&self) -> Option> { match self { - Self::Files(paths) => paths.clone(), - Self::Buffers(_) => unimplemented!(), + Self::Files(paths) => Some(paths.clone()), + Self::Buffers(_) => None, } } diff --git a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs index d5aefb2a16d7..f42a7ca7239b 100644 --- a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs +++ b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs @@ -366,7 +366,9 @@ impl<'a> PredicatePushDown<'a> { if let (Some(hive_parts), Some(predicate)) = (&scan_hive_parts, &predicate) { if let Some(io_expr) = self.expr_eval.unwrap()(predicate, expr_arena) { if let Some(stats_evaluator) = io_expr.as_stats_evaluator() { - let paths = sources.as_paths(); + let paths = sources.as_paths().ok_or_else(|| { + polars_err!(nyi = "Hive partitioning of in-memory buffers") + })?; let mut new_paths = Vec::with_capacity(paths.len()); let mut new_hive_parts = Vec::with_capacity(paths.len()); diff --git a/crates/polars-python/src/lazyframe/visitor/nodes.rs b/crates/polars-python/src/lazyframe/visitor/nodes.rs index 37a51e4d481d..4e9344a61d15 100644 --- a/crates/polars-python/src/lazyframe/visitor/nodes.rs +++ b/crates/polars-python/src/lazyframe/visitor/nodes.rs @@ -326,7 +326,7 @@ pub(crate) fn into_py(py: Python<'_>, plan: &IR) -> PyResult { file_options, } => Scan { paths: sources - .try_into_paths() + .into_paths() .ok_or_else(|| PyNotImplementedError::new_err("scan with BytesIO"))? 
.to_object(py), // TODO: file info diff --git a/crates/polars-stream/src/physical_plan/lower_ir.rs b/crates/polars-stream/src/physical_plan/lower_ir.rs index d50d90afe52a..b993ea6ac557 100644 --- a/crates/polars-stream/src/physical_plan/lower_ir.rs +++ b/crates/polars-stream/src/physical_plan/lower_ir.rs @@ -2,7 +2,7 @@ use std::sync::Arc; use polars_core::prelude::{InitHashMaps, PlHashMap, PlIndexMap}; use polars_core::schema::{IndexOfSchema, Schema}; -use polars_error::PolarsResult; +use polars_error::{polars_err, PolarsResult}; use polars_plan::plans::expr_ir::{ExprIR, OutputName}; use polars_plan::plans::{AExpr, IR}; use polars_plan::prelude::SinkType; @@ -343,7 +343,9 @@ pub fn lower_ir( unreachable!(); }; - let paths = sources.into_paths(); + let paths = sources + .into_paths() + .ok_or_else(|| polars_err!(nyi = "Streaming scanning of in-memory buffers"))?; PhysNodeKind::FileScan { paths, diff --git a/py-polars/tests/unit/io/test_scan.py b/py-polars/tests/unit/io/test_scan.py index a254daaeaa12..340c4741fa48 100644 --- a/py-polars/tests/unit/io/test_scan.py +++ b/py-polars/tests/unit/io/test_scan.py @@ -737,3 +737,27 @@ def test_scan_in_memory(method: str) -> None: g.seek(0) result = (getattr(pl, f"scan_{method}"))([f, g]).slice(-1, 1).collect() assert_frame_equal(df.vstack(df).slice(-1, 1), result) + + +@pytest.mark.parametrize( + "method", + ["parquet", "csv", "ipc", "ndjson"], +) +def test_nyi_async_scan_in_memory(method: str, monkeypatch: pytest.MonkeyPatch) -> None: + f = io.BytesIO() + df = pl.DataFrame( + { + "a": [1, 2, 3], + "b": ["x", "y", "z"], + } + ) + + (getattr(df, f"write_{method}"))(f) + + f.seek(0) + _enable_force_async(monkeypatch) + with pytest.raises( + pl.exceptions.ComputeError, + match="not yet implemented: Asynchronous scanning of in-memory buffers", + ): + (getattr(pl, f"scan_{method}"))(f).collect() diff --git a/py-polars/tests/unit/streaming/test_streaming_io.py b/py-polars/tests/unit/streaming/test_streaming_io.py index ff526d609a0a..0cbf0d90e4ba 100644 --- a/py-polars/tests/unit/streaming/test_streaming_io.py +++ b/py-polars/tests/unit/streaming/test_streaming_io.py @@ -1,5 +1,6 @@ from __future__ import annotations +import io from typing import TYPE_CHECKING, Any from unittest.mock import patch @@ -294,3 +295,26 @@ def test_streaming_empty_parquet_16523(tmp_path: Path) -> None: q = pl.scan_parquet(file_path) q2 = pl.LazyFrame({"a": [1]}, schema={"a": pl.Int32}) assert q.join(q2, on="a").collect(streaming=True).shape == (0, 1) + + +@pytest.mark.parametrize( + "method", + ["parquet", "csv"], +) +def test_nyi_scan_in_memory(method: str) -> None: + f = io.BytesIO() + df = pl.DataFrame( + { + "a": [1, 2, 3], + "b": ["x", "y", "z"], + } + ) + + (getattr(df, f"write_{method}"))(f) + + f.seek(0) + with pytest.raises( + pl.exceptions.ComputeError, + match="not yet implemented: Streaming scanning of in-memory buffers", + ): + (getattr(pl, f"scan_{method}"))(f).collect(streaming=True) From 04932705461759f0df5f42bc6f6f3d9d0e0bb062 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 6 Sep 2024 11:55:56 +0200 Subject: [PATCH 10/27] fix: #18581 --- .../polars-plan/src/plans/functions/count.rs | 12 +++++----- crates/polars-plan/src/plans/functions/mod.rs | 22 +++++++++++++++++-- crates/polars-plan/src/plans/ir/mod.rs | 2 +- .../tests/unit/lazyframe/optimizations.py | 13 +++++++++++ 4 files changed, 41 insertions(+), 8 deletions(-) diff --git a/crates/polars-plan/src/plans/functions/count.rs b/crates/polars-plan/src/plans/functions/count.rs index 
64dc1615d8b7..d30d6ef91d09 100644 --- a/crates/polars-plan/src/plans/functions/count.rs +++ b/crates/polars-plan/src/plans/functions/count.rs @@ -25,7 +25,11 @@ use polars_io::SerReader; use super::*; #[allow(unused_variables)] -pub fn count_rows(sources: &ScanSources, scan_type: &FileScan) -> PolarsResult { +pub fn count_rows( + sources: &ScanSources, + scan_type: &FileScan, + alias: Option, +) -> PolarsResult { #[cfg(not(any( feature = "parquet", feature = "ipc", @@ -77,10 +81,8 @@ pub fn count_rows(sources: &ScanSources, scan_type: &FileScan) -> PolarsResult python_udf::call_python_udf(function, df, *validate_output, schema.as_deref()), FastCount { - sources, scan_type, .. - } => count::count_rows(sources, scan_type), + sources, + scan_type, + alias, + } => count::count_rows(sources, scan_type, alias.clone()), Rechunk => { df.as_single_chunk_par(); Ok(df) @@ -344,6 +347,21 @@ impl Display for FunctionIR { write!(f, "STREAMING") } }, + FastCount { + sources, + scan_type, + alias, + } => { + let scan_type: &str = scan_type.into(); + let default_column_name = PlSmallStr::from_static(crate::constants::LEN); + let alias = alias.as_ref().unwrap_or(&default_column_name); + + write!( + f, + "FAST COUNT ({scan_type}) {} as \"{alias}\"", + ScanSourcesDisplay(&sources) + ) + }, v => { let s: &str = v.into(); write!(f, "{s}") diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index 919a4e635e65..a1c96d41ece1 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -8,7 +8,7 @@ use std::borrow::Cow; use std::fmt; use std::path::{Path, PathBuf}; -pub use dot::{EscapeLabel, IRDotDisplay, PathsDisplay}; +pub use dot::{EscapeLabel, IRDotDisplay, PathsDisplay, ScanSourcesDisplay}; pub use format::{ExprIRDisplay, IRDisplay}; use hive::HivePartitions; use polars_core::error::feature_gated; diff --git a/py-polars/tests/unit/lazyframe/optimizations.py b/py-polars/tests/unit/lazyframe/optimizations.py index a44816fad0e6..2417edecdeb8 100644 --- a/py-polars/tests/unit/lazyframe/optimizations.py +++ b/py-polars/tests/unit/lazyframe/optimizations.py @@ -1,3 +1,5 @@ +import io + import polars as pl from polars.testing import assert_frame_equal @@ -27,3 +29,14 @@ def test_double_sort_maintain_order_18558() -> None: ) assert_frame_equal(lf.collect(), expect) + + +def test_fast_count_alias_18581() -> None: + f = io.BytesIO() + f.write(b"a,b,c\n1,2,3\n4,5,6") + f.flush() + f.seek(0) + + df = pl.scan_csv(f).select(pl.len().alias("weird_name")).collect() + + assert_frame_equal(pl.DataFrame({"weird_name": 2}), df) From b97b529b934788b9ebf91a68bc304851f6367c44 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 6 Sep 2024 12:34:23 +0200 Subject: [PATCH 11/27] fix StringIO loading for scan_csv, scan_ndjson --- crates/polars-python/src/file.rs | 30 +++++++++++++++++++--------- py-polars/polars/dataframe/frame.py | 10 ++++++++++ py-polars/tests/unit/io/test_scan.py | 28 ++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 9 deletions(-) diff --git a/crates/polars-python/src/file.rs b/crates/polars-python/src/file.rs index 3d180e2bedf3..3cbb3d364e2f 100644 --- a/crates/polars-python/src/file.rs +++ b/crates/polars-python/src/file.rs @@ -12,7 +12,7 @@ use polars::io::mmap::MmapBytesReader; use polars_error::{polars_err, polars_warn}; use pyo3::exceptions::PyTypeError; use pyo3::prelude::*; -use pyo3::types::{PyBytes, PyString}; +use pyo3::types::{PyBytes, PyString, PyStringMethods}; use crate::error::PyPolarsErr; use 
crate::prelude::resolve_homedir; @@ -47,11 +47,19 @@ impl PyFileLikeObject { .call_method_bound(py, "read", (), None) .expect("no read method found"); - let bytes: &Bound<'_, PyBytes> = bytes - .downcast_bound(py) - .expect("Expecting to be able to downcast into bytes from read result."); + if let Ok(bytes) = bytes.downcast_bound::(py) { + return bytes.as_bytes().to_vec(); + } + + if let Ok(bytes) = bytes.downcast_bound::(py) { + return bytes + .to_cow() + .expect("PyString is not valid UTF-8") + .into_owned() + .into_bytes(); + } - bytes.as_bytes().to_vec() + panic!("Expecting to be able to downcast into bytes from read result."); }); Cursor::new(buf) @@ -215,8 +223,10 @@ pub fn get_either_file_or_path(py_f: PyObject, write: bool) -> PyResult str: + with BytesIO() as buf: + self.write_csv(buf) + csv_bytes = buf.getvalue() + return csv_bytes.decode("utf8") + should_return_buffer = False if file is None: buffer = file = BytesIO() should_return_buffer = True + elif isinstance(file, StringIO): + csv_str = write_csv_to_string() + file.write(csv_str) + return None elif isinstance(file, (str, os.PathLike)): file = normalize_filepath(file) diff --git a/py-polars/tests/unit/io/test_scan.py b/py-polars/tests/unit/io/test_scan.py index 340c4741fa48..a4c3ac1d133c 100644 --- a/py-polars/tests/unit/io/test_scan.py +++ b/py-polars/tests/unit/io/test_scan.py @@ -739,6 +739,34 @@ def test_scan_in_memory(method: str) -> None: assert_frame_equal(df.vstack(df).slice(-1, 1), result) +@pytest.mark.parametrize( + "method", + ["csv", "ndjson"], +) +def test_scan_stringio(method: str) -> None: + f = io.StringIO() + df = pl.DataFrame( + { + "a": [1, 2, 3], + "b": ["x", "y", "z"], + } + ) + + (getattr(df, f"write_{method}"))(f) + + f.seek(0) + result = (getattr(pl, f"scan_{method}"))(f).collect() + assert_frame_equal(df, result) + + g = io.StringIO() + (getattr(df, f"write_{method}"))(g) + + f.seek(0) + g.seek(0) + result = (getattr(pl, f"scan_{method}"))([f, g]).collect() + assert_frame_equal(df.vstack(df), result) + + @pytest.mark.parametrize( "method", ["parquet", "csv", "ipc", "ndjson"], From 49bcc85df371a6dc0b7e315300057606675baca3 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 6 Sep 2024 14:34:25 +0200 Subject: [PATCH 12/27] fix async and source lists --- .../src/executors/scan/csv.rs | 4 +- .../src/executors/scan/ipc.rs | 7 +- .../src/executors/scan/ndjson.rs | 4 +- .../src/executors/scan/parquet.rs | 7 +- .../polars-plan/src/plans/conversion/scans.rs | 14 +-- .../polars-plan/src/plans/functions/count.rs | 14 +-- crates/polars-plan/src/plans/ir/mod.rs | 4 + py-polars/polars/io/csv/functions.py | 6 +- py-polars/polars/io/ipc/functions.py | 4 +- py-polars/polars/io/ndjson.py | 6 +- py-polars/polars/io/parquet/functions.py | 113 +++++++++--------- .../tests/unit/io/test_lazy_count_star.py | 10 +- py-polars/tests/unit/io/test_parquet.py | 42 +++++++ py-polars/tests/unit/io/test_scan.py | 23 +--- 14 files changed, 138 insertions(+), 120 deletions(-) diff --git a/crates/polars-mem-engine/src/executors/scan/csv.rs b/crates/polars-mem-engine/src/executors/scan/csv.rs index 7a2ac0d34950..a9048481a4ac 100644 --- a/crates/polars-mem-engine/src/executors/scan/csv.rs +++ b/crates/polars-mem-engine/src/executors/scan/csv.rs @@ -57,9 +57,9 @@ impl CsvExec { let verbose = config::verbose(); let force_async = config::force_async(); - let run_async = force_async || self.sources.is_cloud_url(); + let run_async = (self.sources.is_files() && force_async) || self.sources.is_cloud_url(); - if force_async && verbose { + if 
self.sources.is_files() && force_async && verbose { eprintln!("ASYNC READING FORCED"); } diff --git a/crates/polars-mem-engine/src/executors/scan/ipc.rs b/crates/polars-mem-engine/src/executors/scan/ipc.rs index 856cd8820ba4..33ad2f54f429 100644 --- a/crates/polars-mem-engine/src/executors/scan/ipc.rs +++ b/crates/polars-mem-engine/src/executors/scan/ipc.rs @@ -29,7 +29,7 @@ impl IpcExec { }; let force_async = config::force_async(); - let mut out = if is_cloud || force_async { + let mut out = if is_cloud || (self.sources.is_files() && force_async) { feature_gated!("cloud", { if force_async && config::verbose() { eprintln!("ASYNC READING FORCED"); @@ -173,10 +173,7 @@ impl IpcExec { // concurrently. use polars_io::file_cache::init_entries_from_uri_list; - let paths = self - .sources - .into_paths() - .ok_or_else(|| polars_err!(nyi = "Asynchronous scanning of in-memory buffers"))?; + let paths = self.sources.into_paths().unwrap(); tokio::task::block_in_place(|| { let cache_entries = init_entries_from_uri_list( diff --git a/crates/polars-mem-engine/src/executors/scan/ndjson.rs b/crates/polars-mem-engine/src/executors/scan/ndjson.rs index b37f76ee826d..fb55cb2b38e5 100644 --- a/crates/polars-mem-engine/src/executors/scan/ndjson.rs +++ b/crates/polars-mem-engine/src/executors/scan/ndjson.rs @@ -41,9 +41,9 @@ impl JsonExec { let verbose = config::verbose(); let force_async = config::force_async(); - let run_async = force_async || self.sources.is_cloud_url(); + let run_async = (self.sources.is_files() && force_async) || self.sources.is_cloud_url(); - if force_async && verbose { + if self.sources.is_files() && force_async && verbose { eprintln!("ASYNC READING FORCED"); } diff --git a/crates/polars-mem-engine/src/executors/scan/parquet.rs b/crates/polars-mem-engine/src/executors/scan/parquet.rs index 99581ad2c15d..fd0f53b6d728 100644 --- a/crates/polars-mem-engine/src/executors/scan/parquet.rs +++ b/crates/polars-mem-engine/src/executors/scan/parquet.rs @@ -251,10 +251,7 @@ impl ParquetExec { use polars_io::utils::slice::split_slice_at_file; let verbose = verbose(); - let paths = self - .sources - .into_paths() - .ok_or_else(|| polars_err!(nyi = "Asynchronous scanning of in-memory buffers"))?; + let paths = self.sources.into_paths().unwrap(); let first_metadata = &self.metadata; let cloud_options = self.cloud_options.as_ref(); @@ -474,7 +471,7 @@ impl ParquetExec { let is_cloud = self.sources.is_cloud_url(); let force_async = config::force_async(); - let out = if is_cloud || force_async { + let out = if is_cloud || (self.sources.is_files() && force_async) { feature_gated!("cloud", { if force_async && config::verbose() { eprintln!("ASYNC READING FORCED"); diff --git a/crates/polars-plan/src/plans/conversion/scans.rs b/crates/polars-plan/src/plans/conversion/scans.rs index ca5d85b7ba3e..f7c64ed7612b 100644 --- a/crates/polars-plan/src/plans/conversion/scans.rs +++ b/crates/polars-plan/src/plans/conversion/scans.rs @@ -152,12 +152,14 @@ pub(super) fn csv_file_info( use polars_io::utils::get_reader_bytes; use rayon::iter::{IntoParallelIterator, ParallelIterator}; + polars_ensure!(!sources.is_empty(), ComputeError: "expected at least 1 source"); + // TODO: // * See if we can do better than scanning all files if there is a row limit // * See if we can do this without downloading the entire file // prints the error message if paths is empty. 
- let run_async = sources.is_cloud_url() || config::force_async(); + let run_async = sources.is_cloud_url() || (sources.is_files() && config::force_async()); let cache_entries = { if run_async { @@ -165,9 +167,7 @@ pub(super) fn csv_file_info( Some(polars_io::file_cache::init_entries_from_uri_list( sources .as_paths() - .ok_or_else(|| { - polars_err!(nyi = "Asynchronous scanning of in-memory buffers") - })? + .unwrap() .iter() .map(|path| Arc::from(path.to_str().unwrap())) .collect::>() @@ -275,7 +275,7 @@ pub(super) fn ndjson_file_info( polars_bail!(ComputeError: "expected at least 1 source"); }; - let run_async = sources.is_cloud_url() || config::force_async(); + let run_async = sources.is_cloud_url() || (sources.is_files() && config::force_async()); let cache_entries = { if run_async { @@ -283,9 +283,7 @@ pub(super) fn ndjson_file_info( Some(polars_io::file_cache::init_entries_from_uri_list( sources .as_paths() - .ok_or_else(|| { - polars_err!(nyi = "Asynchronous scanning of in-memory buffers") - })? + .unwrap() .iter() .map(|path| Arc::from(path.to_str().unwrap())) .collect::>() diff --git a/crates/polars-plan/src/plans/functions/count.rs b/crates/polars-plan/src/plans/functions/count.rs index d30d6ef91d09..3bba674edb89 100644 --- a/crates/polars-plan/src/plans/functions/count.rs +++ b/crates/polars-plan/src/plans/functions/count.rs @@ -129,9 +129,7 @@ pub(super) fn count_rows_parquet( if is_cloud { feature_gated!("cloud", { get_runtime().block_on(count_rows_cloud_parquet( - sources.as_paths().ok_or_else(|| { - polars_err!(nyi = "Asynchronous scanning of in-memory buffers") - })?, + sources.as_paths().unwrap(), cloud_options, )) }) @@ -181,9 +179,7 @@ pub(super) fn count_rows_ipc( if is_cloud { feature_gated!("cloud", { get_runtime().block_on(count_rows_cloud_ipc( - sources.as_paths().ok_or_else(|| { - polars_err!(nyi = "Asynchronous scanning of in-memory buffers") - })?, + sources.as_paths().unwrap(), cloud_options, metadata, )) @@ -235,7 +231,7 @@ pub(super) fn count_rows_ndjson( } let is_cloud_url = sources.is_cloud_url(); - let run_async = is_cloud_url || config::force_async(); + let run_async = is_cloud_url || (sources.is_files() && config::force_async()); let cache_entries = { feature_gated!("cloud", { @@ -243,9 +239,7 @@ pub(super) fn count_rows_ndjson( Some(polars_io::file_cache::init_entries_from_uri_list( sources .as_paths() - .ok_or_else(|| { - polars_err!(nyi = "Asynchronous scanning of in-memory buffers") - })? 
+ .unwrap() .iter() .map(|path| Arc::from(path.to_str().unwrap())) .collect::>() diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index a1c96d41ece1..e2f2ca3eae3d 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -131,6 +131,10 @@ impl ScanSources { } } + pub fn is_files(&self) -> bool { + matches!(self, Self::Files(_)) + } + pub fn is_cloud_url(&self) -> bool { match self { Self::Files(paths) => paths.first().map_or(false, polars_io::is_cloud_url), diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 23d3e86badc4..a6002602af6a 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -1239,9 +1239,9 @@ def with_column_names(cols: list[str]) -> list[str]: if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - elif ( - isinstance(source, (BytesIO, StringIO)) - or isinstance(source, list) + elif isinstance(source, (BytesIO, StringIO)) or ( + isinstance(source, list) + and len(source) > 0 and isinstance(source[0], (BytesIO, StringIO)) ): pass diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index 8c0138df2a36..9945e1c6cbb7 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -432,7 +432,9 @@ def scan_ipc( sources = [] elif isinstance(source, BytesIO): sources = [] - elif isinstance(source, list) and isinstance(source[0], BytesIO): + elif ( + isinstance(source, list) and len(source) > 0 and isinstance(source[0], BytesIO) + ): sources = source source = None # type: ignore[assignment] else: diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index a4d8f62e73b6..dfea6cf2871f 100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -259,7 +259,11 @@ def scan_ndjson( sources = [] elif isinstance(source, (BytesIO, StringIO)): sources = [] - elif isinstance(source, list) and isinstance(source[0], (BytesIO, StringIO)): + elif ( + isinstance(source, list) + and len(source) > 0 + and isinstance(source[0], (BytesIO, StringIO)) + ): sources = source source = None # type: ignore[assignment] else: diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 2eda346e7c26..7e1b2b9a93b1 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -3,26 +3,26 @@ import contextlib import io from pathlib import Path -from typing import IO, TYPE_CHECKING, Any, Sequence +from typing import IO, TYPE_CHECKING, Any import polars.functions as F +from polars import concat as plconcat from polars._utils.deprecation import deprecate_renamed_parameter from polars._utils.unstable import issue_unstable_warning from polars._utils.various import ( is_int_sequence, normalize_filepath, ) -from polars._utils.wrap import wrap_df, wrap_ldf +from polars._utils.wrap import wrap_ldf from polars.convert import from_arrow from polars.dependencies import import_optional from polars.io._utils import ( - parse_columns_arg, parse_row_index_args, prepare_file_arg, ) with contextlib.suppress(ImportError): - from polars.polars import PyDataFrame, PyLazyFrame + from polars.polars import PyLazyFrame from polars.polars import read_parquet_schema as _read_parquet_schema if TYPE_CHECKING: @@ -33,7 +33,14 @@ @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") 
@deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def read_parquet( - source: str | Path | list[str] | list[Path] | IO[bytes] | bytes, + source: str + | Path + | IO[bytes] + | bytes + | list[str] + | list[Path] + | list[IO[bytes]] + | list[bytes], *, columns: list[int] | list[str] | None = None, n_rows: int | None = None, @@ -166,18 +173,10 @@ def read_parquet( ) # Read file and bytes inputs using `read_parquet` - elif isinstance(source, (io.IOBase, bytes)): - return _read_parquet_binary( - source, - columns=columns, - n_rows=n_rows, - parallel=parallel, - row_index_name=row_index_name, - row_index_offset=row_index_offset, - low_memory=low_memory, - use_statistics=use_statistics, - rechunk=rechunk, - ) + if isinstance(source, bytes): + source = io.BytesIO(source) + elif isinstance(source, list) and len(source) > 0 and isinstance(source[0], bytes): + source = [io.BytesIO(s) for s in source] # type: ignore[arg-type] # For other inputs, defer to `scan_parquet` lf = scan_parquet( @@ -209,7 +208,14 @@ def read_parquet( def _read_parquet_with_pyarrow( - source: str | Path | list[str] | list[Path] | IO[bytes] | bytes, + source: str + | Path + | IO[bytes] + | bytes + | list[str] + | list[Path] + | list[IO[bytes]] + | list[bytes], *, columns: list[int] | list[str] | None = None, storage_options: dict[str, Any] | None = None, @@ -224,48 +230,35 @@ def _read_parquet_with_pyarrow( ) pyarrow_options = pyarrow_options or {} - with prepare_file_arg( - source, # type: ignore[arg-type] - use_pyarrow=True, - storage_options=storage_options, - ) as source_prep: - pa_table = pyarrow_parquet.read_table( - source_prep, - memory_map=memory_map, - columns=columns, - **pyarrow_options, - ) - return from_arrow(pa_table, rechunk=rechunk) # type: ignore[return-value] - + if ( + isinstance(source, list) + and len(source) > 0 + and isinstance(source[0], (bytes, io.BytesIO)) + ): + sources = source + else: + sources = [source] -def _read_parquet_binary( - source: IO[bytes] | bytes, - *, - columns: Sequence[int] | Sequence[str] | None = None, - n_rows: int | None = None, - row_index_name: str | None = None, - row_index_offset: int = 0, - parallel: ParallelStrategy = "auto", - use_statistics: bool = True, - rechunk: bool = False, - low_memory: bool = False, -) -> DataFrame: - projection, columns = parse_columns_arg(columns) - row_index = parse_row_index_args(row_index_name, row_index_offset) + results = [] + for source in sources: + with prepare_file_arg( + source, # type: ignore[arg-type] + use_pyarrow=True, + storage_options=storage_options, + ) as source_prep: + pa_table = pyarrow_parquet.read_table( + source_prep, + memory_map=memory_map, + columns=columns, + **pyarrow_options, + ) + result = from_arrow(pa_table, rechunk=rechunk) # type: ignore[return-value] + results.append(result) - with prepare_file_arg(source) as source_prep: - pydf = PyDataFrame.read_parquet( - source_prep, - columns=columns, - projection=projection, - n_rows=n_rows, - row_index=row_index, - parallel=parallel, - use_statistics=use_statistics, - rechunk=rechunk, - low_memory=low_memory, - ) - return wrap_df(pydf) + if len(results) == 1: + return results[0] + else: + return plconcat(results) def read_parquet_schema(source: str | Path | IO[bytes] | bytes) -> dict[str, DataType]: @@ -423,7 +416,9 @@ def scan_parquet( if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) elif isinstance(source, io.BytesIO) or ( - isinstance(source, list) and isinstance(source[0], 
io.BytesIO) + isinstance(source, list) + and len(source) > 0 + and isinstance(source[0], io.BytesIO) ): pass else: diff --git a/py-polars/tests/unit/io/test_lazy_count_star.py b/py-polars/tests/unit/io/test_lazy_count_star.py index 7b988bed75c7..a2c03596dd15 100644 --- a/py-polars/tests/unit/io/test_lazy_count_star.py +++ b/py-polars/tests/unit/io/test_lazy_count_star.py @@ -23,7 +23,7 @@ def test_count_csv(io_files_path: Path, path: str, n_rows: int) -> None: expected = pl.DataFrame(pl.Series("len", [n_rows], dtype=pl.UInt32)) # Check if we are using our fast count star - assert "FAST_COUNT" in lf.explain() + assert "FAST COUNT" in lf.explain() assert_frame_equal(lf.collect(), expected) @@ -42,7 +42,7 @@ def test_commented_csv() -> None: expected = pl.DataFrame(pl.Series("len", [2], dtype=pl.UInt32)) lf = pl.scan_csv(csv_a.name, comment_prefix="#").select(pl.len()) - assert "FAST_COUNT" in lf.explain() + assert "FAST COUNT" in lf.explain() assert_frame_equal(lf.collect(), expected) @@ -55,7 +55,7 @@ def test_count_parquet(io_files_path: Path, pattern: str, n_rows: int) -> None: expected = pl.DataFrame(pl.Series("len", [n_rows], dtype=pl.UInt32)) # Check if we are using our fast count star - assert "FAST_COUNT" in lf.explain() + assert "FAST COUNT" in lf.explain() assert_frame_equal(lf.collect(), expected) @@ -68,7 +68,7 @@ def test_count_ipc(io_files_path: Path, path: str, n_rows: int) -> None: expected = pl.DataFrame(pl.Series("len", [n_rows], dtype=pl.UInt32)) # Check if we are using our fast count star - assert "FAST_COUNT" in lf.explain() + assert "FAST COUNT" in lf.explain() assert_frame_equal(lf.collect(), expected) @@ -81,7 +81,7 @@ def test_count_ndjson(io_files_path: Path, path: str, n_rows: int) -> None: expected = pl.DataFrame(pl.Series("len", [n_rows], dtype=pl.UInt32)) # Check if we are using our fast count star - assert "FAST_COUNT" in lf.explain() + assert "FAST COUNT" in lf.explain() assert_frame_equal(lf.collect(), expected) diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index f57d8bbf5b38..fe918c866af7 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -1835,3 +1835,45 @@ def test_row_index_projection_pushdown_18463( df.select("index").slice(1, 1).collect(), df.collect().select("index").slice(1, 1), ) + + +def test_concat_multiple_inmem() -> None: + f = io.BytesIO() + g = io.BytesIO() + + df1 = pl.DataFrame( + { + "a": [1, 2, 3], + "b": ["xyz", "abc", "wow"], + } + ) + df2 = pl.DataFrame( + { + "a": [5, 6, 7], + "b": ["a", "few", "entries"], + } + ) + + dfs = pl.concat([df1, df2]) + + df1.write_parquet(f) + df2.write_parquet(g) + + f.seek(0) + g.seek(0) + + assert_frame_equal(pl.read_parquet([f, g]), dfs) + + f.seek(0) + g.seek(0) + + assert_frame_equal(pl.read_parquet([f, g], use_pyarrow=True), dfs) + + f.seek(0) + g.seek(0) + + fb = f.read() + gb = g.read() + + assert_frame_equal(pl.read_parquet([fb, gb]), dfs) + assert_frame_equal(pl.read_parquet([fb, gb], use_pyarrow=True), dfs) diff --git a/py-polars/tests/unit/io/test_scan.py b/py-polars/tests/unit/io/test_scan.py index a4c3ac1d133c..15e30ef87274 100644 --- a/py-polars/tests/unit/io/test_scan.py +++ b/py-polars/tests/unit/io/test_scan.py @@ -769,23 +769,8 @@ def test_scan_stringio(method: str) -> None: @pytest.mark.parametrize( "method", - ["parquet", "csv", "ipc", "ndjson"], + [pl.scan_parquet, pl.scan_csv, pl.scan_ipc, pl.scan_ndjson], ) -def test_nyi_async_scan_in_memory(method: str, monkeypatch: 
pytest.MonkeyPatch) -> None: - f = io.BytesIO() - df = pl.DataFrame( - { - "a": [1, 2, 3], - "b": ["x", "y", "z"], - } - ) - - (getattr(df, f"write_{method}"))(f) - - f.seek(0) - _enable_force_async(monkeypatch) - with pytest.raises( - pl.exceptions.ComputeError, - match="not yet implemented: Asynchronous scanning of in-memory buffers", - ): - (getattr(pl, f"scan_{method}"))(f).collect() +def test_empty_list(method: Callable[[list[str]], pl.LazyFrame]) -> None: + with pytest.raises(pl.exceptions.ComputeError, match="expected at least 1 source"): + _ = (method)([]).collect() From 3070d7e26e93a83e4d008dcdab550753055a76a6 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 6 Sep 2024 14:40:36 +0200 Subject: [PATCH 13/27] small fixes --- crates/polars-plan/src/plans/functions/mod.rs | 2 +- py-polars/polars/io/csv/functions.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/crates/polars-plan/src/plans/functions/mod.rs b/crates/polars-plan/src/plans/functions/mod.rs index 2729596b9ab7..61cce46de9af 100644 --- a/crates/polars-plan/src/plans/functions/mod.rs +++ b/crates/polars-plan/src/plans/functions/mod.rs @@ -359,7 +359,7 @@ impl Display for FunctionIR { write!( f, "FAST COUNT ({scan_type}) {} as \"{alias}\"", - ScanSourcesDisplay(&sources) + ScanSourcesDisplay(sources) ) }, v => { diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index a6002602af6a..7164856901c6 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -443,6 +443,8 @@ def read_csv( # * The `storage_options` configuration keys are different between # fsspec and object_store (would require a breaking change) ): + source = normalize_filepath(source, check_not_directory=False) + if schema_overrides_is_list: msg = "passing a list to `schema_overrides` is unsupported for hf:// paths" raise ValueError(msg) From 8b4b5232a5afbaf22064e76dcbcadf4956e86ffd Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 6 Sep 2024 15:29:26 +0200 Subject: [PATCH 14/27] fix a million mypy errors --- py-polars/polars/io/csv/functions.py | 10 +++++--- py-polars/polars/io/ipc/functions.py | 29 +++++++++++++----------- py-polars/polars/io/ndjson.py | 13 ++++++++--- py-polars/polars/io/parquet/functions.py | 26 +++++++++++---------- py-polars/tests/unit/io/test_parquet.py | 7 +++--- 5 files changed, 51 insertions(+), 34 deletions(-) diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 7164856901c6..d8bc983e0fcc 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -443,7 +443,7 @@ def read_csv( # * The `storage_options` configuration keys are different between # fsspec and object_store (would require a breaking change) ): - source = normalize_filepath(source, check_not_directory=False) + source = normalize_filepath(v, check_not_directory=False) if schema_overrides_is_list: msg = "passing a list to `schema_overrides` is unsupported for hf:// paths" @@ -453,7 +453,7 @@ def read_csv( raise ValueError(msg) lf = _scan_csv_impl( - source, # type: ignore[arg-type] + source, has_header=has_header, separator=separator, comment_prefix=comment_prefix, @@ -1249,7 +1249,11 @@ def with_column_names(cols: list[str]) -> list[str]: pass else: source = [ - normalize_filepath(source, check_not_directory=False) for source in source + normalize_filepath( + source, # type: ignore[arg-type] + check_not_directory=False, + ) + for source in source ] if not infer_schema: diff --git 
a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index 9945e1c6cbb7..e0e213c376da 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -2,7 +2,7 @@ import contextlib import os -from io import BytesIO +from io import BytesIO, BufferedIOBase from pathlib import Path from typing import IO, TYPE_CHECKING, Any, Sequence @@ -427,20 +427,23 @@ def scan_ipc( include_file_paths Include the path of the source file(s) as a column with this name. """ + + sources: list[str] | list[Path] | list[IO[bytes]] = [] if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - sources = [] - elif isinstance(source, BytesIO): - sources = [] - elif ( - isinstance(source, list) and len(source) > 0 and isinstance(source[0], BytesIO) - ): - sources = source - source = None # type: ignore[assignment] - else: - sources = [ - normalize_filepath(source, check_not_directory=False) for source in source - ] + elif isinstance(source, list): + if len(source) > 0: + if isinstance(source[0], (str, Path)): + sources = [ + normalize_filepath( + source, # type: ignore[arg-type] + check_not_directory=False, + ) + for source in source + ] + else: + sources = source + source = None # type: ignore[assignment] pylf = PyLazyFrame.new_from_ipc( diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index dfea6cf2871f..135ff1a35d75 100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -254,11 +254,12 @@ def scan_ndjson( include_file_paths Include the path of the source file(s) as a column with this name. """ + + sources: list[str] | list[Path] | list[IO[str]] | list[IO[bytes]] = [] if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - sources = [] elif isinstance(source, (BytesIO, StringIO)): - sources = [] + pass elif ( isinstance(source, list) and len(source) > 0 @@ -267,8 +268,14 @@ def scan_ndjson( sources = source source = None # type: ignore[assignment] else: + assert all(isinstance(s, (str, Path)) for s in source) + sources = [ - normalize_filepath(source, check_not_directory=False) for source in source + normalize_filepath( + source, # type: ignore[arg-type] + check_not_directory=False, + ) + for source in source ] source = None # type: ignore[assignment] if infer_schema_length == 0: diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 7e1b2b9a93b1..61b3b585d067 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -3,7 +3,7 @@ import contextlib import io from pathlib import Path -from typing import IO, TYPE_CHECKING, Any +from typing import IO, TYPE_CHECKING, Any, List import polars.functions as F from polars import concat as plconcat @@ -176,7 +176,8 @@ def read_parquet( if isinstance(source, bytes): source = io.BytesIO(source) elif isinstance(source, list) and len(source) > 0 and isinstance(source[0], bytes): - source = [io.BytesIO(s) for s in source] # type: ignore[arg-type] + assert all(isinstance(s, bytes) for s in source) + source = [io.BytesIO(s) for s in source] # type: ignore # For other inputs, defer to `scan_parquet` lf = scan_parquet( @@ -230,16 +231,16 @@ def _read_parquet_with_pyarrow( ) pyarrow_options = pyarrow_options or {} - if ( - isinstance(source, list) - and len(source) > 0 - and isinstance(source[0], (bytes, io.BytesIO)) - ): - sources = source + sources: list[str | Path | IO[bytes] | bytes | list[str] | 
list[Path]] = [] + if isinstance(source, list): + if len(source) > 0 and isinstance(source[0], (bytes, io.BytesIO)): + sources = source # type: ignore + else: + sources = [source] # type: ignore else: sources = [source] - results = [] + results: list[DataFrame] = [] for source in sources: with prepare_file_arg( source, # type: ignore[arg-type] @@ -253,7 +254,7 @@ def _read_parquet_with_pyarrow( **pyarrow_options, ) result = from_arrow(pa_table, rechunk=rechunk) # type: ignore[return-value] - results.append(result) + results.append(result) # type: ignore[arg-type] if len(results) == 1: return results[0] @@ -423,11 +424,12 @@ def scan_parquet( pass else: source = [ - normalize_filepath(source, check_not_directory=False) for source in source + normalize_filepath(source, check_not_directory=False) # type: ignore[arg-type] + for source in source ] return _scan_parquet_impl( - source, + source, # type: ignore[arg-type] n_rows=n_rows, cache=cache, parallel=parallel, diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index fe918c866af7..ad7a497e585f 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -3,7 +3,7 @@ import io from datetime import datetime, time, timezone from decimal import Decimal -from typing import TYPE_CHECKING, Any, Literal, cast +from typing import TYPE_CHECKING, Any, Literal, cast, IO import fsspec import numpy as np @@ -1862,12 +1862,13 @@ def test_concat_multiple_inmem() -> None: f.seek(0) g.seek(0) - assert_frame_equal(pl.read_parquet([f, g]), dfs) + items: list[IO[bytes]] = [f, g] + assert_frame_equal(pl.read_parquet(items), dfs) f.seek(0) g.seek(0) - assert_frame_equal(pl.read_parquet([f, g], use_pyarrow=True), dfs) + assert_frame_equal(pl.read_parquet(items, use_pyarrow=True), dfs) f.seek(0) g.seek(0) From 17b90b948eb78d9521b60bab268af08028d5ce35 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 6 Sep 2024 15:33:41 +0200 Subject: [PATCH 15/27] mypy --- py-polars/polars/io/ipc/functions.py | 2 -- py-polars/polars/io/ndjson.py | 1 - py-polars/polars/io/parquet/functions.py | 18 +++++++++--------- py-polars/tests/unit/io/test_parquet.py | 2 +- 4 files changed, 10 insertions(+), 13 deletions(-) diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index e0e213c376da..5142b2ae68c6 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -2,7 +2,6 @@ import contextlib import os -from io import BytesIO, BufferedIOBase from pathlib import Path from typing import IO, TYPE_CHECKING, Any, Sequence @@ -427,7 +426,6 @@ def scan_ipc( include_file_paths Include the path of the source file(s) as a column with this name. """ - sources: list[str] | list[Path] | list[IO[bytes]] = [] if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index 135ff1a35d75..6cff4ddb1643 100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -254,7 +254,6 @@ def scan_ndjson( include_file_paths Include the path of the source file(s) as a column with this name. 
""" - sources: list[str] | list[Path] | list[IO[str]] | list[IO[bytes]] = [] if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 61b3b585d067..04cccc85fc5d 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -3,7 +3,7 @@ import contextlib import io from pathlib import Path -from typing import IO, TYPE_CHECKING, Any, List +from typing import IO, TYPE_CHECKING, Any import polars.functions as F from polars import concat as plconcat @@ -176,8 +176,8 @@ def read_parquet( if isinstance(source, bytes): source = io.BytesIO(source) elif isinstance(source, list) and len(source) > 0 and isinstance(source[0], bytes): - assert all(isinstance(s, bytes) for s in source) - source = [io.BytesIO(s) for s in source] # type: ignore + assert all(isinstance(s, bytes) for s in source) + source = [io.BytesIO(s) for s in source] # type: ignore[arg-type, assignment] # For other inputs, defer to `scan_parquet` lf = scan_parquet( @@ -234,9 +234,9 @@ def _read_parquet_with_pyarrow( sources: list[str | Path | IO[bytes] | bytes | list[str] | list[Path]] = [] if isinstance(source, list): if len(source) > 0 and isinstance(source[0], (bytes, io.BytesIO)): - sources = source # type: ignore + sources = source # type: ignore[assignment] else: - sources = [source] # type: ignore + sources = [source] # type: ignore[list-item] else: sources = [source] @@ -253,8 +253,8 @@ def _read_parquet_with_pyarrow( columns=columns, **pyarrow_options, ) - result = from_arrow(pa_table, rechunk=rechunk) # type: ignore[return-value] - results.append(result) # type: ignore[arg-type] + result = from_arrow(pa_table, rechunk=rechunk) + results.append(result) # type: ignore[arg-type] if len(results) == 1: return results[0] @@ -424,12 +424,12 @@ def scan_parquet( pass else: source = [ - normalize_filepath(source, check_not_directory=False) # type: ignore[arg-type] + normalize_filepath(source, check_not_directory=False) # type: ignore[arg-type] for source in source ] return _scan_parquet_impl( - source, # type: ignore[arg-type] + source, # type: ignore[arg-type] n_rows=n_rows, cache=cache, parallel=parallel, diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index ad7a497e585f..db3186a3f874 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -3,7 +3,7 @@ import io from datetime import datetime, time, timezone from decimal import Decimal -from typing import TYPE_CHECKING, Any, Literal, cast, IO +from typing import IO, TYPE_CHECKING, Any, Literal, cast import fsspec import numpy as np From daae7f1597553cdb78e0fac7f8428c8af144300c Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 6 Sep 2024 18:30:32 +0200 Subject: [PATCH 16/27] almost working further file support --- crates/polars-io/src/csv/read/parser.rs | 20 +- crates/polars-io/src/mmap.rs | 6 + crates/polars-lazy/src/scan/csv.rs | 16 +- .../polars-lazy/src/scan/file_list_reader.rs | 6 +- crates/polars-lazy/src/scan/ipc.rs | 4 +- crates/polars-lazy/src/scan/ndjson.rs | 4 +- crates/polars-lazy/src/scan/parquet.rs | 4 +- .../src/executors/scan/csv.rs | 22 +- .../src/executors/scan/ipc.rs | 11 +- .../src/executors/scan/ndjson.rs | 29 +-- .../src/executors/scan/parquet.rs | 22 +- crates/polars-mem-engine/src/utils.rs | 6 +- crates/polars-plan/src/client/check.rs | 5 +- .../src/plans/conversion/dsl_to_ir.rs | 4 
+- .../polars-plan/src/plans/conversion/scans.rs | 65 +++-- .../polars-plan/src/plans/functions/count.rs | 89 +++---- crates/polars-plan/src/plans/ir/dot.rs | 3 +- crates/polars-plan/src/plans/ir/mod.rs | 108 +++++++-- .../src/plans/optimizer/count_star.rs | 35 ++- .../plans/optimizer/predicate_pushdown/mod.rs | 2 +- crates/polars-python/src/conversion/mod.rs | 13 +- crates/polars-python/src/dataframe/io.rs | 229 +----------------- crates/polars-python/src/file.rs | 50 ++++ crates/polars-python/src/lazyframe/general.rs | 3 +- .../src/utils/late_materialized_df.rs | 2 +- py-polars/polars/io/csv/functions.py | 139 ++++------- py-polars/polars/io/ipc/functions.py | 52 ++-- py-polars/polars/io/ndjson.py | 27 +-- py-polars/polars/io/parquet/functions.py | 9 +- 29 files changed, 367 insertions(+), 618 deletions(-) diff --git a/crates/polars-io/src/csv/read/parser.rs b/crates/polars-io/src/csv/read/parser.rs index ccda4805792b..282a304003a3 100644 --- a/crates/polars-io/src/csv/read/parser.rs +++ b/crates/polars-io/src/csv/read/parser.rs @@ -40,15 +40,7 @@ pub fn count_rows( let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; let owned = &mut vec![]; - let mut reader_bytes = maybe_decompress_bytes(mmap.as_ref(), owned)?; - - for _ in 0..reader_bytes.len() { - if reader_bytes[0] != eol_char { - break; - } - - reader_bytes = &reader_bytes[1..]; - } + let reader_bytes = maybe_decompress_bytes(mmap.as_ref(), owned)?; count_rows_from_slice( reader_bytes, @@ -63,13 +55,21 @@ pub fn count_rows( /// Read the number of rows without parsing columns /// useful for count(*) queries pub fn count_rows_from_slice( - bytes: &[u8], + mut bytes: &[u8], separator: u8, quote_char: Option, comment_prefix: Option<&CommentPrefix>, eol_char: u8, has_header: bool, ) -> PolarsResult { + for _ in 0..bytes.len() { + if bytes[0] != eol_char { + break; + } + + bytes = &bytes[1..]; + } + const MIN_ROWS_PER_THREAD: usize = 1024; let max_threads = POOL.current_num_threads(); diff --git a/crates/polars-io/src/mmap.rs b/crates/polars-io/src/mmap.rs index 66ea8ed7b48b..ad2c05175810 100644 --- a/crates/polars-io/src/mmap.rs +++ b/crates/polars-io/src/mmap.rs @@ -104,6 +104,12 @@ impl MmapBytesReader for BufReader { } } +impl MmapBytesReader for BufReader<&File> { + fn to_file(&self) -> Option<&File> { + Some(self.get_ref()) + } +} + impl MmapBytesReader for Cursor where T: AsRef<[u8]> + Send + Sync, diff --git a/crates/polars-lazy/src/scan/csv.rs b/crates/polars-lazy/src/scan/csv.rs index 83e34cff0fe5..998f422820c6 100644 --- a/crates/polars-lazy/src/scan/csv.rs +++ b/crates/polars-lazy/src/scan/csv.rs @@ -32,7 +32,7 @@ impl LazyCsvReader { } pub fn new_paths(paths: Arc<[PathBuf]>) -> Self { - Self::new_with_sources(ScanSources::Files(paths)) + Self::new_with_sources(ScanSources::Paths(paths)) } pub fn new_with_sources(sources: ScanSources) -> Self { @@ -47,7 +47,7 @@ impl LazyCsvReader { } pub fn new(path: impl AsRef) -> Self { - Self::new_with_sources(ScanSources::Files([path.as_ref().to_path_buf()].into())) + Self::new_with_sources(ScanSources::Paths([path.as_ref().to_path_buf()].into())) } /// Skip this number of rows after the header location. 
@@ -254,7 +254,7 @@ impl LazyCsvReader { }; let schema = match self.sources.clone() { - ScanSources::Files(paths) => { + ScanSources::Paths(paths) => { // TODO: Path expansion should happen when converting to the IR // https://github.com/pola-rs/polars/issues/17634 let paths = expand_paths(&paths[..], self.glob(), self.cloud_options())?; @@ -266,6 +266,16 @@ impl LazyCsvReader { let mut file = polars_utils::open_file(path)?; infer_schema(get_reader_bytes(&mut file).expect("could not mmap file"))? }, + ScanSources::Files(files) => { + let Some(file) = files.first() else { + polars_bail!(ComputeError: "no buffers specified for this reader"); + }; + + infer_schema( + get_reader_bytes(&mut std::io::BufReader::new(file)) + .expect("could not mmap file"), + )? + }, ScanSources::Buffers(buffers) => { let Some(buffer) = buffers.first() else { polars_bail!(ComputeError: "no buffers specified for this reader"); diff --git a/crates/polars-lazy/src/scan/file_list_reader.rs b/crates/polars-lazy/src/scan/file_list_reader.rs index 2c8c9d86dd33..28315c96f736 100644 --- a/crates/polars-lazy/src/scan/file_list_reader.rs +++ b/crates/polars-lazy/src/scan/file_list_reader.rs @@ -19,8 +19,8 @@ pub trait LazyFileListReader: Clone { return self.finish_no_glob(); } - let ScanSources::Files(paths) = self.sources() else { - unreachable!("in-memory buffers should never be globbed"); + let ScanSources::Paths(paths) = self.sources() else { + unreachable!("opened-files or in-memory buffers should never be globbed"); }; let lfs = paths @@ -93,7 +93,7 @@ pub trait LazyFileListReader: Clone { /// Set paths of the scanned files. #[must_use] fn with_paths(self, paths: Arc<[PathBuf]>) -> Self { - self.with_sources(ScanSources::Files(paths)) + self.with_sources(ScanSources::Paths(paths)) } /// Configure the row limit. diff --git a/crates/polars-lazy/src/scan/ipc.rs b/crates/polars-lazy/src/scan/ipc.rs index e70434e39d8d..8d84ef3de049 100644 --- a/crates/polars-lazy/src/scan/ipc.rs +++ b/crates/polars-lazy/src/scan/ipc.rs @@ -125,13 +125,13 @@ impl LazyFrame { /// Create a LazyFrame directly from a ipc scan. pub fn scan_ipc(path: impl AsRef, args: ScanArgsIpc) -> PolarsResult { Self::scan_ipc_sources( - ScanSources::Files([path.as_ref().to_path_buf()].into()), + ScanSources::Paths([path.as_ref().to_path_buf()].into()), args, ) } pub fn scan_ipc_files(paths: Arc<[PathBuf]>, args: ScanArgsIpc) -> PolarsResult { - Self::scan_ipc_sources(ScanSources::Files(paths), args) + Self::scan_ipc_sources(ScanSources::Paths(paths), args) } pub fn scan_ipc_sources(sources: ScanSources, args: ScanArgsIpc) -> PolarsResult { diff --git a/crates/polars-lazy/src/scan/ndjson.rs b/crates/polars-lazy/src/scan/ndjson.rs index 195f8e0372a3..e38270ec3e09 100644 --- a/crates/polars-lazy/src/scan/ndjson.rs +++ b/crates/polars-lazy/src/scan/ndjson.rs @@ -29,7 +29,7 @@ pub struct LazyJsonLineReader { impl LazyJsonLineReader { pub fn new_paths(paths: Arc<[PathBuf]>) -> Self { - Self::new_with_sources(ScanSources::Files(paths)) + Self::new_with_sources(ScanSources::Paths(paths)) } pub fn new_with_sources(sources: ScanSources) -> Self { @@ -50,7 +50,7 @@ impl LazyJsonLineReader { } pub fn new(path: impl AsRef) -> Self { - Self::new_with_sources(ScanSources::Files([path.as_ref().to_path_buf()].into())) + Self::new_with_sources(ScanSources::Paths([path.as_ref().to_path_buf()].into())) } /// Add a row index column. 
diff --git a/crates/polars-lazy/src/scan/parquet.rs b/crates/polars-lazy/src/scan/parquet.rs index ff4f9a73ec78..9adb0f1838be 100644 --- a/crates/polars-lazy/src/scan/parquet.rs +++ b/crates/polars-lazy/src/scan/parquet.rs @@ -140,7 +140,7 @@ impl LazyFrame { /// Create a LazyFrame directly from a parquet scan. pub fn scan_parquet(path: impl AsRef, args: ScanArgsParquet) -> PolarsResult { Self::scan_parquet_sources( - ScanSources::Files([path.as_ref().to_path_buf()].into()), + ScanSources::Paths([path.as_ref().to_path_buf()].into()), args, ) } @@ -152,6 +152,6 @@ impl LazyFrame { /// Create a LazyFrame directly from a parquet scan. pub fn scan_parquet_files(paths: Arc<[PathBuf]>, args: ScanArgsParquet) -> PolarsResult { - Self::scan_parquet_sources(ScanSources::Files(paths), args) + Self::scan_parquet_sources(ScanSources::Paths(paths), args) } } diff --git a/crates/polars-mem-engine/src/executors/scan/csv.rs b/crates/polars-mem-engine/src/executors/scan/csv.rs index a9048481a4ac..c00a0047d525 100644 --- a/crates/polars-mem-engine/src/executors/scan/csv.rs +++ b/crates/polars-mem-engine/src/executors/scan/csv.rs @@ -4,8 +4,6 @@ use polars_core::config; use polars_core::utils::{ accumulate_dataframes_vertical, accumulate_dataframes_vertical_unchecked, }; -use polars_error::feature_gated; -use polars_utils::mmap::MemSlice; use super::*; @@ -68,25 +66,7 @@ impl CsvExec { let source = self.sources.at(i); let owned = &mut vec![]; - let memslice = match source { - ScanSourceRef::File(path) => { - let file = if run_async { - feature_gated!("cloud", { - polars_io::file_cache::FILE_CACHE - .get_entry(path.to_str().unwrap()) - // Safety: This was initialized by schema inference. - .unwrap() - .try_open_assume_latest() - }) - } else { - polars_utils::open_file(path) - }?; - - let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; - MemSlice::from_mmap(Arc::new(mmap)) - }, - ScanSourceRef::Buffer(buffer) => MemSlice::from_bytes(buffer.clone()), - }; + let memslice = source.to_memslice_async_latest(run_async)?; let reader = std::io::Cursor::new(maybe_decompress_bytes(&memslice, owned)?); let mut df = options diff --git a/crates/polars-mem-engine/src/executors/scan/ipc.rs b/crates/polars-mem-engine/src/executors/scan/ipc.rs index 33ad2f54f429..acbcc2d28dd6 100644 --- a/crates/polars-mem-engine/src/executors/scan/ipc.rs +++ b/crates/polars-mem-engine/src/executors/scan/ipc.rs @@ -24,8 +24,8 @@ pub struct IpcExec { impl IpcExec { fn read(&mut self) -> PolarsResult { let is_cloud = match &self.sources { - ScanSources::Files(paths) => paths.iter().any(is_cloud_url), - ScanSources::Buffers(_) => false, + ScanSources::Paths(paths) => paths.iter().any(is_cloud_url), + ScanSources::Files(_) | ScanSources::Buffers(_) => false, }; let force_async = config::force_async(); @@ -75,13 +75,16 @@ impl IpcExec { let source = self.sources.at(index); let memslice = match source { - ScanSourceRef::File(path) => { + ScanSourceRef::Path(path) => { let file = match idx_to_cached_file(index) { None => std::fs::File::open(path)?, Some(f) => f?, }; - MemSlice::from_mmap(Arc::new(unsafe { memmap::Mmap::map(&file).unwrap() })) + MemSlice::from_mmap(Arc::new(unsafe { memmap::Mmap::map(&file)? })) + }, + ScanSourceRef::File(file) => { + MemSlice::from_mmap(Arc::new(unsafe { memmap::Mmap::map(file)? 
})) }, ScanSourceRef::Buffer(buff) => MemSlice::from_bytes(buff.clone()), }; diff --git a/crates/polars-mem-engine/src/executors/scan/ndjson.rs b/crates/polars-mem-engine/src/executors/scan/ndjson.rs index fb55cb2b38e5..06e1d18892c6 100644 --- a/crates/polars-mem-engine/src/executors/scan/ndjson.rs +++ b/crates/polars-mem-engine/src/executors/scan/ndjson.rs @@ -1,7 +1,5 @@ use polars_core::config; use polars_core::utils::accumulate_dataframes_vertical; -use polars_error::feature_gated; -use polars_utils::mmap::MemSlice; use super::*; @@ -76,30 +74,9 @@ impl JsonExec { let row_index = self.file_scan_options.row_index.as_mut(); - let memslice = match source { - ScanSourceRef::File(path) => { - let file = if run_async { - feature_gated!("cloud", { - match polars_io::file_cache::FILE_CACHE - .get_entry(path.to_str().unwrap()) - // Safety: This was initialized by schema inference. - .unwrap() - .try_open_assume_latest() - { - Ok(v) => v, - Err(e) => return Some(Err(e)), - } - }) - } else { - match polars_utils::open_file(path) { - Ok(v) => v, - Err(e) => return Some(Err(e)), - } - }; - - MemSlice::from_mmap(Arc::new(unsafe { memmap::Mmap::map(&file).unwrap() })) - }, - ScanSourceRef::Buffer(buff) => MemSlice::from_bytes(buff.clone()), + let memslice = match source.to_memslice_async_latest(run_async) { + Ok(memslice) => memslice, + Err(err) => return Some(Err(err)), }; let owned = &mut vec![]; diff --git a/crates/polars-mem-engine/src/executors/scan/parquet.rs b/crates/polars-mem-engine/src/executors/scan/parquet.rs index fd0f53b6d728..2f32e0b50aa3 100644 --- a/crates/polars-mem-engine/src/executors/scan/parquet.rs +++ b/crates/polars-mem-engine/src/executors/scan/parquet.rs @@ -8,7 +8,6 @@ use polars_io::cloud::CloudOptions; use polars_io::parquet::metadata::FileMetaDataRef; use polars_io::utils::slice::split_slice_at_file; use polars_io::RowIndex; -use polars_utils::mmap::MemSlice; use super::*; @@ -82,18 +81,7 @@ impl ParquetExec { let row_counts = path_indexes .into_par_iter() .map(|&i| { - let memslice = match self.sources.at(i) { - ScanSourceRef::File(path) => { - let file = std::fs::File::open(path)?; - MemSlice::from_mmap(Arc::new(unsafe { - memmap::Mmap::map(&file).unwrap() - })) - }, - ScanSourceRef::Buffer(buff) => { - MemSlice::from_bytes(buff.clone()) - }, - }; - + let memslice = self.sources.at(i).to_memslice()?; ParquetReader::new(std::io::Cursor::new(memslice)).num_rows() }) .collect::>>()?; @@ -161,13 +149,7 @@ impl ParquetExec { hive_partitions.as_deref(), ); - let memslice = match source { - ScanSourceRef::File(path) => { - let file = std::fs::File::open(path)?; - MemSlice::from_mmap(Arc::new(unsafe { memmap::Mmap::map(&file).unwrap() })) - }, - ScanSourceRef::Buffer(buff) => MemSlice::from_bytes(buff.clone()), - }; + let memslice = source.to_memslice()?; let mut reader = ParquetReader::new(std::io::Cursor::new(memslice)) .read_parallel(parallel) diff --git a/crates/polars-mem-engine/src/utils.rs b/crates/polars-mem-engine/src/utils.rs index 06941cbc128d..91bd0e17902a 100644 --- a/crates/polars-mem-engine/src/utils.rs +++ b/crates/polars-mem-engine/src/utils.rs @@ -9,7 +9,7 @@ use polars_utils::arena::{Arena, Node}; /// /// # Notes /// -/// - Scan sources with in-memory buffers are ignored. +/// - Scan sources with opened files or in-memory buffers are ignored. pub(crate) fn agg_source_paths<'a>( root_lp: Node, acc_paths: &mut PlHashSet<&'a Path>, @@ -18,8 +18,8 @@ pub(crate) fn agg_source_paths<'a>( for (_, lp) in lp_arena.iter(root_lp) { if let IR::Scan { sources, .. 
} = lp { match sources { - ScanSources::Files(paths) => acc_paths.extend(paths.iter().map(|p| p.as_path())), - ScanSources::Buffers(_) => { + ScanSources::Paths(paths) => acc_paths.extend(paths.iter().map(|p| p.as_path())), + ScanSources::Buffers(_) | ScanSources::Files(_) => { // Ignore }, } diff --git a/crates/polars-plan/src/client/check.rs b/crates/polars-plan/src/client/check.rs index 1f5562bb4670..84189840a3dd 100644 --- a/crates/polars-plan/src/client/check.rs +++ b/crates/polars-plan/src/client/check.rs @@ -15,11 +15,14 @@ pub(super) fn assert_cloud_eligible(dsl: &DslPlan) -> PolarsResult<()> { } => { let sources_lock = sources.lock().unwrap(); match &sources_lock.sources { - ScanSources::Files(paths) => { + ScanSources::Paths(paths) => { if paths.iter().any(|p| !is_cloud_url(p)) { return ineligible_error("contains scan of local file system"); } }, + ScanSources::Files(_) => { + return ineligible_error("contains scan of opened files"); + }, ScanSources::Buffers(_) => { return ineligible_error("contains scan of in-memory buffer"); }, diff --git a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs index 437db91f1975..084779a68a28 100644 --- a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs +++ b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs @@ -826,7 +826,7 @@ impl DslScanSources { return Ok(()); } - let ScanSources::Files(paths) = &self.sources else { + let ScanSources::Paths(paths) = &self.sources else { self.is_expanded = true; return Ok(()); }; @@ -853,7 +853,7 @@ impl DslScanSources { #[allow(unreachable_code)] { - self.sources = ScanSources::Files(expanded_sources); + self.sources = ScanSources::Paths(expanded_sources); self.is_expanded = true; Ok(()) diff --git a/crates/polars-plan/src/plans/conversion/scans.rs b/crates/polars-plan/src/plans/conversion/scans.rs index f7c64ed7612b..2b20d9fe932a 100644 --- a/crates/polars-plan/src/plans/conversion/scans.rs +++ b/crates/polars-plan/src/plans/conversion/scans.rs @@ -35,44 +35,34 @@ pub(super) fn parquet_file_info( ) -> PolarsResult<(FileInfo, Option)> { use polars_core::error::feature_gated; - let first_source = sources - .first() - .ok_or_else(|| polars_err!(ComputeError: "expected at least 1 source"))?; - - let (reader_schema, num_rows, metadata) = match first_source { - ScanSourceRef::File(path) => { - if is_cloud_url(path) { - feature_gated!("cloud", { - let uri = path.to_string_lossy(); - get_runtime().block_on(async { - let mut reader = - ParquetAsyncReader::from_uri(&uri, cloud_options, None).await?; - - PolarsResult::Ok(( - reader.schema().await?, - Some(reader.num_rows().await?), - Some(reader.get_metadata().await?.clone()), - )) - })? - }) - } else { - let file = polars_utils::open_file(path)?; - let mut reader = ParquetReader::new(file); - ( - reader.schema()?, - Some(reader.num_rows()?), - Some(reader.get_metadata()?.clone()), - ) - } - }, - ScanSourceRef::Buffer(buffer) => { - let mut reader = ParquetReader::new(std::io::Cursor::new(buffer)); + let (reader_schema, num_rows, metadata) = { + if sources.is_cloud_url() { + let first_path = &sources.as_paths().unwrap()[0]; + feature_gated!("cloud", { + let uri = first_path.to_string_lossy(); + get_runtime().block_on(async { + let mut reader = + ParquetAsyncReader::from_uri(&uri, cloud_options, None).await?; + + PolarsResult::Ok(( + reader.schema().await?, + Some(reader.num_rows().await?), + Some(reader.get_metadata().await?.clone()), + )) + })? 
+ }) + } else { + let first_source = sources + .first() + .ok_or_else(|| polars_err!(ComputeError: "expected at least 1 source"))?; + let memslice = first_source.to_memslice()?; + let mut reader = ParquetReader::new(std::io::Cursor::new(memslice)); ( reader.schema()?, Some(reader.num_rows()?), Some(reader.get_metadata()?.clone()), ) - }, + } }; let schema = prepare_output_schema( @@ -103,7 +93,7 @@ pub(super) fn ipc_file_info( }; let metadata = match first { - ScanSourceRef::File(path) => { + ScanSourceRef::Path(path) => { if is_cloud_url(path) { feature_gated!("cloud", { let uri = path.to_string_lossy(); @@ -120,6 +110,9 @@ pub(super) fn ipc_file_info( ))? } }, + ScanSourceRef::File(file) => { + arrow::io::ipc::read::read_file_metadata(&mut std::io::BufReader::new(file))? + }, ScanSourceRef::Buffer(buff) => { arrow::io::ipc::read::read_file_metadata(&mut std::io::Cursor::new(buff))? }, @@ -182,7 +175,7 @@ pub(super) fn csv_file_info( let infer_schema_func = |i| { let source = sources.at(i); - let memslice = source.to_memslice(run_async, cache_entries.as_ref(), i)?; + let memslice = source.to_memslice_possibly_async(run_async, cache_entries.as_ref(), i)?; let owned = &mut vec![]; let mut reader = std::io::Cursor::new(maybe_decompress_bytes(&memslice, owned)?); if reader.read(&mut [0; 4])? < 2 && csv_options.raise_if_empty { @@ -308,7 +301,7 @@ pub(super) fn ndjson_file_info( ) } } else { - let memslice = first.to_memslice(run_async, cache_entries.as_ref(), 0)?; + let memslice = first.to_memslice_possibly_async(run_async, cache_entries.as_ref(), 0)?; let mut reader = std::io::Cursor::new(maybe_decompress_bytes(&memslice, owned)?); let schema = diff --git a/crates/polars-plan/src/plans/functions/count.rs b/crates/polars-plan/src/plans/functions/count.rs index 3bba674edb89..0b16c8eac994 100644 --- a/crates/polars-plan/src/plans/functions/count.rs +++ b/crates/polars-plan/src/plans/functions/count.rs @@ -96,7 +96,7 @@ fn count_all_rows_csv( sources .iter() .map(|source| match source { - ScanSourceRef::File(path) => count_rows_csv( + ScanSourceRef::Path(path) => count_rows_csv( path, parse_options.separator, parse_options.quote_char, @@ -104,14 +104,18 @@ fn count_all_rows_csv( parse_options.eol_char, options.has_header, ), - ScanSourceRef::Buffer(buf) => count_rows_csv_from_slice( - &buf[..], - parse_options.separator, - parse_options.quote_char, - parse_options.comment_prefix.as_ref(), - parse_options.eol_char, - options.has_header, - ), + _ => { + let memslice = source.to_memslice()?; + + count_rows_csv_from_slice( + &memslice[..], + parse_options.separator, + parse_options.quote_char, + parse_options.comment_prefix.as_ref(), + parse_options.eol_char, + options.has_header, + ) + }, }) .sum() } @@ -136,13 +140,8 @@ pub(super) fn count_rows_parquet( } else { sources .iter() - .map(|source| match source { - ScanSourceRef::File(path) => { - ParquetReader::new(polars_utils::open_file(path)?).num_rows() - }, - ScanSourceRef::Buffer(buffer) => { - ParquetReader::new(std::io::Cursor::new(buffer)).num_rows() - }, + .map(|source| { + ParquetReader::new(std::io::Cursor::new(source.to_memslice()?)).num_rows() }) .sum::>() } @@ -187,13 +186,9 @@ pub(super) fn count_rows_ipc( } else { sources .iter() - .map(|source| match source { - ScanSourceRef::File(path) => { - count_rows_ipc_sync(&mut polars_utils::open_file(path)?).map(|v| v as usize) - }, - ScanSourceRef::Buffer(buffer) => { - count_rows_ipc_sync(&mut std::io::Cursor::new(buffer)).map(|v| v as usize) - }, + .map(|source| { + let memslice = 
source.to_memslice()?; + count_rows_ipc_sync(&mut std::io::Cursor::new(memslice)).map(|v| v as usize) }) .sum::>() } @@ -234,8 +229,8 @@ pub(super) fn count_rows_ndjson( let run_async = is_cloud_url || (sources.is_files() && config::force_async()); let cache_entries = { - feature_gated!("cloud", { - if run_async { + if run_async { + feature_gated!("cloud", { Some(polars_io::file_cache::init_entries_from_uri_list( sources .as_paths() @@ -246,43 +241,23 @@ pub(super) fn count_rows_ndjson( .as_slice(), cloud_options, )?) - } else { - None - } - }) + }) + } else { + None + } }; sources .iter() - .map(|source| match source { - ScanSourceRef::File(path) => { - let f = if run_async { - feature_gated!("cloud", { - let entry: &Arc = - &cache_entries.as_ref().unwrap()[0]; - entry.try_open_check_latest()? - }) - } else { - polars_utils::open_file(path)? - }; + .map(|source| { + let memslice = + source.to_memslice_possibly_async(run_async, cache_entries.as_ref(), 0)?; - let mmap = unsafe { memmap::Mmap::map(&f).unwrap() }; - let owned = &mut vec![]; - - let reader = polars_io::ndjson::core::JsonLineReader::new(std::io::Cursor::new( - maybe_decompress_bytes(mmap.as_ref(), owned)?, - )); - reader.count() - }, - ScanSourceRef::Buffer(buffer) => { - polars_ensure!(!run_async, nyi = "BytesIO with force_async"); - - let owned = &mut vec![]; - let reader = polars_io::ndjson::core::JsonLineReader::new(std::io::Cursor::new( - maybe_decompress_bytes(buffer, owned)?, - )); - reader.count() - }, + let owned = &mut vec![]; + let reader = polars_io::ndjson::core::JsonLineReader::new(std::io::Cursor::new( + maybe_decompress_bytes(&memslice[..], owned)?, + )); + reader.count() }) .sum() } diff --git a/crates/polars-plan/src/plans/ir/dot.rs b/crates/polars-plan/src/plans/ir/dot.rs index 3ece8966a857..51050f2fa877 100644 --- a/crates/polars-plan/src/plans/ir/dot.rs +++ b/crates/polars-plan/src/plans/ir/dot.rs @@ -351,7 +351,8 @@ struct OptionExprIRDisplay<'a>(Option>); impl fmt::Display for ScanSourceRef<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - ScanSourceRef::File(path) => path.display().fmt(f), + ScanSourceRef::Path(path) => path.display().fmt(f), + ScanSourceRef::File(_) => f.write_str("open-file"), ScanSourceRef::Buffer(buff) => write!(f, "{} in-mem bytes", buff.len()), } } diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index e2f2ca3eae3d..67ba46d4aca2 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -6,6 +6,7 @@ pub(crate) mod tree_format; use std::borrow::Cow; use std::fmt; +use std::fs::File; use std::path::{Path, PathBuf}; pub use dot::{EscapeLabel, IRDotDisplay, PathsDisplay, ScanSourcesDisplay}; @@ -36,16 +37,44 @@ pub struct IRPlanRef<'a> { } #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] -#[derive(Debug, Clone, Hash, PartialEq, Eq)] +#[derive(Debug, Clone)] pub enum ScanSources { - Files(Arc<[PathBuf]>), + Paths(Arc<[PathBuf]>), + + #[cfg_attr(feature = "serde", serde(skip))] + Files(Arc<[File]>), #[cfg_attr(feature = "serde", serde(skip))] Buffers(Arc<[bytes::Bytes]>), } +impl std::hash::Hash for ScanSources { + fn hash(&self, state: &mut H) { + std::mem::discriminant(self).hash(state); + + // @NOTE: This is a bit crazy + match self { + Self::Paths(paths) => paths.hash(state), + Self::Files(files) => files.as_ptr().hash(state), + Self::Buffers(buffers) => buffers.as_ptr().hash(state), + } + } +} + +impl PartialEq for ScanSources { + fn 
eq(&self, other: &Self) -> bool { + match (self, other) { + (ScanSources::Paths(l), ScanSources::Paths(r)) => l == r, + _ => false, + } + } +} + +impl Eq for ScanSources {} + #[derive(Debug, Clone, Copy)] pub enum ScanSourceRef<'a> { - File(&'a Path), + Path(&'a Path), + File(&'a File), Buffer(&'a bytes::Bytes), } @@ -63,12 +92,43 @@ impl Default for ScanSources { impl<'a> ScanSourceRef<'a> { pub fn to_file_path(&self) -> &str { match self { - ScanSourceRef::File(path) => path.to_str().unwrap(), - ScanSourceRef::Buffer(_) => "in-mem", + Self::Path(path) => path.to_str().unwrap(), + Self::File(_) => "open-file", + Self::Buffer(_) => "in-mem", + } + } + + pub fn to_memslice(&self) -> PolarsResult { + self.to_memslice_possibly_async(false, None, 0) + } + + pub fn to_memslice_async_latest(&self, run_async: bool) -> PolarsResult { + match self { + ScanSourceRef::Path(path) => { + let file = if run_async { + feature_gated!("cloud", { + polars_io::file_cache::FILE_CACHE + .get_entry(path.to_str().unwrap()) + // Safety: This was initialized by schema inference. + .unwrap() + .try_open_assume_latest()? + }) + } else { + polars_utils::open_file(path)? + }; + + Ok(MemSlice::from_mmap(Arc::new(unsafe { + memmap::Mmap::map(&file)? + }))) + }, + ScanSourceRef::File(file) => Ok(MemSlice::from_mmap(Arc::new(unsafe { + memmap::Mmap::map(*file)? + }))), + ScanSourceRef::Buffer(buff) => Ok(MemSlice::from_bytes((*buff).clone())), } } - pub fn to_memslice( + pub fn to_memslice_possibly_async( &self, run_async: bool, #[cfg(feature = "cloud")] cache_entries: Option< @@ -78,7 +138,7 @@ impl<'a> ScanSourceRef<'a> { index: usize, ) -> PolarsResult { match self { - Self::File(path) => { + Self::Path(path) => { let f = if run_async { feature_gated!("cloud", { cache_entries.unwrap()[index].try_open_check_latest()? @@ -90,6 +150,10 @@ impl<'a> ScanSourceRef<'a> { let mmap = unsafe { memmap::Mmap::map(&f)? }; Ok(MemSlice::from_mmap(Arc::new(mmap))) }, + Self::File(file) => { + let mmap = unsafe { memmap::Mmap::map(*file)? 
}; + Ok(MemSlice::from_mmap(Arc::new(mmap))) + }, Self::Buffer(buff) => Ok(MemSlice::from_bytes((*buff).clone())), } } @@ -105,22 +169,22 @@ impl ScanSources { pub fn as_paths(&self) -> Option<&[PathBuf]> { match self { - Self::Files(paths) => Some(paths.as_ref()), - Self::Buffers(_) => None, + Self::Paths(paths) => Some(paths.as_ref()), + Self::Files(_) | Self::Buffers(_) => None, } } pub fn into_paths(&self) -> Option> { match self { - Self::Files(paths) => Some(paths.clone()), - Self::Buffers(_) => None, + Self::Paths(paths) => Some(paths.clone()), + Self::Files(_) | Self::Buffers(_) => None, } } pub fn first_path(&self) -> Option<&Path> { match self { - ScanSources::Files(paths) => paths.first().map(|p| p.as_path()), - ScanSources::Buffers(_) => None, + Self::Paths(paths) => paths.first().map(|p| p.as_path()), + Self::Files(_) | Self::Buffers(_) => None, } } @@ -132,18 +196,19 @@ impl ScanSources { } pub fn is_files(&self) -> bool { - matches!(self, Self::Files(_)) + matches!(self, Self::Paths(_)) } pub fn is_cloud_url(&self) -> bool { match self { - Self::Files(paths) => paths.first().map_or(false, polars_io::is_cloud_url), - Self::Buffers(_) => false, + Self::Paths(paths) => paths.first().map_or(false, polars_io::is_cloud_url), + Self::Files(_) | Self::Buffers(_) => false, } } pub fn len(&self) -> usize { match self { + Self::Paths(s) => s.len(), Self::Files(s) => s.len(), Self::Buffers(s) => s.len(), } @@ -163,17 +228,19 @@ impl ScanSources { } match self { - Self::Files(paths) => { + Self::Paths(paths) => { PlSmallStr::from_str(paths.first().unwrap().to_string_lossy().as_ref()) }, + Self::Files(_) => PlSmallStr::from_static("OPEN_FILES"), Self::Buffers(_) => PlSmallStr::from_static("IN_MEMORY"), } } pub fn get(&self, idx: usize) -> Option { match self { - ScanSources::Files(paths) => paths.get(idx).map(|p| ScanSourceRef::File(p)), - ScanSources::Buffers(buffers) => buffers.get(idx).map(ScanSourceRef::Buffer), + Self::Paths(paths) => paths.get(idx).map(|p| ScanSourceRef::Path(p)), + Self::Files(files) => files.get(idx).map(|f| ScanSourceRef::File(f)), + Self::Buffers(buffers) => buffers.get(idx).map(ScanSourceRef::Buffer), } } @@ -192,7 +259,8 @@ impl<'a> Iterator for ScanSourceIter<'a> { fn next(&mut self) -> Option { let item = match self.sources { - ScanSources::Files(paths) => ScanSourceRef::File(paths.get(self.offset)?), + ScanSources::Paths(paths) => ScanSourceRef::Path(paths.get(self.offset)?), + ScanSources::Files(files) => ScanSourceRef::File(files.get(self.offset)?), ScanSources::Buffers(buffers) => ScanSourceRef::Buffer(buffers.get(self.offset)?), }; diff --git a/crates/polars-plan/src/plans/optimizer/count_star.rs b/crates/polars-plan/src/plans/optimizer/count_star.rs index 02c8b94a033c..1f20c83f6a87 100644 --- a/crates/polars-plan/src/plans/optimizer/count_star.rs +++ b/crates/polars-plan/src/plans/optimizer/count_star.rs @@ -67,7 +67,7 @@ fn visit_logical_plan_for_scan_paths( match lp_arena.get(node) { IR::Union { inputs, .. 
} => { enum MutableSources { - Files(Vec), + Paths(Vec), Buffers(Vec), } @@ -76,25 +76,22 @@ fn visit_logical_plan_for_scan_paths( for input in inputs { match visit_logical_plan_for_scan_paths(*input, lp_arena, expr_arena, true) { Some(expr) => { - match expr.sources { - ScanSources::Files(paths) => match sources { - Some(MutableSources::Files(ref mut files)) => { - files.extend_from_slice(&paths[..]) - }, - Some(MutableSources::Buffers(_)) => { - todo!("Mixing in memory buffers and paths in count star opt") - }, - None => sources = Some(MutableSources::Files(paths.to_vec())), + match (expr.sources, &mut sources) { + ( + ScanSources::Paths(paths), + Some(MutableSources::Paths(ref mut mutable_paths)), + ) => mutable_paths.extend_from_slice(&paths[..]), + (ScanSources::Paths(paths), None) => { + sources = Some(MutableSources::Paths(paths.to_vec())) }, - ScanSources::Buffers(bs) => match sources { - Some(MutableSources::Files(_)) => { - todo!("Mixing in memory buffers and paths in count star opt") - }, - Some(MutableSources::Buffers(ref mut buffers)) => { - buffers.extend_from_slice(&bs[..]) - }, - None => sources = Some(MutableSources::Buffers(bs.to_vec())), + ( + ScanSources::Buffers(buffers), + Some(MutableSources::Buffers(ref mut mutable_buffers)), + ) => mutable_buffers.extend_from_slice(&buffers[..]), + (ScanSources::Buffers(buffers), None) => { + sources = Some(MutableSources::Buffers(buffers.to_vec())) }, + _ => return None, } match &scan_type { @@ -114,7 +111,7 @@ fn visit_logical_plan_for_scan_paths( } Some(CountStarExpr { sources: match sources { - Some(MutableSources::Files(files)) => ScanSources::Files(files.into()), + Some(MutableSources::Paths(paths)) => ScanSources::Paths(paths.into()), Some(MutableSources::Buffers(buffers)) => ScanSources::Buffers(buffers.into()), None => ScanSources::default(), }, diff --git a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs index f42a7ca7239b..7cb0753e5a6d 100644 --- a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs +++ b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs @@ -403,7 +403,7 @@ impl<'a> PredicatePushDown<'a> { filter: None, }); } else { - sources = ScanSources::Files(new_paths.into()); + sources = ScanSources::Paths(new_paths.into()); scan_hive_parts = Some(Arc::from(new_hive_parts)); } } diff --git a/crates/polars-python/src/conversion/mod.rs b/crates/polars-python/src/conversion/mod.rs index 886b6f744552..ec05729acc81 100644 --- a/crates/polars-python/src/conversion/mod.rs +++ b/crates/polars-python/src/conversion/mod.rs @@ -2,6 +2,7 @@ pub(crate) mod any_value; pub(crate) mod chunked_array; mod datetime; use std::fmt::{Display, Formatter}; +use std::fs::File; use std::hash::{Hash, Hasher}; use std::path::PathBuf; @@ -540,7 +541,8 @@ impl<'py> FromPyObject<'py> for Wrap { } enum MutableSources { - Files(Vec), + Paths(Vec), + Files(Vec), Buffers(Vec), } @@ -562,13 +564,19 @@ impl<'py> FromPyObject<'py> for Wrap { EitherPythonFileOrPath::Path(path) => { let mut sources = Vec::with_capacity(num_items); sources.push(path); + MutableSources::Paths(sources) + }, + EitherPythonFileOrPath::File(file) => { + let mut sources = Vec::with_capacity(num_items); + sources.push(file); MutableSources::Files(sources) }, }; for source in iter { match (&mut sources, source?) 
{ - (MutableSources::Files(v), EitherPythonFileOrPath::Path(p)) => v.push(p), + (MutableSources::Paths(v), EitherPythonFileOrPath::Path(p)) => v.push(p), + (MutableSources::Files(v), EitherPythonFileOrPath::File(f)) => v.push(f), (MutableSources::Buffers(v), EitherPythonFileOrPath::Py(f)) => v.push(f.as_bytes()), _ => { return Err(PyTypeError::new_err( @@ -579,6 +587,7 @@ impl<'py> FromPyObject<'py> for Wrap { } Ok(Wrap(match sources { + MutableSources::Paths(i) => ScanSources::Paths(i.into()), MutableSources::Files(i) => ScanSources::Files(i.into()), MutableSources::Buffers(i) => ScanSources::Buffers(i.into()), })) diff --git a/crates/polars-python/src/dataframe/io.rs b/crates/polars-python/src/dataframe/io.rs index 12707e93dd85..d56334d35ad0 100644 --- a/crates/polars-python/src/dataframe/io.rs +++ b/crates/polars-python/src/dataframe/io.rs @@ -10,7 +10,6 @@ use polars::prelude::*; #[cfg(feature = "parquet")] use polars_parquet::arrow::write::StatisticsOptions; use pyo3::prelude::*; -use pyo3::pybacked::PyBackedStr; use super::PyDataFrame; #[cfg(feature = "parquet")] @@ -18,176 +17,13 @@ use crate::conversion::parse_parquet_compression; use crate::conversion::Wrap; use crate::error::PyPolarsErr; use crate::file::{ - get_either_file, get_file_like, get_mmap_bytes_reader, get_mmap_bytes_reader_and_path, + get_either_file, get_file_like, get_mmap_bytes_reader, read_if_bytesio, EitherRustPythonFile, }; use crate::prelude::PyCompatLevel; #[pymethods] impl PyDataFrame { - #[staticmethod] - #[cfg(feature = "csv")] - #[pyo3(signature = ( - py_f, infer_schema_length, chunk_size, has_header, ignore_errors, n_rows, - skip_rows, projection, separator, rechunk, columns, encoding, n_threads, path, - overwrite_dtype, overwrite_dtype_slice, low_memory, comment_prefix, quote_char, - null_values, missing_utf8_is_empty_string, try_parse_dates, skip_rows_after_header, - row_index, sample_size, eol_char, raise_if_empty, truncate_ragged_lines, decimal_comma, schema) -)] - pub fn read_csv( - py: Python, - mut py_f: Bound, - infer_schema_length: Option, - chunk_size: usize, - has_header: bool, - ignore_errors: bool, - n_rows: Option, - skip_rows: usize, - projection: Option>, - separator: &str, - rechunk: bool, - columns: Option>, - encoding: Wrap, - n_threads: Option, - path: Option, - overwrite_dtype: Option)>>, - overwrite_dtype_slice: Option>>, - low_memory: bool, - comment_prefix: Option<&str>, - quote_char: Option<&str>, - null_values: Option>, - missing_utf8_is_empty_string: bool, - try_parse_dates: bool, - skip_rows_after_header: usize, - row_index: Option<(String, IdxSize)>, - sample_size: usize, - eol_char: &str, - raise_if_empty: bool, - truncate_ragged_lines: bool, - decimal_comma: bool, - schema: Option>, - ) -> PyResult { - let null_values = null_values.map(|w| w.0); - let eol_char = eol_char.as_bytes()[0]; - let row_index = row_index.map(|(name, offset)| RowIndex { - name: name.into(), - offset, - }); - let quote_char = quote_char.and_then(|s| s.as_bytes().first().copied()); - - let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| { - overwrite_dtype - .iter() - .map(|(name, dtype)| { - let dtype = dtype.0.clone(); - Field::new((&**name).into(), dtype) - }) - .collect::() - }); - - let overwrite_dtype_slice = overwrite_dtype_slice.map(|overwrite_dtype| { - overwrite_dtype - .iter() - .map(|dt| dt.0.clone()) - .collect::>() - }); - - py_f = read_if_bytesio(py_f); - let mmap_bytes_r = get_mmap_bytes_reader(&py_f)?; - let df = py.allow_threads(move || { - CsvReadOptions::default() - 
.with_path(path) - .with_infer_schema_length(infer_schema_length) - .with_has_header(has_header) - .with_n_rows(n_rows) - .with_skip_rows(skip_rows) - .with_ignore_errors(ignore_errors) - .with_projection(projection.map(Arc::new)) - .with_rechunk(rechunk) - .with_chunk_size(chunk_size) - .with_columns(columns.map(|x| x.into_iter().map(|x| x.into()).collect())) - .with_n_threads(n_threads) - .with_schema_overwrite(overwrite_dtype.map(Arc::new)) - .with_dtype_overwrite(overwrite_dtype_slice.map(Arc::new)) - .with_schema(schema.map(|schema| Arc::new(schema.0))) - .with_low_memory(low_memory) - .with_skip_rows_after_header(skip_rows_after_header) - .with_row_index(row_index) - .with_sample_size(sample_size) - .with_raise_if_empty(raise_if_empty) - .with_parse_options( - CsvParseOptions::default() - .with_separator(separator.as_bytes()[0]) - .with_encoding(encoding.0) - .with_missing_is_null(!missing_utf8_is_empty_string) - .with_comment_prefix(comment_prefix) - .with_null_values(null_values) - .with_try_parse_dates(try_parse_dates) - .with_quote_char(quote_char) - .with_eol_char(eol_char) - .with_truncate_ragged_lines(truncate_ragged_lines) - .with_decimal_comma(decimal_comma), - ) - .into_reader_with_file_handle(mmap_bytes_r) - .finish() - .map_err(PyPolarsErr::from) - })?; - Ok(df.into()) - } - - #[staticmethod] - #[cfg(feature = "parquet")] - #[pyo3(signature = (py_f, columns, projection, n_rows, row_index, low_memory, parallel, use_statistics, rechunk))] - pub fn read_parquet( - py: Python, - py_f: PyObject, - columns: Option>, - projection: Option>, - n_rows: Option, - row_index: Option<(String, IdxSize)>, - low_memory: bool, - parallel: Wrap, - use_statistics: bool, - rechunk: bool, - ) -> PyResult { - use EitherRustPythonFile::*; - - let row_index = row_index.map(|(name, offset)| RowIndex { - name: name.into(), - offset, - }); - let result = match get_either_file(py_f, false)? 
{ - Py(f) => { - let buf = f.as_buffer(); - py.allow_threads(move || { - ParquetReader::new(buf) - .with_projection(projection) - .with_columns(columns) - .read_parallel(parallel.0) - .with_slice(n_rows.map(|x| (0, x))) - .with_row_index(row_index) - .set_low_memory(low_memory) - .use_statistics(use_statistics) - .set_rechunk(rechunk) - .finish() - }) - }, - Rust(f) => py.allow_threads(move || { - ParquetReader::new(f) - .with_projection(projection) - .with_columns(columns) - .read_parallel(parallel.0) - .with_slice(n_rows.map(|x| (0, x))) - .with_row_index(row_index) - .use_statistics(use_statistics) - .set_rechunk(rechunk) - .finish() - }), - }; - let df = result.map_err(PyPolarsErr::from)?; - Ok(PyDataFrame::new(df)) - } - #[staticmethod] #[cfg(feature = "json")] pub fn read_json( @@ -220,69 +56,6 @@ impl PyDataFrame { }) } - #[staticmethod] - #[cfg(feature = "json")] - pub fn read_ndjson( - py: Python, - mut py_f: Bound, - ignore_errors: bool, - schema: Option>, - schema_overrides: Option>, - ) -> PyResult { - py_f = read_if_bytesio(py_f); - let mmap_bytes_r = get_mmap_bytes_reader(&py_f)?; - - let mut builder = JsonReader::new(mmap_bytes_r) - .with_json_format(JsonFormat::JsonLines) - .with_ignore_errors(ignore_errors); - - if let Some(schema) = schema { - builder = builder.with_schema(Arc::new(schema.0)); - } - - if let Some(schema) = schema_overrides.as_ref() { - builder = builder.with_schema_overwrite(&schema.0); - } - - let out = py - .allow_threads(move || builder.finish()) - .map_err(|e| PyPolarsErr::Other(format!("{e}")))?; - Ok(out.into()) - } - - #[staticmethod] - #[cfg(feature = "ipc")] - #[pyo3(signature = (py_f, columns, projection, n_rows, row_index, memory_map))] - pub fn read_ipc( - py: Python, - mut py_f: Bound, - columns: Option>, - projection: Option>, - n_rows: Option, - row_index: Option<(String, IdxSize)>, - memory_map: bool, - ) -> PyResult { - let row_index = row_index.map(|(name, offset)| RowIndex { - name: name.into(), - offset, - }); - py_f = read_if_bytesio(py_f); - let (mmap_bytes_r, mmap_path) = get_mmap_bytes_reader_and_path(&py_f)?; - - let mmap_path = if memory_map { mmap_path } else { None }; - let df = py.allow_threads(move || { - IpcReader::new(mmap_bytes_r) - .with_projection(projection) - .with_columns(columns) - .with_n_rows(n_rows) - .with_row_index(row_index) - .memory_mapped(mmap_path) - .finish() - .map_err(PyPolarsErr::from) - })?; - Ok(PyDataFrame::new(df)) - } - #[staticmethod] #[cfg(feature = "ipc_streaming")] #[pyo3(signature = (py_f, columns, projection, n_rows, row_index, rechunk))] diff --git a/crates/polars-python/src/file.rs b/crates/polars-python/src/file.rs index 3cbb3d364e2f..6caaf3bb05b4 100644 --- a/crates/polars-python/src/file.rs +++ b/crates/polars-python/src/file.rs @@ -206,6 +206,7 @@ impl EitherRustPythonFile { pub enum EitherPythonFileOrPath { Py(PyFileLikeObject), Path(PathBuf), + File(File), } pub fn get_either_file_or_path(py_f: PyObject, write: bool) -> PyResult { @@ -223,6 +224,55 @@ pub fn get_either_file_or_path(py_f: PyObject, write: bool) -> PyResult()) + .ok() + }) + .flatten() + .map(|fileno| unsafe { + // `File::from_raw_fd()` takes the ownership of the file descriptor. + // When the File is dropped, it closes the file descriptor. + // This is undesired - the Python file object will become invalid. + // Therefore, we duplicate the file descriptor here. + // Closing the duplicated file descriptor will not close + // the original file descriptor; + // and the status, e.g. 
stream position, is still shared with + // the original file descriptor. + // We use `F_DUPFD_CLOEXEC` here instead of `dup()` + // because it also sets the `O_CLOEXEC` flag on the duplicated file descriptor, + // which `dup()` clears. + // `open()` in both Rust and Python automatically set `O_CLOEXEC` flag; + // it prevents leaking file descriptors across processes, + // and we want to be consistent with them. + // `F_DUPFD_CLOEXEC` is defined in POSIX.1-2008 + // and is present on all alive UNIX(-like) systems. + libc::fcntl(fileno, libc::F_DUPFD_CLOEXEC, 0) + }) + .filter(|fileno| *fileno != -1) + .map(|fileno| fileno as RawFd) + { + return Ok(EitherPythonFileOrPath::File(unsafe { + File::from_raw_fd(fd) + })); + } + // BytesIO / StringIO is relatively fast, and some code relies on it. if !py_f.is_exact_instance(&io.getattr("BytesIO").unwrap()) || !py_f.is_exact_instance(&io.getattr("StringIO").unwrap()) diff --git a/crates/polars-python/src/lazyframe/general.rs b/crates/polars-python/src/lazyframe/general.rs index e09d5cb7f309..11266a696d5e 100644 --- a/crates/polars-python/src/lazyframe/general.rs +++ b/crates/polars-python/src/lazyframe/general.rs @@ -26,8 +26,9 @@ fn pyobject_to_first_path_and_scan_sources( use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; Ok(match get_either_file_or_path(obj, false)? { EitherPythonFileOrPath::Path(path) => { - (Some(path.clone()), ScanSources::Files([path].into())) + (Some(path.clone()), ScanSources::Paths([path].into())) }, + EitherPythonFileOrPath::File(file) => (None, ScanSources::Files([file].into())), EitherPythonFileOrPath::Py(f) => (None, ScanSources::Buffers([f.as_bytes()].into())), }) } diff --git a/crates/polars-stream/src/utils/late_materialized_df.rs b/crates/polars-stream/src/utils/late_materialized_df.rs index 9e7322167f7f..b18c5cea0657 100644 --- a/crates/polars-stream/src/utils/late_materialized_df.rs +++ b/crates/polars-stream/src/utils/late_materialized_df.rs @@ -25,7 +25,7 @@ impl LateMaterializedDataFrame { fmt_str: "LateMaterializedDataFrame", }); IR::Scan { - sources: ScanSources::Files(Arc::default()), + sources: ScanSources::Paths(Arc::default()), file_info: FileInfo::new(schema, None, (None, usize::MAX)), hive_parts: None, predicate: None, diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index d8bc983e0fcc..61349439fed8 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -2,7 +2,7 @@ import contextlib import os -from io import BytesIO, StringIO +import io from pathlib import Path from typing import IO, TYPE_CHECKING, Any, Callable, Mapping, Sequence @@ -14,10 +14,9 @@ is_str_sequence, normalize_filepath, ) -from polars._utils.wrap import wrap_df, wrap_ldf +from polars._utils.wrap import wrap_ldf from polars.datatypes import N_INFER_DEFAULT, String, parse_into_dtype from polars.io._utils import ( - is_glob_pattern, parse_columns_arg, parse_row_index_args, prepare_file_arg, @@ -26,7 +25,7 @@ from polars.io.csv.batched_reader import BatchedCsvReader with contextlib.suppress(ImportError): # Module not available when building docs - from polars.polars import PyDataFrame, PyLazyFrame + from polars.polars import PyLazyFrame if TYPE_CHECKING: from polars import DataFrame, LazyFrame @@ -564,15 +563,8 @@ def _read_csv_impl( decimal_comma: bool = False, glob: bool = True, ) -> DataFrame: - path: str | None - if isinstance(source, (str, Path)): - path = normalize_filepath(source, check_not_directory=False) - else: - path = None - if 
isinstance(source, BytesIO): - source = source.getvalue() - if isinstance(source, StringIO): - source = source.getvalue().encode() + if isinstance(source, (bytes, memoryview, bytearray)): + source = io.BytesIO(source) dtype_list: Sequence[tuple[str, PolarsDataType]] | None = None dtype_slice: Sequence[PolarsDataType] | None = None @@ -587,93 +579,58 @@ def _read_csv_impl( msg = f"`schema_overrides` should be of type list or dict, got {type(schema_overrides).__name__!r}" raise TypeError(msg) - processed_null_values = _process_null_values(null_values) - if isinstance(columns, str): columns = [columns] - if isinstance(source, str) and is_glob_pattern(source): - dtypes_dict = None - if dtype_list is not None: - dtypes_dict = dict(dtype_list) - if dtype_slice is not None: - msg = ( - "cannot use glob patterns and unnamed dtypes as `schema_overrides` argument" - "\n\nUse `schema_overrides`: Mapping[str, Type[DataType]]" - ) - raise ValueError(msg) - from polars import scan_csv - scan = scan_csv( - source, - has_header=has_header, - separator=separator, - comment_prefix=comment_prefix, - quote_char=quote_char, - skip_rows=skip_rows, - schema=schema, - schema_overrides=dtypes_dict, - null_values=null_values, - missing_utf8_is_empty_string=missing_utf8_is_empty_string, - ignore_errors=ignore_errors, - infer_schema_length=infer_schema_length, - n_rows=n_rows, - low_memory=low_memory, - rechunk=rechunk, - skip_rows_after_header=skip_rows_after_header, - row_index_name=row_index_name, - row_index_offset=row_index_offset, - eol_char=eol_char, - raise_if_empty=raise_if_empty, - truncate_ragged_lines=truncate_ragged_lines, - decimal_comma=decimal_comma, - glob=glob, + dtypes_dict = None + if dtype_list is not None: + dtypes_dict = dict(dtype_list) + if dtype_slice is not None: + msg = ( + "cannot use glob patterns and unnamed dtypes as `schema_overrides` argument" + "\n\nUse `schema_overrides`: Mapping[str, Type[DataType]]" ) - if columns is None: - return scan.collect() - elif is_str_sequence(columns, allow_str=False): - return scan.select(columns).collect() - else: - msg = ( - "cannot use glob patterns and integer based projection as `columns` argument" - "\n\nUse columns: List[str]" - ) - raise ValueError(msg) - - projection, columns = parse_columns_arg(columns) + raise ValueError(msg) + from polars import scan_csv - pydf = PyDataFrame.read_csv( + scan = scan_csv( source, - infer_schema_length, - batch_size, - has_header, - ignore_errors, - n_rows, - skip_rows, - projection, - separator, - rechunk, - columns, - encoding, - n_threads, - path, - dtype_list, - dtype_slice, - low_memory, - comment_prefix, - quote_char, - processed_null_values, - missing_utf8_is_empty_string, - try_parse_dates, - skip_rows_after_header, - parse_row_index_args(row_index_name, row_index_offset), - sample_size=sample_size, + has_header=has_header, + separator=separator, + comment_prefix=comment_prefix, + quote_char=quote_char, + skip_rows=skip_rows, + schema=schema, + schema_overrides=dtypes_dict, + null_values=null_values, + missing_utf8_is_empty_string=missing_utf8_is_empty_string, + ignore_errors=ignore_errors, + infer_schema_length=infer_schema_length, + n_rows=n_rows, + encoding=encoding, + low_memory=low_memory, + rechunk=rechunk, + skip_rows_after_header=skip_rows_after_header, + row_index_name=row_index_name, + row_index_offset=row_index_offset, eol_char=eol_char, raise_if_empty=raise_if_empty, truncate_ragged_lines=truncate_ragged_lines, decimal_comma=decimal_comma, - schema=schema, + glob=glob, + 
try_parse_dates=try_parse_dates, ) - return wrap_df(pydf) + + if columns is None: + return scan.collect() + elif is_str_sequence(columns, allow_str=False): + return scan.select(columns).collect() + else: + msg = ( + "cannot use glob patterns and integer based projection as `columns` argument" + "\n\nUse columns: List[str]" + ) + raise ValueError(msg) @deprecate_renamed_parameter("dtypes", "schema_overrides", version="0.20.31") @@ -1241,10 +1198,10 @@ def with_column_names(cols: list[str]) -> list[str]: if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - elif isinstance(source, (BytesIO, StringIO)) or ( + elif isinstance(source, io.IOBase) or ( isinstance(source, list) and len(source) > 0 - and isinstance(source[0], (BytesIO, StringIO)) + and isinstance(source[0], io.IOBase) ): pass else: diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index 5142b2ae68c6..a318ed8d62a4 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -2,6 +2,7 @@ import contextlib import os +import io from pathlib import Path from typing import IO, TYPE_CHECKING, Any, Sequence @@ -15,8 +16,6 @@ from polars._utils.wrap import wrap_df, wrap_ldf from polars.dependencies import import_optional from polars.io._utils import ( - is_glob_pattern, - is_local_file, parse_columns_arg, parse_row_index_args, prepare_file_arg, @@ -176,42 +175,31 @@ def _read_ipc_impl( rechunk: bool = True, memory_map: bool = True, ) -> DataFrame: - if isinstance(source, (str, Path)): - source = normalize_filepath(source, check_not_directory=False) + if isinstance(source, (memoryview, bytearray, bytes)): + source = io.BytesIO(source) + if isinstance(columns, str): columns = [columns] - if isinstance(source, str) and is_glob_pattern(source) and is_local_file(source): - scan = scan_ipc( - source, - n_rows=n_rows, - rechunk=rechunk, - row_index_name=row_index_name, - row_index_offset=row_index_offset, - memory_map=memory_map, - ) - if columns is None: - df = scan.collect() - elif is_str_sequence(columns, allow_str=False): - df = scan.select(columns).collect() - else: - msg = ( - "cannot use glob patterns and integer based projection as `columns` argument" - "\n\nUse columns: List[str]" - ) - raise TypeError(msg) - return df - - projection, columns = parse_columns_arg(columns) - pydf = PyDataFrame.read_ipc( + scan = scan_ipc( source, - columns, - projection, - n_rows, - parse_row_index_args(row_index_name, row_index_offset), + n_rows=n_rows, + rechunk=rechunk, + row_index_name=row_index_name, + row_index_offset=row_index_offset, memory_map=memory_map, ) - return wrap_df(pydf) + if columns is None: + df = scan.collect() + elif is_str_sequence(columns, allow_str=False): + df = scan.select(columns).collect() + else: + msg = ( + "cannot use glob patterns and integer based projection as `columns` argument" + "\n\nUse columns: List[str]" + ) + raise TypeError(msg) + return df @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index 6cff4ddb1643..ba1d120890bb 100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -3,11 +3,11 @@ import contextlib from io import BytesIO, StringIO from pathlib import Path -from typing import IO, TYPE_CHECKING, Any, Sequence +from typing import IO, TYPE_CHECKING, Any from polars._utils.deprecation import deprecate_renamed_parameter from polars._utils.various import 
normalize_filepath -from polars._utils.wrap import wrap_df, wrap_ldf +from polars._utils.wrap import wrap_ldf from polars.datatypes import N_INFER_DEFAULT from polars.io._utils import parse_row_index_args @@ -120,29 +120,6 @@ def read_ndjson( │ 3 ┆ 8 │ └─────┴─────┘ """ - if not ( - isinstance(source, (str, Path)) - or isinstance(source, Sequence) - and source - and isinstance(source[0], (str, Path)) - ): - # TODO: A lot of the parameters aren't applied for BytesIO - if isinstance(source, StringIO): - source = BytesIO(source.getvalue().encode()) - - pydf = PyDataFrame.read_ndjson( - source, - ignore_errors=ignore_errors, - schema=schema, - schema_overrides=schema_overrides, - ) - - df = wrap_df(pydf) - - if n_rows: - df = df.head(n_rows) - - return df return scan_ndjson( source, # type: ignore[arg-type] diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 04cccc85fc5d..6320a0072578 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -171,9 +171,8 @@ def read_parquet( memory_map=memory_map, rechunk=rechunk, ) - # Read file and bytes inputs using `read_parquet` - if isinstance(source, bytes): + elif isinstance(source, bytes): source = io.BytesIO(source) elif isinstance(source, list) and len(source) > 0 and isinstance(source[0], bytes): assert all(isinstance(s, bytes) for s in source) @@ -233,7 +232,7 @@ def _read_parquet_with_pyarrow( sources: list[str | Path | IO[bytes] | bytes | list[str] | list[Path]] = [] if isinstance(source, list): - if len(source) > 0 and isinstance(source[0], (bytes, io.BytesIO)): + if len(source) > 0 and isinstance(source[0], (bytes, io.IOBase)): sources = source # type: ignore[assignment] else: sources = [source] # type: ignore[list-item] @@ -416,10 +415,10 @@ def scan_parquet( if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - elif isinstance(source, io.BytesIO) or ( + elif isinstance(source, io.IOBase) or ( isinstance(source, list) and len(source) > 0 - and isinstance(source[0], io.BytesIO) + and isinstance(source[0], io.IOBase) ): pass else: From 0b49b136ef2581abdedb494b081632a379640db4 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Sun, 8 Sep 2024 13:54:54 +0200 Subject: [PATCH 17/27] add bytes and file descriptors --- crates/polars-plan/src/plans/ir/mod.rs | 2 +- crates/polars-python/src/conversion/mod.rs | 8 +- crates/polars-python/src/dataframe/io.rs | 229 +++++++++++++++++- crates/polars-python/src/file.rs | 11 +- crates/polars-python/src/lazyframe/general.rs | 1 + py-polars/polars/_utils/various.py | 18 ++ py-polars/polars/io/csv/functions.py | 158 +++++++----- py-polars/polars/io/ipc/functions.py | 82 ++++--- py-polars/polars/io/ndjson.py | 58 +++-- py-polars/polars/io/parquet/functions.py | 15 +- py-polars/tests/unit/io/test_csv.py | 5 + py-polars/tests/unit/io/test_ipc.py | 4 + py-polars/tests/unit/io/test_parquet.py | 13 + 13 files changed, 475 insertions(+), 129 deletions(-) diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index 67ba46d4aca2..cf0b5ee8df7d 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -239,7 +239,7 @@ impl ScanSources { pub fn get(&self, idx: usize) -> Option { match self { Self::Paths(paths) => paths.get(idx).map(|p| ScanSourceRef::Path(p)), - Self::Files(files) => files.get(idx).map(|f| ScanSourceRef::File(f)), + Self::Files(files) => files.get(idx).map(ScanSourceRef::File), 
Self::Buffers(buffers) => buffers.get(idx).map(ScanSourceRef::Buffer), } } diff --git a/crates/polars-python/src/conversion/mod.rs b/crates/polars-python/src/conversion/mod.rs index ec05729acc81..4eee205d8550 100644 --- a/crates/polars-python/src/conversion/mod.rs +++ b/crates/polars-python/src/conversion/mod.rs @@ -571,6 +571,11 @@ impl<'py> FromPyObject<'py> for Wrap { sources.push(file); MutableSources::Files(sources) }, + EitherPythonFileOrPath::Buffer(buffer) => { + let mut sources = Vec::with_capacity(num_items); + sources.push(buffer); + MutableSources::Buffers(sources) + }, }; for source in iter { @@ -578,9 +583,10 @@ impl<'py> FromPyObject<'py> for Wrap { (MutableSources::Paths(v), EitherPythonFileOrPath::Path(p)) => v.push(p), (MutableSources::Files(v), EitherPythonFileOrPath::File(f)) => v.push(f), (MutableSources::Buffers(v), EitherPythonFileOrPath::Py(f)) => v.push(f.as_bytes()), + (MutableSources::Buffers(v), EitherPythonFileOrPath::Buffer(f)) => v.push(f), _ => { return Err(PyTypeError::new_err( - "Cannot combine in-memory bytes and paths for scan sources", + "Cannot combine in-memory bytes, paths and files for scan sources", )) }, } diff --git a/crates/polars-python/src/dataframe/io.rs b/crates/polars-python/src/dataframe/io.rs index d56334d35ad0..12707e93dd85 100644 --- a/crates/polars-python/src/dataframe/io.rs +++ b/crates/polars-python/src/dataframe/io.rs @@ -10,6 +10,7 @@ use polars::prelude::*; #[cfg(feature = "parquet")] use polars_parquet::arrow::write::StatisticsOptions; use pyo3::prelude::*; +use pyo3::pybacked::PyBackedStr; use super::PyDataFrame; #[cfg(feature = "parquet")] @@ -17,13 +18,176 @@ use crate::conversion::parse_parquet_compression; use crate::conversion::Wrap; use crate::error::PyPolarsErr; use crate::file::{ - get_either_file, get_file_like, get_mmap_bytes_reader, + get_either_file, get_file_like, get_mmap_bytes_reader, get_mmap_bytes_reader_and_path, read_if_bytesio, EitherRustPythonFile, }; use crate::prelude::PyCompatLevel; #[pymethods] impl PyDataFrame { + #[staticmethod] + #[cfg(feature = "csv")] + #[pyo3(signature = ( + py_f, infer_schema_length, chunk_size, has_header, ignore_errors, n_rows, + skip_rows, projection, separator, rechunk, columns, encoding, n_threads, path, + overwrite_dtype, overwrite_dtype_slice, low_memory, comment_prefix, quote_char, + null_values, missing_utf8_is_empty_string, try_parse_dates, skip_rows_after_header, + row_index, sample_size, eol_char, raise_if_empty, truncate_ragged_lines, decimal_comma, schema) +)] + pub fn read_csv( + py: Python, + mut py_f: Bound, + infer_schema_length: Option, + chunk_size: usize, + has_header: bool, + ignore_errors: bool, + n_rows: Option, + skip_rows: usize, + projection: Option>, + separator: &str, + rechunk: bool, + columns: Option>, + encoding: Wrap, + n_threads: Option, + path: Option, + overwrite_dtype: Option)>>, + overwrite_dtype_slice: Option>>, + low_memory: bool, + comment_prefix: Option<&str>, + quote_char: Option<&str>, + null_values: Option>, + missing_utf8_is_empty_string: bool, + try_parse_dates: bool, + skip_rows_after_header: usize, + row_index: Option<(String, IdxSize)>, + sample_size: usize, + eol_char: &str, + raise_if_empty: bool, + truncate_ragged_lines: bool, + decimal_comma: bool, + schema: Option>, + ) -> PyResult { + let null_values = null_values.map(|w| w.0); + let eol_char = eol_char.as_bytes()[0]; + let row_index = row_index.map(|(name, offset)| RowIndex { + name: name.into(), + offset, + }); + let quote_char = quote_char.and_then(|s| 
s.as_bytes().first().copied()); + + let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| { + overwrite_dtype + .iter() + .map(|(name, dtype)| { + let dtype = dtype.0.clone(); + Field::new((&**name).into(), dtype) + }) + .collect::() + }); + + let overwrite_dtype_slice = overwrite_dtype_slice.map(|overwrite_dtype| { + overwrite_dtype + .iter() + .map(|dt| dt.0.clone()) + .collect::>() + }); + + py_f = read_if_bytesio(py_f); + let mmap_bytes_r = get_mmap_bytes_reader(&py_f)?; + let df = py.allow_threads(move || { + CsvReadOptions::default() + .with_path(path) + .with_infer_schema_length(infer_schema_length) + .with_has_header(has_header) + .with_n_rows(n_rows) + .with_skip_rows(skip_rows) + .with_ignore_errors(ignore_errors) + .with_projection(projection.map(Arc::new)) + .with_rechunk(rechunk) + .with_chunk_size(chunk_size) + .with_columns(columns.map(|x| x.into_iter().map(|x| x.into()).collect())) + .with_n_threads(n_threads) + .with_schema_overwrite(overwrite_dtype.map(Arc::new)) + .with_dtype_overwrite(overwrite_dtype_slice.map(Arc::new)) + .with_schema(schema.map(|schema| Arc::new(schema.0))) + .with_low_memory(low_memory) + .with_skip_rows_after_header(skip_rows_after_header) + .with_row_index(row_index) + .with_sample_size(sample_size) + .with_raise_if_empty(raise_if_empty) + .with_parse_options( + CsvParseOptions::default() + .with_separator(separator.as_bytes()[0]) + .with_encoding(encoding.0) + .with_missing_is_null(!missing_utf8_is_empty_string) + .with_comment_prefix(comment_prefix) + .with_null_values(null_values) + .with_try_parse_dates(try_parse_dates) + .with_quote_char(quote_char) + .with_eol_char(eol_char) + .with_truncate_ragged_lines(truncate_ragged_lines) + .with_decimal_comma(decimal_comma), + ) + .into_reader_with_file_handle(mmap_bytes_r) + .finish() + .map_err(PyPolarsErr::from) + })?; + Ok(df.into()) + } + + #[staticmethod] + #[cfg(feature = "parquet")] + #[pyo3(signature = (py_f, columns, projection, n_rows, row_index, low_memory, parallel, use_statistics, rechunk))] + pub fn read_parquet( + py: Python, + py_f: PyObject, + columns: Option>, + projection: Option>, + n_rows: Option, + row_index: Option<(String, IdxSize)>, + low_memory: bool, + parallel: Wrap, + use_statistics: bool, + rechunk: bool, + ) -> PyResult { + use EitherRustPythonFile::*; + + let row_index = row_index.map(|(name, offset)| RowIndex { + name: name.into(), + offset, + }); + let result = match get_either_file(py_f, false)? 
{ + Py(f) => { + let buf = f.as_buffer(); + py.allow_threads(move || { + ParquetReader::new(buf) + .with_projection(projection) + .with_columns(columns) + .read_parallel(parallel.0) + .with_slice(n_rows.map(|x| (0, x))) + .with_row_index(row_index) + .set_low_memory(low_memory) + .use_statistics(use_statistics) + .set_rechunk(rechunk) + .finish() + }) + }, + Rust(f) => py.allow_threads(move || { + ParquetReader::new(f) + .with_projection(projection) + .with_columns(columns) + .read_parallel(parallel.0) + .with_slice(n_rows.map(|x| (0, x))) + .with_row_index(row_index) + .use_statistics(use_statistics) + .set_rechunk(rechunk) + .finish() + }), + }; + let df = result.map_err(PyPolarsErr::from)?; + Ok(PyDataFrame::new(df)) + } + #[staticmethod] #[cfg(feature = "json")] pub fn read_json( @@ -56,6 +220,69 @@ impl PyDataFrame { }) } + #[staticmethod] + #[cfg(feature = "json")] + pub fn read_ndjson( + py: Python, + mut py_f: Bound, + ignore_errors: bool, + schema: Option>, + schema_overrides: Option>, + ) -> PyResult { + py_f = read_if_bytesio(py_f); + let mmap_bytes_r = get_mmap_bytes_reader(&py_f)?; + + let mut builder = JsonReader::new(mmap_bytes_r) + .with_json_format(JsonFormat::JsonLines) + .with_ignore_errors(ignore_errors); + + if let Some(schema) = schema { + builder = builder.with_schema(Arc::new(schema.0)); + } + + if let Some(schema) = schema_overrides.as_ref() { + builder = builder.with_schema_overwrite(&schema.0); + } + + let out = py + .allow_threads(move || builder.finish()) + .map_err(|e| PyPolarsErr::Other(format!("{e}")))?; + Ok(out.into()) + } + + #[staticmethod] + #[cfg(feature = "ipc")] + #[pyo3(signature = (py_f, columns, projection, n_rows, row_index, memory_map))] + pub fn read_ipc( + py: Python, + mut py_f: Bound, + columns: Option>, + projection: Option>, + n_rows: Option, + row_index: Option<(String, IdxSize)>, + memory_map: bool, + ) -> PyResult { + let row_index = row_index.map(|(name, offset)| RowIndex { + name: name.into(), + offset, + }); + py_f = read_if_bytesio(py_f); + let (mmap_bytes_r, mmap_path) = get_mmap_bytes_reader_and_path(&py_f)?; + + let mmap_path = if memory_map { mmap_path } else { None }; + let df = py.allow_threads(move || { + IpcReader::new(mmap_bytes_r) + .with_projection(projection) + .with_columns(columns) + .with_n_rows(n_rows) + .with_row_index(row_index) + .memory_mapped(mmap_path) + .finish() + .map_err(PyPolarsErr::from) + })?; + Ok(PyDataFrame::new(df)) + } + #[staticmethod] #[cfg(feature = "ipc_streaming")] #[pyo3(signature = (py_f, columns, projection, n_rows, row_index, rechunk))] diff --git a/crates/polars-python/src/file.rs b/crates/polars-python/src/file.rs index 6caaf3bb05b4..2857b37a4891 100644 --- a/crates/polars-python/src/file.rs +++ b/crates/polars-python/src/file.rs @@ -205,6 +205,7 @@ impl EitherRustPythonFile { pub enum EitherPythonFileOrPath { Py(PyFileLikeObject), + Buffer(bytes::Bytes), Path(PathBuf), File(File), } @@ -212,6 +213,14 @@ pub enum EitherPythonFileOrPath { pub fn get_either_file_or_path(py_f: PyObject, write: bool) -> PyResult { Python::with_gil(|py| { let py_f = py_f.into_bound(py); + + // If the pyobject is a `bytes` class + if let Ok(bytes) = py_f.downcast::() { + return Ok(EitherPythonFileOrPath::Buffer( + bytes::Bytes::copy_from_slice(bytes.as_bytes()), + )); + } + if let Ok(s) = py_f.extract::>() { let file_path = std::path::Path::new(&*s); let file_path = resolve_homedir(file_path); @@ -275,7 +284,7 @@ pub fn get_either_file_or_path(py_f: PyObject, write: bool) -> PyResult (None, 
ScanSources::Files([file].into())), EitherPythonFileOrPath::Py(f) => (None, ScanSources::Buffers([f.as_bytes()].into())), + EitherPythonFileOrPath::Buffer(buff) => (None, ScanSources::Buffers([buff].into())), }) } diff --git a/py-polars/polars/_utils/various.py b/py-polars/polars/_utils/various.py index 014e601de8e2..f82bbec0d785 100644 --- a/py-polars/polars/_utils/various.py +++ b/py-polars/polars/_utils/various.py @@ -84,6 +84,24 @@ def _is_iterable_of(val: Iterable[object], eltype: type | tuple[type, ...]) -> b return all(isinstance(x, eltype) for x in val) +def is_path_or_str_sequence( + val: object, *, allow_str: bool = False, include_series: bool = False +) -> TypeGuard[Sequence[str | Path]]: + """ + Check that `val` is a sequence of strings or paths. + + Note that a single string is a sequence of strings by definition, use + `allow_str=False` to return False on a single string. + """ + if allow_str is False and isinstance(val, str): + return False + elif _check_for_numpy(val) and isinstance(val, np.ndarray): + return np.issubdtype(val.dtype, np.str_) + elif include_series and isinstance(val, pl.Series): + return val.dtype == pl.String + return isinstance(val, Sequence) and _is_iterable_of(val, (Path, str)) + + def is_bool_sequence( val: object, *, include_series: bool = False ) -> TypeGuard[Sequence[bool]]: diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 61349439fed8..ceba49391560 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -2,7 +2,7 @@ import contextlib import os -import io +from io import BytesIO, StringIO from pathlib import Path from typing import IO, TYPE_CHECKING, Any, Callable, Mapping, Sequence @@ -11,12 +11,14 @@ from polars._utils.deprecation import deprecate_renamed_parameter from polars._utils.various import ( _process_null_values, + is_path_or_str_sequence, is_str_sequence, normalize_filepath, ) -from polars._utils.wrap import wrap_ldf +from polars._utils.wrap import wrap_df, wrap_ldf from polars.datatypes import N_INFER_DEFAULT, String, parse_into_dtype from polars.io._utils import ( + is_glob_pattern, parse_columns_arg, parse_row_index_args, prepare_file_arg, @@ -25,7 +27,7 @@ from polars.io.csv.batched_reader import BatchedCsvReader with contextlib.suppress(ImportError): # Module not available when building docs - from polars.polars import PyLazyFrame + from polars.polars import PyDataFrame, PyLazyFrame if TYPE_CHECKING: from polars import DataFrame, LazyFrame @@ -563,8 +565,15 @@ def _read_csv_impl( decimal_comma: bool = False, glob: bool = True, ) -> DataFrame: - if isinstance(source, (bytes, memoryview, bytearray)): - source = io.BytesIO(source) + path: str | None + if isinstance(source, (str, Path)): + path = normalize_filepath(source, check_not_directory=False) + else: + path = None + if isinstance(source, BytesIO): + source = source.getvalue() + if isinstance(source, StringIO): + source = source.getvalue().encode() dtype_list: Sequence[tuple[str, PolarsDataType]] | None = None dtype_slice: Sequence[PolarsDataType] | None = None @@ -579,58 +588,93 @@ def _read_csv_impl( msg = f"`schema_overrides` should be of type list or dict, got {type(schema_overrides).__name__!r}" raise TypeError(msg) + processed_null_values = _process_null_values(null_values) + if isinstance(columns, str): columns = [columns] + if isinstance(source, str) and is_glob_pattern(source): + dtypes_dict = None + if dtype_list is not None: + dtypes_dict = dict(dtype_list) + if dtype_slice is not 
None: + msg = ( + "cannot use glob patterns and unnamed dtypes as `schema_overrides` argument" + "\n\nUse `schema_overrides`: Mapping[str, Type[DataType]]" + ) + raise ValueError(msg) + from polars import scan_csv - dtypes_dict = None - if dtype_list is not None: - dtypes_dict = dict(dtype_list) - if dtype_slice is not None: - msg = ( - "cannot use glob patterns and unnamed dtypes as `schema_overrides` argument" - "\n\nUse `schema_overrides`: Mapping[str, Type[DataType]]" + scan = scan_csv( + source, + has_header=has_header, + separator=separator, + comment_prefix=comment_prefix, + quote_char=quote_char, + skip_rows=skip_rows, + schema=schema, + schema_overrides=dtypes_dict, + null_values=null_values, + missing_utf8_is_empty_string=missing_utf8_is_empty_string, + ignore_errors=ignore_errors, + infer_schema_length=infer_schema_length, + n_rows=n_rows, + low_memory=low_memory, + rechunk=rechunk, + skip_rows_after_header=skip_rows_after_header, + row_index_name=row_index_name, + row_index_offset=row_index_offset, + eol_char=eol_char, + raise_if_empty=raise_if_empty, + truncate_ragged_lines=truncate_ragged_lines, + decimal_comma=decimal_comma, + glob=glob, ) - raise ValueError(msg) - from polars import scan_csv + if columns is None: + return scan.collect() + elif is_str_sequence(columns, allow_str=False): + return scan.select(columns).collect() + else: + msg = ( + "cannot use glob patterns and integer based projection as `columns` argument" + "\n\nUse columns: List[str]" + ) + raise ValueError(msg) + + projection, columns = parse_columns_arg(columns) - scan = scan_csv( + pydf = PyDataFrame.read_csv( source, - has_header=has_header, - separator=separator, - comment_prefix=comment_prefix, - quote_char=quote_char, - skip_rows=skip_rows, - schema=schema, - schema_overrides=dtypes_dict, - null_values=null_values, - missing_utf8_is_empty_string=missing_utf8_is_empty_string, - ignore_errors=ignore_errors, - infer_schema_length=infer_schema_length, - n_rows=n_rows, - encoding=encoding, - low_memory=low_memory, - rechunk=rechunk, - skip_rows_after_header=skip_rows_after_header, - row_index_name=row_index_name, - row_index_offset=row_index_offset, + infer_schema_length, + batch_size, + has_header, + ignore_errors, + n_rows, + skip_rows, + projection, + separator, + rechunk, + columns, + encoding, + n_threads, + path, + dtype_list, + dtype_slice, + low_memory, + comment_prefix, + quote_char, + processed_null_values, + missing_utf8_is_empty_string, + try_parse_dates, + skip_rows_after_header, + parse_row_index_args(row_index_name, row_index_offset), + sample_size=sample_size, eol_char=eol_char, raise_if_empty=raise_if_empty, truncate_ragged_lines=truncate_ragged_lines, decimal_comma=decimal_comma, - glob=glob, - try_parse_dates=try_parse_dates, + schema=schema, ) - - if columns is None: - return scan.collect() - elif is_str_sequence(columns, allow_str=False): - return scan.select(columns).collect() - else: - msg = ( - "cannot use glob patterns and integer based projection as `columns` argument" - "\n\nUse columns: List[str]" - ) - raise ValueError(msg) + return wrap_df(pydf) @deprecate_renamed_parameter("dtypes", "schema_overrides", version="0.20.31") @@ -947,10 +991,12 @@ def scan_csv( | Path | IO[str] | IO[bytes] + | bytes | list[str] | list[Path] | list[IO[str]] - | list[IO[bytes]], + | list[IO[bytes]] + | list[bytes], *, has_header: bool = True, separator: str = ",", @@ -1198,19 +1244,9 @@ def with_column_names(cols: list[str]) -> list[str]: if isinstance(source, (str, Path)): source = 
normalize_filepath(source, check_not_directory=False) - elif isinstance(source, io.IOBase) or ( - isinstance(source, list) - and len(source) > 0 - and isinstance(source[0], io.IOBase) - ): - pass - else: + elif is_path_or_str_sequence(source, allow_str=False): source = [ - normalize_filepath( - source, # type: ignore[arg-type] - check_not_directory=False, - ) - for source in source + normalize_filepath(source, check_not_directory=False) for source in source ] if not infer_schema: @@ -1255,10 +1291,12 @@ def _scan_csv_impl( source: str | IO[str] | IO[bytes] + | bytes | list[str] | list[Path] | list[IO[str]] - | list[IO[bytes]], + | list[IO[bytes]] + | list[bytes], *, has_header: bool = True, separator: str = ",", diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index a318ed8d62a4..b704ce814ab8 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -2,7 +2,6 @@ import contextlib import os -import io from pathlib import Path from typing import IO, TYPE_CHECKING, Any, Sequence @@ -10,12 +9,15 @@ import polars.functions as F from polars._utils.deprecation import deprecate_renamed_parameter from polars._utils.various import ( + is_path_or_str_sequence, is_str_sequence, normalize_filepath, ) from polars._utils.wrap import wrap_df, wrap_ldf from polars.dependencies import import_optional from polars.io._utils import ( + is_glob_pattern, + is_local_file, parse_columns_arg, parse_row_index_args, prepare_file_arg, @@ -175,31 +177,42 @@ def _read_ipc_impl( rechunk: bool = True, memory_map: bool = True, ) -> DataFrame: - if isinstance(source, (memoryview, bytearray, bytes)): - source = io.BytesIO(source) - + if isinstance(source, (str, Path)): + source = normalize_filepath(source, check_not_directory=False) if isinstance(columns, str): columns = [columns] - scan = scan_ipc( + if isinstance(source, str) and is_glob_pattern(source) and is_local_file(source): + scan = scan_ipc( + source, + n_rows=n_rows, + rechunk=rechunk, + row_index_name=row_index_name, + row_index_offset=row_index_offset, + memory_map=memory_map, + ) + if columns is None: + df = scan.collect() + elif is_str_sequence(columns, allow_str=False): + df = scan.select(columns).collect() + else: + msg = ( + "cannot use glob patterns and integer based projection as `columns` argument" + "\n\nUse columns: List[str]" + ) + raise TypeError(msg) + return df + + projection, columns = parse_columns_arg(columns) + pydf = PyDataFrame.read_ipc( source, - n_rows=n_rows, - rechunk=rechunk, - row_index_name=row_index_name, - row_index_offset=row_index_offset, + columns, + projection, + n_rows, + parse_row_index_args(row_index_name, row_index_offset), memory_map=memory_map, ) - if columns is None: - df = scan.collect() - elif is_str_sequence(columns, allow_str=False): - df = scan.select(columns).collect() - else: - msg = ( - "cannot use glob patterns and integer based projection as `columns` argument" - "\n\nUse columns: List[str]" - ) - raise TypeError(msg) - return df + return wrap_df(pydf) @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @@ -334,7 +347,14 @@ def read_ipc_schema(source: str | Path | IO[bytes] | bytes) -> dict[str, DataTyp @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_ipc( - source: str | Path | IO[bytes] | list[str] | list[Path] | list[IO[bytes]], + source: str + | Path + | IO[bytes] + | bytes + 
| list[str] + | list[Path] + | list[IO[bytes]] + | list[bytes], *, n_rows: int | None = None, cache: bool = True, @@ -414,21 +434,17 @@ def scan_ipc( include_file_paths Include the path of the source file(s) as a column with this name. """ - sources: list[str] | list[Path] | list[IO[bytes]] = [] + sources: list[str] | list[Path] | list[IO[bytes]] | list[bytes] = [] if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) elif isinstance(source, list): - if len(source) > 0: - if isinstance(source[0], (str, Path)): - sources = [ - normalize_filepath( - source, # type: ignore[arg-type] - check_not_directory=False, - ) - for source in source - ] - else: - sources = source + if is_path_or_str_sequence(source): + sources = [ + normalize_filepath(source, check_not_directory=False) + for source in source + ] + else: + sources = source source = None # type: ignore[assignment] diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index ba1d120890bb..8b4cf39076c1 100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -3,11 +3,11 @@ import contextlib from io import BytesIO, StringIO from pathlib import Path -from typing import IO, TYPE_CHECKING, Any +from typing import IO, TYPE_CHECKING, Any, Sequence from polars._utils.deprecation import deprecate_renamed_parameter -from polars._utils.various import normalize_filepath -from polars._utils.wrap import wrap_ldf +from polars._utils.various import is_path_or_str_sequence, normalize_filepath +from polars._utils.wrap import wrap_df, wrap_ldf from polars.datatypes import N_INFER_DEFAULT from polars.io._utils import parse_row_index_args @@ -120,6 +120,29 @@ def read_ndjson( │ 3 ┆ 8 │ └─────┴─────┘ """ + if not ( + isinstance(source, (str, Path)) + or isinstance(source, Sequence) + and source + and isinstance(source[0], (str, Path)) + ): + # TODO: A lot of the parameters aren't applied for BytesIO + if isinstance(source, StringIO): + source = BytesIO(source.getvalue().encode()) + + pydf = PyDataFrame.read_ndjson( + source, + ignore_errors=ignore_errors, + schema=schema, + schema_overrides=schema_overrides, + ) + + df = wrap_df(pydf) + + if n_rows: + df = df.head(n_rows) + + return df return scan_ndjson( source, # type: ignore[arg-type] @@ -234,26 +257,17 @@ def scan_ndjson( sources: list[str] | list[Path] | list[IO[str]] | list[IO[bytes]] = [] if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - elif isinstance(source, (BytesIO, StringIO)): - pass - elif ( - isinstance(source, list) - and len(source) > 0 - and isinstance(source[0], (BytesIO, StringIO)) - ): - sources = source - source = None # type: ignore[assignment] - else: - assert all(isinstance(s, (str, Path)) for s in source) - - sources = [ - normalize_filepath( - source, # type: ignore[arg-type] - check_not_directory=False, - ) - for source in source - ] + elif isinstance(source, list): + if is_path_or_str_sequence(source): + sources = [ + normalize_filepath(source, check_not_directory=False) + for source in source + ] + else: + sources = source + source = None # type: ignore[assignment] + if infer_schema_length == 0: msg = "'infer_schema_length' should be positive" raise ValueError(msg) diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 6320a0072578..bc434b05cc2d 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -11,6 +11,7 @@ from polars._utils.unstable import 
issue_unstable_warning from polars._utils.various import ( is_int_sequence, + is_path_or_str_sequence, normalize_filepath, ) from polars._utils.wrap import wrap_ldf @@ -171,8 +172,9 @@ def read_parquet( memory_map=memory_map, rechunk=rechunk, ) + # Read file and bytes inputs using `read_parquet` - elif isinstance(source, bytes): + if isinstance(source, bytes): source = io.BytesIO(source) elif isinstance(source, list) and len(source) > 0 and isinstance(source[0], bytes): assert all(isinstance(s, bytes) for s in source) @@ -415,16 +417,9 @@ def scan_parquet( if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - elif isinstance(source, io.IOBase) or ( - isinstance(source, list) - and len(source) > 0 - and isinstance(source[0], io.IOBase) - ): - pass - else: + elif is_path_or_str_sequence(source): source = [ - normalize_filepath(source, check_not_directory=False) # type: ignore[arg-type] - for source in source + normalize_filepath(source, check_not_directory=False) for source in source ] return _scan_parquet_impl( diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index eab89d3b7855..fcacedead1d4 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -953,6 +953,7 @@ def test_write_csv_separator() -> None: df.write_csv(f, separator="\t") f.seek(0) assert f.read() == b"a\tb\n1\t1\n2\t2\n3\t3\n" + f.seek(0) assert_frame_equal(df, pl.read_csv(f, separator="\t")) @@ -962,6 +963,7 @@ def test_write_csv_line_terminator() -> None: df.write_csv(f, line_terminator="\r\n") f.seek(0) assert f.read() == b"a,b\r\n1,1\r\n2,2\r\n3,3\r\n" + f.seek(0) assert_frame_equal(df, pl.read_csv(f, eol_char="\n")) @@ -996,6 +998,7 @@ def test_quoting_round_trip() -> None: } ) df.write_csv(f) + f.seek(0) read_df = pl.read_csv(f) assert_frame_equal(read_df, df) @@ -1183,6 +1186,7 @@ def test_csv_write_escape_headers() -> None: out = io.BytesIO() df1.write_csv(out) + out.seek(0) df2 = pl.read_csv(out) assert_frame_equal(df1, df2) assert df2.schema == {"c,o,l,u,m,n": pl.Int64} @@ -2279,4 +2283,5 @@ def test_read_csv_cast_unparsable_later( ) -> None: f = io.BytesIO() df.write_csv(f) + f.seek(0) assert df.equals(pl.read_csv(f, schema={"x": dtype})) diff --git a/py-polars/tests/unit/io/test_ipc.py b/py-polars/tests/unit/io/test_ipc.py index 18e19f4ec885..a1505cf3d2fc 100644 --- a/py-polars/tests/unit/io/test_ipc.py +++ b/py-polars/tests/unit/io/test_ipc.py @@ -44,11 +44,13 @@ def test_from_to_buffer( ) -> None: # use an ad-hoc buffer (file=None) buf1 = write_ipc(df, stream, None, compression=compression) + buf1.seek(0) read_df = read_ipc(stream, buf1, use_pyarrow=False) assert_frame_equal(df, read_df, categorical_as_str=True) # explicitly supply an existing buffer buf2 = io.BytesIO() + buf2.seek(0) write_ipc(df, stream, buf2, compression=compression) buf2.seek(0) read_df = read_ipc(stream, buf2, use_pyarrow=False) @@ -245,6 +247,7 @@ def test_list_nested_enum() -> None: df = pl.DataFrame(pl.Series("list_cat", [["a", "b", "c", None]], dtype=dtype)) buffer = io.BytesIO() df.write_ipc(buffer, compat_level=CompatLevel.newest()) + buffer.seek(0) df = pl.read_ipc(buffer) assert df.get_column("list_cat").dtype == dtype @@ -258,6 +261,7 @@ def test_struct_nested_enum() -> None: ) buffer = io.BytesIO() df.write_ipc(buffer, compat_level=CompatLevel.newest()) + buffer.seek(0) df = pl.read_ipc(buffer) assert df.get_column("struct_cat").dtype == dtype diff --git a/py-polars/tests/unit/io/test_parquet.py 
b/py-polars/tests/unit/io/test_parquet.py index db3186a3f874..3da465561bd1 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -685,6 +685,19 @@ def test_write_parquet_with_null_col(tmp_path: Path) -> None: assert_frame_equal(out, df) +@pytest.mark.write_disk +def test_scan_parquet_binary_buffered_reader(tmp_path: Path) -> None: + tmp_path.mkdir(exist_ok=True) + + df = pl.DataFrame({"a": [1, 2, 3]}) + file_path = tmp_path / "test.parquet" + df.write_parquet(file_path) + + with file_path.open("rb") as f: + out = pl.scan_parquet(f).collect() + assert_frame_equal(out, df) + + @pytest.mark.write_disk def test_read_parquet_binary_buffered_reader(tmp_path: Path) -> None: tmp_path.mkdir(exist_ok=True) From be81e80e9af6c64b3dd11f4efcefe3549bbbef9d Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Sun, 8 Sep 2024 13:57:56 +0200 Subject: [PATCH 18/27] mypy --- py-polars/polars/io/ipc/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index b704ce814ab8..17ee17d6843b 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -112,7 +112,7 @@ def read_ipc( raise ValueError(msg) lf = scan_ipc( - source, # type: ignore[arg-type] + source, n_rows=n_rows, memory_map=memory_map, storage_options=storage_options, From 29fe063c4c069adadc076246fae10b553061ffe6 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Sun, 8 Sep 2024 14:01:47 +0200 Subject: [PATCH 19/27] add bytes to ndjson --- py-polars/polars/io/ndjson.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index 8b4cf39076c1..8fc2addd6901 100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -170,10 +170,12 @@ def scan_ndjson( | Path | IO[str] | IO[bytes] + | bytes | list[str] | list[Path] | list[IO[str]] - | list[IO[bytes]], + | list[IO[bytes]] + | bytes, *, schema: SchemaDefinition | None = None, schema_overrides: SchemaDefinition | None = None, From 5daebe4e2d8178f98ec11ee54dc1c6ab8afd52d5 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Sun, 8 Sep 2024 14:28:42 +0200 Subject: [PATCH 20/27] clean up the code a bit --- .../src/executors/scan/csv.rs | 6 +- .../src/executors/scan/ipc.rs | 4 +- .../src/executors/scan/ndjson.rs | 6 +- .../src/executors/scan/parquet.rs | 4 +- .../polars-plan/src/plans/conversion/scans.rs | 4 +- .../polars-plan/src/plans/functions/count.rs | 2 +- crates/polars-plan/src/plans/ir/mod.rs | 246 +--------------- .../polars-plan/src/plans/ir/scan_sources.rs | 270 ++++++++++++++++++ crates/polars-python/src/conversion/mod.rs | 22 +- crates/polars-python/src/file.rs | 21 +- crates/polars-python/src/lazyframe/general.rs | 11 +- 11 files changed, 310 insertions(+), 286 deletions(-) create mode 100644 crates/polars-plan/src/plans/ir/scan_sources.rs diff --git a/crates/polars-mem-engine/src/executors/scan/csv.rs b/crates/polars-mem-engine/src/executors/scan/csv.rs index c00a0047d525..0ebcb7632ae7 100644 --- a/crates/polars-mem-engine/src/executors/scan/csv.rs +++ b/crates/polars-mem-engine/src/executors/scan/csv.rs @@ -55,9 +55,9 @@ impl CsvExec { let verbose = config::verbose(); let force_async = config::force_async(); - let run_async = (self.sources.is_files() && force_async) || self.sources.is_cloud_url(); + let run_async = (self.sources.is_paths() && force_async) || self.sources.is_cloud_url(); - if self.sources.is_files() && 
force_async && verbose { + if self.sources.is_paths() && force_async && verbose { eprintln!("ASYNC READING FORCED"); } @@ -75,7 +75,7 @@ impl CsvExec { .finish()?; if let Some(col) = &self.file_options.include_file_paths { - let name = source.to_file_path(); + let name = source.to_include_path_name(); unsafe { df.with_column_unchecked( diff --git a/crates/polars-mem-engine/src/executors/scan/ipc.rs b/crates/polars-mem-engine/src/executors/scan/ipc.rs index acbcc2d28dd6..edde4765fec5 100644 --- a/crates/polars-mem-engine/src/executors/scan/ipc.rs +++ b/crates/polars-mem-engine/src/executors/scan/ipc.rs @@ -29,7 +29,7 @@ impl IpcExec { }; let force_async = config::force_async(); - let mut out = if is_cloud || (self.sources.is_files() && force_async) { + let mut out = if is_cloud || (self.sources.is_paths() && force_async) { feature_gated!("cloud", { if force_async && config::verbose() { eprintln!("ASYNC READING FORCED"); @@ -102,7 +102,7 @@ impl IpcExec { self.file_options .include_file_paths .as_ref() - .map(|x| (x.clone(), Arc::from(source.to_file_path()))), + .map(|x| (x.clone(), Arc::from(source.to_include_path_name()))), ) .finish() }; diff --git a/crates/polars-mem-engine/src/executors/scan/ndjson.rs b/crates/polars-mem-engine/src/executors/scan/ndjson.rs index 06e1d18892c6..a662760fd54b 100644 --- a/crates/polars-mem-engine/src/executors/scan/ndjson.rs +++ b/crates/polars-mem-engine/src/executors/scan/ndjson.rs @@ -39,9 +39,9 @@ impl JsonExec { let verbose = config::verbose(); let force_async = config::force_async(); - let run_async = (self.sources.is_files() && force_async) || self.sources.is_cloud_url(); + let run_async = (self.sources.is_paths() && force_async) || self.sources.is_cloud_url(); - if self.sources.is_files() && force_async && verbose { + if self.sources.is_paths() && force_async && verbose { eprintln!("ASYNC READING FORCED"); } @@ -108,7 +108,7 @@ impl JsonExec { } if let Some(col) = &self.file_scan_options.include_file_paths { - let name = source.to_file_path(); + let name = source.to_include_path_name(); unsafe { df.with_column_unchecked( StringChunked::full(col.clone(), name, df.height()).into_series(), diff --git a/crates/polars-mem-engine/src/executors/scan/parquet.rs b/crates/polars-mem-engine/src/executors/scan/parquet.rs index 2f32e0b50aa3..a37fc7c42f33 100644 --- a/crates/polars-mem-engine/src/executors/scan/parquet.rs +++ b/crates/polars-mem-engine/src/executors/scan/parquet.rs @@ -161,7 +161,7 @@ impl ParquetExec { self.file_options .include_file_paths .as_ref() - .map(|x| (x.clone(), Arc::from(source.to_file_path()))), + .map(|x| (x.clone(), Arc::from(source.to_include_path_name()))), ); reader @@ -453,7 +453,7 @@ impl ParquetExec { let is_cloud = self.sources.is_cloud_url(); let force_async = config::force_async(); - let out = if is_cloud || (self.sources.is_files() && force_async) { + let out = if is_cloud || (self.sources.is_paths() && force_async) { feature_gated!("cloud", { if force_async && config::verbose() { eprintln!("ASYNC READING FORCED"); diff --git a/crates/polars-plan/src/plans/conversion/scans.rs b/crates/polars-plan/src/plans/conversion/scans.rs index 2b20d9fe932a..25dd61aa1eb9 100644 --- a/crates/polars-plan/src/plans/conversion/scans.rs +++ b/crates/polars-plan/src/plans/conversion/scans.rs @@ -152,7 +152,7 @@ pub(super) fn csv_file_info( // * See if we can do this without downloading the entire file // prints the error message if paths is empty. 
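The executor hunks above rename `is_files` to `is_paths` and `to_file_path` to `to_include_path_name`, making it explicit that the forced-async and cloud read paths only apply to path-backed sources, while buffers and open file handles are always read locally. When `include_file_paths` is requested for a non-path source, the column gets a placeholder instead of a real path. A small sketch of what that looks like from Python, assuming this series is applied (the column name `path` is arbitrary):

    import io
    import polars as pl

    df = pl.DataFrame({"a": [1, 2, 3]})
    buf = io.BytesIO()
    df.write_parquet(buf)
    buf.seek(0)

    out = pl.scan_parquet(buf, include_file_paths="path").collect()
    # Buffer sources are labelled "in-mem"; open file handles would show "open-file".
    print(out["path"][0])
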
- let run_async = sources.is_cloud_url() || (sources.is_files() && config::force_async()); + let run_async = sources.is_cloud_url() || (sources.is_paths() && config::force_async()); let cache_entries = { if run_async { @@ -268,7 +268,7 @@ pub(super) fn ndjson_file_info( polars_bail!(ComputeError: "expected at least 1 source"); }; - let run_async = sources.is_cloud_url() || (sources.is_files() && config::force_async()); + let run_async = sources.is_cloud_url() || (sources.is_paths() && config::force_async()); let cache_entries = { if run_async { diff --git a/crates/polars-plan/src/plans/functions/count.rs b/crates/polars-plan/src/plans/functions/count.rs index 0b16c8eac994..7375ff47ff31 100644 --- a/crates/polars-plan/src/plans/functions/count.rs +++ b/crates/polars-plan/src/plans/functions/count.rs @@ -226,7 +226,7 @@ pub(super) fn count_rows_ndjson( } let is_cloud_url = sources.is_cloud_url(); - let run_async = is_cloud_url || (sources.is_files() && config::force_async()); + let run_async = is_cloud_url || (sources.is_paths() && config::force_async()); let cache_entries = { if run_async { diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index cf0b5ee8df7d..a9eb45b6406f 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -1,22 +1,20 @@ mod dot; mod format; mod inputs; +mod scan_sources; mod schema; pub(crate) mod tree_format; use std::borrow::Cow; use std::fmt; -use std::fs::File; -use std::path::{Path, PathBuf}; pub use dot::{EscapeLabel, IRDotDisplay, PathsDisplay, ScanSourcesDisplay}; pub use format::{ExprIRDisplay, IRDisplay}; use hive::HivePartitions; -use polars_core::error::feature_gated; use polars_core::prelude::*; use polars_utils::idx_vec::UnitVec; -use polars_utils::mmap::MemSlice; use polars_utils::unitvec; +pub use scan_sources::{ScanSourceIter, ScanSourceRef, ScanSources}; #[cfg(feature = "ir_serde")] use serde::{Deserialize, Serialize}; @@ -36,246 +34,6 @@ pub struct IRPlanRef<'a> { pub expr_arena: &'a Arena, } -#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] -#[derive(Debug, Clone)] -pub enum ScanSources { - Paths(Arc<[PathBuf]>), - - #[cfg_attr(feature = "serde", serde(skip))] - Files(Arc<[File]>), - #[cfg_attr(feature = "serde", serde(skip))] - Buffers(Arc<[bytes::Bytes]>), -} - -impl std::hash::Hash for ScanSources { - fn hash(&self, state: &mut H) { - std::mem::discriminant(self).hash(state); - - // @NOTE: This is a bit crazy - match self { - Self::Paths(paths) => paths.hash(state), - Self::Files(files) => files.as_ptr().hash(state), - Self::Buffers(buffers) => buffers.as_ptr().hash(state), - } - } -} - -impl PartialEq for ScanSources { - fn eq(&self, other: &Self) -> bool { - match (self, other) { - (ScanSources::Paths(l), ScanSources::Paths(r)) => l == r, - _ => false, - } - } -} - -impl Eq for ScanSources {} - -#[derive(Debug, Clone, Copy)] -pub enum ScanSourceRef<'a> { - Path(&'a Path), - File(&'a File), - Buffer(&'a bytes::Bytes), -} - -pub struct ScanSourceSliceInfo { - pub item_slice: std::ops::Range, - pub source_slice: std::ops::Range, -} - -impl Default for ScanSources { - fn default() -> Self { - Self::Buffers(Arc::default()) - } -} - -impl<'a> ScanSourceRef<'a> { - pub fn to_file_path(&self) -> &str { - match self { - Self::Path(path) => path.to_str().unwrap(), - Self::File(_) => "open-file", - Self::Buffer(_) => "in-mem", - } - } - - pub fn to_memslice(&self) -> PolarsResult { - self.to_memslice_possibly_async(false, None, 0) - } - - pub 
fn to_memslice_async_latest(&self, run_async: bool) -> PolarsResult { - match self { - ScanSourceRef::Path(path) => { - let file = if run_async { - feature_gated!("cloud", { - polars_io::file_cache::FILE_CACHE - .get_entry(path.to_str().unwrap()) - // Safety: This was initialized by schema inference. - .unwrap() - .try_open_assume_latest()? - }) - } else { - polars_utils::open_file(path)? - }; - - Ok(MemSlice::from_mmap(Arc::new(unsafe { - memmap::Mmap::map(&file)? - }))) - }, - ScanSourceRef::File(file) => Ok(MemSlice::from_mmap(Arc::new(unsafe { - memmap::Mmap::map(*file)? - }))), - ScanSourceRef::Buffer(buff) => Ok(MemSlice::from_bytes((*buff).clone())), - } - } - - pub fn to_memslice_possibly_async( - &self, - run_async: bool, - #[cfg(feature = "cloud")] cache_entries: Option< - &Vec>, - >, - #[cfg(not(feature = "cloud"))] cache_entries: Option<&()>, - index: usize, - ) -> PolarsResult { - match self { - Self::Path(path) => { - let f = if run_async { - feature_gated!("cloud", { - cache_entries.unwrap()[index].try_open_check_latest()? - }) - } else { - polars_utils::open_file(path)? - }; - - let mmap = unsafe { memmap::Mmap::map(&f)? }; - Ok(MemSlice::from_mmap(Arc::new(mmap))) - }, - Self::File(file) => { - let mmap = unsafe { memmap::Mmap::map(*file)? }; - Ok(MemSlice::from_mmap(Arc::new(mmap))) - }, - Self::Buffer(buff) => Ok(MemSlice::from_bytes((*buff).clone())), - } - } -} - -impl ScanSources { - pub fn iter(&self) -> ScanSourceIter { - ScanSourceIter { - sources: self, - offset: 0, - } - } - - pub fn as_paths(&self) -> Option<&[PathBuf]> { - match self { - Self::Paths(paths) => Some(paths.as_ref()), - Self::Files(_) | Self::Buffers(_) => None, - } - } - - pub fn into_paths(&self) -> Option> { - match self { - Self::Paths(paths) => Some(paths.clone()), - Self::Files(_) | Self::Buffers(_) => None, - } - } - - pub fn first_path(&self) -> Option<&Path> { - match self { - Self::Paths(paths) => paths.first().map(|p| p.as_path()), - Self::Files(_) | Self::Buffers(_) => None, - } - } - - pub fn to_dsl(self, is_expanded: bool) -> DslScanSources { - DslScanSources { - sources: self, - is_expanded, - } - } - - pub fn is_files(&self) -> bool { - matches!(self, Self::Paths(_)) - } - - pub fn is_cloud_url(&self) -> bool { - match self { - Self::Paths(paths) => paths.first().map_or(false, polars_io::is_cloud_url), - Self::Files(_) | Self::Buffers(_) => false, - } - } - - pub fn len(&self) -> usize { - match self { - Self::Paths(s) => s.len(), - Self::Files(s) => s.len(), - Self::Buffers(s) => s.len(), - } - } - - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - - pub fn first(&self) -> Option { - self.get(0) - } - - pub fn id(&self) -> PlSmallStr { - if self.is_empty() { - return PlSmallStr::from_static("EMPTY"); - } - - match self { - Self::Paths(paths) => { - PlSmallStr::from_str(paths.first().unwrap().to_string_lossy().as_ref()) - }, - Self::Files(_) => PlSmallStr::from_static("OPEN_FILES"), - Self::Buffers(_) => PlSmallStr::from_static("IN_MEMORY"), - } - } - - pub fn get(&self, idx: usize) -> Option { - match self { - Self::Paths(paths) => paths.get(idx).map(|p| ScanSourceRef::Path(p)), - Self::Files(files) => files.get(idx).map(ScanSourceRef::File), - Self::Buffers(buffers) => buffers.get(idx).map(ScanSourceRef::Buffer), - } - } - - pub fn at(&self, idx: usize) -> ScanSourceRef { - self.get(idx).unwrap() - } -} - -pub struct ScanSourceIter<'a> { - sources: &'a ScanSources, - offset: usize, -} - -impl<'a> Iterator for ScanSourceIter<'a> { - type Item = ScanSourceRef<'a>; - - fn 
next(&mut self) -> Option { - let item = match self.sources { - ScanSources::Paths(paths) => ScanSourceRef::Path(paths.get(self.offset)?), - ScanSources::Files(files) => ScanSourceRef::File(files.get(self.offset)?), - ScanSources::Buffers(buffers) => ScanSourceRef::Buffer(buffers.get(self.offset)?), - }; - - self.offset += 1; - Some(item) - } - - fn size_hint(&self) -> (usize, Option) { - let len = self.sources.len() - self.offset; - (len, Some(len)) - } -} - -impl<'a> ExactSizeIterator for ScanSourceIter<'a> {} - /// [`IR`] is a representation of [`DslPlan`] with [`Node`]s which are allocated in an [`Arena`] /// In this IR the logical plan has access to the full dataset. #[derive(Clone, Debug, Default)] diff --git a/crates/polars-plan/src/plans/ir/scan_sources.rs b/crates/polars-plan/src/plans/ir/scan_sources.rs new file mode 100644 index 000000000000..5261d6ede706 --- /dev/null +++ b/crates/polars-plan/src/plans/ir/scan_sources.rs @@ -0,0 +1,270 @@ +use std::fs::File; +use std::path::{Path, PathBuf}; +use std::sync::Arc; + +use polars_core::error::{feature_gated, PolarsResult}; +use polars_utils::mmap::MemSlice; +use polars_utils::pl_str::PlSmallStr; + +use super::DslScanSources; + +/// Set of sources to scan from +/// +/// This is can either be a list of paths to files, opened files or in-memory buffers. Mixing of +/// buffers is not currently possible. +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +#[derive(Debug, Clone)] +pub enum ScanSources { + Paths(Arc<[PathBuf]>), + + #[cfg_attr(feature = "serde", serde(skip))] + Files(Arc<[File]>), + #[cfg_attr(feature = "serde", serde(skip))] + Buffers(Arc<[bytes::Bytes]>), +} + +/// A reference to a single item in [`ScanSources`] +#[derive(Debug, Clone, Copy)] +pub enum ScanSourceRef<'a> { + Path(&'a Path), + File(&'a File), + Buffer(&'a bytes::Bytes), +} + +/// An iterator for [`ScanSources`] +pub struct ScanSourceIter<'a> { + sources: &'a ScanSources, + offset: usize, +} + +impl Default for ScanSources { + fn default() -> Self { + Self::Buffers(Arc::default()) + } +} + +impl std::hash::Hash for ScanSources { + fn hash(&self, state: &mut H) { + std::mem::discriminant(self).hash(state); + + // @NOTE: This is a bit crazy + // + // We don't really want to hash the file descriptors or the whole buffers so for now we + // just settle with the fact that the memory behind Arc's does not really move. Therefore, + // we can just hash the pointer. + match self { + Self::Paths(paths) => paths.hash(state), + Self::Files(files) => files.as_ptr().hash(state), + Self::Buffers(buffers) => buffers.as_ptr().hash(state), + } + } +} + +impl PartialEq for ScanSources { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (ScanSources::Paths(l), ScanSources::Paths(r)) => l == r, + (ScanSources::Files(l), ScanSources::Files(r)) => std::ptr::eq(l.as_ptr(), r.as_ptr()), + (ScanSources::Buffers(l), ScanSources::Buffers(r)) => { + std::ptr::eq(l.as_ptr(), r.as_ptr()) + }, + _ => false, + } + } +} + +impl Eq for ScanSources {} + +impl ScanSources { + pub fn iter(&self) -> ScanSourceIter { + ScanSourceIter { + sources: self, + offset: 0, + } + } + + pub fn to_dsl(self, is_expanded: bool) -> DslScanSources { + DslScanSources { + sources: self, + is_expanded, + } + } + + /// Are the sources all paths? 
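The new `scan_sources.rs` module keeps path-backed and in-memory sources in separate enum variants, and the Python-side conversion further down in this patch refuses to mix them within one scan. A sketch of the two accepted shapes from Python, assuming this series is applied and using hypothetical file names:

    import io
    import polars as pl

    df = pl.DataFrame({"a": [1, 2, 3]})

    # Path-backed sources: globbing, cloud URLs and forced-async reads apply here.
    df.write_parquet("part-0.parquet")  # hypothetical paths
    df.write_parquet("part-1.parquet")
    by_path = pl.scan_parquet(["part-0.parquet", "part-1.parquet"]).collect()

    # In-memory sources: each buffer acts as one "file" of the scan.
    bufs = []
    for _ in range(2):
        b = io.BytesIO()
        df.write_parquet(b)
        b.seek(0)
        bufs.append(b)
    by_buffer = pl.scan_parquet(bufs).collect()

    assert by_path.height == by_buffer.height == 6
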
+ pub fn is_paths(&self) -> bool { + matches!(self, Self::Paths(_)) + } + + /// Try cast the scan sources to [`ScanSources::Paths`] + pub fn as_paths(&self) -> Option<&[PathBuf]> { + match self { + Self::Paths(paths) => Some(paths.as_ref()), + Self::Files(_) | Self::Buffers(_) => None, + } + } + + /// Try cast the scan sources to [`ScanSources::Paths`] with a clone + pub fn into_paths(&self) -> Option> { + match self { + Self::Paths(paths) => Some(paths.clone()), + Self::Files(_) | Self::Buffers(_) => None, + } + } + + /// Try get the first path in the scan sources + pub fn first_path(&self) -> Option<&Path> { + match self { + Self::Paths(paths) => paths.first().map(|p| p.as_path()), + Self::Files(_) | Self::Buffers(_) => None, + } + } + + /// Is the first path a cloud URL? + pub fn is_cloud_url(&self) -> bool { + self.first_path().is_some_and(polars_io::is_cloud_url) + } + + pub fn len(&self) -> usize { + match self { + Self::Paths(s) => s.len(), + Self::Files(s) => s.len(), + Self::Buffers(s) => s.len(), + } + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + pub fn first(&self) -> Option { + self.get(0) + } + + /// Turn the [`ScanSources`] into some kind of identifier + pub fn id(&self) -> PlSmallStr { + if self.is_empty() { + return PlSmallStr::from_static("EMPTY"); + } + + match self { + Self::Paths(paths) => { + PlSmallStr::from_str(paths.first().unwrap().to_string_lossy().as_ref()) + }, + Self::Files(_) => PlSmallStr::from_static("OPEN_FILES"), + Self::Buffers(_) => PlSmallStr::from_static("IN_MEMORY"), + } + } + + /// Get the scan source at specific address + pub fn get(&self, idx: usize) -> Option { + match self { + Self::Paths(paths) => paths.get(idx).map(|p| ScanSourceRef::Path(p)), + Self::Files(files) => files.get(idx).map(ScanSourceRef::File), + Self::Buffers(buffers) => buffers.get(idx).map(ScanSourceRef::Buffer), + } + } + + /// Get the scan source at specific address + /// + /// # Panics + /// + /// If the `idx` is out of range. + #[track_caller] + pub fn at(&self, idx: usize) -> ScanSourceRef { + self.get(idx).unwrap() + } +} + +impl<'a> ScanSourceRef<'a> { + /// Get the name for `include_paths` + pub fn to_include_path_name(&self) -> &str { + match self { + Self::Path(path) => path.to_str().unwrap(), + Self::File(_) => "open-file", + Self::Buffer(_) => "in-mem", + } + } + + /// Turn the scan source into a memory slice + pub fn to_memslice(&self) -> PolarsResult { + self.to_memslice_possibly_async(false, None, 0) + } + + pub fn to_memslice_async_latest(&self, run_async: bool) -> PolarsResult { + match self { + ScanSourceRef::Path(path) => { + let file = if run_async { + feature_gated!("cloud", { + polars_io::file_cache::FILE_CACHE + .get_entry(path.to_str().unwrap()) + // Safety: This was initialized by schema inference. + .unwrap() + .try_open_assume_latest()? + }) + } else { + polars_utils::open_file(path)? + }; + + Ok(MemSlice::from_mmap(Arc::new(unsafe { + memmap::Mmap::map(&file)? + }))) + }, + ScanSourceRef::File(file) => Ok(MemSlice::from_mmap(Arc::new(unsafe { + memmap::Mmap::map(*file)? 
+ }))), + ScanSourceRef::Buffer(buff) => Ok(MemSlice::from_bytes((*buff).clone())), + } + } + + pub fn to_memslice_possibly_async( + &self, + run_async: bool, + #[cfg(feature = "cloud")] cache_entries: Option< + &Vec>, + >, + #[cfg(not(feature = "cloud"))] cache_entries: Option<&()>, + index: usize, + ) -> PolarsResult { + match self { + Self::Path(path) => { + let f = if run_async { + feature_gated!("cloud", { + cache_entries.unwrap()[index].try_open_check_latest()? + }) + } else { + polars_utils::open_file(path)? + }; + + let mmap = unsafe { memmap::Mmap::map(&f)? }; + Ok(MemSlice::from_mmap(Arc::new(mmap))) + }, + Self::File(file) => { + let mmap = unsafe { memmap::Mmap::map(*file)? }; + Ok(MemSlice::from_mmap(Arc::new(mmap))) + }, + Self::Buffer(buff) => Ok(MemSlice::from_bytes((*buff).clone())), + } + } +} + +impl<'a> Iterator for ScanSourceIter<'a> { + type Item = ScanSourceRef<'a>; + + fn next(&mut self) -> Option { + let item = match self.sources { + ScanSources::Paths(paths) => ScanSourceRef::Path(paths.get(self.offset)?), + ScanSources::Files(files) => ScanSourceRef::File(files.get(self.offset)?), + ScanSources::Buffers(buffers) => ScanSourceRef::Buffer(buffers.get(self.offset)?), + }; + + self.offset += 1; + Some(item) + } + + fn size_hint(&self) -> (usize, Option) { + let len = self.sources.len() - self.offset; + (len, Some(len)) + } +} + +impl<'a> ExactSizeIterator for ScanSourceIter<'a> {} diff --git a/crates/polars-python/src/conversion/mod.rs b/crates/polars-python/src/conversion/mod.rs index 4eee205d8550..fd8e97cb7adc 100644 --- a/crates/polars-python/src/conversion/mod.rs +++ b/crates/polars-python/src/conversion/mod.rs @@ -32,7 +32,7 @@ use pyo3::pybacked::PyBackedStr; use pyo3::types::{PyDict, PyList, PySequence}; use crate::error::PyPolarsErr; -use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; +use crate::file::{get_python_scan_source_input, PythonScanSourceInput}; #[cfg(feature = "object")] use crate::object::OBJECT_NAME; use crate::prelude::*; @@ -549,29 +549,24 @@ impl<'py> FromPyObject<'py> for Wrap { let num_items = list.len(); let mut iter = list .into_iter() - .map(|val| get_either_file_or_path(val.unbind(), false)); + .map(|val| get_python_scan_source_input(val.unbind(), false)); let Some(first) = iter.next() else { return Ok(Wrap(ScanSources::default())); }; let mut sources = match first? { - EitherPythonFileOrPath::Py(f) => { - let mut sources = Vec::with_capacity(num_items); - sources.push(f.as_bytes()); - MutableSources::Buffers(sources) - }, - EitherPythonFileOrPath::Path(path) => { + PythonScanSourceInput::Path(path) => { let mut sources = Vec::with_capacity(num_items); sources.push(path); MutableSources::Paths(sources) }, - EitherPythonFileOrPath::File(file) => { + PythonScanSourceInput::File(file) => { let mut sources = Vec::with_capacity(num_items); sources.push(file); MutableSources::Files(sources) }, - EitherPythonFileOrPath::Buffer(buffer) => { + PythonScanSourceInput::Buffer(buffer) => { let mut sources = Vec::with_capacity(num_items); sources.push(buffer); MutableSources::Buffers(sources) @@ -580,10 +575,9 @@ impl<'py> FromPyObject<'py> for Wrap { for source in iter { match (&mut sources, source?) 
{ - (MutableSources::Paths(v), EitherPythonFileOrPath::Path(p)) => v.push(p), - (MutableSources::Files(v), EitherPythonFileOrPath::File(f)) => v.push(f), - (MutableSources::Buffers(v), EitherPythonFileOrPath::Py(f)) => v.push(f.as_bytes()), - (MutableSources::Buffers(v), EitherPythonFileOrPath::Buffer(f)) => v.push(f), + (MutableSources::Paths(v), PythonScanSourceInput::Path(p)) => v.push(p), + (MutableSources::Files(v), PythonScanSourceInput::File(f)) => v.push(f), + (MutableSources::Buffers(v), PythonScanSourceInput::Buffer(f)) => v.push(f), _ => { return Err(PyTypeError::new_err( "Cannot combine in-memory bytes, paths and files for scan sources", diff --git a/crates/polars-python/src/file.rs b/crates/polars-python/src/file.rs index 2857b37a4891..33d084c5130c 100644 --- a/crates/polars-python/src/file.rs +++ b/crates/polars-python/src/file.rs @@ -203,20 +203,22 @@ impl EitherRustPythonFile { } } -pub enum EitherPythonFileOrPath { - Py(PyFileLikeObject), +pub enum PythonScanSourceInput { Buffer(bytes::Bytes), Path(PathBuf), File(File), } -pub fn get_either_file_or_path(py_f: PyObject, write: bool) -> PyResult { +pub fn get_python_scan_source_input( + py_f: PyObject, + write: bool, +) -> PyResult { Python::with_gil(|py| { let py_f = py_f.into_bound(py); // If the pyobject is a `bytes` class if let Ok(bytes) = py_f.downcast::() { - return Ok(EitherPythonFileOrPath::Buffer( + return Ok(PythonScanSourceInput::Buffer( bytes::Bytes::copy_from_slice(bytes.as_bytes()), )); } @@ -224,7 +226,7 @@ pub fn get_either_file_or_path(py_f: PyObject, write: bool) -> PyResult>() { let file_path = std::path::Path::new(&*s); let file_path = resolve_homedir(file_path); - Ok(EitherPythonFileOrPath::Path(file_path)) + Ok(PythonScanSourceInput::Path(file_path)) } else { let io = py.import_bound("io").unwrap(); let is_utf8_encoding = |py_f: &Bound| -> PyResult { @@ -277,7 +279,7 @@ pub fn get_either_file_or_path(py_f: PyObject, write: bool) -> PyResult PyResult PyResult<(Option, ScanSources)> { - use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; - Ok(match get_either_file_or_path(obj, false)? { - EitherPythonFileOrPath::Path(path) => { + use crate::file::{get_python_scan_source_input, PythonScanSourceInput}; + Ok(match get_python_scan_source_input(obj, false)? 
{ + PythonScanSourceInput::Path(path) => { (Some(path.clone()), ScanSources::Paths([path].into())) }, - EitherPythonFileOrPath::File(file) => (None, ScanSources::Files([file].into())), - EitherPythonFileOrPath::Py(f) => (None, ScanSources::Buffers([f.as_bytes()].into())), - EitherPythonFileOrPath::Buffer(buff) => (None, ScanSources::Buffers([buff].into())), + PythonScanSourceInput::File(file) => (None, ScanSources::Files([file].into())), + PythonScanSourceInput::Buffer(buff) => (None, ScanSources::Buffers([buff].into())), }) } From 393e589391ff02e946135bbd28af21fea54dbab9 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Sun, 8 Sep 2024 14:35:23 +0200 Subject: [PATCH 21/27] fix mypy --- py-polars/polars/io/ndjson.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index 8fc2addd6901..cd9ea92bf3c0 100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -145,7 +145,7 @@ def read_ndjson( return df return scan_ndjson( - source, # type: ignore[arg-type] + source, schema=schema, schema_overrides=schema_overrides, infer_schema_length=infer_schema_length, From 1ddc3a580cc3febf737b08b280af52242fca5d79 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Sun, 8 Sep 2024 15:46:05 +0200 Subject: [PATCH 22/27] fix mmap --- crates/polars-io/src/ipc/mmap.rs | 13 +- crates/polars-io/src/mmap.rs | 83 +------------ crates/polars-io/src/utils/byte_source.rs | 5 +- crates/polars-io/src/utils/other.rs | 6 +- .../src/executors/scan/ipc.rs | 6 +- .../polars-plan/src/plans/ir/scan_sources.rs | 18 +-- crates/polars-python/src/dataframe/io.rs | 2 +- crates/polars-utils/src/mmap.rs | 112 ++++++++++++++++-- 8 files changed, 123 insertions(+), 122 deletions(-) diff --git a/crates/polars-io/src/ipc/mmap.rs b/crates/polars-io/src/ipc/mmap.rs index 854bd4c8d9d7..74ef2b533462 100644 --- a/crates/polars-io/src/ipc/mmap.rs +++ b/crates/polars-io/src/ipc/mmap.rs @@ -3,9 +3,10 @@ use arrow::io::ipc::read::{Dictionaries, FileMetadata}; use arrow::mmap::{mmap_dictionaries_unchecked, mmap_unchecked}; use arrow::record_batch::RecordBatch; use polars_core::prelude::*; +use polars_utils::mmap::MMapSemaphore; use super::ipc_file::IpcReader; -use crate::mmap::{MMapSemaphore, MmapBytesReader}; +use crate::mmap::MmapBytesReader; use crate::predicates::PhysicalIoExpr; use crate::shared::{finish_reader, ArrowReader}; use crate::utils::{apply_projection, columns_to_projection}; @@ -15,17 +16,9 @@ impl IpcReader { &mut self, predicate: Option>, ) -> PolarsResult { - #[cfg(target_family = "unix")] - use std::os::unix::fs::MetadataExt; match self.reader.to_file() { Some(file) => { - #[cfg(target_family = "unix")] - let metadata = file.metadata()?; - let mmap = unsafe { memmap::Mmap::map(file).unwrap() }; - #[cfg(target_family = "unix")] - let semaphore = MMapSemaphore::new(metadata.dev(), metadata.ino(), mmap); - #[cfg(not(target_family = "unix"))] - let semaphore = MMapSemaphore::new(mmap); + let semaphore = MMapSemaphore::new_from_file(&file)?; let metadata = read::read_file_metadata(&mut std::io::Cursor::new(semaphore.as_ref()))?; diff --git a/crates/polars-io/src/mmap.rs b/crates/polars-io/src/mmap.rs index ad2c05175810..498c73da1a9d 100644 --- a/crates/polars-io/src/mmap.rs +++ b/crates/polars-io/src/mmap.rs @@ -1,84 +1,9 @@ -#[cfg(target_family = "unix")] -use std::collections::btree_map::Entry; -#[cfg(target_family = "unix")] -use std::collections::BTreeMap; use std::fs::File; use std::io::{BufReader, Cursor, Read, Seek}; use 
std::sync::Arc; -#[cfg(target_family = "unix")] -use std::sync::Mutex; -use memmap::Mmap; -#[cfg(target_family = "unix")] -use once_cell::sync::Lazy; use polars_core::config::verbose; -#[cfg(target_family = "unix")] -use polars_error::polars_bail; -use polars_error::PolarsResult; -use polars_utils::mmap::MemSlice; - -// Keep track of memory mapped files so we don't write to them while reading -// Use a btree as it uses less memory than a hashmap and this thing never shrinks. -// Write handle in Windows is exclusive, so this is only necessary in Unix. -#[cfg(target_family = "unix")] -static MEMORY_MAPPED_FILES: Lazy>> = - Lazy::new(|| Mutex::new(Default::default())); - -pub(crate) struct MMapSemaphore { - #[cfg(target_family = "unix")] - key: (u64, u64), - mmap: Mmap, -} - -impl MMapSemaphore { - #[cfg(target_family = "unix")] - pub(super) fn new(dev: u64, ino: u64, mmap: Mmap) -> Self { - let mut guard = MEMORY_MAPPED_FILES.lock().unwrap(); - let key = (dev, ino); - guard.insert(key, 1); - Self { key, mmap } - } - - #[cfg(not(target_family = "unix"))] - pub(super) fn new(mmap: Mmap) -> Self { - Self { mmap } - } -} - -impl AsRef<[u8]> for MMapSemaphore { - #[inline] - fn as_ref(&self) -> &[u8] { - self.mmap.as_ref() - } -} - -#[cfg(target_family = "unix")] -impl Drop for MMapSemaphore { - fn drop(&mut self) { - let mut guard = MEMORY_MAPPED_FILES.lock().unwrap(); - if let Entry::Occupied(mut e) = guard.entry(self.key) { - let v = e.get_mut(); - *v -= 1; - - if *v == 0 { - e.remove_entry(); - } - } - } -} - -pub fn ensure_not_mapped(#[allow(unused)] file: &File) -> PolarsResult<()> { - #[cfg(target_family = "unix")] - { - use std::os::unix::fs::MetadataExt; - let guard = MEMORY_MAPPED_FILES.lock().unwrap(); - let metadata = file.metadata()?; - if guard.contains_key(&(metadata.dev(), metadata.ino())) { - polars_bail!(ComputeError: "cannot write to file: already memory mapped"); - } - } - Ok(()) -} +use polars_utils::mmap::{MMapSemaphore, MemSlice}; /// Trait used to get a hold to file handler or to the underlying bytes /// without performing a Read. 
@@ -143,7 +68,7 @@ impl MmapBytesReader for &mut T { pub enum ReaderBytes<'a> { Borrowed(&'a [u8]), Owned(Vec), - Mapped(memmap::Mmap, &'a File), + Mapped(MMapSemaphore, &'a File), } impl std::ops::Deref for ReaderBytes<'_> { @@ -152,7 +77,7 @@ impl std::ops::Deref for ReaderBytes<'_> { match self { Self::Borrowed(ref_bytes) => ref_bytes, Self::Owned(vec) => vec, - Self::Mapped(mmap, _) => mmap, + Self::Mapped(mmap, _) => mmap.as_ref(), } } } @@ -180,7 +105,7 @@ impl<'a, T: 'a + MmapBytesReader> From<&'a mut T> for ReaderBytes<'a> { None => { if let Some(f) = m.to_file() { let f = unsafe { std::mem::transmute::<&File, &'a File>(f) }; - let mmap = unsafe { memmap::Mmap::map(f).unwrap() }; + let mmap = MMapSemaphore::new_from_file(f).unwrap(); ReaderBytes::Mapped(mmap, f) } else { if verbose() { diff --git a/crates/polars-io/src/utils/byte_source.rs b/crates/polars-io/src/utils/byte_source.rs index fce7e795ce46..72cbabb3dd5c 100644 --- a/crates/polars-io/src/utils/byte_source.rs +++ b/crates/polars-io/src/utils/byte_source.rs @@ -1,7 +1,7 @@ use std::ops::Range; use std::sync::Arc; -use polars_error::{to_compute_err, PolarsResult}; +use polars_error::PolarsResult; use polars_utils::_limit_path_len_io_err; use polars_utils::mmap::MemSlice; @@ -34,9 +34,8 @@ impl MemSliceByteSource { .into_std() .await, ); - let mmap = Arc::new(unsafe { memmap::Mmap::map(file.as_ref()) }.map_err(to_compute_err)?); - Ok(Self(MemSlice::from_mmap(mmap))) + Ok(Self(MemSlice::from_file(file.as_ref())?)) } } diff --git a/crates/polars-io/src/utils/other.rs b/crates/polars-io/src/utils/other.rs index 3c1ab1e248d8..7267a6616924 100644 --- a/crates/polars-io/src/utils/other.rs +++ b/crates/polars-io/src/utils/other.rs @@ -7,6 +7,7 @@ use polars_core::prelude::*; #[cfg(any(feature = "ipc_streaming", feature = "parquet"))] use polars_core::utils::{accumulate_dataframes_vertical_unchecked, split_df_as_ref}; use polars_error::to_compute_err; +use polars_utils::mmap::MMapSemaphore; use regex::{Regex, RegexBuilder}; use crate::mmap::{MmapBytesReader, ReaderBytes}; @@ -21,12 +22,15 @@ pub fn get_reader_bytes<'a, R: Read + MmapBytesReader + ?Sized>( .ok() .and_then(|offset| Some((reader.to_file()?, offset))) { - let mmap = unsafe { memmap::MmapOptions::new().offset(offset).map(file)? }; + let mut options = memmap::MmapOptions::new(); + options.offset(offset); // somehow bck thinks borrows alias // this is sound as file was already bound to 'a use std::fs::File; + let file = unsafe { std::mem::transmute::<&File, &'a File>(file) }; + let mmap = MMapSemaphore::new_from_file_with_options(file, options)?; Ok(ReaderBytes::Mapped(mmap, file)) } else { // we can get the bytes for free diff --git a/crates/polars-mem-engine/src/executors/scan/ipc.rs b/crates/polars-mem-engine/src/executors/scan/ipc.rs index edde4765fec5..78b31f268756 100644 --- a/crates/polars-mem-engine/src/executors/scan/ipc.rs +++ b/crates/polars-mem-engine/src/executors/scan/ipc.rs @@ -81,11 +81,9 @@ impl IpcExec { Some(f) => f?, }; - MemSlice::from_mmap(Arc::new(unsafe { memmap::Mmap::map(&file)? })) - }, - ScanSourceRef::File(file) => { - MemSlice::from_mmap(Arc::new(unsafe { memmap::Mmap::map(file)? })) + MemSlice::from_file(&file)? 
            },
+            ScanSourceRef::File(file) => MemSlice::from_file(file)?,
             ScanSourceRef::Buffer(buff) => MemSlice::from_bytes(buff.clone()),
         };
 
diff --git a/crates/polars-plan/src/plans/ir/scan_sources.rs b/crates/polars-plan/src/plans/ir/scan_sources.rs
index 5261d6ede706..1bdb92fda904 100644
--- a/crates/polars-plan/src/plans/ir/scan_sources.rs
+++ b/crates/polars-plan/src/plans/ir/scan_sources.rs
@@ -205,13 +205,9 @@ impl<'a> ScanSourceRef<'a> {
                     polars_utils::open_file(path)?
                 };
 
-                Ok(MemSlice::from_mmap(Arc::new(unsafe {
-                    memmap::Mmap::map(&file)?
-                })))
+                MemSlice::from_file(&file)
             },
-            ScanSourceRef::File(file) => Ok(MemSlice::from_mmap(Arc::new(unsafe {
-                memmap::Mmap::map(*file)?
-            }))),
+            ScanSourceRef::File(file) => MemSlice::from_file(file),
             ScanSourceRef::Buffer(buff) => Ok(MemSlice::from_bytes((*buff).clone())),
         }
     }
@@ -227,7 +223,7 @@ impl<'a> ScanSourceRef<'a> {
     ) -> PolarsResult<MemSlice> {
         match self {
             Self::Path(path) => {
-                let f = if run_async {
+                let file = if run_async {
                     feature_gated!("cloud", {
                         cache_entries.unwrap()[index].try_open_check_latest()?
                     })
@@ -235,13 +231,9 @@ impl<'a> ScanSourceRef<'a> {
                     polars_utils::open_file(path)?
                 };
 
-                let mmap = unsafe { memmap::Mmap::map(&f)? };
-                Ok(MemSlice::from_mmap(Arc::new(mmap)))
-            },
-            Self::File(file) => {
-                let mmap = unsafe { memmap::Mmap::map(*file)? };
-                Ok(MemSlice::from_mmap(Arc::new(mmap)))
+                MemSlice::from_file(&file)
             },
+            Self::File(file) => MemSlice::from_file(file),
             Self::Buffer(buff) => Ok(MemSlice::from_bytes((*buff).clone())),
         }
     }
diff --git a/crates/polars-python/src/dataframe/io.rs b/crates/polars-python/src/dataframe/io.rs
index 12707e93dd85..dbdf91ddff09 100644
--- a/crates/polars-python/src/dataframe/io.rs
+++ b/crates/polars-python/src/dataframe/io.rs
@@ -4,11 +4,11 @@ use std::sync::Arc;
 
 #[cfg(feature = "avro")]
 use polars::io::avro::AvroCompression;
-use polars::io::mmap::ensure_not_mapped;
 use polars::io::RowIndex;
 use polars::prelude::*;
 #[cfg(feature = "parquet")]
 use polars_parquet::arrow::write::StatisticsOptions;
+use polars_utils::mmap::ensure_not_mapped;
 use pyo3::prelude::*;
 use pyo3::pybacked::PyBackedStr;
 
diff --git a/crates/polars-utils/src/mmap.rs b/crates/polars-utils/src/mmap.rs
index c753525b43ee..9e946b3dac52 100644
--- a/crates/polars-utils/src/mmap.rs
+++ b/crates/polars-utils/src/mmap.rs
@@ -1,14 +1,16 @@
+use std::fs::File;
 use std::io;
-use std::sync::Arc;
 
 pub use memmap::Mmap;
 
 mod private {
+    use std::fs::File;
     use std::ops::Deref;
     use std::sync::Arc;
 
-    pub use memmap::Mmap;
+    use polars_error::PolarsResult;
 
+    use super::MMapSemaphore;
     use crate::mem::prefetch_l2;
 
     /// A read-only reference to a slice of memory that can potentially be memory-mapped.
@@ -34,7 +36,7 @@ mod private {
     #[allow(unused)]
     enum MemSliceInner {
         Bytes(bytes::Bytes),
-        Mmap(Arc<Mmap>),
+        Mmap(Arc<MMapSemaphore>),
     }
 
     impl Deref for MemSlice {
@@ -82,7 +84,7 @@ mod private {
         }
 
         #[inline]
-        pub fn from_mmap(mmap: Arc<Mmap>) -> Self {
+        pub fn from_mmap(mmap: Arc<MMapSemaphore>) -> Self {
             Self {
                 slice: unsafe {
                     std::mem::transmute::<&[u8], &'static [u8]>(mmap.as_ref().as_ref())
@@ -91,6 +93,12 @@ mod private {
             }
         }
 
+        #[inline]
+        pub fn from_file(file: &File) -> PolarsResult<Self> {
+            let mmap = MMapSemaphore::new_from_file(file)?;
+            Ok(Self::from_mmap(Arc::new(mmap)))
+        }
+
         /// Construct a `MemSlice` that simply wraps around a `&[u8]`.
         #[inline]
         pub fn from_slice(slice: &'static [u8]) -> Self {
@@ -115,6 +123,8 @@ mod private {
     }
 }
 
+use memmap::MmapOptions;
+use polars_error::{polars_bail, PolarsResult};
 pub use private::MemSlice;
 
 /// A cursor over a [`MemSlice`].
@@ -156,11 +166,6 @@ impl MemReader {
         Self::new(MemSlice::from_bytes(bytes))
     }
 
-    #[inline(always)]
-    pub fn from_mmap(mmap: Arc<Mmap>) -> Self {
-        Self::new(MemSlice::from_mmap(mmap))
-    }
-
     // Construct a `MemSlice` that simply wraps around a `&[u8]`. The caller must ensure the
     /// slice outlives the returned `MemSlice`.
     #[inline]
@@ -231,8 +236,91 @@ impl io::Seek for MemReader {
     }
 }
 
-mod tests {
+// Keep track of memory mapped files so we don't write to them while reading
+// Use a btree as it uses less memory than a hashmap and this thing never shrinks.
+// Write handle in Windows is exclusive, so this is only necessary in Unix.
+#[cfg(target_family = "unix")]
+static MEMORY_MAPPED_FILES: once_cell::sync::Lazy<
+    std::sync::Mutex<std::collections::BTreeMap<(u64, u64), u32>>,
+> = once_cell::sync::Lazy::new(|| std::sync::Mutex::new(Default::default()));
+
+#[derive(Debug)]
+pub struct MMapSemaphore {
+    #[cfg(target_family = "unix")]
+    key: (u64, u64),
+    mmap: Mmap,
+}
+
+impl MMapSemaphore {
+    pub fn new_from_file_with_options(
+        file: &File,
+        options: MmapOptions,
+    ) -> PolarsResult<Self> {
+        let mmap = unsafe { options.map(file) }?;
+
+        #[cfg(target_family = "unix")]
+        {
+            use std::os::unix::fs::MetadataExt;
+            let metadata = file.metadata()?;
+
+            let mut guard = MEMORY_MAPPED_FILES.lock().unwrap();
+            let key = (metadata.dev(), metadata.ino());
+            match guard.entry(key) {
+                std::collections::btree_map::Entry::Occupied(mut e) => *e.get_mut() += 1,
+                std::collections::btree_map::Entry::Vacant(e) => _ = e.insert(1),
+            }
+            Ok(Self { key, mmap })
+        }
+
+        #[cfg(not(target_family = "unix"))]
+        Ok(Self { mmap })
+    }
+
+    pub fn new_from_file(file: &File) -> PolarsResult<Self> {
+        Self::new_from_file_with_options(file, MmapOptions::default())
+    }
+
+    pub fn as_ptr(&self) -> *const u8 {
+        self.mmap.as_ptr()
+    }
+}
+
+impl AsRef<[u8]> for MMapSemaphore {
+    #[inline]
+    fn as_ref(&self) -> &[u8] {
+        self.mmap.as_ref()
+    }
+}
+
+#[cfg(target_family = "unix")]
+impl Drop for MMapSemaphore {
+    fn drop(&mut self) {
+        let mut guard = MEMORY_MAPPED_FILES.lock().unwrap();
+        if let std::collections::btree_map::Entry::Occupied(mut e) = guard.entry(self.key) {
+            let v = e.get_mut();
+            *v -= 1;
+
+            if *v == 0 {
+                e.remove_entry();
+            }
+        }
+    }
+}
+pub fn ensure_not_mapped(#[allow(unused)] file: &File) -> PolarsResult<()> {
+    #[cfg(target_family = "unix")]
+    {
+        use std::os::unix::fs::MetadataExt;
+        let guard = MEMORY_MAPPED_FILES.lock().unwrap();
+        let metadata = file.metadata()?;
+        if guard.contains_key(&(metadata.dev(), metadata.ino())) {
+            polars_bail!(ComputeError: "cannot write to file: already memory mapped");
+        }
+    }
+    Ok(())
+}
+
+mod tests {
     #[test]
     fn test_mem_slice_zero_copy() {
         use std::sync::Arc;
@@ -271,9 +359,11 @@ mod tests {
         }
 
         {
+            use crate::mmap::MMapSemaphore;
+
             let path = "../../examples/datasets/foods1.csv";
             let file = std::fs::File::open(path).unwrap();
-            let mmap = unsafe { memmap::Mmap::map(&file) }.unwrap();
+            let mmap = MMapSemaphore::new_from_file(&file).unwrap();
             let ptr = mmap.as_ptr();
             let mem_slice = MemSlice::from_mmap(Arc::new(mmap));
 

From 124b5484bbe50b90ee73d158864e83ff147ca006 Mon Sep 17 00:00:00 2001
From: coastalwhite
Date: Sun, 8 Sep 2024 15:47:57 +0200
Subject: [PATCH 23/27] clippy

---
 crates/polars-io/src/ipc/mmap.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/polars-io/src/ipc/mmap.rs b/crates/polars-io/src/ipc/mmap.rs
index 74ef2b533462..f0343642482e 100644
--- a/crates/polars-io/src/ipc/mmap.rs
+++ b/crates/polars-io/src/ipc/mmap.rs
@@ -18,7 +18,7 @@ impl IpcReader {
     ) -> PolarsResult<DataFrame> {
         match
self.reader.to_file() { Some(file) => { - let semaphore = MMapSemaphore::new_from_file(&file)?; + let semaphore = MMapSemaphore::new_from_file(file)?; let metadata = read::read_file_metadata(&mut std::io::Cursor::new(semaphore.as_ref()))?; From b2888111948a03d2e67a84fb6d3012a2f358c4e1 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Sun, 8 Sep 2024 16:04:30 +0200 Subject: [PATCH 24/27] remove broken test --- py-polars/tests/unit/io/test_ipc.py | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/py-polars/tests/unit/io/test_ipc.py b/py-polars/tests/unit/io/test_ipc.py index a1505cf3d2fc..52f69a450507 100644 --- a/py-polars/tests/unit/io/test_ipc.py +++ b/py-polars/tests/unit/io/test_ipc.py @@ -343,29 +343,3 @@ def test_ipc_decimal_15920( path = f"{tmp_path}/data" df.write_ipc(path) assert_frame_equal(pl.read_ipc(path), df) - - -@pytest.mark.write_disk -def test_ipc_raise_on_writing_mmap(tmp_path: Path) -> None: - p = tmp_path / "foo.ipc" - df = pl.DataFrame({"foo": [1, 2, 3]}) - # first write is allowed - df.write_ipc(p) - - # now open as memory mapped - df = pl.read_ipc(p, memory_map=True) - - if os.name == "nt": - # In Windows, it's the duty of the system to ensure exclusive access - with pytest.raises( - OSError, - match=re.escape( - "The requested operation cannot be performed on a file with a user-mapped section open. (os error 1224)" - ), - ): - df.write_ipc(p) - else: - with pytest.raises( - ComputeError, match="cannot write to file: already memory mapped" - ): - df.write_ipc(p) From a3ffac672b3127131bd8a60b3aae78903455c999 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Sun, 8 Sep 2024 16:05:02 +0200 Subject: [PATCH 25/27] ruff --- py-polars/tests/unit/io/test_ipc.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/py-polars/tests/unit/io/test_ipc.py b/py-polars/tests/unit/io/test_ipc.py index 52f69a450507..dd60d0ae209c 100644 --- a/py-polars/tests/unit/io/test_ipc.py +++ b/py-polars/tests/unit/io/test_ipc.py @@ -1,8 +1,6 @@ from __future__ import annotations import io -import os -import re from decimal import Decimal from typing import TYPE_CHECKING, Any @@ -10,7 +8,6 @@ import pytest import polars as pl -from polars.exceptions import ComputeError from polars.interchange.protocol import CompatLevel from polars.testing import assert_frame_equal From e35eabe6355fa79b077cede1adf53aae7f393b9f Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Mon, 9 Sep 2024 09:17:13 +0200 Subject: [PATCH 26/27] remove stale memory_map option --- crates/polars-io/src/ipc/ipc_file.rs | 5 +---- crates/polars-lazy/src/scan/ipc.rs | 6 +----- crates/polars-python/src/lazyframe/general.rs | 4 +--- py-polars/polars/io/ipc/functions.py | 6 +++--- 4 files changed, 6 insertions(+), 15 deletions(-) diff --git a/crates/polars-io/src/ipc/ipc_file.rs b/crates/polars-io/src/ipc/ipc_file.rs index 9347a453b426..feaea44f5417 100644 --- a/crates/polars-io/src/ipc/ipc_file.rs +++ b/crates/polars-io/src/ipc/ipc_file.rs @@ -51,10 +51,7 @@ use crate::RowIndex; #[derive(Clone, Debug, PartialEq, Hash)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub struct IpcScanOptions { - /// Not used anymore. 
- pub memory_map: bool, -} +pub struct IpcScanOptions; /// Read Arrows IPC format into a DataFrame /// diff --git a/crates/polars-lazy/src/scan/ipc.rs b/crates/polars-lazy/src/scan/ipc.rs index 8d84ef3de049..a9f8c8b98b0f 100644 --- a/crates/polars-lazy/src/scan/ipc.rs +++ b/crates/polars-lazy/src/scan/ipc.rs @@ -13,7 +13,6 @@ pub struct ScanArgsIpc { pub cache: bool, pub rechunk: bool, pub row_index: Option, - pub memory_map: bool, pub cloud_options: Option, pub hive_options: HiveOptions, pub include_file_paths: Option, @@ -26,7 +25,6 @@ impl Default for ScanArgsIpc { cache: true, rechunk: false, row_index: None, - memory_map: true, cloud_options: Default::default(), hive_options: Default::default(), include_file_paths: None, @@ -53,9 +51,7 @@ impl LazyFileListReader for LazyIpcReader { fn finish(self) -> PolarsResult { let args = self.args; - let options = IpcScanOptions { - memory_map: args.memory_map, - }; + let options = IpcScanOptions {}; let mut lf: LazyFrame = DslBuilder::scan_ipc( self.sources.to_dsl(false), diff --git a/crates/polars-python/src/lazyframe/general.rs b/crates/polars-python/src/lazyframe/general.rs index 362d114817b5..86bcd3c2566b 100644 --- a/crates/polars-python/src/lazyframe/general.rs +++ b/crates/polars-python/src/lazyframe/general.rs @@ -310,7 +310,7 @@ impl PyLazyFrame { #[cfg(feature = "ipc")] #[staticmethod] - #[pyo3(signature = (source, sources, n_rows, cache, rechunk, row_index, memory_map, cloud_options, hive_partitioning, hive_schema, try_parse_hive_dates, retries, file_cache_ttl, include_file_paths))] + #[pyo3(signature = (source, sources, n_rows, cache, rechunk, row_index, cloud_options, hive_partitioning, hive_schema, try_parse_hive_dates, retries, file_cache_ttl, include_file_paths))] fn new_from_ipc( source: Option, sources: Wrap, @@ -318,7 +318,6 @@ impl PyLazyFrame { cache: bool, rechunk: bool, row_index: Option<(String, IdxSize)>, - memory_map: bool, cloud_options: Option>, hive_partitioning: Option, hive_schema: Option>, @@ -344,7 +343,6 @@ impl PyLazyFrame { cache, rechunk, row_index, - memory_map, #[cfg(feature = "cloud")] cloud_options: None, hive_options, diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index 17ee17d6843b..43fbc8136de2 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -114,7 +114,6 @@ def read_ipc( lf = scan_ipc( source, n_rows=n_rows, - memory_map=memory_map, storage_options=storage_options, row_index_name=row_index_name, row_index_offset=row_index_offset, @@ -189,7 +188,6 @@ def _read_ipc_impl( rechunk=rechunk, row_index_name=row_index_name, row_index_offset=row_index_offset, - memory_map=memory_map, ) if columns is None: df = scan.collect() @@ -448,6 +446,9 @@ def scan_ipc( source = None # type: ignore[assignment] + # Memory Mapping is now a no-op + _ = memory_map + pylf = PyLazyFrame.new_from_ipc( source, sources, @@ -455,7 +456,6 @@ def scan_ipc( cache, rechunk, parse_row_index_args(row_index_name, row_index_offset), - memory_map=memory_map, cloud_options=storage_options, retries=retries, file_cache_ttl=file_cache_ttl, From 1e2fa0dfbce927be739bfb51e1e46ed24fb0ae3e Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Mon, 9 Sep 2024 09:21:10 +0200 Subject: [PATCH 27/27] fix test --- crates/polars-lazy/src/tests/io.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/polars-lazy/src/tests/io.rs b/crates/polars-lazy/src/tests/io.rs index 57beafc63033..a1d3f2c050a8 100644 --- a/crates/polars-lazy/src/tests/io.rs +++ 
b/crates/polars-lazy/src/tests/io.rs @@ -417,7 +417,6 @@ fn test_ipc_globbing() -> PolarsResult<()> { cache: true, rechunk: false, row_index: None, - memory_map: true, cloud_options: None, hive_options: Default::default(), include_file_paths: None,
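
The patches above move the memory-map bookkeeping into polars-utils and route the scan executors through MemSlice::from_file. Below is a minimal usage sketch of that relocated API, assuming a Unix target (where the write guard is active) and a hypothetical local file path; it is not taken from the patches themselves, but only calls functions they introduce: MMapSemaphore::new_from_file, MemSlice::from_mmap, MemSlice::from_file and ensure_not_mapped.

    use std::fs::File;
    use std::sync::Arc;

    use polars_error::PolarsResult;
    use polars_utils::mmap::{ensure_not_mapped, MMapSemaphore, MemSlice};

    fn mmap_scan_sketch() -> PolarsResult<()> {
        // Hypothetical input path; any readable local file works.
        let path = "examples/datasets/foods1.csv";

        // Mapping a file registers its (dev, ino) pair so concurrent writers can be rejected.
        let file = File::open(path)?;
        let semaphore = MMapSemaphore::new_from_file(&file)?;
        let slice = MemSlice::from_mmap(Arc::new(semaphore));
        assert!(!slice.is_empty());

        // While a mapping is alive, writing to the same file is refused (Unix only).
        let _ = ensure_not_mapped(&File::open(path)?);

        // One-step variant used by the scan executors above.
        let slice2 = MemSlice::from_file(&File::open(path)?)?;
        assert_eq!(slice.len(), slice2.len());

        Ok(())
    }

Dropping the MemSlice releases the mapping and decrements the per-(dev, ino) count kept in MEMORY_MAPPED_FILES, after which ensure_not_mapped accepts the file again.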