From 5c29bd73d4eda7446d29b9a6736e79ea1184e280 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Tue, 3 Sep 2024 15:08:26 +0200 Subject: [PATCH 01/27] feat: Allow BytesIO for Parquet scan --- crates/polars-io/src/path_utils/mod.rs | 6 +- crates/polars-lazy/src/lib.rs | 1 + crates/polars-lazy/src/scan/csv.rs | 91 +++-- .../polars-lazy/src/scan/file_list_reader.rs | 22 +- crates/polars-lazy/src/scan/ipc.rs | 19 +- crates/polars-lazy/src/scan/ndjson.rs | 22 +- crates/polars-lazy/src/scan/parquet.rs | 26 +- .../src/executors/scan/csv.rs | 23 +- .../src/executors/scan/ipc.rs | 30 +- .../src/executors/scan/ndjson.rs | 17 +- .../src/executors/scan/parquet.rs | 323 +++++++++--------- crates/polars-mem-engine/src/planner/lp.rs | 10 +- crates/polars-mem-engine/src/utils.rs | 4 +- .../polars-pipe/src/executors/sources/csv.rs | 13 +- .../src/executors/sources/parquet.rs | 12 +- crates/polars-pipe/src/pipeline/convert.rs | 8 +- crates/polars-plan/src/client/check.rs | 33 +- crates/polars-plan/src/plans/builder_dsl.rs | 31 +- .../src/plans/conversion/dsl_to_ir.rs | 134 +++++--- .../polars-plan/src/plans/conversion/mod.rs | 6 +- .../polars-plan/src/plans/conversion/scans.rs | 97 ++++-- crates/polars-plan/src/plans/ir/dot.rs | 3 +- crates/polars-plan/src/plans/ir/format.rs | 3 +- crates/polars-plan/src/plans/ir/inputs.rs | 4 +- crates/polars-plan/src/plans/ir/mod.rs | 176 +++++++++- crates/polars-plan/src/plans/mod.rs | 19 +- .../src/plans/optimizer/count_star.rs | 5 +- .../plans/optimizer/predicate_pushdown/mod.rs | 9 +- .../optimizer/projection_pushdown/mod.rs | 4 +- .../src/plans/optimizer/slice_pushdown_lp.rs | 12 +- crates/polars-plan/src/plans/visitor/hash.rs | 10 +- crates/polars-python/src/file.rs | 64 ++++ crates/polars-python/src/lazyframe/general.rs | 53 ++- .../src/lazyframe/visitor/nodes.rs | 4 +- .../polars-stream/src/nodes/parquet_source.rs | 6 +- .../src/physical_plan/lower_ir.rs | 4 +- crates/polars-stream/src/physical_plan/mod.rs | 2 +- .../src/utils/late_materialized_df.rs | 4 +- py-polars/polars/io/parquet/functions.py | 6 +- py-polars/tests/unit/io/test_parquet.py | 176 +++------- 40 files changed, 908 insertions(+), 584 deletions(-) diff --git a/crates/polars-io/src/path_utils/mod.rs b/crates/polars-io/src/path_utils/mod.rs index 5c4e48f7e6e4..d98034f6096c 100644 --- a/crates/polars-io/src/path_utils/mod.rs +++ b/crates/polars-io/src/path_utils/mod.rs @@ -88,7 +88,7 @@ pub fn expand_paths( paths: &[PathBuf], glob: bool, #[allow(unused_variables)] cloud_options: Option<&CloudOptions>, -) -> PolarsResult>> { +) -> PolarsResult> { expand_paths_hive(paths, glob, cloud_options, false).map(|x| x.0) } @@ -129,7 +129,7 @@ pub fn expand_paths_hive( glob: bool, #[allow(unused_variables)] cloud_options: Option<&CloudOptions>, check_directory_level: bool, -) -> PolarsResult<(Arc>, usize)> { +) -> PolarsResult<(Arc<[PathBuf]>, usize)> { let Some(first_path) = paths.first() else { return Ok((vec![].into(), 0)); }; @@ -361,7 +361,7 @@ pub fn expand_paths_hive( out_paths }; - Ok((Arc::new(out_paths), hive_idx_tracker.idx)) + Ok((out_paths.into(), hive_idx_tracker.idx)) } /// Ignores errors from `std::fs::create_dir_all` if the directory exists. 
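A minimal usage sketch (not part of the diff) of the behaviour this patch series works toward: scanning Parquet directly from an in-memory buffer on the Python side. It assumes `pl.scan_parquet` ends up accepting a `BytesIO` object once the py-polars changes further down in this patch land; treat it as illustrative rather than the final API.

import io

import polars as pl

# Write a small frame to an in-memory Parquet buffer.
df = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
buf = io.BytesIO()
df.write_parquet(buf)
buf.seek(0)

# Scan the buffer lazily instead of a file path (what this change enables),
# then run an ordinary lazy query against it.
out = pl.scan_parquet(buf).filter(pl.col("a") > 1).collect()
print(out)

On the Rust side this corresponds to the new `ScanSource::Buffer` variant introduced below, which the scan builders carry instead of a plain `Arc<[PathBuf]>`.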
diff --git a/crates/polars-lazy/src/lib.rs b/crates/polars-lazy/src/lib.rs index 024f2a26bffb..005a09186ba2 100644 --- a/crates/polars-lazy/src/lib.rs +++ b/crates/polars-lazy/src/lib.rs @@ -206,6 +206,7 @@ pub mod dsl; pub mod frame; pub mod physical_plan; pub mod prelude; + mod scan; #[cfg(test)] mod tests; diff --git a/crates/polars-lazy/src/scan/csv.rs b/crates/polars-lazy/src/scan/csv.rs index 54e9c77e2480..676c34b6a71e 100644 --- a/crates/polars-lazy/src/scan/csv.rs +++ b/crates/polars-lazy/src/scan/csv.rs @@ -5,6 +5,7 @@ use polars_io::cloud::CloudOptions; use polars_io::csv::read::{ infer_file_schema, CommentPrefix, CsvEncoding, CsvParseOptions, CsvReadOptions, NullValues, }; +use polars_io::mmap::ReaderBytes; use polars_io::path_utils::expand_paths; use polars_io::utils::get_reader_bytes; use polars_io::RowIndex; @@ -14,7 +15,7 @@ use crate::prelude::*; #[derive(Clone)] #[cfg(feature = "csv")] pub struct LazyCsvReader { - paths: Arc>, + source: ScanSource, glob: bool, cache: bool, read_options: CsvReadOptions, @@ -30,13 +31,13 @@ impl LazyCsvReader { self } - pub fn new_paths(paths: Arc>) -> Self { + pub fn new_paths(paths: Arc<[PathBuf]>) -> Self { Self::new("").with_paths(paths) } pub fn new(path: impl AsRef) -> Self { LazyCsvReader { - paths: Arc::new(vec![path.as_ref().to_path_buf()]), + source: ScanSource::Files([path.as_ref().to_path_buf()].into()), glob: true, cache: true, read_options: Default::default(), @@ -219,38 +220,54 @@ impl LazyCsvReader { where F: Fn(Schema) -> PolarsResult, { - // TODO: Path expansion should happen when converting to the IR - // https://github.com/pola-rs/polars/issues/17634 - let paths = expand_paths(self.paths(), self.glob(), self.cloud_options())?; + let mut n_threads = self.read_options.n_threads; + + let mut infer_schema = |reader_bytes: ReaderBytes| { + let skip_rows = self.read_options.skip_rows; + let parse_options = self.read_options.get_parse_options(); + + PolarsResult::Ok( + infer_file_schema( + &reader_bytes, + parse_options.separator, + self.read_options.infer_schema_length, + self.read_options.has_header, + // we set it to None and modify them after the schema is updated + None, + skip_rows, + self.read_options.skip_rows_after_header, + parse_options.comment_prefix.as_ref(), + parse_options.quote_char, + parse_options.eol_char, + None, + parse_options.try_parse_dates, + self.read_options.raise_if_empty, + &mut n_threads, + parse_options.decimal_comma, + )? + .0, + ) + }; - let Some(path) = paths.first() else { - polars_bail!(ComputeError: "no paths specified for this reader"); + let schema = match self.source.clone() { + ScanSource::Files(paths) => { + // TODO: Path expansion should happen when converting to the IR + // https://github.com/pola-rs/polars/issues/17634 + let paths = expand_paths(&paths[..], self.glob(), self.cloud_options())?; + + let Some(path) = paths.first() else { + polars_bail!(ComputeError: "no paths specified for this reader"); + }; + + let mut file = polars_utils::open_file(path)?; + infer_schema(get_reader_bytes(&mut file).expect("could not mmap file"))? 
+ }, + ScanSource::Buffer(buffer) => infer_schema( + get_reader_bytes(&mut std::io::Cursor::new(buffer)).expect("could not mmap file"), + )?, }; - let mut file = polars_utils::open_file(path)?; - - let reader_bytes = get_reader_bytes(&mut file).expect("could not mmap file"); - let skip_rows = self.read_options.skip_rows; - let parse_options = self.read_options.get_parse_options(); - - let (schema, _, _) = infer_file_schema( - &reader_bytes, - parse_options.separator, - self.read_options.infer_schema_length, - self.read_options.has_header, - // we set it to None and modify them after the schema is updated - None, - skip_rows, - self.read_options.skip_rows_after_header, - parse_options.comment_prefix.as_ref(), - parse_options.quote_char, - parse_options.eol_char, - None, - parse_options.try_parse_dates, - self.read_options.raise_if_empty, - &mut self.read_options.n_threads, - parse_options.decimal_comma, - )?; + self.read_options.n_threads = n_threads; let mut schema = f(schema)?; // the dtypes set may be for the new names, so update again @@ -273,7 +290,7 @@ impl LazyFileListReader for LazyCsvReader { /// Get the final [LazyFrame]. fn finish(self) -> PolarsResult { let mut lf: LazyFrame = DslBuilder::scan_csv( - self.paths, + self.source.to_dsl(false), self.read_options, self.cache, self.cloud_options, @@ -294,12 +311,12 @@ impl LazyFileListReader for LazyCsvReader { self.glob } - fn paths(&self) -> &[PathBuf] { - &self.paths + fn source(&self) -> &ScanSource { + &self.source } - fn with_paths(mut self, paths: Arc>) -> Self { - self.paths = paths; + fn with_source(mut self, source: ScanSource) -> Self { + self.source = source; self } diff --git a/crates/polars-lazy/src/scan/file_list_reader.rs b/crates/polars-lazy/src/scan/file_list_reader.rs index f7b91d427200..8992b8df5a65 100644 --- a/crates/polars-lazy/src/scan/file_list_reader.rs +++ b/crates/polars-lazy/src/scan/file_list_reader.rs @@ -1,4 +1,5 @@ use std::path::PathBuf; +use std::sync::Arc; use polars_core::prelude::*; use polars_io::cloud::CloudOptions; @@ -18,8 +19,11 @@ pub trait LazyFileListReader: Clone { return self.finish_no_glob(); } - let lfs = self - .paths() + let ScanSource::Files(paths) = self.source() else { + unreachable!("Should never be globbed"); + }; + + let lfs = paths .iter() .map(|path| { self.clone() @@ -27,7 +31,7 @@ pub trait LazyFileListReader: Clone { .with_n_rows(None) // Each individual reader should not apply a row index. .with_row_index(None) - .with_paths(Arc::new(vec![path.clone()])) + .with_paths([path.clone()].into()) .with_rechunk(false) .finish_no_glob() .map_err(|e| { @@ -40,7 +44,7 @@ pub trait LazyFileListReader: Clone { polars_ensure!( !lfs.is_empty(), - ComputeError: "no matching files found in {:?}", self.paths().iter().map(|x| x.to_str().unwrap()).collect::>() + ComputeError: "no matching files found in {:?}", paths.iter().map(|x| x.to_str().unwrap()).collect::>() ); let mut lf = self.concat_impl(lfs)?; @@ -79,11 +83,17 @@ pub trait LazyFileListReader: Clone { true } - fn paths(&self) -> &[PathBuf]; + fn source(&self) -> &ScanSource; + + /// Set paths of the scanned files. + #[must_use] + fn with_source(self, source: ScanSource) -> Self; /// Set paths of the scanned files. #[must_use] - fn with_paths(self, paths: Arc>) -> Self; + fn with_paths(self, paths: Arc<[PathBuf]>) -> Self { + self.with_source(ScanSource::Files(paths)) + } /// Configure the row limit. 
fn with_n_rows(self, n_rows: impl Into>) -> Self; diff --git a/crates/polars-lazy/src/scan/ipc.rs b/crates/polars-lazy/src/scan/ipc.rs index 9d981bc74c0e..af0b53ade823 100644 --- a/crates/polars-lazy/src/scan/ipc.rs +++ b/crates/polars-lazy/src/scan/ipc.rs @@ -37,21 +37,20 @@ impl Default for ScanArgsIpc { #[derive(Clone)] struct LazyIpcReader { args: ScanArgsIpc, - paths: Arc>, + source: ScanSource, } impl LazyIpcReader { fn new(args: ScanArgsIpc) -> Self { Self { args, - paths: Arc::new(vec![]), + source: ScanSource::default(), } } } impl LazyFileListReader for LazyIpcReader { fn finish(self) -> PolarsResult { - let paths = self.paths; let args = self.args; let options = IpcScanOptions { @@ -59,7 +58,7 @@ impl LazyFileListReader for LazyIpcReader { }; let mut lf: LazyFrame = DslBuilder::scan_ipc( - paths, + self.source.to_dsl(false), options, args.n_rows, args.cache, @@ -80,12 +79,12 @@ impl LazyFileListReader for LazyIpcReader { unreachable!() } - fn paths(&self) -> &[PathBuf] { - &self.paths + fn source(&self) -> &ScanSource { + &self.source } - fn with_paths(mut self, paths: Arc>) -> Self { - self.paths = paths; + fn with_source(mut self, source: ScanSource) -> Self { + self.source = source; self } @@ -126,11 +125,11 @@ impl LazyFrame { /// Create a LazyFrame directly from a ipc scan. pub fn scan_ipc(path: impl AsRef, args: ScanArgsIpc) -> PolarsResult { LazyIpcReader::new(args) - .with_paths(Arc::new(vec![path.as_ref().to_path_buf()])) + .with_paths([path.as_ref().to_path_buf()].into()) .finish() } - pub fn scan_ipc_files(paths: Arc>, args: ScanArgsIpc) -> PolarsResult { + pub fn scan_ipc_files(paths: Arc<[PathBuf]>, args: ScanArgsIpc) -> PolarsResult { LazyIpcReader::new(args).with_paths(paths).finish() } } diff --git a/crates/polars-lazy/src/scan/ndjson.rs b/crates/polars-lazy/src/scan/ndjson.rs index 0effd26d5497..9a1d071f8365 100644 --- a/crates/polars-lazy/src/scan/ndjson.rs +++ b/crates/polars-lazy/src/scan/ndjson.rs @@ -1,11 +1,11 @@ use std::num::NonZeroUsize; use std::path::{Path, PathBuf}; -use std::sync::{Arc, Mutex, RwLock}; +use std::sync::{Arc, RwLock}; use polars_core::prelude::*; use polars_io::cloud::CloudOptions; use polars_io::RowIndex; -use polars_plan::plans::{DslPlan, FileScan}; +use polars_plan::plans::{DslPlan, FileScan, ScanSource}; use polars_plan::prelude::{FileScanOptions, NDJsonReadOptions}; use crate::prelude::LazyFrame; @@ -13,7 +13,7 @@ use crate::scan::file_list_reader::LazyFileListReader; #[derive(Clone)] pub struct LazyJsonLineReader { - pub(crate) paths: Arc>, + pub(crate) source: ScanSource, pub(crate) batch_size: Option, pub(crate) low_memory: bool, pub(crate) rechunk: bool, @@ -28,13 +28,13 @@ pub struct LazyJsonLineReader { } impl LazyJsonLineReader { - pub fn new_paths(paths: Arc>) -> Self { + pub fn new_paths(paths: Arc<[PathBuf]>) -> Self { Self::new(PathBuf::new()).with_paths(paths) } pub fn new(path: impl AsRef) -> Self { LazyJsonLineReader { - paths: Arc::new(vec![path.as_ref().to_path_buf()]), + source: ScanSource::Files([path.as_ref().to_path_buf()].into()), batch_size: None, low_memory: false, rechunk: false, @@ -117,8 +117,6 @@ impl LazyJsonLineReader { impl LazyFileListReader for LazyJsonLineReader { fn finish(self) -> PolarsResult { - let paths = Arc::new(Mutex::new((self.paths, false))); - let file_options = FileScanOptions { slice: self.n_rows.map(|x| (0, x)), with_columns: None, @@ -147,7 +145,7 @@ impl LazyFileListReader for LazyJsonLineReader { }; Ok(LazyFrame::from(DslPlan::Scan { - paths, + sources: self.source.to_dsl(false), 
file_info: Arc::new(RwLock::new(None)), hive_parts: None, predicate: None, @@ -160,12 +158,12 @@ impl LazyFileListReader for LazyJsonLineReader { unreachable!(); } - fn paths(&self) -> &[PathBuf] { - &self.paths + fn source(&self) -> &ScanSource { + &self.source } - fn with_paths(mut self, paths: Arc>) -> Self { - self.paths = paths; + fn with_source(mut self, source: ScanSource) -> Self { + self.source = source; self } diff --git a/crates/polars-lazy/src/scan/parquet.rs b/crates/polars-lazy/src/scan/parquet.rs index e87e90e3330a..491ae3ee126c 100644 --- a/crates/polars-lazy/src/scan/parquet.rs +++ b/crates/polars-lazy/src/scan/parquet.rs @@ -44,14 +44,14 @@ impl Default for ScanArgsParquet { #[derive(Clone)] struct LazyParquetReader { args: ScanArgsParquet, - paths: Arc>, + source: ScanSource, } impl LazyParquetReader { fn new(args: ScanArgsParquet) -> Self { Self { args, - paths: Arc::new(vec![]), + source: ScanSource::default(), } } } @@ -62,7 +62,7 @@ impl LazyFileListReader for LazyParquetReader { let row_index = self.args.row_index; let mut lf: LazyFrame = DslBuilder::scan_parquet( - self.paths, + self.source.to_dsl(false), self.args.n_rows, self.args.cache, self.args.parallel, @@ -95,12 +95,12 @@ impl LazyFileListReader for LazyParquetReader { unreachable!(); } - fn paths(&self) -> &[PathBuf] { - &self.paths + fn source(&self) -> &ScanSource { + &self.source } - fn with_paths(mut self, paths: Arc>) -> Self { - self.paths = paths; + fn with_source(mut self, source: ScanSource) -> Self { + self.source = source; self } @@ -140,15 +140,17 @@ impl LazyFrame { /// Create a LazyFrame directly from a parquet scan. pub fn scan_parquet(path: impl AsRef, args: ScanArgsParquet) -> PolarsResult { LazyParquetReader::new(args) - .with_paths(Arc::new(vec![path.as_ref().to_path_buf()])) + .with_paths(vec![path.as_ref().to_path_buf()].into()) .finish() } /// Create a LazyFrame directly from a parquet scan. - pub fn scan_parquet_files( - paths: Arc>, - args: ScanArgsParquet, - ) -> PolarsResult { + pub fn scan_parquet_sourced(source: ScanSource, args: ScanArgsParquet) -> PolarsResult { + LazyParquetReader::new(args).with_source(source).finish() + } + + /// Create a LazyFrame directly from a parquet scan. + pub fn scan_parquet_files(paths: Arc<[PathBuf]>, args: ScanArgsParquet) -> PolarsResult { LazyParquetReader::new(args).with_paths(paths).finish() } } diff --git a/crates/polars-mem-engine/src/executors/scan/csv.rs b/crates/polars-mem-engine/src/executors/scan/csv.rs index 50ed974e128b..24e813329bcf 100644 --- a/crates/polars-mem-engine/src/executors/scan/csv.rs +++ b/crates/polars-mem-engine/src/executors/scan/csv.rs @@ -1,4 +1,3 @@ -use std::path::PathBuf; use std::sync::Arc; use polars_core::config; @@ -9,7 +8,7 @@ use polars_core::utils::{ use super::*; pub struct CsvExec { - pub paths: Arc>, + pub sources: ScanSource, pub file_info: FileInfo, pub options: CsvReadOptions, pub file_options: FileScanOptions, @@ -18,6 +17,7 @@ pub struct CsvExec { impl CsvExec { fn read(&self) -> PolarsResult { + let paths = self.sources.as_paths(); let with_columns = self .file_options .with_columns @@ -45,7 +45,7 @@ impl CsvExec { .with_row_index(None) .with_path::<&str>(None); - if self.paths.is_empty() { + if paths.is_empty() { let out = if let Some(schema) = options_base.schema { DataFrame::from_rows_and_schema(&[], schema.as_ref())? 
} else { @@ -56,7 +56,7 @@ impl CsvExec { let verbose = config::verbose(); let force_async = config::force_async(); - let run_async = force_async || is_cloud_url(self.paths.first().unwrap()); + let run_async = force_async || is_cloud_url(paths.first().unwrap()); if force_async && verbose { eprintln!("ASYNC READING FORCED"); @@ -64,7 +64,7 @@ impl CsvExec { let finish_read = |i: usize, options: CsvReadOptions, predicate: Option>| { - let path = &self.paths[i]; + let path = &paths[i]; let mut df = if run_async { #[cfg(feature = "cloud")] { @@ -123,14 +123,14 @@ impl CsvExec { } let mut n_rows_read = 0usize; - let mut out = Vec::with_capacity(self.paths.len()); + let mut out = Vec::with_capacity(paths.len()); // If we have n_rows or row_index then we need to count how many rows we read, so we need // to delay applying the predicate. let predicate_during_read = predicate .clone() .filter(|_| n_rows.is_none() && self.file_options.row_index.is_none()); - for i in 0..self.paths.len() { + for i in 0..paths.len() { let opts = options_base .clone() .with_row_index(self.file_options.row_index.clone().map(|mut ri| { @@ -178,7 +178,7 @@ impl CsvExec { "reached n_rows = {} at file {} / {}", n_rows.unwrap(), 1 + i, - self.paths.len() + paths.len() ) } break; @@ -203,10 +203,10 @@ impl CsvExec { let dfs = POOL.install(|| { let step = std::cmp::min(POOL.current_num_threads(), 128); - (0..self.paths.len()) + (0..paths.len()) .step_by(step) .map(|start| { - (start..std::cmp::min(start.saturating_add(step), self.paths.len())) + (start..std::cmp::min(start.saturating_add(step), paths.len())) .into_par_iter() .map(|i| finish_read(i, options_base.clone(), predicate.clone())) .collect::>>() @@ -234,9 +234,10 @@ impl CsvExec { impl Executor for CsvExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { + let paths = self.sources.as_paths(); let profile_name = if state.has_node_timer() { let mut ids = vec![PlSmallStr::from_str( - self.paths[0].to_string_lossy().as_ref(), + paths[0].to_string_lossy().as_ref(), )]; if self.predicate.is_some() { ids.push("predicate".into()) diff --git a/crates/polars-mem-engine/src/executors/scan/ipc.rs b/crates/polars-mem-engine/src/executors/scan/ipc.rs index 18d47c172bcd..b29e44a5e33c 100644 --- a/crates/polars-mem-engine/src/executors/scan/ipc.rs +++ b/crates/polars-mem-engine/src/executors/scan/ipc.rs @@ -1,4 +1,3 @@ -use std::path::PathBuf; use hive::HivePartitions; use polars_core::config; @@ -11,7 +10,7 @@ use rayon::prelude::*; use super::*; pub struct IpcExec { - pub(crate) paths: Arc>, + pub(crate) sources: ScanSource, pub(crate) file_info: FileInfo, pub(crate) predicate: Option>, pub(crate) options: IpcScanOptions, @@ -22,7 +21,8 @@ pub struct IpcExec { impl IpcExec { fn read(&mut self) -> PolarsResult { - let is_cloud = self.paths.iter().any(is_cloud_url); + let paths = self.sources.as_paths(); + let is_cloud = paths.iter().any(is_cloud_url); let force_async = config::force_async(); let mut out = if is_cloud || force_async { @@ -54,6 +54,7 @@ impl IpcExec { &mut self, path_idx_to_file: F, ) -> PolarsResult { + let paths = self.sources.as_paths(); if config::verbose() { eprintln!("executing ipc read sync with row_index = {:?}, n_rows = {:?}, predicate = {:?} for paths {:?}", self.file_options.row_index.as_ref(), @@ -62,7 +63,7 @@ impl IpcExec { x.1 }).as_ref(), self.predicate.is_some(), - self.paths + paths ); } @@ -86,13 +87,13 @@ impl IpcExec { .with_include_file_path(self.file_options.include_file_paths.as_ref().map(|x| { ( x.clone(), - 
Arc::from(self.paths[path_index].to_str().unwrap().to_string()), + Arc::from(paths[path_index].to_str().unwrap().to_string()), ) })) .memory_mapped( self.options .memory_map - .then(|| self.paths[path_index].clone()), + .then(|| paths[path_index].clone()), ) .finish() }; @@ -101,9 +102,9 @@ impl IpcExec { assert_eq!(x.0, 0); x.1 }) { - let mut out = Vec::with_capacity(self.paths.len()); + let mut out = Vec::with_capacity(paths.len()); - for i in 0..self.paths.len() { + for i in 0..paths.len() { let df = read_path(i, Some(n_rows))?; let df_height = df.height(); out.push(df); @@ -121,7 +122,7 @@ impl IpcExec { out } else { POOL.install(|| { - (0..self.paths.len()) + (0..paths.len()) .into_par_iter() .map(|i| read_path(i, None)) .collect::>>() @@ -157,7 +158,8 @@ impl IpcExec { } fn read_sync(&mut self) -> PolarsResult { - let paths = self.paths.clone(); + let paths = self.sources.into_paths(); + let paths = paths.clone(); self.read_impl(move |i| std::fs::File::open(&paths[i]).map_err(Into::into)) } @@ -167,9 +169,11 @@ impl IpcExec { // concurrently. use polars_io::file_cache::init_entries_from_uri_list; + let paths = self.sources.into_paths(); + tokio::task::block_in_place(|| { let cache_entries = init_entries_from_uri_list( - self.paths + paths .iter() .map(|x| Arc::from(x.to_str().unwrap())) .collect::>() @@ -184,9 +188,11 @@ impl IpcExec { impl Executor for IpcExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { + let paths = self.sources.as_paths(); + let profile_name = if state.has_node_timer() { let mut ids = vec![PlSmallStr::from_str( - self.paths[0].to_string_lossy().as_ref(), + paths[0].to_string_lossy().as_ref(), )]; if self.predicate.is_some() { ids.push("predicate".into()) diff --git a/crates/polars-mem-engine/src/executors/scan/ndjson.rs b/crates/polars-mem-engine/src/executors/scan/ndjson.rs index 680e5cbf3bed..68ad24ab837e 100644 --- a/crates/polars-mem-engine/src/executors/scan/ndjson.rs +++ b/crates/polars-mem-engine/src/executors/scan/ndjson.rs @@ -1,12 +1,10 @@ -use std::path::PathBuf; - use polars_core::config; use polars_core::utils::accumulate_dataframes_vertical; use super::*; pub struct JsonExec { - paths: Arc>, + sources: ScanSource, options: NDJsonReadOptions, file_scan_options: FileScanOptions, file_info: FileInfo, @@ -15,14 +13,14 @@ pub struct JsonExec { impl JsonExec { pub fn new( - paths: Arc>, + sources: ScanSource, options: NDJsonReadOptions, file_scan_options: FileScanOptions, file_info: FileInfo, predicate: Option>, ) -> Self { Self { - paths, + sources, options, file_scan_options, file_info, @@ -38,10 +36,11 @@ impl JsonExec { .unwrap() .as_ref() .unwrap_right(); + let paths = self.sources.as_paths(); let verbose = config::verbose(); let force_async = config::force_async(); - let run_async = force_async || is_cloud_url(self.paths.first().unwrap()); + let run_async = force_async || is_cloud_url(paths.first().unwrap()); if force_async && verbose { eprintln!("ASYNC READING FORCED"); @@ -66,8 +65,7 @@ impl JsonExec { return Ok(df); } - let dfs = self - .paths + let dfs = paths .iter() .map_while(|p| { if n_rows == Some(0) { @@ -149,8 +147,9 @@ impl JsonExec { impl Executor for JsonExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { + let paths = self.sources.as_paths(); let profile_name = if state.has_node_timer() { - let ids = vec![self.paths[0].to_string_lossy().clone()]; + let ids = vec![paths[0].to_string_lossy().clone()]; let name = comma_delimited("ndjson".to_string(), &ids); Cow::Owned(name) } else { diff 
--git a/crates/polars-mem-engine/src/executors/scan/parquet.rs b/crates/polars-mem-engine/src/executors/scan/parquet.rs index bd3d87ff8832..efed503ad511 100644 --- a/crates/polars-mem-engine/src/executors/scan/parquet.rs +++ b/crates/polars-mem-engine/src/executors/scan/parquet.rs @@ -1,10 +1,9 @@ -use std::path::PathBuf; - use hive::HivePartitions; use polars_core::config; #[cfg(feature = "cloud")] use polars_core::config::{get_file_prefetch_size, verbose}; use polars_core::utils::accumulate_dataframes_vertical; +use polars_error::feature_gated; use polars_io::cloud::CloudOptions; use polars_io::parquet::metadata::FileMetaDataRef; use polars_io::path_utils::is_cloud_url; @@ -14,7 +13,7 @@ use polars_io::RowIndex; use super::*; pub struct ParquetExec { - paths: Arc>, + sources: ScanSource, file_info: FileInfo, hive_parts: Option>>, predicate: Option>, @@ -29,7 +28,7 @@ pub struct ParquetExec { impl ParquetExec { #[allow(clippy::too_many_arguments)] pub(crate) fn new( - paths: Arc>, + sources: ScanSource, file_info: FileInfo, hive_parts: Option>>, predicate: Option>, @@ -39,7 +38,7 @@ impl ParquetExec { metadata: Option, ) -> Self { ParquetExec { - paths, + sources, file_info, hive_parts, predicate, @@ -52,7 +51,7 @@ impl ParquetExec { fn read_par(&mut self) -> PolarsResult> { let parallel = match self.options.parallel { - ParallelStrategy::Auto if self.paths.len() > POOL.current_num_threads() => { + ParallelStrategy::Auto if self.sources.num_sources() > POOL.current_num_threads() => { ParallelStrategy::RowGroups }, identity => identity, @@ -61,166 +60,159 @@ impl ParquetExec { let mut result = vec![]; let step = std::cmp::min(POOL.current_num_threads(), 128); - // Modified if we have a negative slice - let mut first_file = 0; - - // (offset, end) - let (slice_offset, slice_end) = if let Some(slice) = self.file_options.slice { - if slice.0 >= 0 { - (slice.0 as usize, slice.1.saturating_add(slice.0 as usize)) - } else { - // Walk the files in reverse until we find the first file, and then translate the - // slice into a positive-offset equivalent. - let slice_start_as_n_from_end = -slice.0 as usize; - let mut cum_rows = 0; - let chunk_size = 8; - POOL.install(|| { - for path_indexes in (0..self.paths.len()) - .rev() - .collect::>() - .chunks(chunk_size) - { - let row_counts = path_indexes - .into_par_iter() - .map(|i| { - ParquetReader::new(std::fs::File::open(&self.paths[*i])?).num_rows() - }) - .collect::>>()?; - - for (path_idx, rc) in path_indexes.iter().zip(row_counts) { - cum_rows += rc; - - if cum_rows >= slice_start_as_n_from_end { - first_file = *path_idx; - break; - } - } - - if first_file > 0 { - break; - } - } - - PolarsResult::Ok(()) - })?; - - let (start, len) = if slice_start_as_n_from_end > cum_rows { - // We need to trim the slice, e.g. SLICE[offset: -100, len: 75] on a file of 50 - // rows should only give the first 25 rows. 
- let first_file_position = slice_start_as_n_from_end - cum_rows; - (0, slice.1.saturating_sub(first_file_position)) - } else { - (cum_rows - slice_start_as_n_from_end, slice.1) - }; - - let end = start.saturating_add(len); - - (start, end) - } - } else { - (0, usize::MAX) + let slice_info = match self.file_options.slice { + None => ScanSourceSliceInfo { + item_slice: 0..usize::MAX, + source_slice: 0..self.sources.num_sources(), + }, + Some(slice) => self.sources.collect_slice_information( + slice, + |path| ParquetReader::new(std::fs::File::open(path)?).num_rows(), + |buff| ParquetReader::new(std::io::Cursor::new(buff)).num_rows(), + )?, }; - let mut current_offset = 0; - let base_row_index = self.file_options.row_index.take(); - // Limit no. of files at a time to prevent open file limits. - - for i in (first_file..self.paths.len()).step_by(step) { - let end = std::cmp::min(i.saturating_add(step), self.paths.len()); - let paths = &self.paths[i..end]; - let hive_parts = self.hive_parts.as_ref().map(|x| &x[i..end]); - - if current_offset >= slice_end && !result.is_empty() { - return Ok(result); - } - - // First initialize the readers, predicates and metadata. - // This will be used to determine the slices. That way we can actually read all the - // files in parallel even if we add row index columns or slices. - let iter = (0..paths.len()).into_par_iter().map(|i| { - let path = &paths[i]; - let hive_partitions = hive_parts.map(|x| x[i].materialize_partition_columns()); - - let file = std::fs::File::open(path)?; + match &self.sources { + ScanSource::Buffer(buffer) => { + let row_index = self.file_options.row_index.take(); let (projection, predicate) = prepare_scan_args( self.predicate.clone(), &mut self.file_options.with_columns.clone(), &mut self.file_info.schema.clone(), - base_row_index.is_some(), - hive_partitions.as_deref(), + row_index.is_some(), + None, ); - let mut reader = ParquetReader::new(file) + result = vec![ParquetReader::new(std::io::Cursor::new(buffer)) .read_parallel(parallel) .set_low_memory(self.options.low_memory) .use_statistics(self.options.use_statistics) .set_rechunk(false) - .with_hive_partition_columns(hive_partitions) - .with_include_file_path( - self.file_options - .include_file_paths - .as_ref() - .map(|x| (x.clone(), Arc::from(paths[i].to_str().unwrap()))), - ); - - reader - .num_rows() - .map(|num_rows| (reader, num_rows, predicate, projection)) - }); + .with_slice(Some((slice_info.item_slice.start, slice_info.item_slice.len()))) + .with_row_index(row_index) + .with_predicate(predicate.clone()) + .with_projection(projection.clone()) + .check_schema( + self.file_info + .reader_schema + .clone() + .unwrap() + .unwrap_left() + .as_ref(), + )? + .finish()?]; + }, + ScanSource::Files(paths) => { + let mut current_offset = 0; + let base_row_index = self.file_options.row_index.take(); + // Limit no. of files at a time to prevent open file limits. + + for i in slice_info.source_slice.step_by(step) { + let end = std::cmp::min(i.saturating_add(step), paths.len()); + let paths = &paths[i..end]; + let hive_parts = self.hive_parts.as_ref().map(|x| &x[i..end]); + + if current_offset >= slice_info.item_slice.end && !result.is_empty() { + return Ok(result); + } - // We do this in parallel because wide tables can take a long time deserializing metadata. - let readers_and_metadata = POOL.install(|| iter.collect::>>())?; + // First initialize the readers, predicates and metadata. + // This will be used to determine the slices. 
That way we can actually read all the + // files in parallel even if we add row index columns or slices. + let iter = (0..paths.len()).into_par_iter().map(|i| { + let path = &paths[i]; + let hive_partitions = + hive_parts.map(|x| x[i].materialize_partition_columns()); - let current_offset_ref = &mut current_offset; - let row_statistics = readers_and_metadata - .iter() - .map(|(_, num_rows, _, _)| { - let cum_rows = *current_offset_ref; - ( - cum_rows, - split_slice_at_file(current_offset_ref, *num_rows, slice_offset, slice_end), - ) - }) - .collect::>(); + let file = std::fs::File::open(path)?; + let (projection, predicate) = prepare_scan_args( + self.predicate.clone(), + &mut self.file_options.with_columns.clone(), + &mut self.file_info.schema.clone(), + base_row_index.is_some(), + hive_partitions.as_deref(), + ); - let out = POOL.install(|| { - readers_and_metadata - .into_par_iter() - .zip(row_statistics.into_par_iter()) - .map( - |((reader, _, predicate, projection), (cumulative_read, slice))| { - let row_index = base_row_index.as_ref().map(|rc| RowIndex { - name: rc.name.clone(), - offset: rc.offset + cumulative_read as IdxSize, - }); - - let df = reader - .with_slice(Some(slice)) - .with_row_index(row_index) - .with_predicate(predicate.clone()) - .with_projection(projection.clone()) - .check_schema( - self.file_info - .reader_schema - .clone() - .unwrap() - .unwrap_left() - .as_ref(), - )? - .finish()?; - - Ok(df) - }, - ) - .collect::>>() - })?; + let mut reader = ParquetReader::new(file) + .read_parallel(parallel) + .set_low_memory(self.options.low_memory) + .use_statistics(self.options.use_statistics) + .set_rechunk(false) + .with_hive_partition_columns(hive_partitions) + .with_include_file_path( + self.file_options + .include_file_paths + .as_ref() + .map(|x| (x.clone(), Arc::from(paths[i].to_str().unwrap()))), + ); - if result.is_empty() { - result = out; - } else { - result.extend_from_slice(&out) - } + reader + .num_rows() + .map(|num_rows| (reader, num_rows, predicate, projection)) + }); + + // We do this in parallel because wide tables can take a long time deserializing metadata. + let readers_and_metadata = + POOL.install(|| iter.collect::>>())?; + + let current_offset_ref = &mut current_offset; + let row_statistics = readers_and_metadata + .iter() + .map(|(_, num_rows, _, _)| { + let cum_rows = *current_offset_ref; + ( + cum_rows, + split_slice_at_file( + current_offset_ref, + *num_rows, + slice_info.item_slice.start, + slice_info.item_slice.end, + ), + ) + }) + .collect::>(); + + let out = POOL.install(|| { + readers_and_metadata + .into_par_iter() + .zip(row_statistics.into_par_iter()) + .map( + |((reader, _, predicate, projection), (cumulative_read, slice))| { + let row_index = base_row_index.as_ref().map(|rc| RowIndex { + name: rc.name.clone(), + offset: rc.offset + cumulative_read as IdxSize, + }); + + let df = reader + .with_slice(Some(slice)) + .with_row_index(row_index) + .with_predicate(predicate.clone()) + .with_projection(projection.clone()) + .check_schema( + self.file_info + .reader_schema + .clone() + .unwrap() + .unwrap_left() + .as_ref(), + )? 
+ .finish()?; + + Ok(df) + }, + ) + .collect::>>() + })?; + + if result.is_empty() { + result = out; + } else { + result.extend_from_slice(&out) + } + } + }, } + Ok(result) } @@ -231,6 +223,7 @@ impl ParquetExec { use polars_io::utils::slice::split_slice_at_file; let verbose = verbose(); + let paths = self.sources.into_paths(); let first_metadata = &self.metadata; let cloud_options = self.cloud_options.as_ref(); @@ -254,13 +247,13 @@ impl ParquetExec { let slice_start_as_n_from_end = -slice.0 as usize; let mut cum_rows = 0; - let paths = &self.paths; + let paths = &paths; let cloud_options = Arc::new(self.cloud_options.clone()); let paths = paths.clone(); let cloud_options = cloud_options.clone(); - let mut iter = stream::iter((0..self.paths.len()).rev().map(|i| { + let mut iter = stream::iter((0..paths.len()).rev().map(|i| { let paths = paths.clone(); let cloud_options = cloud_options.clone(); @@ -312,9 +305,9 @@ impl ParquetExec { let base_row_index = self.file_options.row_index.take(); let mut processed = 0; - for batch_start in (first_file_idx..self.paths.len()).step_by(batch_size) { - let end = std::cmp::min(batch_start.saturating_add(batch_size), self.paths.len()); - let paths = &self.paths[batch_start..end]; + for batch_start in (first_file_idx..paths.len()).step_by(batch_size) { + let end = std::cmp::min(batch_start.saturating_add(batch_size), paths.len()); + let paths = &paths[batch_start..end]; let hive_parts = self.hive_parts.as_ref().map(|x| &x[batch_start..end]); if current_offset >= slice_end && !result.is_empty() { @@ -325,7 +318,7 @@ impl ParquetExec { eprintln!( "querying metadata of {}/{} files...", processed, - self.paths.len() + paths.len() ); } @@ -371,7 +364,7 @@ impl ParquetExec { let include_file_paths = self.file_options.include_file_paths.as_ref(); if verbose { - eprintln!("reading of {}/{} file...", processed, self.paths.len()); + eprintln!("reading of {}/{} file...", processed, paths.len()); } let iter = readers_and_metadata @@ -447,23 +440,20 @@ impl ParquetExec { .and_then(|_| self.predicate.take()) .map(phys_expr_to_io_expr); - let is_cloud = is_cloud_url(self.paths.first().unwrap()); + let is_cloud = match &self.sources { + ScanSource::Files(paths) => is_cloud_url(paths.first().unwrap()), + ScanSource::Buffer(_) => false, + }; let force_async = config::force_async(); let out = if is_cloud || force_async { - #[cfg(not(feature = "cloud"))] - { - panic!("activate cloud feature") - } - - #[cfg(feature = "cloud")] - { + feature_gated!("cloud", { if force_async && config::verbose() { eprintln!("ASYNC READING FORCED"); } polars_io::pl_async::get_runtime().block_on_potential_spawn(self.read_async())? - } + }) } else { self.read_par()? 
}; @@ -482,7 +472,8 @@ impl ParquetExec { impl Executor for ParquetExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { let profile_name = if state.has_node_timer() { - let mut ids = vec![self.paths[0].to_string_lossy()]; + let paths = self.sources.as_paths(); + let mut ids = vec![paths[0].to_string_lossy()]; if self.predicate.is_some() { ids.push("predicate".into()) } diff --git a/crates/polars-mem-engine/src/planner/lp.rs b/crates/polars-mem-engine/src/planner/lp.rs index 523cd1e5c588..45487f7b7024 100644 --- a/crates/polars-mem-engine/src/planner/lp.rs +++ b/crates/polars-mem-engine/src/planner/lp.rs @@ -276,7 +276,7 @@ fn create_physical_plan_impl( }, #[allow(unused_variables)] Scan { - paths, + sources, file_info, hive_parts, output_schema, @@ -306,7 +306,7 @@ fn create_physical_plan_impl( match scan_type { #[cfg(feature = "csv")] FileScan::Csv { options, .. } => Ok(Box::new(executors::CsvExec { - paths, + sources, file_info, options, predicate, @@ -318,7 +318,7 @@ fn create_physical_plan_impl( cloud_options, metadata, } => Ok(Box::new(executors::IpcExec { - paths, + sources, file_info, predicate, options, @@ -332,7 +332,7 @@ fn create_physical_plan_impl( cloud_options, metadata, } => Ok(Box::new(executors::ParquetExec::new( - paths, + sources, file_info, hive_parts, predicate, @@ -343,7 +343,7 @@ fn create_physical_plan_impl( ))), #[cfg(feature = "json")] FileScan::NDJson { options, .. } => Ok(Box::new(executors::JsonExec::new( - paths, + sources, options, file_options, file_info, diff --git a/crates/polars-mem-engine/src/utils.rs b/crates/polars-mem-engine/src/utils.rs index cb04d599a7f0..b104da3c4e78 100644 --- a/crates/polars-mem-engine/src/utils.rs +++ b/crates/polars-mem-engine/src/utils.rs @@ -13,8 +13,8 @@ pub(crate) fn agg_source_paths( ) { lp_arena.iter(root_lp).for_each(|(_, lp)| { use IR::*; - if let Scan { paths, .. } = lp { - for path in paths.as_ref() { + if let Scan { sources, .. 
} = lp { + for path in sources.as_paths() { acc_paths.insert(path.clone()); } } diff --git a/crates/polars-pipe/src/executors/sources/csv.rs b/crates/polars-pipe/src/executors/sources/csv.rs index 2c34228bada6..5ca5551c506d 100644 --- a/crates/polars-pipe/src/executors/sources/csv.rs +++ b/crates/polars-pipe/src/executors/sources/csv.rs @@ -1,10 +1,10 @@ use std::fs::File; -use std::path::PathBuf; use polars_core::{config, POOL}; use polars_io::csv::read::{BatchedCsvReader, CsvReadOptions, CsvReader}; use polars_io::path_utils::is_cloud_url; use polars_plan::global::_set_n_rows_for_scan; +use polars_plan::plans::ScanSource; use polars_plan::prelude::FileScanOptions; use polars_utils::itertools::Itertools; @@ -20,7 +20,7 @@ pub(crate) struct CsvSource { batched_reader: Option>, reader: Option>, n_threads: usize, - paths: Arc>, + sources: ScanSource, options: Option, file_options: FileScanOptions, verbose: bool, @@ -36,6 +36,7 @@ impl CsvSource { // otherwise all files would be opened during construction of the pipeline // leading to Too many Open files error fn init_next_reader(&mut self) -> PolarsResult<()> { + let paths = self.sources.as_paths(); let file_options = self.file_options.clone(); let n_rows = file_options.slice.map(|x| { @@ -43,12 +44,12 @@ impl CsvSource { x.1 }); - if self.current_path_idx == self.paths.len() + if self.current_path_idx == paths.len() || (n_rows.is_some() && n_rows.unwrap() <= self.n_rows_read) { return Ok(()); } - let path = &self.paths[self.current_path_idx]; + let path = &paths[self.current_path_idx]; let force_async = config::force_async(); let run_async = force_async || is_cloud_url(path); @@ -140,7 +141,7 @@ impl CsvSource { } pub(crate) fn new( - paths: Arc>, + sources: ScanSource, schema: SchemaRef, options: CsvReadOptions, file_options: FileScanOptions, @@ -151,7 +152,7 @@ impl CsvSource { reader: None, batched_reader: None, n_threads: POOL.current_num_threads(), - paths, + sources, options: Some(options), file_options, verbose, diff --git a/crates/polars-pipe/src/executors/sources/parquet.rs b/crates/polars-pipe/src/executors/sources/parquet.rs index cd0cb58f3574..ab5abbade817 100644 --- a/crates/polars-pipe/src/executors/sources/parquet.rs +++ b/crates/polars-pipe/src/executors/sources/parquet.rs @@ -20,7 +20,7 @@ use polars_io::prelude::materialize_projection; use polars_io::prelude::ParquetAsyncReader; use polars_io::utils::slice::split_slice_at_file; use polars_io::SerReader; -use polars_plan::plans::FileInfo; +use polars_plan::plans::{FileInfo, ScanSource}; use polars_plan::prelude::hive::HivePartitions; use polars_plan::prelude::FileScanOptions; use polars_utils::itertools::Itertools; @@ -36,7 +36,7 @@ pub struct ParquetSource { processed_paths: usize, processed_rows: AtomicUsize, iter: Range, - paths: Arc>, + sources: ScanSource, options: ParquetOptions, file_options: FileScanOptions, #[allow(dead_code)] @@ -77,7 +77,8 @@ impl ParquetSource { usize, Option>, )> { - let path = &self.paths[index]; + let paths = self.sources.as_paths(); + let path = &paths[index]; let options = self.options; let file_options = self.file_options.clone(); let schema = self.file_info.schema.clone(); @@ -245,7 +246,7 @@ impl ParquetSource { #[allow(unused_variables)] #[allow(clippy::too_many_arguments)] pub(crate) fn new( - paths: Arc>, + sources: ScanSource, options: ParquetOptions, cloud_options: Option, metadata: Option, @@ -255,6 +256,7 @@ impl ParquetSource { verbose: bool, predicate: Option>, ) -> PolarsResult { + let paths = sources.as_paths(); let n_threads 
= POOL.current_num_threads(); let iter = 0..paths.len(); @@ -273,7 +275,7 @@ impl ParquetSource { options, file_options, iter, - paths, + sources, cloud_options, metadata, file_info, diff --git a/crates/polars-pipe/src/pipeline/convert.rs b/crates/polars-pipe/src/pipeline/convert.rs index 1e6f93eac9df..368fc91b17ef 100644 --- a/crates/polars-pipe/src/pipeline/convert.rs +++ b/crates/polars-pipe/src/pipeline/convert.rs @@ -74,7 +74,7 @@ where Ok(Box::new(sources::DataFrameSource::from_df(df)) as Box) }, Scan { - paths, + sources, file_info, hive_parts, file_options, @@ -82,6 +82,8 @@ where output_schema, scan_type, } => { + let paths = sources.into_paths(); + // Add predicate to operators. // Except for parquet, as that format can use statistics to prune file/row-groups. #[cfg(feature = "parquet")] @@ -102,7 +104,7 @@ where #[cfg(feature = "csv")] FileScan::Csv { options, .. } => { let src = sources::CsvSource::new( - paths, + sources, file_info.schema, options, file_options, @@ -144,7 +146,7 @@ where }) .transpose()?; let src = sources::ParquetSource::new( - paths, + sources, parquet_options, cloud_options, metadata, diff --git a/crates/polars-plan/src/client/check.rs b/crates/polars-plan/src/client/check.rs index a01addd9231d..e28e1906c8ea 100644 --- a/crates/polars-plan/src/client/check.rs +++ b/crates/polars-plan/src/client/check.rs @@ -2,7 +2,7 @@ use polars_core::error::{polars_err, PolarsResult}; use polars_io::path_utils::is_cloud_url; use crate::plans::options::SinkType; -use crate::plans::{DslPlan, FileScan}; +use crate::plans::{DslPlan, FileScan, DslScanSource}; /// Assert that the given [`DslPlan`] is eligible to be executed on Polars Cloud. pub(super) fn assert_cloud_eligible(dsl: &DslPlan) -> PolarsResult<()> { @@ -10,15 +10,30 @@ pub(super) fn assert_cloud_eligible(dsl: &DslPlan) -> PolarsResult<()> { match plan_node { #[cfg(feature = "python")] DslPlan::PythonScan { .. } => return ineligible_error("contains Python scan"), - DslPlan::Scan { paths, .. } - if paths.lock().unwrap().0.iter().any(|p| !is_cloud_url(p)) => - { - return ineligible_error("contains scan of local file system") - }, DslPlan::Scan { - scan_type: FileScan::Anonymous { .. }, - .. - } => return ineligible_error("contains anonymous scan"), + sources, scan_type, .. + } => { + match sources { + DslScanSource::File(file) => { + if file + .lock() + .unwrap() + .paths + .iter() + .any(|p| !is_cloud_url(p)) + { + return ineligible_error("contains scan of local file system"); + } + }, + DslScanSource::Buffer(_) => { + return ineligible_error("contains scan of in-memory buffer"); + }, + } + + if matches!(scan_type, FileScan::Anonymous { .. }) { + return ineligible_error("contains anonymous scan"); + } + }, DslPlan::Sink { payload, .. } => { if !matches!(payload, SinkType::Cloud { .. 
}) { return ineligible_error("contains sink to non-cloud location"); diff --git a/crates/polars-plan/src/plans/builder_dsl.rs b/crates/polars-plan/src/plans/builder_dsl.rs index 893dbeb00e6e..1170f95ec7a2 100644 --- a/crates/polars-plan/src/plans/builder_dsl.rs +++ b/crates/polars-plan/src/plans/builder_dsl.rs @@ -1,6 +1,4 @@ -#[cfg(any(feature = "csv", feature = "ipc", feature = "parquet"))] -use std::path::PathBuf; -use std::sync::{Arc, Mutex, RwLock}; +use std::sync::{Arc, RwLock}; use polars_core::prelude::*; #[cfg(any(feature = "parquet", feature = "ipc", feature = "csv"))] @@ -60,7 +58,7 @@ impl DslBuilder { }; Ok(DslPlan::Scan { - paths: Arc::new(Mutex::new((Arc::new(vec![]), true))), + sources: DslScanSource::Buffer(Arc::default()), file_info: Arc::new(RwLock::new(Some(file_info))), hive_parts: None, predicate: None, @@ -79,7 +77,7 @@ impl DslBuilder { #[cfg(feature = "parquet")] #[allow(clippy::too_many_arguments)] pub fn scan_parquet( - paths: Arc>, + source: DslScanSource, n_rows: Option, cache: bool, parallel: polars_io::parquet::read::ParallelStrategy, @@ -92,8 +90,6 @@ impl DslBuilder { glob: bool, include_file_paths: Option, ) -> PolarsResult { - let paths = init_paths(paths); - let options = FileScanOptions { with_columns: None, cache, @@ -106,7 +102,8 @@ impl DslBuilder { include_file_paths, }; Ok(DslPlan::Scan { - paths, + // @FIX: sources -> source + sources: source, file_info: Arc::new(RwLock::new(None)), hive_parts: None, predicate: None, @@ -127,7 +124,7 @@ impl DslBuilder { #[cfg(feature = "ipc")] #[allow(clippy::too_many_arguments)] pub fn scan_ipc( - paths: Arc>, + source: DslScanSource, options: IpcScanOptions, n_rows: Option, cache: bool, @@ -137,10 +134,8 @@ impl DslBuilder { hive_options: HiveOptions, include_file_paths: Option, ) -> PolarsResult { - let paths = init_paths(paths); - Ok(DslPlan::Scan { - paths, + sources: source, file_info: Arc::new(RwLock::new(None)), hive_parts: None, file_options: FileScanOptions { @@ -167,15 +162,13 @@ impl DslBuilder { #[allow(clippy::too_many_arguments)] #[cfg(feature = "csv")] pub fn scan_csv( - paths: Arc>, + source: DslScanSource, read_options: CsvReadOptions, cache: bool, cloud_options: Option, glob: bool, include_file_paths: Option, ) -> PolarsResult { - let paths = init_paths(paths); - // This gets partially moved by FileScanOptions let read_options_clone = read_options.clone(); @@ -195,7 +188,7 @@ impl DslBuilder { include_file_paths, }; Ok(DslPlan::Scan { - paths, + sources: source, file_info: Arc::new(RwLock::new(None)), hive_parts: None, file_options: options, @@ -464,9 +457,3 @@ impl DslBuilder { .into() } } - -/// Initialize paths as non-expanded. 
-#[cfg(any(feature = "csv", feature = "ipc", feature = "parquet"))] -fn init_paths(paths: Arc>) -> Arc>, bool)>> { - Arc::new(Mutex::new((paths, false))) -} diff --git a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs index a902b2da1e5d..825c5896097b 100644 --- a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs +++ b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs @@ -105,14 +105,21 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult let v = match lp { DslPlan::Scan { - paths, + mut sources, file_info, hive_parts, predicate, mut file_options, mut scan_type, } => { - let paths = expand_scan_paths(paths, &mut scan_type, &mut file_options)?; + sources.expand_paths(&mut scan_type, &mut file_options)?; + + let source = match sources { + DslScanSource::File(paths) => { + ScanSource::Files(paths.as_ref().lock().unwrap().paths.clone()) + }, + DslScanSource::Buffer(buf) => ScanSource::Buffer(buf), + }; let file_info_read = file_info.read().unwrap(); @@ -139,7 +146,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult .. } => { let (file_info, md) = - scans::parquet_file_info(&paths, &file_options, cloud_options.as_ref()) + scans::parquet_file_info(&source, &file_options, cloud_options.as_ref()) .map_err(|e| e.context(failed_here!(parquet scan)))?; *metadata = md; file_info @@ -150,9 +157,12 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult metadata, .. } => { - let (file_info, md) = - scans::ipc_file_info(&paths, &file_options, cloud_options.as_ref()) - .map_err(|e| e.context(failed_here!(ipc scan)))?; + let (file_info, md) = scans::ipc_file_info( + source.as_paths(), + &file_options, + cloud_options.as_ref(), + ) + .map_err(|e| e.context(failed_here!(ipc scan)))?; *metadata = Some(md); file_info }, @@ -160,16 +170,19 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult FileScan::Csv { options, cloud_options, - } => { - scans::csv_file_info(&paths, &file_options, options, cloud_options.as_ref()) - .map_err(|e| e.context(failed_here!(csv scan)))? - }, + } => scans::csv_file_info( + source.as_paths(), + &file_options, + options, + cloud_options.as_ref(), + ) + .map_err(|e| e.context(failed_here!(csv scan)))?, #[cfg(feature = "json")] FileScan::NDJson { options, cloud_options, } => scans::ndjson_file_info( - &paths, + source.as_paths(), &file_options, options, cloud_options.as_ref(), @@ -189,7 +202,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult let mut owned = None; hive_partitions_from_paths( - paths.as_ref(), + source.as_paths().as_ref(), file_options.hive_options.hive_start_idx, file_options.hive_options.schema.clone(), match resolved_file_info.reader_schema.as_ref().unwrap() { @@ -263,7 +276,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult } IR::Scan { - paths, + sources: source, file_info: resolved_file_info, hive_parts, output_schema: None, @@ -803,47 +816,64 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult Ok(ctxt.lp_arena.add(v)) } -/// Expand scan paths if they were not already expanded. 
-#[allow(unused_variables)] -fn expand_scan_paths( - paths: Arc>, bool)>>, - scan_type: &mut FileScan, - file_options: &mut FileScanOptions, -) -> PolarsResult>> { - #[allow(unused_mut)] - let mut lock = paths.lock().unwrap(); - - // Return if paths are already expanded - if lock.1 { - return Ok(lock.0.clone()); - } +impl DslScanSource { + /// Expand scan paths if they were not already expanded. + pub fn expand_paths( + &mut self, + scan_type: &mut FileScan, + file_options: &mut FileScanOptions, + ) -> PolarsResult<()> { + match self { + DslScanSource::File(source) => { + #[allow(unused_mut)] + let mut lock = source.lock().unwrap(); + + // Return if paths are already expanded + if lock.is_expanded { + return Ok(()); + } - { - let paths_expanded = match &scan_type { - #[cfg(feature = "parquet")] - FileScan::Parquet { cloud_options, .. } => { - expand_scan_paths_with_hive_update(&lock.0, file_options, cloud_options)? - }, - #[cfg(feature = "ipc")] - FileScan::Ipc { cloud_options, .. } => { - expand_scan_paths_with_hive_update(&lock.0, file_options, cloud_options)? - }, - #[cfg(feature = "csv")] - FileScan::Csv { cloud_options, .. } => { - expand_paths(&lock.0, file_options.glob, cloud_options.as_ref())? - }, - #[cfg(feature = "json")] - FileScan::NDJson { cloud_options, .. } => { - expand_paths(&lock.0, file_options.glob, cloud_options.as_ref())? - }, - FileScan::Anonymous { .. } => unreachable!(), // Invariant: Anonymous scans are already expanded. - }; + { + let paths_expanded = match &scan_type { + #[cfg(feature = "parquet")] + FileScan::Parquet { cloud_options, .. } => { + expand_scan_paths_with_hive_update( + &lock.paths[..], + file_options, + cloud_options, + )? + }, + #[cfg(feature = "ipc")] + FileScan::Ipc { cloud_options, .. } => expand_scan_paths_with_hive_update( + &lock.paths[..], + file_options, + cloud_options, + )?, + #[cfg(feature = "csv")] + FileScan::Csv { cloud_options, .. } => expand_paths( + &lock.paths[..], + file_options.glob, + cloud_options.as_ref(), + )?, + #[cfg(feature = "json")] + FileScan::NDJson { cloud_options, .. } => expand_paths( + &lock.paths[..], + file_options.glob, + cloud_options.as_ref(), + )?, + FileScan::Anonymous { .. } => unreachable!(), // Invariant: Anonymous scans are already expanded. 
+ }; - #[allow(unreachable_code)] - { - *lock = (paths_expanded, true); + #[allow(unreachable_code)] + { + lock.paths = paths_expanded; + lock.is_expanded = true; - Ok(lock.0.clone()) + Ok(()) + } + } + }, + DslScanSource::Buffer(_) => Ok(()), } } } @@ -854,7 +884,7 @@ fn expand_scan_paths_with_hive_update( paths: &[PathBuf], file_options: &mut FileScanOptions, cloud_options: &Option, -) -> PolarsResult>> { +) -> PolarsResult> { let hive_enabled = file_options.hive_options.enabled; let (expanded_paths, hive_start_idx) = expand_paths_hive( paths, diff --git a/crates/polars-plan/src/plans/conversion/mod.rs b/crates/polars-plan/src/plans/conversion/mod.rs index 89167a124534..3e8f8748e618 100644 --- a/crates/polars-plan/src/plans/conversion/mod.rs +++ b/crates/polars-plan/src/plans/conversion/mod.rs @@ -12,7 +12,7 @@ mod ir_to_dsl; mod scans; mod stack_opt; -use std::sync::{Arc, Mutex, RwLock}; +use std::sync::{Arc, RwLock}; pub use dsl_to_ir::*; pub use expr_to_ir::*; @@ -50,7 +50,7 @@ impl IR { }; match lp { IR::Scan { - paths, + sources, file_info, hive_parts, predicate, @@ -58,7 +58,7 @@ impl IR { output_schema: _, file_options: options, } => DslPlan::Scan { - paths: Arc::new(Mutex::new((paths, true))), + sources: sources.into(), file_info: Arc::new(RwLock::new(Some(file_info))), hive_parts, predicate: predicate.map(|e| e.to_expr(expr_arena)), diff --git a/crates/polars-plan/src/plans/conversion/scans.rs b/crates/polars-plan/src/plans/conversion/scans.rs index 9b2636430622..82c953e2ffa2 100644 --- a/crates/polars-plan/src/plans/conversion/scans.rs +++ b/crates/polars-plan/src/plans/conversion/scans.rs @@ -1,4 +1,5 @@ use std::path::PathBuf; +use std::sync::{Arc, Mutex}; use either::Either; use polars_io::path_utils::is_cloud_url; @@ -16,6 +17,18 @@ fn get_first_path(paths: &[PathBuf]) -> PolarsResult<&PathBuf> { .ok_or_else(|| polars_err!(ComputeError: "expected at least 1 path")) } +impl From for DslScanSource { + fn from(value: ScanSource) -> Self { + match value { + ScanSource::Files(paths) => DslScanSource::File(Arc::new(Mutex::new(ScanFileSource { + paths, + is_expanded: true, + }))), + ScanSource::Buffer(buffer) => DslScanSource::Buffer(buffer), + } + } +} + #[cfg(any(feature = "parquet", feature = "ipc"))] fn prepare_output_schema(mut schema: Schema, row_index: Option<&RowIndex>) -> SchemaRef { if let Some(rc) = row_index { @@ -38,46 +51,64 @@ fn prepare_schemas(mut schema: Schema, row_index: Option<&RowIndex>) -> (SchemaR #[cfg(feature = "parquet")] pub(super) fn parquet_file_info( - paths: &[PathBuf], + source: &ScanSource, file_options: &FileScanOptions, #[allow(unused)] cloud_options: Option<&polars_io::cloud::CloudOptions>, ) -> PolarsResult<(FileInfo, Option)> { - let path = get_first_path(paths)?; - - let (schema, reader_schema, num_rows, metadata) = if is_cloud_url(path) { - #[cfg(not(feature = "cloud"))] - panic!("One or more of the cloud storage features ('aws', 'gcp', ...) 
must be enabled."); - - #[cfg(feature = "cloud")] - { - let uri = path.to_string_lossy(); - get_runtime().block_on(async { - let mut reader = ParquetAsyncReader::from_uri(&uri, cloud_options, None).await?; - let reader_schema = reader.schema().await?; - let num_rows = reader.num_rows().await?; - let metadata = reader.get_metadata().await?.clone(); - + let (schema, reader_schema, num_rows, metadata) = match source { + ScanSource::Files(paths) => { + let path = get_first_path(paths)?; + if is_cloud_url(path) { + #[cfg(not(feature = "cloud"))] + panic!("One or more of the cloud storage features ('aws', 'gcp', ...) must be enabled."); + + #[cfg(feature = "cloud")] + { + let uri = path.to_string_lossy(); + get_runtime().block_on(async { + let mut reader = + ParquetAsyncReader::from_uri(&uri, cloud_options, None).await?; + let reader_schema = reader.schema().await?; + let num_rows = reader.num_rows().await?; + let metadata = reader.get_metadata().await?.clone(); + + let schema = prepare_output_schema( + Schema::from_arrow_schema(reader_schema.as_ref()), + file_options.row_index.as_ref(), + ); + PolarsResult::Ok((schema, reader_schema, Some(num_rows), Some(metadata))) + })? + } + } else { + let file = polars_utils::open_file(path)?; + let mut reader = ParquetReader::new(file); + let reader_schema = reader.schema()?; let schema = prepare_output_schema( Schema::from_arrow_schema(reader_schema.as_ref()), file_options.row_index.as_ref(), ); - PolarsResult::Ok((schema, reader_schema, Some(num_rows), Some(metadata))) - })? - } - } else { - let file = polars_utils::open_file(path)?; - let mut reader = ParquetReader::new(file); - let reader_schema = reader.schema()?; - let schema = prepare_output_schema( - Schema::from_arrow_schema(reader_schema.as_ref()), - file_options.row_index.as_ref(), - ); - ( - schema, - reader_schema, - Some(reader.num_rows()?), - Some(reader.get_metadata()?.clone()), - ) + ( + schema, + reader_schema, + Some(reader.num_rows()?), + Some(reader.get_metadata()?.clone()), + ) + } + }, + ScanSource::Buffer(buffer) => { + let mut reader = ParquetReader::new(std::io::Cursor::new(buffer)); + let reader_schema = reader.schema()?; + let schema = prepare_output_schema( + Schema::from_arrow_schema(reader_schema.as_ref()), + file_options.row_index.as_ref(), + ); + ( + schema, + reader_schema, + Some(reader.num_rows()?), + Some(reader.get_metadata()?.clone()), + ) + }, }; let file_info = FileInfo::new( diff --git a/crates/polars-plan/src/plans/ir/dot.rs b/crates/polars-plan/src/plans/ir/dot.rs index 69e3a69733c5..c3b8f2e94874 100644 --- a/crates/polars-plan/src/plans/ir/dot.rs +++ b/crates/polars-plan/src/plans/ir/dot.rs @@ -247,7 +247,7 @@ impl<'a> IRDotDisplay<'a> { })?; }, Scan { - paths, + sources, file_info, hive_parts: _, predicate, @@ -255,6 +255,7 @@ impl<'a> IRDotDisplay<'a> { file_options: options, output_schema: _, } => { + let paths = sources.as_paths(); let name: &str = scan_type.into(); let path = PathsDisplay(paths.as_ref()); let with_columns = options.with_columns.as_ref().map(|cols| cols.as_ref()); diff --git a/crates/polars-plan/src/plans/ir/format.rs b/crates/polars-plan/src/plans/ir/format.rs index 60699be85095..6c1c37b78671 100644 --- a/crates/polars-plan/src/plans/ir/format.rs +++ b/crates/polars-plan/src/plans/ir/format.rs @@ -221,13 +221,14 @@ impl<'a> IRDisplay<'a> { self.with_root(*input)._format(f, sub_indent) }, Scan { - paths, + sources, file_info, predicate, scan_type, file_options, .. 
} => { + let paths = sources.as_paths(); let n_columns = file_options .with_columns .as_ref() diff --git a/crates/polars-plan/src/plans/ir/inputs.rs b/crates/polars-plan/src/plans/ir/inputs.rs index b00c91cddae4..2a7c14e300de 100644 --- a/crates/polars-plan/src/plans/ir/inputs.rs +++ b/crates/polars-plan/src/plans/ir/inputs.rs @@ -101,7 +101,7 @@ impl IR { options: *options, }, Scan { - paths, + sources, file_info, hive_parts, output_schema, @@ -114,7 +114,7 @@ impl IR { new_predicate = exprs.pop() } Scan { - paths: paths.clone(), + sources: sources.clone(), file_info: file_info.clone(), hive_parts: hive_parts.clone(), output_schema: output_schema.clone(), diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index 443726affad0..7062514f7689 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -6,12 +6,14 @@ pub(crate) mod tree_format; use std::borrow::Cow; use std::fmt; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; +use std::sync::Mutex; pub use dot::{EscapeLabel, IRDotDisplay, PathsDisplay}; pub use format::{ExprIRDisplay, IRDisplay}; use hive::HivePartitions; use polars_core::prelude::*; +use polars_core::POOL; use polars_utils::idx_vec::UnitVec; use polars_utils::unitvec; #[cfg(feature = "ir_serde")] @@ -33,6 +35,176 @@ pub struct IRPlanRef<'a> { pub expr_arena: &'a Arena, } +#[cfg_attr(feature = "ir_serde", derive(Serialize, Deserialize))] +#[derive(Debug, Clone, Hash)] +pub enum ScanSource { + Files(Arc<[PathBuf]>), + #[cfg_attr(feature = "ir_serde", serde(skip))] + Buffer(Arc<[u8]>), +} + +impl Default for ScanSource { + fn default() -> Self { + Self::Files(Arc::default()) + } +} + +pub struct ScanSourceSliceInfo { + pub item_slice: std::ops::Range, + pub source_slice: std::ops::Range, +} + +impl ScanSource { + pub fn as_paths(&self) -> &[PathBuf] { + match self { + ScanSource::Files(paths) => paths, + ScanSource::Buffer(_) => unimplemented!(), + } + } + + pub fn into_paths(&self) -> Arc<[PathBuf]> { + match self { + ScanSource::Files(paths) => paths.clone(), + ScanSource::Buffer(_) => unimplemented!(), + } + } + + pub fn to_dsl(self, is_expanded: bool) -> DslScanSource { + match self { + ScanSource::Files(paths) => { + DslScanSource::File(Arc::new(Mutex::new(ScanFileSource { paths, is_expanded }))) + }, + ScanSource::Buffer(buffer) => DslScanSource::Buffer(buffer), + } + } + + pub fn num_sources(&self) -> usize { + match self { + ScanSource::Files(paths) => paths.len(), + ScanSource::Buffer(_) => 1, + } + } + + pub fn is_cloud_url(&self) -> PolarsResult { + match self { + ScanSource::Files(paths) => { + Ok(polars_io::is_cloud_url(paths.first().ok_or_else( + || polars_err!(ComputeError: "expected at least 1 path"), + )?)) + }, + ScanSource::Buffer(_) => Ok(false), + } + } + + /// Normalize the slice and collect information as to what rows and parts of the source are + /// used in this slice. + pub fn collect_slice_information( + &self, + slice: (i64, usize), + path_to_num_rows: impl Fn(&Path) -> PolarsResult + Send + Sync, + buffer_to_num_rows: impl Fn(&[u8]) -> PolarsResult + Send + Sync, + ) -> PolarsResult { + fn slice_to_start_end( + offset: i64, + length: usize, + num_rows: usize, + ) -> std::ops::Range { + if offset < 0 { + let slice_start_as_n_from_end = -offset as usize; + let (start, len) = if slice_start_as_n_from_end > num_rows { + // We need to trim the slice, e.g. SLICE[offset: -100, len: 75] on a file of 50 + // rows should only give the first 25 rows. 
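+                // Working through that example: slice_start_as_n_from_end = 100 exceeds
+                // num_rows = 50, so start_position = 100 - 50 = 50 and the slice becomes
+                // (start, len) = (0, 75 - 50) = (0, 25), i.e. rows 0..25.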
+ let start_position = slice_start_as_n_from_end - num_rows; + (0, length.saturating_sub(start_position)) + } else { + (num_rows - slice_start_as_n_from_end, length) + }; + + let end = start.saturating_add(len); + + start..end + } else { + let offset = offset as usize; + offset.min(num_rows)..(offset + length).min(num_rows) + } + } + + let (offset, length) = slice; + + Ok(match self { + ScanSource::Files(paths) if paths.len() == 1 => { + let num_rows = path_to_num_rows(&paths[0])?; + ScanSourceSliceInfo { + item_slice: slice_to_start_end(offset, length, num_rows), + source_slice: 0..1, + } + }, + ScanSource::Files(paths) => { + use rayon::prelude::*; + + assert_ne!(paths.len(), 0); + + // Walk the files in reverse until we find the first file, and then translate the + // slice into a positive-offset equivalent. + const CHUNK_SIZE: usize = 8; + let mut row_counts = Vec::with_capacity(paths.len()); + + POOL.install(|| { + for idx_end in (0..paths.len()).step_by(CHUNK_SIZE) { + let idx_start = idx_end.saturating_sub(CHUNK_SIZE); + + row_counts.extend( + (idx_start..=idx_end) + .into_par_iter() + .map(|i| path_to_num_rows(&paths[i])) + .collect::>>()? + .into_iter() + .rev(), + ); + } + + PolarsResult::Ok(()) + })?; + + let num_rows = row_counts.iter().sum::(); + + let item_slice = slice_to_start_end(offset, length, num_rows); + + let mut source_start = paths.len() - 1; + let mut source_end = 0; + + let mut sum = 0; + for (i, row_count) in row_counts.iter().rev().enumerate() { + if sum < item_slice.end { + source_end = usize::max(source_end, i); + } + + sum += row_count; + + if sum >= item_slice.start { + source_start = usize::min(source_start, i); + } + } + + let source_slice = source_start..source_end + 1; + + ScanSourceSliceInfo { + item_slice, + source_slice, + } + }, + ScanSource::Buffer(buffer) => { + let num_rows = buffer_to_num_rows(buffer)?; + + ScanSourceSliceInfo { + item_slice: slice_to_start_end(offset, length, num_rows), + source_slice: 0..1, + } + }, + }) + } +} + /// [`IR`] is a representation of [`DslPlan`] with [`Node`]s which are allocated in an [`Arena`] /// In this IR the logical plan has access to the full dataset. #[derive(Clone, Debug, Default)] @@ -52,7 +224,7 @@ pub enum IR { predicate: ExprIR, }, Scan { - paths: Arc>, + sources: ScanSource, file_info: FileInfo, hive_parts: Option>>, predicate: Option, diff --git a/crates/polars-plan/src/plans/mod.rs b/crates/polars-plan/src/plans/mod.rs index cee1a3bb1045..9e2b4d56d6a4 100644 --- a/crates/polars-plan/src/plans/mod.rs +++ b/crates/polars-plan/src/plans/mod.rs @@ -59,6 +59,21 @@ pub enum Context { Default, } +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Clone)] +pub struct ScanFileSource { + pub paths: Arc<[PathBuf]>, + pub is_expanded: bool, +} + +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Clone)] +pub enum DslScanSource { + File(Arc>), + // @Q? Can we serde skip this? + Buffer(Arc<[u8]>), +} + // https://stackoverflow.com/questions/1031076/what-are-projection-and-selection #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum DslPlan { @@ -76,7 +91,7 @@ pub enum DslPlan { cache_hits: u32, }, Scan { - paths: Arc>, bool)>>, + sources: DslScanSource, // Option as this is mostly materialized on the IR phase. 
// During conversion we update the value in the DSL as well // This is to cater to use cases where parts of a `LazyFrame` @@ -193,7 +208,7 @@ impl Clone for DslPlan { Self::PythonScan { options } => Self::PythonScan { options: options.clone() }, Self::Filter { input, predicate } => Self::Filter { input: input.clone(), predicate: predicate.clone() }, Self::Cache { input, id, cache_hits } => Self::Cache { input: input.clone(), id: id.clone(), cache_hits: cache_hits.clone() }, - Self::Scan { paths, file_info, hive_parts, predicate, file_options, scan_type } => Self::Scan { paths: paths.clone(), file_info: file_info.clone(), hive_parts: hive_parts.clone(), predicate: predicate.clone(), file_options: file_options.clone(), scan_type: scan_type.clone() }, + Self::Scan { sources, file_info, hive_parts, predicate, file_options, scan_type } => Self::Scan { sources: sources.clone(), file_info: file_info.clone(), hive_parts: hive_parts.clone(), predicate: predicate.clone(), file_options: file_options.clone(), scan_type: scan_type.clone() }, Self::DataFrameScan { df, schema, output_schema, filter: selection } => Self::DataFrameScan { df: df.clone(), schema: schema.clone(), output_schema: output_schema.clone(), filter: selection.clone() }, Self::Select { expr, input, options } => Self::Select { expr: expr.clone(), input: input.clone(), options: options.clone() }, Self::GroupBy { input, keys, aggs, apply, maintain_order, options } => Self::GroupBy { input: input.clone(), keys: keys.clone(), aggs: aggs.clone(), apply: apply.clone(), maintain_order: maintain_order.clone(), options: options.clone() }, diff --git a/crates/polars-plan/src/plans/optimizer/count_star.rs b/crates/polars-plan/src/plans/optimizer/count_star.rs index 32a95cc3ede3..8565e066dcb4 100644 --- a/crates/polars-plan/src/plans/optimizer/count_star.rs +++ b/crates/polars-plan/src/plans/optimizer/count_star.rs @@ -95,9 +95,10 @@ fn visit_logical_plan_for_scan_paths( }) }, IR::Scan { - scan_type, paths, .. + scan_type, sources, .. } if !matches!(scan_type, FileScan::Anonymous { .. 
}) => Some(CountStarExpr { - paths: paths.clone(), + // @FIX: Count Star Should probably just have a Arc Slice + paths: Arc::new(sources.as_paths().as_ref().to_vec()), scan_type: scan_type.clone(), node, alias: None, diff --git a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs index 1def3d375958..3b9e6c8d8ef9 100644 --- a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs +++ b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs @@ -325,7 +325,7 @@ impl<'a> PredicatePushDown<'a> { Ok(lp) }, Scan { - mut paths, + mut sources, file_info, hive_parts: mut scan_hive_parts, ref predicate, @@ -366,6 +366,7 @@ impl<'a> PredicatePushDown<'a> { if let (Some(hive_parts), Some(predicate)) = (&scan_hive_parts, &predicate) { if let Some(io_expr) = self.expr_eval.unwrap()(predicate, expr_arena) { if let Some(stats_evaluator) = io_expr.as_stats_evaluator() { + let paths = sources.as_paths(); let mut new_paths = Vec::with_capacity(paths.len()); let mut new_hive_parts = Vec::with_capacity(paths.len()); @@ -400,7 +401,7 @@ impl<'a> PredicatePushDown<'a> { filter: None, }); } else { - paths = Arc::from(new_paths); + sources = ScanSource::Files(new_paths.into()); scan_hive_parts = Some(Arc::from(new_hive_parts)); } } @@ -422,7 +423,7 @@ impl<'a> PredicatePushDown<'a> { let lp = if do_optimization { Scan { - paths, + sources, file_info, hive_parts, predicate, @@ -432,7 +433,7 @@ impl<'a> PredicatePushDown<'a> { } } else { let lp = Scan { - paths, + sources, file_info, hive_parts, predicate: None, diff --git a/crates/polars-plan/src/plans/optimizer/projection_pushdown/mod.rs b/crates/polars-plan/src/plans/optimizer/projection_pushdown/mod.rs index e5e2fb94ccde..20e0d0d28633 100644 --- a/crates/polars-plan/src/plans/optimizer/projection_pushdown/mod.rs +++ b/crates/polars-plan/src/plans/optimizer/projection_pushdown/mod.rs @@ -398,7 +398,7 @@ impl ProjectionPushDown { Ok(PythonScan { options }) }, Scan { - paths, + sources, mut file_info, mut hive_parts, scan_type, @@ -510,7 +510,7 @@ impl ProjectionPushDown { } }; let lp = Scan { - paths, + sources, file_info, hive_parts, output_schema, diff --git a/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs b/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs index f62bd9ee197d..b656795f53d2 100644 --- a/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs +++ b/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs @@ -165,7 +165,7 @@ impl SlicePushDown { } #[cfg(feature = "csv")] (Scan { - paths, + sources, file_info, hive_parts, output_schema, @@ -176,7 +176,7 @@ impl SlicePushDown { file_options.slice = Some((0, state.offset as usize + state.len as usize)); let lp = Scan { - paths, + sources, file_info, hive_parts, output_schema, @@ -189,7 +189,7 @@ impl SlicePushDown { }, #[cfg(feature = "parquet")] (Scan { - paths, + sources, file_info, hive_parts, output_schema, @@ -200,7 +200,7 @@ impl SlicePushDown { file_options.slice = Some((state.offset, state.len as usize)); let lp = Scan { - paths, + sources, file_info, hive_parts, output_schema, @@ -213,7 +213,7 @@ impl SlicePushDown { }, // TODO! we currently skip slice pushdown if there is a predicate. 
(Scan { - paths, + sources, file_info, hive_parts, output_schema, @@ -224,7 +224,7 @@ impl SlicePushDown { options.slice = Some((0, state.len as usize)); let lp = Scan { - paths, + sources, file_info, hive_parts, output_schema, diff --git a/crates/polars-plan/src/plans/visitor/hash.rs b/crates/polars-plan/src/plans/visitor/hash.rs index 80c251108297..7087122802ea 100644 --- a/crates/polars-plan/src/plans/visitor/hash.rs +++ b/crates/polars-plan/src/plans/visitor/hash.rs @@ -74,7 +74,7 @@ impl Hash for HashableEqLP<'_> { predicate.traverse_and_hash(self.expr_arena, state); }, IR::Scan { - paths, + sources, file_info: _, hive_parts: _, predicate, @@ -84,7 +84,7 @@ impl Hash for HashableEqLP<'_> { } => { // We don't have to traverse the schema, hive partitions etc. as they are derivative from the paths. scan_type.hash(state); - paths.hash(state); + sources.hash(state); hash_option_expr(predicate, self.expr_arena, state); file_options.hash(state); }, @@ -254,7 +254,7 @@ impl HashableEqLP<'_> { ) => expr_ir_eq(l, r, self.expr_arena), ( IR::Scan { - paths: pl, + sources: pl, file_info: _, hive_parts: _, predicate: pred_l, @@ -263,7 +263,7 @@ impl HashableEqLP<'_> { file_options: ol, }, IR::Scan { - paths: pr, + sources: pr, file_info: _, hive_parts: _, predicate: pred_r, @@ -272,7 +272,7 @@ impl HashableEqLP<'_> { file_options: or, }, ) => { - pl == pr + pl.as_paths() == pr.as_paths() && stl == str && ol == or && opt_expr_ir_eq(pred_l, pred_r, self.expr_arena) diff --git a/crates/polars-python/src/file.rs b/crates/polars-python/src/file.rs index bcdfd7ff6ee7..9443fd0b5213 100644 --- a/crates/polars-python/src/file.rs +++ b/crates/polars-python/src/file.rs @@ -7,6 +7,7 @@ use std::io::{Cursor, ErrorKind, Read, Seek, SeekFrom, Write}; #[cfg(target_family = "unix")] use std::os::fd::{FromRawFd, RawFd}; use std::path::PathBuf; +use std::sync::Arc; use polars::io::mmap::MmapBytesReader; use polars_error::{polars_err, polars_warn}; @@ -31,6 +32,10 @@ impl PyFileLikeObject { PyFileLikeObject { inner: object } } + pub fn as_arc(&self) -> Arc<[u8]> { + self.as_file_buffer().into_inner().into() + } + pub fn as_buffer(&self) -> std::io::Cursor> { let data = self.as_file_buffer().into_inner(); std::io::Cursor::new(data) @@ -191,6 +196,65 @@ impl EitherRustPythonFile { } } +pub enum EitherPythonFileOrPath { + Py(PyFileLikeObject), + Path(PathBuf), +} + +pub fn get_either_file_or_path( + py_f: PyObject, + write: bool, +) -> PyResult { + Python::with_gil(|py| { + let py_f = py_f.into_bound(py); + if let Ok(s) = py_f.extract::>() { + let file_path = std::path::Path::new(&*s); + let file_path = resolve_homedir(file_path); + Ok(EitherPythonFileOrPath::Path(file_path)) + } else { + let io = py.import_bound("io").unwrap(); + let is_utf8_encoding = |py_f: &Bound| -> PyResult { + let encoding = py_f.getattr("encoding")?; + let encoding = encoding.extract::>()?; + Ok(encoding.eq_ignore_ascii_case("utf-8") || encoding.eq_ignore_ascii_case("utf8")) + }; + + // BytesIO is relatively fast, and some code relies on it. + if !py_f.is_exact_instance(&io.getattr("BytesIO").unwrap()) { + polars_warn!("Polars found a filename. \ + Ensure you pass a path to the file instead of a python file object when possible for best \ + performance."); + } + // Unwrap TextIOWrapper + // Allow subclasses to allow things like pytest.capture.CaptureIO + let py_f = if py_f + .is_instance(&io.getattr("TextIOWrapper").unwrap()) + .unwrap_or_default() + { + if !is_utf8_encoding(&py_f)? 
{ + return Err(PyPolarsErr::from( + polars_err!(InvalidOperation: "file encoding is not UTF-8"), + ) + .into()); + } + // XXX: we have to clear buffer here. + // Is there a better solution? + if write { + py_f.call_method0("flush")?; + } else { + py_f.call_method1("seek", (0, 1))?; + } + py_f.getattr("buffer")? + } else { + py_f + }; + PyFileLikeObject::ensure_requirements(&py_f, !write, write, !write)?; + let f = PyFileLikeObject::new(py_f.to_object(py)); + Ok(EitherPythonFileOrPath::Py(f)) + } + }) +} + fn get_either_file_and_path( py_f: PyObject, write: bool, diff --git a/crates/polars-python/src/lazyframe/general.rs b/crates/polars-python/src/lazyframe/general.rs index 4cfd25258b61..d5f64c9f35ac 100644 --- a/crates/polars-python/src/lazyframe/general.rs +++ b/crates/polars-python/src/lazyframe/general.rs @@ -254,7 +254,7 @@ impl PyLazyFrame { low_memory, cloud_options, use_statistics, hive_partitioning, hive_schema, try_parse_hive_dates, retries, glob, include_file_paths) )] fn new_from_parquet( - path: Option, + path: Option, paths: Vec, n_rows: Option, cache: bool, @@ -271,15 +271,54 @@ impl PyLazyFrame { glob: bool, include_file_paths: Option, ) -> PyResult { + use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; + let parallel = parallel.0; let hive_schema = hive_schema.map(|s| Arc::new(s.0)); - let first_path = if let Some(path) = &path { - path - } else { - paths + use polars_plan::plans::ScanSource; + use EitherPythonFileOrPath as EF; + let use_first_path = path.is_some(); + let first_path = match path + .map(|py_f| get_either_file_or_path(py_f, false)) + .transpose()? + { + Some(EF::Path(path)) => path, + Some(EF::Py(f)) => { + let scan_source = ScanSource::Buffer(f.as_arc()); + + let row_index = row_index.map(|(name, offset)| RowIndex { + name: name.into(), + offset, + }); + + let args = ScanArgsParquet { + n_rows, + cache, + parallel, + rechunk, + row_index, + low_memory, + cloud_options: None, + use_statistics, + hive_options: HiveOptions { + enabled: hive_partitioning, + hive_start_idx: 0, + schema: hive_schema, + try_parse_dates: try_parse_hive_dates, + }, + glob, + include_file_paths: include_file_paths.map(|x| x.into()), + }; + + let lf = LazyFrame::scan_parquet_sourced(scan_source, args) + .map_err(PyPolarsErr::from)?; + return Ok(lf.into()); + }, + None => paths .first() - .ok_or_else(|| PyValueError::new_err("expected a path argument"))? 
+ .cloned() + .ok_or_else(|| PyValueError::new_err("expected a path argument"))?, }; #[cfg(feature = "cloud")] @@ -322,7 +361,7 @@ impl PyLazyFrame { include_file_paths: include_file_paths.map(|x| x.into()), }; - let lf = if path.is_some() { + let lf = if use_first_path { LazyFrame::scan_parquet(first_path, args) } else { LazyFrame::scan_parquet_files(Arc::from(paths), args) diff --git a/crates/polars-python/src/lazyframe/visitor/nodes.rs b/crates/polars-python/src/lazyframe/visitor/nodes.rs index 02960a1aad23..973a4ce432ef 100644 --- a/crates/polars-python/src/lazyframe/visitor/nodes.rs +++ b/crates/polars-python/src/lazyframe/visitor/nodes.rs @@ -317,7 +317,7 @@ pub(crate) fn into_py(py: Python<'_>, plan: &IR) -> PyResult { )) }, IR::Scan { - paths, + sources, file_info: _, hive_parts: _, predicate, @@ -325,7 +325,7 @@ pub(crate) fn into_py(py: Python<'_>, plan: &IR) -> PyResult { scan_type, file_options, } => Scan { - paths: paths.to_object(py), + paths: sources.into_paths().to_object(py), // TODO: file info file_info: py.None(), predicate: predicate.as_ref().map(|e| e.into()), diff --git a/crates/polars-stream/src/nodes/parquet_source.rs b/crates/polars-stream/src/nodes/parquet_source.rs index f167f12b6fdc..bf5d4262fed6 100644 --- a/crates/polars-stream/src/nodes/parquet_source.rs +++ b/crates/polars-stream/src/nodes/parquet_source.rs @@ -46,7 +46,7 @@ type AsyncTaskData = Option<( #[allow(clippy::type_complexity)] pub struct ParquetSourceNode { - paths: Arc>, + paths: Arc<[PathBuf]>, file_info: FileInfo, hive_parts: Option>>, predicate: Option>, @@ -71,7 +71,7 @@ pub struct ParquetSourceNode { #[allow(clippy::too_many_arguments)] impl ParquetSourceNode { pub fn new( - paths: Arc>, + paths: Arc<[PathBuf]>, file_info: FileInfo, hive_parts: Option>>, predicate: Option>, @@ -1355,7 +1355,7 @@ struct SharedFileState { /// Turns row group data into DataFrames. struct RowGroupDecoder { - paths: Arc>, + paths: Arc<[PathBuf]>, hive_partitions: Option>>, hive_partitions_width: usize, include_file_paths: Option, diff --git a/crates/polars-stream/src/physical_plan/lower_ir.rs b/crates/polars-stream/src/physical_plan/lower_ir.rs index b9693e6c3c56..d50d90afe52a 100644 --- a/crates/polars-stream/src/physical_plan/lower_ir.rs +++ b/crates/polars-stream/src/physical_plan/lower_ir.rs @@ -331,7 +331,7 @@ pub fn lower_ir( v @ IR::Scan { .. 
} => { let IR::Scan { - paths, + sources, file_info, hive_parts, output_schema, @@ -343,6 +343,8 @@ pub fn lower_ir( unreachable!(); }; + let paths = sources.into_paths(); + PhysNodeKind::FileScan { paths, file_info, diff --git a/crates/polars-stream/src/physical_plan/mod.rs b/crates/polars-stream/src/physical_plan/mod.rs index 99103343565a..d22a5f968900 100644 --- a/crates/polars-stream/src/physical_plan/mod.rs +++ b/crates/polars-stream/src/physical_plan/mod.rs @@ -119,7 +119,7 @@ pub enum PhysNodeKind { }, FileScan { - paths: Arc>, + paths: Arc<[PathBuf]>, file_info: FileInfo, hive_parts: Option>>, predicate: Option, diff --git a/crates/polars-stream/src/utils/late_materialized_df.rs b/crates/polars-stream/src/utils/late_materialized_df.rs index 2173598d5369..87fe97135aad 100644 --- a/crates/polars-stream/src/utils/late_materialized_df.rs +++ b/crates/polars-stream/src/utils/late_materialized_df.rs @@ -4,7 +4,7 @@ use parking_lot::Mutex; use polars_core::frame::DataFrame; use polars_core::schema::Schema; use polars_error::PolarsResult; -use polars_plan::plans::{AnonymousScan, AnonymousScanArgs, FileInfo, FileScan, IR}; +use polars_plan::plans::{AnonymousScan, AnonymousScanArgs, FileInfo, FileScan, ScanSource, IR}; use polars_plan::prelude::{AnonymousScanOptions, FileScanOptions}; /// Used to insert a dataframe into in-memory-engine query plan after the query @@ -25,7 +25,7 @@ impl LateMaterializedDataFrame { fmt_str: "LateMaterializedDataFrame", }); IR::Scan { - paths: Arc::new(vec![]), + sources: ScanSource::Files(Arc::default()), file_info: FileInfo::new(schema, None, (None, usize::MAX)), hive_parts: None, predicate: None, diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 90b6137c4924..0fc52142e5de 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -295,7 +295,7 @@ def read_parquet_schema(source: str | Path | IO[bytes] | bytes) -> dict[str, Dat @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_parquet( - source: str | Path | list[str] | list[Path], + source: str | Path | list[str] | list[Path] | io.BytesIO, *, n_rows: int | None = None, row_index_name: str | None = None, @@ -422,6 +422,8 @@ def scan_parquet( if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) + elif isinstance(source, io.BytesIO): + pass else: source = [ normalize_filepath(source, check_not_directory=False) for source in source @@ -448,7 +450,7 @@ def scan_parquet( def _scan_parquet_impl( - source: str | list[str] | list[Path], + source: str | list[str] | list[Path] | io.BytesIO, *, n_rows: int | None = None, cache: bool = True, diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index 9ec82b991f39..b46f21f3893e 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -34,12 +34,12 @@ def test_round_trip(df: pl.DataFrame) -> None: assert_frame_equal(pl.read_parquet(f), df) -def test_scan_round_trip(tmp_path: Path, df: pl.DataFrame) -> None: - tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - +def test_scan_round_trip(df: pl.DataFrame) -> None: + f = io.BytesIO() df.write_parquet(f) + f.seek(0) assert_frame_equal(pl.scan_parquet(f).collect(), df) + f.seek(0) assert_frame_equal(pl.scan_parquet(f).head().collect(), df.head()) @@ -919,8 +919,7 
@@ def test_parquet_array_dtype_nulls() -> None: ), ], ) -@pytest.mark.write_disk -def test_complex_types(tmp_path: Path, series: list[Any], dtype: pl.DataType) -> None: +def test_complex_types(series: list[Any], dtype: pl.DataType) -> None: xs = pl.Series(series, dtype=dtype) df = pl.DataFrame({"x": xs}) @@ -981,20 +980,18 @@ def test_read_parquet_only_loads_selected_columns_15098( @pytest.mark.release -@pytest.mark.write_disk -def test_max_statistic_parquet_writer(tmp_path: Path) -> None: +def test_max_statistic_parquet_writer() -> None: # this hits the maximal page size # so the row group will be split into multiple pages # the page statistics need to be correctly reduced # for this query to make sense n = 150_000 - tmp_path.mkdir(exist_ok=True) - # int64 is important to hit the page size df = pl.int_range(0, n, eager=True, dtype=pl.Int64).alias("int").to_frame() - f = tmp_path / "tmp.parquet" + f = io.BytesIO() df.write_parquet(f, statistics=True, use_pyarrow=False, row_group_size=n) + f.seek(0) result = pl.scan_parquet(f).filter(pl.col("int") > n - 3).collect() expected = pl.DataFrame({"int": [149998, 149999]}) assert_frame_equal(result, expected) @@ -1088,14 +1085,11 @@ def test_hybrid_rle() -> None: ) ) @pytest.mark.slow -@pytest.mark.write_disk -@settings(suppress_health_check=[HealthCheck.function_scoped_fixture]) -def test_roundtrip_parametric(df: pl.DataFrame, tmp_path: Path) -> None: - # delete if exists - path = tmp_path / "data.parquet" - - df.write_parquet(path) - result = pl.read_parquet(path) +def test_roundtrip_parametric(df: pl.DataFrame) -> None: + f = io.BytesIO() + df.write_parquet(f) + f.seek(0) + result = pl.read_parquet(f) assert_frame_equal(df, result) @@ -1207,18 +1201,14 @@ def test_read_byte_stream_split_arrays( assert_frame_equal(read, df) -@pytest.mark.write_disk -def test_parquet_nested_null_array_17795(tmp_path: Path) -> None: - filename = tmp_path / "nested_null.parquet" - - pl.DataFrame([{"struct": {"field": None}}]).write_parquet(filename) - pq.read_table(filename) - +def test_parquet_nested_null_array_17795() -> None: + f = io.BytesIO() + pl.DataFrame([{"struct": {"field": None}}]).write_parquet(f) + f.seek(0) + pq.read_table(f) -@pytest.mark.write_disk -def test_parquet_record_batches_pyarrow_fixed_size_list_16614(tmp_path: Path) -> None: - filename = tmp_path / "a.parquet" +def test_parquet_record_batches_pyarrow_fixed_size_list_16614() -> None: # @NOTE: # The minimum that I could get it to crash which was ~132000, but let's # just do 150000 to be sure. 
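The test updates in this file all lean on the same in-memory round trip that this patch enables: write the frame to a BytesIO, rewind it, and scan it directly. A minimal sketch of that usage (assuming a build that includes this change; the example frame is arbitrary):

    import io

    import polars as pl
    from polars.testing import assert_frame_equal

    df = pl.DataFrame({"a": [1, 2, 3]})

    f = io.BytesIO()
    df.write_parquet(f)
    f.seek(0)  # write_parquet leaves the cursor at the end; rewind before scanning

    assert_frame_equal(pl.scan_parquet(f).collect(), df)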
@@ -1228,27 +1218,28 @@ def test_parquet_record_batches_pyarrow_fixed_size_list_16614(tmp_path: Path) -> schema={"x": pl.Array(pl.Float32, 2)}, ) - x.write_parquet(filename) - b = pl.read_parquet(filename, use_pyarrow=True) + f = io.BytesIO() + x.write_parquet(f) + f.seek(0) + b = pl.read_parquet(f, use_pyarrow=True) assert b["x"].shape[0] == n assert_frame_equal(b, x) -@pytest.mark.write_disk -def test_parquet_list_element_field_name(tmp_path: Path) -> None: - filename = tmp_path / "list.parquet" - +def test_parquet_list_element_field_name() -> None: + f = io.BytesIO() ( pl.DataFrame( { "a": [[1, 2], [1, 1, 1]], }, schema={"a": pl.List(pl.Int64)}, - ).write_parquet(filename, use_pyarrow=False) + ).write_parquet(f, use_pyarrow=False) ) - schema_str = str(pq.read_schema(filename)) + f.seek(0) + schema_str = str(pq.read_schema(f)) assert "" in schema_str assert "child 0, element: int64" in schema_str @@ -1368,8 +1359,7 @@ def test_parquet_high_nested_null_17805( ) -@pytest.mark.write_disk -def test_struct_plain_encoded_statistics(tmp_path: Path) -> None: +def test_struct_plain_encoded_statistics() -> None: df = pl.DataFrame( { "a": [None, None, None, None, {"x": None, "y": 0}], @@ -1377,17 +1367,12 @@ def test_struct_plain_encoded_statistics(tmp_path: Path) -> None: schema={"a": pl.Struct({"x": pl.Int8, "y": pl.Int8})}, ) - test_scan_round_trip(tmp_path, df) + test_scan_round_trip(df) @given(df=dataframes(min_size=5, excluded_dtypes=[pl.Decimal, pl.Categorical])) -@settings( - max_examples=100, - deadline=None, - suppress_health_check=[HealthCheck.function_scoped_fixture], -) -def test_scan_round_trip_parametric(tmp_path: Path, df: pl.DataFrame) -> None: - test_scan_round_trip(tmp_path, df) +def test_scan_round_trip_parametric(df: pl.DataFrame) -> None: + test_scan_round_trip(df) def test_empty_rg_no_dict_page_18146() -> None: @@ -1532,13 +1517,7 @@ def test_delta_strings_encoding_roundtrip( r2=st.integers(min_value=0, max_value=1000), ) @pytest.mark.parametrize("parallel_st", ["auto", "prefiltered"]) -@settings( - deadline=None, - suppress_health_check=[HealthCheck.function_scoped_fixture], -) -@pytest.mark.write_disk def test_predicate_filtering( - tmp_path: Path, df: pl.DataFrame, first_op: str, second_op: None | tuple[str, str], @@ -1548,9 +1527,7 @@ def test_predicate_filtering( r2: int, parallel_st: Literal["auto", "prefiltered"], ) -> None: - tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - + f = io.BytesIO() df.write_parquet(f, row_group_size=5) cols = df.columns @@ -1566,6 +1543,7 @@ def test_predicate_filtering( (getattr(pl.col(r1s), second_op[1]))(pl.col(r2s)) ) + f.seek(0) result = pl.scan_parquet(f, parallel=parallel_st).filter(expr).collect() assert_frame_equal(result, df.filter(expr)) @@ -1581,33 +1559,26 @@ def test_predicate_filtering( offset=st.integers(0, 100), length=st.integers(0, 100), ) -@settings( - suppress_health_check=[HealthCheck.function_scoped_fixture], -) -@pytest.mark.write_disk def test_slice_roundtrip( - df: pl.DataFrame, offset: int, length: int, tmp_path: Path + df: pl.DataFrame, offset: int, length: int ) -> None: - tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - offset %= df.height + 1 length %= df.height - offset + 1 + f = io.BytesIO() df.write_parquet(f) + f.seek(0) scanned = pl.scan_parquet(f).slice(offset, length).collect() assert_frame_equal(scanned, df.slice(offset, length)) -@pytest.mark.write_disk -def test_struct_prefiltered(tmp_path: Path) -> None: - tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - 
+def test_struct_prefiltered() -> None: df = pl.DataFrame({"a": {"x": 1, "y": 2}}) + f = io.BytesIO() df.write_parquet(f) + f.seek(0) ( pl.scan_parquet(f, parallel="prefiltered") .filter(pl.col("a").struct.field("x") == 1) @@ -1641,19 +1612,17 @@ def test_struct_prefiltered(tmp_path: Path) -> None: ], ) @pytest.mark.parametrize("nullable", [False, True]) -@pytest.mark.write_disk def test_nested_skip_18303( data: tuple[list[dict[str, str] | list[str]], pa.DataType], nullable: bool, - tmp_path: Path, ) -> None: - tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - schema = pa.schema([pa.field("a", data[1], nullable=nullable)]) tb = pa.table({"a": data[0]}, schema=schema) + + f = io.BytesIO() pq.write_table(tb, f) + f.seek(0) scanned = pl.scan_parquet(f).slice(1, 1).collect() assert_frame_equal(scanned, pl.DataFrame(tb).slice(1, 1)) @@ -1697,20 +1666,12 @@ def test_nested_span_multiple_pages_18400() -> None: include_cols=[column("filter_col", pl.Boolean, allow_null=False)], ), ) -@pytest.mark.write_disk -@settings( - suppress_health_check=[HealthCheck.function_scoped_fixture], -) -def test_parametric_small_page_mask_filtering( - tmp_path: Path, - df: pl.DataFrame, -) -> None: - tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - +def test_parametric_small_page_mask_filtering(df: pl.DataFrame) -> None: + f = io.BytesIO() df.write_parquet(f, data_page_size=1024) expr = pl.col("filter_col") + f.seek(0) result = pl.scan_parquet(f, parallel="prefiltered").filter(expr).collect() assert_frame_equal(result, df.filter(expr)) @@ -1756,23 +1717,13 @@ def test_different_page_validity_across_pages(value: str | int | float | bool) - ], ), ) -@settings( - deadline=None, - suppress_health_check=[HealthCheck.function_scoped_fixture], -) -@pytest.mark.write_disk -def test_delta_length_byte_array_prefiltering( - tmp_path: Path, - df: pl.DataFrame, -) -> None: - tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - +def test_delta_length_byte_array_prefiltering(df: pl.DataFrame) -> None: cols = df.columns encodings = {col: "DELTA_LENGTH_BYTE_ARRAY" for col in cols} encodings["filter_col"] = "PLAIN" + f = io.BytesIO() pq.write_table( df.to_arrow(), f, @@ -1780,6 +1731,7 @@ def test_delta_length_byte_array_prefiltering( column_encoding=encodings, ) + f.seek(0) expr = pl.col("filter_col") == 0 result = pl.scan_parquet(f, parallel="prefiltered").filter(expr).collect() assert_frame_equal(result, df.filter(expr)) @@ -1797,22 +1749,13 @@ def test_delta_length_byte_array_prefiltering( ], ), ) -@settings( - deadline=None, - suppress_health_check=[HealthCheck.function_scoped_fixture], -) -@pytest.mark.write_disk -def test_general_prefiltering( - tmp_path: Path, - df: pl.DataFrame, -) -> None: - tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - +def test_general_prefiltering(df: pl.DataFrame) -> None: + f = io.BytesIO() df.write_parquet(f) expr = pl.col("filter_col") == 0 + f.seek(0) result = pl.scan_parquet(f, parallel="prefiltered").filter(expr).collect() assert_frame_equal(result, df.filter(expr)) @@ -1827,22 +1770,13 @@ def test_general_prefiltering( include_cols=[column("filter_col", pl.Boolean, allow_null=False)], ), ) -@settings( - deadline=None, - suppress_health_check=[HealthCheck.function_scoped_fixture], -) -@pytest.mark.write_disk -def test_row_index_prefiltering( - tmp_path: Path, - df: pl.DataFrame, -) -> None: - tmp_path.mkdir(exist_ok=True) - f = tmp_path / "test.parquet" - +def test_row_index_prefiltering(df: pl.DataFrame) -> None: + f = io.BytesIO() 
df.write_parquet(f) expr = pl.col("filter_col") + f.seek(0) result = ( pl.scan_parquet( f, row_index_name="ri", row_index_offset=42, parallel="prefiltered" From e14b78ce05c811ff6b68964c3a591e0cae904400 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Tue, 3 Sep 2024 16:18:10 +0200 Subject: [PATCH 02/27] refactor: Add ScanSource to CountStar --- crates/polars-io/src/csv/read/mod.rs | 2 +- crates/polars-io/src/csv/read/parser.rs | 36 ++- .../polars-plan/src/plans/functions/count.rs | 215 +++++++++++------- crates/polars-plan/src/plans/functions/mod.rs | 15 +- crates/polars-plan/src/plans/ir/format.rs | 15 +- crates/polars-plan/src/plans/ir/mod.rs | 9 +- .../src/plans/optimizer/count_star.rs | 15 +- .../src/lazyframe/visitor/nodes.rs | 7 +- 8 files changed, 189 insertions(+), 125 deletions(-) diff --git a/crates/polars-io/src/csv/read/mod.rs b/crates/polars-io/src/csv/read/mod.rs index 969be1a58908..b9d48291f8ce 100644 --- a/crates/polars-io/src/csv/read/mod.rs +++ b/crates/polars-io/src/csv/read/mod.rs @@ -26,7 +26,7 @@ mod splitfields; mod utils; pub use options::{CommentPrefix, CsvEncoding, CsvParseOptions, CsvReadOptions, NullValues}; -pub use parser::count_rows; +pub use parser::{count_rows, count_rows_from_slice}; pub use read_impl::batched::{BatchedCsvReader, OwnedBatchedCsvReader}; pub use reader::CsvReader; pub use schema_inference::infer_file_schema; diff --git a/crates/polars-io/src/csv/read/parser.rs b/crates/polars-io/src/csv/read/parser.rs index 18e6ef5f3f6d..9d2852a02c82 100644 --- a/crates/polars-io/src/csv/read/parser.rs +++ b/crates/polars-io/src/csv/read/parser.rs @@ -54,12 +54,32 @@ pub fn count_rows( reader_bytes = &reader_bytes[1..]; } + count_rows_from_slice( + reader_bytes, + separator, + quote_char, + comment_prefix, + eol_char, + has_header, + ) +} + +/// Read the number of rows without parsing columns +/// useful for count(*) queries +pub fn count_rows_from_slice( + bytes: &[u8], + separator: u8, + quote_char: Option, + comment_prefix: Option<&CommentPrefix>, + eol_char: u8, + has_header: bool, +) -> PolarsResult { const MIN_ROWS_PER_THREAD: usize = 1024; let max_threads = POOL.current_num_threads(); // Determine if parallelism is beneficial and how many threads let n_threads = get_line_stats( - reader_bytes, + bytes, MIN_ROWS_PER_THREAD, eol_char, None, @@ -67,22 +87,16 @@ pub fn count_rows( quote_char, ) .map(|(mean, std)| { - let n_rows = (reader_bytes.len() as f32 / (mean - 0.01 * std)) as usize; + let n_rows = (bytes.len() as f32 / (mean - 0.01 * std)) as usize; (n_rows / MIN_ROWS_PER_THREAD).clamp(1, max_threads) }) .unwrap_or(1); - let file_chunks: Vec<(usize, usize)> = get_file_chunks( - reader_bytes, - n_threads, - None, - separator, - quote_char, - eol_char, - ); + let file_chunks: Vec<(usize, usize)> = + get_file_chunks(bytes, n_threads, None, separator, quote_char, eol_char); let iter = file_chunks.into_par_iter().map(|(start, stop)| { - let local_bytes = &reader_bytes[start..stop]; + let local_bytes = &bytes[start..stop]; let row_iterator = SplitLines::new(local_bytes, quote_char.unwrap_or(b'"'), eol_char); if comment_prefix.is_some() { Ok(row_iterator diff --git a/crates/polars-plan/src/plans/functions/count.rs b/crates/polars-plan/src/plans/functions/count.rs index bd68db61a06c..dca574e67808 100644 --- a/crates/polars-plan/src/plans/functions/count.rs +++ b/crates/polars-plan/src/plans/functions/count.rs @@ -3,9 +3,7 @@ use arrow::io::ipc::read::get_row_count as count_rows_ipc_sync; #[cfg(any(feature = "parquet", feature = "json"))] use 
polars_io::cloud::CloudOptions; #[cfg(feature = "csv")] -use polars_io::csv::read::count_rows as count_rows_csv; -#[cfg(any(feature = "parquet", feature = "ipc", feature = "json"))] -use polars_io::is_cloud_url; +use polars_io::csv::read::{count_rows as count_rows_csv, count_rows_from_slice as count_rows_csv_from_slice}; #[cfg(all(feature = "parquet", feature = "cloud"))] use polars_io::parquet::read::ParquetAsyncReader; #[cfg(feature = "parquet")] @@ -18,7 +16,7 @@ use polars_io::SerReader; use super::*; #[allow(unused_variables)] -pub fn count_rows(paths: &Arc>, scan_type: &FileScan) -> PolarsResult { +pub fn count_rows(sources: &Arc<[ScanSource]>, scan_type: &FileScan) -> PolarsResult { #[cfg(not(any( feature = "parquet", feature = "ipc", @@ -41,26 +39,10 @@ pub fn count_rows(paths: &Arc>, scan_type: &FileScan) -> PolarsResu FileScan::Csv { options, cloud_options, - } => { - let parse_options = options.get_parse_options(); - let n_rows: PolarsResult = paths - .iter() - .map(|path| { - count_rows_csv( - path, - parse_options.separator, - parse_options.quote_char, - parse_options.comment_prefix.as_ref(), - parse_options.eol_char, - options.has_header, - ) - }) - .sum(); - n_rows - }, + } => count_all_rows_csv(sources, options), #[cfg(feature = "parquet")] FileScan::Parquet { cloud_options, .. } => { - count_rows_parquet(paths, cloud_options.as_ref()) + count_rows_parquet(sources, cloud_options.as_ref()) }, #[cfg(feature = "ipc")] FileScan::Ipc { @@ -68,7 +50,7 @@ pub fn count_rows(paths: &Arc>, scan_type: &FileScan) -> PolarsResu cloud_options, metadata, } => count_rows_ipc( - paths, + sources, #[cfg(feature = "cloud")] cloud_options.as_ref(), metadata.as_ref(), @@ -77,7 +59,7 @@ pub fn count_rows(paths: &Arc>, scan_type: &FileScan) -> PolarsResu FileScan::NDJson { options, cloud_options, - } => count_rows_ndjson(paths, cloud_options.as_ref()), + } => count_rows_ndjson(sources, cloud_options.as_ref()), FileScan::Anonymous { .. 
} => { unreachable!() }, @@ -92,15 +74,51 @@ pub fn count_rows(paths: &Arc>, scan_type: &FileScan) -> PolarsResu )]) } } + +#[cfg(feature = "csv")] +fn count_all_rows_csv( + sources: &Arc<[ScanSource]>, + options: &polars_io::prelude::CsvReadOptions, +) -> PolarsResult { + let parse_options = options.get_parse_options(); + + sources + .iter() + .map(|source| match source { + ScanSource::Files(paths) => paths + .iter() + .map(|path| { + count_rows_csv( + path, + parse_options.separator, + parse_options.quote_char, + parse_options.comment_prefix.as_ref(), + parse_options.eol_char, + options.has_header, + ) + }) + .sum::>(), + ScanSource::Buffer(buf) => count_rows_csv_from_slice( + &buf[..], + parse_options.separator, + parse_options.quote_char, + parse_options.comment_prefix.as_ref(), + parse_options.eol_char, + options.has_header, + ), + }) + .sum() +} + #[cfg(feature = "parquet")] pub(super) fn count_rows_parquet( - paths: &Arc>, + sources: &Arc<[ScanSource]>, #[allow(unused)] cloud_options: Option<&CloudOptions>, ) -> PolarsResult { - if paths.is_empty() { + if sources.is_empty() { return Ok(0); }; - let is_cloud = is_cloud_url(paths.first().unwrap().as_path()); + let is_cloud = sources.first().unwrap().is_cloud_url()?; if is_cloud { #[cfg(not(feature = "cloud"))] @@ -108,15 +126,19 @@ pub(super) fn count_rows_parquet( #[cfg(feature = "cloud")] { - get_runtime().block_on(count_rows_cloud_parquet(paths, cloud_options)) + get_runtime().block_on(count_rows_cloud_parquet(sources, cloud_options)) } } else { - paths + sources .iter() - .map(|path| { - let file = polars_utils::open_file(path)?; - let mut reader = ParquetReader::new(file); - reader.num_rows() + .map(|source| match source { + ScanSource::Files(paths) => paths + .iter() + .map(|path| ParquetReader::new(polars_utils::open_file(path)?).num_rows()) + .sum::>(), + ScanSource::Buffer(buffer) => { + ParquetReader::new(std::io::Cursor::new(buffer)).num_rows() + }, }) .sum::>() } @@ -124,14 +146,17 @@ pub(super) fn count_rows_parquet( #[cfg(all(feature = "parquet", feature = "async"))] async fn count_rows_cloud_parquet( - paths: &Arc>, + sources: &Arc<[ScanSource]>, cloud_options: Option<&CloudOptions>, ) -> PolarsResult { - let collection = paths.iter().map(|path| { - with_concurrency_budget(1, || async { - let mut reader = - ParquetAsyncReader::from_uri(&path.to_string_lossy(), cloud_options, None).await?; - reader.num_rows().await + let collection = sources.iter().flat_map(|source| { + source.as_paths().iter().map(|path| { + with_concurrency_budget(1, || async { + let mut reader = + ParquetAsyncReader::from_uri(&path.to_string_lossy(), cloud_options, None) + .await?; + reader.num_rows().await + }) }) }); futures::future::try_join_all(collection) @@ -141,14 +166,14 @@ async fn count_rows_cloud_parquet( #[cfg(feature = "ipc")] pub(super) fn count_rows_ipc( - paths: &Arc>, + sources: &Arc<[ScanSource]>, #[cfg(feature = "cloud")] cloud_options: Option<&CloudOptions>, metadata: Option<&arrow::io::ipc::read::FileMetadata>, ) -> PolarsResult { - if paths.is_empty() { + if sources.is_empty() { return Ok(0); }; - let is_cloud = is_cloud_url(paths.first().unwrap().as_path()); + let is_cloud = sources.first().unwrap().is_cloud_url()?; if is_cloud { #[cfg(not(feature = "cloud"))] @@ -156,31 +181,41 @@ pub(super) fn count_rows_ipc( #[cfg(feature = "cloud")] { - get_runtime().block_on(count_rows_cloud_ipc(paths, cloud_options, metadata)) + get_runtime().block_on(count_rows_cloud_ipc(sources, cloud_options, metadata)) } } else { - paths + sources 
.iter() - .map(|path| { - let mut reader = polars_utils::open_file(path)?; - count_rows_ipc_sync(&mut reader).map(|v| v as usize) + .map(|source| match source { + ScanSource::Files(paths) => paths + .iter() + .map(|path| { + count_rows_ipc_sync(&mut polars_utils::open_file(path)?).map(|v| v as usize) + }) + .sum::>(), + ScanSource::Buffer(buffer) => { + count_rows_ipc_sync(&mut std::io::Cursor::new(buffer)).map(|v| v as usize) + }, }) - .sum() + .sum::>() } } #[cfg(all(feature = "ipc", feature = "async"))] async fn count_rows_cloud_ipc( - paths: &Arc>, + sources: &Arc<[ScanSource]>, cloud_options: Option<&CloudOptions>, metadata: Option<&arrow::io::ipc::read::FileMetadata>, ) -> PolarsResult { use polars_io::ipc::IpcReaderAsync; - let collection = paths.iter().map(|path| { - with_concurrency_budget(1, || async { - let reader = IpcReaderAsync::from_uri(&path.to_string_lossy(), cloud_options).await?; - reader.count_rows(metadata).await + let collection = sources.iter().flat_map(|source| { + source.as_paths().iter().map(|path| { + with_concurrency_budget(1, || async { + let reader = + IpcReaderAsync::from_uri(&path.to_string_lossy(), cloud_options).await?; + reader.count_rows(metadata).await + }) }) }); futures::future::try_join_all(collection) @@ -190,21 +225,23 @@ async fn count_rows_cloud_ipc( #[cfg(feature = "json")] pub(super) fn count_rows_ndjson( - paths: &Arc>, + sources: &Arc<[ScanSource]>, cloud_options: Option<&CloudOptions>, ) -> PolarsResult { use polars_core::config; + use polars_core::error::feature_gated; use polars_io::utils::maybe_decompress_bytes; - let run_async = !paths.is_empty() && is_cloud_url(&paths[0]) || config::force_async(); + let run_async = + !sources.is_empty() && sources.first().unwrap().is_cloud_url()? || config::force_async(); let cache_entries = { - #[cfg(feature = "cloud")] - { + feature_gated!("cloud", { if run_async { Some(polars_io::file_cache::init_entries_from_uri_list( - paths + sources .iter() + .flat_map(|source| source.as_paths()) .map(|path| Arc::from(path.to_str().unwrap())) .collect::>() .as_slice(), @@ -213,39 +250,43 @@ pub(super) fn count_rows_ndjson( } else { None } - } - #[cfg(not(feature = "cloud"))] - { - if run_async { - panic!("required feature `cloud` is not enabled") - } - } + }) }; - (0..paths.len()) - .map(|i| { - let f = if run_async { - #[cfg(feature = "cloud")] - { - let entry: &Arc = - &cache_entries.as_ref().unwrap()[0]; - entry.try_open_check_latest()? - } - #[cfg(not(feature = "cloud"))] - { - panic!("required feature `cloud` is not enabled") - } - } else { - polars_utils::open_file(&paths[i])? - }; + sources + .iter() + .map(|source| match source { + ScanSource::Files(paths) => paths + .iter() + .map(|path| { + let f = if run_async { + feature_gated!("cloud", { + let entry: &Arc = + &cache_entries.as_ref().unwrap()[0]; + entry.try_open_check_latest()? + }) + } else { + polars_utils::open_file(path)? 
+ }; - let mmap = unsafe { memmap::Mmap::map(&f).unwrap() }; - let owned = &mut vec![]; + let mmap = unsafe { memmap::Mmap::map(&f).unwrap() }; + let owned = &mut vec![]; - let reader = polars_io::ndjson::core::JsonLineReader::new(std::io::Cursor::new( - maybe_decompress_bytes(mmap.as_ref(), owned)?, - )); - reader.count() + let reader = polars_io::ndjson::core::JsonLineReader::new( + std::io::Cursor::new(maybe_decompress_bytes(mmap.as_ref(), owned)?), + ); + reader.count() + }) + .sum::>(), + ScanSource::Buffer(buffer) => { + polars_ensure!(!run_async, nyi = "BytesIO with force_async"); + + let owned = &mut vec![]; + let reader = polars_io::ndjson::core::JsonLineReader::new(std::io::Cursor::new( + maybe_decompress_bytes(buffer, owned)?, + )); + reader.count() + }, }) .sum() } diff --git a/crates/polars-plan/src/plans/functions/mod.rs b/crates/polars-plan/src/plans/functions/mod.rs index b0e5bb444689..37e01bcc654f 100644 --- a/crates/polars-plan/src/plans/functions/mod.rs +++ b/crates/polars-plan/src/plans/functions/mod.rs @@ -10,7 +10,6 @@ mod schema; use std::borrow::Cow; use std::fmt::{Debug, Display, Formatter}; use std::hash::{Hash, Hasher}; -use std::path::PathBuf; use std::sync::{Arc, Mutex}; pub use dsl::*; @@ -45,7 +44,7 @@ pub enum FunctionIR { fmt_str: PlSmallStr, }, FastCount { - paths: Arc>, + sources: Arc<[ScanSource]>, scan_type: FileScan, alias: Option, }, @@ -104,8 +103,8 @@ impl PartialEq for FunctionIR { use FunctionIR::*; match (self, other) { (Rechunk, Rechunk) => true, - (FastCount { paths: paths_l, .. }, FastCount { paths: paths_r, .. }) => { - paths_l == paths_r + (FastCount { sources: srcs_l, .. }, FastCount { sources: srcs_r, .. }) => { + srcs_l == srcs_r }, ( Rename { @@ -138,11 +137,11 @@ impl Hash for FunctionIR { FunctionIR::OpaquePython { .. } => {}, FunctionIR::Opaque { fmt_str, .. } => fmt_str.hash(state), FunctionIR::FastCount { - paths, + sources, scan_type, alias, } => { - paths.hash(state); + sources.hash(state); scan_type.hash(state); alias.hash(state); }, @@ -261,8 +260,8 @@ impl FunctionIR { .. }) => python_udf::call_python_udf(function, df, *validate_output, schema.as_deref()), FastCount { - paths, scan_type, .. - } => count::count_rows(paths, scan_type), + sources, scan_type, .. 
+ } => count::count_rows(sources, scan_type), Rechunk => { df.as_single_chunk_par(); Ok(df) diff --git a/crates/polars-plan/src/plans/ir/format.rs b/crates/polars-plan/src/plans/ir/format.rs index 6c1c37b78671..cc64daf67a30 100644 --- a/crates/polars-plan/src/plans/ir/format.rs +++ b/crates/polars-plan/src/plans/ir/format.rs @@ -1,7 +1,6 @@ use std::borrow::Cow; use std::fmt; use std::fmt::{Display, Formatter}; -use std::path::PathBuf; use polars_core::datatypes::AnyValue; use polars_core::schema::Schema; @@ -56,7 +55,7 @@ impl AsExpr for ExprIR { fn write_scan( f: &mut Formatter, name: &str, - path: &[PathBuf], + source: &ScanSource, indent: usize, n_columns: i64, total_columns: usize, @@ -64,7 +63,12 @@ fn write_scan( slice: Option<(i64, usize)>, row_index: Option<&RowIndex>, ) -> fmt::Result { - write!(f, "{:indent$}{name} SCAN {}", "", PathsDisplay(path))?; + write!(f, "{:indent$}{name} SCAN ", "")?; + + match source { + ScanSource::Files(paths) => write!(f, "{}", PathsDisplay(paths.as_ref()))?, + ScanSource::Buffer(_) => write!(f, "IN MEMORY BUFFER")?, + } let total_columns = total_columns - usize::from(row_index.is_some()); if n_columns > 0 { @@ -171,7 +175,7 @@ impl<'a> IRDisplay<'a> { write_scan( f, "PYTHON", - &[], + &ScanSource::default(), indent, n_columns, total_columns, @@ -228,7 +232,6 @@ impl<'a> IRDisplay<'a> { file_options, .. } => { - let paths = sources.as_paths(); let n_columns = file_options .with_columns .as_ref() @@ -240,7 +243,7 @@ impl<'a> IRDisplay<'a> { write_scan( f, scan_type.into(), - paths, + &sources, indent, n_columns, file_info.schema.len(), diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index 7062514f7689..52593e3d2b28 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -36,7 +36,7 @@ pub struct IRPlanRef<'a> { } #[cfg_attr(feature = "ir_serde", derive(Serialize, Deserialize))] -#[derive(Debug, Clone, Hash)] +#[derive(Debug, Clone, Hash, PartialEq, Eq)] pub enum ScanSource { Files(Arc<[PathBuf]>), #[cfg_attr(feature = "ir_serde", serde(skip))] @@ -62,6 +62,13 @@ impl ScanSource { } } + pub fn try_into_paths(&self) -> PolarsResult> { + match self { + ScanSource::Files(paths) => Ok(paths.clone()), + ScanSource::Buffer(_) => Err(polars_err!(nyi = "Unable to convert BytesIO scan into path")), + } + } + pub fn into_paths(&self) -> Arc<[PathBuf]> { match self { ScanSource::Files(paths) => paths.clone(), diff --git a/crates/polars-plan/src/plans/optimizer/count_star.rs b/crates/polars-plan/src/plans/optimizer/count_star.rs index 8565e066dcb4..d88956d2903f 100644 --- a/crates/polars-plan/src/plans/optimizer/count_star.rs +++ b/crates/polars-plan/src/plans/optimizer/count_star.rs @@ -1,5 +1,3 @@ -use std::path::PathBuf; - use super::*; pub(super) struct CountStar; @@ -32,7 +30,7 @@ impl OptimizationRule for CountStar { let alp = IR::MapFunction { input: placeholder_node, function: FunctionIR::FastCount { - paths: count_star_expr.paths, + sources: count_star_expr.sources, scan_type: count_star_expr.scan_type, alias: count_star_expr.alias, }, @@ -49,7 +47,7 @@ struct CountStarExpr { // Top node of the projection to replace node: Node, // Paths to the input files - paths: Arc>, + sources: Arc<[ScanSource]>, // File Type scan_type: FileScan, // Column Alias @@ -67,11 +65,11 @@ fn visit_logical_plan_for_scan_paths( match lp_arena.get(node) { IR::Union { inputs, .. 
} => { let mut scan_type: Option = None; - let mut paths = Vec::with_capacity(inputs.len()); + let mut sources = Vec::with_capacity(inputs.len()); for input in inputs { match visit_logical_plan_for_scan_paths(*input, lp_arena, expr_arena, true) { Some(expr) => { - paths.extend(expr.paths.iter().cloned()); + sources.extend(expr.sources.iter().cloned()); match &scan_type { None => scan_type = Some(expr.scan_type), Some(scan_type) => { @@ -88,7 +86,7 @@ fn visit_logical_plan_for_scan_paths( } } Some(CountStarExpr { - paths: paths.into(), + sources: sources.into(), scan_type: scan_type.unwrap(), node, alias: None, @@ -97,8 +95,7 @@ fn visit_logical_plan_for_scan_paths( IR::Scan { scan_type, sources, .. } if !matches!(scan_type, FileScan::Anonymous { .. }) => Some(CountStarExpr { - // @FIX: Count Star Should probably just have a Arc Slice - paths: Arc::new(sources.as_paths().as_ref().to_vec()), + sources: [sources.clone()].into(), scan_type: scan_type.clone(), node, alias: None, diff --git a/crates/polars-python/src/lazyframe/visitor/nodes.rs b/crates/polars-python/src/lazyframe/visitor/nodes.rs index 973a4ce432ef..3c31ff11b63a 100644 --- a/crates/polars-python/src/lazyframe/visitor/nodes.rs +++ b/crates/polars-python/src/lazyframe/visitor/nodes.rs @@ -325,7 +325,10 @@ pub(crate) fn into_py(py: Python<'_>, plan: &IR) -> PyResult { scan_type, file_options, } => Scan { - paths: sources.into_paths().to_object(py), + paths: sources + .try_into_paths() + .map_err(|_| PyNotImplementedError::new_err("scan with BytesIO"))? + .to_object(py), // TODO: file info file_info: py.None(), predicate: predicate.as_ref().map(|e| e.into()), @@ -596,7 +599,7 @@ pub(crate) fn into_py(py: Python<'_>, plan: &IR) -> PyResult { offset, } => ("row_index", name.to_string(), offset.unwrap_or(0)).to_object(py), FunctionIR::FastCount { - paths: _, + sources: _, scan_type: _, alias: _, } => return Err(PyNotImplementedError::new_err("function count")), From 3e72f51a936495db4cea5387cc71118101e1c095 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Tue, 3 Sep 2024 17:35:28 +0200 Subject: [PATCH 03/27] refactor: Add ScanSource to other scan functions --- crates/polars-lazy/src/scan/csv.rs | 4 + crates/polars-lazy/src/scan/ipc.rs | 4 + .../src/executors/scan/csv.rs | 56 ++--- .../src/executors/scan/ipc.rs | 11 +- .../src/executors/scan/parquet.rs | 26 +- crates/polars-mem-engine/src/planner/lp.rs | 2 +- crates/polars-plan/src/client/check.rs | 10 +- .../src/plans/conversion/dsl_to_ir.rs | 11 +- .../polars-plan/src/plans/conversion/mod.rs | 2 +- .../polars-plan/src/plans/conversion/scans.rs | 207 ++++++++-------- .../polars-plan/src/plans/functions/count.rs | 4 +- crates/polars-plan/src/plans/functions/mod.rs | 11 +- crates/polars-plan/src/plans/ir/mod.rs | 16 +- crates/polars-python/src/file.rs | 5 +- crates/polars-python/src/lazyframe/general.rs | 228 ++++++++---------- py-polars/polars/io/csv/functions.py | 6 +- py-polars/polars/io/ipc/functions.py | 5 +- py-polars/polars/io/ndjson.py | 6 +- py-polars/polars/io/parquet/functions.py | 8 +- 19 files changed, 309 insertions(+), 313 deletions(-) diff --git a/crates/polars-lazy/src/scan/csv.rs b/crates/polars-lazy/src/scan/csv.rs index 676c34b6a71e..e408681789c3 100644 --- a/crates/polars-lazy/src/scan/csv.rs +++ b/crates/polars-lazy/src/scan/csv.rs @@ -35,6 +35,10 @@ impl LazyCsvReader { Self::new("").with_paths(paths) } + pub fn new_sourced(source: ScanSource) -> Self { + Self::new("").with_source(source) + } + pub fn new(path: impl AsRef) -> Self { LazyCsvReader { source: 
ScanSource::Files([path.as_ref().to_path_buf()].into()), diff --git a/crates/polars-lazy/src/scan/ipc.rs b/crates/polars-lazy/src/scan/ipc.rs index af0b53ade823..18043a15717a 100644 --- a/crates/polars-lazy/src/scan/ipc.rs +++ b/crates/polars-lazy/src/scan/ipc.rs @@ -132,4 +132,8 @@ impl LazyFrame { pub fn scan_ipc_files(paths: Arc<[PathBuf]>, args: ScanArgsIpc) -> PolarsResult { LazyIpcReader::new(args).with_paths(paths).finish() } + + pub fn scan_ipc_sourced(source: ScanSource, args: ScanArgsIpc) -> PolarsResult { + LazyIpcReader::new(args).with_source(source).finish() + } } diff --git a/crates/polars-mem-engine/src/executors/scan/csv.rs b/crates/polars-mem-engine/src/executors/scan/csv.rs index 24e813329bcf..4ab5a034c584 100644 --- a/crates/polars-mem-engine/src/executors/scan/csv.rs +++ b/crates/polars-mem-engine/src/executors/scan/csv.rs @@ -4,11 +4,12 @@ use polars_core::config; use polars_core::utils::{ accumulate_dataframes_vertical, accumulate_dataframes_vertical_unchecked, }; +use polars_error::feature_gated; use super::*; pub struct CsvExec { - pub sources: ScanSource, + pub source: ScanSource, pub file_info: FileInfo, pub options: CsvReadOptions, pub file_options: FileScanOptions, @@ -17,7 +18,7 @@ pub struct CsvExec { impl CsvExec { fn read(&self) -> PolarsResult { - let paths = self.sources.as_paths(); + let paths = self.source.as_paths(); let with_columns = self .file_options .with_columns @@ -65,42 +66,28 @@ impl CsvExec { let finish_read = |i: usize, options: CsvReadOptions, predicate: Option>| { let path = &paths[i]; - let mut df = if run_async { - #[cfg(feature = "cloud")] - { - let file = polars_io::file_cache::FILE_CACHE + let file = if run_async { + feature_gated!("cloud", { + polars_io::file_cache::FILE_CACHE .get_entry(path.to_str().unwrap()) // Safety: This was initialized by schema inference. 
.unwrap() - .try_open_assume_latest()?; - let owned = &mut vec![]; - let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; - - options - .into_reader_with_file_handle(std::io::Cursor::new( - maybe_decompress_bytes(mmap.as_ref(), owned)?, - )) - ._with_predicate(predicate.clone()) - .finish() - } - #[cfg(not(feature = "cloud"))] - { - panic!("required feature `cloud` is not enabled") - } + .try_open_assume_latest() + }) } else { - let file = polars_utils::open_file(path)?; - let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; - let owned = &mut vec![]; - - options - .into_reader_with_file_handle(std::io::Cursor::new(maybe_decompress_bytes( - mmap.as_ref(), - owned, - )?)) - ._with_predicate(predicate.clone()) - .finish() + polars_utils::open_file(path) }?; + let owned = &mut vec![]; + let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; + let mut df = options + .into_reader_with_file_handle(std::io::Cursor::new(maybe_decompress_bytes( + mmap.as_ref(), + owned, + )?)) + ._with_predicate(predicate.clone()) + .finish()?; + if let Some(col) = &self.file_options.include_file_paths { let path = path.to_str().unwrap(); unsafe { @@ -234,11 +221,8 @@ impl CsvExec { impl Executor for CsvExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { - let paths = self.sources.as_paths(); let profile_name = if state.has_node_timer() { - let mut ids = vec![PlSmallStr::from_str( - paths[0].to_string_lossy().as_ref(), - )]; + let mut ids = vec![self.source.id()]; if self.predicate.is_some() { ids.push("predicate".into()) } diff --git a/crates/polars-mem-engine/src/executors/scan/ipc.rs b/crates/polars-mem-engine/src/executors/scan/ipc.rs index b29e44a5e33c..b9387cad5878 100644 --- a/crates/polars-mem-engine/src/executors/scan/ipc.rs +++ b/crates/polars-mem-engine/src/executors/scan/ipc.rs @@ -1,4 +1,3 @@ - use hive::HivePartitions; use polars_core::config; use polars_core::utils::accumulate_dataframes_vertical; @@ -90,11 +89,7 @@ impl IpcExec { Arc::from(paths[path_index].to_str().unwrap().to_string()), ) })) - .memory_mapped( - self.options - .memory_map - .then(|| paths[path_index].clone()), - ) + .memory_mapped(self.options.memory_map.then(|| paths[path_index].clone())) .finish() }; @@ -191,9 +186,7 @@ impl Executor for IpcExec { let paths = self.sources.as_paths(); let profile_name = if state.has_node_timer() { - let mut ids = vec![PlSmallStr::from_str( - paths[0].to_string_lossy().as_ref(), - )]; + let mut ids = vec![PlSmallStr::from_str(paths[0].to_string_lossy().as_ref())]; if self.predicate.is_some() { ids.push("predicate".into()) } diff --git a/crates/polars-mem-engine/src/executors/scan/parquet.rs b/crates/polars-mem-engine/src/executors/scan/parquet.rs index efed503ad511..ed740809fcd3 100644 --- a/crates/polars-mem-engine/src/executors/scan/parquet.rs +++ b/crates/polars-mem-engine/src/executors/scan/parquet.rs @@ -13,7 +13,7 @@ use polars_io::RowIndex; use super::*; pub struct ParquetExec { - sources: ScanSource, + source: ScanSource, file_info: FileInfo, hive_parts: Option>>, predicate: Option>, @@ -28,7 +28,7 @@ pub struct ParquetExec { impl ParquetExec { #[allow(clippy::too_many_arguments)] pub(crate) fn new( - sources: ScanSource, + source: ScanSource, file_info: FileInfo, hive_parts: Option>>, predicate: Option>, @@ -38,7 +38,7 @@ impl ParquetExec { metadata: Option, ) -> Self { ParquetExec { - sources, + source, file_info, hive_parts, predicate, @@ -51,7 +51,7 @@ impl ParquetExec { fn read_par(&mut self) -> PolarsResult> { let parallel = match 
self.options.parallel { - ParallelStrategy::Auto if self.sources.num_sources() > POOL.current_num_threads() => { + ParallelStrategy::Auto if self.source.num_sources() > POOL.current_num_threads() => { ParallelStrategy::RowGroups }, identity => identity, @@ -63,16 +63,16 @@ impl ParquetExec { let slice_info = match self.file_options.slice { None => ScanSourceSliceInfo { item_slice: 0..usize::MAX, - source_slice: 0..self.sources.num_sources(), + source_slice: 0..self.source.num_sources(), }, - Some(slice) => self.sources.collect_slice_information( + Some(slice) => self.source.collect_slice_information( slice, |path| ParquetReader::new(std::fs::File::open(path)?).num_rows(), |buff| ParquetReader::new(std::io::Cursor::new(buff)).num_rows(), )?, }; - match &self.sources { + match &self.source { ScanSource::Buffer(buffer) => { let row_index = self.file_options.row_index.take(); let (projection, predicate) = prepare_scan_args( @@ -88,7 +88,10 @@ impl ParquetExec { .set_low_memory(self.options.low_memory) .use_statistics(self.options.use_statistics) .set_rechunk(false) - .with_slice(Some((slice_info.item_slice.start, slice_info.item_slice.len()))) + .with_slice(Some(( + slice_info.item_slice.start, + slice_info.item_slice.len(), + ))) .with_row_index(row_index) .with_predicate(predicate.clone()) .with_projection(projection.clone()) @@ -223,7 +226,7 @@ impl ParquetExec { use polars_io::utils::slice::split_slice_at_file; let verbose = verbose(); - let paths = self.sources.into_paths(); + let paths = self.source.into_paths(); let first_metadata = &self.metadata; let cloud_options = self.cloud_options.as_ref(); @@ -440,7 +443,7 @@ impl ParquetExec { .and_then(|_| self.predicate.take()) .map(phys_expr_to_io_expr); - let is_cloud = match &self.sources { + let is_cloud = match &self.source { ScanSource::Files(paths) => is_cloud_url(paths.first().unwrap()), ScanSource::Buffer(_) => false, }; @@ -472,8 +475,7 @@ impl ParquetExec { impl Executor for ParquetExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { let profile_name = if state.has_node_timer() { - let paths = self.sources.as_paths(); - let mut ids = vec![paths[0].to_string_lossy()]; + let mut ids = vec![self.source.id()]; if self.predicate.is_some() { ids.push("predicate".into()) } diff --git a/crates/polars-mem-engine/src/planner/lp.rs b/crates/polars-mem-engine/src/planner/lp.rs index 45487f7b7024..1f161a34587a 100644 --- a/crates/polars-mem-engine/src/planner/lp.rs +++ b/crates/polars-mem-engine/src/planner/lp.rs @@ -306,7 +306,7 @@ fn create_physical_plan_impl( match scan_type { #[cfg(feature = "csv")] FileScan::Csv { options, .. } => Ok(Box::new(executors::CsvExec { - sources, + source: sources, file_info, options, predicate, diff --git a/crates/polars-plan/src/client/check.rs b/crates/polars-plan/src/client/check.rs index e28e1906c8ea..c7070d22ed0c 100644 --- a/crates/polars-plan/src/client/check.rs +++ b/crates/polars-plan/src/client/check.rs @@ -2,7 +2,7 @@ use polars_core::error::{polars_err, PolarsResult}; use polars_io::path_utils::is_cloud_url; use crate::plans::options::SinkType; -use crate::plans::{DslPlan, FileScan, DslScanSource}; +use crate::plans::{DslPlan, DslScanSource, FileScan}; /// Assert that the given [`DslPlan`] is eligible to be executed on Polars Cloud. 
pub(super) fn assert_cloud_eligible(dsl: &DslPlan) -> PolarsResult<()> { @@ -15,13 +15,7 @@ pub(super) fn assert_cloud_eligible(dsl: &DslPlan) -> PolarsResult<()> { } => { match sources { DslScanSource::File(file) => { - if file - .lock() - .unwrap() - .paths - .iter() - .any(|p| !is_cloud_url(p)) - { + if file.lock().unwrap().paths.iter().any(|p| !is_cloud_url(p)) { return ineligible_error("contains scan of local file system"); } }, diff --git a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs index 825c5896097b..72e75d2b3017 100644 --- a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs +++ b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs @@ -145,9 +145,12 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult metadata, .. } => { - let (file_info, md) = - scans::parquet_file_info(&source, &file_options, cloud_options.as_ref()) - .map_err(|e| e.context(failed_here!(parquet scan)))?; + let (file_info, md) = scans::parquet_file_info( + &source, + &file_options, + cloud_options.as_ref(), + ) + .map_err(|e| e.context(failed_here!(parquet scan)))?; *metadata = md; file_info }, @@ -171,7 +174,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult options, cloud_options, } => scans::csv_file_info( - source.as_paths(), + &source, &file_options, options, cloud_options.as_ref(), diff --git a/crates/polars-plan/src/plans/conversion/mod.rs b/crates/polars-plan/src/plans/conversion/mod.rs index 3e8f8748e618..9851a6d2c3ba 100644 --- a/crates/polars-plan/src/plans/conversion/mod.rs +++ b/crates/polars-plan/src/plans/conversion/mod.rs @@ -58,7 +58,7 @@ impl IR { output_schema: _, file_options: options, } => DslPlan::Scan { - sources: sources.into(), + sources: sources.to_dsl(true), file_info: Arc::new(RwLock::new(Some(file_info))), hive_parts, predicate: predicate.map(|e| e.to_expr(expr_arena)), diff --git a/crates/polars-plan/src/plans/conversion/scans.rs b/crates/polars-plan/src/plans/conversion/scans.rs index 82c953e2ffa2..1cc939417d60 100644 --- a/crates/polars-plan/src/plans/conversion/scans.rs +++ b/crates/polars-plan/src/plans/conversion/scans.rs @@ -1,5 +1,5 @@ use std::path::PathBuf; -use std::sync::{Arc, Mutex}; +use std::sync::Arc; use either::Either; use polars_io::path_utils::is_cloud_url; @@ -17,18 +17,6 @@ fn get_first_path(paths: &[PathBuf]) -> PolarsResult<&PathBuf> { .ok_or_else(|| polars_err!(ComputeError: "expected at least 1 path")) } -impl From for DslScanSource { - fn from(value: ScanSource) -> Self { - match value { - ScanSource::Files(paths) => DslScanSource::File(Arc::new(Mutex::new(ScanFileSource { - paths, - is_expanded: true, - }))), - ScanSource::Buffer(buffer) => DslScanSource::Buffer(buffer), - } - } -} - #[cfg(any(feature = "parquet", feature = "ipc"))] fn prepare_output_schema(mut schema: Schema, row_index: Option<&RowIndex>) -> SchemaRef { if let Some(rc) = row_index { @@ -162,13 +150,14 @@ pub(super) fn ipc_file_info( #[cfg(feature = "csv")] pub(super) fn csv_file_info( - paths: &[PathBuf], + source: &ScanSource, file_options: &FileScanOptions, csv_options: &mut CsvReadOptions, cloud_options: Option<&polars_io::cloud::CloudOptions>, ) -> PolarsResult { use std::io::{Read, Seek}; + use polars_core::error::feature_gated; use polars_core::{config, POOL}; use polars_io::csv::read::schema_inference::SchemaInferenceResult; use polars_io::utils::get_reader_bytes; @@ -179,105 +168,123 @@ pub(super) fn csv_file_info( // * See if we can do this without 
downloading the entire file // prints the error message if paths is empty. - let first_path = get_first_path(paths)?; - let run_async = is_cloud_url(first_path) || config::force_async(); - - let cache_entries = { - #[cfg(feature = "cloud")] - { - if run_async { - Some(polars_io::file_cache::init_entries_from_uri_list( - paths - .iter() - .map(|path| Arc::from(path.to_str().unwrap())) - .collect::>() - .as_slice(), - cloud_options, - )?) - } else { - None - } - } - #[cfg(not(feature = "cloud"))] - { - if run_async { - panic!("required feature `cloud` is not enabled") - } - } - }; - - let infer_schema_func = |i| { - let file = if run_async { - #[cfg(feature = "cloud")] - { - let entry: &Arc = - &cache_entries.as_ref().unwrap()[i]; - entry.try_open_check_latest()? - } - #[cfg(not(feature = "cloud"))] - { - panic!("required feature `cloud` is not enabled") - } - } else { - let p: &PathBuf = &paths[i]; - polars_utils::open_file(p.as_ref())? - }; - - let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; - let owned = &mut vec![]; + let run_async = source.is_cloud_url()? || config::force_async(); - let mut curs = std::io::Cursor::new(maybe_decompress_bytes(mmap.as_ref(), owned)?); + let si_result = match source { + ScanSource::Files(paths) => { + let cache_entries = { + feature_gated!("cloud", { + if run_async { + Some(polars_io::file_cache::init_entries_from_uri_list( + source + .as_paths() + .iter() + .flat_map(|p| p.iter()) + .map(|path| Arc::from(path.to_str().unwrap())) + .collect::>() + .as_slice(), + cloud_options, + )?) + } else { + None + } + }) + }; - if curs.read(&mut [0; 4])? < 2 && csv_options.raise_if_empty { - polars_bail!(NoData: "empty CSV") - } - curs.rewind()?; + let infer_schema_func = |i| { + let file = if run_async { + feature_gated!("cloud", { + let entry: &Arc = + &cache_entries.as_ref().unwrap()[i]; + entry.try_open_check_latest()? + }) + } else { + let p: &PathBuf = &paths[i]; + polars_utils::open_file(p.as_ref())? + }; + + let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; + let owned = &mut vec![]; + + let mut curs = std::io::Cursor::new(maybe_decompress_bytes(mmap.as_ref(), owned)?); + + if curs.read(&mut [0; 4])? < 2 && csv_options.raise_if_empty { + polars_bail!(NoData: "empty CSV") + } + curs.rewind()?; - let reader_bytes = get_reader_bytes(&mut curs).expect("could not mmap file"); + let reader_bytes = get_reader_bytes(&mut curs).expect("could not mmap file"); - // this needs a way to estimated bytes/rows. - let si_result = - SchemaInferenceResult::try_from_reader_bytes_and_options(&reader_bytes, csv_options)?; + // this needs a way to estimated bytes/rows. 
+ let si_result = SchemaInferenceResult::try_from_reader_bytes_and_options( + &reader_bytes, + csv_options, + )?; - Ok(si_result) - }; + Ok(si_result) + }; - let merge_func = |a: PolarsResult, - b: PolarsResult| match (a, b) { - (Err(e), _) | (_, Err(e)) => Err(e), - (Ok(a), Ok(b)) => { - let merged_schema = if csv_options.schema.is_some() { - csv_options.schema.clone().unwrap() - } else { - let schema_a = a.get_inferred_schema(); - let schema_b = b.get_inferred_schema(); - - match (schema_a.is_empty(), schema_b.is_empty()) { - (true, _) => schema_b, - (_, true) => schema_a, - _ => { - let mut s = Arc::unwrap_or_clone(schema_a); - s.to_supertype(&schema_b)?; - Arc::new(s) + let merge_func = |a: PolarsResult, + b: PolarsResult| { + match (a, b) { + (Err(e), _) | (_, Err(e)) => Err(e), + (Ok(a), Ok(b)) => { + let merged_schema = if csv_options.schema.is_some() { + csv_options.schema.clone().unwrap() + } else { + let schema_a = a.get_inferred_schema(); + let schema_b = b.get_inferred_schema(); + + match (schema_a.is_empty(), schema_b.is_empty()) { + (true, _) => schema_b, + (_, true) => schema_a, + _ => { + let mut s = Arc::unwrap_or_clone(schema_a); + s.to_supertype(&schema_b)?; + Arc::new(s) + }, + } + }; + + Ok(a.with_inferred_schema(merged_schema)) }, } }; - Ok(a.with_inferred_schema(merged_schema)) - }, - }; + let si_results = POOL.join( + || infer_schema_func(0), + || { + (1..paths.len()) + .into_par_iter() + .map(infer_schema_func) + .reduce(|| Ok(Default::default()), merge_func) + }, + ); - let si_results = POOL.join( - || infer_schema_func(0), - || { - (1..paths.len()) - .into_par_iter() - .map(infer_schema_func) - .reduce(|| Ok(Default::default()), merge_func) + merge_func(si_results.0, si_results.1)? }, - ); + ScanSource::Buffer(buffer) => { + polars_ensure!(!run_async, nyi = "BytesIO scan with async"); - let si_result = merge_func(si_results.0, si_results.1)?; + let owned = &mut vec![]; + let mut reader = std::io::Cursor::new(maybe_decompress_bytes(buffer, owned)?); + + if reader.read(&mut [0; 4])? < 2 && csv_options.raise_if_empty { + polars_bail!(NoData: "empty CSV") + } + reader.rewind()?; + + let reader_bytes = get_reader_bytes(&mut reader).expect("could not open file"); + + // this needs a way to estimated bytes/rows. 
+ let si_result = SchemaInferenceResult::try_from_reader_bytes_and_options( + &reader_bytes, + csv_options, + )?; + + si_result + }, + }; csv_options.update_with_inference_result(&si_result); diff --git a/crates/polars-plan/src/plans/functions/count.rs b/crates/polars-plan/src/plans/functions/count.rs index dca574e67808..f3120bad8dff 100644 --- a/crates/polars-plan/src/plans/functions/count.rs +++ b/crates/polars-plan/src/plans/functions/count.rs @@ -3,7 +3,9 @@ use arrow::io::ipc::read::get_row_count as count_rows_ipc_sync; #[cfg(any(feature = "parquet", feature = "json"))] use polars_io::cloud::CloudOptions; #[cfg(feature = "csv")] -use polars_io::csv::read::{count_rows as count_rows_csv, count_rows_from_slice as count_rows_csv_from_slice}; +use polars_io::csv::read::{ + count_rows as count_rows_csv, count_rows_from_slice as count_rows_csv_from_slice, +}; #[cfg(all(feature = "parquet", feature = "cloud"))] use polars_io::parquet::read::ParquetAsyncReader; #[cfg(feature = "parquet")] diff --git a/crates/polars-plan/src/plans/functions/mod.rs b/crates/polars-plan/src/plans/functions/mod.rs index 37e01bcc654f..468a85273ea4 100644 --- a/crates/polars-plan/src/plans/functions/mod.rs +++ b/crates/polars-plan/src/plans/functions/mod.rs @@ -103,9 +103,14 @@ impl PartialEq for FunctionIR { use FunctionIR::*; match (self, other) { (Rechunk, Rechunk) => true, - (FastCount { sources: srcs_l, .. }, FastCount { sources: srcs_r, .. }) => { - srcs_l == srcs_r - }, + ( + FastCount { + sources: srcs_l, .. + }, + FastCount { + sources: srcs_r, .. + }, + ) => srcs_l == srcs_r, ( Rename { existing: existing_l, diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index 52593e3d2b28..ff4e46e64dd8 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -15,7 +15,7 @@ use hive::HivePartitions; use polars_core::prelude::*; use polars_core::POOL; use polars_utils::idx_vec::UnitVec; -use polars_utils::unitvec; +use polars_utils::{format_pl_smallstr, unitvec}; #[cfg(feature = "ir_serde")] use serde::{Deserialize, Serialize}; @@ -65,7 +65,9 @@ impl ScanSource { pub fn try_into_paths(&self) -> PolarsResult> { match self { ScanSource::Files(paths) => Ok(paths.clone()), - ScanSource::Buffer(_) => Err(polars_err!(nyi = "Unable to convert BytesIO scan into path")), + ScanSource::Buffer(_) => Err(polars_err!( + nyi = "Unable to convert BytesIO scan into path" + )), } } @@ -103,8 +105,16 @@ impl ScanSource { } } + pub fn id(&self) -> PlSmallStr { + match self { + ScanSource::Files(paths) if paths.is_empty() => PlSmallStr::from_static("EMPTY"), + ScanSource::Files(paths) => PlSmallStr::from_str(paths[0].to_string_lossy().as_ref()), + ScanSource::Buffer(_) => PlSmallStr::from_static("IN_MEMORY"), + } + } + /// Normalize the slice and collect information as to what rows and parts of the source are - /// used in this slice. + /// used in this slice. 
pub fn collect_slice_information( &self, slice: (i64, usize), diff --git a/crates/polars-python/src/file.rs b/crates/polars-python/src/file.rs index 9443fd0b5213..6225ee5427f7 100644 --- a/crates/polars-python/src/file.rs +++ b/crates/polars-python/src/file.rs @@ -201,10 +201,7 @@ pub enum EitherPythonFileOrPath { Path(PathBuf), } -pub fn get_either_file_or_path( - py_f: PyObject, - write: bool, -) -> PyResult { +pub fn get_either_file_or_path(py_f: PyObject, write: bool) -> PyResult { Python::with_gil(|py| { let py_f = py_f.into_bound(py); if let Ok(s) = py_f.extract::>() { diff --git a/crates/polars-python/src/lazyframe/general.rs b/crates/polars-python/src/lazyframe/general.rs index d5f64c9f35ac..2e2ce702f5bd 100644 --- a/crates/polars-python/src/lazyframe/general.rs +++ b/crates/polars-python/src/lazyframe/general.rs @@ -113,7 +113,7 @@ impl PyLazyFrame { ) )] fn new_from_csv( - path: Option, + path: Option, paths: Vec, separator: &str, has_header: bool, @@ -145,6 +145,10 @@ impl PyLazyFrame { file_cache_ttl: Option, include_file_paths: Option, ) -> PyResult { + use std::path::Path; + + use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; + let null_values = null_values.map(|w| w.0); let quote_char = quote_char.map(|s| s.as_bytes()[0]); let separator = separator.as_bytes()[0]; @@ -161,38 +165,43 @@ impl PyLazyFrame { .collect::() }); - #[cfg(feature = "cloud")] - let cloud_options = { - let first_path = if let Some(path) = &path { - path - } else { - paths - .first() - .ok_or_else(|| PyValueError::new_err("expected a path argument"))? - }; + use polars_plan::plans::ScanSource; + use EitherPythonFileOrPath as EF; + let (first_path, mut r) = match path + .map(|py_f| get_either_file_or_path(py_f, false)) + .transpose()? + { + Some(EF::Path(path)) => { + let reader = LazyCsvReader::new(>::as_ref(&path)); + (Some(path), reader) + }, + Some(EF::Py(f)) => ( + None, + LazyCsvReader::new_sourced(ScanSource::Buffer(f.as_arc())), + ), + None => ( + Some( + paths + .first() + .cloned() + .ok_or_else(|| PyValueError::new_err("expected a path argument"))?, + ), + LazyCsvReader::new_paths(paths.into()), + ), + }; + #[cfg(feature = "cloud")] + if let Some(first_path) = first_path { let first_path_url = first_path.to_string_lossy(); - let mut cloud_options = if let Some(opts) = cloud_options { - parse_cloud_options(&first_path_url, opts)? - } else { - parse_cloud_options(&first_path_url, vec![])? 
- }; - - cloud_options = cloud_options.with_max_retries(retries); - + let mut cloud_options = + parse_cloud_options(&first_path_url, cloud_options.unwrap_or_default())?; if let Some(file_cache_ttl) = file_cache_ttl { cloud_options.file_cache_ttl = file_cache_ttl; } - - Some(cloud_options) - }; - - let r = if let Some(path) = path.as_ref() { - LazyCsvReader::new(path) - } else { - LazyCsvReader::new_paths(paths.into()) - }; + cloud_options = cloud_options.with_max_retries(retries); + r = r.with_cloud_options(Some(cloud_options)); + } let mut r = r .with_infer_schema_length(infer_schema_length) @@ -219,7 +228,6 @@ impl PyLazyFrame { .with_decimal_comma(decimal_comma) .with_glob(glob) .with_raise_if_empty(raise_if_empty) - .with_cloud_options(cloud_options) .with_include_file_paths(include_file_paths.map(|x| x.into())); if let Some(lambda) = with_schema_modify { @@ -276,70 +284,11 @@ impl PyLazyFrame { let parallel = parallel.0; let hive_schema = hive_schema.map(|s| Arc::new(s.0)); - use polars_plan::plans::ScanSource; - use EitherPythonFileOrPath as EF; - let use_first_path = path.is_some(); - let first_path = match path - .map(|py_f| get_either_file_or_path(py_f, false)) - .transpose()? - { - Some(EF::Path(path)) => path, - Some(EF::Py(f)) => { - let scan_source = ScanSource::Buffer(f.as_arc()); - - let row_index = row_index.map(|(name, offset)| RowIndex { - name: name.into(), - offset, - }); - - let args = ScanArgsParquet { - n_rows, - cache, - parallel, - rechunk, - row_index, - low_memory, - cloud_options: None, - use_statistics, - hive_options: HiveOptions { - enabled: hive_partitioning, - hive_start_idx: 0, - schema: hive_schema, - try_parse_dates: try_parse_hive_dates, - }, - glob, - include_file_paths: include_file_paths.map(|x| x.into()), - }; - - let lf = LazyFrame::scan_parquet_sourced(scan_source, args) - .map_err(PyPolarsErr::from)?; - return Ok(lf.into()); - }, - None => paths - .first() - .cloned() - .ok_or_else(|| PyValueError::new_err("expected a path argument"))?, - }; - - #[cfg(feature = "cloud")] - let cloud_options = { - let first_path_url = first_path.to_string_lossy(); - - let mut cloud_options = if let Some(opts) = cloud_options { - parse_cloud_options(&first_path_url, opts)? - } else { - parse_cloud_options(&first_path_url, vec![])? - }; - - cloud_options = cloud_options.with_max_retries(retries); - - Some(cloud_options) - }; - let row_index = row_index.map(|(name, offset)| RowIndex { name: name.into(), offset, }); + let hive_options = HiveOptions { enabled: hive_partitioning, hive_start_idx: 0, @@ -347,20 +296,48 @@ impl PyLazyFrame { try_parse_dates: try_parse_hive_dates, }; - let args = ScanArgsParquet { + let mut args = ScanArgsParquet { n_rows, cache, parallel, rechunk, row_index, low_memory, - cloud_options, + cloud_options: None, use_statistics, hive_options, glob, include_file_paths: include_file_paths.map(|x| x.into()), }; + use polars_plan::plans::ScanSource; + use EitherPythonFileOrPath as EF; + let use_first_path = path.is_some(); + let first_path = match path + .map(|py_f| get_either_file_or_path(py_f, false)) + .transpose()? 
+ { + Some(EF::Path(path)) => path, + Some(EF::Py(f)) => { + return LazyFrame::scan_parquet_sourced(ScanSource::Buffer(f.as_arc()), args) + .map(Self::from) + .map_err(PyPolarsErr::from) + .map_err(From::from); + }, + None => paths + .first() + .cloned() + .ok_or_else(|| PyValueError::new_err("expected a path argument"))?, + }; + + #[cfg(feature = "cloud")] + { + let first_path_url = first_path.to_string_lossy(); + let cloud_options = + parse_cloud_options(&first_path_url, cloud_options.unwrap_or_default())?; + args.cloud_options = Some(cloud_options.with_max_retries(retries)); + } + let lf = if use_first_path { LazyFrame::scan_parquet(first_path, args) } else { @@ -374,7 +351,7 @@ impl PyLazyFrame { #[staticmethod] #[pyo3(signature = (path, paths, n_rows, cache, rechunk, row_index, memory_map, cloud_options, hive_partitioning, hive_schema, try_parse_hive_dates, retries, file_cache_ttl, include_file_paths))] fn new_from_ipc( - path: Option, + path: Option, paths: Vec, n_rows: Option, cache: bool, @@ -389,38 +366,13 @@ impl PyLazyFrame { file_cache_ttl: Option, include_file_paths: Option, ) -> PyResult { + use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; + let row_index = row_index.map(|(name, offset)| RowIndex { name: name.into(), offset, }); - #[cfg(feature = "cloud")] - let cloud_options = { - let first_path = if let Some(path) = &path { - path - } else { - paths - .first() - .ok_or_else(|| PyValueError::new_err("expected a path argument"))? - }; - - let first_path_url = first_path.to_string_lossy(); - - let mut cloud_options = if let Some(opts) = cloud_options { - parse_cloud_options(&first_path_url, opts)? - } else { - parse_cloud_options(&first_path_url, vec![])? - }; - - cloud_options = cloud_options.with_max_retries(retries); - - if let Some(file_cache_ttl) = file_cache_ttl { - cloud_options.file_cache_ttl = file_cache_ttl; - } - - Some(cloud_options) - }; - let hive_options = HiveOptions { enabled: hive_partitioning, hive_start_idx: 0, @@ -428,20 +380,52 @@ impl PyLazyFrame { try_parse_dates: try_parse_hive_dates, }; - let args = ScanArgsIpc { + let mut args = ScanArgsIpc { n_rows, cache, rechunk, row_index, memory_map, #[cfg(feature = "cloud")] - cloud_options, + cloud_options: None, hive_options, include_file_paths: include_file_paths.map(|x| x.into()), }; - let lf = if let Some(path) = &path { - LazyFrame::scan_ipc(path, args) + use polars_plan::plans::ScanSource; + use EitherPythonFileOrPath as EF; + let use_first_path = path.is_some(); + let first_path = match path + .map(|py_f| get_either_file_or_path(py_f, false)) + .transpose()? 
+ { + Some(EF::Path(path)) => path, + Some(EF::Py(f)) => { + return LazyFrame::scan_ipc_sourced(ScanSource::Buffer(f.as_arc()), args) + .map(Self::from) + .map_err(PyPolarsErr::from) + .map_err(From::from); + }, + None => paths + .first() + .cloned() + .ok_or_else(|| PyValueError::new_err("expected a path argument"))?, + }; + + #[cfg(feature = "cloud")] + { + let first_path_url = first_path.to_string_lossy(); + + let mut cloud_options = + parse_cloud_options(&first_path_url, cloud_options.unwrap_or_default())?; + if let Some(file_cache_ttl) = file_cache_ttl { + cloud_options.file_cache_ttl = file_cache_ttl; + } + args.cloud_options = Some(cloud_options.with_max_retries(retries)); + } + + let lf = if use_first_path { + LazyFrame::scan_ipc(first_path, args) } else { LazyFrame::scan_ipc_files(paths.into(), args) } diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 3a27911d716e..b7b5c4764845 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -984,7 +984,7 @@ def read_csv_batched( @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_csv( - source: str | Path | list[str] | list[Path], + source: str | Path | list[str] | list[Path] | IO[str] | IO[bytes], *, has_header: bool = True, separator: str = ",", @@ -1232,6 +1232,8 @@ def with_column_names(cols: list[str]) -> list[str]: if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) + elif isinstance(source, (IO, BytesIO)): + pass else: source = [ normalize_filepath(source, check_not_directory=False) for source in source @@ -1276,7 +1278,7 @@ def with_column_names(cols: list[str]) -> list[str]: def _scan_csv_impl( - source: str | list[str] | list[Path], + source: str | list[str] | list[Path] | IO[str] | IO[bytes], *, has_header: bool = True, separator: str = ",", diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index 4443c31d513f..8f3c21bdf286 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -2,6 +2,7 @@ import contextlib import os +from io import BytesIO from pathlib import Path from typing import IO, TYPE_CHECKING, Any, Sequence @@ -346,7 +347,7 @@ def read_ipc_schema(source: str | Path | IO[bytes] | bytes) -> dict[str, DataTyp @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_ipc( - source: str | Path | list[str] | list[Path], + source: str | Path | list[str] | list[Path] | IO[str] | IO[bytes], *, n_rows: int | None = None, cache: bool = True, @@ -429,6 +430,8 @@ def scan_ipc( if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) sources = [] + elif isinstance(source, (IO, BytesIO)): + sources = [] else: sources = [ normalize_filepath(source, check_not_directory=False) for source in source diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index e8eccca53ccd..5482ccc52c42 100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -3,7 +3,7 @@ import contextlib from io import BytesIO, StringIO from pathlib import Path -from typing import TYPE_CHECKING, Any, Sequence +from typing import IO, TYPE_CHECKING, Any, Sequence from polars._utils.deprecation import deprecate_renamed_parameter from 
polars._utils.various import normalize_filepath @@ -166,7 +166,7 @@ def read_ndjson( @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_ndjson( - source: str | Path | list[str] | list[Path], + source: str | Path | list[str] | list[Path] | IO[str] | IO[bytes], *, schema: SchemaDefinition | None = None, schema_overrides: SchemaDefinition | None = None, @@ -250,6 +250,8 @@ def scan_ndjson( if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) sources = [] + elif isinstance(source, (IO, BytesIO)): + sources = [] else: sources = [ normalize_filepath(source, check_not_directory=False) for source in source diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 0fc52142e5de..ef01b24955b0 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -295,7 +295,7 @@ def read_parquet_schema(source: str | Path | IO[bytes] | bytes) -> dict[str, Dat @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_parquet( - source: str | Path | list[str] | list[Path] | io.BytesIO, + source: str | Path | list[str] | list[Path] | IO[str] | IO[bytes], *, n_rows: int | None = None, row_index_name: str | None = None, @@ -422,8 +422,8 @@ def scan_parquet( if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - elif isinstance(source, io.BytesIO): - pass + elif isinstance(source, (IO, BytesIO)): + sources = [] else: source = [ normalize_filepath(source, check_not_directory=False) for source in source @@ -450,7 +450,7 @@ def scan_parquet( def _scan_parquet_impl( - source: str | list[str] | list[Path] | io.BytesIO, + source: str | list[str] | list[Path] | IO[str] | IO[bytes], *, n_rows: int | None = None, cache: bool = True, From 5fb9ffa31b8b956cd2000b9dd03d14cee747cdb8 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 4 Sep 2024 16:05:31 +0200 Subject: [PATCH 04/27] working for all scans --- crates/polars-io/src/csv/read/parser.rs | 14 +- crates/polars-lazy/src/scan/csv.rs | 35 +- .../polars-lazy/src/scan/file_list_reader.rs | 10 +- crates/polars-lazy/src/scan/ipc.rs | 18 +- crates/polars-lazy/src/scan/ndjson.rs | 34 +- crates/polars-lazy/src/scan/parquet.rs | 18 +- .../src/executors/scan/csv.rs | 78 ++-- .../src/executors/scan/ipc.rs | 103 +++-- .../src/executors/scan/ndjson.rs | 128 ++++--- .../src/executors/scan/parquet.rs | 239 +++++------- crates/polars-mem-engine/src/planner/lp.rs | 2 +- .../polars-pipe/src/executors/sources/csv.rs | 6 +- .../src/executors/sources/parquet.rs | 6 +- crates/polars-plan/src/client/check.rs | 11 +- crates/polars-plan/src/plans/builder_dsl.rs | 20 +- .../src/plans/conversion/dsl_to_ir.rs | 114 +++--- .../polars-plan/src/plans/conversion/mod.rs | 7 +- .../polars-plan/src/plans/conversion/scans.rs | 359 +++++++++--------- .../polars-plan/src/plans/functions/count.rs | 166 ++++---- crates/polars-plan/src/plans/functions/mod.rs | 12 +- crates/polars-plan/src/plans/ir/dot.rs | 31 +- crates/polars-plan/src/plans/ir/format.rs | 18 +- crates/polars-plan/src/plans/ir/mod.rs | 253 +++++++----- crates/polars-plan/src/plans/mod.rs | 15 +- .../src/plans/optimizer/count_star.rs | 41 +- .../plans/optimizer/predicate_pushdown/mod.rs | 2 +- 
crates/polars-python/src/lazyframe/general.rs | 88 +++-- .../src/lazyframe/visitor/nodes.rs | 2 +- .../src/utils/late_materialized_df.rs | 4 +- py-polars/polars/io/parquet/functions.py | 2 +- 30 files changed, 965 insertions(+), 871 deletions(-) diff --git a/crates/polars-io/src/csv/read/parser.rs b/crates/polars-io/src/csv/read/parser.rs index 9d2852a02c82..ccda4805792b 100644 --- a/crates/polars-io/src/csv/read/parser.rs +++ b/crates/polars-io/src/csv/read/parser.rs @@ -1,9 +1,10 @@ -use std::path::PathBuf; +use std::path::Path; use memchr::memchr2_iter; use num_traits::Pow; use polars_core::prelude::*; use polars_core::{config, POOL}; +use polars_error::feature_gated; use polars_utils::index::Bounded; use polars_utils::slice::GetSaferUnchecked; use rayon::prelude::*; @@ -18,7 +19,7 @@ use crate::utils::maybe_decompress_bytes; /// Read the number of rows without parsing columns /// useful for count(*) queries pub fn count_rows( - path: &PathBuf, + path: &Path, separator: u8, quote_char: Option, comment_prefix: Option<&CommentPrefix>, @@ -26,18 +27,13 @@ pub fn count_rows( has_header: bool, ) -> PolarsResult { let file = if is_cloud_url(path) || config::force_async() { - #[cfg(feature = "cloud")] - { + feature_gated!("cloud", { crate::file_cache::FILE_CACHE .get_entry(path.to_str().unwrap()) // Safety: This was initialized by schema inference. .unwrap() .try_open_assume_latest()? - } - #[cfg(not(feature = "cloud"))] - { - panic!("required feature `cloud` is not enabled") - } + }) } else { polars_utils::open_file(path)? }; diff --git a/crates/polars-lazy/src/scan/csv.rs b/crates/polars-lazy/src/scan/csv.rs index e408681789c3..a8687aba3b8b 100644 --- a/crates/polars-lazy/src/scan/csv.rs +++ b/crates/polars-lazy/src/scan/csv.rs @@ -15,7 +15,7 @@ use crate::prelude::*; #[derive(Clone)] #[cfg(feature = "csv")] pub struct LazyCsvReader { - source: ScanSource, + sources: ScanSources, glob: bool, cache: bool, read_options: CsvReadOptions, @@ -35,13 +35,13 @@ impl LazyCsvReader { Self::new("").with_paths(paths) } - pub fn new_sourced(source: ScanSource) -> Self { - Self::new("").with_source(source) + pub fn new_sourced(sources: ScanSources) -> Self { + Self::new("").with_sources(sources) } pub fn new(path: impl AsRef) -> Self { LazyCsvReader { - source: ScanSource::Files([path.as_ref().to_path_buf()].into()), + sources: ScanSources::Files([path.as_ref().to_path_buf()].into()), glob: true, cache: true, read_options: Default::default(), @@ -253,8 +253,8 @@ impl LazyCsvReader { ) }; - let schema = match self.source.clone() { - ScanSource::Files(paths) => { + let schema = match self.sources.clone() { + ScanSources::Files(paths) => { // TODO: Path expansion should happen when converting to the IR // https://github.com/pola-rs/polars/issues/17634 let paths = expand_paths(&paths[..], self.glob(), self.cloud_options())?; @@ -266,9 +266,16 @@ impl LazyCsvReader { let mut file = polars_utils::open_file(path)?; infer_schema(get_reader_bytes(&mut file).expect("could not mmap file"))? }, - ScanSource::Buffer(buffer) => infer_schema( - get_reader_bytes(&mut std::io::Cursor::new(buffer)).expect("could not mmap file"), - )?, + ScanSources::Buffers(buffers) => { + let Some(buffer) = buffers.first() else { + polars_bail!(ComputeError: "no buffers specified for this reader"); + }; + + infer_schema( + get_reader_bytes(&mut std::io::Cursor::new(buffer)) + .expect("could not mmap file"), + )? 
+ }, }; self.read_options.n_threads = n_threads; @@ -294,7 +301,7 @@ impl LazyFileListReader for LazyCsvReader { /// Get the final [LazyFrame]. fn finish(self) -> PolarsResult { let mut lf: LazyFrame = DslBuilder::scan_csv( - self.source.to_dsl(false), + self.sources.to_dsl(false), self.read_options, self.cache, self.cloud_options, @@ -315,12 +322,12 @@ impl LazyFileListReader for LazyCsvReader { self.glob } - fn source(&self) -> &ScanSource { - &self.source + fn sources(&self) -> &ScanSources { + &self.sources } - fn with_source(mut self, source: ScanSource) -> Self { - self.source = source; + fn with_sources(mut self, sources: ScanSources) -> Self { + self.sources = sources; self } diff --git a/crates/polars-lazy/src/scan/file_list_reader.rs b/crates/polars-lazy/src/scan/file_list_reader.rs index 8992b8df5a65..b25cec6eda3b 100644 --- a/crates/polars-lazy/src/scan/file_list_reader.rs +++ b/crates/polars-lazy/src/scan/file_list_reader.rs @@ -19,8 +19,8 @@ pub trait LazyFileListReader: Clone { return self.finish_no_glob(); } - let ScanSource::Files(paths) = self.source() else { - unreachable!("Should never be globbed"); + let ScanSources::Files(paths) = self.sources() else { + unreachable!("in-memory buffers should never be globbed"); }; let lfs = paths @@ -83,16 +83,16 @@ pub trait LazyFileListReader: Clone { true } - fn source(&self) -> &ScanSource; + fn sources(&self) -> &ScanSources; /// Set paths of the scanned files. #[must_use] - fn with_source(self, source: ScanSource) -> Self; + fn with_sources(self, source: ScanSources) -> Self; /// Set paths of the scanned files. #[must_use] fn with_paths(self, paths: Arc<[PathBuf]>) -> Self { - self.with_source(ScanSource::Files(paths)) + self.with_sources(ScanSources::Files(paths)) } /// Configure the row limit. 
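The ScanSources plumbing above is what ultimately lets the Python-level scan functions accept in-memory file objects in addition to paths. As a rough sketch of the intended user-facing behaviour — assuming the py-polars changes later in this patch, where the scan_csv signature is widened to IO[str] | IO[bytes] — and not part of the patch itself:

    import io
    import polars as pl

    # CSV data held entirely in memory; no file path is involved.
    buf = io.BytesIO(b"a,b\n1,x\n2,y\n")
    lf = pl.scan_csv(buf)
    assert lf.collect().shape == (2, 2)
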
diff --git a/crates/polars-lazy/src/scan/ipc.rs b/crates/polars-lazy/src/scan/ipc.rs index 18043a15717a..fa11ef8e4455 100644 --- a/crates/polars-lazy/src/scan/ipc.rs +++ b/crates/polars-lazy/src/scan/ipc.rs @@ -37,14 +37,14 @@ impl Default for ScanArgsIpc { #[derive(Clone)] struct LazyIpcReader { args: ScanArgsIpc, - source: ScanSource, + sources: ScanSources, } impl LazyIpcReader { fn new(args: ScanArgsIpc) -> Self { Self { args, - source: ScanSource::default(), + sources: ScanSources::default(), } } } @@ -58,7 +58,7 @@ impl LazyFileListReader for LazyIpcReader { }; let mut lf: LazyFrame = DslBuilder::scan_ipc( - self.source.to_dsl(false), + self.sources.to_dsl(false), options, args.n_rows, args.cache, @@ -79,12 +79,12 @@ impl LazyFileListReader for LazyIpcReader { unreachable!() } - fn source(&self) -> &ScanSource { - &self.source + fn sources(&self) -> &ScanSources { + &self.sources } - fn with_source(mut self, source: ScanSource) -> Self { - self.source = source; + fn with_sources(mut self, sources: ScanSources) -> Self { + self.sources = sources; self } @@ -133,7 +133,7 @@ impl LazyFrame { LazyIpcReader::new(args).with_paths(paths).finish() } - pub fn scan_ipc_sourced(source: ScanSource, args: ScanArgsIpc) -> PolarsResult { - LazyIpcReader::new(args).with_source(source).finish() + pub fn scan_ipc_sourced(sources: ScanSources, args: ScanArgsIpc) -> PolarsResult { + LazyIpcReader::new(args).with_sources(sources).finish() } } diff --git a/crates/polars-lazy/src/scan/ndjson.rs b/crates/polars-lazy/src/scan/ndjson.rs index 9a1d071f8365..8d71d9a585a2 100644 --- a/crates/polars-lazy/src/scan/ndjson.rs +++ b/crates/polars-lazy/src/scan/ndjson.rs @@ -1,11 +1,11 @@ use std::num::NonZeroUsize; use std::path::{Path, PathBuf}; -use std::sync::{Arc, RwLock}; +use std::sync::{Arc, Mutex, RwLock}; use polars_core::prelude::*; use polars_io::cloud::CloudOptions; -use polars_io::RowIndex; -use polars_plan::plans::{DslPlan, FileScan, ScanSource}; +use polars_io::{HiveOptions, RowIndex}; +use polars_plan::plans::{DslPlan, FileScan, ScanSources}; use polars_plan::prelude::{FileScanOptions, NDJsonReadOptions}; use crate::prelude::LazyFrame; @@ -13,7 +13,7 @@ use crate::scan::file_list_reader::LazyFileListReader; #[derive(Clone)] pub struct LazyJsonLineReader { - pub(crate) source: ScanSource, + pub(crate) sources: ScanSources, pub(crate) batch_size: Option, pub(crate) low_memory: bool, pub(crate) rechunk: bool, @@ -29,12 +29,12 @@ pub struct LazyJsonLineReader { impl LazyJsonLineReader { pub fn new_paths(paths: Arc<[PathBuf]>) -> Self { - Self::new(PathBuf::new()).with_paths(paths) + Self::new_sourced(ScanSources::Files(paths)) } - pub fn new(path: impl AsRef) -> Self { + pub fn new_sourced(sources: ScanSources) -> Self { LazyJsonLineReader { - source: ScanSource::Files([path.as_ref().to_path_buf()].into()), + sources, batch_size: None, low_memory: false, rechunk: false, @@ -48,6 +48,10 @@ impl LazyJsonLineReader { cloud_options: None, } } + + pub fn new(path: impl AsRef) -> Self { + Self::new_sourced(ScanSources::Files([path.as_ref().to_path_buf()].into())) + } /// Add a row index column. 
#[must_use] pub fn with_row_index(mut self, row_index: Option) -> Self { @@ -124,7 +128,11 @@ impl LazyFileListReader for LazyJsonLineReader { row_index: self.row_index, rechunk: self.rechunk, file_counter: 0, - hive_options: Default::default(), + hive_options: { + let mut options = HiveOptions::default(); + options.enabled = Some(false); + options + }, glob: true, include_file_paths: self.include_file_paths, }; @@ -145,7 +153,7 @@ impl LazyFileListReader for LazyJsonLineReader { }; Ok(LazyFrame::from(DslPlan::Scan { - sources: self.source.to_dsl(false), + sources: Arc::new(Mutex::new(self.sources.to_dsl(false))), file_info: Arc::new(RwLock::new(None)), hive_parts: None, predicate: None, @@ -158,12 +166,12 @@ impl LazyFileListReader for LazyJsonLineReader { unreachable!(); } - fn source(&self) -> &ScanSource { - &self.source + fn sources(&self) -> &ScanSources { + &self.sources } - fn with_source(mut self, source: ScanSource) -> Self { - self.source = source; + fn with_sources(mut self, sources: ScanSources) -> Self { + self.sources = sources; self } diff --git a/crates/polars-lazy/src/scan/parquet.rs b/crates/polars-lazy/src/scan/parquet.rs index 491ae3ee126c..c198ccf690c1 100644 --- a/crates/polars-lazy/src/scan/parquet.rs +++ b/crates/polars-lazy/src/scan/parquet.rs @@ -44,14 +44,14 @@ impl Default for ScanArgsParquet { #[derive(Clone)] struct LazyParquetReader { args: ScanArgsParquet, - source: ScanSource, + sources: ScanSources, } impl LazyParquetReader { fn new(args: ScanArgsParquet) -> Self { Self { args, - source: ScanSource::default(), + sources: ScanSources::default(), } } } @@ -62,7 +62,7 @@ impl LazyFileListReader for LazyParquetReader { let row_index = self.args.row_index; let mut lf: LazyFrame = DslBuilder::scan_parquet( - self.source.to_dsl(false), + self.sources.to_dsl(false), self.args.n_rows, self.args.cache, self.args.parallel, @@ -95,12 +95,12 @@ impl LazyFileListReader for LazyParquetReader { unreachable!(); } - fn source(&self) -> &ScanSource { - &self.source + fn sources(&self) -> &ScanSources { + &self.sources } - fn with_source(mut self, source: ScanSource) -> Self { - self.source = source; + fn with_sources(mut self, sources: ScanSources) -> Self { + self.sources = sources; self } @@ -145,8 +145,8 @@ impl LazyFrame { } /// Create a LazyFrame directly from a parquet scan. - pub fn scan_parquet_sourced(source: ScanSource, args: ScanArgsParquet) -> PolarsResult { - LazyParquetReader::new(args).with_source(source).finish() + pub fn scan_parquet_sourced(sources: ScanSources, args: ScanArgsParquet) -> PolarsResult { + LazyParquetReader::new(args).with_sources(sources).finish() } /// Create a LazyFrame directly from a parquet scan. 
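scan_parquet_sourced and scan_ipc_sourced above are the Rust entry points used when the Python layer hands over a buffer rather than a path. A minimal illustrative sketch of what this is meant to enable once the corresponding py-polars signature changes are applied (again, not part of the patch):

    import io
    import polars as pl

    df = pl.DataFrame({"a": [1, 2, 3]})
    buf = io.BytesIO()
    df.write_parquet(buf)
    buf.seek(0)

    # scan_parquet can now read from the in-memory buffer directly.
    assert pl.scan_parquet(buf).collect().equals(df)
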
diff --git a/crates/polars-mem-engine/src/executors/scan/csv.rs b/crates/polars-mem-engine/src/executors/scan/csv.rs index 4ab5a034c584..b06386cdfa03 100644 --- a/crates/polars-mem-engine/src/executors/scan/csv.rs +++ b/crates/polars-mem-engine/src/executors/scan/csv.rs @@ -9,7 +9,7 @@ use polars_error::feature_gated; use super::*; pub struct CsvExec { - pub source: ScanSource, + pub sources: ScanSources, pub file_info: FileInfo, pub options: CsvReadOptions, pub file_options: FileScanOptions, @@ -18,7 +18,6 @@ pub struct CsvExec { impl CsvExec { fn read(&self) -> PolarsResult { - let paths = self.source.as_paths(); let with_columns = self .file_options .with_columns @@ -46,7 +45,7 @@ impl CsvExec { .with_row_index(None) .with_path::<&str>(None); - if paths.is_empty() { + if self.sources.is_empty() { let out = if let Some(schema) = options_base.schema { DataFrame::from_rows_and_schema(&[], schema.as_ref())? } else { @@ -57,7 +56,7 @@ impl CsvExec { let verbose = config::verbose(); let force_async = config::force_async(); - let run_async = force_async || is_cloud_url(paths.first().unwrap()); + let run_async = force_async || self.sources.is_cloud_url(); if force_async && verbose { eprintln!("ASYNC READING FORCED"); @@ -65,34 +64,45 @@ impl CsvExec { let finish_read = |i: usize, options: CsvReadOptions, predicate: Option>| { - let path = &paths[i]; - let file = if run_async { - feature_gated!("cloud", { - polars_io::file_cache::FILE_CACHE - .get_entry(path.to_str().unwrap()) - // Safety: This was initialized by schema inference. - .unwrap() - .try_open_assume_latest() - }) - } else { - polars_utils::open_file(path) - }?; - + let source = self.sources.at(i); let owned = &mut vec![]; - let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; - let mut df = options - .into_reader_with_file_handle(std::io::Cursor::new(maybe_decompress_bytes( - mmap.as_ref(), - owned, - )?)) - ._with_predicate(predicate.clone()) - .finish()?; + + let mut df = match source { + ScanSourceRef::File(path) => { + let file = if run_async { + feature_gated!("cloud", { + polars_io::file_cache::FILE_CACHE + .get_entry(path.to_str().unwrap()) + // Safety: This was initialized by schema inference. + .unwrap() + .try_open_assume_latest() + }) + } else { + polars_utils::open_file(path) + }?; + + let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; + options + .into_reader_with_file_handle(std::io::Cursor::new( + maybe_decompress_bytes(mmap.as_ref(), owned)?, + )) + ._with_predicate(predicate.clone()) + .finish()? + }, + ScanSourceRef::Buffer(buffer) => options + .into_reader_with_file_handle(std::io::Cursor::new(maybe_decompress_bytes( + buffer, owned, + )?)) + ._with_predicate(predicate.clone()) + .finish()?, + }; if let Some(col) = &self.file_options.include_file_paths { - let path = path.to_str().unwrap(); + let name = source.to_file_path(); + unsafe { df.with_column_unchecked( - StringChunked::full(col.clone(), path, df.height()).into_series(), + StringChunked::full(col.clone(), name, df.height()).into_series(), ) }; } @@ -110,14 +120,14 @@ impl CsvExec { } let mut n_rows_read = 0usize; - let mut out = Vec::with_capacity(paths.len()); + let mut out = Vec::with_capacity(self.sources.len()); // If we have n_rows or row_index then we need to count how many rows we read, so we need // to delay applying the predicate. 
let predicate_during_read = predicate .clone() .filter(|_| n_rows.is_none() && self.file_options.row_index.is_none()); - for i in 0..paths.len() { + for i in 0..self.sources.len() { let opts = options_base .clone() .with_row_index(self.file_options.row_index.clone().map(|mut ri| { @@ -162,10 +172,10 @@ impl CsvExec { if n_rows.is_some() && n_rows_read == n_rows.unwrap() { if verbose { eprintln!( - "reached n_rows = {} at file {} / {}", + "reached n_rows = {} at source {} / {}", n_rows.unwrap(), 1 + i, - paths.len() + self.sources.len() ) } break; @@ -190,10 +200,10 @@ impl CsvExec { let dfs = POOL.install(|| { let step = std::cmp::min(POOL.current_num_threads(), 128); - (0..paths.len()) + (0..self.sources.len()) .step_by(step) .map(|start| { - (start..std::cmp::min(start.saturating_add(step), paths.len())) + (start..std::cmp::min(start.saturating_add(step), self.sources.len())) .into_par_iter() .map(|i| finish_read(i, options_base.clone(), predicate.clone())) .collect::>>() @@ -222,7 +232,7 @@ impl CsvExec { impl Executor for CsvExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { let profile_name = if state.has_node_timer() { - let mut ids = vec![self.source.id()]; + let mut ids = vec![self.sources.id()]; if self.predicate.is_some() { ids.push("predicate".into()) } diff --git a/crates/polars-mem-engine/src/executors/scan/ipc.rs b/crates/polars-mem-engine/src/executors/scan/ipc.rs index b9387cad5878..ae1e3bcf30f2 100644 --- a/crates/polars-mem-engine/src/executors/scan/ipc.rs +++ b/crates/polars-mem-engine/src/executors/scan/ipc.rs @@ -1,6 +1,7 @@ use hive::HivePartitions; use polars_core::config; use polars_core::utils::accumulate_dataframes_vertical; +use polars_error::feature_gated; use polars_io::cloud::CloudOptions; use polars_io::path_utils::is_cloud_url; use polars_io::predicates::apply_predicate; @@ -9,7 +10,7 @@ use rayon::prelude::*; use super::*; pub struct IpcExec { - pub(crate) sources: ScanSource, + pub(crate) sources: ScanSources, pub(crate) file_info: FileInfo, pub(crate) predicate: Option>, pub(crate) options: IpcScanOptions, @@ -20,24 +21,20 @@ pub struct IpcExec { impl IpcExec { fn read(&mut self) -> PolarsResult { - let paths = self.sources.as_paths(); - let is_cloud = paths.iter().any(is_cloud_url); + let is_cloud = match &self.sources { + ScanSources::Files(paths) => paths.iter().any(is_cloud_url), + ScanSources::Buffers(_) => false, + }; let force_async = config::force_async(); let mut out = if is_cloud || force_async { - #[cfg(not(feature = "cloud"))] - { - panic!("activate cloud feature") - } - - #[cfg(feature = "cloud")] - { + feature_gated!("cloud", { if force_async && config::verbose() { eprintln!("ASYNC READING FORCED"); } polars_io::pl_async::get_runtime().block_on_potential_spawn(self.read_async())? - } + }) } else { self.read_sync()? 
}; @@ -49,11 +46,10 @@ impl IpcExec { Ok(out) } - fn read_impl PolarsResult + Send + Sync>( + fn read_impl( &mut self, - path_idx_to_file: F, + idx_to_cached_file: impl Fn(usize) -> Option> + Send + Sync, ) -> PolarsResult { - let paths = self.sources.as_paths(); if config::verbose() { eprintln!("executing ipc read sync with row_index = {:?}, n_rows = {:?}, predicate = {:?} for paths {:?}", self.file_options.row_index.as_ref(), @@ -62,7 +58,7 @@ impl IpcExec { x.1 }).as_ref(), self.predicate.is_some(), - paths + self.sources, ); } @@ -73,33 +69,60 @@ impl IpcExec { self.file_options.row_index.is_some(), ); - let read_path = |path_index: usize, n_rows: Option| { - IpcReader::new(path_idx_to_file(path_index)?) - .with_n_rows(n_rows) - .with_row_index(self.file_options.row_index.clone()) - .with_projection(projection.clone()) - .with_hive_partition_columns( - self.hive_parts - .as_ref() - .map(|x| x[path_index].materialize_partition_columns()), - ) - .with_include_file_path(self.file_options.include_file_paths.as_ref().map(|x| { - ( - x.clone(), - Arc::from(paths[path_index].to_str().unwrap().to_string()), + let read_path = |index: usize, n_rows: Option| { + let source = self.sources.at(index); + + match source { + ScanSourceRef::File(path) => { + let file = match idx_to_cached_file(index) { + None => std::fs::File::open(path)?, + Some(f) => f?, + }; + + IpcReader::new(file) + .with_n_rows(n_rows) + .with_row_index(self.file_options.row_index.clone()) + .with_projection(projection.clone()) + .with_hive_partition_columns( + self.hive_parts + .as_ref() + .map(|x| x[index].materialize_partition_columns()), + ) + .with_include_file_path( + self.file_options + .include_file_paths + .as_ref() + .map(|x| (x.clone(), Arc::from(source.to_file_path()))), + ) + .memory_mapped(self.options.memory_map.then(|| path.to_path_buf())) + .finish() + }, + ScanSourceRef::Buffer(buff) => IpcReader::new(std::io::Cursor::new(buff)) + .with_n_rows(n_rows) + .with_row_index(self.file_options.row_index.clone()) + .with_projection(projection.clone()) + .with_hive_partition_columns( + self.hive_parts + .as_ref() + .map(|x| x[index].materialize_partition_columns()), ) - })) - .memory_mapped(self.options.memory_map.then(|| paths[path_index].clone())) - .finish() + .with_include_file_path( + self.file_options + .include_file_paths + .as_ref() + .map(|x| (x.clone(), Arc::from(source.to_file_path()))), + ) + .finish(), + } }; let mut dfs = if let Some(mut n_rows) = self.file_options.slice.map(|x| { assert_eq!(x.0, 0); x.1 }) { - let mut out = Vec::with_capacity(paths.len()); + let mut out = Vec::with_capacity(self.sources.len()); - for i in 0..paths.len() { + for i in 0..self.sources.len() { let df = read_path(i, Some(n_rows))?; let df_height = df.height(); out.push(df); @@ -117,7 +140,7 @@ impl IpcExec { out } else { POOL.install(|| { - (0..paths.len()) + (0..self.sources.len()) .into_par_iter() .map(|i| read_path(i, None)) .collect::>>() @@ -153,9 +176,7 @@ impl IpcExec { } fn read_sync(&mut self) -> PolarsResult { - let paths = self.sources.into_paths(); - let paths = paths.clone(); - self.read_impl(move |i| std::fs::File::open(&paths[i]).map_err(Into::into)) + self.read_impl(|_| None) } #[cfg(feature = "cloud")] @@ -176,17 +197,15 @@ impl IpcExec { self.cloud_options.as_ref(), )?; - self.read_impl(move |i| cache_entries[i].try_open_check_latest()) + self.read_impl(|i| Some(cache_entries[i].try_open_check_latest())) }) } } impl Executor for IpcExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { - let 
paths = self.sources.as_paths(); - let profile_name = if state.has_node_timer() { - let mut ids = vec![PlSmallStr::from_str(paths[0].to_string_lossy().as_ref())]; + let mut ids = vec![self.sources.id()]; if self.predicate.is_some() { ids.push("predicate".into()) } diff --git a/crates/polars-mem-engine/src/executors/scan/ndjson.rs b/crates/polars-mem-engine/src/executors/scan/ndjson.rs index 68ad24ab837e..27aab29fd0c1 100644 --- a/crates/polars-mem-engine/src/executors/scan/ndjson.rs +++ b/crates/polars-mem-engine/src/executors/scan/ndjson.rs @@ -1,10 +1,11 @@ use polars_core::config; use polars_core::utils::accumulate_dataframes_vertical; +use polars_error::feature_gated; use super::*; pub struct JsonExec { - sources: ScanSource, + sources: ScanSources, options: NDJsonReadOptions, file_scan_options: FileScanOptions, file_info: FileInfo, @@ -13,7 +14,7 @@ pub struct JsonExec { impl JsonExec { pub fn new( - sources: ScanSource, + sources: ScanSources, options: NDJsonReadOptions, file_scan_options: FileScanOptions, file_info: FileInfo, @@ -36,11 +37,10 @@ impl JsonExec { .unwrap() .as_ref() .unwrap_right(); - let paths = self.sources.as_paths(); let verbose = config::verbose(); let force_async = config::force_async(); - let run_async = force_async || is_cloud_url(paths.first().unwrap()); + let run_async = force_async || self.sources.is_cloud_url(); if force_async && verbose { eprintln!("ASYNC READING FORCED"); @@ -65,59 +65,80 @@ impl JsonExec { return Ok(df); } - let dfs = paths + let dfs = self + .sources .iter() - .map_while(|p| { + .map_while(|source| { if n_rows == Some(0) { return None; } - let file = if run_async { - #[cfg(feature = "cloud")] - { - match polars_io::file_cache::FILE_CACHE - .get_entry(p.to_str().unwrap()) - // Safety: This was initialized by schema inference. - .unwrap() - .try_open_assume_latest() - { - Ok(v) => v, - Err(e) => return Some(Err(e)), - } - } - #[cfg(not(feature = "cloud"))] - { - panic!("required feature `cloud` is not enabled") - } - } else { - match polars_utils::open_file(p.as_ref()) { - Ok(v) => v, - Err(e) => return Some(Err(e)), - } - }; - - let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; - let owned = &mut vec![]; - let curs = - std::io::Cursor::new(match maybe_decompress_bytes(mmap.as_ref(), owned) { - Ok(v) => v, - Err(e) => return Some(Err(e)), - }); - let reader = JsonLineReader::new(curs); - let row_index = self.file_scan_options.row_index.as_mut(); - let df = reader - .with_schema(schema.clone()) - .with_rechunk(self.file_scan_options.rechunk) - .with_chunk_size(Some(self.options.chunk_size)) - .with_row_index(row_index) - .with_predicate(self.predicate.clone().map(phys_expr_to_io_expr)) - .with_projection(self.file_scan_options.with_columns.clone()) - .low_memory(self.options.low_memory) - .with_n_rows(n_rows) - .with_ignore_errors(self.options.ignore_errors) - .finish(); + let owned = &mut vec![]; + let df = match source { + ScanSourceRef::File(path) => { + let file = if run_async { + feature_gated!("cloud", { + match polars_io::file_cache::FILE_CACHE + .get_entry(path.to_str().unwrap()) + // Safety: This was initialized by schema inference. 
+ .unwrap() + .try_open_assume_latest() + { + Ok(v) => v, + Err(e) => return Some(Err(e)), + } + }) + } else { + match polars_utils::open_file(path) { + Ok(v) => v, + Err(e) => return Some(Err(e)), + } + }; + + let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; + let curs = std::io::Cursor::new( + match maybe_decompress_bytes(mmap.as_ref(), owned) { + Ok(v) => v, + Err(e) => return Some(Err(e)), + }, + ); + let reader = JsonLineReader::new(curs); + + reader + .with_schema(schema.clone()) + .with_rechunk(self.file_scan_options.rechunk) + .with_chunk_size(Some(self.options.chunk_size)) + .with_row_index(row_index) + .with_predicate(self.predicate.clone().map(phys_expr_to_io_expr)) + .with_projection(self.file_scan_options.with_columns.clone()) + .low_memory(self.options.low_memory) + .with_n_rows(n_rows) + .with_ignore_errors(self.options.ignore_errors) + .finish() + }, + ScanSourceRef::Buffer(buff) => { + let curs = + std::io::Cursor::new(match maybe_decompress_bytes(buff, owned) { + Ok(v) => v, + Err(e) => return Some(Err(e)), + }); + let reader = JsonLineReader::new(curs); + + reader + .with_schema(schema.clone()) + .with_rechunk(self.file_scan_options.rechunk) + .with_chunk_size(Some(self.options.chunk_size)) + .with_row_index(row_index) + .with_predicate(self.predicate.clone().map(phys_expr_to_io_expr)) + .with_projection(self.file_scan_options.with_columns.clone()) + .low_memory(self.options.low_memory) + .with_n_rows(n_rows) + .with_ignore_errors(self.options.ignore_errors) + .finish() + }, + }; let mut df = match df { Ok(df) => df, @@ -129,10 +150,10 @@ impl JsonExec { } if let Some(col) = &self.file_scan_options.include_file_paths { - let path = p.to_str().unwrap(); + let name = source.to_file_path(); unsafe { df.with_column_unchecked( - StringChunked::full(col.clone(), path, df.height()).into_series(), + StringChunked::full(col.clone(), name, df.height()).into_series(), ) }; } @@ -147,9 +168,8 @@ impl JsonExec { impl Executor for JsonExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { - let paths = self.sources.as_paths(); let profile_name = if state.has_node_timer() { - let ids = vec![paths[0].to_string_lossy().clone()]; + let ids = vec![self.sources.id()]; let name = comma_delimited("ndjson".to_string(), &ids); Cow::Owned(name) } else { diff --git a/crates/polars-mem-engine/src/executors/scan/parquet.rs b/crates/polars-mem-engine/src/executors/scan/parquet.rs index ed740809fcd3..bb47eb458a49 100644 --- a/crates/polars-mem-engine/src/executors/scan/parquet.rs +++ b/crates/polars-mem-engine/src/executors/scan/parquet.rs @@ -6,14 +6,13 @@ use polars_core::utils::accumulate_dataframes_vertical; use polars_error::feature_gated; use polars_io::cloud::CloudOptions; use polars_io::parquet::metadata::FileMetaDataRef; -use polars_io::path_utils::is_cloud_url; use polars_io::utils::slice::split_slice_at_file; use polars_io::RowIndex; use super::*; pub struct ParquetExec { - source: ScanSource, + sources: ScanSources, file_info: FileInfo, hive_parts: Option>>, predicate: Option>, @@ -28,7 +27,7 @@ pub struct ParquetExec { impl ParquetExec { #[allow(clippy::too_many_arguments)] pub(crate) fn new( - source: ScanSource, + sources: ScanSources, file_info: FileInfo, hive_parts: Option>>, predicate: Option>, @@ -38,7 +37,7 @@ impl ParquetExec { metadata: Option, ) -> Self { ParquetExec { - source, + sources, file_info, hive_parts, predicate, @@ -51,7 +50,7 @@ impl ParquetExec { fn read_par(&mut self) -> PolarsResult> { let parallel = match self.options.parallel { - 
ParallelStrategy::Auto if self.source.num_sources() > POOL.current_num_threads() => { + ParallelStrategy::Auto if self.sources.len() > POOL.current_num_threads() => { ParallelStrategy::RowGroups }, identity => identity, @@ -63,78 +62,53 @@ impl ParquetExec { let slice_info = match self.file_options.slice { None => ScanSourceSliceInfo { item_slice: 0..usize::MAX, - source_slice: 0..self.source.num_sources(), + source_slice: 0..self.sources.len(), + }, + Some(slice) => { + self.sources + .collect_slice_information(slice, |source| match source { + ScanSourceRef::File(path) => { + ParquetReader::new(std::fs::File::open(path)?).num_rows() + }, + ScanSourceRef::Buffer(buff) => { + ParquetReader::new(std::io::Cursor::new(buff)).num_rows() + }, + })? }, - Some(slice) => self.source.collect_slice_information( - slice, - |path| ParquetReader::new(std::fs::File::open(path)?).num_rows(), - |buff| ParquetReader::new(std::io::Cursor::new(buff)).num_rows(), - )?, }; - match &self.source { - ScanSource::Buffer(buffer) => { - let row_index = self.file_options.row_index.take(); + let mut current_offset = 0; + let base_row_index = self.file_options.row_index.take(); + // Limit no. of files at a time to prevent open file limits. + + let paths = self.sources.as_paths(); + + for i in slice_info.source_slice.step_by(step) { + let end = std::cmp::min(i.saturating_add(step), paths.len()); + let hive_parts = self.hive_parts.as_ref().map(|x| &x[i..end]); + + if current_offset >= slice_info.item_slice.end && !result.is_empty() { + return Ok(result); + } + + // First initialize the readers, predicates and metadata. + // This will be used to determine the slices. That way we can actually read all the + // files in parallel even if we add row index columns or slices. + let iter = (0..self.sources.len()).into_par_iter().map(|i| { + let source = self.sources.at(i); + let hive_partitions = hive_parts.map(|x| x[i].materialize_partition_columns()); + let (projection, predicate) = prepare_scan_args( self.predicate.clone(), &mut self.file_options.with_columns.clone(), &mut self.file_info.schema.clone(), - row_index.is_some(), - None, + base_row_index.is_some(), + hive_partitions.as_deref(), ); - result = vec![ParquetReader::new(std::io::Cursor::new(buffer)) - .read_parallel(parallel) - .set_low_memory(self.options.low_memory) - .use_statistics(self.options.use_statistics) - .set_rechunk(false) - .with_slice(Some(( - slice_info.item_slice.start, - slice_info.item_slice.len(), - ))) - .with_row_index(row_index) - .with_predicate(predicate.clone()) - .with_projection(projection.clone()) - .check_schema( - self.file_info - .reader_schema - .clone() - .unwrap() - .unwrap_left() - .as_ref(), - )? - .finish()?]; - }, - ScanSource::Files(paths) => { - let mut current_offset = 0; - let base_row_index = self.file_options.row_index.take(); - // Limit no. of files at a time to prevent open file limits. - - for i in slice_info.source_slice.step_by(step) { - let end = std::cmp::min(i.saturating_add(step), paths.len()); - let paths = &paths[i..end]; - let hive_parts = self.hive_parts.as_ref().map(|x| &x[i..end]); - - if current_offset >= slice_info.item_slice.end && !result.is_empty() { - return Ok(result); - } - - // First initialize the readers, predicates and metadata. - // This will be used to determine the slices. That way we can actually read all the - // files in parallel even if we add row index columns or slices. 
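For orientation, the slice handling referred to in the comment above boils down to this: once every source's row count is known, the single global slice is translated into one (offset, length) pair per source, after which all sources can be read fully in parallel. A rough standalone sketch of that translation follows; the helper name `per_source_slices` is hypothetical and not part of this patch, which does the equivalent work per file via `split_slice_at_file`.

fn per_source_slices(row_counts: &[usize], start: usize, end: usize) -> Vec<(usize, usize)> {
    // Running row offset of the current source within the concatenated result.
    let mut offset = 0;
    row_counts
        .iter()
        .map(|&num_rows| {
            // Clamp the global [start, end) row range into this source's local range.
            let local_start = start.saturating_sub(offset).min(num_rows);
            let local_end = end.saturating_sub(offset).min(num_rows);
            offset += num_rows;
            (local_start, local_end - local_start)
        })
        .collect()
}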
- let iter = (0..paths.len()).into_par_iter().map(|i| { - let path = &paths[i]; - let hive_partitions = - hive_parts.map(|x| x[i].materialize_partition_columns()); - + match source { + ScanSourceRef::File(path) => { let file = std::fs::File::open(path)?; - let (projection, predicate) = prepare_scan_args( - self.predicate.clone(), - &mut self.file_options.with_columns.clone(), - &mut self.file_info.schema.clone(), - base_row_index.is_some(), - hive_partitions.as_deref(), - ); let mut reader = ParquetReader::new(file) .read_parallel(parallel) @@ -152,68 +126,68 @@ impl ParquetExec { reader .num_rows() .map(|num_rows| (reader, num_rows, predicate, projection)) - }); - - // We do this in parallel because wide tables can take a long time deserializing metadata. - let readers_and_metadata = - POOL.install(|| iter.collect::>>())?; - - let current_offset_ref = &mut current_offset; - let row_statistics = readers_and_metadata - .iter() - .map(|(_, num_rows, _, _)| { - let cum_rows = *current_offset_ref; - ( - cum_rows, - split_slice_at_file( - current_offset_ref, - *num_rows, - slice_info.item_slice.start, - slice_info.item_slice.end, - ), - ) - }) - .collect::>(); - - let out = POOL.install(|| { - readers_and_metadata - .into_par_iter() - .zip(row_statistics.into_par_iter()) - .map( - |((reader, _, predicate, projection), (cumulative_read, slice))| { - let row_index = base_row_index.as_ref().map(|rc| RowIndex { - name: rc.name.clone(), - offset: rc.offset + cumulative_read as IdxSize, - }); - - let df = reader - .with_slice(Some(slice)) - .with_row_index(row_index) - .with_predicate(predicate.clone()) - .with_projection(projection.clone()) - .check_schema( - self.file_info - .reader_schema - .clone() - .unwrap() - .unwrap_left() - .as_ref(), - )? - .finish()?; - - Ok(df) - }, - ) - .collect::>>() - })?; - - if result.is_empty() { - result = out; - } else { - result.extend_from_slice(&out) - } + }, + ScanSourceRef::Buffer(_) => todo!(), } - }, + }); + + // We do this in parallel because wide tables can take a long time deserializing metadata. + let readers_and_metadata = POOL.install(|| iter.collect::>>())?; + + let current_offset_ref = &mut current_offset; + let row_statistics = readers_and_metadata + .iter() + .map(|(_, num_rows, _, _)| { + let cum_rows = *current_offset_ref; + ( + cum_rows, + split_slice_at_file( + current_offset_ref, + *num_rows, + slice_info.item_slice.start, + slice_info.item_slice.end, + ), + ) + }) + .collect::>(); + + let out = POOL.install(|| { + readers_and_metadata + .into_par_iter() + .zip(row_statistics.into_par_iter()) + .map( + |((reader, _, predicate, projection), (cumulative_read, slice))| { + let row_index = base_row_index.as_ref().map(|rc| RowIndex { + name: rc.name.clone(), + offset: rc.offset + cumulative_read as IdxSize, + }); + + let df = reader + .with_slice(Some(slice)) + .with_row_index(row_index) + .with_predicate(predicate.clone()) + .with_projection(projection.clone()) + .check_schema( + self.file_info + .reader_schema + .clone() + .unwrap() + .unwrap_left() + .as_ref(), + )? 
+ .finish()?; + + Ok(df) + }, + ) + .collect::>>() + })?; + + if result.is_empty() { + result = out; + } else { + result.extend_from_slice(&out) + } } Ok(result) @@ -226,7 +200,7 @@ impl ParquetExec { use polars_io::utils::slice::split_slice_at_file; let verbose = verbose(); - let paths = self.source.into_paths(); + let paths = self.sources.into_paths(); let first_metadata = &self.metadata; let cloud_options = self.cloud_options.as_ref(); @@ -443,10 +417,7 @@ impl ParquetExec { .and_then(|_| self.predicate.take()) .map(phys_expr_to_io_expr); - let is_cloud = match &self.source { - ScanSource::Files(paths) => is_cloud_url(paths.first().unwrap()), - ScanSource::Buffer(_) => false, - }; + let is_cloud = self.sources.is_cloud_url(); let force_async = config::force_async(); let out = if is_cloud || force_async { @@ -475,7 +446,7 @@ impl ParquetExec { impl Executor for ParquetExec { fn execute(&mut self, state: &mut ExecutionState) -> PolarsResult { let profile_name = if state.has_node_timer() { - let mut ids = vec![self.source.id()]; + let mut ids = vec![self.sources.id()]; if self.predicate.is_some() { ids.push("predicate".into()) } diff --git a/crates/polars-mem-engine/src/planner/lp.rs b/crates/polars-mem-engine/src/planner/lp.rs index 1f161a34587a..45487f7b7024 100644 --- a/crates/polars-mem-engine/src/planner/lp.rs +++ b/crates/polars-mem-engine/src/planner/lp.rs @@ -306,7 +306,7 @@ fn create_physical_plan_impl( match scan_type { #[cfg(feature = "csv")] FileScan::Csv { options, .. } => Ok(Box::new(executors::CsvExec { - source: sources, + sources, file_info, options, predicate, diff --git a/crates/polars-pipe/src/executors/sources/csv.rs b/crates/polars-pipe/src/executors/sources/csv.rs index 5ca5551c506d..673848e67d77 100644 --- a/crates/polars-pipe/src/executors/sources/csv.rs +++ b/crates/polars-pipe/src/executors/sources/csv.rs @@ -4,7 +4,7 @@ use polars_core::{config, POOL}; use polars_io::csv::read::{BatchedCsvReader, CsvReadOptions, CsvReader}; use polars_io::path_utils::is_cloud_url; use polars_plan::global::_set_n_rows_for_scan; -use polars_plan::plans::ScanSource; +use polars_plan::plans::ScanSources; use polars_plan::prelude::FileScanOptions; use polars_utils::itertools::Itertools; @@ -20,7 +20,7 @@ pub(crate) struct CsvSource { batched_reader: Option>, reader: Option>, n_threads: usize, - sources: ScanSource, + sources: ScanSources, options: Option, file_options: FileScanOptions, verbose: bool, @@ -141,7 +141,7 @@ impl CsvSource { } pub(crate) fn new( - sources: ScanSource, + sources: ScanSources, schema: SchemaRef, options: CsvReadOptions, file_options: FileScanOptions, diff --git a/crates/polars-pipe/src/executors/sources/parquet.rs b/crates/polars-pipe/src/executors/sources/parquet.rs index ab5abbade817..e91eb2ec1bba 100644 --- a/crates/polars-pipe/src/executors/sources/parquet.rs +++ b/crates/polars-pipe/src/executors/sources/parquet.rs @@ -20,7 +20,7 @@ use polars_io::prelude::materialize_projection; use polars_io::prelude::ParquetAsyncReader; use polars_io::utils::slice::split_slice_at_file; use polars_io::SerReader; -use polars_plan::plans::{FileInfo, ScanSource}; +use polars_plan::plans::{FileInfo, ScanSources}; use polars_plan::prelude::hive::HivePartitions; use polars_plan::prelude::FileScanOptions; use polars_utils::itertools::Itertools; @@ -36,7 +36,7 @@ pub struct ParquetSource { processed_paths: usize, processed_rows: AtomicUsize, iter: Range, - sources: ScanSource, + sources: ScanSources, options: ParquetOptions, file_options: FileScanOptions, #[allow(dead_code)] 
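The executors above are all moved onto the same ScanSources abstraction that this patch introduces further down: a collection that is either a set of paths or a set of in-memory buffers, iterated as ScanSourceRef values. Below is a minimal usage sketch of that abstraction; the import path and re-exports are assumptions, and error handling is omitted.

use std::path::PathBuf;
use std::sync::Arc;

use polars_plan::plans::{ScanSourceRef, ScanSources};

fn describe(sources: &ScanSources) {
    for source in sources.iter() {
        match source {
            // A source is either a path on disk ...
            ScanSourceRef::File(path) => println!("file source: {}", path.display()),
            // ... or raw bytes held in memory (e.g. coming from Python's BytesIO).
            ScanSourceRef::Buffer(bytes) => println!("in-memory source: {} bytes", bytes.len()),
        }
    }
}

fn main() {
    let from_paths = ScanSources::Files([PathBuf::from("data.parquet")].into());
    let bytes: Arc<[u8]> = Arc::from(b"not a real parquet file".as_slice());
    let from_buffers = ScanSources::Buffers([bytes].into());
    describe(&from_paths);
    describe(&from_buffers);
}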
@@ -246,7 +246,7 @@ impl ParquetSource { #[allow(unused_variables)] #[allow(clippy::too_many_arguments)] pub(crate) fn new( - sources: ScanSource, + sources: ScanSources, options: ParquetOptions, cloud_options: Option, metadata: Option, diff --git a/crates/polars-plan/src/client/check.rs b/crates/polars-plan/src/client/check.rs index c7070d22ed0c..1f5562bb4670 100644 --- a/crates/polars-plan/src/client/check.rs +++ b/crates/polars-plan/src/client/check.rs @@ -2,7 +2,7 @@ use polars_core::error::{polars_err, PolarsResult}; use polars_io::path_utils::is_cloud_url; use crate::plans::options::SinkType; -use crate::plans::{DslPlan, DslScanSource, FileScan}; +use crate::plans::{DslPlan, FileScan, ScanSources}; /// Assert that the given [`DslPlan`] is eligible to be executed on Polars Cloud. pub(super) fn assert_cloud_eligible(dsl: &DslPlan) -> PolarsResult<()> { @@ -13,13 +13,14 @@ pub(super) fn assert_cloud_eligible(dsl: &DslPlan) -> PolarsResult<()> { DslPlan::Scan { sources, scan_type, .. } => { - match sources { - DslScanSource::File(file) => { - if file.lock().unwrap().paths.iter().any(|p| !is_cloud_url(p)) { + let sources_lock = sources.lock().unwrap(); + match &sources_lock.sources { + ScanSources::Files(paths) => { + if paths.iter().any(|p| !is_cloud_url(p)) { return ineligible_error("contains scan of local file system"); } }, - DslScanSource::Buffer(_) => { + ScanSources::Buffers(_) => { return ineligible_error("contains scan of in-memory buffer"); }, } diff --git a/crates/polars-plan/src/plans/builder_dsl.rs b/crates/polars-plan/src/plans/builder_dsl.rs index 1170f95ec7a2..7efa55417509 100644 --- a/crates/polars-plan/src/plans/builder_dsl.rs +++ b/crates/polars-plan/src/plans/builder_dsl.rs @@ -1,4 +1,4 @@ -use std::sync::{Arc, RwLock}; +use std::sync::{Arc, Mutex, RwLock}; use polars_core::prelude::*; #[cfg(any(feature = "parquet", feature = "ipc", feature = "csv"))] @@ -58,7 +58,10 @@ impl DslBuilder { }; Ok(DslPlan::Scan { - sources: DslScanSource::Buffer(Arc::default()), + sources: Arc::new(Mutex::new(DslScanSources { + sources: ScanSources::Buffers(Arc::default()), + is_expanded: true, + })), file_info: Arc::new(RwLock::new(Some(file_info))), hive_parts: None, predicate: None, @@ -77,7 +80,7 @@ impl DslBuilder { #[cfg(feature = "parquet")] #[allow(clippy::too_many_arguments)] pub fn scan_parquet( - source: DslScanSource, + sources: DslScanSources, n_rows: Option, cache: bool, parallel: polars_io::parquet::read::ParallelStrategy, @@ -102,8 +105,7 @@ impl DslBuilder { include_file_paths, }; Ok(DslPlan::Scan { - // @FIX: sources -> source - sources: source, + sources: Arc::new(Mutex::new(sources)), file_info: Arc::new(RwLock::new(None)), hive_parts: None, predicate: None, @@ -124,7 +126,7 @@ impl DslBuilder { #[cfg(feature = "ipc")] #[allow(clippy::too_many_arguments)] pub fn scan_ipc( - source: DslScanSource, + sources: DslScanSources, options: IpcScanOptions, n_rows: Option, cache: bool, @@ -135,7 +137,7 @@ impl DslBuilder { include_file_paths: Option, ) -> PolarsResult { Ok(DslPlan::Scan { - sources: source, + sources: Arc::new(Mutex::new(sources)), file_info: Arc::new(RwLock::new(None)), hive_parts: None, file_options: FileScanOptions { @@ -162,7 +164,7 @@ impl DslBuilder { #[allow(clippy::too_many_arguments)] #[cfg(feature = "csv")] pub fn scan_csv( - source: DslScanSource, + sources: DslScanSources, read_options: CsvReadOptions, cache: bool, cloud_options: Option, @@ -188,7 +190,7 @@ impl DslBuilder { include_file_paths, }; Ok(DslPlan::Scan { - sources: source, + sources: 
Arc::new(Mutex::new(sources)), file_info: Arc::new(RwLock::new(None)), hive_parts: None, file_options: options, diff --git a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs index 72e75d2b3017..7966d6ff688e 100644 --- a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs +++ b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs @@ -105,21 +105,16 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult let v = match lp { DslPlan::Scan { - mut sources, + sources, file_info, hive_parts, predicate, mut file_options, mut scan_type, } => { - sources.expand_paths(&mut scan_type, &mut file_options)?; - - let source = match sources { - DslScanSource::File(paths) => { - ScanSource::Files(paths.as_ref().lock().unwrap().paths.clone()) - }, - DslScanSource::Buffer(buf) => ScanSource::Buffer(buf), - }; + let mut sources_lock = sources.lock().unwrap(); + sources_lock.expand_paths(&mut scan_type, &mut file_options)?; + let sources = sources_lock.sources.clone(); let file_info_read = file_info.read().unwrap(); @@ -146,7 +141,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult .. } => { let (file_info, md) = scans::parquet_file_info( - &source, + &sources, &file_options, cloud_options.as_ref(), ) @@ -160,12 +155,9 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult metadata, .. } => { - let (file_info, md) = scans::ipc_file_info( - source.as_paths(), - &file_options, - cloud_options.as_ref(), - ) - .map_err(|e| e.context(failed_here!(ipc scan)))?; + let (file_info, md) = + scans::ipc_file_info(&sources, &file_options, cloud_options.as_ref()) + .map_err(|e| e.context(failed_here!(ipc scan)))?; *metadata = Some(md); file_info }, @@ -174,7 +166,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult options, cloud_options, } => scans::csv_file_info( - &source, + &sources, &file_options, options, cloud_options.as_ref(), @@ -185,7 +177,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult options, cloud_options, } => scans::ndjson_file_info( - source.as_paths(), + &sources, &file_options, options, cloud_options.as_ref(), @@ -205,7 +197,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult let mut owned = None; hive_partitions_from_paths( - source.as_paths().as_ref(), + &sources.as_paths(), file_options.hive_options.hive_start_idx, file_options.hive_options.schema.clone(), match resolved_file_info.reader_schema.as_ref().unwrap() { @@ -279,7 +271,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult } IR::Scan { - sources: source, + sources, file_info: resolved_file_info, hive_parts, output_schema: None, @@ -819,64 +811,48 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult Ok(ctxt.lp_arena.add(v)) } -impl DslScanSource { +impl DslScanSources { /// Expand scan paths if they were not already expanded. pub fn expand_paths( &mut self, scan_type: &mut FileScan, file_options: &mut FileScanOptions, ) -> PolarsResult<()> { - match self { - DslScanSource::File(source) => { - #[allow(unused_mut)] - let mut lock = source.lock().unwrap(); - - // Return if paths are already expanded - if lock.is_expanded { - return Ok(()); - } - - { - let paths_expanded = match &scan_type { - #[cfg(feature = "parquet")] - FileScan::Parquet { cloud_options, .. 
} => { - expand_scan_paths_with_hive_update( - &lock.paths[..], - file_options, - cloud_options, - )? - }, - #[cfg(feature = "ipc")] - FileScan::Ipc { cloud_options, .. } => expand_scan_paths_with_hive_update( - &lock.paths[..], - file_options, - cloud_options, - )?, - #[cfg(feature = "csv")] - FileScan::Csv { cloud_options, .. } => expand_paths( - &lock.paths[..], - file_options.glob, - cloud_options.as_ref(), - )?, - #[cfg(feature = "json")] - FileScan::NDJson { cloud_options, .. } => expand_paths( - &lock.paths[..], - file_options.glob, - cloud_options.as_ref(), - )?, - FileScan::Anonymous { .. } => unreachable!(), // Invariant: Anonymous scans are already expanded. - }; + if self.is_expanded { + return Ok(()); + } - #[allow(unreachable_code)] - { - lock.paths = paths_expanded; - lock.is_expanded = true; + let ScanSources::Files(paths) = &self.sources else { + self.is_expanded = true; + return Ok(()); + }; - Ok(()) - } - } + let expanded_sources = match &scan_type { + #[cfg(feature = "parquet")] + FileScan::Parquet { cloud_options, .. } => { + expand_scan_paths_with_hive_update(&paths, file_options, cloud_options)? }, - DslScanSource::Buffer(_) => Ok(()), + #[cfg(feature = "ipc")] + FileScan::Ipc { cloud_options, .. } => { + expand_scan_paths_with_hive_update(&paths, file_options, cloud_options)? + }, + #[cfg(feature = "csv")] + FileScan::Csv { cloud_options, .. } => { + expand_paths(&paths, file_options.glob, cloud_options.as_ref())? + }, + #[cfg(feature = "json")] + FileScan::NDJson { cloud_options, .. } => { + expand_paths(&paths, file_options.glob, cloud_options.as_ref())? + }, + FileScan::Anonymous { .. } => unreachable!(), // Invariant: Anonymous scans are already expanded. + }; + + #[allow(unreachable_code)] + { + self.sources = ScanSources::Files(expanded_sources); + self.is_expanded = true; + + Ok(()) } } } diff --git a/crates/polars-plan/src/plans/conversion/mod.rs b/crates/polars-plan/src/plans/conversion/mod.rs index 9851a6d2c3ba..b9ed8711a438 100644 --- a/crates/polars-plan/src/plans/conversion/mod.rs +++ b/crates/polars-plan/src/plans/conversion/mod.rs @@ -12,7 +12,7 @@ mod ir_to_dsl; mod scans; mod stack_opt; -use std::sync::{Arc, RwLock}; +use std::sync::{Arc, Mutex, RwLock}; pub use dsl_to_ir::*; pub use expr_to_ir::*; @@ -58,7 +58,10 @@ impl IR { output_schema: _, file_options: options, } => DslPlan::Scan { - sources: sources.to_dsl(true), + sources: Arc::new(Mutex::new(DslScanSources { + sources, + is_expanded: true, + })), file_info: Arc::new(RwLock::new(Some(file_info))), hive_parts, predicate: predicate.map(|e| e.to_expr(expr_arena)), diff --git a/crates/polars-plan/src/plans/conversion/scans.rs b/crates/polars-plan/src/plans/conversion/scans.rs index 1cc939417d60..e234521c0a51 100644 --- a/crates/polars-plan/src/plans/conversion/scans.rs +++ b/crates/polars-plan/src/plans/conversion/scans.rs @@ -10,13 +10,6 @@ use polars_io::RowIndex; use super::*; -fn get_first_path(paths: &[PathBuf]) -> PolarsResult<&PathBuf> { - // Use first path to get schema. 
- paths - .first() - .ok_or_else(|| polars_err!(ComputeError: "expected at least 1 path")) -} - #[cfg(any(feature = "parquet", feature = "ipc"))] fn prepare_output_schema(mut schema: Schema, row_index: Option<&RowIndex>) -> SchemaRef { if let Some(rc) = row_index { @@ -39,66 +32,57 @@ fn prepare_schemas(mut schema: Schema, row_index: Option<&RowIndex>) -> (SchemaR #[cfg(feature = "parquet")] pub(super) fn parquet_file_info( - source: &ScanSource, + sources: &ScanSources, file_options: &FileScanOptions, #[allow(unused)] cloud_options: Option<&polars_io::cloud::CloudOptions>, ) -> PolarsResult<(FileInfo, Option)> { - let (schema, reader_schema, num_rows, metadata) = match source { - ScanSource::Files(paths) => { - let path = get_first_path(paths)?; - if is_cloud_url(path) { - #[cfg(not(feature = "cloud"))] - panic!("One or more of the cloud storage features ('aws', 'gcp', ...) must be enabled."); + use polars_core::error::feature_gated; + + let first_source = sources + .first() + .ok_or_else(|| polars_err!(ComputeError: "expected at least 1 source"))?; - #[cfg(feature = "cloud")] - { + let (reader_schema, num_rows, metadata) = match first_source { + ScanSourceRef::File(path) => { + if is_cloud_url(path) { + feature_gated!("cloud", { let uri = path.to_string_lossy(); get_runtime().block_on(async { let mut reader = ParquetAsyncReader::from_uri(&uri, cloud_options, None).await?; - let reader_schema = reader.schema().await?; - let num_rows = reader.num_rows().await?; - let metadata = reader.get_metadata().await?.clone(); - - let schema = prepare_output_schema( - Schema::from_arrow_schema(reader_schema.as_ref()), - file_options.row_index.as_ref(), - ); - PolarsResult::Ok((schema, reader_schema, Some(num_rows), Some(metadata))) + + PolarsResult::Ok(( + reader.schema().await?, + Some(reader.num_rows().await?), + Some(reader.get_metadata().await?.clone()), + )) })? - } + }) } else { let file = polars_utils::open_file(path)?; let mut reader = ParquetReader::new(file); - let reader_schema = reader.schema()?; - let schema = prepare_output_schema( - Schema::from_arrow_schema(reader_schema.as_ref()), - file_options.row_index.as_ref(), - ); ( - schema, - reader_schema, + reader.schema()?, Some(reader.num_rows()?), Some(reader.get_metadata()?.clone()), ) } }, - ScanSource::Buffer(buffer) => { + ScanSourceRef::Buffer(buffer) => { let mut reader = ParquetReader::new(std::io::Cursor::new(buffer)); - let reader_schema = reader.schema()?; - let schema = prepare_output_schema( - Schema::from_arrow_schema(reader_schema.as_ref()), - file_options.row_index.as_ref(), - ); ( - schema, - reader_schema, + reader.schema()?, Some(reader.num_rows()?), Some(reader.get_metadata()?.clone()), ) }, }; + let schema = prepare_output_schema( + Schema::from_arrow_schema(reader_schema.as_ref()), + file_options.row_index.as_ref(), + ); + let file_info = FileInfo::new( schema, Some(Either::Left(reader_schema)), @@ -111,31 +95,39 @@ pub(super) fn parquet_file_info( // TODO! return metadata arced #[cfg(feature = "ipc")] pub(super) fn ipc_file_info( - paths: &[PathBuf], + sources: &ScanSources, file_options: &FileScanOptions, cloud_options: Option<&polars_io::cloud::CloudOptions>, ) -> PolarsResult<(FileInfo, arrow::io::ipc::read::FileMetadata)> { - let path = get_first_path(paths)?; - - let metadata = if is_cloud_url(path) { - #[cfg(not(feature = "cloud"))] - panic!("One or more of the cloud storage features ('aws', 'gcp', ...) 
must be enabled."); - - #[cfg(feature = "cloud")] - { - let uri = path.to_string_lossy(); - get_runtime().block_on(async { - polars_io::ipc::IpcReaderAsync::from_uri(&uri, cloud_options) - .await? - .metadata() - .await - })? - } - } else { - arrow::io::ipc::read::read_file_metadata(&mut std::io::BufReader::new( - polars_utils::open_file(path)?, - ))? + use polars_core::error::feature_gated; + + let Some(first) = sources.first() else { + polars_bail!(ComputeError: "expected at least 1 source"); }; + + let metadata = match first { + ScanSourceRef::File(path) => { + if is_cloud_url(path) { + feature_gated!("cloud", { + let uri = path.to_string_lossy(); + get_runtime().block_on(async { + polars_io::ipc::IpcReaderAsync::from_uri(&uri, cloud_options) + .await? + .metadata() + .await + })? + }) + } else { + arrow::io::ipc::read::read_file_metadata(&mut std::io::BufReader::new( + polars_utils::open_file(path)?, + ))? + } + }, + ScanSourceRef::Buffer(buff) => { + arrow::io::ipc::read::read_file_metadata(&mut std::io::Cursor::new(buff))? + }, + }; + let file_info = FileInfo::new( prepare_output_schema( Schema::from_arrow_schema(metadata.schema.as_ref()), @@ -150,7 +142,7 @@ pub(super) fn ipc_file_info( #[cfg(feature = "csv")] pub(super) fn csv_file_info( - source: &ScanSource, + sources: &ScanSources, file_options: &FileScanOptions, csv_options: &mut CsvReadOptions, cloud_options: Option<&polars_io::cloud::CloudOptions>, @@ -168,30 +160,31 @@ pub(super) fn csv_file_info( // * See if we can do this without downloading the entire file // prints the error message if paths is empty. - let run_async = source.is_cloud_url()? || config::force_async(); + let run_async = sources.is_cloud_url() || config::force_async(); - let si_result = match source { - ScanSource::Files(paths) => { - let cache_entries = { - feature_gated!("cloud", { - if run_async { - Some(polars_io::file_cache::init_entries_from_uri_list( - source - .as_paths() - .iter() - .flat_map(|p| p.iter()) - .map(|path| Arc::from(path.to_str().unwrap())) - .collect::>() - .as_slice(), - cloud_options, - )?) - } else { - None - } - }) - }; + let cache_entries = { + feature_gated!("cloud", { + if run_async { + Some(polars_io::file_cache::init_entries_from_uri_list( + sources + .as_paths() + .iter() + .map(|path| Arc::from(path.to_str().unwrap())) + .collect::>() + .as_slice(), + cloud_options, + )?) + } else { + None + } + }) + }; - let infer_schema_func = |i| { + let infer_schema_func = |i| { + let source = sources.at(i); + let owned = &mut vec![]; + match source { + ScanSourceRef::File(path) => { let file = if run_async { feature_gated!("cloud", { let entry: &Arc = @@ -199,92 +192,77 @@ pub(super) fn csv_file_info( entry.try_open_check_latest()? }) } else { - let p: &PathBuf = &paths[i]; - polars_utils::open_file(p.as_ref())? + polars_utils::open_file(path)? }; let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; - let owned = &mut vec![]; + let mut reader = + std::io::Cursor::new(maybe_decompress_bytes(mmap.as_ref(), owned)?); - let mut curs = std::io::Cursor::new(maybe_decompress_bytes(mmap.as_ref(), owned)?); - - if curs.read(&mut [0; 4])? < 2 && csv_options.raise_if_empty { + if reader.read(&mut [0; 4])? < 2 && csv_options.raise_if_empty { polars_bail!(NoData: "empty CSV") } - curs.rewind()?; + reader.rewind()?; - let reader_bytes = get_reader_bytes(&mut curs).expect("could not mmap file"); + let reader_bytes = get_reader_bytes(&mut reader).expect("could not mmap file"); // this needs a way to estimated bytes/rows. 
- let si_result = SchemaInferenceResult::try_from_reader_bytes_and_options( - &reader_bytes, - csv_options, - )?; - - Ok(si_result) - }; - - let merge_func = |a: PolarsResult, - b: PolarsResult| { - match (a, b) { - (Err(e), _) | (_, Err(e)) => Err(e), - (Ok(a), Ok(b)) => { - let merged_schema = if csv_options.schema.is_some() { - csv_options.schema.clone().unwrap() - } else { - let schema_a = a.get_inferred_schema(); - let schema_b = b.get_inferred_schema(); - - match (schema_a.is_empty(), schema_b.is_empty()) { - (true, _) => schema_b, - (_, true) => schema_a, - _ => { - let mut s = Arc::unwrap_or_clone(schema_a); - s.to_supertype(&schema_b)?; - Arc::new(s) - }, - } - }; - - Ok(a.with_inferred_schema(merged_schema)) - }, + SchemaInferenceResult::try_from_reader_bytes_and_options(&reader_bytes, csv_options) + }, + ScanSourceRef::Buffer(buffer) => { + let mut reader = std::io::Cursor::new(maybe_decompress_bytes(buffer, owned)?); + + if reader.read(&mut [0; 4])? < 2 && csv_options.raise_if_empty { + polars_bail!(NoData: "empty CSV") } - }; - - let si_results = POOL.join( - || infer_schema_func(0), - || { - (1..paths.len()) - .into_par_iter() - .map(infer_schema_func) - .reduce(|| Ok(Default::default()), merge_func) - }, - ); - - merge_func(si_results.0, si_results.1)? - }, - ScanSource::Buffer(buffer) => { - polars_ensure!(!run_async, nyi = "BytesIO scan with async"); + reader.rewind()?; - let owned = &mut vec![]; - let mut reader = std::io::Cursor::new(maybe_decompress_bytes(buffer, owned)?); + let reader_bytes = get_reader_bytes(&mut reader).expect("could not open file"); - if reader.read(&mut [0; 4])? < 2 && csv_options.raise_if_empty { - polars_bail!(NoData: "empty CSV") - } - reader.rewind()?; + // this needs a way to estimated bytes/rows. + SchemaInferenceResult::try_from_reader_bytes_and_options(&reader_bytes, csv_options) + }, + } + }; - let reader_bytes = get_reader_bytes(&mut reader).expect("could not open file"); + let merge_func = |a: PolarsResult, + b: PolarsResult| { + match (a, b) { + (Err(e), _) | (_, Err(e)) => Err(e), + (Ok(a), Ok(b)) => { + let merged_schema = if csv_options.schema.is_some() { + csv_options.schema.clone().unwrap() + } else { + let schema_a = a.get_inferred_schema(); + let schema_b = b.get_inferred_schema(); + + match (schema_a.is_empty(), schema_b.is_empty()) { + (true, _) => schema_b, + (_, true) => schema_a, + _ => { + let mut s = Arc::unwrap_or_clone(schema_a); + s.to_supertype(&schema_b)?; + Arc::new(s) + }, + } + }; - // this needs a way to estimated bytes/rows. 
- let si_result = SchemaInferenceResult::try_from_reader_bytes_and_options( - &reader_bytes, - csv_options, - )?; + Ok(a.with_inferred_schema(merged_schema)) + }, + } + }; - si_result + let si_results = POOL.join( + || infer_schema_func(0), + || { + (1..sources.len()) + .into_par_iter() + .map(infer_schema_func) + .reduce(|| Ok(Default::default()), merge_func) }, - }; + ); + + let si_result = merge_func(si_results.0, si_results.1)?; csv_options.update_with_inference_result(&si_result); @@ -314,58 +292,39 @@ pub(super) fn csv_file_info( #[cfg(feature = "json")] pub(super) fn ndjson_file_info( - paths: &[PathBuf], + sources: &ScanSources, file_options: &FileScanOptions, ndjson_options: &mut NDJsonReadOptions, cloud_options: Option<&polars_io::cloud::CloudOptions>, ) -> PolarsResult { use polars_core::config; + use polars_core::error::feature_gated; + + let Some(first) = sources.first() else { + polars_bail!(ComputeError: "expected at least 1 source"); + }; - let run_async = !paths.is_empty() && is_cloud_url(&paths[0]) || config::force_async(); + let run_async = sources.is_cloud_url() || config::force_async(); let cache_entries = { - #[cfg(feature = "cloud")] - { - if run_async { + if run_async { + feature_gated!("cloud", { Some(polars_io::file_cache::init_entries_from_uri_list( - paths + sources + .as_paths() .iter() .map(|path| Arc::from(path.to_str().unwrap())) .collect::>() .as_slice(), cloud_options, )?) - } else { - None - } - } - #[cfg(not(feature = "cloud"))] - { - if run_async { - panic!("required feature `cloud` is not enabled") - } - } - }; - - let first_path = get_first_path(paths)?; - - let f = if run_async { - #[cfg(feature = "cloud")] - { - cache_entries.unwrap()[0].try_open_check_latest()? - } - #[cfg(not(feature = "cloud"))] - { - panic!("required feature `cloud` is not enabled") + }) + } else { + None } - } else { - polars_utils::open_file(first_path)? }; let owned = &mut vec![]; - let mmap = unsafe { memmap::Mmap::map(&f).unwrap() }; - - let mut reader = std::io::BufReader::new(maybe_decompress_bytes(mmap.as_ref(), owned)?); let (mut reader_schema, schema) = if let Some(schema) = ndjson_options.schema.take() { if file_options.row_index.is_none() { @@ -377,8 +336,28 @@ pub(super) fn ndjson_file_info( ) } } else { - let schema = - polars_io::ndjson::infer_schema(&mut reader, ndjson_options.infer_schema_length)?; + let schema = match first { + ScanSourceRef::File(path) => { + let f = if run_async { + feature_gated!("cloud", { + cache_entries.unwrap()[0].try_open_check_latest()? + }) + } else { + polars_utils::open_file(path)? + }; + + let mmap = unsafe { memmap::Mmap::map(&f).unwrap() }; + let mut reader = + std::io::Cursor::new(maybe_decompress_bytes(mmap.as_ref(), owned)?); + + polars_io::ndjson::infer_schema(&mut reader, ndjson_options.infer_schema_length)? + }, + ScanSourceRef::Buffer(buff) => { + let mut reader = std::io::Cursor::new(maybe_decompress_bytes(buff, owned)?); + polars_io::ndjson::infer_schema(&mut reader, ndjson_options.infer_schema_length)? 
+ }, + }; + prepare_schemas(schema, file_options.row_index.as_ref()) }; diff --git a/crates/polars-plan/src/plans/functions/count.rs b/crates/polars-plan/src/plans/functions/count.rs index f3120bad8dff..f8d344217e70 100644 --- a/crates/polars-plan/src/plans/functions/count.rs +++ b/crates/polars-plan/src/plans/functions/count.rs @@ -1,5 +1,12 @@ #[cfg(feature = "ipc")] use arrow::io::ipc::read::get_row_count as count_rows_ipc_sync; +#[cfg(any( + feature = "parquet", + feature = "ipc", + feature = "json", + feature = "csv" +))] +use polars_core::error::feature_gated; #[cfg(any(feature = "parquet", feature = "json"))] use polars_io::cloud::CloudOptions; #[cfg(feature = "csv")] @@ -18,7 +25,7 @@ use polars_io::SerReader; use super::*; #[allow(unused_variables)] -pub fn count_rows(sources: &Arc<[ScanSource]>, scan_type: &FileScan) -> PolarsResult { +pub fn count_rows(sources: &ScanSources, scan_type: &FileScan) -> PolarsResult { #[cfg(not(any( feature = "parquet", feature = "ipc", @@ -79,7 +86,7 @@ pub fn count_rows(sources: &Arc<[ScanSource]>, scan_type: &FileScan) -> PolarsRe #[cfg(feature = "csv")] fn count_all_rows_csv( - sources: &Arc<[ScanSource]>, + sources: &ScanSources, options: &polars_io::prelude::CsvReadOptions, ) -> PolarsResult { let parse_options = options.get_parse_options(); @@ -87,20 +94,15 @@ fn count_all_rows_csv( sources .iter() .map(|source| match source { - ScanSource::Files(paths) => paths - .iter() - .map(|path| { - count_rows_csv( - path, - parse_options.separator, - parse_options.quote_char, - parse_options.comment_prefix.as_ref(), - parse_options.eol_char, - options.has_header, - ) - }) - .sum::>(), - ScanSource::Buffer(buf) => count_rows_csv_from_slice( + ScanSourceRef::File(path) => count_rows_csv( + path, + parse_options.separator, + parse_options.quote_char, + parse_options.comment_prefix.as_ref(), + parse_options.eol_char, + options.has_header, + ), + ScanSourceRef::Buffer(buf) => count_rows_csv_from_slice( &buf[..], parse_options.separator, parse_options.quote_char, @@ -114,31 +116,26 @@ fn count_all_rows_csv( #[cfg(feature = "parquet")] pub(super) fn count_rows_parquet( - sources: &Arc<[ScanSource]>, + sources: &ScanSources, #[allow(unused)] cloud_options: Option<&CloudOptions>, ) -> PolarsResult { if sources.is_empty() { return Ok(0); }; - let is_cloud = sources.first().unwrap().is_cloud_url()?; + let is_cloud = sources.is_cloud_url(); if is_cloud { - #[cfg(not(feature = "cloud"))] - panic!("One or more of the cloud storage features ('aws', 'gcp', ...) 
must be enabled."); - - #[cfg(feature = "cloud")] - { - get_runtime().block_on(count_rows_cloud_parquet(sources, cloud_options)) - } + feature_gated!("cloud", { + get_runtime().block_on(count_rows_cloud_parquet(sources.as_paths(), cloud_options)) + }) } else { sources .iter() .map(|source| match source { - ScanSource::Files(paths) => paths - .iter() - .map(|path| ParquetReader::new(polars_utils::open_file(path)?).num_rows()) - .sum::>(), - ScanSource::Buffer(buffer) => { + ScanSourceRef::File(path) => { + ParquetReader::new(polars_utils::open_file(path)?).num_rows() + }, + ScanSourceRef::Buffer(buffer) => { ParquetReader::new(std::io::Cursor::new(buffer)).num_rows() }, }) @@ -148,17 +145,14 @@ pub(super) fn count_rows_parquet( #[cfg(all(feature = "parquet", feature = "async"))] async fn count_rows_cloud_parquet( - sources: &Arc<[ScanSource]>, + paths: &[std::path::PathBuf], cloud_options: Option<&CloudOptions>, ) -> PolarsResult { - let collection = sources.iter().flat_map(|source| { - source.as_paths().iter().map(|path| { - with_concurrency_budget(1, || async { - let mut reader = - ParquetAsyncReader::from_uri(&path.to_string_lossy(), cloud_options, None) - .await?; - reader.num_rows().await - }) + let collection = paths.iter().map(|path| { + with_concurrency_budget(1, || async { + let mut reader = + ParquetAsyncReader::from_uri(&path.to_string_lossy(), cloud_options, None).await?; + reader.num_rows().await }) }); futures::future::try_join_all(collection) @@ -168,34 +162,31 @@ async fn count_rows_cloud_parquet( #[cfg(feature = "ipc")] pub(super) fn count_rows_ipc( - sources: &Arc<[ScanSource]>, + sources: &ScanSources, #[cfg(feature = "cloud")] cloud_options: Option<&CloudOptions>, metadata: Option<&arrow::io::ipc::read::FileMetadata>, ) -> PolarsResult { if sources.is_empty() { return Ok(0); }; - let is_cloud = sources.first().unwrap().is_cloud_url()?; + let is_cloud = sources.is_cloud_url(); if is_cloud { - #[cfg(not(feature = "cloud"))] - panic!("One or more of the cloud storage features ('aws', 'gcp', ...) 
must be enabled."); - - #[cfg(feature = "cloud")] - { - get_runtime().block_on(count_rows_cloud_ipc(sources, cloud_options, metadata)) - } + feature_gated!("cloud", { + get_runtime().block_on(count_rows_cloud_ipc( + sources.as_paths(), + cloud_options, + metadata, + )) + }) } else { sources .iter() .map(|source| match source { - ScanSource::Files(paths) => paths - .iter() - .map(|path| { - count_rows_ipc_sync(&mut polars_utils::open_file(path)?).map(|v| v as usize) - }) - .sum::>(), - ScanSource::Buffer(buffer) => { + ScanSourceRef::File(path) => { + count_rows_ipc_sync(&mut polars_utils::open_file(path)?).map(|v| v as usize) + }, + ScanSourceRef::Buffer(buffer) => { count_rows_ipc_sync(&mut std::io::Cursor::new(buffer)).map(|v| v as usize) }, }) @@ -205,19 +196,16 @@ pub(super) fn count_rows_ipc( #[cfg(all(feature = "ipc", feature = "async"))] async fn count_rows_cloud_ipc( - sources: &Arc<[ScanSource]>, + paths: &[std::path::PathBuf], cloud_options: Option<&CloudOptions>, metadata: Option<&arrow::io::ipc::read::FileMetadata>, ) -> PolarsResult { use polars_io::ipc::IpcReaderAsync; - let collection = sources.iter().flat_map(|source| { - source.as_paths().iter().map(|path| { - with_concurrency_budget(1, || async { - let reader = - IpcReaderAsync::from_uri(&path.to_string_lossy(), cloud_options).await?; - reader.count_rows(metadata).await - }) + let collection = paths.iter().map(|path| { + with_concurrency_budget(1, || async { + let reader = IpcReaderAsync::from_uri(&path.to_string_lossy(), cloud_options).await?; + reader.count_rows(metadata).await }) }); futures::future::try_join_all(collection) @@ -227,23 +215,26 @@ async fn count_rows_cloud_ipc( #[cfg(feature = "json")] pub(super) fn count_rows_ndjson( - sources: &Arc<[ScanSource]>, + sources: &ScanSources, cloud_options: Option<&CloudOptions>, ) -> PolarsResult { use polars_core::config; - use polars_core::error::feature_gated; use polars_io::utils::maybe_decompress_bytes; - let run_async = - !sources.is_empty() && sources.first().unwrap().is_cloud_url()? || config::force_async(); + if sources.is_empty() { + return Ok(0); + } + + let is_cloud_url = sources.is_cloud_url(); + let run_async = is_cloud_url || config::force_async(); let cache_entries = { feature_gated!("cloud", { if run_async { Some(polars_io::file_cache::init_entries_from_uri_list( sources + .as_paths() .iter() - .flat_map(|source| source.as_paths()) .map(|path| Arc::from(path.to_str().unwrap())) .collect::>() .as_slice(), @@ -258,29 +249,26 @@ pub(super) fn count_rows_ndjson( sources .iter() .map(|source| match source { - ScanSource::Files(paths) => paths - .iter() - .map(|path| { - let f = if run_async { - feature_gated!("cloud", { - let entry: &Arc = - &cache_entries.as_ref().unwrap()[0]; - entry.try_open_check_latest()? - }) - } else { - polars_utils::open_file(path)? - }; + ScanSourceRef::File(path) => { + let f = if run_async { + feature_gated!("cloud", { + let entry: &Arc = + &cache_entries.as_ref().unwrap()[0]; + entry.try_open_check_latest()? + }) + } else { + polars_utils::open_file(path)? 
+ }; - let mmap = unsafe { memmap::Mmap::map(&f).unwrap() }; - let owned = &mut vec![]; + let mmap = unsafe { memmap::Mmap::map(&f).unwrap() }; + let owned = &mut vec![]; - let reader = polars_io::ndjson::core::JsonLineReader::new( - std::io::Cursor::new(maybe_decompress_bytes(mmap.as_ref(), owned)?), - ); - reader.count() - }) - .sum::>(), - ScanSource::Buffer(buffer) => { + let reader = polars_io::ndjson::core::JsonLineReader::new(std::io::Cursor::new( + maybe_decompress_bytes(mmap.as_ref(), owned)?, + )); + reader.count() + }, + ScanSourceRef::Buffer(buffer) => { polars_ensure!(!run_async, nyi = "BytesIO with force_async"); let owned = &mut vec![]; diff --git a/crates/polars-plan/src/plans/functions/mod.rs b/crates/polars-plan/src/plans/functions/mod.rs index 468a85273ea4..e453acae6855 100644 --- a/crates/polars-plan/src/plans/functions/mod.rs +++ b/crates/polars-plan/src/plans/functions/mod.rs @@ -13,6 +13,7 @@ use std::hash::{Hash, Hasher}; use std::sync::{Arc, Mutex}; pub use dsl::*; +use polars_core::error::feature_gated; use polars_core::prelude::*; use polars_utils::pl_str::PlSmallStr; #[cfg(feature = "serde")] @@ -44,7 +45,7 @@ pub enum FunctionIR { fmt_str: PlSmallStr, }, FastCount { - sources: Arc<[ScanSource]>, + sources: ScanSources, scan_type: FileScan, alias: Option, }, @@ -274,14 +275,7 @@ impl FunctionIR { #[cfg(feature = "merge_sorted")] MergeSorted { column } => merge_sorted(&df, column.as_ref()), Unnest { columns: _columns } => { - #[cfg(feature = "dtype-struct")] - { - df.unnest(_columns.iter().cloned()) - } - #[cfg(not(feature = "dtype-struct"))] - { - panic!("activate feature 'dtype-struct'") - } + feature_gated!("dtype-struct", df.unnest(_columns.iter().cloned())) }, Pipeline { function, .. } => { // we use a global string cache here as streaming chunks all have different rev maps diff --git a/crates/polars-plan/src/plans/ir/dot.rs b/crates/polars-plan/src/plans/ir/dot.rs index c3b8f2e94874..3ece8966a857 100644 --- a/crates/polars-plan/src/plans/ir/dot.rs +++ b/crates/polars-plan/src/plans/ir/dot.rs @@ -255,9 +255,8 @@ impl<'a> IRDotDisplay<'a> { file_options: options, output_schema: _, } => { - let paths = sources.as_paths(); let name: &str = scan_type.into(); - let path = PathsDisplay(paths.as_ref()); + let path = ScanSourcesDisplay(sources); let with_columns = options.with_columns.as_ref().map(|cols| cols.as_ref()); let with_columns = NumColumns(with_columns); let total_columns = @@ -344,10 +343,36 @@ impl<'a> IRDotDisplay<'a> { // A few utility structures for formatting pub struct PathsDisplay<'a>(pub &'a [PathBuf]); +pub struct ScanSourcesDisplay<'a>(pub &'a ScanSources); struct NumColumns<'a>(Option<&'a [PlSmallStr]>); struct NumColumnsSchema<'a>(Option<&'a Schema>); struct OptionExprIRDisplay<'a>(Option>); +impl fmt::Display for ScanSourceRef<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ScanSourceRef::File(path) => path.display().fmt(f), + ScanSourceRef::Buffer(buff) => write!(f, "{} in-mem bytes", buff.len()), + } + } +} + +impl fmt::Display for ScanSourcesDisplay<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.0.len() { + 0 => write!(f, "[]"), + 1 => write!(f, "[{}]", self.0.at(0)), + 2 => write!(f, "[{}, {}]", self.0.at(0), self.0.at(1)), + _ => write!( + f, + "[{}, ... 
{} other sources]", + self.0.at(0), + self.0.len() - 1, + ), + } + } +} + impl fmt::Display for PathsDisplay<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self.0.len() { @@ -357,7 +382,7 @@ impl fmt::Display for PathsDisplay<'_> { _ => write!( f, "[{}, ... {} other files]", - self.0[0].to_string_lossy(), + self.0[0].display(), self.0.len() - 1, ), } diff --git a/crates/polars-plan/src/plans/ir/format.rs b/crates/polars-plan/src/plans/ir/format.rs index cc64daf67a30..a69eb5203359 100644 --- a/crates/polars-plan/src/plans/ir/format.rs +++ b/crates/polars-plan/src/plans/ir/format.rs @@ -7,7 +7,7 @@ use polars_core::schema::Schema; use polars_io::RowIndex; use recursive::recursive; -use super::ir::dot::PathsDisplay; +use self::ir::dot::ScanSourcesDisplay; use crate::prelude::*; pub struct IRDisplay<'a> { @@ -55,7 +55,7 @@ impl AsExpr for ExprIR { fn write_scan( f: &mut Formatter, name: &str, - source: &ScanSource, + sources: &ScanSources, indent: usize, n_columns: i64, total_columns: usize, @@ -63,12 +63,12 @@ fn write_scan( slice: Option<(i64, usize)>, row_index: Option<&RowIndex>, ) -> fmt::Result { - write!(f, "{:indent$}{name} SCAN ", "")?; - - match source { - ScanSource::Files(paths) => write!(f, "{}", PathsDisplay(paths.as_ref()))?, - ScanSource::Buffer(_) => write!(f, "IN MEMORY BUFFER")?, - } + write!( + f, + "{:indent$}{name} SCAN {}", + "", + ScanSourcesDisplay(sources) + )?; let total_columns = total_columns - usize::from(row_index.is_some()); if n_columns > 0 { @@ -175,7 +175,7 @@ impl<'a> IRDisplay<'a> { write_scan( f, "PYTHON", - &ScanSource::default(), + &ScanSources::default(), indent, n_columns, total_columns, diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index ff4e46e64dd8..db73f7a13528 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -7,7 +7,6 @@ pub(crate) mod tree_format; use std::borrow::Cow; use std::fmt; use std::path::{Path, PathBuf}; -use std::sync::Mutex; pub use dot::{EscapeLabel, IRDotDisplay, PathsDisplay}; pub use format::{ExprIRDisplay, IRDisplay}; @@ -15,7 +14,7 @@ use hive::HivePartitions; use polars_core::prelude::*; use polars_core::POOL; use polars_utils::idx_vec::UnitVec; -use polars_utils::{format_pl_smallstr, unitvec}; +use polars_utils::unitvec; #[cfg(feature = "ir_serde")] use serde::{Deserialize, Serialize}; @@ -35,18 +34,17 @@ pub struct IRPlanRef<'a> { pub expr_arena: &'a Arena, } -#[cfg_attr(feature = "ir_serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[derive(Debug, Clone, Hash, PartialEq, Eq)] -pub enum ScanSource { +pub enum ScanSources { Files(Arc<[PathBuf]>), - #[cfg_attr(feature = "ir_serde", serde(skip))] - Buffer(Arc<[u8]>), + Buffers(Arc<[Arc<[u8]>]>), } -impl Default for ScanSource { - fn default() -> Self { - Self::Files(Arc::default()) - } +#[derive(Debug, Clone, Copy)] +pub enum ScanSourceRef<'a> { + File(&'a Path), + Buffer(&'a [u8]), } pub struct ScanSourceSliceInfo { @@ -54,62 +52,88 @@ pub struct ScanSourceSliceInfo { pub source_slice: std::ops::Range, } -impl ScanSource { +impl Default for ScanSources { + fn default() -> Self { + Self::Buffers(Arc::default()) + } +} + +impl<'a> ScanSourceRef<'a> { + pub fn to_file_path(&self) -> &str { + match self { + ScanSourceRef::File(path) => path.to_str().unwrap(), + ScanSourceRef::Buffer(_) => "in-mem", + } + } +} + +impl ScanSources { + pub fn iter(&self) -> ScanSourceIter { + ScanSourceIter { + 
sources: self, + offset: 0, + } + } pub fn as_paths(&self) -> &[PathBuf] { match self { - ScanSource::Files(paths) => paths, - ScanSource::Buffer(_) => unimplemented!(), + Self::Files(paths) => &paths, + Self::Buffers(_) => unimplemented!(), } } - pub fn try_into_paths(&self) -> PolarsResult> { + pub fn try_into_paths(&self) -> Option> { match self { - ScanSource::Files(paths) => Ok(paths.clone()), - ScanSource::Buffer(_) => Err(polars_err!( - nyi = "Unable to convert BytesIO scan into path" - )), + Self::Files(paths) => Some(paths.clone()), + Self::Buffers(_) => None, } } pub fn into_paths(&self) -> Arc<[PathBuf]> { match self { - ScanSource::Files(paths) => paths.clone(), - ScanSource::Buffer(_) => unimplemented!(), + Self::Files(paths) => paths.clone(), + Self::Buffers(_) => unimplemented!(), } } - pub fn to_dsl(self, is_expanded: bool) -> DslScanSource { - match self { - ScanSource::Files(paths) => { - DslScanSource::File(Arc::new(Mutex::new(ScanFileSource { paths, is_expanded }))) - }, - ScanSource::Buffer(buffer) => DslScanSource::Buffer(buffer), + pub fn to_dsl(self, is_expanded: bool) -> DslScanSources { + DslScanSources { + sources: self, + is_expanded, } } - pub fn num_sources(&self) -> usize { + pub fn is_cloud_url(&self) -> bool { match self { - ScanSource::Files(paths) => paths.len(), - ScanSource::Buffer(_) => 1, + Self::Files(paths) => paths.first().map_or(false, |p| polars_io::is_cloud_url(p)), + Self::Buffers(_) => false, } } - pub fn is_cloud_url(&self) -> PolarsResult { + pub fn len(&self) -> usize { match self { - ScanSource::Files(paths) => { - Ok(polars_io::is_cloud_url(paths.first().ok_or_else( - || polars_err!(ComputeError: "expected at least 1 path"), - )?)) - }, - ScanSource::Buffer(_) => Ok(false), + Self::Files(s) => s.len(), + Self::Buffers(s) => s.len(), } } + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + pub fn first(&self) -> Option { + self.get(0) + } + pub fn id(&self) -> PlSmallStr { + if self.is_empty() { + return PlSmallStr::from_static("EMPTY"); + } + match self { - ScanSource::Files(paths) if paths.is_empty() => PlSmallStr::from_static("EMPTY"), - ScanSource::Files(paths) => PlSmallStr::from_str(paths[0].to_string_lossy().as_ref()), - ScanSource::Buffer(_) => PlSmallStr::from_static("IN_MEMORY"), + Self::Files(paths) => { + PlSmallStr::from_str(paths.first().unwrap().to_string_lossy().as_ref()) + }, + Self::Buffers(_) => PlSmallStr::from_static("IN_MEMORY"), } } @@ -118,8 +142,7 @@ impl ScanSource { pub fn collect_slice_information( &self, slice: (i64, usize), - path_to_num_rows: impl Fn(&Path) -> PolarsResult + Send + Sync, - buffer_to_num_rows: impl Fn(&[u8]) -> PolarsResult + Send + Sync, + map_to_num_rows: impl Fn(ScanSourceRef) -> PolarsResult + Send + Sync, ) -> PolarsResult { fn slice_to_start_end( offset: i64, @@ -148,80 +171,114 @@ impl ScanSource { let (offset, length) = slice; - Ok(match self { - ScanSource::Files(paths) if paths.len() == 1 => { - let num_rows = path_to_num_rows(&paths[0])?; - ScanSourceSliceInfo { - item_slice: slice_to_start_end(offset, length, num_rows), - source_slice: 0..1, + if self.is_empty() { + return Ok(ScanSourceSliceInfo { + item_slice: 0..0, + source_slice: 0..0, + }); + } + + if self.len() == 1 { + let num_rows = map_to_num_rows(self.get(0).unwrap())?; + let item_slice = slice_to_start_end(offset, length, num_rows); + let source_slice = if item_slice.is_empty() { 0..0 } else { 0..1 }; + + Ok(ScanSourceSliceInfo { + item_slice, + source_slice, + }) + } else { + use rayon::prelude::*; + + // Walk 
the files in reverse until we find the first file, and then translate the + // slice into a positive-offset equivalent. + const CHUNK_SIZE: usize = 8; + let mut row_counts = Vec::with_capacity(self.len()); + + POOL.install(|| { + for idx_end in (0..self.len()).step_by(CHUNK_SIZE) { + let idx_start = idx_end.saturating_sub(CHUNK_SIZE); + + row_counts.extend( + (idx_start..=idx_end) + .into_par_iter() + .map(|i| map_to_num_rows(self.at(i))) + .collect::>>()? + .into_iter() + .rev(), + ); } - }, - ScanSource::Files(paths) => { - use rayon::prelude::*; - assert_ne!(paths.len(), 0); + PolarsResult::Ok(()) + })?; - // Walk the files in reverse until we find the first file, and then translate the - // slice into a positive-offset equivalent. - const CHUNK_SIZE: usize = 8; - let mut row_counts = Vec::with_capacity(paths.len()); + let num_rows = row_counts.iter().sum::(); - POOL.install(|| { - for idx_end in (0..paths.len()).step_by(CHUNK_SIZE) { - let idx_start = idx_end.saturating_sub(CHUNK_SIZE); + let item_slice = slice_to_start_end(offset, length, num_rows); - row_counts.extend( - (idx_start..=idx_end) - .into_par_iter() - .map(|i| path_to_num_rows(&paths[i])) - .collect::>>()? - .into_iter() - .rev(), - ); - } + let mut source_start = self.len() - 1; + let mut source_end = 0; - PolarsResult::Ok(()) - })?; + let mut sum = 0; + for (i, row_count) in row_counts.iter().rev().enumerate() { + if sum < item_slice.end { + source_end = usize::max(source_end, i); + } - let num_rows = row_counts.iter().sum::(); + sum += row_count; - let item_slice = slice_to_start_end(offset, length, num_rows); + if sum >= item_slice.start { + source_start = usize::min(source_start, i); + } + } - let mut source_start = paths.len() - 1; - let mut source_end = 0; + let source_slice = source_start..source_end + 1; - let mut sum = 0; - for (i, row_count) in row_counts.iter().rev().enumerate() { - if sum < item_slice.end { - source_end = usize::max(source_end, i); - } + Ok(ScanSourceSliceInfo { + item_slice, + source_slice, + }) + } + } - sum += row_count; + pub fn get(&self, idx: usize) -> Option { + match self { + ScanSources::Files(paths) => paths.get(idx).map(|p| ScanSourceRef::File(p)), + ScanSources::Buffers(buffers) => buffers.get(idx).map(|b| ScanSourceRef::Buffer(b)), + } + } - if sum >= item_slice.start { - source_start = usize::min(source_start, i); - } - } + pub fn at(&self, idx: usize) -> ScanSourceRef { + self.get(idx).unwrap() + } +} - let source_slice = source_start..source_end + 1; +pub struct ScanSourceIter<'a> { + sources: &'a ScanSources, + offset: usize, +} - ScanSourceSliceInfo { - item_slice, - source_slice, - } - }, - ScanSource::Buffer(buffer) => { - let num_rows = buffer_to_num_rows(buffer)?; +impl<'a> Iterator for ScanSourceIter<'a> { + type Item = ScanSourceRef<'a>; - ScanSourceSliceInfo { - item_slice: slice_to_start_end(offset, length, num_rows), - source_slice: 0..1, - } - }, - }) + fn next(&mut self) -> Option { + let item = match self.sources { + ScanSources::Files(paths) => ScanSourceRef::File(paths.get(self.offset)?), + ScanSources::Buffers(buffers) => ScanSourceRef::Buffer(buffers.get(self.offset)?), + }; + + self.offset += 1; + Some(item) + } + + fn size_hint(&self) -> (usize, Option) { + let len = self.sources.len() - self.offset; + (len, Some(len)) } } +impl<'a> ExactSizeIterator for ScanSourceIter<'a> {} + /// [`IR`] is a representation of [`DslPlan`] with [`Node`]s which are allocated in an [`Arena`] /// In this IR the logical plan has access to the full dataset. 
#[derive(Clone, Debug, Default)] @@ -241,7 +298,7 @@ pub enum IR { predicate: ExprIR, }, Scan { - sources: ScanSource, + sources: ScanSources, file_info: FileInfo, hive_parts: Option>>, predicate: Option, diff --git a/crates/polars-plan/src/plans/mod.rs b/crates/polars-plan/src/plans/mod.rs index 9e2b4d56d6a4..92eeb783bf76 100644 --- a/crates/polars-plan/src/plans/mod.rs +++ b/crates/polars-plan/src/plans/mod.rs @@ -1,6 +1,5 @@ use std::fmt; use std::fmt::Debug; -use std::path::PathBuf; use std::sync::{Arc, Mutex, RwLock}; use hive::HivePartitions; @@ -61,19 +60,11 @@ pub enum Context { #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[derive(Clone)] -pub struct ScanFileSource { - pub paths: Arc<[PathBuf]>, +pub struct DslScanSources { + pub sources: ScanSources, pub is_expanded: bool, } -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -#[derive(Clone)] -pub enum DslScanSource { - File(Arc>), - // @Q? Can we serde skip this? - Buffer(Arc<[u8]>), -} - // https://stackoverflow.com/questions/1031076/what-are-projection-and-selection #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum DslPlan { @@ -91,7 +82,7 @@ pub enum DslPlan { cache_hits: u32, }, Scan { - sources: DslScanSource, + sources: Arc>, // Option as this is mostly materialized on the IR phase. // During conversion we update the value in the DSL as well // This is to cater to use cases where parts of a `LazyFrame` diff --git a/crates/polars-plan/src/plans/optimizer/count_star.rs b/crates/polars-plan/src/plans/optimizer/count_star.rs index d88956d2903f..c8570b0f908f 100644 --- a/crates/polars-plan/src/plans/optimizer/count_star.rs +++ b/crates/polars-plan/src/plans/optimizer/count_star.rs @@ -1,3 +1,5 @@ +use std::path::PathBuf; + use super::*; pub(super) struct CountStar; @@ -47,7 +49,7 @@ struct CountStarExpr { // Top node of the projection to replace node: Node, // Paths to the input files - sources: Arc<[ScanSource]>, + sources: ScanSources, // File Type scan_type: FileScan, // Column Alias @@ -64,12 +66,37 @@ fn visit_logical_plan_for_scan_paths( ) -> Option { match lp_arena.get(node) { IR::Union { inputs, .. 
} => { + enum MutableSources { + Files(Vec), + Buffers(Vec>), + } + let mut scan_type: Option = None; - let mut sources = Vec::with_capacity(inputs.len()); + let mut sources = None; for input in inputs { match visit_logical_plan_for_scan_paths(*input, lp_arena, expr_arena, true) { Some(expr) => { - sources.extend(expr.sources.iter().cloned()); + match expr.sources { + ScanSources::Files(paths) => match sources { + Some(MutableSources::Files(ref mut files)) => { + files.extend_from_slice(&paths[..]) + }, + Some(MutableSources::Buffers(_)) => { + todo!("Mixing in memory buffers and paths in count star opt") + }, + None => sources = Some(MutableSources::Files(paths.to_vec())), + }, + ScanSources::Buffers(bs) => match sources { + Some(MutableSources::Files(_)) => { + todo!("Mixing in memory buffers and paths in count star opt") + }, + Some(MutableSources::Buffers(ref mut buffers)) => { + buffers.extend_from_slice(&bs[..]) + }, + None => sources = Some(MutableSources::Buffers(bs.to_vec())), + }, + } + match &scan_type { None => scan_type = Some(expr.scan_type), Some(scan_type) => { @@ -86,7 +113,11 @@ fn visit_logical_plan_for_scan_paths( } } Some(CountStarExpr { - sources: sources.into(), + sources: match sources { + Some(MutableSources::Files(files)) => ScanSources::Files(files.into()), + Some(MutableSources::Buffers(buffers)) => ScanSources::Buffers(buffers.into()), + None => ScanSources::default(), + }, scan_type: scan_type.unwrap(), node, alias: None, @@ -95,7 +126,7 @@ fn visit_logical_plan_for_scan_paths( IR::Scan { scan_type, sources, .. } if !matches!(scan_type, FileScan::Anonymous { .. }) => Some(CountStarExpr { - sources: [sources.clone()].into(), + sources: sources.clone(), scan_type: scan_type.clone(), node, alias: None, diff --git a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs index 3b9e6c8d8ef9..d5aefb2a16d7 100644 --- a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs +++ b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs @@ -401,7 +401,7 @@ impl<'a> PredicatePushDown<'a> { filter: None, }); } else { - sources = ScanSource::Files(new_paths.into()); + sources = ScanSources::Files(new_paths.into()); scan_hive_parts = Some(Arc::from(new_hive_parts)); } } diff --git a/crates/polars-python/src/lazyframe/general.rs b/crates/polars-python/src/lazyframe/general.rs index 2e2ce702f5bd..40c69b260c40 100644 --- a/crates/polars-python/src/lazyframe/general.rs +++ b/crates/polars-python/src/lazyframe/general.rs @@ -31,7 +31,7 @@ impl PyLazyFrame { row_index, ignore_errors, include_file_paths, cloud_options, retries, file_cache_ttl ))] fn new_from_ndjson( - path: Option, + path: Option, paths: Vec, infer_schema_length: Option, schema: Option>, @@ -52,37 +52,48 @@ impl PyLazyFrame { offset, }); - #[cfg(feature = "cloud")] - let cloud_options = { - let first_path = if let Some(path) = &path { - path - } else { - paths - .first() - .ok_or_else(|| PyValueError::new_err("expected a path argument"))? - }; + use std::path::Path; - let first_path_url = first_path.to_string_lossy(); + use polars_plan::plans::ScanSources; + use EitherPythonFileOrPath as EF; - let mut cloud_options = if let Some(opts) = cloud_options { - parse_cloud_options(&first_path_url, opts)? - } else { - parse_cloud_options(&first_path_url, vec![])? 
- }; + use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; + let (first_path, mut r) = match path + .map(|py_f| get_either_file_or_path(py_f, false)) + .transpose()? + { + Some(EF::Path(path)) => { + let reader = LazyJsonLineReader::new(>::as_ref(&path)); + (Some(path), reader) + }, + Some(EF::Py(f)) => ( + None, + LazyJsonLineReader::new_sourced(ScanSources::Buffers([f.as_arc()].into())), + ), + None => ( + Some( + paths + .first() + .cloned() + .ok_or_else(|| PyValueError::new_err("expected a path argument"))?, + ), + LazyJsonLineReader::new_paths(paths.into()), + ), + }; + #[cfg(feature = "cloud")] + if let Some(first_path) = first_path { + let first_path_url = first_path.to_string_lossy(); + + let mut cloud_options = + parse_cloud_options(&first_path_url, cloud_options.unwrap_or_default())?; cloud_options = cloud_options.with_max_retries(retries); if let Some(file_cache_ttl) = file_cache_ttl { cloud_options.file_cache_ttl = file_cache_ttl; } - Some(cloud_options) - }; - - let r = if let Some(path) = &path { - LazyJsonLineReader::new(path) - } else { - LazyJsonLineReader::new_paths(paths.into()) + r = r.with_cloud_options(Some(cloud_options)); }; let lf = r @@ -96,7 +107,6 @@ impl PyLazyFrame { .with_row_index(row_index) .with_ignore_errors(ignore_errors) .with_include_file_paths(include_file_paths.map(|x| x.into())) - .with_cloud_options(cloud_options) .finish() .map_err(PyPolarsErr::from)?; @@ -165,7 +175,7 @@ impl PyLazyFrame { .collect::() }); - use polars_plan::plans::ScanSource; + use polars_plan::plans::ScanSources; use EitherPythonFileOrPath as EF; let (first_path, mut r) = match path .map(|py_f| get_either_file_or_path(py_f, false)) @@ -177,7 +187,7 @@ impl PyLazyFrame { }, Some(EF::Py(f)) => ( None, - LazyCsvReader::new_sourced(ScanSource::Buffer(f.as_arc())), + LazyCsvReader::new_sourced(ScanSources::Buffers([f.as_arc()].into())), ), None => ( Some( @@ -310,7 +320,7 @@ impl PyLazyFrame { include_file_paths: include_file_paths.map(|x| x.into()), }; - use polars_plan::plans::ScanSource; + use polars_plan::plans::ScanSources; use EitherPythonFileOrPath as EF; let use_first_path = path.is_some(); let first_path = match path @@ -319,10 +329,13 @@ impl PyLazyFrame { { Some(EF::Path(path)) => path, Some(EF::Py(f)) => { - return LazyFrame::scan_parquet_sourced(ScanSource::Buffer(f.as_arc()), args) - .map(Self::from) - .map_err(PyPolarsErr::from) - .map_err(From::from); + return LazyFrame::scan_parquet_sourced( + ScanSources::Buffers([f.as_arc()].into()), + args, + ) + .map(Self::from) + .map_err(PyPolarsErr::from) + .map_err(From::from); }, None => paths .first() @@ -392,7 +405,7 @@ impl PyLazyFrame { include_file_paths: include_file_paths.map(|x| x.into()), }; - use polars_plan::plans::ScanSource; + use polars_plan::plans::ScanSources; use EitherPythonFileOrPath as EF; let use_first_path = path.is_some(); let first_path = match path @@ -401,10 +414,13 @@ impl PyLazyFrame { { Some(EF::Path(path)) => path, Some(EF::Py(f)) => { - return LazyFrame::scan_ipc_sourced(ScanSource::Buffer(f.as_arc()), args) - .map(Self::from) - .map_err(PyPolarsErr::from) - .map_err(From::from); + return LazyFrame::scan_ipc_sourced( + ScanSources::Buffers([f.as_arc()].into()), + args, + ) + .map(Self::from) + .map_err(PyPolarsErr::from) + .map_err(From::from); }, None => paths .first() diff --git a/crates/polars-python/src/lazyframe/visitor/nodes.rs b/crates/polars-python/src/lazyframe/visitor/nodes.rs index 3c31ff11b63a..37a51e4d481d 100644 --- 
a/crates/polars-python/src/lazyframe/visitor/nodes.rs +++ b/crates/polars-python/src/lazyframe/visitor/nodes.rs @@ -327,7 +327,7 @@ pub(crate) fn into_py(py: Python<'_>, plan: &IR) -> PyResult { } => Scan { paths: sources .try_into_paths() - .map_err(|_| PyNotImplementedError::new_err("scan with BytesIO"))? + .ok_or_else(|| PyNotImplementedError::new_err("scan with BytesIO"))? .to_object(py), // TODO: file info file_info: py.None(), diff --git a/crates/polars-stream/src/utils/late_materialized_df.rs b/crates/polars-stream/src/utils/late_materialized_df.rs index 87fe97135aad..9e7322167f7f 100644 --- a/crates/polars-stream/src/utils/late_materialized_df.rs +++ b/crates/polars-stream/src/utils/late_materialized_df.rs @@ -4,7 +4,7 @@ use parking_lot::Mutex; use polars_core::frame::DataFrame; use polars_core::schema::Schema; use polars_error::PolarsResult; -use polars_plan::plans::{AnonymousScan, AnonymousScanArgs, FileInfo, FileScan, ScanSource, IR}; +use polars_plan::plans::{AnonymousScan, AnonymousScanArgs, FileInfo, FileScan, ScanSources, IR}; use polars_plan::prelude::{AnonymousScanOptions, FileScanOptions}; /// Used to insert a dataframe into in-memory-engine query plan after the query @@ -25,7 +25,7 @@ impl LateMaterializedDataFrame { fmt_str: "LateMaterializedDataFrame", }); IR::Scan { - sources: ScanSource::Files(Arc::default()), + sources: ScanSources::Files(Arc::default()), file_info: FileInfo::new(schema, None, (None, usize::MAX)), hive_parts: None, predicate: None, diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index ef01b24955b0..253af8042b84 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -422,7 +422,7 @@ def scan_parquet( if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - elif isinstance(source, (IO, BytesIO)): + elif isinstance(source, (IO, io.BytesIO)): sources = [] else: source = [ From 87aef7121c348bc249e85176ec0d3dba664527b3 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 4 Sep 2024 18:01:24 +0200 Subject: [PATCH 05/27] almost completely working :) --- Cargo.lock | 2 + .../src/executors/scan/csv.rs | 23 +-- .../src/executors/scan/ndjson.rs | 65 +++---- .../src/executors/scan/parquet.rs | 46 ++--- crates/polars-plan/Cargo.toml | 1 + .../polars-plan/src/plans/conversion/scans.rs | 74 ++------ crates/polars-plan/src/plans/ir/mod.rs | 38 +++- .../src/plans/optimizer/count_star.rs | 2 +- crates/polars-python/Cargo.toml | 1 + crates/polars-python/src/conversion/mod.rs | 57 ++++++ crates/polars-python/src/file.rs | 9 +- crates/polars-python/src/lazyframe/general.rs | 177 +++++------------- crates/polars-utils/src/mmap.rs | 7 + py-polars/polars/io/csv/functions.py | 6 +- py-polars/polars/io/ipc/functions.py | 7 +- py-polars/polars/io/ndjson.py | 11 +- py-polars/polars/io/parquet/functions.py | 6 +- py-polars/tests/unit/io/test_scan.py | 45 +++++ 18 files changed, 292 insertions(+), 285 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 332550f89f89..5a0e38a53d31 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3367,6 +3367,7 @@ dependencies = [ "ahash", "bitflags", "bytemuck", + "bytes", "chrono", "chrono-tz", "ciborium", @@ -3403,6 +3404,7 @@ dependencies = [ "ahash", "arboard", "bytemuck", + "bytes", "ciborium", "either", "itoa", diff --git a/crates/polars-mem-engine/src/executors/scan/csv.rs b/crates/polars-mem-engine/src/executors/scan/csv.rs index b06386cdfa03..7a2ac0d34950 100644 --- 
a/crates/polars-mem-engine/src/executors/scan/csv.rs +++ b/crates/polars-mem-engine/src/executors/scan/csv.rs @@ -5,6 +5,7 @@ use polars_core::utils::{ accumulate_dataframes_vertical, accumulate_dataframes_vertical_unchecked, }; use polars_error::feature_gated; +use polars_utils::mmap::MemSlice; use super::*; @@ -67,7 +68,7 @@ impl CsvExec { let source = self.sources.at(i); let owned = &mut vec![]; - let mut df = match source { + let memslice = match source { ScanSourceRef::File(path) => { let file = if run_async { feature_gated!("cloud", { @@ -82,21 +83,17 @@ impl CsvExec { }?; let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; - options - .into_reader_with_file_handle(std::io::Cursor::new( - maybe_decompress_bytes(mmap.as_ref(), owned)?, - )) - ._with_predicate(predicate.clone()) - .finish()? + MemSlice::from_mmap(Arc::new(mmap)) }, - ScanSourceRef::Buffer(buffer) => options - .into_reader_with_file_handle(std::io::Cursor::new(maybe_decompress_bytes( - buffer, owned, - )?)) - ._with_predicate(predicate.clone()) - .finish()?, + ScanSourceRef::Buffer(buffer) => MemSlice::from_bytes(buffer.clone()), }; + let reader = std::io::Cursor::new(maybe_decompress_bytes(&memslice, owned)?); + let mut df = options + .into_reader_with_file_handle(reader) + ._with_predicate(predicate.clone()) + .finish()?; + if let Some(col) = &self.file_options.include_file_paths { let name = source.to_file_path(); diff --git a/crates/polars-mem-engine/src/executors/scan/ndjson.rs b/crates/polars-mem-engine/src/executors/scan/ndjson.rs index 27aab29fd0c1..b37f76ee826d 100644 --- a/crates/polars-mem-engine/src/executors/scan/ndjson.rs +++ b/crates/polars-mem-engine/src/executors/scan/ndjson.rs @@ -1,6 +1,7 @@ use polars_core::config; use polars_core::utils::accumulate_dataframes_vertical; use polars_error::feature_gated; +use polars_utils::mmap::MemSlice; use super::*; @@ -75,8 +76,7 @@ impl JsonExec { let row_index = self.file_scan_options.row_index.as_mut(); - let owned = &mut vec![]; - let df = match source { + let memslice = match source { ScanSourceRef::File(path) => { let file = if run_async { feature_gated!("cloud", { @@ -97,49 +97,30 @@ impl JsonExec { } }; - let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; - let curs = std::io::Cursor::new( - match maybe_decompress_bytes(mmap.as_ref(), owned) { - Ok(v) => v, - Err(e) => return Some(Err(e)), - }, - ); - let reader = JsonLineReader::new(curs); - - reader - .with_schema(schema.clone()) - .with_rechunk(self.file_scan_options.rechunk) - .with_chunk_size(Some(self.options.chunk_size)) - .with_row_index(row_index) - .with_predicate(self.predicate.clone().map(phys_expr_to_io_expr)) - .with_projection(self.file_scan_options.with_columns.clone()) - .low_memory(self.options.low_memory) - .with_n_rows(n_rows) - .with_ignore_errors(self.options.ignore_errors) - .finish() - }, - ScanSourceRef::Buffer(buff) => { - let curs = - std::io::Cursor::new(match maybe_decompress_bytes(buff, owned) { - Ok(v) => v, - Err(e) => return Some(Err(e)), - }); - let reader = JsonLineReader::new(curs); - - reader - .with_schema(schema.clone()) - .with_rechunk(self.file_scan_options.rechunk) - .with_chunk_size(Some(self.options.chunk_size)) - .with_row_index(row_index) - .with_predicate(self.predicate.clone().map(phys_expr_to_io_expr)) - .with_projection(self.file_scan_options.with_columns.clone()) - .low_memory(self.options.low_memory) - .with_n_rows(n_rows) - .with_ignore_errors(self.options.ignore_errors) - .finish() + MemSlice::from_mmap(Arc::new(unsafe { 
memmap::Mmap::map(&file).unwrap() })) }, + ScanSourceRef::Buffer(buff) => MemSlice::from_bytes(buff.clone()), }; + let owned = &mut vec![]; + let curs = std::io::Cursor::new(match maybe_decompress_bytes(&memslice, owned) { + Ok(v) => v, + Err(e) => return Some(Err(e)), + }); + let reader = JsonLineReader::new(curs); + + let df = reader + .with_schema(schema.clone()) + .with_rechunk(self.file_scan_options.rechunk) + .with_chunk_size(Some(self.options.chunk_size)) + .with_row_index(row_index) + .with_predicate(self.predicate.clone().map(phys_expr_to_io_expr)) + .with_projection(self.file_scan_options.with_columns.clone()) + .low_memory(self.options.low_memory) + .with_n_rows(n_rows) + .with_ignore_errors(self.options.ignore_errors) + .finish(); + let mut df = match df { Ok(df) => df, Err(e) => return Some(Err(e)), diff --git a/crates/polars-mem-engine/src/executors/scan/parquet.rs b/crates/polars-mem-engine/src/executors/scan/parquet.rs index bb47eb458a49..509ea7ba8c55 100644 --- a/crates/polars-mem-engine/src/executors/scan/parquet.rs +++ b/crates/polars-mem-engine/src/executors/scan/parquet.rs @@ -8,6 +8,7 @@ use polars_io::cloud::CloudOptions; use polars_io::parquet::metadata::FileMetaDataRef; use polars_io::utils::slice::split_slice_at_file; use polars_io::RowIndex; +use polars_utils::mmap::MemSlice; use super::*; @@ -81,10 +82,8 @@ impl ParquetExec { let base_row_index = self.file_options.row_index.take(); // Limit no. of files at a time to prevent open file limits. - let paths = self.sources.as_paths(); - for i in slice_info.source_slice.step_by(step) { - let end = std::cmp::min(i.saturating_add(step), paths.len()); + let end = std::cmp::min(i.saturating_add(step), self.sources.len()); let hive_parts = self.hive_parts.as_ref().map(|x| &x[i..end]); if current_offset >= slice_info.item_slice.end && !result.is_empty() { @@ -106,29 +105,30 @@ impl ParquetExec { hive_partitions.as_deref(), ); - match source { + let memslice = match source { ScanSourceRef::File(path) => { let file = std::fs::File::open(path)?; - - let mut reader = ParquetReader::new(file) - .read_parallel(parallel) - .set_low_memory(self.options.low_memory) - .use_statistics(self.options.use_statistics) - .set_rechunk(false) - .with_hive_partition_columns(hive_partitions) - .with_include_file_path( - self.file_options - .include_file_paths - .as_ref() - .map(|x| (x.clone(), Arc::from(paths[i].to_str().unwrap()))), - ); - - reader - .num_rows() - .map(|num_rows| (reader, num_rows, predicate, projection)) + MemSlice::from_mmap(Arc::new(unsafe { memmap::Mmap::map(&file).unwrap() })) }, - ScanSourceRef::Buffer(_) => todo!(), - } + ScanSourceRef::Buffer(buff) => MemSlice::from_bytes(buff.clone()), + }; + + let mut reader = ParquetReader::new(std::io::Cursor::new(memslice)) + .read_parallel(parallel) + .set_low_memory(self.options.low_memory) + .use_statistics(self.options.use_statistics) + .set_rechunk(false) + .with_hive_partition_columns(hive_partitions) + .with_include_file_path( + self.file_options + .include_file_paths + .as_ref() + .map(|x| (x.clone(), Arc::from(source.to_file_path()))), + ); + + reader + .num_rows() + .map(|num_rows| (reader, num_rows, predicate, projection)) }); // We do this in parallel because wide tables can take a long time deserializing metadata. 
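The executor hunks above (csv.rs, ndjson.rs, parquet.rs) all converge on the same pattern: a file-backed source is memory-mapped while an in-memory source is used as-is, and both are wrapped in a `MemSlice` that a `std::io::Cursor` can read, so the reader setup and decompression path only exist once. The following is a minimal, self-contained sketch of that pattern; the `Source` enum and `source_to_memslice` helper are hypothetical names for illustration and are not part of the patch, while `MemSlice::from_mmap` / `MemSlice::from_bytes` are the calls used in the hunks above.

```rust
use std::sync::Arc;

use polars_error::PolarsResult;
use polars_utils::mmap::MemSlice;

// Hypothetical stand-in for `ScanSourceRef`: either a path on disk or an
// in-memory `bytes::Bytes` buffer (e.g. coming from a Python BytesIO).
enum Source<'a> {
    File(&'a std::path::Path),
    Buffer(&'a bytes::Bytes),
}

// Sketch of the shared conversion used by the executors above: both variants
// end up as a `MemSlice`, so downstream code can read them uniformly through
// `std::io::Cursor<MemSlice>`.
fn source_to_memslice(source: Source<'_>) -> PolarsResult<MemSlice> {
    match source {
        Source::File(path) => {
            let file = std::fs::File::open(path)?;
            // SAFETY: same invariant as in the executors; the mapped file must
            // not be truncated while the mapping is alive.
            let mmap = unsafe { memmap::Mmap::map(&file)? };
            Ok(MemSlice::from_mmap(Arc::new(mmap)))
        },
        Source::Buffer(bytes) => Ok(MemSlice::from_bytes(bytes.clone())),
    }
}
```

The payoff of this shape is visible in the diffs above: the per-format readers no longer branch on file vs. buffer, which is what makes the later `to_memslice` helper on `ScanSourceRef` possible.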
diff --git a/crates/polars-plan/Cargo.toml b/crates/polars-plan/Cargo.toml index b37b9b445f10..dd33428c8398 100644 --- a/crates/polars-plan/Cargo.toml +++ b/crates/polars-plan/Cargo.toml @@ -26,6 +26,7 @@ ahash = { workspace = true } arrow = { workspace = true } bitflags = { workspace = true } bytemuck = { workspace = true } +bytes = { workspace = true } chrono = { workspace = true, optional = true } chrono-tz = { workspace = true, optional = true } ciborium = { workspace = true, optional = true } diff --git a/crates/polars-plan/src/plans/conversion/scans.rs b/crates/polars-plan/src/plans/conversion/scans.rs index e234521c0a51..50da756f9a38 100644 --- a/crates/polars-plan/src/plans/conversion/scans.rs +++ b/crates/polars-plan/src/plans/conversion/scans.rs @@ -1,6 +1,3 @@ -use std::path::PathBuf; -use std::sync::Arc; - use either::Either; use polars_io::path_utils::is_cloud_url; #[cfg(feature = "cloud")] @@ -182,47 +179,18 @@ pub(super) fn csv_file_info( let infer_schema_func = |i| { let source = sources.at(i); + let memslice = source.to_memslice(run_async, cache_entries.as_ref(), i)?; let owned = &mut vec![]; - match source { - ScanSourceRef::File(path) => { - let file = if run_async { - feature_gated!("cloud", { - let entry: &Arc = - &cache_entries.as_ref().unwrap()[i]; - entry.try_open_check_latest()? - }) - } else { - polars_utils::open_file(path)? - }; - - let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; - let mut reader = - std::io::Cursor::new(maybe_decompress_bytes(mmap.as_ref(), owned)?); - - if reader.read(&mut [0; 4])? < 2 && csv_options.raise_if_empty { - polars_bail!(NoData: "empty CSV") - } - reader.rewind()?; - - let reader_bytes = get_reader_bytes(&mut reader).expect("could not mmap file"); - - // this needs a way to estimated bytes/rows. - SchemaInferenceResult::try_from_reader_bytes_and_options(&reader_bytes, csv_options) - }, - ScanSourceRef::Buffer(buffer) => { - let mut reader = std::io::Cursor::new(maybe_decompress_bytes(buffer, owned)?); - - if reader.read(&mut [0; 4])? < 2 && csv_options.raise_if_empty { - polars_bail!(NoData: "empty CSV") - } - reader.rewind()?; + let mut reader = std::io::Cursor::new(maybe_decompress_bytes(&memslice, owned)?); + if reader.read(&mut [0; 4])? < 2 && csv_options.raise_if_empty { + polars_bail!(NoData: "empty CSV") + } + reader.rewind()?; - let reader_bytes = get_reader_bytes(&mut reader).expect("could not open file"); + let reader_bytes = get_reader_bytes(&mut reader).expect("could not mmap file"); - // this needs a way to estimated bytes/rows. - SchemaInferenceResult::try_from_reader_bytes_and_options(&reader_bytes, csv_options) - }, - } + // this needs a way to estimated bytes/rows. + SchemaInferenceResult::try_from_reader_bytes_and_options(&reader_bytes, csv_options) }; let merge_func = |a: PolarsResult, @@ -336,27 +304,11 @@ pub(super) fn ndjson_file_info( ) } } else { - let schema = match first { - ScanSourceRef::File(path) => { - let f = if run_async { - feature_gated!("cloud", { - cache_entries.unwrap()[0].try_open_check_latest()? - }) - } else { - polars_utils::open_file(path)? - }; - - let mmap = unsafe { memmap::Mmap::map(&f).unwrap() }; - let mut reader = - std::io::Cursor::new(maybe_decompress_bytes(mmap.as_ref(), owned)?); + let memslice = first.to_memslice(run_async, cache_entries.as_ref(), 0)?; + let mut reader = std::io::Cursor::new(maybe_decompress_bytes(&memslice, owned)?); - polars_io::ndjson::infer_schema(&mut reader, ndjson_options.infer_schema_length)? 
- }, - ScanSourceRef::Buffer(buff) => { - let mut reader = std::io::Cursor::new(maybe_decompress_bytes(buff, owned)?); - polars_io::ndjson::infer_schema(&mut reader, ndjson_options.infer_schema_length)? - }, - }; + let schema = + polars_io::ndjson::infer_schema(&mut reader, ndjson_options.infer_schema_length)?; prepare_schemas(schema, file_options.row_index.as_ref()) }; diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index db73f7a13528..95a7a5aaf374 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -11,9 +11,12 @@ use std::path::{Path, PathBuf}; pub use dot::{EscapeLabel, IRDotDisplay, PathsDisplay}; pub use format::{ExprIRDisplay, IRDisplay}; use hive::HivePartitions; +use polars_core::error::feature_gated; use polars_core::prelude::*; use polars_core::POOL; +use polars_io::file_cache::FileCacheEntry; use polars_utils::idx_vec::UnitVec; +use polars_utils::mmap::MemSlice; use polars_utils::unitvec; #[cfg(feature = "ir_serde")] use serde::{Deserialize, Serialize}; @@ -38,13 +41,14 @@ pub struct IRPlanRef<'a> { #[derive(Debug, Clone, Hash, PartialEq, Eq)] pub enum ScanSources { Files(Arc<[PathBuf]>), - Buffers(Arc<[Arc<[u8]>]>), + #[cfg_attr(feature = "serde", serde(skip))] + Buffers(Arc<[bytes::Bytes]>), } #[derive(Debug, Clone, Copy)] pub enum ScanSourceRef<'a> { File(&'a Path), - Buffer(&'a [u8]), + Buffer(&'a bytes::Bytes), } pub struct ScanSourceSliceInfo { @@ -65,6 +69,29 @@ impl<'a> ScanSourceRef<'a> { ScanSourceRef::Buffer(_) => "in-mem", } } + + pub fn to_memslice( + &self, + run_async: bool, + cache_entries: Option<&Vec>>, + index: usize, + ) -> PolarsResult { + match self { + Self::File(path) => { + let f = if run_async { + feature_gated!("cloud", { + cache_entries.unwrap()[index].try_open_check_latest()? + }) + } else { + polars_utils::open_file(path)? + }; + + let mmap = unsafe { memmap::Mmap::map(&f)? }; + Ok(MemSlice::from_mmap(Arc::new(mmap))) + }, + Self::Buffer(buff) => Ok(MemSlice::from_bytes((*buff).clone())), + } + } } impl ScanSources { @@ -95,6 +122,13 @@ impl ScanSources { } } + pub fn first_path(&self) -> Option<&Path> { + match self { + ScanSources::Files(paths) => paths.first().map(|p| p.as_path()), + ScanSources::Buffers(_) => None, + } + } + pub fn to_dsl(self, is_expanded: bool) -> DslScanSources { DslScanSources { sources: self, diff --git a/crates/polars-plan/src/plans/optimizer/count_star.rs b/crates/polars-plan/src/plans/optimizer/count_star.rs index c8570b0f908f..02c8b94a033c 100644 --- a/crates/polars-plan/src/plans/optimizer/count_star.rs +++ b/crates/polars-plan/src/plans/optimizer/count_star.rs @@ -68,7 +68,7 @@ fn visit_logical_plan_for_scan_paths( IR::Union { inputs, .. 
} => { enum MutableSources { Files(Vec), - Buffers(Vec>), + Buffers(Vec), } let mut scan_type: Option = None; diff --git a/crates/polars-python/Cargo.toml b/crates/polars-python/Cargo.toml index 03178d684e34..b93d34a678e5 100644 --- a/crates/polars-python/Cargo.toml +++ b/crates/polars-python/Cargo.toml @@ -25,6 +25,7 @@ polars-stream = { workspace = true } ahash = { workspace = true } arboard = { workspace = true, optional = true } bytemuck = { workspace = true } +bytes = { workspace = true } ciborium = { workspace = true } either = { workspace = true } itoa = { workspace = true } diff --git a/crates/polars-python/src/conversion/mod.rs b/crates/polars-python/src/conversion/mod.rs index 8d5c96f3b58c..886b6f744552 100644 --- a/crates/polars-python/src/conversion/mod.rs +++ b/crates/polars-python/src/conversion/mod.rs @@ -3,6 +3,7 @@ pub(crate) mod chunked_array; mod datetime; use std::fmt::{Display, Formatter}; use std::hash::{Hash, Hasher}; +use std::path::PathBuf; #[cfg(feature = "object")] use polars::chunked_array::object::PolarsObjectSafe; @@ -19,6 +20,7 @@ use polars_core::utils::materialize_dyn_int; use polars_lazy::prelude::*; #[cfg(feature = "parquet")] use polars_parquet::write::StatisticsOptions; +use polars_plan::plans::ScanSources; use polars_utils::pl_str::PlSmallStr; use polars_utils::total_ord::{TotalEq, TotalHash}; use pyo3::basic::CompareOp; @@ -29,6 +31,7 @@ use pyo3::pybacked::PyBackedStr; use pyo3::types::{PyDict, PyList, PySequence}; use crate::error::PyPolarsErr; +use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; #[cfg(feature = "object")] use crate::object::OBJECT_NAME; use crate::prelude::*; @@ -528,6 +531,60 @@ impl<'py> FromPyObject<'py> for Wrap { } } +impl<'py> FromPyObject<'py> for Wrap { + fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult { + let list = ob.downcast::()?.to_owned(); + + if list.is_empty() { + return Ok(Wrap(ScanSources::default())); + } + + enum MutableSources { + Files(Vec), + Buffers(Vec), + } + + let num_items = list.len(); + let mut iter = list + .into_iter() + .map(|val| get_either_file_or_path(val.unbind(), false)); + + let Some(first) = iter.next() else { + return Ok(Wrap(ScanSources::default())); + }; + + let mut sources = match first? { + EitherPythonFileOrPath::Py(f) => { + let mut sources = Vec::with_capacity(num_items); + sources.push(f.as_bytes()); + MutableSources::Buffers(sources) + }, + EitherPythonFileOrPath::Path(path) => { + let mut sources = Vec::with_capacity(num_items); + sources.push(path); + MutableSources::Files(sources) + }, + }; + + for source in iter { + match (&mut sources, source?) 
{ + (MutableSources::Files(v), EitherPythonFileOrPath::Path(p)) => v.push(p), + (MutableSources::Buffers(v), EitherPythonFileOrPath::Py(f)) => v.push(f.as_bytes()), + _ => { + return Err(PyTypeError::new_err( + "Cannot combine in-memory bytes and paths for scan sources", + )) + }, + } + } + + Ok(Wrap(match sources { + MutableSources::Files(i) => ScanSources::Files(i.into()), + MutableSources::Buffers(i) => ScanSources::Buffers(i.into()), + })) + } +} + impl IntoPy for Wrap<&Schema> { fn into_py(self, py: Python<'_>) -> PyObject { let dict = PyDict::new_bound(py); diff --git a/crates/polars-python/src/file.rs b/crates/polars-python/src/file.rs index 6225ee5427f7..3d180e2bedf3 100644 --- a/crates/polars-python/src/file.rs +++ b/crates/polars-python/src/file.rs @@ -7,7 +7,6 @@ use std::io::{Cursor, ErrorKind, Read, Seek, SeekFrom, Write}; #[cfg(target_family = "unix")] use std::os::fd::{FromRawFd, RawFd}; use std::path::PathBuf; -use std::sync::Arc; use polars::io::mmap::MmapBytesReader; use polars_error::{polars_err, polars_warn}; @@ -32,7 +31,7 @@ impl PyFileLikeObject { PyFileLikeObject { inner: object } } - pub fn as_arc(&self) -> Arc<[u8]> { + pub fn as_bytes(&self) -> bytes::Bytes { self.as_file_buffer().into_inner().into() } @@ -252,7 +251,7 @@ pub fn get_either_file_or_path(py_f: PyObject, write: bool) -> PyResult PyResult<(EitherRustPythonFile, Option)> { @@ -366,7 +365,7 @@ fn get_either_file_and_path( /// # Arguments /// * `write` - open for writing; will truncate existing file and create new file if not. pub fn get_either_file(py_f: PyObject, write: bool) -> PyResult { - Ok(get_either_file_and_path(py_f, write)?.0) + Ok(get_either_buffer_or_path(py_f, write)?.0) } pub fn get_file_like(f: PyObject, truncate: bool) -> PyResult> { @@ -403,7 +402,7 @@ pub fn get_mmap_bytes_reader_and_path<'a>( } // string so read file else { - match get_either_file_and_path(py_f.to_object(py_f.py()), false)? { + match get_either_buffer_or_path(py_f.to_object(py_f.py()), false)? { (EitherRustPythonFile::Rust(f), path) => Ok((Box::new(f), path)), (EitherRustPythonFile::Py(f), path) => Ok((Box::new(f), path)), } diff --git a/crates/polars-python/src/lazyframe/general.rs b/crates/polars-python/src/lazyframe/general.rs index 40c69b260c40..30206d0c088b 100644 --- a/crates/polars-python/src/lazyframe/general.rs +++ b/crates/polars-python/src/lazyframe/general.rs @@ -7,7 +7,7 @@ use polars::time::*; use polars_core::prelude::*; #[cfg(feature = "parquet")] use polars_parquet::arrow::write::StatisticsOptions; -use pyo3::exceptions::PyValueError; +use polars_plan::plans::ScanSources; use pyo3::prelude::*; use pyo3::pybacked::PyBackedStr; use pyo3::types::{PyDict, PyList}; @@ -20,6 +20,18 @@ use crate::lazyframe::visit::NodeTraverser; use crate::prelude::*; use crate::{PyDataFrame, PyExpr, PyLazyGroupBy}; +fn pyobject_to_first_path_and_scan_sources( + obj: PyObject, +) -> PyResult<(Option, ScanSources)> { + use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; + Ok(match get_either_file_or_path(obj, false)? 
{ + EitherPythonFileOrPath::Path(path) => { + (Some(path.clone()), ScanSources::Files([path].into())) + }, + EitherPythonFileOrPath::Py(f) => (None, ScanSources::Buffers([f.as_bytes()].into())), + }) +} + #[pymethods] #[allow(clippy::should_implement_trait)] impl PyLazyFrame { @@ -27,12 +39,12 @@ impl PyLazyFrame { #[cfg(feature = "json")] #[allow(clippy::too_many_arguments)] #[pyo3(signature = ( - path, paths, infer_schema_length, schema, schema_overrides, batch_size, n_rows, low_memory, rechunk, + source, sources, infer_schema_length, schema, schema_overrides, batch_size, n_rows, low_memory, rechunk, row_index, ignore_errors, include_file_paths, cloud_options, retries, file_cache_ttl ))] fn new_from_ndjson( - path: Option, - paths: Vec, + source: Option, + sources: Wrap, infer_schema_length: Option, schema: Option>, schema_overrides: Option>, @@ -52,35 +64,14 @@ impl PyLazyFrame { offset, }); - use std::path::Path; - - use polars_plan::plans::ScanSources; - use EitherPythonFileOrPath as EF; - - use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; - let (first_path, mut r) = match path - .map(|py_f| get_either_file_or_path(py_f, false)) - .transpose()? - { - Some(EF::Path(path)) => { - let reader = LazyJsonLineReader::new(>::as_ref(&path)); - (Some(path), reader) - }, - Some(EF::Py(f)) => ( - None, - LazyJsonLineReader::new_sourced(ScanSources::Buffers([f.as_arc()].into())), - ), - None => ( - Some( - paths - .first() - .cloned() - .ok_or_else(|| PyValueError::new_err("expected a path argument"))?, - ), - LazyJsonLineReader::new_paths(paths.into()), - ), + let sources = sources.0; + let (first_path, sources) = match source { + None => (sources.first_path().map(|p| p.to_path_buf()), sources), + Some(source) => pyobject_to_first_path_and_scan_sources(source)?, }; + let mut r = LazyJsonLineReader::new_sourced(sources); + #[cfg(feature = "cloud")] if let Some(first_path) = first_path { let first_path_url = first_path.to_string_lossy(); @@ -115,7 +106,7 @@ impl PyLazyFrame { #[staticmethod] #[cfg(feature = "csv")] - #[pyo3(signature = (path, paths, separator, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, + #[pyo3(signature = (source, sources, separator, has_header, ignore_errors, skip_rows, n_rows, cache, overwrite_dtype, low_memory, comment_prefix, quote_char, null_values, missing_utf8_is_empty_string, infer_schema_length, with_schema_modify, rechunk, skip_rows_after_header, encoding, row_index, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines, decimal_comma, glob, schema, @@ -123,8 +114,8 @@ impl PyLazyFrame { ) )] fn new_from_csv( - path: Option, - paths: Vec, + source: Option, + sources: Wrap, separator: &str, has_header: bool, ignore_errors: bool, @@ -155,10 +146,6 @@ impl PyLazyFrame { file_cache_ttl: Option, include_file_paths: Option, ) -> PyResult { - use std::path::Path; - - use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; - let null_values = null_values.map(|w| w.0); let quote_char = quote_char.map(|s| s.as_bytes()[0]); let separator = separator.as_bytes()[0]; @@ -175,31 +162,14 @@ impl PyLazyFrame { .collect::() }); - use polars_plan::plans::ScanSources; - use EitherPythonFileOrPath as EF; - let (first_path, mut r) = match path - .map(|py_f| get_either_file_or_path(py_f, false)) - .transpose()? 
- { - Some(EF::Path(path)) => { - let reader = LazyCsvReader::new(>::as_ref(&path)); - (Some(path), reader) - }, - Some(EF::Py(f)) => ( - None, - LazyCsvReader::new_sourced(ScanSources::Buffers([f.as_arc()].into())), - ), - None => ( - Some( - paths - .first() - .cloned() - .ok_or_else(|| PyValueError::new_err("expected a path argument"))?, - ), - LazyCsvReader::new_paths(paths.into()), - ), + let sources = sources.0; + let (first_path, sources) = match source { + None => (sources.first_path().map(|p| p.to_path_buf()), sources), + Some(source) => pyobject_to_first_path_and_scan_sources(source)?, }; + let mut r = LazyCsvReader::new_sourced(sources); + #[cfg(feature = "cloud")] if let Some(first_path) = first_path { let first_path_url = first_path.to_string_lossy(); @@ -268,12 +238,12 @@ impl PyLazyFrame { #[cfg(feature = "parquet")] #[staticmethod] - #[pyo3(signature = (path, paths, n_rows, cache, parallel, rechunk, row_index, + #[pyo3(signature = (source, sources, n_rows, cache, parallel, rechunk, row_index, low_memory, cloud_options, use_statistics, hive_partitioning, hive_schema, try_parse_hive_dates, retries, glob, include_file_paths) )] fn new_from_parquet( - path: Option, - paths: Vec, + source: Option, + sources: Wrap, n_rows: Option, cache: bool, parallel: Wrap, @@ -289,8 +259,6 @@ impl PyLazyFrame { glob: bool, include_file_paths: Option, ) -> PyResult { - use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; - let parallel = parallel.0; let hive_schema = hive_schema.map(|s| Arc::new(s.0)); @@ -320,52 +288,31 @@ impl PyLazyFrame { include_file_paths: include_file_paths.map(|x| x.into()), }; - use polars_plan::plans::ScanSources; - use EitherPythonFileOrPath as EF; - let use_first_path = path.is_some(); - let first_path = match path - .map(|py_f| get_either_file_or_path(py_f, false)) - .transpose()? 
- { - Some(EF::Path(path)) => path, - Some(EF::Py(f)) => { - return LazyFrame::scan_parquet_sourced( - ScanSources::Buffers([f.as_arc()].into()), - args, - ) - .map(Self::from) - .map_err(PyPolarsErr::from) - .map_err(From::from); - }, - None => paths - .first() - .cloned() - .ok_or_else(|| PyValueError::new_err("expected a path argument"))?, + let sources = sources.0; + let (first_path, sources) = match source { + None => (sources.first_path().map(|p| p.to_path_buf()), sources), + Some(source) => pyobject_to_first_path_and_scan_sources(source)?, }; #[cfg(feature = "cloud")] - { + if let Some(first_path) = first_path { let first_path_url = first_path.to_string_lossy(); let cloud_options = parse_cloud_options(&first_path_url, cloud_options.unwrap_or_default())?; args.cloud_options = Some(cloud_options.with_max_retries(retries)); } - let lf = if use_first_path { - LazyFrame::scan_parquet(first_path, args) - } else { - LazyFrame::scan_parquet_files(Arc::from(paths), args) - } - .map_err(PyPolarsErr::from)?; + let lf = LazyFrame::scan_parquet_sourced(sources, args).map_err(PyPolarsErr::from)?; + Ok(lf.into()) } #[cfg(feature = "ipc")] #[staticmethod] - #[pyo3(signature = (path, paths, n_rows, cache, rechunk, row_index, memory_map, cloud_options, hive_partitioning, hive_schema, try_parse_hive_dates, retries, file_cache_ttl, include_file_paths))] + #[pyo3(signature = (source, sources, n_rows, cache, rechunk, row_index, memory_map, cloud_options, hive_partitioning, hive_schema, try_parse_hive_dates, retries, file_cache_ttl, include_file_paths))] fn new_from_ipc( - path: Option, - paths: Vec, + source: Option, + sources: Wrap, n_rows: Option, cache: bool, rechunk: bool, @@ -379,8 +326,6 @@ impl PyLazyFrame { file_cache_ttl: Option, include_file_paths: Option, ) -> PyResult { - use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; - let row_index = row_index.map(|(name, offset)| RowIndex { name: name.into(), offset, @@ -405,31 +350,14 @@ impl PyLazyFrame { include_file_paths: include_file_paths.map(|x| x.into()), }; - use polars_plan::plans::ScanSources; - use EitherPythonFileOrPath as EF; - let use_first_path = path.is_some(); - let first_path = match path - .map(|py_f| get_either_file_or_path(py_f, false)) - .transpose()? 
- { - Some(EF::Path(path)) => path, - Some(EF::Py(f)) => { - return LazyFrame::scan_ipc_sourced( - ScanSources::Buffers([f.as_arc()].into()), - args, - ) - .map(Self::from) - .map_err(PyPolarsErr::from) - .map_err(From::from); - }, - None => paths - .first() - .cloned() - .ok_or_else(|| PyValueError::new_err("expected a path argument"))?, + let sources = sources.0; + let (first_path, sources) = match source { + None => (sources.first_path().map(|p| p.to_path_buf()), sources), + Some(source) => pyobject_to_first_path_and_scan_sources(source)?, }; #[cfg(feature = "cloud")] - { + if let Some(first_path) = first_path { let first_path_url = first_path.to_string_lossy(); let mut cloud_options = @@ -440,12 +368,7 @@ impl PyLazyFrame { args.cloud_options = Some(cloud_options.with_max_retries(retries)); } - let lf = if use_first_path { - LazyFrame::scan_ipc(first_path, args) - } else { - LazyFrame::scan_ipc_files(paths.into(), args) - } - .map_err(PyPolarsErr::from)?; + let lf = LazyFrame::scan_ipc_sourced(sources, args).map_err(PyPolarsErr::from)?; Ok(lf.into()) } diff --git a/crates/polars-utils/src/mmap.rs b/crates/polars-utils/src/mmap.rs index d8db6d0ae671..c753525b43ee 100644 --- a/crates/polars-utils/src/mmap.rs +++ b/crates/polars-utils/src/mmap.rs @@ -46,6 +46,13 @@ mod private { } } + impl AsRef<[u8]> for MemSlice { + #[inline(always)] + fn as_ref(&self) -> &[u8] { + self.slice + } + } + impl Default for MemSlice { fn default() -> Self { Self::from_bytes(bytes::Bytes::new()) diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index b7b5c4764845..77cd73e0aa5f 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -1234,6 +1234,8 @@ def with_column_names(cols: list[str]) -> list[str]: source = normalize_filepath(source, check_not_directory=False) elif isinstance(source, (IO, BytesIO)): pass + elif isinstance(source, list) and isinstance(source[0], BytesIO): + pass else: source = [ normalize_filepath(source, check_not_directory=False) for source in source @@ -1331,8 +1333,8 @@ def _scan_csv_impl( storage_options = None pylf = PyLazyFrame.new_from_csv( - path=source, - paths=sources, + source, + sources, separator=separator, has_header=has_header, ignore_errors=ignore_errors, diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index 8f3c21bdf286..8c0138df2a36 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -347,7 +347,7 @@ def read_ipc_schema(source: str | Path | IO[bytes] | bytes) -> dict[str, DataTyp @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_ipc( - source: str | Path | list[str] | list[Path] | IO[str] | IO[bytes], + source: str | Path | IO[bytes] | list[str] | list[Path] | list[IO[bytes]], *, n_rows: int | None = None, cache: bool = True, @@ -430,8 +430,11 @@ def scan_ipc( if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) sources = [] - elif isinstance(source, (IO, BytesIO)): + elif isinstance(source, BytesIO): sources = [] + elif isinstance(source, list) and isinstance(source[0], BytesIO): + sources = source + source = None # type: ignore[assignment] else: sources = [ normalize_filepath(source, check_not_directory=False) for source in source diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index 5482ccc52c42..63032b5dc688 
100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -166,7 +166,7 @@ def read_ndjson( @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_ndjson( - source: str | Path | list[str] | list[Path] | IO[str] | IO[bytes], + source: str | Path | IO[str] | IO[bytes] | list[str] | list[Path] | list[IO[str]] | list[IO[bytes]], *, schema: SchemaDefinition | None = None, schema_overrides: SchemaDefinition | None = None, @@ -250,8 +250,11 @@ def scan_ndjson( if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) sources = [] - elif isinstance(source, (IO, BytesIO)): + elif isinstance(source, BytesIO): sources = [] + elif isinstance(source, list) and isinstance(source[0], BytesIO): + sources = source + source = None # type: ignore[assignment] else: sources = [ normalize_filepath(source, check_not_directory=False) for source in source @@ -268,8 +271,8 @@ def scan_ndjson( storage_options = None pylf = PyLazyFrame.new_from_ndjson( - path=source, - paths=sources, + source, + sources, infer_schema_length=infer_schema_length, schema=schema, schema_overrides=schema_overrides, diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 253af8042b84..583b8fddf326 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -295,7 +295,7 @@ def read_parquet_schema(source: str | Path | IO[bytes] | bytes) -> dict[str, Dat @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_parquet( - source: str | Path | list[str] | list[Path] | IO[str] | IO[bytes], + source: str | Path | IO[bytes] | list[str] | list[Path] | list[IO[bytes]], *, n_rows: int | None = None, row_index_name: str | None = None, @@ -422,8 +422,8 @@ def scan_parquet( if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - elif isinstance(source, (IO, io.BytesIO)): - sources = [] + elif isinstance(source, io.BytesIO) or (isinstance(source, list) and isinstance(source[0], io.BytesIO)): + pass else: source = [ normalize_filepath(source, check_not_directory=False) for source in source diff --git a/py-polars/tests/unit/io/test_scan.py b/py-polars/tests/unit/io/test_scan.py index 1bcc463bd2e7..cb33344a1fce 100644 --- a/py-polars/tests/unit/io/test_scan.py +++ b/py-polars/tests/unit/io/test_scan.py @@ -9,6 +9,7 @@ import pytest import polars as pl +import io from polars.testing.asserts.frame import assert_frame_equal if TYPE_CHECKING: @@ -690,3 +691,47 @@ def test_async_path_expansion_bracket_17629(tmp_path: Path) -> None: df.write_parquet(path) assert_frame_equal(pl.scan_parquet(tmp_path / "[d]ata.parquet").collect(), df) + + +@pytest.mark.parametrize( + "method", + ["parquet", "csv", "ipc", "ndjson"], +) +def test_scan_in_memory(method: str) -> None: + f = io.BytesIO() + df = pl.DataFrame({ + 'a': [1, 2, 3], + 'b': ['x', 'y', 'z'], + }) + + (getattr(df, f'write_{method}'))(f) + + f.seek(0) + result = (getattr(pl, f'scan_{method}'))(f).collect() + assert_frame_equal(df, result) + + f.seek(0) + result = (getattr(pl, f'scan_{method}'))(f).slice(1, 2).collect() + assert_frame_equal(df.slice(1, 2), result) + + f.seek(0) + result = (getattr(pl, f'scan_{method}'))(f).slice(-1, 1).collect() + 
assert_frame_equal(df.slice(-1, 1), result) + + g = io.BytesIO() + (getattr(df, f'write_{method}'))(g) + + f.seek(0) + g.seek(0) + result = (getattr(pl, f'scan_{method}'))([f, g]).collect() + assert_frame_equal(df.vstack(df), result) + + f.seek(0) + g.seek(0) + result = (getattr(pl, f'scan_{method}'))([f, g]).slice(1, 2).collect() + assert_frame_equal(df.vstack(df).slice(1, 2), result) + + f.seek(0) + g.seek(0) + result = (getattr(pl, f'scan_{method}'))([f, g]).slice(-1, 1).collect() + assert_frame_equal(df.vstack(df).slice(-1, 1), result) From 278e1a7269b3427a684fd7785f2bd12869a5bdbc Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Thu, 5 Sep 2024 16:41:26 +0200 Subject: [PATCH 06/27] no more failing tests --- crates/polars-lazy/src/scan/ndjson.rs | 9 +- .../src/executors/scan/parquet.rs | 99 ++++++++++++---- .../src/parquet/encoding/uleb128.rs | 1 + .../src/plans/conversion/dsl_to_ir.rs | 10 +- crates/polars-plan/src/plans/ir/format.rs | 2 +- crates/polars-plan/src/plans/ir/mod.rs | 111 +----------------- py-polars/polars/io/csv/functions.py | 8 +- py-polars/polars/io/ndjson.py | 9 +- py-polars/polars/io/parquet/functions.py | 4 +- py-polars/tests/unit/io/test_parquet.py | 6 +- py-polars/tests/unit/io/test_scan.py | 28 +++-- 11 files changed, 122 insertions(+), 165 deletions(-) diff --git a/crates/polars-lazy/src/scan/ndjson.rs b/crates/polars-lazy/src/scan/ndjson.rs index 8d71d9a585a2..6d0492e170dc 100644 --- a/crates/polars-lazy/src/scan/ndjson.rs +++ b/crates/polars-lazy/src/scan/ndjson.rs @@ -128,10 +128,11 @@ impl LazyFileListReader for LazyJsonLineReader { row_index: self.row_index, rechunk: self.rechunk, file_counter: 0, - hive_options: { - let mut options = HiveOptions::default(); - options.enabled = Some(false); - options + hive_options: HiveOptions { + enabled: Some(false), + hive_start_idx: 0, + schema: None, + try_parse_dates: true, }, glob: true, include_file_paths: self.include_file_paths, diff --git a/crates/polars-mem-engine/src/executors/scan/parquet.rs b/crates/polars-mem-engine/src/executors/scan/parquet.rs index 509ea7ba8c55..7de67aff4284 100644 --- a/crates/polars-mem-engine/src/executors/scan/parquet.rs +++ b/crates/polars-mem-engine/src/executors/scan/parquet.rs @@ -60,40 +60,94 @@ impl ParquetExec { let mut result = vec![]; let step = std::cmp::min(POOL.current_num_threads(), 128); - let slice_info = match self.file_options.slice { - None => ScanSourceSliceInfo { - item_slice: 0..usize::MAX, - source_slice: 0..self.sources.len(), - }, - Some(slice) => { - self.sources - .collect_slice_information(slice, |source| match source { - ScanSourceRef::File(path) => { - ParquetReader::new(std::fs::File::open(path)?).num_rows() - }, - ScanSourceRef::Buffer(buff) => { - ParquetReader::new(std::io::Cursor::new(buff)).num_rows() - }, - })? - }, + // Modified if we have a negative slice + let mut first_source = 0; + + // (offset, end) + let (slice_offset, slice_end) = if let Some(slice) = self.file_options.slice { + if slice.0 >= 0 { + (slice.0 as usize, slice.1.saturating_add(slice.0 as usize)) + } else { + // Walk the files in reverse until we find the first file, and then translate the + // slice into a positive-offset equivalent. 
+ let slice_start_as_n_from_end = -slice.0 as usize; + let mut cum_rows = 0; + let chunk_size = 8; + POOL.install(|| { + for path_indexes in (0..self.sources.len()) + .rev() + .collect::>() + .chunks(chunk_size) + { + let row_counts = path_indexes + .into_par_iter() + .map(|&i| { + let memslice = match self.sources.at(i) { + ScanSourceRef::File(path) => { + let file = std::fs::File::open(path)?; + MemSlice::from_mmap(Arc::new(unsafe { + memmap::Mmap::map(&file).unwrap() + })) + }, + ScanSourceRef::Buffer(buff) => { + MemSlice::from_bytes(buff.clone()) + }, + }; + + ParquetReader::new(std::io::Cursor::new(memslice)).num_rows() + }) + .collect::>>()?; + + for (path_idx, rc) in path_indexes.iter().zip(row_counts) { + cum_rows += rc; + + if cum_rows >= slice_start_as_n_from_end { + first_source = *path_idx; + break; + } + } + + if first_source > 0 { + break; + } + } + + PolarsResult::Ok(()) + })?; + + let (start, len) = if slice_start_as_n_from_end > cum_rows { + // We need to trim the slice, e.g. SLICE[offset: -100, len: 75] on a file of 50 + // rows should only give the first 25 rows. + let first_file_position = slice_start_as_n_from_end - cum_rows; + (0, slice.1.saturating_sub(first_file_position)) + } else { + (cum_rows - slice_start_as_n_from_end, slice.1) + }; + + let end = start.saturating_add(len); + + (start, end) + } + } else { + (0, usize::MAX) }; let mut current_offset = 0; let base_row_index = self.file_options.row_index.take(); // Limit no. of files at a time to prevent open file limits. - for i in slice_info.source_slice.step_by(step) { + for i in (first_source..self.sources.len()).step_by(step) { let end = std::cmp::min(i.saturating_add(step), self.sources.len()); let hive_parts = self.hive_parts.as_ref().map(|x| &x[i..end]); - if current_offset >= slice_info.item_slice.end && !result.is_empty() { + if current_offset >= slice_end && !result.is_empty() { return Ok(result); } // First initialize the readers, predicates and metadata. // This will be used to determine the slices. That way we can actually read all the // files in parallel even if we add row index columns or slices. - let iter = (0..self.sources.len()).into_par_iter().map(|i| { + let iter = (i..end).into_par_iter().map(|i| { let source = self.sources.at(i); let hive_partitions = hive_parts.map(|x| x[i].materialize_partition_columns()); @@ -141,12 +195,7 @@ impl ParquetExec { let cum_rows = *current_offset_ref; ( cum_rows, - split_slice_at_file( - current_offset_ref, - *num_rows, - slice_info.item_slice.start, - slice_info.item_slice.end, - ), + split_slice_at_file(current_offset_ref, *num_rows, slice_offset, slice_end), ) }) .collect::>(); diff --git a/crates/polars-parquet/src/parquet/encoding/uleb128.rs b/crates/polars-parquet/src/parquet/encoding/uleb128.rs index 08459233961c..0740c9575a15 100644 --- a/crates/polars-parquet/src/parquet/encoding/uleb128.rs +++ b/crates/polars-parquet/src/parquet/encoding/uleb128.rs @@ -1,5 +1,6 @@ // Reads an uleb128 encoded integer with at most 56 bits (8 bytes with 7 bits worth of payload each). /// Returns the integer and the number of bytes that made up this integer. +/// /// If the returned length is bigger than 8 this means the integer required more than 8 bytes and the remaining bytes need to be read sequentially and combined with the return value. 
/// /// # Safety diff --git a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs index 7966d6ff688e..1bf06322f090 100644 --- a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs +++ b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs @@ -197,7 +197,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult let mut owned = None; hive_partitions_from_paths( - &sources.as_paths(), + sources.as_paths(), file_options.hive_options.hive_start_idx, file_options.hive_options.schema.clone(), match resolved_file_info.reader_schema.as_ref().unwrap() { @@ -830,19 +830,19 @@ impl DslScanSources { let expanded_sources = match &scan_type { #[cfg(feature = "parquet")] FileScan::Parquet { cloud_options, .. } => { - expand_scan_paths_with_hive_update(&paths, file_options, cloud_options)? + expand_scan_paths_with_hive_update(paths, file_options, cloud_options)? }, #[cfg(feature = "ipc")] FileScan::Ipc { cloud_options, .. } => { - expand_scan_paths_with_hive_update(&paths, file_options, cloud_options)? + expand_scan_paths_with_hive_update(paths, file_options, cloud_options)? }, #[cfg(feature = "csv")] FileScan::Csv { cloud_options, .. } => { - expand_paths(&paths, file_options.glob, cloud_options.as_ref())? + expand_paths(paths, file_options.glob, cloud_options.as_ref())? }, #[cfg(feature = "json")] FileScan::NDJson { cloud_options, .. } => { - expand_paths(&paths, file_options.glob, cloud_options.as_ref())? + expand_paths(paths, file_options.glob, cloud_options.as_ref())? }, FileScan::Anonymous { .. } => unreachable!(), // Invariant: Anonymous scans are already expanded. }; diff --git a/crates/polars-plan/src/plans/ir/format.rs b/crates/polars-plan/src/plans/ir/format.rs index a69eb5203359..76de9f3beb24 100644 --- a/crates/polars-plan/src/plans/ir/format.rs +++ b/crates/polars-plan/src/plans/ir/format.rs @@ -243,7 +243,7 @@ impl<'a> IRDisplay<'a> { write_scan( f, scan_type.into(), - &sources, + sources, indent, n_columns, file_info.schema.len(), diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index 95a7a5aaf374..328efce28be9 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -13,7 +13,6 @@ pub use format::{ExprIRDisplay, IRDisplay}; use hive::HivePartitions; use polars_core::error::feature_gated; use polars_core::prelude::*; -use polars_core::POOL; use polars_io::file_cache::FileCacheEntry; use polars_utils::idx_vec::UnitVec; use polars_utils::mmap::MemSlice; @@ -103,7 +102,7 @@ impl ScanSources { } pub fn as_paths(&self) -> &[PathBuf] { match self { - Self::Files(paths) => &paths, + Self::Files(paths) => paths, Self::Buffers(_) => unimplemented!(), } } @@ -138,7 +137,7 @@ impl ScanSources { pub fn is_cloud_url(&self) -> bool { match self { - Self::Files(paths) => paths.first().map_or(false, |p| polars_io::is_cloud_url(p)), + Self::Files(paths) => paths.first().map_or(false, polars_io::is_cloud_url), Self::Buffers(_) => false, } } @@ -171,114 +170,10 @@ impl ScanSources { } } - /// Normalize the slice and collect information as to what rows and parts of the source are - /// used in this slice. 
- pub fn collect_slice_information( - &self, - slice: (i64, usize), - map_to_num_rows: impl Fn(ScanSourceRef) -> PolarsResult + Send + Sync, - ) -> PolarsResult { - fn slice_to_start_end( - offset: i64, - length: usize, - num_rows: usize, - ) -> std::ops::Range { - if offset < 0 { - let slice_start_as_n_from_end = -offset as usize; - let (start, len) = if slice_start_as_n_from_end > num_rows { - // We need to trim the slice, e.g. SLICE[offset: -100, len: 75] on a file of 50 - // rows should only give the first 25 rows. - let start_position = slice_start_as_n_from_end - num_rows; - (0, length.saturating_sub(start_position)) - } else { - (num_rows - slice_start_as_n_from_end, length) - }; - - let end = start.saturating_add(len); - - start..end - } else { - let offset = offset as usize; - offset.min(num_rows)..(offset + length).min(num_rows) - } - } - - let (offset, length) = slice; - - if self.is_empty() { - return Ok(ScanSourceSliceInfo { - item_slice: 0..0, - source_slice: 0..0, - }); - } - - if self.len() == 1 { - let num_rows = map_to_num_rows(self.get(0).unwrap())?; - let item_slice = slice_to_start_end(offset, length, num_rows); - let source_slice = if item_slice.is_empty() { 0..0 } else { 0..1 }; - - Ok(ScanSourceSliceInfo { - item_slice, - source_slice, - }) - } else { - use rayon::prelude::*; - - // Walk the files in reverse until we find the first file, and then translate the - // slice into a positive-offset equivalent. - const CHUNK_SIZE: usize = 8; - let mut row_counts = Vec::with_capacity(self.len()); - - POOL.install(|| { - for idx_end in (0..self.len()).step_by(CHUNK_SIZE) { - let idx_start = idx_end.saturating_sub(CHUNK_SIZE); - - row_counts.extend( - (idx_start..=idx_end) - .into_par_iter() - .map(|i| map_to_num_rows(self.at(i))) - .collect::>>()? 
- .into_iter() - .rev(), - ); - } - - PolarsResult::Ok(()) - })?; - - let num_rows = row_counts.iter().sum::(); - - let item_slice = slice_to_start_end(offset, length, num_rows); - - let mut source_start = self.len() - 1; - let mut source_end = 0; - - let mut sum = 0; - for (i, row_count) in row_counts.iter().rev().enumerate() { - if sum < item_slice.end { - source_end = usize::max(source_end, i); - } - - sum += row_count; - - if sum >= item_slice.start { - source_start = usize::min(source_start, i); - } - } - - let source_slice = source_start..source_end + 1; - - Ok(ScanSourceSliceInfo { - item_slice, - source_slice, - }) - } - } - pub fn get(&self, idx: usize) -> Option { match self { ScanSources::Files(paths) => paths.get(idx).map(|p| ScanSourceRef::File(p)), - ScanSources::Buffers(buffers) => buffers.get(idx).map(|b| ScanSourceRef::Buffer(b)), + ScanSources::Buffers(buffers) => buffers.get(idx).map(ScanSourceRef::Buffer), } } diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 77cd73e0aa5f..257522831cd3 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -1232,9 +1232,11 @@ def with_column_names(cols: list[str]) -> list[str]: if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - elif isinstance(source, (IO, BytesIO)): - pass - elif isinstance(source, list) and isinstance(source[0], BytesIO): + elif ( + isinstance(source, (IO, BytesIO)) + or isinstance(source, list) + and isinstance(source[0], BytesIO) + ): pass else: source = [ diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index 63032b5dc688..166e990ba25d 100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -166,7 +166,14 @@ def read_ndjson( @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_ndjson( - source: str | Path | IO[str] | IO[bytes] | list[str] | list[Path] | list[IO[str]] | list[IO[bytes]], + source: str + | Path + | IO[str] + | IO[bytes] + | list[str] + | list[Path] + | list[IO[str]] + | list[IO[bytes]], *, schema: SchemaDefinition | None = None, schema_overrides: SchemaDefinition | None = None, diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 583b8fddf326..2eda346e7c26 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -422,7 +422,9 @@ def scan_parquet( if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - elif isinstance(source, io.BytesIO) or (isinstance(source, list) and isinstance(source[0], io.BytesIO)): + elif isinstance(source, io.BytesIO) or ( + isinstance(source, list) and isinstance(source[0], io.BytesIO) + ): pass else: source = [ diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index b46f21f3893e..f57d8bbf5b38 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -12,7 +12,7 @@ import pyarrow.dataset as ds import pyarrow.parquet as pq import pytest -from hypothesis import HealthCheck, given, settings +from hypothesis import given from hypothesis import strategies as st import polars as pl @@ -1559,9 +1559,7 @@ def test_predicate_filtering( offset=st.integers(0, 100), length=st.integers(0, 100), ) -def test_slice_roundtrip( - df: pl.DataFrame, offset: int, 
length: int -) -> None: +def test_slice_roundtrip(df: pl.DataFrame, offset: int, length: int) -> None: offset %= df.height + 1 length %= df.height - offset + 1 diff --git a/py-polars/tests/unit/io/test_scan.py b/py-polars/tests/unit/io/test_scan.py index cb33344a1fce..a254daaeaa12 100644 --- a/py-polars/tests/unit/io/test_scan.py +++ b/py-polars/tests/unit/io/test_scan.py @@ -1,5 +1,6 @@ from __future__ import annotations +import io from dataclasses import dataclass from functools import partial from math import ceil @@ -9,7 +10,6 @@ import pytest import polars as pl -import io from polars.testing.asserts.frame import assert_frame_equal if TYPE_CHECKING: @@ -699,39 +699,41 @@ def test_async_path_expansion_bracket_17629(tmp_path: Path) -> None: ) def test_scan_in_memory(method: str) -> None: f = io.BytesIO() - df = pl.DataFrame({ - 'a': [1, 2, 3], - 'b': ['x', 'y', 'z'], - }) + df = pl.DataFrame( + { + "a": [1, 2, 3], + "b": ["x", "y", "z"], + } + ) - (getattr(df, f'write_{method}'))(f) + (getattr(df, f"write_{method}"))(f) f.seek(0) - result = (getattr(pl, f'scan_{method}'))(f).collect() + result = (getattr(pl, f"scan_{method}"))(f).collect() assert_frame_equal(df, result) f.seek(0) - result = (getattr(pl, f'scan_{method}'))(f).slice(1, 2).collect() + result = (getattr(pl, f"scan_{method}"))(f).slice(1, 2).collect() assert_frame_equal(df.slice(1, 2), result) f.seek(0) - result = (getattr(pl, f'scan_{method}'))(f).slice(-1, 1).collect() + result = (getattr(pl, f"scan_{method}"))(f).slice(-1, 1).collect() assert_frame_equal(df.slice(-1, 1), result) g = io.BytesIO() - (getattr(df, f'write_{method}'))(g) + (getattr(df, f"write_{method}"))(g) f.seek(0) g.seek(0) - result = (getattr(pl, f'scan_{method}'))([f, g]).collect() + result = (getattr(pl, f"scan_{method}"))([f, g]).collect() assert_frame_equal(df.vstack(df), result) f.seek(0) g.seek(0) - result = (getattr(pl, f'scan_{method}'))([f, g]).slice(1, 2).collect() + result = (getattr(pl, f"scan_{method}"))([f, g]).slice(1, 2).collect() assert_frame_equal(df.vstack(df).slice(1, 2), result) f.seek(0) g.seek(0) - result = (getattr(pl, f'scan_{method}'))([f, g]).slice(-1, 1).collect() + result = (getattr(pl, f"scan_{method}"))([f, g]).slice(-1, 1).collect() assert_frame_equal(df.vstack(df).slice(-1, 1), result) From 88e76511f592067b22fe1cbdf29ef92141dbd5b2 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Thu, 5 Sep 2024 17:19:00 +0200 Subject: [PATCH 07/27] minor fixes --- crates/polars-io/src/ipc/ipc_file.rs | 1 + crates/polars-lazy/src/scan/csv.rs | 14 ++--- .../polars-lazy/src/scan/file_list_reader.rs | 3 +- crates/polars-lazy/src/scan/ipc.rs | 11 ++-- crates/polars-lazy/src/scan/ndjson.rs | 7 ++- crates/polars-lazy/src/scan/parquet.rs | 11 ++-- .../src/executors/scan/ipc.rs | 58 ++++++++----------- crates/polars-python/src/lazyframe/general.rs | 8 +-- py-polars/polars/io/csv/functions.py | 21 +++++-- py-polars/polars/io/ndjson.py | 4 +- 10 files changed, 72 insertions(+), 66 deletions(-) diff --git a/crates/polars-io/src/ipc/ipc_file.rs b/crates/polars-io/src/ipc/ipc_file.rs index aa6546c8dd5a..9347a453b426 100644 --- a/crates/polars-io/src/ipc/ipc_file.rs +++ b/crates/polars-io/src/ipc/ipc_file.rs @@ -52,6 +52,7 @@ use crate::RowIndex; #[derive(Clone, Debug, PartialEq, Hash)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct IpcScanOptions { + /// Not used anymore. 
pub memory_map: bool, } diff --git a/crates/polars-lazy/src/scan/csv.rs b/crates/polars-lazy/src/scan/csv.rs index a8687aba3b8b..83e34cff0fe5 100644 --- a/crates/polars-lazy/src/scan/csv.rs +++ b/crates/polars-lazy/src/scan/csv.rs @@ -32,16 +32,12 @@ impl LazyCsvReader { } pub fn new_paths(paths: Arc<[PathBuf]>) -> Self { - Self::new("").with_paths(paths) + Self::new_with_sources(ScanSources::Files(paths)) } - pub fn new_sourced(sources: ScanSources) -> Self { - Self::new("").with_sources(sources) - } - - pub fn new(path: impl AsRef) -> Self { + pub fn new_with_sources(sources: ScanSources) -> Self { LazyCsvReader { - sources: ScanSources::Files([path.as_ref().to_path_buf()].into()), + sources, glob: true, cache: true, read_options: Default::default(), @@ -50,6 +46,10 @@ impl LazyCsvReader { } } + pub fn new(path: impl AsRef) -> Self { + Self::new_with_sources(ScanSources::Files([path.as_ref().to_path_buf()].into())) + } + /// Skip this number of rows after the header location. #[must_use] pub fn with_skip_rows_after_header(mut self, offset: usize) -> Self { diff --git a/crates/polars-lazy/src/scan/file_list_reader.rs b/crates/polars-lazy/src/scan/file_list_reader.rs index b25cec6eda3b..2c8c9d86dd33 100644 --- a/crates/polars-lazy/src/scan/file_list_reader.rs +++ b/crates/polars-lazy/src/scan/file_list_reader.rs @@ -83,9 +83,10 @@ pub trait LazyFileListReader: Clone { true } + /// Get the sources for this reader. fn sources(&self) -> &ScanSources; - /// Set paths of the scanned files. + /// Set sources of the scanned files. #[must_use] fn with_sources(self, source: ScanSources) -> Self; diff --git a/crates/polars-lazy/src/scan/ipc.rs b/crates/polars-lazy/src/scan/ipc.rs index fa11ef8e4455..e70434e39d8d 100644 --- a/crates/polars-lazy/src/scan/ipc.rs +++ b/crates/polars-lazy/src/scan/ipc.rs @@ -124,16 +124,17 @@ impl LazyFileListReader for LazyIpcReader { impl LazyFrame { /// Create a LazyFrame directly from a ipc scan. pub fn scan_ipc(path: impl AsRef, args: ScanArgsIpc) -> PolarsResult { - LazyIpcReader::new(args) - .with_paths([path.as_ref().to_path_buf()].into()) - .finish() + Self::scan_ipc_sources( + ScanSources::Files([path.as_ref().to_path_buf()].into()), + args, + ) } pub fn scan_ipc_files(paths: Arc<[PathBuf]>, args: ScanArgsIpc) -> PolarsResult { - LazyIpcReader::new(args).with_paths(paths).finish() + Self::scan_ipc_sources(ScanSources::Files(paths), args) } - pub fn scan_ipc_sourced(sources: ScanSources, args: ScanArgsIpc) -> PolarsResult { + pub fn scan_ipc_sources(sources: ScanSources, args: ScanArgsIpc) -> PolarsResult { LazyIpcReader::new(args).with_sources(sources).finish() } } diff --git a/crates/polars-lazy/src/scan/ndjson.rs b/crates/polars-lazy/src/scan/ndjson.rs index 6d0492e170dc..195f8e0372a3 100644 --- a/crates/polars-lazy/src/scan/ndjson.rs +++ b/crates/polars-lazy/src/scan/ndjson.rs @@ -29,10 +29,10 @@ pub struct LazyJsonLineReader { impl LazyJsonLineReader { pub fn new_paths(paths: Arc<[PathBuf]>) -> Self { - Self::new_sourced(ScanSources::Files(paths)) + Self::new_with_sources(ScanSources::Files(paths)) } - pub fn new_sourced(sources: ScanSources) -> Self { + pub fn new_with_sources(sources: ScanSources) -> Self { LazyJsonLineReader { sources, batch_size: None, @@ -50,8 +50,9 @@ impl LazyJsonLineReader { } pub fn new(path: impl AsRef) -> Self { - Self::new_sourced(ScanSources::Files([path.as_ref().to_path_buf()].into())) + Self::new_with_sources(ScanSources::Files([path.as_ref().to_path_buf()].into())) } + /// Add a row index column. 
#[must_use] pub fn with_row_index(mut self, row_index: Option) -> Self { diff --git a/crates/polars-lazy/src/scan/parquet.rs b/crates/polars-lazy/src/scan/parquet.rs index c198ccf690c1..ff4f9a73ec78 100644 --- a/crates/polars-lazy/src/scan/parquet.rs +++ b/crates/polars-lazy/src/scan/parquet.rs @@ -139,18 +139,19 @@ impl LazyFileListReader for LazyParquetReader { impl LazyFrame { /// Create a LazyFrame directly from a parquet scan. pub fn scan_parquet(path: impl AsRef, args: ScanArgsParquet) -> PolarsResult { - LazyParquetReader::new(args) - .with_paths(vec![path.as_ref().to_path_buf()].into()) - .finish() + Self::scan_parquet_sources( + ScanSources::Files([path.as_ref().to_path_buf()].into()), + args, + ) } /// Create a LazyFrame directly from a parquet scan. - pub fn scan_parquet_sourced(sources: ScanSources, args: ScanArgsParquet) -> PolarsResult { + pub fn scan_parquet_sources(sources: ScanSources, args: ScanArgsParquet) -> PolarsResult { LazyParquetReader::new(args).with_sources(sources).finish() } /// Create a LazyFrame directly from a parquet scan. pub fn scan_parquet_files(paths: Arc<[PathBuf]>, args: ScanArgsParquet) -> PolarsResult { - LazyParquetReader::new(args).with_paths(paths).finish() + Self::scan_parquet_sources(ScanSources::Files(paths), args) } } diff --git a/crates/polars-mem-engine/src/executors/scan/ipc.rs b/crates/polars-mem-engine/src/executors/scan/ipc.rs index ae1e3bcf30f2..daa98e209126 100644 --- a/crates/polars-mem-engine/src/executors/scan/ipc.rs +++ b/crates/polars-mem-engine/src/executors/scan/ipc.rs @@ -5,6 +5,7 @@ use polars_error::feature_gated; use polars_io::cloud::CloudOptions; use polars_io::path_utils::is_cloud_url; use polars_io::predicates::apply_predicate; +use polars_utils::mmap::MemSlice; use rayon::prelude::*; use super::*; @@ -13,6 +14,7 @@ pub struct IpcExec { pub(crate) sources: ScanSources, pub(crate) file_info: FileInfo, pub(crate) predicate: Option>, + #[allow(dead_code)] pub(crate) options: IpcScanOptions, pub(crate) file_options: FileScanOptions, pub(crate) hive_parts: Option>>, @@ -72,48 +74,34 @@ impl IpcExec { let read_path = |index: usize, n_rows: Option| { let source = self.sources.at(index); - match source { + let memslice = match source { ScanSourceRef::File(path) => { let file = match idx_to_cached_file(index) { None => std::fs::File::open(path)?, Some(f) => f?, }; - IpcReader::new(file) - .with_n_rows(n_rows) - .with_row_index(self.file_options.row_index.clone()) - .with_projection(projection.clone()) - .with_hive_partition_columns( - self.hive_parts - .as_ref() - .map(|x| x[index].materialize_partition_columns()), - ) - .with_include_file_path( - self.file_options - .include_file_paths - .as_ref() - .map(|x| (x.clone(), Arc::from(source.to_file_path()))), - ) - .memory_mapped(self.options.memory_map.then(|| path.to_path_buf())) - .finish() + MemSlice::from_mmap(Arc::new(unsafe { memmap::Mmap::map(&file).unwrap() })) }, - ScanSourceRef::Buffer(buff) => IpcReader::new(std::io::Cursor::new(buff)) - .with_n_rows(n_rows) - .with_row_index(self.file_options.row_index.clone()) - .with_projection(projection.clone()) - .with_hive_partition_columns( - self.hive_parts - .as_ref() - .map(|x| x[index].materialize_partition_columns()), - ) - .with_include_file_path( - self.file_options - .include_file_paths - .as_ref() - .map(|x| (x.clone(), Arc::from(source.to_file_path()))), - ) - .finish(), - } + ScanSourceRef::Buffer(buff) => MemSlice::from_bytes(buff.clone()), + }; + + IpcReader::new(std::io::Cursor::new(memslice)) + 
.with_n_rows(n_rows) + .with_row_index(self.file_options.row_index.clone()) + .with_projection(projection.clone()) + .with_hive_partition_columns( + self.hive_parts + .as_ref() + .map(|x| x[index].materialize_partition_columns()), + ) + .with_include_file_path( + self.file_options + .include_file_paths + .as_ref() + .map(|x| (x.clone(), Arc::from(source.to_file_path()))), + ) + .finish() }; let mut dfs = if let Some(mut n_rows) = self.file_options.slice.map(|x| { diff --git a/crates/polars-python/src/lazyframe/general.rs b/crates/polars-python/src/lazyframe/general.rs index 30206d0c088b..e09d5cb7f309 100644 --- a/crates/polars-python/src/lazyframe/general.rs +++ b/crates/polars-python/src/lazyframe/general.rs @@ -70,7 +70,7 @@ impl PyLazyFrame { Some(source) => pyobject_to_first_path_and_scan_sources(source)?, }; - let mut r = LazyJsonLineReader::new_sourced(sources); + let mut r = LazyJsonLineReader::new_with_sources(sources); #[cfg(feature = "cloud")] if let Some(first_path) = first_path { @@ -168,7 +168,7 @@ impl PyLazyFrame { Some(source) => pyobject_to_first_path_and_scan_sources(source)?, }; - let mut r = LazyCsvReader::new_sourced(sources); + let mut r = LazyCsvReader::new_with_sources(sources); #[cfg(feature = "cloud")] if let Some(first_path) = first_path { @@ -302,7 +302,7 @@ impl PyLazyFrame { args.cloud_options = Some(cloud_options.with_max_retries(retries)); } - let lf = LazyFrame::scan_parquet_sourced(sources, args).map_err(PyPolarsErr::from)?; + let lf = LazyFrame::scan_parquet_sources(sources, args).map_err(PyPolarsErr::from)?; Ok(lf.into()) } @@ -368,7 +368,7 @@ impl PyLazyFrame { args.cloud_options = Some(cloud_options.with_max_retries(retries)); } - let lf = LazyFrame::scan_ipc_sourced(sources, args).map_err(PyPolarsErr::from)?; + let lf = LazyFrame::scan_ipc_sources(sources, args).map_err(PyPolarsErr::from)?; Ok(lf.into()) } diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 257522831cd3..23d3e86badc4 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -984,7 +984,14 @@ def read_csv_batched( @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_csv( - source: str | Path | list[str] | list[Path] | IO[str] | IO[bytes], + source: str + | Path + | IO[str] + | IO[bytes] + | list[str] + | list[Path] + | list[IO[str]] + | list[IO[bytes]], *, has_header: bool = True, separator: str = ",", @@ -1233,9 +1240,9 @@ def with_column_names(cols: list[str]) -> list[str]: if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) elif ( - isinstance(source, (IO, BytesIO)) + isinstance(source, (BytesIO, StringIO)) or isinstance(source, list) - and isinstance(source[0], BytesIO) + and isinstance(source[0], (BytesIO, StringIO)) ): pass else: @@ -1282,7 +1289,13 @@ def with_column_names(cols: list[str]) -> list[str]: def _scan_csv_impl( - source: str | list[str] | list[Path] | IO[str] | IO[bytes], + source: str + | IO[str] + | IO[bytes] + | list[str] + | list[Path] + | list[IO[str]] + | list[IO[bytes]], *, has_header: bool = True, separator: str = ",", diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index 166e990ba25d..a4d8f62e73b6 100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -257,9 +257,9 @@ def scan_ndjson( if isinstance(source, (str, Path)): source = normalize_filepath(source, 
check_not_directory=False) sources = [] - elif isinstance(source, BytesIO): + elif isinstance(source, (BytesIO, StringIO)): sources = [] - elif isinstance(source, list) and isinstance(source[0], BytesIO): + elif isinstance(source, list) and isinstance(source[0], (BytesIO, StringIO)): sources = source source = None # type: ignore[assignment] else: From 6026101f95c398623673499b050e2d0e1bacb5e7 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Thu, 5 Sep 2024 17:23:59 +0200 Subject: [PATCH 08/27] fix cfg --- crates/polars-plan/src/plans/ir/mod.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index 328efce28be9..cb4cd5f4b203 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -13,7 +13,6 @@ pub use format::{ExprIRDisplay, IRDisplay}; use hive::HivePartitions; use polars_core::error::feature_gated; use polars_core::prelude::*; -use polars_io::file_cache::FileCacheEntry; use polars_utils::idx_vec::UnitVec; use polars_utils::mmap::MemSlice; use polars_utils::unitvec; @@ -72,7 +71,7 @@ impl<'a> ScanSourceRef<'a> { pub fn to_memslice( &self, run_async: bool, - cache_entries: Option<&Vec>>, + cache_entries: Option<&Vec>>, index: usize, ) -> PolarsResult { match self { From ef8d0348180f1a23207e1c99e35c10cdf2d35ca3 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 6 Sep 2024 11:29:53 +0200 Subject: [PATCH 09/27] fix several issues --- .../src/executors/scan/ipc.rs | 5 +++- .../src/executors/scan/parquet.rs | 11 ++++++-- crates/polars-mem-engine/src/utils.rs | 28 +++++++++++-------- .../polars-pipe/src/executors/sources/csv.rs | 15 +++++----- .../src/executors/sources/parquet.rs | 9 ++++-- .../src/plans/conversion/dsl_to_ir.rs | 6 +++- .../polars-plan/src/plans/conversion/scans.rs | 18 ++++++++---- .../polars-plan/src/plans/functions/count.rs | 14 ++++++++-- crates/polars-plan/src/plans/ir/mod.rs | 21 ++++++-------- .../plans/optimizer/predicate_pushdown/mod.rs | 4 ++- .../src/lazyframe/visitor/nodes.rs | 2 +- .../src/physical_plan/lower_ir.rs | 6 ++-- py-polars/tests/unit/io/test_scan.py | 24 ++++++++++++++++ .../tests/unit/streaming/test_streaming_io.py | 24 ++++++++++++++++ 14 files changed, 137 insertions(+), 50 deletions(-) diff --git a/crates/polars-mem-engine/src/executors/scan/ipc.rs b/crates/polars-mem-engine/src/executors/scan/ipc.rs index daa98e209126..856cd8820ba4 100644 --- a/crates/polars-mem-engine/src/executors/scan/ipc.rs +++ b/crates/polars-mem-engine/src/executors/scan/ipc.rs @@ -173,7 +173,10 @@ impl IpcExec { // concurrently. 
use polars_io::file_cache::init_entries_from_uri_list; - let paths = self.sources.into_paths(); + let paths = self + .sources + .into_paths() + .ok_or_else(|| polars_err!(nyi = "Asynchronous scanning of in-memory buffers"))?; tokio::task::block_in_place(|| { let cache_entries = init_entries_from_uri_list( diff --git a/crates/polars-mem-engine/src/executors/scan/parquet.rs b/crates/polars-mem-engine/src/executors/scan/parquet.rs index 7de67aff4284..99581ad2c15d 100644 --- a/crates/polars-mem-engine/src/executors/scan/parquet.rs +++ b/crates/polars-mem-engine/src/executors/scan/parquet.rs @@ -138,7 +138,6 @@ impl ParquetExec { for i in (first_source..self.sources.len()).step_by(step) { let end = std::cmp::min(i.saturating_add(step), self.sources.len()); - let hive_parts = self.hive_parts.as_ref().map(|x| &x[i..end]); if current_offset >= slice_end && !result.is_empty() { return Ok(result); @@ -149,7 +148,10 @@ impl ParquetExec { // files in parallel even if we add row index columns or slices. let iter = (i..end).into_par_iter().map(|i| { let source = self.sources.at(i); - let hive_partitions = hive_parts.map(|x| x[i].materialize_partition_columns()); + let hive_partitions = self + .hive_parts + .as_ref() + .map(|x| x[i].materialize_partition_columns()); let (projection, predicate) = prepare_scan_args( self.predicate.clone(), @@ -249,7 +251,10 @@ impl ParquetExec { use polars_io::utils::slice::split_slice_at_file; let verbose = verbose(); - let paths = self.sources.into_paths(); + let paths = self + .sources + .into_paths() + .ok_or_else(|| polars_err!(nyi = "Asynchronous scanning of in-memory buffers"))?; let first_metadata = &self.metadata; let cloud_options = self.cloud_options.as_ref(); diff --git a/crates/polars-mem-engine/src/utils.rs b/crates/polars-mem-engine/src/utils.rs index b104da3c4e78..06941cbc128d 100644 --- a/crates/polars-mem-engine/src/utils.rs +++ b/crates/polars-mem-engine/src/utils.rs @@ -1,22 +1,28 @@ -use std::path::PathBuf; +use std::path::Path; pub(crate) use polars_plan::plans::ArenaLpIter; -use polars_plan::plans::IR; +use polars_plan::plans::{ScanSources, IR}; use polars_utils::aliases::PlHashSet; use polars_utils::arena::{Arena, Node}; /// Get a set of the data source paths in this LogicalPlan -pub(crate) fn agg_source_paths( +/// +/// # Notes +/// +/// - Scan sources with in-memory buffers are ignored. +pub(crate) fn agg_source_paths<'a>( root_lp: Node, - acc_paths: &mut PlHashSet, - lp_arena: &Arena, + acc_paths: &mut PlHashSet<&'a Path>, + lp_arena: &'a Arena, ) { - lp_arena.iter(root_lp).for_each(|(_, lp)| { - use IR::*; - if let Scan { sources, .. } = lp { - for path in sources.as_paths() { - acc_paths.insert(path.clone()); + for (_, lp) in lp_arena.iter(root_lp) { + if let IR::Scan { sources, .. 
} = lp { + match sources { + ScanSources::Files(paths) => acc_paths.extend(paths.iter().map(|p| p.as_path())), + ScanSources::Buffers(_) => { + // Ignore + }, } } - }) + } } diff --git a/crates/polars-pipe/src/executors/sources/csv.rs b/crates/polars-pipe/src/executors/sources/csv.rs index 673848e67d77..f3267ac1e90a 100644 --- a/crates/polars-pipe/src/executors/sources/csv.rs +++ b/crates/polars-pipe/src/executors/sources/csv.rs @@ -1,5 +1,6 @@ use std::fs::File; +use polars_core::error::feature_gated; use polars_core::{config, POOL}; use polars_io::csv::read::{BatchedCsvReader, CsvReadOptions, CsvReader}; use polars_io::path_utils::is_cloud_url; @@ -36,7 +37,10 @@ impl CsvSource { // otherwise all files would be opened during construction of the pipeline // leading to Too many Open files error fn init_next_reader(&mut self) -> PolarsResult<()> { - let paths = self.sources.as_paths(); + let paths = self + .sources + .as_paths() + .ok_or_else(|| polars_err!(nyi = "Streaming scanning of in-memory buffers"))?; let file_options = self.file_options.clone(); let n_rows = file_options.slice.map(|x| { @@ -105,8 +109,7 @@ impl CsvSource { .with_row_index(row_index); let reader: CsvReader = if run_async { - #[cfg(feature = "cloud")] - { + feature_gated!("cloud", { options.into_reader_with_file_handle( polars_io::file_cache::FILE_CACHE .get_entry(path.to_str().unwrap()) @@ -114,11 +117,7 @@ impl CsvSource { .unwrap() .try_open_assume_latest()?, ) - } - #[cfg(not(feature = "cloud"))] - { - panic!("required feature `cloud` is not enabled") - } + }) } else { options .with_path(Some(path)) diff --git a/crates/polars-pipe/src/executors/sources/parquet.rs b/crates/polars-pipe/src/executors/sources/parquet.rs index e91eb2ec1bba..8592021b2ff3 100644 --- a/crates/polars-pipe/src/executors/sources/parquet.rs +++ b/crates/polars-pipe/src/executors/sources/parquet.rs @@ -77,7 +77,10 @@ impl ParquetSource { usize, Option>, )> { - let paths = self.sources.as_paths(); + let paths = self + .sources + .as_paths() + .ok_or_else(|| polars_err!(nyi = "Streaming scanning of in-memory buffers"))?; let path = &paths[index]; let options = self.options; let file_options = self.file_options.clone(); @@ -256,7 +259,9 @@ impl ParquetSource { verbose: bool, predicate: Option>, ) -> PolarsResult { - let paths = sources.as_paths(); + let paths = sources + .as_paths() + .ok_or_else(|| polars_err!(nyi = "Streaming scanning of in-memory buffers"))?; let n_threads = POOL.current_num_threads(); let iter = 0..paths.len(); diff --git a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs index 1bf06322f090..437db91f1975 100644 --- a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs +++ b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs @@ -193,11 +193,15 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult } else if file_options.hive_options.enabled.unwrap_or(false) && resolved_file_info.reader_schema.is_some() { + let paths = sources + .as_paths() + .ok_or_else(|| polars_err!(nyi = "Hive-partitioning of in-memory buffers"))?; + #[allow(unused_assignments)] let mut owned = None; hive_partitions_from_paths( - sources.as_paths(), + paths, file_options.hive_options.hive_start_idx, file_options.hive_options.schema.clone(), match resolved_file_info.reader_schema.as_ref().unwrap() { diff --git a/crates/polars-plan/src/plans/conversion/scans.rs b/crates/polars-plan/src/plans/conversion/scans.rs index 50da756f9a38..ca5d85b7ba3e 100644 --- 
a/crates/polars-plan/src/plans/conversion/scans.rs +++ b/crates/polars-plan/src/plans/conversion/scans.rs @@ -160,21 +160,24 @@ pub(super) fn csv_file_info( let run_async = sources.is_cloud_url() || config::force_async(); let cache_entries = { - feature_gated!("cloud", { - if run_async { + if run_async { + feature_gated!("cloud", { Some(polars_io::file_cache::init_entries_from_uri_list( sources .as_paths() + .ok_or_else(|| { + polars_err!(nyi = "Asynchronous scanning of in-memory buffers") + })? .iter() .map(|path| Arc::from(path.to_str().unwrap())) .collect::>() .as_slice(), cloud_options, )?) - } else { - None - } - }) + }) + } else { + None + } }; let infer_schema_func = |i| { @@ -280,6 +283,9 @@ pub(super) fn ndjson_file_info( Some(polars_io::file_cache::init_entries_from_uri_list( sources .as_paths() + .ok_or_else(|| { + polars_err!(nyi = "Asynchronous scanning of in-memory buffers") + })? .iter() .map(|path| Arc::from(path.to_str().unwrap())) .collect::>() diff --git a/crates/polars-plan/src/plans/functions/count.rs b/crates/polars-plan/src/plans/functions/count.rs index f8d344217e70..64dc1615d8b7 100644 --- a/crates/polars-plan/src/plans/functions/count.rs +++ b/crates/polars-plan/src/plans/functions/count.rs @@ -126,7 +126,12 @@ pub(super) fn count_rows_parquet( if is_cloud { feature_gated!("cloud", { - get_runtime().block_on(count_rows_cloud_parquet(sources.as_paths(), cloud_options)) + get_runtime().block_on(count_rows_cloud_parquet( + sources.as_paths().ok_or_else(|| { + polars_err!(nyi = "Asynchronous scanning of in-memory buffers") + })?, + cloud_options, + )) }) } else { sources @@ -174,7 +179,9 @@ pub(super) fn count_rows_ipc( if is_cloud { feature_gated!("cloud", { get_runtime().block_on(count_rows_cloud_ipc( - sources.as_paths(), + sources.as_paths().ok_or_else(|| { + polars_err!(nyi = "Asynchronous scanning of in-memory buffers") + })?, cloud_options, metadata, )) @@ -234,6 +241,9 @@ pub(super) fn count_rows_ndjson( Some(polars_io::file_cache::init_entries_from_uri_list( sources .as_paths() + .ok_or_else(|| { + polars_err!(nyi = "Asynchronous scanning of in-memory buffers") + })? 
.iter() .map(|path| Arc::from(path.to_str().unwrap())) .collect::>() diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index cb4cd5f4b203..919a4e635e65 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -71,7 +71,10 @@ impl<'a> ScanSourceRef<'a> { pub fn to_memslice( &self, run_async: bool, - cache_entries: Option<&Vec>>, + #[cfg(feature = "cloud")] cache_entries: Option< + &Vec>, + >, + #[cfg(not(feature = "cloud"))] cache_entries: Option<&()>, index: usize, ) -> PolarsResult { match self { @@ -99,24 +102,18 @@ impl ScanSources { offset: 0, } } - pub fn as_paths(&self) -> &[PathBuf] { - match self { - Self::Files(paths) => paths, - Self::Buffers(_) => unimplemented!(), - } - } - pub fn try_into_paths(&self) -> Option> { + pub fn as_paths(&self) -> Option<&[PathBuf]> { match self { - Self::Files(paths) => Some(paths.clone()), + Self::Files(paths) => Some(paths.as_ref()), Self::Buffers(_) => None, } } - pub fn into_paths(&self) -> Arc<[PathBuf]> { + pub fn into_paths(&self) -> Option> { match self { - Self::Files(paths) => paths.clone(), - Self::Buffers(_) => unimplemented!(), + Self::Files(paths) => Some(paths.clone()), + Self::Buffers(_) => None, } } diff --git a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs index d5aefb2a16d7..f42a7ca7239b 100644 --- a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs +++ b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs @@ -366,7 +366,9 @@ impl<'a> PredicatePushDown<'a> { if let (Some(hive_parts), Some(predicate)) = (&scan_hive_parts, &predicate) { if let Some(io_expr) = self.expr_eval.unwrap()(predicate, expr_arena) { if let Some(stats_evaluator) = io_expr.as_stats_evaluator() { - let paths = sources.as_paths(); + let paths = sources.as_paths().ok_or_else(|| { + polars_err!(nyi = "Hive partitioning of in-memory buffers") + })?; let mut new_paths = Vec::with_capacity(paths.len()); let mut new_hive_parts = Vec::with_capacity(paths.len()); diff --git a/crates/polars-python/src/lazyframe/visitor/nodes.rs b/crates/polars-python/src/lazyframe/visitor/nodes.rs index 37a51e4d481d..4e9344a61d15 100644 --- a/crates/polars-python/src/lazyframe/visitor/nodes.rs +++ b/crates/polars-python/src/lazyframe/visitor/nodes.rs @@ -326,7 +326,7 @@ pub(crate) fn into_py(py: Python<'_>, plan: &IR) -> PyResult { file_options, } => Scan { paths: sources - .try_into_paths() + .into_paths() .ok_or_else(|| PyNotImplementedError::new_err("scan with BytesIO"))? 
.to_object(py), // TODO: file info diff --git a/crates/polars-stream/src/physical_plan/lower_ir.rs b/crates/polars-stream/src/physical_plan/lower_ir.rs index d50d90afe52a..b993ea6ac557 100644 --- a/crates/polars-stream/src/physical_plan/lower_ir.rs +++ b/crates/polars-stream/src/physical_plan/lower_ir.rs @@ -2,7 +2,7 @@ use std::sync::Arc; use polars_core::prelude::{InitHashMaps, PlHashMap, PlIndexMap}; use polars_core::schema::{IndexOfSchema, Schema}; -use polars_error::PolarsResult; +use polars_error::{polars_err, PolarsResult}; use polars_plan::plans::expr_ir::{ExprIR, OutputName}; use polars_plan::plans::{AExpr, IR}; use polars_plan::prelude::SinkType; @@ -343,7 +343,9 @@ pub fn lower_ir( unreachable!(); }; - let paths = sources.into_paths(); + let paths = sources + .into_paths() + .ok_or_else(|| polars_err!(nyi = "Streaming scanning of in-memory buffers"))?; PhysNodeKind::FileScan { paths, diff --git a/py-polars/tests/unit/io/test_scan.py b/py-polars/tests/unit/io/test_scan.py index a254daaeaa12..340c4741fa48 100644 --- a/py-polars/tests/unit/io/test_scan.py +++ b/py-polars/tests/unit/io/test_scan.py @@ -737,3 +737,27 @@ def test_scan_in_memory(method: str) -> None: g.seek(0) result = (getattr(pl, f"scan_{method}"))([f, g]).slice(-1, 1).collect() assert_frame_equal(df.vstack(df).slice(-1, 1), result) + + +@pytest.mark.parametrize( + "method", + ["parquet", "csv", "ipc", "ndjson"], +) +def test_nyi_async_scan_in_memory(method: str, monkeypatch: pytest.MonkeyPatch) -> None: + f = io.BytesIO() + df = pl.DataFrame( + { + "a": [1, 2, 3], + "b": ["x", "y", "z"], + } + ) + + (getattr(df, f"write_{method}"))(f) + + f.seek(0) + _enable_force_async(monkeypatch) + with pytest.raises( + pl.exceptions.ComputeError, + match="not yet implemented: Asynchronous scanning of in-memory buffers", + ): + (getattr(pl, f"scan_{method}"))(f).collect() diff --git a/py-polars/tests/unit/streaming/test_streaming_io.py b/py-polars/tests/unit/streaming/test_streaming_io.py index ff526d609a0a..0cbf0d90e4ba 100644 --- a/py-polars/tests/unit/streaming/test_streaming_io.py +++ b/py-polars/tests/unit/streaming/test_streaming_io.py @@ -1,5 +1,6 @@ from __future__ import annotations +import io from typing import TYPE_CHECKING, Any from unittest.mock import patch @@ -294,3 +295,26 @@ def test_streaming_empty_parquet_16523(tmp_path: Path) -> None: q = pl.scan_parquet(file_path) q2 = pl.LazyFrame({"a": [1]}, schema={"a": pl.Int32}) assert q.join(q2, on="a").collect(streaming=True).shape == (0, 1) + + +@pytest.mark.parametrize( + "method", + ["parquet", "csv"], +) +def test_nyi_scan_in_memory(method: str) -> None: + f = io.BytesIO() + df = pl.DataFrame( + { + "a": [1, 2, 3], + "b": ["x", "y", "z"], + } + ) + + (getattr(df, f"write_{method}"))(f) + + f.seek(0) + with pytest.raises( + pl.exceptions.ComputeError, + match="not yet implemented: Streaming scanning of in-memory buffers", + ): + (getattr(pl, f"scan_{method}"))(f).collect(streaming=True) From 04932705461759f0df5f42bc6f6f3d9d0e0bb062 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 6 Sep 2024 11:55:56 +0200 Subject: [PATCH 10/27] fix: #18581 --- .../polars-plan/src/plans/functions/count.rs | 12 +++++----- crates/polars-plan/src/plans/functions/mod.rs | 22 +++++++++++++++++-- crates/polars-plan/src/plans/ir/mod.rs | 2 +- .../tests/unit/lazyframe/optimizations.py | 13 +++++++++++ 4 files changed, 41 insertions(+), 8 deletions(-) diff --git a/crates/polars-plan/src/plans/functions/count.rs b/crates/polars-plan/src/plans/functions/count.rs index 
64dc1615d8b7..d30d6ef91d09 100644 --- a/crates/polars-plan/src/plans/functions/count.rs +++ b/crates/polars-plan/src/plans/functions/count.rs @@ -25,7 +25,11 @@ use polars_io::SerReader; use super::*; #[allow(unused_variables)] -pub fn count_rows(sources: &ScanSources, scan_type: &FileScan) -> PolarsResult { +pub fn count_rows( + sources: &ScanSources, + scan_type: &FileScan, + alias: Option, +) -> PolarsResult { #[cfg(not(any( feature = "parquet", feature = "ipc", @@ -77,10 +81,8 @@ pub fn count_rows(sources: &ScanSources, scan_type: &FileScan) -> PolarsResult python_udf::call_python_udf(function, df, *validate_output, schema.as_deref()), FastCount { - sources, scan_type, .. - } => count::count_rows(sources, scan_type), + sources, + scan_type, + alias, + } => count::count_rows(sources, scan_type, alias.clone()), Rechunk => { df.as_single_chunk_par(); Ok(df) @@ -344,6 +347,21 @@ impl Display for FunctionIR { write!(f, "STREAMING") } }, + FastCount { + sources, + scan_type, + alias, + } => { + let scan_type: &str = scan_type.into(); + let default_column_name = PlSmallStr::from_static(crate::constants::LEN); + let alias = alias.as_ref().unwrap_or(&default_column_name); + + write!( + f, + "FAST COUNT ({scan_type}) {} as \"{alias}\"", + ScanSourcesDisplay(&sources) + ) + }, v => { let s: &str = v.into(); write!(f, "{s}") diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index 919a4e635e65..a1c96d41ece1 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -8,7 +8,7 @@ use std::borrow::Cow; use std::fmt; use std::path::{Path, PathBuf}; -pub use dot::{EscapeLabel, IRDotDisplay, PathsDisplay}; +pub use dot::{EscapeLabel, IRDotDisplay, PathsDisplay, ScanSourcesDisplay}; pub use format::{ExprIRDisplay, IRDisplay}; use hive::HivePartitions; use polars_core::error::feature_gated; diff --git a/py-polars/tests/unit/lazyframe/optimizations.py b/py-polars/tests/unit/lazyframe/optimizations.py index a44816fad0e6..2417edecdeb8 100644 --- a/py-polars/tests/unit/lazyframe/optimizations.py +++ b/py-polars/tests/unit/lazyframe/optimizations.py @@ -1,3 +1,5 @@ +import io + import polars as pl from polars.testing import assert_frame_equal @@ -27,3 +29,14 @@ def test_double_sort_maintain_order_18558() -> None: ) assert_frame_equal(lf.collect(), expect) + + +def test_fast_count_alias_18581() -> None: + f = io.BytesIO() + f.write(b"a,b,c\n1,2,3\n4,5,6") + f.flush() + f.seek(0) + + df = pl.scan_csv(f).select(pl.len().alias("weird_name")).collect() + + assert_frame_equal(pl.DataFrame({"weird_name": 2}), df) From b97b529b934788b9ebf91a68bc304851f6367c44 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 6 Sep 2024 12:34:23 +0200 Subject: [PATCH 11/27] fix StringIO loading for scan_csv, scan_ndjson --- crates/polars-python/src/file.rs | 30 +++++++++++++++++++--------- py-polars/polars/dataframe/frame.py | 10 ++++++++++ py-polars/tests/unit/io/test_scan.py | 28 ++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 9 deletions(-) diff --git a/crates/polars-python/src/file.rs b/crates/polars-python/src/file.rs index 3d180e2bedf3..3cbb3d364e2f 100644 --- a/crates/polars-python/src/file.rs +++ b/crates/polars-python/src/file.rs @@ -12,7 +12,7 @@ use polars::io::mmap::MmapBytesReader; use polars_error::{polars_err, polars_warn}; use pyo3::exceptions::PyTypeError; use pyo3::prelude::*; -use pyo3::types::{PyBytes, PyString}; +use pyo3::types::{PyBytes, PyString, PyStringMethods}; use crate::error::PyPolarsErr; use 
crate::prelude::resolve_homedir; @@ -47,11 +47,19 @@ impl PyFileLikeObject { .call_method_bound(py, "read", (), None) .expect("no read method found"); - let bytes: &Bound<'_, PyBytes> = bytes - .downcast_bound(py) - .expect("Expecting to be able to downcast into bytes from read result."); + if let Ok(bytes) = bytes.downcast_bound::(py) { + return bytes.as_bytes().to_vec(); + } + + if let Ok(bytes) = bytes.downcast_bound::(py) { + return bytes + .to_cow() + .expect("PyString is not valid UTF-8") + .into_owned() + .into_bytes(); + } - bytes.as_bytes().to_vec() + panic!("Expecting to be able to downcast into bytes from read result."); }); Cursor::new(buf) @@ -215,8 +223,10 @@ pub fn get_either_file_or_path(py_f: PyObject, write: bool) -> PyResult str: + with BytesIO() as buf: + self.write_csv(buf) + csv_bytes = buf.getvalue() + return csv_bytes.decode("utf8") + should_return_buffer = False if file is None: buffer = file = BytesIO() should_return_buffer = True + elif isinstance(file, StringIO): + csv_str = write_csv_to_string() + file.write(csv_str) + return None elif isinstance(file, (str, os.PathLike)): file = normalize_filepath(file) diff --git a/py-polars/tests/unit/io/test_scan.py b/py-polars/tests/unit/io/test_scan.py index 340c4741fa48..a4c3ac1d133c 100644 --- a/py-polars/tests/unit/io/test_scan.py +++ b/py-polars/tests/unit/io/test_scan.py @@ -739,6 +739,34 @@ def test_scan_in_memory(method: str) -> None: assert_frame_equal(df.vstack(df).slice(-1, 1), result) +@pytest.mark.parametrize( + "method", + ["csv", "ndjson"], +) +def test_scan_stringio(method: str) -> None: + f = io.StringIO() + df = pl.DataFrame( + { + "a": [1, 2, 3], + "b": ["x", "y", "z"], + } + ) + + (getattr(df, f"write_{method}"))(f) + + f.seek(0) + result = (getattr(pl, f"scan_{method}"))(f).collect() + assert_frame_equal(df, result) + + g = io.StringIO() + (getattr(df, f"write_{method}"))(g) + + f.seek(0) + g.seek(0) + result = (getattr(pl, f"scan_{method}"))([f, g]).collect() + assert_frame_equal(df.vstack(df), result) + + @pytest.mark.parametrize( "method", ["parquet", "csv", "ipc", "ndjson"], From 49bcc85df371a6dc0b7e315300057606675baca3 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 6 Sep 2024 14:34:25 +0200 Subject: [PATCH 12/27] fix async and source lists --- .../src/executors/scan/csv.rs | 4 +- .../src/executors/scan/ipc.rs | 7 +- .../src/executors/scan/ndjson.rs | 4 +- .../src/executors/scan/parquet.rs | 7 +- .../polars-plan/src/plans/conversion/scans.rs | 14 +-- .../polars-plan/src/plans/functions/count.rs | 14 +-- crates/polars-plan/src/plans/ir/mod.rs | 4 + py-polars/polars/io/csv/functions.py | 6 +- py-polars/polars/io/ipc/functions.py | 4 +- py-polars/polars/io/ndjson.py | 6 +- py-polars/polars/io/parquet/functions.py | 113 +++++++++--------- .../tests/unit/io/test_lazy_count_star.py | 10 +- py-polars/tests/unit/io/test_parquet.py | 42 +++++++ py-polars/tests/unit/io/test_scan.py | 23 +--- 14 files changed, 138 insertions(+), 120 deletions(-) diff --git a/crates/polars-mem-engine/src/executors/scan/csv.rs b/crates/polars-mem-engine/src/executors/scan/csv.rs index 7a2ac0d34950..a9048481a4ac 100644 --- a/crates/polars-mem-engine/src/executors/scan/csv.rs +++ b/crates/polars-mem-engine/src/executors/scan/csv.rs @@ -57,9 +57,9 @@ impl CsvExec { let verbose = config::verbose(); let force_async = config::force_async(); - let run_async = force_async || self.sources.is_cloud_url(); + let run_async = (self.sources.is_files() && force_async) || self.sources.is_cloud_url(); - if force_async && verbose { + if 
self.sources.is_files() && force_async && verbose { eprintln!("ASYNC READING FORCED"); } diff --git a/crates/polars-mem-engine/src/executors/scan/ipc.rs b/crates/polars-mem-engine/src/executors/scan/ipc.rs index 856cd8820ba4..33ad2f54f429 100644 --- a/crates/polars-mem-engine/src/executors/scan/ipc.rs +++ b/crates/polars-mem-engine/src/executors/scan/ipc.rs @@ -29,7 +29,7 @@ impl IpcExec { }; let force_async = config::force_async(); - let mut out = if is_cloud || force_async { + let mut out = if is_cloud || (self.sources.is_files() && force_async) { feature_gated!("cloud", { if force_async && config::verbose() { eprintln!("ASYNC READING FORCED"); @@ -173,10 +173,7 @@ impl IpcExec { // concurrently. use polars_io::file_cache::init_entries_from_uri_list; - let paths = self - .sources - .into_paths() - .ok_or_else(|| polars_err!(nyi = "Asynchronous scanning of in-memory buffers"))?; + let paths = self.sources.into_paths().unwrap(); tokio::task::block_in_place(|| { let cache_entries = init_entries_from_uri_list( diff --git a/crates/polars-mem-engine/src/executors/scan/ndjson.rs b/crates/polars-mem-engine/src/executors/scan/ndjson.rs index b37f76ee826d..fb55cb2b38e5 100644 --- a/crates/polars-mem-engine/src/executors/scan/ndjson.rs +++ b/crates/polars-mem-engine/src/executors/scan/ndjson.rs @@ -41,9 +41,9 @@ impl JsonExec { let verbose = config::verbose(); let force_async = config::force_async(); - let run_async = force_async || self.sources.is_cloud_url(); + let run_async = (self.sources.is_files() && force_async) || self.sources.is_cloud_url(); - if force_async && verbose { + if self.sources.is_files() && force_async && verbose { eprintln!("ASYNC READING FORCED"); } diff --git a/crates/polars-mem-engine/src/executors/scan/parquet.rs b/crates/polars-mem-engine/src/executors/scan/parquet.rs index 99581ad2c15d..fd0f53b6d728 100644 --- a/crates/polars-mem-engine/src/executors/scan/parquet.rs +++ b/crates/polars-mem-engine/src/executors/scan/parquet.rs @@ -251,10 +251,7 @@ impl ParquetExec { use polars_io::utils::slice::split_slice_at_file; let verbose = verbose(); - let paths = self - .sources - .into_paths() - .ok_or_else(|| polars_err!(nyi = "Asynchronous scanning of in-memory buffers"))?; + let paths = self.sources.into_paths().unwrap(); let first_metadata = &self.metadata; let cloud_options = self.cloud_options.as_ref(); @@ -474,7 +471,7 @@ impl ParquetExec { let is_cloud = self.sources.is_cloud_url(); let force_async = config::force_async(); - let out = if is_cloud || force_async { + let out = if is_cloud || (self.sources.is_files() && force_async) { feature_gated!("cloud", { if force_async && config::verbose() { eprintln!("ASYNC READING FORCED"); diff --git a/crates/polars-plan/src/plans/conversion/scans.rs b/crates/polars-plan/src/plans/conversion/scans.rs index ca5d85b7ba3e..f7c64ed7612b 100644 --- a/crates/polars-plan/src/plans/conversion/scans.rs +++ b/crates/polars-plan/src/plans/conversion/scans.rs @@ -152,12 +152,14 @@ pub(super) fn csv_file_info( use polars_io::utils::get_reader_bytes; use rayon::iter::{IntoParallelIterator, ParallelIterator}; + polars_ensure!(!sources.is_empty(), ComputeError: "expected at least 1 source"); + // TODO: // * See if we can do better than scanning all files if there is a row limit // * See if we can do this without downloading the entire file // prints the error message if paths is empty. 
- let run_async = sources.is_cloud_url() || config::force_async(); + let run_async = sources.is_cloud_url() || (sources.is_files() && config::force_async()); let cache_entries = { if run_async { @@ -165,9 +167,7 @@ pub(super) fn csv_file_info( Some(polars_io::file_cache::init_entries_from_uri_list( sources .as_paths() - .ok_or_else(|| { - polars_err!(nyi = "Asynchronous scanning of in-memory buffers") - })? + .unwrap() .iter() .map(|path| Arc::from(path.to_str().unwrap())) .collect::>() @@ -275,7 +275,7 @@ pub(super) fn ndjson_file_info( polars_bail!(ComputeError: "expected at least 1 source"); }; - let run_async = sources.is_cloud_url() || config::force_async(); + let run_async = sources.is_cloud_url() || (sources.is_files() && config::force_async()); let cache_entries = { if run_async { @@ -283,9 +283,7 @@ pub(super) fn ndjson_file_info( Some(polars_io::file_cache::init_entries_from_uri_list( sources .as_paths() - .ok_or_else(|| { - polars_err!(nyi = "Asynchronous scanning of in-memory buffers") - })? + .unwrap() .iter() .map(|path| Arc::from(path.to_str().unwrap())) .collect::>() diff --git a/crates/polars-plan/src/plans/functions/count.rs b/crates/polars-plan/src/plans/functions/count.rs index d30d6ef91d09..3bba674edb89 100644 --- a/crates/polars-plan/src/plans/functions/count.rs +++ b/crates/polars-plan/src/plans/functions/count.rs @@ -129,9 +129,7 @@ pub(super) fn count_rows_parquet( if is_cloud { feature_gated!("cloud", { get_runtime().block_on(count_rows_cloud_parquet( - sources.as_paths().ok_or_else(|| { - polars_err!(nyi = "Asynchronous scanning of in-memory buffers") - })?, + sources.as_paths().unwrap(), cloud_options, )) }) @@ -181,9 +179,7 @@ pub(super) fn count_rows_ipc( if is_cloud { feature_gated!("cloud", { get_runtime().block_on(count_rows_cloud_ipc( - sources.as_paths().ok_or_else(|| { - polars_err!(nyi = "Asynchronous scanning of in-memory buffers") - })?, + sources.as_paths().unwrap(), cloud_options, metadata, )) @@ -235,7 +231,7 @@ pub(super) fn count_rows_ndjson( } let is_cloud_url = sources.is_cloud_url(); - let run_async = is_cloud_url || config::force_async(); + let run_async = is_cloud_url || (sources.is_files() && config::force_async()); let cache_entries = { feature_gated!("cloud", { @@ -243,9 +239,7 @@ pub(super) fn count_rows_ndjson( Some(polars_io::file_cache::init_entries_from_uri_list( sources .as_paths() - .ok_or_else(|| { - polars_err!(nyi = "Asynchronous scanning of in-memory buffers") - })? 
+ .unwrap() .iter() .map(|path| Arc::from(path.to_str().unwrap())) .collect::>() diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index a1c96d41ece1..e2f2ca3eae3d 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -131,6 +131,10 @@ impl ScanSources { } } + pub fn is_files(&self) -> bool { + matches!(self, Self::Files(_)) + } + pub fn is_cloud_url(&self) -> bool { match self { Self::Files(paths) => paths.first().map_or(false, polars_io::is_cloud_url), diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 23d3e86badc4..a6002602af6a 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -1239,9 +1239,9 @@ def with_column_names(cols: list[str]) -> list[str]: if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - elif ( - isinstance(source, (BytesIO, StringIO)) - or isinstance(source, list) + elif isinstance(source, (BytesIO, StringIO)) or ( + isinstance(source, list) + and len(source) > 0 and isinstance(source[0], (BytesIO, StringIO)) ): pass diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index 8c0138df2a36..9945e1c6cbb7 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -432,7 +432,9 @@ def scan_ipc( sources = [] elif isinstance(source, BytesIO): sources = [] - elif isinstance(source, list) and isinstance(source[0], BytesIO): + elif ( + isinstance(source, list) and len(source) > 0 and isinstance(source[0], BytesIO) + ): sources = source source = None # type: ignore[assignment] else: diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index a4d8f62e73b6..dfea6cf2871f 100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -259,7 +259,11 @@ def scan_ndjson( sources = [] elif isinstance(source, (BytesIO, StringIO)): sources = [] - elif isinstance(source, list) and isinstance(source[0], (BytesIO, StringIO)): + elif ( + isinstance(source, list) + and len(source) > 0 + and isinstance(source[0], (BytesIO, StringIO)) + ): sources = source source = None # type: ignore[assignment] else: diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 2eda346e7c26..7e1b2b9a93b1 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -3,26 +3,26 @@ import contextlib import io from pathlib import Path -from typing import IO, TYPE_CHECKING, Any, Sequence +from typing import IO, TYPE_CHECKING, Any import polars.functions as F +from polars import concat as plconcat from polars._utils.deprecation import deprecate_renamed_parameter from polars._utils.unstable import issue_unstable_warning from polars._utils.various import ( is_int_sequence, normalize_filepath, ) -from polars._utils.wrap import wrap_df, wrap_ldf +from polars._utils.wrap import wrap_ldf from polars.convert import from_arrow from polars.dependencies import import_optional from polars.io._utils import ( - parse_columns_arg, parse_row_index_args, prepare_file_arg, ) with contextlib.suppress(ImportError): - from polars.polars import PyDataFrame, PyLazyFrame + from polars.polars import PyLazyFrame from polars.polars import read_parquet_schema as _read_parquet_schema if TYPE_CHECKING: @@ -33,7 +33,14 @@ @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") 
@deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def read_parquet( - source: str | Path | list[str] | list[Path] | IO[bytes] | bytes, + source: str + | Path + | IO[bytes] + | bytes + | list[str] + | list[Path] + | list[IO[bytes]] + | list[bytes], *, columns: list[int] | list[str] | None = None, n_rows: int | None = None, @@ -166,18 +173,10 @@ def read_parquet( ) # Read file and bytes inputs using `read_parquet` - elif isinstance(source, (io.IOBase, bytes)): - return _read_parquet_binary( - source, - columns=columns, - n_rows=n_rows, - parallel=parallel, - row_index_name=row_index_name, - row_index_offset=row_index_offset, - low_memory=low_memory, - use_statistics=use_statistics, - rechunk=rechunk, - ) + if isinstance(source, bytes): + source = io.BytesIO(source) + elif isinstance(source, list) and len(source) > 0 and isinstance(source[0], bytes): + source = [io.BytesIO(s) for s in source] # type: ignore[arg-type] # For other inputs, defer to `scan_parquet` lf = scan_parquet( @@ -209,7 +208,14 @@ def read_parquet( def _read_parquet_with_pyarrow( - source: str | Path | list[str] | list[Path] | IO[bytes] | bytes, + source: str + | Path + | IO[bytes] + | bytes + | list[str] + | list[Path] + | list[IO[bytes]] + | list[bytes], *, columns: list[int] | list[str] | None = None, storage_options: dict[str, Any] | None = None, @@ -224,48 +230,35 @@ def _read_parquet_with_pyarrow( ) pyarrow_options = pyarrow_options or {} - with prepare_file_arg( - source, # type: ignore[arg-type] - use_pyarrow=True, - storage_options=storage_options, - ) as source_prep: - pa_table = pyarrow_parquet.read_table( - source_prep, - memory_map=memory_map, - columns=columns, - **pyarrow_options, - ) - return from_arrow(pa_table, rechunk=rechunk) # type: ignore[return-value] - + if ( + isinstance(source, list) + and len(source) > 0 + and isinstance(source[0], (bytes, io.BytesIO)) + ): + sources = source + else: + sources = [source] -def _read_parquet_binary( - source: IO[bytes] | bytes, - *, - columns: Sequence[int] | Sequence[str] | None = None, - n_rows: int | None = None, - row_index_name: str | None = None, - row_index_offset: int = 0, - parallel: ParallelStrategy = "auto", - use_statistics: bool = True, - rechunk: bool = False, - low_memory: bool = False, -) -> DataFrame: - projection, columns = parse_columns_arg(columns) - row_index = parse_row_index_args(row_index_name, row_index_offset) + results = [] + for source in sources: + with prepare_file_arg( + source, # type: ignore[arg-type] + use_pyarrow=True, + storage_options=storage_options, + ) as source_prep: + pa_table = pyarrow_parquet.read_table( + source_prep, + memory_map=memory_map, + columns=columns, + **pyarrow_options, + ) + result = from_arrow(pa_table, rechunk=rechunk) # type: ignore[return-value] + results.append(result) - with prepare_file_arg(source) as source_prep: - pydf = PyDataFrame.read_parquet( - source_prep, - columns=columns, - projection=projection, - n_rows=n_rows, - row_index=row_index, - parallel=parallel, - use_statistics=use_statistics, - rechunk=rechunk, - low_memory=low_memory, - ) - return wrap_df(pydf) + if len(results) == 1: + return results[0] + else: + return plconcat(results) def read_parquet_schema(source: str | Path | IO[bytes] | bytes) -> dict[str, DataType]: @@ -423,7 +416,9 @@ def scan_parquet( if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) elif isinstance(source, io.BytesIO) or ( - isinstance(source, list) and isinstance(source[0], 
io.BytesIO) + isinstance(source, list) + and len(source) > 0 + and isinstance(source[0], io.BytesIO) ): pass else: diff --git a/py-polars/tests/unit/io/test_lazy_count_star.py b/py-polars/tests/unit/io/test_lazy_count_star.py index 7b988bed75c7..a2c03596dd15 100644 --- a/py-polars/tests/unit/io/test_lazy_count_star.py +++ b/py-polars/tests/unit/io/test_lazy_count_star.py @@ -23,7 +23,7 @@ def test_count_csv(io_files_path: Path, path: str, n_rows: int) -> None: expected = pl.DataFrame(pl.Series("len", [n_rows], dtype=pl.UInt32)) # Check if we are using our fast count star - assert "FAST_COUNT" in lf.explain() + assert "FAST COUNT" in lf.explain() assert_frame_equal(lf.collect(), expected) @@ -42,7 +42,7 @@ def test_commented_csv() -> None: expected = pl.DataFrame(pl.Series("len", [2], dtype=pl.UInt32)) lf = pl.scan_csv(csv_a.name, comment_prefix="#").select(pl.len()) - assert "FAST_COUNT" in lf.explain() + assert "FAST COUNT" in lf.explain() assert_frame_equal(lf.collect(), expected) @@ -55,7 +55,7 @@ def test_count_parquet(io_files_path: Path, pattern: str, n_rows: int) -> None: expected = pl.DataFrame(pl.Series("len", [n_rows], dtype=pl.UInt32)) # Check if we are using our fast count star - assert "FAST_COUNT" in lf.explain() + assert "FAST COUNT" in lf.explain() assert_frame_equal(lf.collect(), expected) @@ -68,7 +68,7 @@ def test_count_ipc(io_files_path: Path, path: str, n_rows: int) -> None: expected = pl.DataFrame(pl.Series("len", [n_rows], dtype=pl.UInt32)) # Check if we are using our fast count star - assert "FAST_COUNT" in lf.explain() + assert "FAST COUNT" in lf.explain() assert_frame_equal(lf.collect(), expected) @@ -81,7 +81,7 @@ def test_count_ndjson(io_files_path: Path, path: str, n_rows: int) -> None: expected = pl.DataFrame(pl.Series("len", [n_rows], dtype=pl.UInt32)) # Check if we are using our fast count star - assert "FAST_COUNT" in lf.explain() + assert "FAST COUNT" in lf.explain() assert_frame_equal(lf.collect(), expected) diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index f57d8bbf5b38..fe918c866af7 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -1835,3 +1835,45 @@ def test_row_index_projection_pushdown_18463( df.select("index").slice(1, 1).collect(), df.collect().select("index").slice(1, 1), ) + + +def test_concat_multiple_inmem() -> None: + f = io.BytesIO() + g = io.BytesIO() + + df1 = pl.DataFrame( + { + "a": [1, 2, 3], + "b": ["xyz", "abc", "wow"], + } + ) + df2 = pl.DataFrame( + { + "a": [5, 6, 7], + "b": ["a", "few", "entries"], + } + ) + + dfs = pl.concat([df1, df2]) + + df1.write_parquet(f) + df2.write_parquet(g) + + f.seek(0) + g.seek(0) + + assert_frame_equal(pl.read_parquet([f, g]), dfs) + + f.seek(0) + g.seek(0) + + assert_frame_equal(pl.read_parquet([f, g], use_pyarrow=True), dfs) + + f.seek(0) + g.seek(0) + + fb = f.read() + gb = g.read() + + assert_frame_equal(pl.read_parquet([fb, gb]), dfs) + assert_frame_equal(pl.read_parquet([fb, gb], use_pyarrow=True), dfs) diff --git a/py-polars/tests/unit/io/test_scan.py b/py-polars/tests/unit/io/test_scan.py index a4c3ac1d133c..15e30ef87274 100644 --- a/py-polars/tests/unit/io/test_scan.py +++ b/py-polars/tests/unit/io/test_scan.py @@ -769,23 +769,8 @@ def test_scan_stringio(method: str) -> None: @pytest.mark.parametrize( "method", - ["parquet", "csv", "ipc", "ndjson"], + [pl.scan_parquet, pl.scan_csv, pl.scan_ipc, pl.scan_ndjson], ) -def test_nyi_async_scan_in_memory(method: str, monkeypatch: 
pytest.MonkeyPatch) -> None: - f = io.BytesIO() - df = pl.DataFrame( - { - "a": [1, 2, 3], - "b": ["x", "y", "z"], - } - ) - - (getattr(df, f"write_{method}"))(f) - - f.seek(0) - _enable_force_async(monkeypatch) - with pytest.raises( - pl.exceptions.ComputeError, - match="not yet implemented: Asynchronous scanning of in-memory buffers", - ): - (getattr(pl, f"scan_{method}"))(f).collect() +def test_empty_list(method: Callable[[list[str]], pl.LazyFrame]) -> None: + with pytest.raises(pl.exceptions.ComputeError, match="expected at least 1 source"): + _ = (method)([]).collect() From 3070d7e26e93a83e4d008dcdab550753055a76a6 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 6 Sep 2024 14:40:36 +0200 Subject: [PATCH 13/27] small fixes --- crates/polars-plan/src/plans/functions/mod.rs | 2 +- py-polars/polars/io/csv/functions.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/crates/polars-plan/src/plans/functions/mod.rs b/crates/polars-plan/src/plans/functions/mod.rs index 2729596b9ab7..61cce46de9af 100644 --- a/crates/polars-plan/src/plans/functions/mod.rs +++ b/crates/polars-plan/src/plans/functions/mod.rs @@ -359,7 +359,7 @@ impl Display for FunctionIR { write!( f, "FAST COUNT ({scan_type}) {} as \"{alias}\"", - ScanSourcesDisplay(&sources) + ScanSourcesDisplay(sources) ) }, v => { diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index a6002602af6a..7164856901c6 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -443,6 +443,8 @@ def read_csv( # * The `storage_options` configuration keys are different between # fsspec and object_store (would require a breaking change) ): + source = normalize_filepath(source, check_not_directory=False) + if schema_overrides_is_list: msg = "passing a list to `schema_overrides` is unsupported for hf:// paths" raise ValueError(msg) From 8b4b5232a5afbaf22064e76dcbcadf4956e86ffd Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 6 Sep 2024 15:29:26 +0200 Subject: [PATCH 14/27] fix a million mypy errors --- py-polars/polars/io/csv/functions.py | 10 +++++--- py-polars/polars/io/ipc/functions.py | 29 +++++++++++++----------- py-polars/polars/io/ndjson.py | 13 ++++++++--- py-polars/polars/io/parquet/functions.py | 26 +++++++++++---------- py-polars/tests/unit/io/test_parquet.py | 7 +++--- 5 files changed, 51 insertions(+), 34 deletions(-) diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 7164856901c6..d8bc983e0fcc 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -443,7 +443,7 @@ def read_csv( # * The `storage_options` configuration keys are different between # fsspec and object_store (would require a breaking change) ): - source = normalize_filepath(source, check_not_directory=False) + source = normalize_filepath(v, check_not_directory=False) if schema_overrides_is_list: msg = "passing a list to `schema_overrides` is unsupported for hf:// paths" @@ -453,7 +453,7 @@ def read_csv( raise ValueError(msg) lf = _scan_csv_impl( - source, # type: ignore[arg-type] + source, has_header=has_header, separator=separator, comment_prefix=comment_prefix, @@ -1249,7 +1249,11 @@ def with_column_names(cols: list[str]) -> list[str]: pass else: source = [ - normalize_filepath(source, check_not_directory=False) for source in source + normalize_filepath( + source, # type: ignore[arg-type] + check_not_directory=False, + ) + for source in source ] if not infer_schema: diff --git 
a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index 9945e1c6cbb7..e0e213c376da 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -2,7 +2,7 @@ import contextlib import os -from io import BytesIO +from io import BytesIO, BufferedIOBase from pathlib import Path from typing import IO, TYPE_CHECKING, Any, Sequence @@ -427,20 +427,23 @@ def scan_ipc( include_file_paths Include the path of the source file(s) as a column with this name. """ + + sources: list[str] | list[Path] | list[IO[bytes]] = [] if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - sources = [] - elif isinstance(source, BytesIO): - sources = [] - elif ( - isinstance(source, list) and len(source) > 0 and isinstance(source[0], BytesIO) - ): - sources = source - source = None # type: ignore[assignment] - else: - sources = [ - normalize_filepath(source, check_not_directory=False) for source in source - ] + elif isinstance(source, list): + if len(source) > 0: + if isinstance(source[0], (str, Path)): + sources = [ + normalize_filepath( + source, # type: ignore[arg-type] + check_not_directory=False, + ) + for source in source + ] + else: + sources = source + source = None # type: ignore[assignment] pylf = PyLazyFrame.new_from_ipc( diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index dfea6cf2871f..135ff1a35d75 100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -254,11 +254,12 @@ def scan_ndjson( include_file_paths Include the path of the source file(s) as a column with this name. """ + + sources: list[str] | list[Path] | list[IO[str]] | list[IO[bytes]] = [] if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - sources = [] elif isinstance(source, (BytesIO, StringIO)): - sources = [] + pass elif ( isinstance(source, list) and len(source) > 0 @@ -267,8 +268,14 @@ def scan_ndjson( sources = source source = None # type: ignore[assignment] else: + assert all(isinstance(s, (str, Path)) for s in source) + sources = [ - normalize_filepath(source, check_not_directory=False) for source in source + normalize_filepath( + source, # type: ignore[arg-type] + check_not_directory=False, + ) + for source in source ] source = None # type: ignore[assignment] if infer_schema_length == 0: diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 7e1b2b9a93b1..61b3b585d067 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -3,7 +3,7 @@ import contextlib import io from pathlib import Path -from typing import IO, TYPE_CHECKING, Any +from typing import IO, TYPE_CHECKING, Any, List import polars.functions as F from polars import concat as plconcat @@ -176,7 +176,8 @@ def read_parquet( if isinstance(source, bytes): source = io.BytesIO(source) elif isinstance(source, list) and len(source) > 0 and isinstance(source[0], bytes): - source = [io.BytesIO(s) for s in source] # type: ignore[arg-type] + assert all(isinstance(s, bytes) for s in source) + source = [io.BytesIO(s) for s in source] # type: ignore # For other inputs, defer to `scan_parquet` lf = scan_parquet( @@ -230,16 +231,16 @@ def _read_parquet_with_pyarrow( ) pyarrow_options = pyarrow_options or {} - if ( - isinstance(source, list) - and len(source) > 0 - and isinstance(source[0], (bytes, io.BytesIO)) - ): - sources = source + sources: list[str | Path | IO[bytes] | bytes | list[str] | 
list[Path]] = [] + if isinstance(source, list): + if len(source) > 0 and isinstance(source[0], (bytes, io.BytesIO)): + sources = source # type: ignore + else: + sources = [source] # type: ignore else: sources = [source] - results = [] + results: list[DataFrame] = [] for source in sources: with prepare_file_arg( source, # type: ignore[arg-type] @@ -253,7 +254,7 @@ def _read_parquet_with_pyarrow( **pyarrow_options, ) result = from_arrow(pa_table, rechunk=rechunk) # type: ignore[return-value] - results.append(result) + results.append(result) # type: ignore[arg-type] if len(results) == 1: return results[0] @@ -423,11 +424,12 @@ def scan_parquet( pass else: source = [ - normalize_filepath(source, check_not_directory=False) for source in source + normalize_filepath(source, check_not_directory=False) # type: ignore[arg-type] + for source in source ] return _scan_parquet_impl( - source, + source, # type: ignore[arg-type] n_rows=n_rows, cache=cache, parallel=parallel, diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index fe918c866af7..ad7a497e585f 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -3,7 +3,7 @@ import io from datetime import datetime, time, timezone from decimal import Decimal -from typing import TYPE_CHECKING, Any, Literal, cast +from typing import TYPE_CHECKING, Any, Literal, cast, IO import fsspec import numpy as np @@ -1862,12 +1862,13 @@ def test_concat_multiple_inmem() -> None: f.seek(0) g.seek(0) - assert_frame_equal(pl.read_parquet([f, g]), dfs) + items: list[IO[bytes]] = [f, g] + assert_frame_equal(pl.read_parquet(items), dfs) f.seek(0) g.seek(0) - assert_frame_equal(pl.read_parquet([f, g], use_pyarrow=True), dfs) + assert_frame_equal(pl.read_parquet(items, use_pyarrow=True), dfs) f.seek(0) g.seek(0) From 17b90b948eb78d9521b60bab268af08028d5ce35 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 6 Sep 2024 15:33:41 +0200 Subject: [PATCH 15/27] mypy --- py-polars/polars/io/ipc/functions.py | 2 -- py-polars/polars/io/ndjson.py | 1 - py-polars/polars/io/parquet/functions.py | 18 +++++++++--------- py-polars/tests/unit/io/test_parquet.py | 2 +- 4 files changed, 10 insertions(+), 13 deletions(-) diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index e0e213c376da..5142b2ae68c6 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -2,7 +2,6 @@ import contextlib import os -from io import BytesIO, BufferedIOBase from pathlib import Path from typing import IO, TYPE_CHECKING, Any, Sequence @@ -427,7 +426,6 @@ def scan_ipc( include_file_paths Include the path of the source file(s) as a column with this name. """ - sources: list[str] | list[Path] | list[IO[bytes]] = [] if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index 135ff1a35d75..6cff4ddb1643 100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -254,7 +254,6 @@ def scan_ndjson( include_file_paths Include the path of the source file(s) as a column with this name. 
""" - sources: list[str] | list[Path] | list[IO[str]] | list[IO[bytes]] = [] if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 61b3b585d067..04cccc85fc5d 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -3,7 +3,7 @@ import contextlib import io from pathlib import Path -from typing import IO, TYPE_CHECKING, Any, List +from typing import IO, TYPE_CHECKING, Any import polars.functions as F from polars import concat as plconcat @@ -176,8 +176,8 @@ def read_parquet( if isinstance(source, bytes): source = io.BytesIO(source) elif isinstance(source, list) and len(source) > 0 and isinstance(source[0], bytes): - assert all(isinstance(s, bytes) for s in source) - source = [io.BytesIO(s) for s in source] # type: ignore + assert all(isinstance(s, bytes) for s in source) + source = [io.BytesIO(s) for s in source] # type: ignore[arg-type, assignment] # For other inputs, defer to `scan_parquet` lf = scan_parquet( @@ -234,9 +234,9 @@ def _read_parquet_with_pyarrow( sources: list[str | Path | IO[bytes] | bytes | list[str] | list[Path]] = [] if isinstance(source, list): if len(source) > 0 and isinstance(source[0], (bytes, io.BytesIO)): - sources = source # type: ignore + sources = source # type: ignore[assignment] else: - sources = [source] # type: ignore + sources = [source] # type: ignore[list-item] else: sources = [source] @@ -253,8 +253,8 @@ def _read_parquet_with_pyarrow( columns=columns, **pyarrow_options, ) - result = from_arrow(pa_table, rechunk=rechunk) # type: ignore[return-value] - results.append(result) # type: ignore[arg-type] + result = from_arrow(pa_table, rechunk=rechunk) + results.append(result) # type: ignore[arg-type] if len(results) == 1: return results[0] @@ -424,12 +424,12 @@ def scan_parquet( pass else: source = [ - normalize_filepath(source, check_not_directory=False) # type: ignore[arg-type] + normalize_filepath(source, check_not_directory=False) # type: ignore[arg-type] for source in source ] return _scan_parquet_impl( - source, # type: ignore[arg-type] + source, # type: ignore[arg-type] n_rows=n_rows, cache=cache, parallel=parallel, diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index ad7a497e585f..db3186a3f874 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -3,7 +3,7 @@ import io from datetime import datetime, time, timezone from decimal import Decimal -from typing import TYPE_CHECKING, Any, Literal, cast, IO +from typing import IO, TYPE_CHECKING, Any, Literal, cast import fsspec import numpy as np From daae7f1597553cdb78e0fac7f8428c8af144300c Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Fri, 6 Sep 2024 18:30:32 +0200 Subject: [PATCH 16/27] almost working further file support --- crates/polars-io/src/csv/read/parser.rs | 20 +- crates/polars-io/src/mmap.rs | 6 + crates/polars-lazy/src/scan/csv.rs | 16 +- .../polars-lazy/src/scan/file_list_reader.rs | 6 +- crates/polars-lazy/src/scan/ipc.rs | 4 +- crates/polars-lazy/src/scan/ndjson.rs | 4 +- crates/polars-lazy/src/scan/parquet.rs | 4 +- .../src/executors/scan/csv.rs | 22 +- .../src/executors/scan/ipc.rs | 11 +- .../src/executors/scan/ndjson.rs | 29 +-- .../src/executors/scan/parquet.rs | 22 +- crates/polars-mem-engine/src/utils.rs | 6 +- crates/polars-plan/src/client/check.rs | 5 +- .../src/plans/conversion/dsl_to_ir.rs | 4 
+- .../polars-plan/src/plans/conversion/scans.rs | 65 +++-- .../polars-plan/src/plans/functions/count.rs | 89 +++---- crates/polars-plan/src/plans/ir/dot.rs | 3 +- crates/polars-plan/src/plans/ir/mod.rs | 108 +++++++-- .../src/plans/optimizer/count_star.rs | 35 ++- .../plans/optimizer/predicate_pushdown/mod.rs | 2 +- crates/polars-python/src/conversion/mod.rs | 13 +- crates/polars-python/src/dataframe/io.rs | 229 +----------------- crates/polars-python/src/file.rs | 50 ++++ crates/polars-python/src/lazyframe/general.rs | 3 +- .../src/utils/late_materialized_df.rs | 2 +- py-polars/polars/io/csv/functions.py | 139 ++++------- py-polars/polars/io/ipc/functions.py | 52 ++-- py-polars/polars/io/ndjson.py | 27 +-- py-polars/polars/io/parquet/functions.py | 9 +- 29 files changed, 367 insertions(+), 618 deletions(-) diff --git a/crates/polars-io/src/csv/read/parser.rs b/crates/polars-io/src/csv/read/parser.rs index ccda4805792b..282a304003a3 100644 --- a/crates/polars-io/src/csv/read/parser.rs +++ b/crates/polars-io/src/csv/read/parser.rs @@ -40,15 +40,7 @@ pub fn count_rows( let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; let owned = &mut vec![]; - let mut reader_bytes = maybe_decompress_bytes(mmap.as_ref(), owned)?; - - for _ in 0..reader_bytes.len() { - if reader_bytes[0] != eol_char { - break; - } - - reader_bytes = &reader_bytes[1..]; - } + let reader_bytes = maybe_decompress_bytes(mmap.as_ref(), owned)?; count_rows_from_slice( reader_bytes, @@ -63,13 +55,21 @@ pub fn count_rows( /// Read the number of rows without parsing columns /// useful for count(*) queries pub fn count_rows_from_slice( - bytes: &[u8], + mut bytes: &[u8], separator: u8, quote_char: Option, comment_prefix: Option<&CommentPrefix>, eol_char: u8, has_header: bool, ) -> PolarsResult { + for _ in 0..bytes.len() { + if bytes[0] != eol_char { + break; + } + + bytes = &bytes[1..]; + } + const MIN_ROWS_PER_THREAD: usize = 1024; let max_threads = POOL.current_num_threads(); diff --git a/crates/polars-io/src/mmap.rs b/crates/polars-io/src/mmap.rs index 66ea8ed7b48b..ad2c05175810 100644 --- a/crates/polars-io/src/mmap.rs +++ b/crates/polars-io/src/mmap.rs @@ -104,6 +104,12 @@ impl MmapBytesReader for BufReader { } } +impl MmapBytesReader for BufReader<&File> { + fn to_file(&self) -> Option<&File> { + Some(self.get_ref()) + } +} + impl MmapBytesReader for Cursor where T: AsRef<[u8]> + Send + Sync, diff --git a/crates/polars-lazy/src/scan/csv.rs b/crates/polars-lazy/src/scan/csv.rs index 83e34cff0fe5..998f422820c6 100644 --- a/crates/polars-lazy/src/scan/csv.rs +++ b/crates/polars-lazy/src/scan/csv.rs @@ -32,7 +32,7 @@ impl LazyCsvReader { } pub fn new_paths(paths: Arc<[PathBuf]>) -> Self { - Self::new_with_sources(ScanSources::Files(paths)) + Self::new_with_sources(ScanSources::Paths(paths)) } pub fn new_with_sources(sources: ScanSources) -> Self { @@ -47,7 +47,7 @@ impl LazyCsvReader { } pub fn new(path: impl AsRef) -> Self { - Self::new_with_sources(ScanSources::Files([path.as_ref().to_path_buf()].into())) + Self::new_with_sources(ScanSources::Paths([path.as_ref().to_path_buf()].into())) } /// Skip this number of rows after the header location. 
@@ -254,7 +254,7 @@ impl LazyCsvReader { }; let schema = match self.sources.clone() { - ScanSources::Files(paths) => { + ScanSources::Paths(paths) => { // TODO: Path expansion should happen when converting to the IR // https://github.com/pola-rs/polars/issues/17634 let paths = expand_paths(&paths[..], self.glob(), self.cloud_options())?; @@ -266,6 +266,16 @@ impl LazyCsvReader { let mut file = polars_utils::open_file(path)?; infer_schema(get_reader_bytes(&mut file).expect("could not mmap file"))? }, + ScanSources::Files(files) => { + let Some(file) = files.first() else { + polars_bail!(ComputeError: "no buffers specified for this reader"); + }; + + infer_schema( + get_reader_bytes(&mut std::io::BufReader::new(file)) + .expect("could not mmap file"), + )? + }, ScanSources::Buffers(buffers) => { let Some(buffer) = buffers.first() else { polars_bail!(ComputeError: "no buffers specified for this reader"); diff --git a/crates/polars-lazy/src/scan/file_list_reader.rs b/crates/polars-lazy/src/scan/file_list_reader.rs index 2c8c9d86dd33..28315c96f736 100644 --- a/crates/polars-lazy/src/scan/file_list_reader.rs +++ b/crates/polars-lazy/src/scan/file_list_reader.rs @@ -19,8 +19,8 @@ pub trait LazyFileListReader: Clone { return self.finish_no_glob(); } - let ScanSources::Files(paths) = self.sources() else { - unreachable!("in-memory buffers should never be globbed"); + let ScanSources::Paths(paths) = self.sources() else { + unreachable!("opened-files or in-memory buffers should never be globbed"); }; let lfs = paths @@ -93,7 +93,7 @@ pub trait LazyFileListReader: Clone { /// Set paths of the scanned files. #[must_use] fn with_paths(self, paths: Arc<[PathBuf]>) -> Self { - self.with_sources(ScanSources::Files(paths)) + self.with_sources(ScanSources::Paths(paths)) } /// Configure the row limit. diff --git a/crates/polars-lazy/src/scan/ipc.rs b/crates/polars-lazy/src/scan/ipc.rs index e70434e39d8d..8d84ef3de049 100644 --- a/crates/polars-lazy/src/scan/ipc.rs +++ b/crates/polars-lazy/src/scan/ipc.rs @@ -125,13 +125,13 @@ impl LazyFrame { /// Create a LazyFrame directly from a ipc scan. pub fn scan_ipc(path: impl AsRef, args: ScanArgsIpc) -> PolarsResult { Self::scan_ipc_sources( - ScanSources::Files([path.as_ref().to_path_buf()].into()), + ScanSources::Paths([path.as_ref().to_path_buf()].into()), args, ) } pub fn scan_ipc_files(paths: Arc<[PathBuf]>, args: ScanArgsIpc) -> PolarsResult { - Self::scan_ipc_sources(ScanSources::Files(paths), args) + Self::scan_ipc_sources(ScanSources::Paths(paths), args) } pub fn scan_ipc_sources(sources: ScanSources, args: ScanArgsIpc) -> PolarsResult { diff --git a/crates/polars-lazy/src/scan/ndjson.rs b/crates/polars-lazy/src/scan/ndjson.rs index 195f8e0372a3..e38270ec3e09 100644 --- a/crates/polars-lazy/src/scan/ndjson.rs +++ b/crates/polars-lazy/src/scan/ndjson.rs @@ -29,7 +29,7 @@ pub struct LazyJsonLineReader { impl LazyJsonLineReader { pub fn new_paths(paths: Arc<[PathBuf]>) -> Self { - Self::new_with_sources(ScanSources::Files(paths)) + Self::new_with_sources(ScanSources::Paths(paths)) } pub fn new_with_sources(sources: ScanSources) -> Self { @@ -50,7 +50,7 @@ impl LazyJsonLineReader { } pub fn new(path: impl AsRef) -> Self { - Self::new_with_sources(ScanSources::Files([path.as_ref().to_path_buf()].into())) + Self::new_with_sources(ScanSources::Paths([path.as_ref().to_path_buf()].into())) } /// Add a row index column. 
diff --git a/crates/polars-lazy/src/scan/parquet.rs b/crates/polars-lazy/src/scan/parquet.rs index ff4f9a73ec78..9adb0f1838be 100644 --- a/crates/polars-lazy/src/scan/parquet.rs +++ b/crates/polars-lazy/src/scan/parquet.rs @@ -140,7 +140,7 @@ impl LazyFrame { /// Create a LazyFrame directly from a parquet scan. pub fn scan_parquet(path: impl AsRef, args: ScanArgsParquet) -> PolarsResult { Self::scan_parquet_sources( - ScanSources::Files([path.as_ref().to_path_buf()].into()), + ScanSources::Paths([path.as_ref().to_path_buf()].into()), args, ) } @@ -152,6 +152,6 @@ impl LazyFrame { /// Create a LazyFrame directly from a parquet scan. pub fn scan_parquet_files(paths: Arc<[PathBuf]>, args: ScanArgsParquet) -> PolarsResult { - Self::scan_parquet_sources(ScanSources::Files(paths), args) + Self::scan_parquet_sources(ScanSources::Paths(paths), args) } } diff --git a/crates/polars-mem-engine/src/executors/scan/csv.rs b/crates/polars-mem-engine/src/executors/scan/csv.rs index a9048481a4ac..c00a0047d525 100644 --- a/crates/polars-mem-engine/src/executors/scan/csv.rs +++ b/crates/polars-mem-engine/src/executors/scan/csv.rs @@ -4,8 +4,6 @@ use polars_core::config; use polars_core::utils::{ accumulate_dataframes_vertical, accumulate_dataframes_vertical_unchecked, }; -use polars_error::feature_gated; -use polars_utils::mmap::MemSlice; use super::*; @@ -68,25 +66,7 @@ impl CsvExec { let source = self.sources.at(i); let owned = &mut vec![]; - let memslice = match source { - ScanSourceRef::File(path) => { - let file = if run_async { - feature_gated!("cloud", { - polars_io::file_cache::FILE_CACHE - .get_entry(path.to_str().unwrap()) - // Safety: This was initialized by schema inference. - .unwrap() - .try_open_assume_latest() - }) - } else { - polars_utils::open_file(path) - }?; - - let mmap = unsafe { memmap::Mmap::map(&file).unwrap() }; - MemSlice::from_mmap(Arc::new(mmap)) - }, - ScanSourceRef::Buffer(buffer) => MemSlice::from_bytes(buffer.clone()), - }; + let memslice = source.to_memslice_async_latest(run_async)?; let reader = std::io::Cursor::new(maybe_decompress_bytes(&memslice, owned)?); let mut df = options diff --git a/crates/polars-mem-engine/src/executors/scan/ipc.rs b/crates/polars-mem-engine/src/executors/scan/ipc.rs index 33ad2f54f429..acbcc2d28dd6 100644 --- a/crates/polars-mem-engine/src/executors/scan/ipc.rs +++ b/crates/polars-mem-engine/src/executors/scan/ipc.rs @@ -24,8 +24,8 @@ pub struct IpcExec { impl IpcExec { fn read(&mut self) -> PolarsResult { let is_cloud = match &self.sources { - ScanSources::Files(paths) => paths.iter().any(is_cloud_url), - ScanSources::Buffers(_) => false, + ScanSources::Paths(paths) => paths.iter().any(is_cloud_url), + ScanSources::Files(_) | ScanSources::Buffers(_) => false, }; let force_async = config::force_async(); @@ -75,13 +75,16 @@ impl IpcExec { let source = self.sources.at(index); let memslice = match source { - ScanSourceRef::File(path) => { + ScanSourceRef::Path(path) => { let file = match idx_to_cached_file(index) { None => std::fs::File::open(path)?, Some(f) => f?, }; - MemSlice::from_mmap(Arc::new(unsafe { memmap::Mmap::map(&file).unwrap() })) + MemSlice::from_mmap(Arc::new(unsafe { memmap::Mmap::map(&file)? })) + }, + ScanSourceRef::File(file) => { + MemSlice::from_mmap(Arc::new(unsafe { memmap::Mmap::map(file)? 
})) }, ScanSourceRef::Buffer(buff) => MemSlice::from_bytes(buff.clone()), }; diff --git a/crates/polars-mem-engine/src/executors/scan/ndjson.rs b/crates/polars-mem-engine/src/executors/scan/ndjson.rs index fb55cb2b38e5..06e1d18892c6 100644 --- a/crates/polars-mem-engine/src/executors/scan/ndjson.rs +++ b/crates/polars-mem-engine/src/executors/scan/ndjson.rs @@ -1,7 +1,5 @@ use polars_core::config; use polars_core::utils::accumulate_dataframes_vertical; -use polars_error::feature_gated; -use polars_utils::mmap::MemSlice; use super::*; @@ -76,30 +74,9 @@ impl JsonExec { let row_index = self.file_scan_options.row_index.as_mut(); - let memslice = match source { - ScanSourceRef::File(path) => { - let file = if run_async { - feature_gated!("cloud", { - match polars_io::file_cache::FILE_CACHE - .get_entry(path.to_str().unwrap()) - // Safety: This was initialized by schema inference. - .unwrap() - .try_open_assume_latest() - { - Ok(v) => v, - Err(e) => return Some(Err(e)), - } - }) - } else { - match polars_utils::open_file(path) { - Ok(v) => v, - Err(e) => return Some(Err(e)), - } - }; - - MemSlice::from_mmap(Arc::new(unsafe { memmap::Mmap::map(&file).unwrap() })) - }, - ScanSourceRef::Buffer(buff) => MemSlice::from_bytes(buff.clone()), + let memslice = match source.to_memslice_async_latest(run_async) { + Ok(memslice) => memslice, + Err(err) => return Some(Err(err)), }; let owned = &mut vec![]; diff --git a/crates/polars-mem-engine/src/executors/scan/parquet.rs b/crates/polars-mem-engine/src/executors/scan/parquet.rs index fd0f53b6d728..2f32e0b50aa3 100644 --- a/crates/polars-mem-engine/src/executors/scan/parquet.rs +++ b/crates/polars-mem-engine/src/executors/scan/parquet.rs @@ -8,7 +8,6 @@ use polars_io::cloud::CloudOptions; use polars_io::parquet::metadata::FileMetaDataRef; use polars_io::utils::slice::split_slice_at_file; use polars_io::RowIndex; -use polars_utils::mmap::MemSlice; use super::*; @@ -82,18 +81,7 @@ impl ParquetExec { let row_counts = path_indexes .into_par_iter() .map(|&i| { - let memslice = match self.sources.at(i) { - ScanSourceRef::File(path) => { - let file = std::fs::File::open(path)?; - MemSlice::from_mmap(Arc::new(unsafe { - memmap::Mmap::map(&file).unwrap() - })) - }, - ScanSourceRef::Buffer(buff) => { - MemSlice::from_bytes(buff.clone()) - }, - }; - + let memslice = self.sources.at(i).to_memslice()?; ParquetReader::new(std::io::Cursor::new(memslice)).num_rows() }) .collect::>>()?; @@ -161,13 +149,7 @@ impl ParquetExec { hive_partitions.as_deref(), ); - let memslice = match source { - ScanSourceRef::File(path) => { - let file = std::fs::File::open(path)?; - MemSlice::from_mmap(Arc::new(unsafe { memmap::Mmap::map(&file).unwrap() })) - }, - ScanSourceRef::Buffer(buff) => MemSlice::from_bytes(buff.clone()), - }; + let memslice = source.to_memslice()?; let mut reader = ParquetReader::new(std::io::Cursor::new(memslice)) .read_parallel(parallel) diff --git a/crates/polars-mem-engine/src/utils.rs b/crates/polars-mem-engine/src/utils.rs index 06941cbc128d..91bd0e17902a 100644 --- a/crates/polars-mem-engine/src/utils.rs +++ b/crates/polars-mem-engine/src/utils.rs @@ -9,7 +9,7 @@ use polars_utils::arena::{Arena, Node}; /// /// # Notes /// -/// - Scan sources with in-memory buffers are ignored. +/// - Scan sources with opened files or in-memory buffers are ignored. pub(crate) fn agg_source_paths<'a>( root_lp: Node, acc_paths: &mut PlHashSet<&'a Path>, @@ -18,8 +18,8 @@ pub(crate) fn agg_source_paths<'a>( for (_, lp) in lp_arena.iter(root_lp) { if let IR::Scan { sources, .. 
} = lp { match sources { - ScanSources::Files(paths) => acc_paths.extend(paths.iter().map(|p| p.as_path())), - ScanSources::Buffers(_) => { + ScanSources::Paths(paths) => acc_paths.extend(paths.iter().map(|p| p.as_path())), + ScanSources::Buffers(_) | ScanSources::Files(_) => { // Ignore }, } diff --git a/crates/polars-plan/src/client/check.rs b/crates/polars-plan/src/client/check.rs index 1f5562bb4670..84189840a3dd 100644 --- a/crates/polars-plan/src/client/check.rs +++ b/crates/polars-plan/src/client/check.rs @@ -15,11 +15,14 @@ pub(super) fn assert_cloud_eligible(dsl: &DslPlan) -> PolarsResult<()> { } => { let sources_lock = sources.lock().unwrap(); match &sources_lock.sources { - ScanSources::Files(paths) => { + ScanSources::Paths(paths) => { if paths.iter().any(|p| !is_cloud_url(p)) { return ineligible_error("contains scan of local file system"); } }, + ScanSources::Files(_) => { + return ineligible_error("contains scan of opened files"); + }, ScanSources::Buffers(_) => { return ineligible_error("contains scan of in-memory buffer"); }, diff --git a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs index 437db91f1975..084779a68a28 100644 --- a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs +++ b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs @@ -826,7 +826,7 @@ impl DslScanSources { return Ok(()); } - let ScanSources::Files(paths) = &self.sources else { + let ScanSources::Paths(paths) = &self.sources else { self.is_expanded = true; return Ok(()); }; @@ -853,7 +853,7 @@ impl DslScanSources { #[allow(unreachable_code)] { - self.sources = ScanSources::Files(expanded_sources); + self.sources = ScanSources::Paths(expanded_sources); self.is_expanded = true; Ok(()) diff --git a/crates/polars-plan/src/plans/conversion/scans.rs b/crates/polars-plan/src/plans/conversion/scans.rs index f7c64ed7612b..2b20d9fe932a 100644 --- a/crates/polars-plan/src/plans/conversion/scans.rs +++ b/crates/polars-plan/src/plans/conversion/scans.rs @@ -35,44 +35,34 @@ pub(super) fn parquet_file_info( ) -> PolarsResult<(FileInfo, Option)> { use polars_core::error::feature_gated; - let first_source = sources - .first() - .ok_or_else(|| polars_err!(ComputeError: "expected at least 1 source"))?; - - let (reader_schema, num_rows, metadata) = match first_source { - ScanSourceRef::File(path) => { - if is_cloud_url(path) { - feature_gated!("cloud", { - let uri = path.to_string_lossy(); - get_runtime().block_on(async { - let mut reader = - ParquetAsyncReader::from_uri(&uri, cloud_options, None).await?; - - PolarsResult::Ok(( - reader.schema().await?, - Some(reader.num_rows().await?), - Some(reader.get_metadata().await?.clone()), - )) - })? - }) - } else { - let file = polars_utils::open_file(path)?; - let mut reader = ParquetReader::new(file); - ( - reader.schema()?, - Some(reader.num_rows()?), - Some(reader.get_metadata()?.clone()), - ) - } - }, - ScanSourceRef::Buffer(buffer) => { - let mut reader = ParquetReader::new(std::io::Cursor::new(buffer)); + let (reader_schema, num_rows, metadata) = { + if sources.is_cloud_url() { + let first_path = &sources.as_paths().unwrap()[0]; + feature_gated!("cloud", { + let uri = first_path.to_string_lossy(); + get_runtime().block_on(async { + let mut reader = + ParquetAsyncReader::from_uri(&uri, cloud_options, None).await?; + + PolarsResult::Ok(( + reader.schema().await?, + Some(reader.num_rows().await?), + Some(reader.get_metadata().await?.clone()), + )) + })? 
+ }) + } else { + let first_source = sources + .first() + .ok_or_else(|| polars_err!(ComputeError: "expected at least 1 source"))?; + let memslice = first_source.to_memslice()?; + let mut reader = ParquetReader::new(std::io::Cursor::new(memslice)); ( reader.schema()?, Some(reader.num_rows()?), Some(reader.get_metadata()?.clone()), ) - }, + } }; let schema = prepare_output_schema( @@ -103,7 +93,7 @@ pub(super) fn ipc_file_info( }; let metadata = match first { - ScanSourceRef::File(path) => { + ScanSourceRef::Path(path) => { if is_cloud_url(path) { feature_gated!("cloud", { let uri = path.to_string_lossy(); @@ -120,6 +110,9 @@ pub(super) fn ipc_file_info( ))? } }, + ScanSourceRef::File(file) => { + arrow::io::ipc::read::read_file_metadata(&mut std::io::BufReader::new(file))? + }, ScanSourceRef::Buffer(buff) => { arrow::io::ipc::read::read_file_metadata(&mut std::io::Cursor::new(buff))? }, @@ -182,7 +175,7 @@ pub(super) fn csv_file_info( let infer_schema_func = |i| { let source = sources.at(i); - let memslice = source.to_memslice(run_async, cache_entries.as_ref(), i)?; + let memslice = source.to_memslice_possibly_async(run_async, cache_entries.as_ref(), i)?; let owned = &mut vec![]; let mut reader = std::io::Cursor::new(maybe_decompress_bytes(&memslice, owned)?); if reader.read(&mut [0; 4])? < 2 && csv_options.raise_if_empty { @@ -308,7 +301,7 @@ pub(super) fn ndjson_file_info( ) } } else { - let memslice = first.to_memslice(run_async, cache_entries.as_ref(), 0)?; + let memslice = first.to_memslice_possibly_async(run_async, cache_entries.as_ref(), 0)?; let mut reader = std::io::Cursor::new(maybe_decompress_bytes(&memslice, owned)?); let schema = diff --git a/crates/polars-plan/src/plans/functions/count.rs b/crates/polars-plan/src/plans/functions/count.rs index 3bba674edb89..0b16c8eac994 100644 --- a/crates/polars-plan/src/plans/functions/count.rs +++ b/crates/polars-plan/src/plans/functions/count.rs @@ -96,7 +96,7 @@ fn count_all_rows_csv( sources .iter() .map(|source| match source { - ScanSourceRef::File(path) => count_rows_csv( + ScanSourceRef::Path(path) => count_rows_csv( path, parse_options.separator, parse_options.quote_char, @@ -104,14 +104,18 @@ fn count_all_rows_csv( parse_options.eol_char, options.has_header, ), - ScanSourceRef::Buffer(buf) => count_rows_csv_from_slice( - &buf[..], - parse_options.separator, - parse_options.quote_char, - parse_options.comment_prefix.as_ref(), - parse_options.eol_char, - options.has_header, - ), + _ => { + let memslice = source.to_memslice()?; + + count_rows_csv_from_slice( + &memslice[..], + parse_options.separator, + parse_options.quote_char, + parse_options.comment_prefix.as_ref(), + parse_options.eol_char, + options.has_header, + ) + }, }) .sum() } @@ -136,13 +140,8 @@ pub(super) fn count_rows_parquet( } else { sources .iter() - .map(|source| match source { - ScanSourceRef::File(path) => { - ParquetReader::new(polars_utils::open_file(path)?).num_rows() - }, - ScanSourceRef::Buffer(buffer) => { - ParquetReader::new(std::io::Cursor::new(buffer)).num_rows() - }, + .map(|source| { + ParquetReader::new(std::io::Cursor::new(source.to_memslice()?)).num_rows() }) .sum::>() } @@ -187,13 +186,9 @@ pub(super) fn count_rows_ipc( } else { sources .iter() - .map(|source| match source { - ScanSourceRef::File(path) => { - count_rows_ipc_sync(&mut polars_utils::open_file(path)?).map(|v| v as usize) - }, - ScanSourceRef::Buffer(buffer) => { - count_rows_ipc_sync(&mut std::io::Cursor::new(buffer)).map(|v| v as usize) - }, + .map(|source| { + let memslice = 
source.to_memslice()?; + count_rows_ipc_sync(&mut std::io::Cursor::new(memslice)).map(|v| v as usize) }) .sum::>() } @@ -234,8 +229,8 @@ pub(super) fn count_rows_ndjson( let run_async = is_cloud_url || (sources.is_files() && config::force_async()); let cache_entries = { - feature_gated!("cloud", { - if run_async { + if run_async { + feature_gated!("cloud", { Some(polars_io::file_cache::init_entries_from_uri_list( sources .as_paths() @@ -246,43 +241,23 @@ pub(super) fn count_rows_ndjson( .as_slice(), cloud_options, )?) - } else { - None - } - }) + }) + } else { + None + } }; sources .iter() - .map(|source| match source { - ScanSourceRef::File(path) => { - let f = if run_async { - feature_gated!("cloud", { - let entry: &Arc = - &cache_entries.as_ref().unwrap()[0]; - entry.try_open_check_latest()? - }) - } else { - polars_utils::open_file(path)? - }; + .map(|source| { + let memslice = + source.to_memslice_possibly_async(run_async, cache_entries.as_ref(), 0)?; - let mmap = unsafe { memmap::Mmap::map(&f).unwrap() }; - let owned = &mut vec![]; - - let reader = polars_io::ndjson::core::JsonLineReader::new(std::io::Cursor::new( - maybe_decompress_bytes(mmap.as_ref(), owned)?, - )); - reader.count() - }, - ScanSourceRef::Buffer(buffer) => { - polars_ensure!(!run_async, nyi = "BytesIO with force_async"); - - let owned = &mut vec![]; - let reader = polars_io::ndjson::core::JsonLineReader::new(std::io::Cursor::new( - maybe_decompress_bytes(buffer, owned)?, - )); - reader.count() - }, + let owned = &mut vec![]; + let reader = polars_io::ndjson::core::JsonLineReader::new(std::io::Cursor::new( + maybe_decompress_bytes(&memslice[..], owned)?, + )); + reader.count() }) .sum() } diff --git a/crates/polars-plan/src/plans/ir/dot.rs b/crates/polars-plan/src/plans/ir/dot.rs index 3ece8966a857..51050f2fa877 100644 --- a/crates/polars-plan/src/plans/ir/dot.rs +++ b/crates/polars-plan/src/plans/ir/dot.rs @@ -351,7 +351,8 @@ struct OptionExprIRDisplay<'a>(Option>); impl fmt::Display for ScanSourceRef<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - ScanSourceRef::File(path) => path.display().fmt(f), + ScanSourceRef::Path(path) => path.display().fmt(f), + ScanSourceRef::File(_) => f.write_str("open-file"), ScanSourceRef::Buffer(buff) => write!(f, "{} in-mem bytes", buff.len()), } } diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index e2f2ca3eae3d..67ba46d4aca2 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -6,6 +6,7 @@ pub(crate) mod tree_format; use std::borrow::Cow; use std::fmt; +use std::fs::File; use std::path::{Path, PathBuf}; pub use dot::{EscapeLabel, IRDotDisplay, PathsDisplay, ScanSourcesDisplay}; @@ -36,16 +37,44 @@ pub struct IRPlanRef<'a> { } #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] -#[derive(Debug, Clone, Hash, PartialEq, Eq)] +#[derive(Debug, Clone)] pub enum ScanSources { - Files(Arc<[PathBuf]>), + Paths(Arc<[PathBuf]>), + + #[cfg_attr(feature = "serde", serde(skip))] + Files(Arc<[File]>), #[cfg_attr(feature = "serde", serde(skip))] Buffers(Arc<[bytes::Bytes]>), } +impl std::hash::Hash for ScanSources { + fn hash(&self, state: &mut H) { + std::mem::discriminant(self).hash(state); + + // @NOTE: This is a bit crazy + match self { + Self::Paths(paths) => paths.hash(state), + Self::Files(files) => files.as_ptr().hash(state), + Self::Buffers(buffers) => buffers.as_ptr().hash(state), + } + } +} + +impl PartialEq for ScanSources { + fn 
eq(&self, other: &Self) -> bool { + match (self, other) { + (ScanSources::Paths(l), ScanSources::Paths(r)) => l == r, + _ => false, + } + } +} + +impl Eq for ScanSources {} + #[derive(Debug, Clone, Copy)] pub enum ScanSourceRef<'a> { - File(&'a Path), + Path(&'a Path), + File(&'a File), Buffer(&'a bytes::Bytes), } @@ -63,12 +92,43 @@ impl Default for ScanSources { impl<'a> ScanSourceRef<'a> { pub fn to_file_path(&self) -> &str { match self { - ScanSourceRef::File(path) => path.to_str().unwrap(), - ScanSourceRef::Buffer(_) => "in-mem", + Self::Path(path) => path.to_str().unwrap(), + Self::File(_) => "open-file", + Self::Buffer(_) => "in-mem", + } + } + + pub fn to_memslice(&self) -> PolarsResult { + self.to_memslice_possibly_async(false, None, 0) + } + + pub fn to_memslice_async_latest(&self, run_async: bool) -> PolarsResult { + match self { + ScanSourceRef::Path(path) => { + let file = if run_async { + feature_gated!("cloud", { + polars_io::file_cache::FILE_CACHE + .get_entry(path.to_str().unwrap()) + // Safety: This was initialized by schema inference. + .unwrap() + .try_open_assume_latest()? + }) + } else { + polars_utils::open_file(path)? + }; + + Ok(MemSlice::from_mmap(Arc::new(unsafe { + memmap::Mmap::map(&file)? + }))) + }, + ScanSourceRef::File(file) => Ok(MemSlice::from_mmap(Arc::new(unsafe { + memmap::Mmap::map(*file)? + }))), + ScanSourceRef::Buffer(buff) => Ok(MemSlice::from_bytes((*buff).clone())), } } - pub fn to_memslice( + pub fn to_memslice_possibly_async( &self, run_async: bool, #[cfg(feature = "cloud")] cache_entries: Option< @@ -78,7 +138,7 @@ impl<'a> ScanSourceRef<'a> { index: usize, ) -> PolarsResult { match self { - Self::File(path) => { + Self::Path(path) => { let f = if run_async { feature_gated!("cloud", { cache_entries.unwrap()[index].try_open_check_latest()? @@ -90,6 +150,10 @@ impl<'a> ScanSourceRef<'a> { let mmap = unsafe { memmap::Mmap::map(&f)? }; Ok(MemSlice::from_mmap(Arc::new(mmap))) }, + Self::File(file) => { + let mmap = unsafe { memmap::Mmap::map(*file)? 
}; + Ok(MemSlice::from_mmap(Arc::new(mmap))) + }, Self::Buffer(buff) => Ok(MemSlice::from_bytes((*buff).clone())), } } @@ -105,22 +169,22 @@ impl ScanSources { pub fn as_paths(&self) -> Option<&[PathBuf]> { match self { - Self::Files(paths) => Some(paths.as_ref()), - Self::Buffers(_) => None, + Self::Paths(paths) => Some(paths.as_ref()), + Self::Files(_) | Self::Buffers(_) => None, } } pub fn into_paths(&self) -> Option> { match self { - Self::Files(paths) => Some(paths.clone()), - Self::Buffers(_) => None, + Self::Paths(paths) => Some(paths.clone()), + Self::Files(_) | Self::Buffers(_) => None, } } pub fn first_path(&self) -> Option<&Path> { match self { - ScanSources::Files(paths) => paths.first().map(|p| p.as_path()), - ScanSources::Buffers(_) => None, + Self::Paths(paths) => paths.first().map(|p| p.as_path()), + Self::Files(_) | Self::Buffers(_) => None, } } @@ -132,18 +196,19 @@ impl ScanSources { } pub fn is_files(&self) -> bool { - matches!(self, Self::Files(_)) + matches!(self, Self::Paths(_)) } pub fn is_cloud_url(&self) -> bool { match self { - Self::Files(paths) => paths.first().map_or(false, polars_io::is_cloud_url), - Self::Buffers(_) => false, + Self::Paths(paths) => paths.first().map_or(false, polars_io::is_cloud_url), + Self::Files(_) | Self::Buffers(_) => false, } } pub fn len(&self) -> usize { match self { + Self::Paths(s) => s.len(), Self::Files(s) => s.len(), Self::Buffers(s) => s.len(), } @@ -163,17 +228,19 @@ impl ScanSources { } match self { - Self::Files(paths) => { + Self::Paths(paths) => { PlSmallStr::from_str(paths.first().unwrap().to_string_lossy().as_ref()) }, + Self::Files(_) => PlSmallStr::from_static("OPEN_FILES"), Self::Buffers(_) => PlSmallStr::from_static("IN_MEMORY"), } } pub fn get(&self, idx: usize) -> Option { match self { - ScanSources::Files(paths) => paths.get(idx).map(|p| ScanSourceRef::File(p)), - ScanSources::Buffers(buffers) => buffers.get(idx).map(ScanSourceRef::Buffer), + Self::Paths(paths) => paths.get(idx).map(|p| ScanSourceRef::Path(p)), + Self::Files(files) => files.get(idx).map(|f| ScanSourceRef::File(f)), + Self::Buffers(buffers) => buffers.get(idx).map(ScanSourceRef::Buffer), } } @@ -192,7 +259,8 @@ impl<'a> Iterator for ScanSourceIter<'a> { fn next(&mut self) -> Option { let item = match self.sources { - ScanSources::Files(paths) => ScanSourceRef::File(paths.get(self.offset)?), + ScanSources::Paths(paths) => ScanSourceRef::Path(paths.get(self.offset)?), + ScanSources::Files(files) => ScanSourceRef::File(files.get(self.offset)?), ScanSources::Buffers(buffers) => ScanSourceRef::Buffer(buffers.get(self.offset)?), }; diff --git a/crates/polars-plan/src/plans/optimizer/count_star.rs b/crates/polars-plan/src/plans/optimizer/count_star.rs index 02c8b94a033c..1f20c83f6a87 100644 --- a/crates/polars-plan/src/plans/optimizer/count_star.rs +++ b/crates/polars-plan/src/plans/optimizer/count_star.rs @@ -67,7 +67,7 @@ fn visit_logical_plan_for_scan_paths( match lp_arena.get(node) { IR::Union { inputs, .. 
} => { enum MutableSources { - Files(Vec), + Paths(Vec), Buffers(Vec), } @@ -76,25 +76,22 @@ fn visit_logical_plan_for_scan_paths( for input in inputs { match visit_logical_plan_for_scan_paths(*input, lp_arena, expr_arena, true) { Some(expr) => { - match expr.sources { - ScanSources::Files(paths) => match sources { - Some(MutableSources::Files(ref mut files)) => { - files.extend_from_slice(&paths[..]) - }, - Some(MutableSources::Buffers(_)) => { - todo!("Mixing in memory buffers and paths in count star opt") - }, - None => sources = Some(MutableSources::Files(paths.to_vec())), + match (expr.sources, &mut sources) { + ( + ScanSources::Paths(paths), + Some(MutableSources::Paths(ref mut mutable_paths)), + ) => mutable_paths.extend_from_slice(&paths[..]), + (ScanSources::Paths(paths), None) => { + sources = Some(MutableSources::Paths(paths.to_vec())) }, - ScanSources::Buffers(bs) => match sources { - Some(MutableSources::Files(_)) => { - todo!("Mixing in memory buffers and paths in count star opt") - }, - Some(MutableSources::Buffers(ref mut buffers)) => { - buffers.extend_from_slice(&bs[..]) - }, - None => sources = Some(MutableSources::Buffers(bs.to_vec())), + ( + ScanSources::Buffers(buffers), + Some(MutableSources::Buffers(ref mut mutable_buffers)), + ) => mutable_buffers.extend_from_slice(&buffers[..]), + (ScanSources::Buffers(buffers), None) => { + sources = Some(MutableSources::Buffers(buffers.to_vec())) }, + _ => return None, } match &scan_type { @@ -114,7 +111,7 @@ fn visit_logical_plan_for_scan_paths( } Some(CountStarExpr { sources: match sources { - Some(MutableSources::Files(files)) => ScanSources::Files(files.into()), + Some(MutableSources::Paths(paths)) => ScanSources::Paths(paths.into()), Some(MutableSources::Buffers(buffers)) => ScanSources::Buffers(buffers.into()), None => ScanSources::default(), }, diff --git a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs index f42a7ca7239b..7cb0753e5a6d 100644 --- a/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs +++ b/crates/polars-plan/src/plans/optimizer/predicate_pushdown/mod.rs @@ -403,7 +403,7 @@ impl<'a> PredicatePushDown<'a> { filter: None, }); } else { - sources = ScanSources::Files(new_paths.into()); + sources = ScanSources::Paths(new_paths.into()); scan_hive_parts = Some(Arc::from(new_hive_parts)); } } diff --git a/crates/polars-python/src/conversion/mod.rs b/crates/polars-python/src/conversion/mod.rs index 886b6f744552..ec05729acc81 100644 --- a/crates/polars-python/src/conversion/mod.rs +++ b/crates/polars-python/src/conversion/mod.rs @@ -2,6 +2,7 @@ pub(crate) mod any_value; pub(crate) mod chunked_array; mod datetime; use std::fmt::{Display, Formatter}; +use std::fs::File; use std::hash::{Hash, Hasher}; use std::path::PathBuf; @@ -540,7 +541,8 @@ impl<'py> FromPyObject<'py> for Wrap { } enum MutableSources { - Files(Vec), + Paths(Vec), + Files(Vec), Buffers(Vec), } @@ -562,13 +564,19 @@ impl<'py> FromPyObject<'py> for Wrap { EitherPythonFileOrPath::Path(path) => { let mut sources = Vec::with_capacity(num_items); sources.push(path); + MutableSources::Paths(sources) + }, + EitherPythonFileOrPath::File(file) => { + let mut sources = Vec::with_capacity(num_items); + sources.push(file); MutableSources::Files(sources) }, }; for source in iter { match (&mut sources, source?) 
{ - (MutableSources::Files(v), EitherPythonFileOrPath::Path(p)) => v.push(p), + (MutableSources::Paths(v), EitherPythonFileOrPath::Path(p)) => v.push(p), + (MutableSources::Files(v), EitherPythonFileOrPath::File(f)) => v.push(f), (MutableSources::Buffers(v), EitherPythonFileOrPath::Py(f)) => v.push(f.as_bytes()), _ => { return Err(PyTypeError::new_err( @@ -579,6 +587,7 @@ impl<'py> FromPyObject<'py> for Wrap { } Ok(Wrap(match sources { + MutableSources::Paths(i) => ScanSources::Paths(i.into()), MutableSources::Files(i) => ScanSources::Files(i.into()), MutableSources::Buffers(i) => ScanSources::Buffers(i.into()), })) diff --git a/crates/polars-python/src/dataframe/io.rs b/crates/polars-python/src/dataframe/io.rs index 12707e93dd85..d56334d35ad0 100644 --- a/crates/polars-python/src/dataframe/io.rs +++ b/crates/polars-python/src/dataframe/io.rs @@ -10,7 +10,6 @@ use polars::prelude::*; #[cfg(feature = "parquet")] use polars_parquet::arrow::write::StatisticsOptions; use pyo3::prelude::*; -use pyo3::pybacked::PyBackedStr; use super::PyDataFrame; #[cfg(feature = "parquet")] @@ -18,176 +17,13 @@ use crate::conversion::parse_parquet_compression; use crate::conversion::Wrap; use crate::error::PyPolarsErr; use crate::file::{ - get_either_file, get_file_like, get_mmap_bytes_reader, get_mmap_bytes_reader_and_path, + get_either_file, get_file_like, get_mmap_bytes_reader, read_if_bytesio, EitherRustPythonFile, }; use crate::prelude::PyCompatLevel; #[pymethods] impl PyDataFrame { - #[staticmethod] - #[cfg(feature = "csv")] - #[pyo3(signature = ( - py_f, infer_schema_length, chunk_size, has_header, ignore_errors, n_rows, - skip_rows, projection, separator, rechunk, columns, encoding, n_threads, path, - overwrite_dtype, overwrite_dtype_slice, low_memory, comment_prefix, quote_char, - null_values, missing_utf8_is_empty_string, try_parse_dates, skip_rows_after_header, - row_index, sample_size, eol_char, raise_if_empty, truncate_ragged_lines, decimal_comma, schema) -)] - pub fn read_csv( - py: Python, - mut py_f: Bound, - infer_schema_length: Option, - chunk_size: usize, - has_header: bool, - ignore_errors: bool, - n_rows: Option, - skip_rows: usize, - projection: Option>, - separator: &str, - rechunk: bool, - columns: Option>, - encoding: Wrap, - n_threads: Option, - path: Option, - overwrite_dtype: Option)>>, - overwrite_dtype_slice: Option>>, - low_memory: bool, - comment_prefix: Option<&str>, - quote_char: Option<&str>, - null_values: Option>, - missing_utf8_is_empty_string: bool, - try_parse_dates: bool, - skip_rows_after_header: usize, - row_index: Option<(String, IdxSize)>, - sample_size: usize, - eol_char: &str, - raise_if_empty: bool, - truncate_ragged_lines: bool, - decimal_comma: bool, - schema: Option>, - ) -> PyResult { - let null_values = null_values.map(|w| w.0); - let eol_char = eol_char.as_bytes()[0]; - let row_index = row_index.map(|(name, offset)| RowIndex { - name: name.into(), - offset, - }); - let quote_char = quote_char.and_then(|s| s.as_bytes().first().copied()); - - let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| { - overwrite_dtype - .iter() - .map(|(name, dtype)| { - let dtype = dtype.0.clone(); - Field::new((&**name).into(), dtype) - }) - .collect::() - }); - - let overwrite_dtype_slice = overwrite_dtype_slice.map(|overwrite_dtype| { - overwrite_dtype - .iter() - .map(|dt| dt.0.clone()) - .collect::>() - }); - - py_f = read_if_bytesio(py_f); - let mmap_bytes_r = get_mmap_bytes_reader(&py_f)?; - let df = py.allow_threads(move || { - CsvReadOptions::default() - 
.with_path(path) - .with_infer_schema_length(infer_schema_length) - .with_has_header(has_header) - .with_n_rows(n_rows) - .with_skip_rows(skip_rows) - .with_ignore_errors(ignore_errors) - .with_projection(projection.map(Arc::new)) - .with_rechunk(rechunk) - .with_chunk_size(chunk_size) - .with_columns(columns.map(|x| x.into_iter().map(|x| x.into()).collect())) - .with_n_threads(n_threads) - .with_schema_overwrite(overwrite_dtype.map(Arc::new)) - .with_dtype_overwrite(overwrite_dtype_slice.map(Arc::new)) - .with_schema(schema.map(|schema| Arc::new(schema.0))) - .with_low_memory(low_memory) - .with_skip_rows_after_header(skip_rows_after_header) - .with_row_index(row_index) - .with_sample_size(sample_size) - .with_raise_if_empty(raise_if_empty) - .with_parse_options( - CsvParseOptions::default() - .with_separator(separator.as_bytes()[0]) - .with_encoding(encoding.0) - .with_missing_is_null(!missing_utf8_is_empty_string) - .with_comment_prefix(comment_prefix) - .with_null_values(null_values) - .with_try_parse_dates(try_parse_dates) - .with_quote_char(quote_char) - .with_eol_char(eol_char) - .with_truncate_ragged_lines(truncate_ragged_lines) - .with_decimal_comma(decimal_comma), - ) - .into_reader_with_file_handle(mmap_bytes_r) - .finish() - .map_err(PyPolarsErr::from) - })?; - Ok(df.into()) - } - - #[staticmethod] - #[cfg(feature = "parquet")] - #[pyo3(signature = (py_f, columns, projection, n_rows, row_index, low_memory, parallel, use_statistics, rechunk))] - pub fn read_parquet( - py: Python, - py_f: PyObject, - columns: Option>, - projection: Option>, - n_rows: Option, - row_index: Option<(String, IdxSize)>, - low_memory: bool, - parallel: Wrap, - use_statistics: bool, - rechunk: bool, - ) -> PyResult { - use EitherRustPythonFile::*; - - let row_index = row_index.map(|(name, offset)| RowIndex { - name: name.into(), - offset, - }); - let result = match get_either_file(py_f, false)? 
{ - Py(f) => { - let buf = f.as_buffer(); - py.allow_threads(move || { - ParquetReader::new(buf) - .with_projection(projection) - .with_columns(columns) - .read_parallel(parallel.0) - .with_slice(n_rows.map(|x| (0, x))) - .with_row_index(row_index) - .set_low_memory(low_memory) - .use_statistics(use_statistics) - .set_rechunk(rechunk) - .finish() - }) - }, - Rust(f) => py.allow_threads(move || { - ParquetReader::new(f) - .with_projection(projection) - .with_columns(columns) - .read_parallel(parallel.0) - .with_slice(n_rows.map(|x| (0, x))) - .with_row_index(row_index) - .use_statistics(use_statistics) - .set_rechunk(rechunk) - .finish() - }), - }; - let df = result.map_err(PyPolarsErr::from)?; - Ok(PyDataFrame::new(df)) - } - #[staticmethod] #[cfg(feature = "json")] pub fn read_json( @@ -220,69 +56,6 @@ impl PyDataFrame { }) } - #[staticmethod] - #[cfg(feature = "json")] - pub fn read_ndjson( - py: Python, - mut py_f: Bound, - ignore_errors: bool, - schema: Option>, - schema_overrides: Option>, - ) -> PyResult { - py_f = read_if_bytesio(py_f); - let mmap_bytes_r = get_mmap_bytes_reader(&py_f)?; - - let mut builder = JsonReader::new(mmap_bytes_r) - .with_json_format(JsonFormat::JsonLines) - .with_ignore_errors(ignore_errors); - - if let Some(schema) = schema { - builder = builder.with_schema(Arc::new(schema.0)); - } - - if let Some(schema) = schema_overrides.as_ref() { - builder = builder.with_schema_overwrite(&schema.0); - } - - let out = py - .allow_threads(move || builder.finish()) - .map_err(|e| PyPolarsErr::Other(format!("{e}")))?; - Ok(out.into()) - } - - #[staticmethod] - #[cfg(feature = "ipc")] - #[pyo3(signature = (py_f, columns, projection, n_rows, row_index, memory_map))] - pub fn read_ipc( - py: Python, - mut py_f: Bound, - columns: Option>, - projection: Option>, - n_rows: Option, - row_index: Option<(String, IdxSize)>, - memory_map: bool, - ) -> PyResult { - let row_index = row_index.map(|(name, offset)| RowIndex { - name: name.into(), - offset, - }); - py_f = read_if_bytesio(py_f); - let (mmap_bytes_r, mmap_path) = get_mmap_bytes_reader_and_path(&py_f)?; - - let mmap_path = if memory_map { mmap_path } else { None }; - let df = py.allow_threads(move || { - IpcReader::new(mmap_bytes_r) - .with_projection(projection) - .with_columns(columns) - .with_n_rows(n_rows) - .with_row_index(row_index) - .memory_mapped(mmap_path) - .finish() - .map_err(PyPolarsErr::from) - })?; - Ok(PyDataFrame::new(df)) - } - #[staticmethod] #[cfg(feature = "ipc_streaming")] #[pyo3(signature = (py_f, columns, projection, n_rows, row_index, rechunk))] diff --git a/crates/polars-python/src/file.rs b/crates/polars-python/src/file.rs index 3cbb3d364e2f..6caaf3bb05b4 100644 --- a/crates/polars-python/src/file.rs +++ b/crates/polars-python/src/file.rs @@ -206,6 +206,7 @@ impl EitherRustPythonFile { pub enum EitherPythonFileOrPath { Py(PyFileLikeObject), Path(PathBuf), + File(File), } pub fn get_either_file_or_path(py_f: PyObject, write: bool) -> PyResult { @@ -223,6 +224,55 @@ pub fn get_either_file_or_path(py_f: PyObject, write: bool) -> PyResult()) + .ok() + }) + .flatten() + .map(|fileno| unsafe { + // `File::from_raw_fd()` takes the ownership of the file descriptor. + // When the File is dropped, it closes the file descriptor. + // This is undesired - the Python file object will become invalid. + // Therefore, we duplicate the file descriptor here. + // Closing the duplicated file descriptor will not close + // the original file descriptor; + // and the status, e.g. 
stream position, is still shared with + // the original file descriptor. + // We use `F_DUPFD_CLOEXEC` here instead of `dup()` + // because it also sets the `O_CLOEXEC` flag on the duplicated file descriptor, + // which `dup()` clears. + // `open()` in both Rust and Python automatically set `O_CLOEXEC` flag; + // it prevents leaking file descriptors across processes, + // and we want to be consistent with them. + // `F_DUPFD_CLOEXEC` is defined in POSIX.1-2008 + // and is present on all alive UNIX(-like) systems. + libc::fcntl(fileno, libc::F_DUPFD_CLOEXEC, 0) + }) + .filter(|fileno| *fileno != -1) + .map(|fileno| fileno as RawFd) + { + return Ok(EitherPythonFileOrPath::File(unsafe { + File::from_raw_fd(fd) + })); + } + // BytesIO / StringIO is relatively fast, and some code relies on it. if !py_f.is_exact_instance(&io.getattr("BytesIO").unwrap()) || !py_f.is_exact_instance(&io.getattr("StringIO").unwrap()) diff --git a/crates/polars-python/src/lazyframe/general.rs b/crates/polars-python/src/lazyframe/general.rs index e09d5cb7f309..11266a696d5e 100644 --- a/crates/polars-python/src/lazyframe/general.rs +++ b/crates/polars-python/src/lazyframe/general.rs @@ -26,8 +26,9 @@ fn pyobject_to_first_path_and_scan_sources( use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; Ok(match get_either_file_or_path(obj, false)? { EitherPythonFileOrPath::Path(path) => { - (Some(path.clone()), ScanSources::Files([path].into())) + (Some(path.clone()), ScanSources::Paths([path].into())) }, + EitherPythonFileOrPath::File(file) => (None, ScanSources::Files([file].into())), EitherPythonFileOrPath::Py(f) => (None, ScanSources::Buffers([f.as_bytes()].into())), }) } diff --git a/crates/polars-stream/src/utils/late_materialized_df.rs b/crates/polars-stream/src/utils/late_materialized_df.rs index 9e7322167f7f..b18c5cea0657 100644 --- a/crates/polars-stream/src/utils/late_materialized_df.rs +++ b/crates/polars-stream/src/utils/late_materialized_df.rs @@ -25,7 +25,7 @@ impl LateMaterializedDataFrame { fmt_str: "LateMaterializedDataFrame", }); IR::Scan { - sources: ScanSources::Files(Arc::default()), + sources: ScanSources::Paths(Arc::default()), file_info: FileInfo::new(schema, None, (None, usize::MAX)), hive_parts: None, predicate: None, diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index d8bc983e0fcc..61349439fed8 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -2,7 +2,7 @@ import contextlib import os -from io import BytesIO, StringIO +import io from pathlib import Path from typing import IO, TYPE_CHECKING, Any, Callable, Mapping, Sequence @@ -14,10 +14,9 @@ is_str_sequence, normalize_filepath, ) -from polars._utils.wrap import wrap_df, wrap_ldf +from polars._utils.wrap import wrap_ldf from polars.datatypes import N_INFER_DEFAULT, String, parse_into_dtype from polars.io._utils import ( - is_glob_pattern, parse_columns_arg, parse_row_index_args, prepare_file_arg, @@ -26,7 +25,7 @@ from polars.io.csv.batched_reader import BatchedCsvReader with contextlib.suppress(ImportError): # Module not available when building docs - from polars.polars import PyDataFrame, PyLazyFrame + from polars.polars import PyLazyFrame if TYPE_CHECKING: from polars import DataFrame, LazyFrame @@ -564,15 +563,8 @@ def _read_csv_impl( decimal_comma: bool = False, glob: bool = True, ) -> DataFrame: - path: str | None - if isinstance(source, (str, Path)): - path = normalize_filepath(source, check_not_directory=False) - else: - path = None - if 
isinstance(source, BytesIO): - source = source.getvalue() - if isinstance(source, StringIO): - source = source.getvalue().encode() + if isinstance(source, (bytes, memoryview, bytearray)): + source = io.BytesIO(source) dtype_list: Sequence[tuple[str, PolarsDataType]] | None = None dtype_slice: Sequence[PolarsDataType] | None = None @@ -587,93 +579,58 @@ def _read_csv_impl( msg = f"`schema_overrides` should be of type list or dict, got {type(schema_overrides).__name__!r}" raise TypeError(msg) - processed_null_values = _process_null_values(null_values) - if isinstance(columns, str): columns = [columns] - if isinstance(source, str) and is_glob_pattern(source): - dtypes_dict = None - if dtype_list is not None: - dtypes_dict = dict(dtype_list) - if dtype_slice is not None: - msg = ( - "cannot use glob patterns and unnamed dtypes as `schema_overrides` argument" - "\n\nUse `schema_overrides`: Mapping[str, Type[DataType]]" - ) - raise ValueError(msg) - from polars import scan_csv - scan = scan_csv( - source, - has_header=has_header, - separator=separator, - comment_prefix=comment_prefix, - quote_char=quote_char, - skip_rows=skip_rows, - schema=schema, - schema_overrides=dtypes_dict, - null_values=null_values, - missing_utf8_is_empty_string=missing_utf8_is_empty_string, - ignore_errors=ignore_errors, - infer_schema_length=infer_schema_length, - n_rows=n_rows, - low_memory=low_memory, - rechunk=rechunk, - skip_rows_after_header=skip_rows_after_header, - row_index_name=row_index_name, - row_index_offset=row_index_offset, - eol_char=eol_char, - raise_if_empty=raise_if_empty, - truncate_ragged_lines=truncate_ragged_lines, - decimal_comma=decimal_comma, - glob=glob, + dtypes_dict = None + if dtype_list is not None: + dtypes_dict = dict(dtype_list) + if dtype_slice is not None: + msg = ( + "cannot use glob patterns and unnamed dtypes as `schema_overrides` argument" + "\n\nUse `schema_overrides`: Mapping[str, Type[DataType]]" ) - if columns is None: - return scan.collect() - elif is_str_sequence(columns, allow_str=False): - return scan.select(columns).collect() - else: - msg = ( - "cannot use glob patterns and integer based projection as `columns` argument" - "\n\nUse columns: List[str]" - ) - raise ValueError(msg) - - projection, columns = parse_columns_arg(columns) + raise ValueError(msg) + from polars import scan_csv - pydf = PyDataFrame.read_csv( + scan = scan_csv( source, - infer_schema_length, - batch_size, - has_header, - ignore_errors, - n_rows, - skip_rows, - projection, - separator, - rechunk, - columns, - encoding, - n_threads, - path, - dtype_list, - dtype_slice, - low_memory, - comment_prefix, - quote_char, - processed_null_values, - missing_utf8_is_empty_string, - try_parse_dates, - skip_rows_after_header, - parse_row_index_args(row_index_name, row_index_offset), - sample_size=sample_size, + has_header=has_header, + separator=separator, + comment_prefix=comment_prefix, + quote_char=quote_char, + skip_rows=skip_rows, + schema=schema, + schema_overrides=dtypes_dict, + null_values=null_values, + missing_utf8_is_empty_string=missing_utf8_is_empty_string, + ignore_errors=ignore_errors, + infer_schema_length=infer_schema_length, + n_rows=n_rows, + encoding=encoding, + low_memory=low_memory, + rechunk=rechunk, + skip_rows_after_header=skip_rows_after_header, + row_index_name=row_index_name, + row_index_offset=row_index_offset, eol_char=eol_char, raise_if_empty=raise_if_empty, truncate_ragged_lines=truncate_ragged_lines, decimal_comma=decimal_comma, - schema=schema, + glob=glob, + 
try_parse_dates=try_parse_dates, ) - return wrap_df(pydf) + + if columns is None: + return scan.collect() + elif is_str_sequence(columns, allow_str=False): + return scan.select(columns).collect() + else: + msg = ( + "cannot use glob patterns and integer based projection as `columns` argument" + "\n\nUse columns: List[str]" + ) + raise ValueError(msg) @deprecate_renamed_parameter("dtypes", "schema_overrides", version="0.20.31") @@ -1241,10 +1198,10 @@ def with_column_names(cols: list[str]) -> list[str]: if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - elif isinstance(source, (BytesIO, StringIO)) or ( + elif isinstance(source, io.IOBase) or ( isinstance(source, list) and len(source) > 0 - and isinstance(source[0], (BytesIO, StringIO)) + and isinstance(source[0], io.IOBase) ): pass else: diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index 5142b2ae68c6..a318ed8d62a4 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -2,6 +2,7 @@ import contextlib import os +import io from pathlib import Path from typing import IO, TYPE_CHECKING, Any, Sequence @@ -15,8 +16,6 @@ from polars._utils.wrap import wrap_df, wrap_ldf from polars.dependencies import import_optional from polars.io._utils import ( - is_glob_pattern, - is_local_file, parse_columns_arg, parse_row_index_args, prepare_file_arg, @@ -176,42 +175,31 @@ def _read_ipc_impl( rechunk: bool = True, memory_map: bool = True, ) -> DataFrame: - if isinstance(source, (str, Path)): - source = normalize_filepath(source, check_not_directory=False) + if isinstance(source, (memoryview, bytearray, bytes)): + source = io.BytesIO(source) + if isinstance(columns, str): columns = [columns] - if isinstance(source, str) and is_glob_pattern(source) and is_local_file(source): - scan = scan_ipc( - source, - n_rows=n_rows, - rechunk=rechunk, - row_index_name=row_index_name, - row_index_offset=row_index_offset, - memory_map=memory_map, - ) - if columns is None: - df = scan.collect() - elif is_str_sequence(columns, allow_str=False): - df = scan.select(columns).collect() - else: - msg = ( - "cannot use glob patterns and integer based projection as `columns` argument" - "\n\nUse columns: List[str]" - ) - raise TypeError(msg) - return df - - projection, columns = parse_columns_arg(columns) - pydf = PyDataFrame.read_ipc( + scan = scan_ipc( source, - columns, - projection, - n_rows, - parse_row_index_args(row_index_name, row_index_offset), + n_rows=n_rows, + rechunk=rechunk, + row_index_name=row_index_name, + row_index_offset=row_index_offset, memory_map=memory_map, ) - return wrap_df(pydf) + if columns is None: + df = scan.collect() + elif is_str_sequence(columns, allow_str=False): + df = scan.select(columns).collect() + else: + msg = ( + "cannot use glob patterns and integer based projection as `columns` argument" + "\n\nUse columns: List[str]" + ) + raise TypeError(msg) + return df @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index 6cff4ddb1643..ba1d120890bb 100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -3,11 +3,11 @@ import contextlib from io import BytesIO, StringIO from pathlib import Path -from typing import IO, TYPE_CHECKING, Any, Sequence +from typing import IO, TYPE_CHECKING, Any from polars._utils.deprecation import deprecate_renamed_parameter from polars._utils.various import 
normalize_filepath -from polars._utils.wrap import wrap_df, wrap_ldf +from polars._utils.wrap import wrap_ldf from polars.datatypes import N_INFER_DEFAULT from polars.io._utils import parse_row_index_args @@ -120,29 +120,6 @@ def read_ndjson( │ 3 ┆ 8 │ └─────┴─────┘ """ - if not ( - isinstance(source, (str, Path)) - or isinstance(source, Sequence) - and source - and isinstance(source[0], (str, Path)) - ): - # TODO: A lot of the parameters aren't applied for BytesIO - if isinstance(source, StringIO): - source = BytesIO(source.getvalue().encode()) - - pydf = PyDataFrame.read_ndjson( - source, - ignore_errors=ignore_errors, - schema=schema, - schema_overrides=schema_overrides, - ) - - df = wrap_df(pydf) - - if n_rows: - df = df.head(n_rows) - - return df return scan_ndjson( source, # type: ignore[arg-type] diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 04cccc85fc5d..6320a0072578 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -171,9 +171,8 @@ def read_parquet( memory_map=memory_map, rechunk=rechunk, ) - # Read file and bytes inputs using `read_parquet` - if isinstance(source, bytes): + elif isinstance(source, bytes): source = io.BytesIO(source) elif isinstance(source, list) and len(source) > 0 and isinstance(source[0], bytes): assert all(isinstance(s, bytes) for s in source) @@ -233,7 +232,7 @@ def _read_parquet_with_pyarrow( sources: list[str | Path | IO[bytes] | bytes | list[str] | list[Path]] = [] if isinstance(source, list): - if len(source) > 0 and isinstance(source[0], (bytes, io.BytesIO)): + if len(source) > 0 and isinstance(source[0], (bytes, io.IOBase)): sources = source # type: ignore[assignment] else: sources = [source] # type: ignore[list-item] @@ -416,10 +415,10 @@ def scan_parquet( if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - elif isinstance(source, io.BytesIO) or ( + elif isinstance(source, io.IOBase) or ( isinstance(source, list) and len(source) > 0 - and isinstance(source[0], io.BytesIO) + and isinstance(source[0], io.IOBase) ): pass else: From 0b49b136ef2581abdedb494b081632a379640db4 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Sun, 8 Sep 2024 13:54:54 +0200 Subject: [PATCH 17/27] add bytes and file descriptors --- crates/polars-plan/src/plans/ir/mod.rs | 2 +- crates/polars-python/src/conversion/mod.rs | 8 +- crates/polars-python/src/dataframe/io.rs | 229 +++++++++++++++++- crates/polars-python/src/file.rs | 11 +- crates/polars-python/src/lazyframe/general.rs | 1 + py-polars/polars/_utils/various.py | 18 ++ py-polars/polars/io/csv/functions.py | 158 +++++++----- py-polars/polars/io/ipc/functions.py | 82 ++++--- py-polars/polars/io/ndjson.py | 58 +++-- py-polars/polars/io/parquet/functions.py | 15 +- py-polars/tests/unit/io/test_csv.py | 5 + py-polars/tests/unit/io/test_ipc.py | 4 + py-polars/tests/unit/io/test_parquet.py | 13 + 13 files changed, 475 insertions(+), 129 deletions(-) diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index 67ba46d4aca2..cf0b5ee8df7d 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -239,7 +239,7 @@ impl ScanSources { pub fn get(&self, idx: usize) -> Option { match self { Self::Paths(paths) => paths.get(idx).map(|p| ScanSourceRef::Path(p)), - Self::Files(files) => files.get(idx).map(|f| ScanSourceRef::File(f)), + Self::Files(files) => files.get(idx).map(ScanSourceRef::File), 
Self::Buffers(buffers) => buffers.get(idx).map(ScanSourceRef::Buffer), } } diff --git a/crates/polars-python/src/conversion/mod.rs b/crates/polars-python/src/conversion/mod.rs index ec05729acc81..4eee205d8550 100644 --- a/crates/polars-python/src/conversion/mod.rs +++ b/crates/polars-python/src/conversion/mod.rs @@ -571,6 +571,11 @@ impl<'py> FromPyObject<'py> for Wrap { sources.push(file); MutableSources::Files(sources) }, + EitherPythonFileOrPath::Buffer(buffer) => { + let mut sources = Vec::with_capacity(num_items); + sources.push(buffer); + MutableSources::Buffers(sources) + }, }; for source in iter { @@ -578,9 +583,10 @@ impl<'py> FromPyObject<'py> for Wrap { (MutableSources::Paths(v), EitherPythonFileOrPath::Path(p)) => v.push(p), (MutableSources::Files(v), EitherPythonFileOrPath::File(f)) => v.push(f), (MutableSources::Buffers(v), EitherPythonFileOrPath::Py(f)) => v.push(f.as_bytes()), + (MutableSources::Buffers(v), EitherPythonFileOrPath::Buffer(f)) => v.push(f), _ => { return Err(PyTypeError::new_err( - "Cannot combine in-memory bytes and paths for scan sources", + "Cannot combine in-memory bytes, paths and files for scan sources", )) }, } diff --git a/crates/polars-python/src/dataframe/io.rs b/crates/polars-python/src/dataframe/io.rs index d56334d35ad0..12707e93dd85 100644 --- a/crates/polars-python/src/dataframe/io.rs +++ b/crates/polars-python/src/dataframe/io.rs @@ -10,6 +10,7 @@ use polars::prelude::*; #[cfg(feature = "parquet")] use polars_parquet::arrow::write::StatisticsOptions; use pyo3::prelude::*; +use pyo3::pybacked::PyBackedStr; use super::PyDataFrame; #[cfg(feature = "parquet")] @@ -17,13 +18,176 @@ use crate::conversion::parse_parquet_compression; use crate::conversion::Wrap; use crate::error::PyPolarsErr; use crate::file::{ - get_either_file, get_file_like, get_mmap_bytes_reader, + get_either_file, get_file_like, get_mmap_bytes_reader, get_mmap_bytes_reader_and_path, read_if_bytesio, EitherRustPythonFile, }; use crate::prelude::PyCompatLevel; #[pymethods] impl PyDataFrame { + #[staticmethod] + #[cfg(feature = "csv")] + #[pyo3(signature = ( + py_f, infer_schema_length, chunk_size, has_header, ignore_errors, n_rows, + skip_rows, projection, separator, rechunk, columns, encoding, n_threads, path, + overwrite_dtype, overwrite_dtype_slice, low_memory, comment_prefix, quote_char, + null_values, missing_utf8_is_empty_string, try_parse_dates, skip_rows_after_header, + row_index, sample_size, eol_char, raise_if_empty, truncate_ragged_lines, decimal_comma, schema) +)] + pub fn read_csv( + py: Python, + mut py_f: Bound, + infer_schema_length: Option, + chunk_size: usize, + has_header: bool, + ignore_errors: bool, + n_rows: Option, + skip_rows: usize, + projection: Option>, + separator: &str, + rechunk: bool, + columns: Option>, + encoding: Wrap, + n_threads: Option, + path: Option, + overwrite_dtype: Option)>>, + overwrite_dtype_slice: Option>>, + low_memory: bool, + comment_prefix: Option<&str>, + quote_char: Option<&str>, + null_values: Option>, + missing_utf8_is_empty_string: bool, + try_parse_dates: bool, + skip_rows_after_header: usize, + row_index: Option<(String, IdxSize)>, + sample_size: usize, + eol_char: &str, + raise_if_empty: bool, + truncate_ragged_lines: bool, + decimal_comma: bool, + schema: Option>, + ) -> PyResult { + let null_values = null_values.map(|w| w.0); + let eol_char = eol_char.as_bytes()[0]; + let row_index = row_index.map(|(name, offset)| RowIndex { + name: name.into(), + offset, + }); + let quote_char = quote_char.and_then(|s| 
s.as_bytes().first().copied()); + + let overwrite_dtype = overwrite_dtype.map(|overwrite_dtype| { + overwrite_dtype + .iter() + .map(|(name, dtype)| { + let dtype = dtype.0.clone(); + Field::new((&**name).into(), dtype) + }) + .collect::() + }); + + let overwrite_dtype_slice = overwrite_dtype_slice.map(|overwrite_dtype| { + overwrite_dtype + .iter() + .map(|dt| dt.0.clone()) + .collect::>() + }); + + py_f = read_if_bytesio(py_f); + let mmap_bytes_r = get_mmap_bytes_reader(&py_f)?; + let df = py.allow_threads(move || { + CsvReadOptions::default() + .with_path(path) + .with_infer_schema_length(infer_schema_length) + .with_has_header(has_header) + .with_n_rows(n_rows) + .with_skip_rows(skip_rows) + .with_ignore_errors(ignore_errors) + .with_projection(projection.map(Arc::new)) + .with_rechunk(rechunk) + .with_chunk_size(chunk_size) + .with_columns(columns.map(|x| x.into_iter().map(|x| x.into()).collect())) + .with_n_threads(n_threads) + .with_schema_overwrite(overwrite_dtype.map(Arc::new)) + .with_dtype_overwrite(overwrite_dtype_slice.map(Arc::new)) + .with_schema(schema.map(|schema| Arc::new(schema.0))) + .with_low_memory(low_memory) + .with_skip_rows_after_header(skip_rows_after_header) + .with_row_index(row_index) + .with_sample_size(sample_size) + .with_raise_if_empty(raise_if_empty) + .with_parse_options( + CsvParseOptions::default() + .with_separator(separator.as_bytes()[0]) + .with_encoding(encoding.0) + .with_missing_is_null(!missing_utf8_is_empty_string) + .with_comment_prefix(comment_prefix) + .with_null_values(null_values) + .with_try_parse_dates(try_parse_dates) + .with_quote_char(quote_char) + .with_eol_char(eol_char) + .with_truncate_ragged_lines(truncate_ragged_lines) + .with_decimal_comma(decimal_comma), + ) + .into_reader_with_file_handle(mmap_bytes_r) + .finish() + .map_err(PyPolarsErr::from) + })?; + Ok(df.into()) + } + + #[staticmethod] + #[cfg(feature = "parquet")] + #[pyo3(signature = (py_f, columns, projection, n_rows, row_index, low_memory, parallel, use_statistics, rechunk))] + pub fn read_parquet( + py: Python, + py_f: PyObject, + columns: Option>, + projection: Option>, + n_rows: Option, + row_index: Option<(String, IdxSize)>, + low_memory: bool, + parallel: Wrap, + use_statistics: bool, + rechunk: bool, + ) -> PyResult { + use EitherRustPythonFile::*; + + let row_index = row_index.map(|(name, offset)| RowIndex { + name: name.into(), + offset, + }); + let result = match get_either_file(py_f, false)? 
{ + Py(f) => { + let buf = f.as_buffer(); + py.allow_threads(move || { + ParquetReader::new(buf) + .with_projection(projection) + .with_columns(columns) + .read_parallel(parallel.0) + .with_slice(n_rows.map(|x| (0, x))) + .with_row_index(row_index) + .set_low_memory(low_memory) + .use_statistics(use_statistics) + .set_rechunk(rechunk) + .finish() + }) + }, + Rust(f) => py.allow_threads(move || { + ParquetReader::new(f) + .with_projection(projection) + .with_columns(columns) + .read_parallel(parallel.0) + .with_slice(n_rows.map(|x| (0, x))) + .with_row_index(row_index) + .use_statistics(use_statistics) + .set_rechunk(rechunk) + .finish() + }), + }; + let df = result.map_err(PyPolarsErr::from)?; + Ok(PyDataFrame::new(df)) + } + #[staticmethod] #[cfg(feature = "json")] pub fn read_json( @@ -56,6 +220,69 @@ impl PyDataFrame { }) } + #[staticmethod] + #[cfg(feature = "json")] + pub fn read_ndjson( + py: Python, + mut py_f: Bound, + ignore_errors: bool, + schema: Option>, + schema_overrides: Option>, + ) -> PyResult { + py_f = read_if_bytesio(py_f); + let mmap_bytes_r = get_mmap_bytes_reader(&py_f)?; + + let mut builder = JsonReader::new(mmap_bytes_r) + .with_json_format(JsonFormat::JsonLines) + .with_ignore_errors(ignore_errors); + + if let Some(schema) = schema { + builder = builder.with_schema(Arc::new(schema.0)); + } + + if let Some(schema) = schema_overrides.as_ref() { + builder = builder.with_schema_overwrite(&schema.0); + } + + let out = py + .allow_threads(move || builder.finish()) + .map_err(|e| PyPolarsErr::Other(format!("{e}")))?; + Ok(out.into()) + } + + #[staticmethod] + #[cfg(feature = "ipc")] + #[pyo3(signature = (py_f, columns, projection, n_rows, row_index, memory_map))] + pub fn read_ipc( + py: Python, + mut py_f: Bound, + columns: Option>, + projection: Option>, + n_rows: Option, + row_index: Option<(String, IdxSize)>, + memory_map: bool, + ) -> PyResult { + let row_index = row_index.map(|(name, offset)| RowIndex { + name: name.into(), + offset, + }); + py_f = read_if_bytesio(py_f); + let (mmap_bytes_r, mmap_path) = get_mmap_bytes_reader_and_path(&py_f)?; + + let mmap_path = if memory_map { mmap_path } else { None }; + let df = py.allow_threads(move || { + IpcReader::new(mmap_bytes_r) + .with_projection(projection) + .with_columns(columns) + .with_n_rows(n_rows) + .with_row_index(row_index) + .memory_mapped(mmap_path) + .finish() + .map_err(PyPolarsErr::from) + })?; + Ok(PyDataFrame::new(df)) + } + #[staticmethod] #[cfg(feature = "ipc_streaming")] #[pyo3(signature = (py_f, columns, projection, n_rows, row_index, rechunk))] diff --git a/crates/polars-python/src/file.rs b/crates/polars-python/src/file.rs index 6caaf3bb05b4..2857b37a4891 100644 --- a/crates/polars-python/src/file.rs +++ b/crates/polars-python/src/file.rs @@ -205,6 +205,7 @@ impl EitherRustPythonFile { pub enum EitherPythonFileOrPath { Py(PyFileLikeObject), + Buffer(bytes::Bytes), Path(PathBuf), File(File), } @@ -212,6 +213,14 @@ pub enum EitherPythonFileOrPath { pub fn get_either_file_or_path(py_f: PyObject, write: bool) -> PyResult { Python::with_gil(|py| { let py_f = py_f.into_bound(py); + + // If the pyobject is a `bytes` class + if let Ok(bytes) = py_f.downcast::() { + return Ok(EitherPythonFileOrPath::Buffer( + bytes::Bytes::copy_from_slice(bytes.as_bytes()), + )); + } + if let Ok(s) = py_f.extract::>() { let file_path = std::path::Path::new(&*s); let file_path = resolve_homedir(file_path); @@ -275,7 +284,7 @@ pub fn get_either_file_or_path(py_f: PyObject, write: bool) -> PyResult (None, 
ScanSources::Files([file].into())), EitherPythonFileOrPath::Py(f) => (None, ScanSources::Buffers([f.as_bytes()].into())), + EitherPythonFileOrPath::Buffer(buff) => (None, ScanSources::Buffers([buff].into())), }) } diff --git a/py-polars/polars/_utils/various.py b/py-polars/polars/_utils/various.py index 014e601de8e2..f82bbec0d785 100644 --- a/py-polars/polars/_utils/various.py +++ b/py-polars/polars/_utils/various.py @@ -84,6 +84,24 @@ def _is_iterable_of(val: Iterable[object], eltype: type | tuple[type, ...]) -> b return all(isinstance(x, eltype) for x in val) +def is_path_or_str_sequence( + val: object, *, allow_str: bool = False, include_series: bool = False +) -> TypeGuard[Sequence[str | Path]]: + """ + Check that `val` is a sequence of strings or paths. + + Note that a single string is a sequence of strings by definition, use + `allow_str=False` to return False on a single string. + """ + if allow_str is False and isinstance(val, str): + return False + elif _check_for_numpy(val) and isinstance(val, np.ndarray): + return np.issubdtype(val.dtype, np.str_) + elif include_series and isinstance(val, pl.Series): + return val.dtype == pl.String + return isinstance(val, Sequence) and _is_iterable_of(val, (Path, str)) + + def is_bool_sequence( val: object, *, include_series: bool = False ) -> TypeGuard[Sequence[bool]]: diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 61349439fed8..ceba49391560 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -2,7 +2,7 @@ import contextlib import os -import io +from io import BytesIO, StringIO from pathlib import Path from typing import IO, TYPE_CHECKING, Any, Callable, Mapping, Sequence @@ -11,12 +11,14 @@ from polars._utils.deprecation import deprecate_renamed_parameter from polars._utils.various import ( _process_null_values, + is_path_or_str_sequence, is_str_sequence, normalize_filepath, ) -from polars._utils.wrap import wrap_ldf +from polars._utils.wrap import wrap_df, wrap_ldf from polars.datatypes import N_INFER_DEFAULT, String, parse_into_dtype from polars.io._utils import ( + is_glob_pattern, parse_columns_arg, parse_row_index_args, prepare_file_arg, @@ -25,7 +27,7 @@ from polars.io.csv.batched_reader import BatchedCsvReader with contextlib.suppress(ImportError): # Module not available when building docs - from polars.polars import PyLazyFrame + from polars.polars import PyDataFrame, PyLazyFrame if TYPE_CHECKING: from polars import DataFrame, LazyFrame @@ -563,8 +565,15 @@ def _read_csv_impl( decimal_comma: bool = False, glob: bool = True, ) -> DataFrame: - if isinstance(source, (bytes, memoryview, bytearray)): - source = io.BytesIO(source) + path: str | None + if isinstance(source, (str, Path)): + path = normalize_filepath(source, check_not_directory=False) + else: + path = None + if isinstance(source, BytesIO): + source = source.getvalue() + if isinstance(source, StringIO): + source = source.getvalue().encode() dtype_list: Sequence[tuple[str, PolarsDataType]] | None = None dtype_slice: Sequence[PolarsDataType] | None = None @@ -579,58 +588,93 @@ def _read_csv_impl( msg = f"`schema_overrides` should be of type list or dict, got {type(schema_overrides).__name__!r}" raise TypeError(msg) + processed_null_values = _process_null_values(null_values) + if isinstance(columns, str): columns = [columns] + if isinstance(source, str) and is_glob_pattern(source): + dtypes_dict = None + if dtype_list is not None: + dtypes_dict = dict(dtype_list) + if dtype_slice is not 
None: + msg = ( + "cannot use glob patterns and unnamed dtypes as `schema_overrides` argument" + "\n\nUse `schema_overrides`: Mapping[str, Type[DataType]]" + ) + raise ValueError(msg) + from polars import scan_csv - dtypes_dict = None - if dtype_list is not None: - dtypes_dict = dict(dtype_list) - if dtype_slice is not None: - msg = ( - "cannot use glob patterns and unnamed dtypes as `schema_overrides` argument" - "\n\nUse `schema_overrides`: Mapping[str, Type[DataType]]" + scan = scan_csv( + source, + has_header=has_header, + separator=separator, + comment_prefix=comment_prefix, + quote_char=quote_char, + skip_rows=skip_rows, + schema=schema, + schema_overrides=dtypes_dict, + null_values=null_values, + missing_utf8_is_empty_string=missing_utf8_is_empty_string, + ignore_errors=ignore_errors, + infer_schema_length=infer_schema_length, + n_rows=n_rows, + low_memory=low_memory, + rechunk=rechunk, + skip_rows_after_header=skip_rows_after_header, + row_index_name=row_index_name, + row_index_offset=row_index_offset, + eol_char=eol_char, + raise_if_empty=raise_if_empty, + truncate_ragged_lines=truncate_ragged_lines, + decimal_comma=decimal_comma, + glob=glob, ) - raise ValueError(msg) - from polars import scan_csv + if columns is None: + return scan.collect() + elif is_str_sequence(columns, allow_str=False): + return scan.select(columns).collect() + else: + msg = ( + "cannot use glob patterns and integer based projection as `columns` argument" + "\n\nUse columns: List[str]" + ) + raise ValueError(msg) + + projection, columns = parse_columns_arg(columns) - scan = scan_csv( + pydf = PyDataFrame.read_csv( source, - has_header=has_header, - separator=separator, - comment_prefix=comment_prefix, - quote_char=quote_char, - skip_rows=skip_rows, - schema=schema, - schema_overrides=dtypes_dict, - null_values=null_values, - missing_utf8_is_empty_string=missing_utf8_is_empty_string, - ignore_errors=ignore_errors, - infer_schema_length=infer_schema_length, - n_rows=n_rows, - encoding=encoding, - low_memory=low_memory, - rechunk=rechunk, - skip_rows_after_header=skip_rows_after_header, - row_index_name=row_index_name, - row_index_offset=row_index_offset, + infer_schema_length, + batch_size, + has_header, + ignore_errors, + n_rows, + skip_rows, + projection, + separator, + rechunk, + columns, + encoding, + n_threads, + path, + dtype_list, + dtype_slice, + low_memory, + comment_prefix, + quote_char, + processed_null_values, + missing_utf8_is_empty_string, + try_parse_dates, + skip_rows_after_header, + parse_row_index_args(row_index_name, row_index_offset), + sample_size=sample_size, eol_char=eol_char, raise_if_empty=raise_if_empty, truncate_ragged_lines=truncate_ragged_lines, decimal_comma=decimal_comma, - glob=glob, - try_parse_dates=try_parse_dates, + schema=schema, ) - - if columns is None: - return scan.collect() - elif is_str_sequence(columns, allow_str=False): - return scan.select(columns).collect() - else: - msg = ( - "cannot use glob patterns and integer based projection as `columns` argument" - "\n\nUse columns: List[str]" - ) - raise ValueError(msg) + return wrap_df(pydf) @deprecate_renamed_parameter("dtypes", "schema_overrides", version="0.20.31") @@ -947,10 +991,12 @@ def scan_csv( | Path | IO[str] | IO[bytes] + | bytes | list[str] | list[Path] | list[IO[str]] - | list[IO[bytes]], + | list[IO[bytes]] + | list[bytes], *, has_header: bool = True, separator: str = ",", @@ -1198,19 +1244,9 @@ def with_column_names(cols: list[str]) -> list[str]: if isinstance(source, (str, Path)): source = 
normalize_filepath(source, check_not_directory=False) - elif isinstance(source, io.IOBase) or ( - isinstance(source, list) - and len(source) > 0 - and isinstance(source[0], io.IOBase) - ): - pass - else: + elif is_path_or_str_sequence(source, allow_str=False): source = [ - normalize_filepath( - source, # type: ignore[arg-type] - check_not_directory=False, - ) - for source in source + normalize_filepath(source, check_not_directory=False) for source in source ] if not infer_schema: @@ -1255,10 +1291,12 @@ def _scan_csv_impl( source: str | IO[str] | IO[bytes] + | bytes | list[str] | list[Path] | list[IO[str]] - | list[IO[bytes]], + | list[IO[bytes]] + | list[bytes], *, has_header: bool = True, separator: str = ",", diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index a318ed8d62a4..b704ce814ab8 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -2,7 +2,6 @@ import contextlib import os -import io from pathlib import Path from typing import IO, TYPE_CHECKING, Any, Sequence @@ -10,12 +9,15 @@ import polars.functions as F from polars._utils.deprecation import deprecate_renamed_parameter from polars._utils.various import ( + is_path_or_str_sequence, is_str_sequence, normalize_filepath, ) from polars._utils.wrap import wrap_df, wrap_ldf from polars.dependencies import import_optional from polars.io._utils import ( + is_glob_pattern, + is_local_file, parse_columns_arg, parse_row_index_args, prepare_file_arg, @@ -175,31 +177,42 @@ def _read_ipc_impl( rechunk: bool = True, memory_map: bool = True, ) -> DataFrame: - if isinstance(source, (memoryview, bytearray, bytes)): - source = io.BytesIO(source) - + if isinstance(source, (str, Path)): + source = normalize_filepath(source, check_not_directory=False) if isinstance(columns, str): columns = [columns] - scan = scan_ipc( + if isinstance(source, str) and is_glob_pattern(source) and is_local_file(source): + scan = scan_ipc( + source, + n_rows=n_rows, + rechunk=rechunk, + row_index_name=row_index_name, + row_index_offset=row_index_offset, + memory_map=memory_map, + ) + if columns is None: + df = scan.collect() + elif is_str_sequence(columns, allow_str=False): + df = scan.select(columns).collect() + else: + msg = ( + "cannot use glob patterns and integer based projection as `columns` argument" + "\n\nUse columns: List[str]" + ) + raise TypeError(msg) + return df + + projection, columns = parse_columns_arg(columns) + pydf = PyDataFrame.read_ipc( source, - n_rows=n_rows, - rechunk=rechunk, - row_index_name=row_index_name, - row_index_offset=row_index_offset, + columns, + projection, + n_rows, + parse_row_index_args(row_index_name, row_index_offset), memory_map=memory_map, ) - if columns is None: - df = scan.collect() - elif is_str_sequence(columns, allow_str=False): - df = scan.select(columns).collect() - else: - msg = ( - "cannot use glob patterns and integer based projection as `columns` argument" - "\n\nUse columns: List[str]" - ) - raise TypeError(msg) - return df + return wrap_df(pydf) @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @@ -334,7 +347,14 @@ def read_ipc_schema(source: str | Path | IO[bytes] | bytes) -> dict[str, DataTyp @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4") @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4") def scan_ipc( - source: str | Path | IO[bytes] | list[str] | list[Path] | list[IO[bytes]], + source: str + | Path + | IO[bytes] + | bytes + 
| list[str] + | list[Path] + | list[IO[bytes]] + | list[bytes], *, n_rows: int | None = None, cache: bool = True, @@ -414,21 +434,17 @@ def scan_ipc( include_file_paths Include the path of the source file(s) as a column with this name. """ - sources: list[str] | list[Path] | list[IO[bytes]] = [] + sources: list[str] | list[Path] | list[IO[bytes]] | list[bytes] = [] if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) elif isinstance(source, list): - if len(source) > 0: - if isinstance(source[0], (str, Path)): - sources = [ - normalize_filepath( - source, # type: ignore[arg-type] - check_not_directory=False, - ) - for source in source - ] - else: - sources = source + if is_path_or_str_sequence(source): + sources = [ + normalize_filepath(source, check_not_directory=False) + for source in source + ] + else: + sources = source source = None # type: ignore[assignment] diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index ba1d120890bb..8b4cf39076c1 100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -3,11 +3,11 @@ import contextlib from io import BytesIO, StringIO from pathlib import Path -from typing import IO, TYPE_CHECKING, Any +from typing import IO, TYPE_CHECKING, Any, Sequence from polars._utils.deprecation import deprecate_renamed_parameter -from polars._utils.various import normalize_filepath -from polars._utils.wrap import wrap_ldf +from polars._utils.various import is_path_or_str_sequence, normalize_filepath +from polars._utils.wrap import wrap_df, wrap_ldf from polars.datatypes import N_INFER_DEFAULT from polars.io._utils import parse_row_index_args @@ -120,6 +120,29 @@ def read_ndjson( │ 3 ┆ 8 │ └─────┴─────┘ """ + if not ( + isinstance(source, (str, Path)) + or isinstance(source, Sequence) + and source + and isinstance(source[0], (str, Path)) + ): + # TODO: A lot of the parameters aren't applied for BytesIO + if isinstance(source, StringIO): + source = BytesIO(source.getvalue().encode()) + + pydf = PyDataFrame.read_ndjson( + source, + ignore_errors=ignore_errors, + schema=schema, + schema_overrides=schema_overrides, + ) + + df = wrap_df(pydf) + + if n_rows: + df = df.head(n_rows) + + return df return scan_ndjson( source, # type: ignore[arg-type] @@ -234,26 +257,17 @@ def scan_ndjson( sources: list[str] | list[Path] | list[IO[str]] | list[IO[bytes]] = [] if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - elif isinstance(source, (BytesIO, StringIO)): - pass - elif ( - isinstance(source, list) - and len(source) > 0 - and isinstance(source[0], (BytesIO, StringIO)) - ): - sources = source - source = None # type: ignore[assignment] - else: - assert all(isinstance(s, (str, Path)) for s in source) - - sources = [ - normalize_filepath( - source, # type: ignore[arg-type] - check_not_directory=False, - ) - for source in source - ] + elif isinstance(source, list): + if is_path_or_str_sequence(source): + sources = [ + normalize_filepath(source, check_not_directory=False) + for source in source + ] + else: + sources = source + source = None # type: ignore[assignment] + if infer_schema_length == 0: msg = "'infer_schema_length' should be positive" raise ValueError(msg) diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 6320a0072578..bc434b05cc2d 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -11,6 +11,7 @@ from polars._utils.unstable import 
issue_unstable_warning from polars._utils.various import ( is_int_sequence, + is_path_or_str_sequence, normalize_filepath, ) from polars._utils.wrap import wrap_ldf @@ -171,8 +172,9 @@ def read_parquet( memory_map=memory_map, rechunk=rechunk, ) + # Read file and bytes inputs using `read_parquet` - elif isinstance(source, bytes): + if isinstance(source, bytes): source = io.BytesIO(source) elif isinstance(source, list) and len(source) > 0 and isinstance(source[0], bytes): assert all(isinstance(s, bytes) for s in source) @@ -415,16 +417,9 @@ def scan_parquet( if isinstance(source, (str, Path)): source = normalize_filepath(source, check_not_directory=False) - elif isinstance(source, io.IOBase) or ( - isinstance(source, list) - and len(source) > 0 - and isinstance(source[0], io.IOBase) - ): - pass - else: + elif is_path_or_str_sequence(source): source = [ - normalize_filepath(source, check_not_directory=False) # type: ignore[arg-type] - for source in source + normalize_filepath(source, check_not_directory=False) for source in source ] return _scan_parquet_impl( diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index eab89d3b7855..fcacedead1d4 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -953,6 +953,7 @@ def test_write_csv_separator() -> None: df.write_csv(f, separator="\t") f.seek(0) assert f.read() == b"a\tb\n1\t1\n2\t2\n3\t3\n" + f.seek(0) assert_frame_equal(df, pl.read_csv(f, separator="\t")) @@ -962,6 +963,7 @@ def test_write_csv_line_terminator() -> None: df.write_csv(f, line_terminator="\r\n") f.seek(0) assert f.read() == b"a,b\r\n1,1\r\n2,2\r\n3,3\r\n" + f.seek(0) assert_frame_equal(df, pl.read_csv(f, eol_char="\n")) @@ -996,6 +998,7 @@ def test_quoting_round_trip() -> None: } ) df.write_csv(f) + f.seek(0) read_df = pl.read_csv(f) assert_frame_equal(read_df, df) @@ -1183,6 +1186,7 @@ def test_csv_write_escape_headers() -> None: out = io.BytesIO() df1.write_csv(out) + out.seek(0) df2 = pl.read_csv(out) assert_frame_equal(df1, df2) assert df2.schema == {"c,o,l,u,m,n": pl.Int64} @@ -2279,4 +2283,5 @@ def test_read_csv_cast_unparsable_later( ) -> None: f = io.BytesIO() df.write_csv(f) + f.seek(0) assert df.equals(pl.read_csv(f, schema={"x": dtype})) diff --git a/py-polars/tests/unit/io/test_ipc.py b/py-polars/tests/unit/io/test_ipc.py index 18e19f4ec885..a1505cf3d2fc 100644 --- a/py-polars/tests/unit/io/test_ipc.py +++ b/py-polars/tests/unit/io/test_ipc.py @@ -44,11 +44,13 @@ def test_from_to_buffer( ) -> None: # use an ad-hoc buffer (file=None) buf1 = write_ipc(df, stream, None, compression=compression) + buf1.seek(0) read_df = read_ipc(stream, buf1, use_pyarrow=False) assert_frame_equal(df, read_df, categorical_as_str=True) # explicitly supply an existing buffer buf2 = io.BytesIO() + buf2.seek(0) write_ipc(df, stream, buf2, compression=compression) buf2.seek(0) read_df = read_ipc(stream, buf2, use_pyarrow=False) @@ -245,6 +247,7 @@ def test_list_nested_enum() -> None: df = pl.DataFrame(pl.Series("list_cat", [["a", "b", "c", None]], dtype=dtype)) buffer = io.BytesIO() df.write_ipc(buffer, compat_level=CompatLevel.newest()) + buffer.seek(0) df = pl.read_ipc(buffer) assert df.get_column("list_cat").dtype == dtype @@ -258,6 +261,7 @@ def test_struct_nested_enum() -> None: ) buffer = io.BytesIO() df.write_ipc(buffer, compat_level=CompatLevel.newest()) + buffer.seek(0) df = pl.read_ipc(buffer) assert df.get_column("struct_cat").dtype == dtype diff --git a/py-polars/tests/unit/io/test_parquet.py 
b/py-polars/tests/unit/io/test_parquet.py index db3186a3f874..3da465561bd1 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -685,6 +685,19 @@ def test_write_parquet_with_null_col(tmp_path: Path) -> None: assert_frame_equal(out, df) +@pytest.mark.write_disk +def test_scan_parquet_binary_buffered_reader(tmp_path: Path) -> None: + tmp_path.mkdir(exist_ok=True) + + df = pl.DataFrame({"a": [1, 2, 3]}) + file_path = tmp_path / "test.parquet" + df.write_parquet(file_path) + + with file_path.open("rb") as f: + out = pl.scan_parquet(f).collect() + assert_frame_equal(out, df) + + @pytest.mark.write_disk def test_read_parquet_binary_buffered_reader(tmp_path: Path) -> None: tmp_path.mkdir(exist_ok=True) From be81e80e9af6c64b3dd11f4efcefe3549bbbef9d Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Sun, 8 Sep 2024 13:57:56 +0200 Subject: [PATCH 18/27] mypy --- py-polars/polars/io/ipc/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index b704ce814ab8..17ee17d6843b 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -112,7 +112,7 @@ def read_ipc( raise ValueError(msg) lf = scan_ipc( - source, # type: ignore[arg-type] + source, n_rows=n_rows, memory_map=memory_map, storage_options=storage_options, From 29fe063c4c069adadc076246fae10b553061ffe6 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Sun, 8 Sep 2024 14:01:47 +0200 Subject: [PATCH 19/27] add bytes to ndjson --- py-polars/polars/io/ndjson.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index 8b4cf39076c1..8fc2addd6901 100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -170,10 +170,12 @@ def scan_ndjson( | Path | IO[str] | IO[bytes] + | bytes | list[str] | list[Path] | list[IO[str]] - | list[IO[bytes]], + | list[IO[bytes]] + | bytes, *, schema: SchemaDefinition | None = None, schema_overrides: SchemaDefinition | None = None, From 5daebe4e2d8178f98ec11ee54dc1c6ab8afd52d5 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Sun, 8 Sep 2024 14:28:42 +0200 Subject: [PATCH 20/27] clean up the code a bit --- .../src/executors/scan/csv.rs | 6 +- .../src/executors/scan/ipc.rs | 4 +- .../src/executors/scan/ndjson.rs | 6 +- .../src/executors/scan/parquet.rs | 4 +- .../polars-plan/src/plans/conversion/scans.rs | 4 +- .../polars-plan/src/plans/functions/count.rs | 2 +- crates/polars-plan/src/plans/ir/mod.rs | 246 +--------------- .../polars-plan/src/plans/ir/scan_sources.rs | 270 ++++++++++++++++++ crates/polars-python/src/conversion/mod.rs | 22 +- crates/polars-python/src/file.rs | 21 +- crates/polars-python/src/lazyframe/general.rs | 11 +- 11 files changed, 310 insertions(+), 286 deletions(-) create mode 100644 crates/polars-plan/src/plans/ir/scan_sources.rs diff --git a/crates/polars-mem-engine/src/executors/scan/csv.rs b/crates/polars-mem-engine/src/executors/scan/csv.rs index c00a0047d525..0ebcb7632ae7 100644 --- a/crates/polars-mem-engine/src/executors/scan/csv.rs +++ b/crates/polars-mem-engine/src/executors/scan/csv.rs @@ -55,9 +55,9 @@ impl CsvExec { let verbose = config::verbose(); let force_async = config::force_async(); - let run_async = (self.sources.is_files() && force_async) || self.sources.is_cloud_url(); + let run_async = (self.sources.is_paths() && force_async) || self.sources.is_cloud_url(); - if self.sources.is_files() && 
force_async && verbose { + if self.sources.is_paths() && force_async && verbose { eprintln!("ASYNC READING FORCED"); } @@ -75,7 +75,7 @@ impl CsvExec { .finish()?; if let Some(col) = &self.file_options.include_file_paths { - let name = source.to_file_path(); + let name = source.to_include_path_name(); unsafe { df.with_column_unchecked( diff --git a/crates/polars-mem-engine/src/executors/scan/ipc.rs b/crates/polars-mem-engine/src/executors/scan/ipc.rs index acbcc2d28dd6..edde4765fec5 100644 --- a/crates/polars-mem-engine/src/executors/scan/ipc.rs +++ b/crates/polars-mem-engine/src/executors/scan/ipc.rs @@ -29,7 +29,7 @@ impl IpcExec { }; let force_async = config::force_async(); - let mut out = if is_cloud || (self.sources.is_files() && force_async) { + let mut out = if is_cloud || (self.sources.is_paths() && force_async) { feature_gated!("cloud", { if force_async && config::verbose() { eprintln!("ASYNC READING FORCED"); @@ -102,7 +102,7 @@ impl IpcExec { self.file_options .include_file_paths .as_ref() - .map(|x| (x.clone(), Arc::from(source.to_file_path()))), + .map(|x| (x.clone(), Arc::from(source.to_include_path_name()))), ) .finish() }; diff --git a/crates/polars-mem-engine/src/executors/scan/ndjson.rs b/crates/polars-mem-engine/src/executors/scan/ndjson.rs index 06e1d18892c6..a662760fd54b 100644 --- a/crates/polars-mem-engine/src/executors/scan/ndjson.rs +++ b/crates/polars-mem-engine/src/executors/scan/ndjson.rs @@ -39,9 +39,9 @@ impl JsonExec { let verbose = config::verbose(); let force_async = config::force_async(); - let run_async = (self.sources.is_files() && force_async) || self.sources.is_cloud_url(); + let run_async = (self.sources.is_paths() && force_async) || self.sources.is_cloud_url(); - if self.sources.is_files() && force_async && verbose { + if self.sources.is_paths() && force_async && verbose { eprintln!("ASYNC READING FORCED"); } @@ -108,7 +108,7 @@ impl JsonExec { } if let Some(col) = &self.file_scan_options.include_file_paths { - let name = source.to_file_path(); + let name = source.to_include_path_name(); unsafe { df.with_column_unchecked( StringChunked::full(col.clone(), name, df.height()).into_series(), diff --git a/crates/polars-mem-engine/src/executors/scan/parquet.rs b/crates/polars-mem-engine/src/executors/scan/parquet.rs index 2f32e0b50aa3..a37fc7c42f33 100644 --- a/crates/polars-mem-engine/src/executors/scan/parquet.rs +++ b/crates/polars-mem-engine/src/executors/scan/parquet.rs @@ -161,7 +161,7 @@ impl ParquetExec { self.file_options .include_file_paths .as_ref() - .map(|x| (x.clone(), Arc::from(source.to_file_path()))), + .map(|x| (x.clone(), Arc::from(source.to_include_path_name()))), ); reader @@ -453,7 +453,7 @@ impl ParquetExec { let is_cloud = self.sources.is_cloud_url(); let force_async = config::force_async(); - let out = if is_cloud || (self.sources.is_files() && force_async) { + let out = if is_cloud || (self.sources.is_paths() && force_async) { feature_gated!("cloud", { if force_async && config::verbose() { eprintln!("ASYNC READING FORCED"); diff --git a/crates/polars-plan/src/plans/conversion/scans.rs b/crates/polars-plan/src/plans/conversion/scans.rs index 2b20d9fe932a..25dd61aa1eb9 100644 --- a/crates/polars-plan/src/plans/conversion/scans.rs +++ b/crates/polars-plan/src/plans/conversion/scans.rs @@ -152,7 +152,7 @@ pub(super) fn csv_file_info( // * See if we can do this without downloading the entire file // prints the error message if paths is empty. 
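The executor hunks above rename `is_files` to `is_paths` and `to_file_path` to `to_include_path_name`, making it explicit that the forced-async and cloud read paths only apply to path-backed sources, while buffers and open file handles are always read locally. When `include_file_paths` is requested for a non-path source, the column gets a placeholder instead of a real path. A small sketch of what that looks like from Python, assuming this series is applied (the column name `path` is arbitrary):

    import io
    import polars as pl

    df = pl.DataFrame({"a": [1, 2, 3]})
    buf = io.BytesIO()
    df.write_parquet(buf)
    buf.seek(0)

    out = pl.scan_parquet(buf, include_file_paths="path").collect()
    # Buffer sources are labelled "in-mem"; open file handles would show "open-file".
    print(out["path"][0])
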
- let run_async = sources.is_cloud_url() || (sources.is_files() && config::force_async()); + let run_async = sources.is_cloud_url() || (sources.is_paths() && config::force_async()); let cache_entries = { if run_async { @@ -268,7 +268,7 @@ pub(super) fn ndjson_file_info( polars_bail!(ComputeError: "expected at least 1 source"); }; - let run_async = sources.is_cloud_url() || (sources.is_files() && config::force_async()); + let run_async = sources.is_cloud_url() || (sources.is_paths() && config::force_async()); let cache_entries = { if run_async { diff --git a/crates/polars-plan/src/plans/functions/count.rs b/crates/polars-plan/src/plans/functions/count.rs index 0b16c8eac994..7375ff47ff31 100644 --- a/crates/polars-plan/src/plans/functions/count.rs +++ b/crates/polars-plan/src/plans/functions/count.rs @@ -226,7 +226,7 @@ pub(super) fn count_rows_ndjson( } let is_cloud_url = sources.is_cloud_url(); - let run_async = is_cloud_url || (sources.is_files() && config::force_async()); + let run_async = is_cloud_url || (sources.is_paths() && config::force_async()); let cache_entries = { if run_async { diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index cf0b5ee8df7d..a9eb45b6406f 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -1,22 +1,20 @@ mod dot; mod format; mod inputs; +mod scan_sources; mod schema; pub(crate) mod tree_format; use std::borrow::Cow; use std::fmt; -use std::fs::File; -use std::path::{Path, PathBuf}; pub use dot::{EscapeLabel, IRDotDisplay, PathsDisplay, ScanSourcesDisplay}; pub use format::{ExprIRDisplay, IRDisplay}; use hive::HivePartitions; -use polars_core::error::feature_gated; use polars_core::prelude::*; use polars_utils::idx_vec::UnitVec; -use polars_utils::mmap::MemSlice; use polars_utils::unitvec; +pub use scan_sources::{ScanSourceIter, ScanSourceRef, ScanSources}; #[cfg(feature = "ir_serde")] use serde::{Deserialize, Serialize}; @@ -36,246 +34,6 @@ pub struct IRPlanRef<'a> { pub expr_arena: &'a Arena, } -#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] -#[derive(Debug, Clone)] -pub enum ScanSources { - Paths(Arc<[PathBuf]>), - - #[cfg_attr(feature = "serde", serde(skip))] - Files(Arc<[File]>), - #[cfg_attr(feature = "serde", serde(skip))] - Buffers(Arc<[bytes::Bytes]>), -} - -impl std::hash::Hash for ScanSources { - fn hash(&self, state: &mut H) { - std::mem::discriminant(self).hash(state); - - // @NOTE: This is a bit crazy - match self { - Self::Paths(paths) => paths.hash(state), - Self::Files(files) => files.as_ptr().hash(state), - Self::Buffers(buffers) => buffers.as_ptr().hash(state), - } - } -} - -impl PartialEq for ScanSources { - fn eq(&self, other: &Self) -> bool { - match (self, other) { - (ScanSources::Paths(l), ScanSources::Paths(r)) => l == r, - _ => false, - } - } -} - -impl Eq for ScanSources {} - -#[derive(Debug, Clone, Copy)] -pub enum ScanSourceRef<'a> { - Path(&'a Path), - File(&'a File), - Buffer(&'a bytes::Bytes), -} - -pub struct ScanSourceSliceInfo { - pub item_slice: std::ops::Range, - pub source_slice: std::ops::Range, -} - -impl Default for ScanSources { - fn default() -> Self { - Self::Buffers(Arc::default()) - } -} - -impl<'a> ScanSourceRef<'a> { - pub fn to_file_path(&self) -> &str { - match self { - Self::Path(path) => path.to_str().unwrap(), - Self::File(_) => "open-file", - Self::Buffer(_) => "in-mem", - } - } - - pub fn to_memslice(&self) -> PolarsResult { - self.to_memslice_possibly_async(false, None, 0) - } - - pub 
fn to_memslice_async_latest(&self, run_async: bool) -> PolarsResult { - match self { - ScanSourceRef::Path(path) => { - let file = if run_async { - feature_gated!("cloud", { - polars_io::file_cache::FILE_CACHE - .get_entry(path.to_str().unwrap()) - // Safety: This was initialized by schema inference. - .unwrap() - .try_open_assume_latest()? - }) - } else { - polars_utils::open_file(path)? - }; - - Ok(MemSlice::from_mmap(Arc::new(unsafe { - memmap::Mmap::map(&file)? - }))) - }, - ScanSourceRef::File(file) => Ok(MemSlice::from_mmap(Arc::new(unsafe { - memmap::Mmap::map(*file)? - }))), - ScanSourceRef::Buffer(buff) => Ok(MemSlice::from_bytes((*buff).clone())), - } - } - - pub fn to_memslice_possibly_async( - &self, - run_async: bool, - #[cfg(feature = "cloud")] cache_entries: Option< - &Vec>, - >, - #[cfg(not(feature = "cloud"))] cache_entries: Option<&()>, - index: usize, - ) -> PolarsResult { - match self { - Self::Path(path) => { - let f = if run_async { - feature_gated!("cloud", { - cache_entries.unwrap()[index].try_open_check_latest()? - }) - } else { - polars_utils::open_file(path)? - }; - - let mmap = unsafe { memmap::Mmap::map(&f)? }; - Ok(MemSlice::from_mmap(Arc::new(mmap))) - }, - Self::File(file) => { - let mmap = unsafe { memmap::Mmap::map(*file)? }; - Ok(MemSlice::from_mmap(Arc::new(mmap))) - }, - Self::Buffer(buff) => Ok(MemSlice::from_bytes((*buff).clone())), - } - } -} - -impl ScanSources { - pub fn iter(&self) -> ScanSourceIter { - ScanSourceIter { - sources: self, - offset: 0, - } - } - - pub fn as_paths(&self) -> Option<&[PathBuf]> { - match self { - Self::Paths(paths) => Some(paths.as_ref()), - Self::Files(_) | Self::Buffers(_) => None, - } - } - - pub fn into_paths(&self) -> Option> { - match self { - Self::Paths(paths) => Some(paths.clone()), - Self::Files(_) | Self::Buffers(_) => None, - } - } - - pub fn first_path(&self) -> Option<&Path> { - match self { - Self::Paths(paths) => paths.first().map(|p| p.as_path()), - Self::Files(_) | Self::Buffers(_) => None, - } - } - - pub fn to_dsl(self, is_expanded: bool) -> DslScanSources { - DslScanSources { - sources: self, - is_expanded, - } - } - - pub fn is_files(&self) -> bool { - matches!(self, Self::Paths(_)) - } - - pub fn is_cloud_url(&self) -> bool { - match self { - Self::Paths(paths) => paths.first().map_or(false, polars_io::is_cloud_url), - Self::Files(_) | Self::Buffers(_) => false, - } - } - - pub fn len(&self) -> usize { - match self { - Self::Paths(s) => s.len(), - Self::Files(s) => s.len(), - Self::Buffers(s) => s.len(), - } - } - - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - - pub fn first(&self) -> Option { - self.get(0) - } - - pub fn id(&self) -> PlSmallStr { - if self.is_empty() { - return PlSmallStr::from_static("EMPTY"); - } - - match self { - Self::Paths(paths) => { - PlSmallStr::from_str(paths.first().unwrap().to_string_lossy().as_ref()) - }, - Self::Files(_) => PlSmallStr::from_static("OPEN_FILES"), - Self::Buffers(_) => PlSmallStr::from_static("IN_MEMORY"), - } - } - - pub fn get(&self, idx: usize) -> Option { - match self { - Self::Paths(paths) => paths.get(idx).map(|p| ScanSourceRef::Path(p)), - Self::Files(files) => files.get(idx).map(ScanSourceRef::File), - Self::Buffers(buffers) => buffers.get(idx).map(ScanSourceRef::Buffer), - } - } - - pub fn at(&self, idx: usize) -> ScanSourceRef { - self.get(idx).unwrap() - } -} - -pub struct ScanSourceIter<'a> { - sources: &'a ScanSources, - offset: usize, -} - -impl<'a> Iterator for ScanSourceIter<'a> { - type Item = ScanSourceRef<'a>; - - fn 
next(&mut self) -> Option { - let item = match self.sources { - ScanSources::Paths(paths) => ScanSourceRef::Path(paths.get(self.offset)?), - ScanSources::Files(files) => ScanSourceRef::File(files.get(self.offset)?), - ScanSources::Buffers(buffers) => ScanSourceRef::Buffer(buffers.get(self.offset)?), - }; - - self.offset += 1; - Some(item) - } - - fn size_hint(&self) -> (usize, Option) { - let len = self.sources.len() - self.offset; - (len, Some(len)) - } -} - -impl<'a> ExactSizeIterator for ScanSourceIter<'a> {} - /// [`IR`] is a representation of [`DslPlan`] with [`Node`]s which are allocated in an [`Arena`] /// In this IR the logical plan has access to the full dataset. #[derive(Clone, Debug, Default)] diff --git a/crates/polars-plan/src/plans/ir/scan_sources.rs b/crates/polars-plan/src/plans/ir/scan_sources.rs new file mode 100644 index 000000000000..5261d6ede706 --- /dev/null +++ b/crates/polars-plan/src/plans/ir/scan_sources.rs @@ -0,0 +1,270 @@ +use std::fs::File; +use std::path::{Path, PathBuf}; +use std::sync::Arc; + +use polars_core::error::{feature_gated, PolarsResult}; +use polars_utils::mmap::MemSlice; +use polars_utils::pl_str::PlSmallStr; + +use super::DslScanSources; + +/// Set of sources to scan from +/// +/// This is can either be a list of paths to files, opened files or in-memory buffers. Mixing of +/// buffers is not currently possible. +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +#[derive(Debug, Clone)] +pub enum ScanSources { + Paths(Arc<[PathBuf]>), + + #[cfg_attr(feature = "serde", serde(skip))] + Files(Arc<[File]>), + #[cfg_attr(feature = "serde", serde(skip))] + Buffers(Arc<[bytes::Bytes]>), +} + +/// A reference to a single item in [`ScanSources`] +#[derive(Debug, Clone, Copy)] +pub enum ScanSourceRef<'a> { + Path(&'a Path), + File(&'a File), + Buffer(&'a bytes::Bytes), +} + +/// An iterator for [`ScanSources`] +pub struct ScanSourceIter<'a> { + sources: &'a ScanSources, + offset: usize, +} + +impl Default for ScanSources { + fn default() -> Self { + Self::Buffers(Arc::default()) + } +} + +impl std::hash::Hash for ScanSources { + fn hash(&self, state: &mut H) { + std::mem::discriminant(self).hash(state); + + // @NOTE: This is a bit crazy + // + // We don't really want to hash the file descriptors or the whole buffers so for now we + // just settle with the fact that the memory behind Arc's does not really move. Therefore, + // we can just hash the pointer. + match self { + Self::Paths(paths) => paths.hash(state), + Self::Files(files) => files.as_ptr().hash(state), + Self::Buffers(buffers) => buffers.as_ptr().hash(state), + } + } +} + +impl PartialEq for ScanSources { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (ScanSources::Paths(l), ScanSources::Paths(r)) => l == r, + (ScanSources::Files(l), ScanSources::Files(r)) => std::ptr::eq(l.as_ptr(), r.as_ptr()), + (ScanSources::Buffers(l), ScanSources::Buffers(r)) => { + std::ptr::eq(l.as_ptr(), r.as_ptr()) + }, + _ => false, + } + } +} + +impl Eq for ScanSources {} + +impl ScanSources { + pub fn iter(&self) -> ScanSourceIter { + ScanSourceIter { + sources: self, + offset: 0, + } + } + + pub fn to_dsl(self, is_expanded: bool) -> DslScanSources { + DslScanSources { + sources: self, + is_expanded, + } + } + + /// Are the sources all paths? 
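The new `scan_sources.rs` module keeps path-backed and in-memory sources in separate enum variants, and the Python-side conversion further down in this patch refuses to mix them within one scan. A sketch of the two accepted shapes from Python, assuming this series is applied and using hypothetical file names:

    import io
    import polars as pl

    df = pl.DataFrame({"a": [1, 2, 3]})

    # Path-backed sources: globbing, cloud URLs and forced-async reads apply here.
    df.write_parquet("part-0.parquet")  # hypothetical paths
    df.write_parquet("part-1.parquet")
    by_path = pl.scan_parquet(["part-0.parquet", "part-1.parquet"]).collect()

    # In-memory sources: each buffer acts as one "file" of the scan.
    bufs = []
    for _ in range(2):
        b = io.BytesIO()
        df.write_parquet(b)
        b.seek(0)
        bufs.append(b)
    by_buffer = pl.scan_parquet(bufs).collect()

    assert by_path.height == by_buffer.height == 6
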
+ pub fn is_paths(&self) -> bool { + matches!(self, Self::Paths(_)) + } + + /// Try cast the scan sources to [`ScanSources::Paths`] + pub fn as_paths(&self) -> Option<&[PathBuf]> { + match self { + Self::Paths(paths) => Some(paths.as_ref()), + Self::Files(_) | Self::Buffers(_) => None, + } + } + + /// Try cast the scan sources to [`ScanSources::Paths`] with a clone + pub fn into_paths(&self) -> Option> { + match self { + Self::Paths(paths) => Some(paths.clone()), + Self::Files(_) | Self::Buffers(_) => None, + } + } + + /// Try get the first path in the scan sources + pub fn first_path(&self) -> Option<&Path> { + match self { + Self::Paths(paths) => paths.first().map(|p| p.as_path()), + Self::Files(_) | Self::Buffers(_) => None, + } + } + + /// Is the first path a cloud URL? + pub fn is_cloud_url(&self) -> bool { + self.first_path().is_some_and(polars_io::is_cloud_url) + } + + pub fn len(&self) -> usize { + match self { + Self::Paths(s) => s.len(), + Self::Files(s) => s.len(), + Self::Buffers(s) => s.len(), + } + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + pub fn first(&self) -> Option { + self.get(0) + } + + /// Turn the [`ScanSources`] into some kind of identifier + pub fn id(&self) -> PlSmallStr { + if self.is_empty() { + return PlSmallStr::from_static("EMPTY"); + } + + match self { + Self::Paths(paths) => { + PlSmallStr::from_str(paths.first().unwrap().to_string_lossy().as_ref()) + }, + Self::Files(_) => PlSmallStr::from_static("OPEN_FILES"), + Self::Buffers(_) => PlSmallStr::from_static("IN_MEMORY"), + } + } + + /// Get the scan source at specific address + pub fn get(&self, idx: usize) -> Option { + match self { + Self::Paths(paths) => paths.get(idx).map(|p| ScanSourceRef::Path(p)), + Self::Files(files) => files.get(idx).map(ScanSourceRef::File), + Self::Buffers(buffers) => buffers.get(idx).map(ScanSourceRef::Buffer), + } + } + + /// Get the scan source at specific address + /// + /// # Panics + /// + /// If the `idx` is out of range. + #[track_caller] + pub fn at(&self, idx: usize) -> ScanSourceRef { + self.get(idx).unwrap() + } +} + +impl<'a> ScanSourceRef<'a> { + /// Get the name for `include_paths` + pub fn to_include_path_name(&self) -> &str { + match self { + Self::Path(path) => path.to_str().unwrap(), + Self::File(_) => "open-file", + Self::Buffer(_) => "in-mem", + } + } + + /// Turn the scan source into a memory slice + pub fn to_memslice(&self) -> PolarsResult { + self.to_memslice_possibly_async(false, None, 0) + } + + pub fn to_memslice_async_latest(&self, run_async: bool) -> PolarsResult { + match self { + ScanSourceRef::Path(path) => { + let file = if run_async { + feature_gated!("cloud", { + polars_io::file_cache::FILE_CACHE + .get_entry(path.to_str().unwrap()) + // Safety: This was initialized by schema inference. + .unwrap() + .try_open_assume_latest()? + }) + } else { + polars_utils::open_file(path)? + }; + + Ok(MemSlice::from_mmap(Arc::new(unsafe { + memmap::Mmap::map(&file)? + }))) + }, + ScanSourceRef::File(file) => Ok(MemSlice::from_mmap(Arc::new(unsafe { + memmap::Mmap::map(*file)? 
+ }))), + ScanSourceRef::Buffer(buff) => Ok(MemSlice::from_bytes((*buff).clone())), + } + } + + pub fn to_memslice_possibly_async( + &self, + run_async: bool, + #[cfg(feature = "cloud")] cache_entries: Option< + &Vec>, + >, + #[cfg(not(feature = "cloud"))] cache_entries: Option<&()>, + index: usize, + ) -> PolarsResult { + match self { + Self::Path(path) => { + let f = if run_async { + feature_gated!("cloud", { + cache_entries.unwrap()[index].try_open_check_latest()? + }) + } else { + polars_utils::open_file(path)? + }; + + let mmap = unsafe { memmap::Mmap::map(&f)? }; + Ok(MemSlice::from_mmap(Arc::new(mmap))) + }, + Self::File(file) => { + let mmap = unsafe { memmap::Mmap::map(*file)? }; + Ok(MemSlice::from_mmap(Arc::new(mmap))) + }, + Self::Buffer(buff) => Ok(MemSlice::from_bytes((*buff).clone())), + } + } +} + +impl<'a> Iterator for ScanSourceIter<'a> { + type Item = ScanSourceRef<'a>; + + fn next(&mut self) -> Option { + let item = match self.sources { + ScanSources::Paths(paths) => ScanSourceRef::Path(paths.get(self.offset)?), + ScanSources::Files(files) => ScanSourceRef::File(files.get(self.offset)?), + ScanSources::Buffers(buffers) => ScanSourceRef::Buffer(buffers.get(self.offset)?), + }; + + self.offset += 1; + Some(item) + } + + fn size_hint(&self) -> (usize, Option) { + let len = self.sources.len() - self.offset; + (len, Some(len)) + } +} + +impl<'a> ExactSizeIterator for ScanSourceIter<'a> {} diff --git a/crates/polars-python/src/conversion/mod.rs b/crates/polars-python/src/conversion/mod.rs index 4eee205d8550..fd8e97cb7adc 100644 --- a/crates/polars-python/src/conversion/mod.rs +++ b/crates/polars-python/src/conversion/mod.rs @@ -32,7 +32,7 @@ use pyo3::pybacked::PyBackedStr; use pyo3::types::{PyDict, PyList, PySequence}; use crate::error::PyPolarsErr; -use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; +use crate::file::{get_python_scan_source_input, PythonScanSourceInput}; #[cfg(feature = "object")] use crate::object::OBJECT_NAME; use crate::prelude::*; @@ -549,29 +549,24 @@ impl<'py> FromPyObject<'py> for Wrap { let num_items = list.len(); let mut iter = list .into_iter() - .map(|val| get_either_file_or_path(val.unbind(), false)); + .map(|val| get_python_scan_source_input(val.unbind(), false)); let Some(first) = iter.next() else { return Ok(Wrap(ScanSources::default())); }; let mut sources = match first? { - EitherPythonFileOrPath::Py(f) => { - let mut sources = Vec::with_capacity(num_items); - sources.push(f.as_bytes()); - MutableSources::Buffers(sources) - }, - EitherPythonFileOrPath::Path(path) => { + PythonScanSourceInput::Path(path) => { let mut sources = Vec::with_capacity(num_items); sources.push(path); MutableSources::Paths(sources) }, - EitherPythonFileOrPath::File(file) => { + PythonScanSourceInput::File(file) => { let mut sources = Vec::with_capacity(num_items); sources.push(file); MutableSources::Files(sources) }, - EitherPythonFileOrPath::Buffer(buffer) => { + PythonScanSourceInput::Buffer(buffer) => { let mut sources = Vec::with_capacity(num_items); sources.push(buffer); MutableSources::Buffers(sources) @@ -580,10 +575,9 @@ impl<'py> FromPyObject<'py> for Wrap { for source in iter { match (&mut sources, source?) 
{ - (MutableSources::Paths(v), EitherPythonFileOrPath::Path(p)) => v.push(p), - (MutableSources::Files(v), EitherPythonFileOrPath::File(f)) => v.push(f), - (MutableSources::Buffers(v), EitherPythonFileOrPath::Py(f)) => v.push(f.as_bytes()), - (MutableSources::Buffers(v), EitherPythonFileOrPath::Buffer(f)) => v.push(f), + (MutableSources::Paths(v), PythonScanSourceInput::Path(p)) => v.push(p), + (MutableSources::Files(v), PythonScanSourceInput::File(f)) => v.push(f), + (MutableSources::Buffers(v), PythonScanSourceInput::Buffer(f)) => v.push(f), _ => { return Err(PyTypeError::new_err( "Cannot combine in-memory bytes, paths and files for scan sources", diff --git a/crates/polars-python/src/file.rs b/crates/polars-python/src/file.rs index 2857b37a4891..33d084c5130c 100644 --- a/crates/polars-python/src/file.rs +++ b/crates/polars-python/src/file.rs @@ -203,20 +203,22 @@ impl EitherRustPythonFile { } } -pub enum EitherPythonFileOrPath { - Py(PyFileLikeObject), +pub enum PythonScanSourceInput { Buffer(bytes::Bytes), Path(PathBuf), File(File), } -pub fn get_either_file_or_path(py_f: PyObject, write: bool) -> PyResult { +pub fn get_python_scan_source_input( + py_f: PyObject, + write: bool, +) -> PyResult { Python::with_gil(|py| { let py_f = py_f.into_bound(py); // If the pyobject is a `bytes` class if let Ok(bytes) = py_f.downcast::() { - return Ok(EitherPythonFileOrPath::Buffer( + return Ok(PythonScanSourceInput::Buffer( bytes::Bytes::copy_from_slice(bytes.as_bytes()), )); } @@ -224,7 +226,7 @@ pub fn get_either_file_or_path(py_f: PyObject, write: bool) -> PyResult>() { let file_path = std::path::Path::new(&*s); let file_path = resolve_homedir(file_path); - Ok(EitherPythonFileOrPath::Path(file_path)) + Ok(PythonScanSourceInput::Path(file_path)) } else { let io = py.import_bound("io").unwrap(); let is_utf8_encoding = |py_f: &Bound| -> PyResult { @@ -277,7 +279,7 @@ pub fn get_either_file_or_path(py_f: PyObject, write: bool) -> PyResult PyResult PyResult<(Option, ScanSources)> { - use crate::file::{get_either_file_or_path, EitherPythonFileOrPath}; - Ok(match get_either_file_or_path(obj, false)? { - EitherPythonFileOrPath::Path(path) => { + use crate::file::{get_python_scan_source_input, PythonScanSourceInput}; + Ok(match get_python_scan_source_input(obj, false)? 
{ + PythonScanSourceInput::Path(path) => { (Some(path.clone()), ScanSources::Paths([path].into())) }, - EitherPythonFileOrPath::File(file) => (None, ScanSources::Files([file].into())), - EitherPythonFileOrPath::Py(f) => (None, ScanSources::Buffers([f.as_bytes()].into())), - EitherPythonFileOrPath::Buffer(buff) => (None, ScanSources::Buffers([buff].into())), + PythonScanSourceInput::File(file) => (None, ScanSources::Files([file].into())), + PythonScanSourceInput::Buffer(buff) => (None, ScanSources::Buffers([buff].into())), }) } From 393e589391ff02e946135bbd28af21fea54dbab9 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Sun, 8 Sep 2024 14:35:23 +0200 Subject: [PATCH 21/27] fix mypy --- py-polars/polars/io/ndjson.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py-polars/polars/io/ndjson.py b/py-polars/polars/io/ndjson.py index 8fc2addd6901..cd9ea92bf3c0 100644 --- a/py-polars/polars/io/ndjson.py +++ b/py-polars/polars/io/ndjson.py @@ -145,7 +145,7 @@ def read_ndjson( return df return scan_ndjson( - source, # type: ignore[arg-type] + source, schema=schema, schema_overrides=schema_overrides, infer_schema_length=infer_schema_length, From 1ddc3a580cc3febf737b08b280af52242fca5d79 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Sun, 8 Sep 2024 15:46:05 +0200 Subject: [PATCH 22/27] fix mmap --- crates/polars-io/src/ipc/mmap.rs | 13 +- crates/polars-io/src/mmap.rs | 83 +------------ crates/polars-io/src/utils/byte_source.rs | 5 +- crates/polars-io/src/utils/other.rs | 6 +- .../src/executors/scan/ipc.rs | 6 +- .../polars-plan/src/plans/ir/scan_sources.rs | 18 +-- crates/polars-python/src/dataframe/io.rs | 2 +- crates/polars-utils/src/mmap.rs | 112 ++++++++++++++++-- 8 files changed, 123 insertions(+), 122 deletions(-) diff --git a/crates/polars-io/src/ipc/mmap.rs b/crates/polars-io/src/ipc/mmap.rs index 854bd4c8d9d7..74ef2b533462 100644 --- a/crates/polars-io/src/ipc/mmap.rs +++ b/crates/polars-io/src/ipc/mmap.rs @@ -3,9 +3,10 @@ use arrow::io::ipc::read::{Dictionaries, FileMetadata}; use arrow::mmap::{mmap_dictionaries_unchecked, mmap_unchecked}; use arrow::record_batch::RecordBatch; use polars_core::prelude::*; +use polars_utils::mmap::MMapSemaphore; use super::ipc_file::IpcReader; -use crate::mmap::{MMapSemaphore, MmapBytesReader}; +use crate::mmap::MmapBytesReader; use crate::predicates::PhysicalIoExpr; use crate::shared::{finish_reader, ArrowReader}; use crate::utils::{apply_projection, columns_to_projection}; @@ -15,17 +16,9 @@ impl IpcReader { &mut self, predicate: Option>, ) -> PolarsResult { - #[cfg(target_family = "unix")] - use std::os::unix::fs::MetadataExt; match self.reader.to_file() { Some(file) => { - #[cfg(target_family = "unix")] - let metadata = file.metadata()?; - let mmap = unsafe { memmap::Mmap::map(file).unwrap() }; - #[cfg(target_family = "unix")] - let semaphore = MMapSemaphore::new(metadata.dev(), metadata.ino(), mmap); - #[cfg(not(target_family = "unix"))] - let semaphore = MMapSemaphore::new(mmap); + let semaphore = MMapSemaphore::new_from_file(&file)?; let metadata = read::read_file_metadata(&mut std::io::Cursor::new(semaphore.as_ref()))?; diff --git a/crates/polars-io/src/mmap.rs b/crates/polars-io/src/mmap.rs index ad2c05175810..498c73da1a9d 100644 --- a/crates/polars-io/src/mmap.rs +++ b/crates/polars-io/src/mmap.rs @@ -1,84 +1,9 @@ -#[cfg(target_family = "unix")] -use std::collections::btree_map::Entry; -#[cfg(target_family = "unix")] -use std::collections::BTreeMap; use std::fs::File; use std::io::{BufReader, Cursor, Read, Seek}; use 
std::sync::Arc; -#[cfg(target_family = "unix")] -use std::sync::Mutex; -use memmap::Mmap; -#[cfg(target_family = "unix")] -use once_cell::sync::Lazy; use polars_core::config::verbose; -#[cfg(target_family = "unix")] -use polars_error::polars_bail; -use polars_error::PolarsResult; -use polars_utils::mmap::MemSlice; - -// Keep track of memory mapped files so we don't write to them while reading -// Use a btree as it uses less memory than a hashmap and this thing never shrinks. -// Write handle in Windows is exclusive, so this is only necessary in Unix. -#[cfg(target_family = "unix")] -static MEMORY_MAPPED_FILES: Lazy>> = - Lazy::new(|| Mutex::new(Default::default())); - -pub(crate) struct MMapSemaphore { - #[cfg(target_family = "unix")] - key: (u64, u64), - mmap: Mmap, -} - -impl MMapSemaphore { - #[cfg(target_family = "unix")] - pub(super) fn new(dev: u64, ino: u64, mmap: Mmap) -> Self { - let mut guard = MEMORY_MAPPED_FILES.lock().unwrap(); - let key = (dev, ino); - guard.insert(key, 1); - Self { key, mmap } - } - - #[cfg(not(target_family = "unix"))] - pub(super) fn new(mmap: Mmap) -> Self { - Self { mmap } - } -} - -impl AsRef<[u8]> for MMapSemaphore { - #[inline] - fn as_ref(&self) -> &[u8] { - self.mmap.as_ref() - } -} - -#[cfg(target_family = "unix")] -impl Drop for MMapSemaphore { - fn drop(&mut self) { - let mut guard = MEMORY_MAPPED_FILES.lock().unwrap(); - if let Entry::Occupied(mut e) = guard.entry(self.key) { - let v = e.get_mut(); - *v -= 1; - - if *v == 0 { - e.remove_entry(); - } - } - } -} - -pub fn ensure_not_mapped(#[allow(unused)] file: &File) -> PolarsResult<()> { - #[cfg(target_family = "unix")] - { - use std::os::unix::fs::MetadataExt; - let guard = MEMORY_MAPPED_FILES.lock().unwrap(); - let metadata = file.metadata()?; - if guard.contains_key(&(metadata.dev(), metadata.ino())) { - polars_bail!(ComputeError: "cannot write to file: already memory mapped"); - } - } - Ok(()) -} +use polars_utils::mmap::{MMapSemaphore, MemSlice}; /// Trait used to get a hold to file handler or to the underlying bytes /// without performing a Read. 
@@ -143,7 +68,7 @@ impl MmapBytesReader for &mut T { pub enum ReaderBytes<'a> { Borrowed(&'a [u8]), Owned(Vec), - Mapped(memmap::Mmap, &'a File), + Mapped(MMapSemaphore, &'a File), } impl std::ops::Deref for ReaderBytes<'_> { @@ -152,7 +77,7 @@ impl std::ops::Deref for ReaderBytes<'_> { match self { Self::Borrowed(ref_bytes) => ref_bytes, Self::Owned(vec) => vec, - Self::Mapped(mmap, _) => mmap, + Self::Mapped(mmap, _) => mmap.as_ref(), } } } @@ -180,7 +105,7 @@ impl<'a, T: 'a + MmapBytesReader> From<&'a mut T> for ReaderBytes<'a> { None => { if let Some(f) = m.to_file() { let f = unsafe { std::mem::transmute::<&File, &'a File>(f) }; - let mmap = unsafe { memmap::Mmap::map(f).unwrap() }; + let mmap = MMapSemaphore::new_from_file(f).unwrap(); ReaderBytes::Mapped(mmap, f) } else { if verbose() { diff --git a/crates/polars-io/src/utils/byte_source.rs b/crates/polars-io/src/utils/byte_source.rs index fce7e795ce46..72cbabb3dd5c 100644 --- a/crates/polars-io/src/utils/byte_source.rs +++ b/crates/polars-io/src/utils/byte_source.rs @@ -1,7 +1,7 @@ use std::ops::Range; use std::sync::Arc; -use polars_error::{to_compute_err, PolarsResult}; +use polars_error::PolarsResult; use polars_utils::_limit_path_len_io_err; use polars_utils::mmap::MemSlice; @@ -34,9 +34,8 @@ impl MemSliceByteSource { .into_std() .await, ); - let mmap = Arc::new(unsafe { memmap::Mmap::map(file.as_ref()) }.map_err(to_compute_err)?); - Ok(Self(MemSlice::from_mmap(mmap))) + Ok(Self(MemSlice::from_file(file.as_ref())?)) } } diff --git a/crates/polars-io/src/utils/other.rs b/crates/polars-io/src/utils/other.rs index 3c1ab1e248d8..7267a6616924 100644 --- a/crates/polars-io/src/utils/other.rs +++ b/crates/polars-io/src/utils/other.rs @@ -7,6 +7,7 @@ use polars_core::prelude::*; #[cfg(any(feature = "ipc_streaming", feature = "parquet"))] use polars_core::utils::{accumulate_dataframes_vertical_unchecked, split_df_as_ref}; use polars_error::to_compute_err; +use polars_utils::mmap::MMapSemaphore; use regex::{Regex, RegexBuilder}; use crate::mmap::{MmapBytesReader, ReaderBytes}; @@ -21,12 +22,15 @@ pub fn get_reader_bytes<'a, R: Read + MmapBytesReader + ?Sized>( .ok() .and_then(|offset| Some((reader.to_file()?, offset))) { - let mmap = unsafe { memmap::MmapOptions::new().offset(offset).map(file)? }; + let mut options = memmap::MmapOptions::new(); + options.offset(offset); // somehow bck thinks borrows alias // this is sound as file was already bound to 'a use std::fs::File; + let file = unsafe { std::mem::transmute::<&File, &'a File>(file) }; + let mmap = MMapSemaphore::new_from_file_with_options(file, options)?; Ok(ReaderBytes::Mapped(mmap, file)) } else { // we can get the bytes for free diff --git a/crates/polars-mem-engine/src/executors/scan/ipc.rs b/crates/polars-mem-engine/src/executors/scan/ipc.rs index edde4765fec5..78b31f268756 100644 --- a/crates/polars-mem-engine/src/executors/scan/ipc.rs +++ b/crates/polars-mem-engine/src/executors/scan/ipc.rs @@ -81,11 +81,9 @@ impl IpcExec { Some(f) => f?, }; - MemSlice::from_mmap(Arc::new(unsafe { memmap::Mmap::map(&file)? })) - }, - ScanSourceRef::File(file) => { - MemSlice::from_mmap(Arc::new(unsafe { memmap::Mmap::map(file)? })) + MemSlice::from_file(&file)? 
            },
+            ScanSourceRef::File(file) => MemSlice::from_file(file)?,
             ScanSourceRef::Buffer(buff) => MemSlice::from_bytes(buff.clone()),
         };
 
diff --git a/crates/polars-plan/src/plans/ir/scan_sources.rs b/crates/polars-plan/src/plans/ir/scan_sources.rs
index 5261d6ede706..1bdb92fda904 100644
--- a/crates/polars-plan/src/plans/ir/scan_sources.rs
+++ b/crates/polars-plan/src/plans/ir/scan_sources.rs
@@ -205,13 +205,9 @@ impl<'a> ScanSourceRef<'a> {
                     polars_utils::open_file(path)?
                 };
 
-                Ok(MemSlice::from_mmap(Arc::new(unsafe {
-                    memmap::Mmap::map(&file)?
-                })))
+                MemSlice::from_file(&file)
             },
-            ScanSourceRef::File(file) => Ok(MemSlice::from_mmap(Arc::new(unsafe {
-                memmap::Mmap::map(*file)?
-            }))),
+            ScanSourceRef::File(file) => MemSlice::from_file(file),
             ScanSourceRef::Buffer(buff) => Ok(MemSlice::from_bytes((*buff).clone())),
         }
     }
@@ -227,7 +223,7 @@ impl<'a> ScanSourceRef<'a> {
     ) -> PolarsResult<MemSlice> {
         match self {
             Self::Path(path) => {
-                let f = if run_async {
+                let file = if run_async {
                     feature_gated!("cloud", {
                         cache_entries.unwrap()[index].try_open_check_latest()?
                     })
@@ -235,13 +231,9 @@ impl<'a> ScanSourceRef<'a> {
                     polars_utils::open_file(path)?
                 };
 
-                let mmap = unsafe { memmap::Mmap::map(&f)? };
-                Ok(MemSlice::from_mmap(Arc::new(mmap)))
-            },
-            Self::File(file) => {
-                let mmap = unsafe { memmap::Mmap::map(*file)? };
-                Ok(MemSlice::from_mmap(Arc::new(mmap)))
+                MemSlice::from_file(&file)
             },
+            Self::File(file) => MemSlice::from_file(file),
             Self::Buffer(buff) => Ok(MemSlice::from_bytes((*buff).clone())),
         }
     }
diff --git a/crates/polars-python/src/dataframe/io.rs b/crates/polars-python/src/dataframe/io.rs
index 12707e93dd85..dbdf91ddff09 100644
--- a/crates/polars-python/src/dataframe/io.rs
+++ b/crates/polars-python/src/dataframe/io.rs
@@ -4,11 +4,11 @@ use std::sync::Arc;
 
 #[cfg(feature = "avro")]
 use polars::io::avro::AvroCompression;
-use polars::io::mmap::ensure_not_mapped;
 use polars::io::RowIndex;
 use polars::prelude::*;
 #[cfg(feature = "parquet")]
 use polars_parquet::arrow::write::StatisticsOptions;
+use polars_utils::mmap::ensure_not_mapped;
 use pyo3::prelude::*;
 use pyo3::pybacked::PyBackedStr;
 
diff --git a/crates/polars-utils/src/mmap.rs b/crates/polars-utils/src/mmap.rs
index c753525b43ee..9e946b3dac52 100644
--- a/crates/polars-utils/src/mmap.rs
+++ b/crates/polars-utils/src/mmap.rs
@@ -1,14 +1,16 @@
+use std::fs::File;
 use std::io;
-use std::sync::Arc;
 
 pub use memmap::Mmap;
 
 mod private {
+    use std::fs::File;
     use std::ops::Deref;
     use std::sync::Arc;
 
-    pub use memmap::Mmap;
+    use polars_error::PolarsResult;
 
+    use super::MMapSemaphore;
     use crate::mem::prefetch_l2;
 
     /// A read-only reference to a slice of memory that can potentially be memory-mapped.
@@ -34,7 +36,7 @@ mod private {
     #[allow(unused)]
     enum MemSliceInner {
         Bytes(bytes::Bytes),
-        Mmap(Arc<Mmap>),
+        Mmap(Arc<MMapSemaphore>),
     }
 
     impl Deref for MemSlice {
@@ -82,7 +84,7 @@ mod private {
         }
 
         #[inline]
-        pub fn from_mmap(mmap: Arc<Mmap>) -> Self {
+        pub fn from_mmap(mmap: Arc<MMapSemaphore>) -> Self {
             Self {
                 slice: unsafe {
                     std::mem::transmute::<&[u8], &'static [u8]>(mmap.as_ref().as_ref())
@@ -91,6 +93,12 @@ mod private {
             }
         }
 
+        #[inline]
+        pub fn from_file(file: &File) -> PolarsResult<Self> {
+            let mmap = MMapSemaphore::new_from_file(file)?;
+            Ok(Self::from_mmap(Arc::new(mmap)))
+        }
+
         /// Construct a `MemSlice` that simply wraps around a `&[u8]`.
         #[inline]
         pub fn from_slice(slice: &'static [u8]) -> Self {
@@ -115,6 +123,8 @@ mod private {
     }
 }
 
+use memmap::MmapOptions;
+use polars_error::{polars_bail, PolarsResult};
 pub use private::MemSlice;
 
 /// A cursor over a [`MemSlice`].
@@ -156,11 +166,6 @@ impl MemReader {
         Self::new(MemSlice::from_bytes(bytes))
     }
 
-    #[inline(always)]
-    pub fn from_mmap(mmap: Arc<Mmap>) -> Self {
-        Self::new(MemSlice::from_mmap(mmap))
-    }
-
     // Construct a `MemSlice` that simply wraps around a `&[u8]`. The caller must ensure the
     /// slice outlives the returned `MemSlice`.
     #[inline]
@@ -231,8 +236,91 @@ impl io::Seek for MemReader {
     }
 }
 
-mod tests {
+// Keep track of memory mapped files so we don't write to them while reading
+// Use a btree as it uses less memory than a hashmap and this thing never shrinks.
+// Write handle in Windows is exclusive, so this is only necessary in Unix.
+#[cfg(target_family = "unix")]
+static MEMORY_MAPPED_FILES: once_cell::sync::Lazy<
+    std::sync::Mutex<std::collections::BTreeMap<(u64, u64), u32>>,
+> = once_cell::sync::Lazy::new(|| std::sync::Mutex::new(Default::default()));
+
+#[derive(Debug)]
+pub struct MMapSemaphore {
+    #[cfg(target_family = "unix")]
+    key: (u64, u64),
+    mmap: Mmap,
+}
+
+impl MMapSemaphore {
+    pub fn new_from_file_with_options(
+        file: &File,
+        options: MmapOptions,
+    ) -> PolarsResult<Self> {
+        let mmap = unsafe { options.map(file) }?;
+
+        #[cfg(target_family = "unix")]
+        {
+            use std::os::unix::fs::MetadataExt;
+            let metadata = file.metadata()?;
+
+            let mut guard = MEMORY_MAPPED_FILES.lock().unwrap();
+            let key = (metadata.dev(), metadata.ino());
+            match guard.entry(key) {
+                std::collections::btree_map::Entry::Occupied(mut e) => *e.get_mut() += 1,
+                std::collections::btree_map::Entry::Vacant(e) => _ = e.insert(1),
+            }
+            Ok(Self { key, mmap })
+        }
+
+        #[cfg(not(target_family = "unix"))]
+        Ok(Self { mmap })
+    }
+
+    pub fn new_from_file(file: &File) -> PolarsResult<Self> {
+        Self::new_from_file_with_options(file, MmapOptions::default())
+    }
+
+    pub fn as_ptr(&self) -> *const u8 {
+        self.mmap.as_ptr()
+    }
+}
+
+impl AsRef<[u8]> for MMapSemaphore {
+    #[inline]
+    fn as_ref(&self) -> &[u8] {
+        self.mmap.as_ref()
+    }
+}
+
+#[cfg(target_family = "unix")]
+impl Drop for MMapSemaphore {
+    fn drop(&mut self) {
+        let mut guard = MEMORY_MAPPED_FILES.lock().unwrap();
+        if let std::collections::btree_map::Entry::Occupied(mut e) = guard.entry(self.key) {
+            let v = e.get_mut();
+            *v -= 1;
+
+            if *v == 0 {
+                e.remove_entry();
+            }
+        }
+    }
+}
+pub fn ensure_not_mapped(#[allow(unused)] file: &File) -> PolarsResult<()> {
+    #[cfg(target_family = "unix")]
+    {
+        use std::os::unix::fs::MetadataExt;
+        let guard = MEMORY_MAPPED_FILES.lock().unwrap();
+        let metadata = file.metadata()?;
+        if guard.contains_key(&(metadata.dev(), metadata.ino())) {
+            polars_bail!(ComputeError: "cannot write to file: already memory mapped");
+        }
+    }
+    Ok(())
+}
+
+mod tests {
     #[test]
     fn test_mem_slice_zero_copy() {
         use std::sync::Arc;
@@ -271,9 +359,11 @@ mod tests {
         }
 
         {
+            use crate::mmap::MMapSemaphore;
+
             let path = "../../examples/datasets/foods1.csv";
             let file = std::fs::File::open(path).unwrap();
-            let mmap = unsafe { memmap::Mmap::map(&file) }.unwrap();
+            let mmap = MMapSemaphore::new_from_file(&file).unwrap();
             let ptr = mmap.as_ptr();
             let mem_slice = MemSlice::from_mmap(Arc::new(mmap));
 

From 124b5484bbe50b90ee73d158864e83ff147ca006 Mon Sep 17 00:00:00 2001
From: coastalwhite
Date: Sun, 8 Sep 2024 15:47:57 +0200
Subject: [PATCH 23/27] clippy

---
 crates/polars-io/src/ipc/mmap.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/polars-io/src/ipc/mmap.rs b/crates/polars-io/src/ipc/mmap.rs
index 74ef2b533462..f0343642482e 100644
--- a/crates/polars-io/src/ipc/mmap.rs
+++ b/crates/polars-io/src/ipc/mmap.rs
@@ -18,7 +18,7 @@ impl IpcReader {
     ) -> PolarsResult<DataFrame> {
         match
self.reader.to_file() { Some(file) => { - let semaphore = MMapSemaphore::new_from_file(&file)?; + let semaphore = MMapSemaphore::new_from_file(file)?; let metadata = read::read_file_metadata(&mut std::io::Cursor::new(semaphore.as_ref()))?; From b2888111948a03d2e67a84fb6d3012a2f358c4e1 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Sun, 8 Sep 2024 16:04:30 +0200 Subject: [PATCH 24/27] remove broken test --- py-polars/tests/unit/io/test_ipc.py | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/py-polars/tests/unit/io/test_ipc.py b/py-polars/tests/unit/io/test_ipc.py index a1505cf3d2fc..52f69a450507 100644 --- a/py-polars/tests/unit/io/test_ipc.py +++ b/py-polars/tests/unit/io/test_ipc.py @@ -343,29 +343,3 @@ def test_ipc_decimal_15920( path = f"{tmp_path}/data" df.write_ipc(path) assert_frame_equal(pl.read_ipc(path), df) - - -@pytest.mark.write_disk -def test_ipc_raise_on_writing_mmap(tmp_path: Path) -> None: - p = tmp_path / "foo.ipc" - df = pl.DataFrame({"foo": [1, 2, 3]}) - # first write is allowed - df.write_ipc(p) - - # now open as memory mapped - df = pl.read_ipc(p, memory_map=True) - - if os.name == "nt": - # In Windows, it's the duty of the system to ensure exclusive access - with pytest.raises( - OSError, - match=re.escape( - "The requested operation cannot be performed on a file with a user-mapped section open. (os error 1224)" - ), - ): - df.write_ipc(p) - else: - with pytest.raises( - ComputeError, match="cannot write to file: already memory mapped" - ): - df.write_ipc(p) From a3ffac672b3127131bd8a60b3aae78903455c999 Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Sun, 8 Sep 2024 16:05:02 +0200 Subject: [PATCH 25/27] ruff --- py-polars/tests/unit/io/test_ipc.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/py-polars/tests/unit/io/test_ipc.py b/py-polars/tests/unit/io/test_ipc.py index 52f69a450507..dd60d0ae209c 100644 --- a/py-polars/tests/unit/io/test_ipc.py +++ b/py-polars/tests/unit/io/test_ipc.py @@ -1,8 +1,6 @@ from __future__ import annotations import io -import os -import re from decimal import Decimal from typing import TYPE_CHECKING, Any @@ -10,7 +8,6 @@ import pytest import polars as pl -from polars.exceptions import ComputeError from polars.interchange.protocol import CompatLevel from polars.testing import assert_frame_equal From e35eabe6355fa79b077cede1adf53aae7f393b9f Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Mon, 9 Sep 2024 09:17:13 +0200 Subject: [PATCH 26/27] remove stale memory_map option --- crates/polars-io/src/ipc/ipc_file.rs | 5 +---- crates/polars-lazy/src/scan/ipc.rs | 6 +----- crates/polars-python/src/lazyframe/general.rs | 4 +--- py-polars/polars/io/ipc/functions.py | 6 +++--- 4 files changed, 6 insertions(+), 15 deletions(-) diff --git a/crates/polars-io/src/ipc/ipc_file.rs b/crates/polars-io/src/ipc/ipc_file.rs index 9347a453b426..feaea44f5417 100644 --- a/crates/polars-io/src/ipc/ipc_file.rs +++ b/crates/polars-io/src/ipc/ipc_file.rs @@ -51,10 +51,7 @@ use crate::RowIndex; #[derive(Clone, Debug, PartialEq, Hash)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub struct IpcScanOptions { - /// Not used anymore. 
- pub memory_map: bool, -} +pub struct IpcScanOptions; /// Read Arrows IPC format into a DataFrame /// diff --git a/crates/polars-lazy/src/scan/ipc.rs b/crates/polars-lazy/src/scan/ipc.rs index 8d84ef3de049..a9f8c8b98b0f 100644 --- a/crates/polars-lazy/src/scan/ipc.rs +++ b/crates/polars-lazy/src/scan/ipc.rs @@ -13,7 +13,6 @@ pub struct ScanArgsIpc { pub cache: bool, pub rechunk: bool, pub row_index: Option, - pub memory_map: bool, pub cloud_options: Option, pub hive_options: HiveOptions, pub include_file_paths: Option, @@ -26,7 +25,6 @@ impl Default for ScanArgsIpc { cache: true, rechunk: false, row_index: None, - memory_map: true, cloud_options: Default::default(), hive_options: Default::default(), include_file_paths: None, @@ -53,9 +51,7 @@ impl LazyFileListReader for LazyIpcReader { fn finish(self) -> PolarsResult { let args = self.args; - let options = IpcScanOptions { - memory_map: args.memory_map, - }; + let options = IpcScanOptions {}; let mut lf: LazyFrame = DslBuilder::scan_ipc( self.sources.to_dsl(false), diff --git a/crates/polars-python/src/lazyframe/general.rs b/crates/polars-python/src/lazyframe/general.rs index 362d114817b5..86bcd3c2566b 100644 --- a/crates/polars-python/src/lazyframe/general.rs +++ b/crates/polars-python/src/lazyframe/general.rs @@ -310,7 +310,7 @@ impl PyLazyFrame { #[cfg(feature = "ipc")] #[staticmethod] - #[pyo3(signature = (source, sources, n_rows, cache, rechunk, row_index, memory_map, cloud_options, hive_partitioning, hive_schema, try_parse_hive_dates, retries, file_cache_ttl, include_file_paths))] + #[pyo3(signature = (source, sources, n_rows, cache, rechunk, row_index, cloud_options, hive_partitioning, hive_schema, try_parse_hive_dates, retries, file_cache_ttl, include_file_paths))] fn new_from_ipc( source: Option, sources: Wrap, @@ -318,7 +318,6 @@ impl PyLazyFrame { cache: bool, rechunk: bool, row_index: Option<(String, IdxSize)>, - memory_map: bool, cloud_options: Option>, hive_partitioning: Option, hive_schema: Option>, @@ -344,7 +343,6 @@ impl PyLazyFrame { cache, rechunk, row_index, - memory_map, #[cfg(feature = "cloud")] cloud_options: None, hive_options, diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index 17ee17d6843b..43fbc8136de2 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -114,7 +114,6 @@ def read_ipc( lf = scan_ipc( source, n_rows=n_rows, - memory_map=memory_map, storage_options=storage_options, row_index_name=row_index_name, row_index_offset=row_index_offset, @@ -189,7 +188,6 @@ def _read_ipc_impl( rechunk=rechunk, row_index_name=row_index_name, row_index_offset=row_index_offset, - memory_map=memory_map, ) if columns is None: df = scan.collect() @@ -448,6 +446,9 @@ def scan_ipc( source = None # type: ignore[assignment] + # Memory Mapping is now a no-op + _ = memory_map + pylf = PyLazyFrame.new_from_ipc( source, sources, @@ -455,7 +456,6 @@ def scan_ipc( cache, rechunk, parse_row_index_args(row_index_name, row_index_offset), - memory_map=memory_map, cloud_options=storage_options, retries=retries, file_cache_ttl=file_cache_ttl, From 1e2fa0dfbce927be739bfb51e1e46ed24fb0ae3e Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Mon, 9 Sep 2024 09:21:10 +0200 Subject: [PATCH 27/27] fix test --- crates/polars-lazy/src/tests/io.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/polars-lazy/src/tests/io.rs b/crates/polars-lazy/src/tests/io.rs index 57beafc63033..a1d3f2c050a8 100644 --- a/crates/polars-lazy/src/tests/io.rs +++ 
b/crates/polars-lazy/src/tests/io.rs @@ -417,7 +417,6 @@ fn test_ipc_globbing() -> PolarsResult<()> { cache: true, rechunk: false, row_index: None, - memory_map: true, cloud_options: None, hive_options: Default::default(), include_file_paths: None,
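
The patches above move the memory-map bookkeeping into polars-utils and route the scan executors through MemSlice::from_file. Below is a minimal usage sketch of that relocated API, assuming a Unix target (where the write guard is active) and a hypothetical local file path; it is not taken from the patches themselves, but only calls functions they introduce: MMapSemaphore::new_from_file, MemSlice::from_mmap, MemSlice::from_file and ensure_not_mapped.

    use std::fs::File;
    use std::sync::Arc;

    use polars_error::PolarsResult;
    use polars_utils::mmap::{ensure_not_mapped, MMapSemaphore, MemSlice};

    fn mmap_scan_sketch() -> PolarsResult<()> {
        // Hypothetical input path; any readable local file works.
        let path = "examples/datasets/foods1.csv";

        // Mapping a file registers its (dev, ino) pair so concurrent writers can be rejected.
        let file = File::open(path)?;
        let semaphore = MMapSemaphore::new_from_file(&file)?;
        let slice = MemSlice::from_mmap(Arc::new(semaphore));
        assert!(!slice.is_empty());

        // While a mapping is alive, writing to the same file is refused (Unix only).
        let _ = ensure_not_mapped(&File::open(path)?);

        // One-step variant used by the scan executors above.
        let slice2 = MemSlice::from_file(&File::open(path)?)?;
        assert_eq!(slice.len(), slice2.len());

        Ok(())
    }

Dropping the MemSlice releases the mapping and decrements the per-(dev, ino) count kept in MEMORY_MAPPED_FILES, after which ensure_not_mapped accepts the file again.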