From 667bcd4bb203ea84590c4f26c367d37fe6655c40 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Wed, 25 Oct 2023 16:09:12 +0200 Subject: [PATCH] chore(rust): remove parquet logic from `polars-arrow` and consolidate logic in `polars-parquet` crate. (#12022) --- Cargo.toml | 1 + crates/polars-arrow/Cargo.toml | 38 ---- .../src/array/fixed_size_binary/mod.rs | 2 +- crates/polars-arrow/src/io/mod.rs | 4 - crates/polars-core/Cargo.toml | 2 - crates/polars-io/Cargo.toml | 13 +- crates/polars-io/src/export.rs | 4 +- crates/polars-io/src/parquet/async_impl.rs | 4 +- crates/polars-io/src/parquet/mmap.rs | 8 +- crates/polars-io/src/parquet/mod.rs | 2 +- crates/polars-io/src/parquet/predicates.rs | 4 +- crates/polars-io/src/parquet/read.rs | 4 +- crates/polars-io/src/parquet/read_impl.rs | 4 +- crates/polars-io/src/parquet/write.rs | 4 +- crates/polars-lazy/Cargo.toml | 2 +- .../physical_plan/executors/scan/parquet.rs | 2 +- crates/polars-parquet/Cargo.toml | 44 ++++ crates/polars-parquet/LICENSE | 196 ++++++++++++++++++ .../src/arrow}/mod.rs | 2 - .../src/arrow}/read/README.md | 0 .../src/arrow}/read/deserialize/README.md | 0 .../arrow}/read/deserialize/binary/basic.rs | 8 +- .../read/deserialize/binary/dictionary.rs | 10 +- .../src/arrow}/read/deserialize/binary/mod.rs | 0 .../arrow}/read/deserialize/binary/nested.rs | 10 +- .../arrow}/read/deserialize/binary/utils.rs | 3 +- .../arrow}/read/deserialize/boolean/basic.rs | 8 +- .../arrow}/read/deserialize/boolean/mod.rs | 0 .../arrow}/read/deserialize/boolean/nested.rs | 8 +- .../arrow}/read/deserialize/dictionary/mod.rs | 6 +- .../read/deserialize/dictionary/nested.rs | 6 +- .../deserialize/fixed_size_binary/basic.rs | 6 +- .../fixed_size_binary/dictionary.rs | 8 +- .../read/deserialize/fixed_size_binary/mod.rs | 0 .../deserialize/fixed_size_binary/nested.rs | 14 +- .../deserialize/fixed_size_binary/utils.rs | 0 .../src/arrow}/read/deserialize/mod.rs | 26 +-- .../src/arrow}/read/deserialize/nested.rs | 10 +- .../arrow}/read/deserialize/nested_utils.rs | 4 +- .../src/arrow}/read/deserialize/null/mod.rs | 8 +- .../arrow}/read/deserialize/null/nested.rs | 6 +- .../read/deserialize/primitive/basic.rs | 8 +- .../read/deserialize/primitive/dictionary.rs | 8 +- .../read/deserialize/primitive/integer.rs | 10 +- .../arrow}/read/deserialize/primitive/mod.rs | 0 .../read/deserialize/primitive/nested.rs | 8 +- .../src/arrow}/read/deserialize/simple.rs | 8 +- .../src/arrow}/read/deserialize/struct_.rs | 4 +- .../src/arrow}/read/deserialize/utils.rs | 4 +- .../src/arrow}/read/file.rs | 8 +- .../src/arrow}/read/indexes/binary.rs | 6 +- .../src/arrow}/read/indexes/boolean.rs | 2 +- .../arrow}/read/indexes/fixed_len_binary.rs | 8 +- .../src/arrow}/read/indexes/mod.rs | 8 +- .../src/arrow}/read/indexes/primitive.rs | 8 +- .../src/arrow}/read/mod.rs | 11 +- .../src/arrow}/read/row_group.rs | 8 +- .../src/arrow}/read/schema/convert.rs | 6 +- .../src/arrow}/read/schema/metadata.rs | 4 +- .../src/arrow}/read/schema/mod.rs | 2 +- .../src/arrow}/read/statistics/binary.rs | 5 +- .../src/arrow}/read/statistics/boolean.rs | 3 +- .../src/arrow}/read/statistics/dictionary.rs | 8 +- .../src/arrow}/read/statistics/fixlen.rs | 6 +- .../src/arrow}/read/statistics/list.rs | 8 +- .../src/arrow}/read/statistics/map.rs | 6 +- .../src/arrow}/read/statistics/mod.rs | 9 +- .../src/arrow}/read/statistics/null.rs | 3 +- .../src/arrow}/read/statistics/primitive.rs | 7 +- .../src/arrow}/read/statistics/struct_.rs | 6 +- .../src/arrow}/read/statistics/utf8.rs | 5 +- .../src/arrow}/write/binary/basic.rs | 8 +- .../src/arrow}/write/binary/mod.rs | 0 .../src/arrow}/write/binary/nested.rs | 8 +- .../src/arrow}/write/boolean/basic.rs | 4 +- .../src/arrow}/write/boolean/mod.rs | 0 .../src/arrow}/write/boolean/nested.rs | 6 +- .../src/arrow}/write/dictionary.rs | 10 +- .../src/arrow}/write/file.rs | 2 +- .../src/arrow}/write/fixed_len_bytes.rs | 6 +- .../src/arrow}/write/mod.rs | 18 +- .../src/arrow}/write/nested/def.rs | 5 +- .../src/arrow}/write/nested/mod.rs | 2 +- .../src/arrow}/write/nested/rep.rs | 0 .../src/arrow}/write/pages.rs | 16 +- .../src/arrow}/write/primitive/basic.rs | 8 +- .../src/arrow}/write/primitive/mod.rs | 0 .../src/arrow}/write/primitive/nested.rs | 8 +- .../src/arrow}/write/row_group.rs | 6 +- .../src/arrow}/write/schema.rs | 6 +- .../src/arrow}/write/sink.rs | 10 +- .../src/arrow}/write/utf8/basic.rs | 6 +- .../src/arrow}/write/utf8/mod.rs | 0 .../src/arrow}/write/utf8/nested.rs | 8 +- .../src/arrow}/write/utils.rs | 2 +- crates/polars-parquet/src/lib.rs | 3 + .../src/executors/sources/parquet.rs | 3 +- crates/polars-plan/Cargo.toml | 3 +- .../polars-plan/src/logical_plan/file_scan.rs | 2 +- crates/polars/Cargo.toml | 2 +- py-polars/Cargo.lock | 21 +- py-polars/Cargo.toml | 3 +- py-polars/src/functions/io.rs | 2 +- 103 files changed, 533 insertions(+), 313 deletions(-) create mode 100644 crates/polars-parquet/Cargo.toml create mode 100644 crates/polars-parquet/LICENSE rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/mod.rs (81%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/README.md (100%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/README.md (100%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/binary/basic.rs (99%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/binary/dictionary.rs (94%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/binary/mod.rs (100%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/binary/nested.rs (97%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/binary/utils.rs (99%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/boolean/basic.rs (98%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/boolean/mod.rs (100%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/boolean/nested.rs (97%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/dictionary/mod.rs (98%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/dictionary/nested.rs (98%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/fixed_size_binary/basic.rs (98%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/fixed_size_binary/dictionary.rs (94%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/fixed_size_binary/mod.rs (100%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/fixed_size_binary/nested.rs (93%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/fixed_size_binary/utils.rs (100%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/mod.rs (89%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/nested.rs (99%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/nested_utils.rs (99%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/null/mod.rs (95%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/null/nested.rs (95%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/primitive/basic.rs (98%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/primitive/dictionary.rs (96%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/primitive/integer.rs (97%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/primitive/mod.rs (100%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/primitive/nested.rs (98%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/simple.rs (99%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/struct_.rs (95%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/deserialize/utils.rs (99%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/file.rs (98%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/indexes/binary.rs (91%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/indexes/boolean.rs (92%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/indexes/fixed_len_binary.rs (91%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/indexes/mod.rs (98%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/indexes/primitive.rs (97%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/mod.rs (93%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/row_group.rs (98%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/schema/convert.rs (99%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/schema/metadata.rs (95%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/schema/mod.rs (98%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/statistics/binary.rs (89%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/statistics/boolean.rs (92%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/statistics/dictionary.rs (90%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/statistics/fixlen.rs (97%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/statistics/list.rs (93%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/statistics/map.rs (89%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/statistics/mod.rs (99%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/statistics/null.rs (93%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/statistics/primitive.rs (96%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/statistics/struct_.rs (90%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/read/statistics/utf8.rs (91%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/write/binary/basic.rs (97%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/write/binary/mod.rs (100%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/write/binary/nested.rs (87%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/write/boolean/basic.rs (97%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/write/boolean/mod.rs (100%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/write/boolean/nested.rs (89%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/write/dictionary.rs (97%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/write/file.rs (98%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/write/fixed_len_bytes.rs (96%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/write/mod.rs (98%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/write/nested/def.rs (99%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/write/nested/mod.rs (99%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/write/nested/rep.rs (100%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/write/pages.rs (98%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/write/primitive/basic.rs (96%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/write/primitive/mod.rs (100%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/write/primitive/nested.rs (88%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/write/row_group.rs (98%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/write/schema.rs (98%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/write/sink.rs (96%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/write/utf8/basic.rs (96%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/write/utf8/mod.rs (100%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/write/utf8/nested.rs (88%) rename crates/{polars-arrow/src/io/parquet => polars-parquet/src/arrow}/write/utils.rs (99%) create mode 100644 crates/polars-parquet/src/lib.rs diff --git a/Cargo.toml b/Cargo.toml index 9da586a7d6cc..daff2b89f542 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -68,6 +68,7 @@ polars-core = { version = "0.34.2", path = "crates/polars-core", default-feature polars-plan = { version = "0.34.2", path = "crates/polars-plan", default-features = false } polars-lazy = { version = "0.34.2", path = "crates/polars-lazy", default-features = false } polars-pipe = { version = "0.34.2", path = "crates/polars-pipe", default-features = false } +polars-parquet = { version = "0.34.2", path = "crates/polars-parquet", default-features = false } polars-row = { version = "0.34.2", path = "crates/polars-row", default-features = false } polars-ffi = { version = "0.34.2", path = "crates/polars-ffi", default-features = false } polars-ops = { version = "0.34.2", path = "crates/polars-ops", default-features = false } diff --git a/crates/polars-arrow/Cargo.toml b/crates/polars-arrow/Cargo.toml index d94131f278a4..d2bbce0b5d38 100644 --- a/crates/polars-arrow/Cargo.toml +++ b/crates/polars-arrow/Cargo.toml @@ -34,7 +34,6 @@ ethnum = { workspace = true } # To efficiently cast numbers to strings lexical-core = { workspace = true, optional = true } -fallible-streaming-iterator = { workspace = true, optional = true } regex = { workspace = true, optional = true } regex-syntax = { version = "0.8", optional = true } streaming-iterator = { workspace = true } @@ -49,8 +48,6 @@ hex = { workspace = true, optional = true } lz4 = { version = "1.24", optional = true } zstd = { version = "0.13", optional = true } -base64 = { workspace = true, optional = true } - # to write to parquet as a stream futures = { workspace = true, optional = true } @@ -74,7 +71,6 @@ arrow-array = { workspace = true, optional = true } arrow-buffer = { workspace = true, optional = true } arrow-data = { workspace = true, optional = true } arrow-schema = { workspace = true, optional = true } -parquet2 = { workspace = true, optional = true, default-features = true, features = ["async"] } [dev-dependencies] avro-rs = { version = "0.13", features = ["snappy"] } @@ -109,8 +105,6 @@ full = [ "io_ipc_write_async", "io_ipc_read_async", "io_ipc_compression", - "io_parquet", - "io_parquet_compression", "io_avro", "io_avro_compression", "io_avro_async", @@ -126,38 +120,6 @@ io_ipc_read_async = ["io_ipc", "futures", "async-stream"] io_ipc_compression = ["lz4", "zstd"] io_flight = ["io_ipc", "arrow-format/flight-data"] -# base64 + io_ipc because arrow schemas are stored as base64-encoded ipc format. -io_parquet = [ - "parquet2", - "parquet2/async", - "polars-error/parquet2", - "io_ipc", - "base64", - "futures", - "fallible-streaming-iterator", -] - -io_parquet_compression = [ - "io_parquet_zstd", - "io_parquet_gzip", - "io_parquet_snappy", - "io_parquet_lz4", - "io_parquet_brotli", -] - -# sample testing of generated arrow data -io_parquet_sample_test = ["io_parquet"] - -# compression backends -io_parquet_zstd = ["parquet2/zstd"] -io_parquet_snappy = ["parquet2/snappy"] -io_parquet_gzip = ["parquet2/gzip"] -io_parquet_lz4 = ["parquet2/lz4"] -io_parquet_brotli = ["parquet2/brotli"] - -# parquet bloom filter functions -io_parquet_bloom_filter = ["parquet2/bloom_filter"] - io_avro = ["avro-schema", "polars-error/avro-schema"] io_avro_compression = [ "avro-schema/compression", diff --git a/crates/polars-arrow/src/array/fixed_size_binary/mod.rs b/crates/polars-arrow/src/array/fixed_size_binary/mod.rs index 710c87d1b7bf..4d808f23a842 100644 --- a/crates/polars-arrow/src/array/fixed_size_binary/mod.rs +++ b/crates/polars-arrow/src/array/fixed_size_binary/mod.rs @@ -215,7 +215,7 @@ impl FixedSizeBinaryArray { } } - pub(crate) fn get_size(data_type: &DataType) -> usize { + pub fn get_size(data_type: &DataType) -> usize { Self::maybe_get_size(data_type).unwrap() } } diff --git a/crates/polars-arrow/src/io/mod.rs b/crates/polars-arrow/src/io/mod.rs index 72bf37ba9ea5..a2b178304df0 100644 --- a/crates/polars-arrow/src/io/mod.rs +++ b/crates/polars-arrow/src/io/mod.rs @@ -10,10 +10,6 @@ pub mod ipc; #[cfg_attr(docsrs, doc(cfg(feature = "io_flight")))] pub mod flight; -#[cfg(feature = "io_parquet")] -#[cfg_attr(docsrs, doc(cfg(feature = "io_parquet")))] -pub mod parquet; - #[cfg(feature = "io_avro")] #[cfg_attr(docsrs, doc(cfg(feature = "io_avro")))] pub mod avro; diff --git a/crates/polars-core/Cargo.toml b/crates/polars-core/Cargo.toml index 3f32c614e53f..b6f20b2750a2 100644 --- a/crates/polars-core/Cargo.toml +++ b/crates/polars-core/Cargo.toml @@ -117,8 +117,6 @@ dtype-u16 = [] dtype-categorical = [] dtype-struct = [] -parquet = ["arrow/io_parquet"] - # scale to terabytes? bigidx = ["arrow/bigidx", "polars-utils/bigidx"] python = [] diff --git a/crates/polars-io/Cargo.toml b/crates/polars-io/Cargo.toml index 18e69c0e9647..f3fd0059fb51 100644 --- a/crates/polars-io/Cargo.toml +++ b/crates/polars-io/Cargo.toml @@ -12,6 +12,7 @@ description = "IO related logic for the Polars DataFrame library" polars-core = { workspace = true } polars-error = { workspace = true } polars-json = { workspace = true, optional = true } +polars-parquet = { workspace = true, optional = true } polars-time = { workspace = true, features = [], optional = true } polars-utils = { workspace = true } @@ -92,8 +93,16 @@ dtype-struct = ["polars-core/dtype-struct"] dtype-decimal = ["polars-core/dtype-decimal"] fmt = ["polars-core/fmt"] lazy = [] -parquet = ["polars-core/parquet", "arrow/io_parquet", "arrow/io_parquet_compression"] -async = ["async-trait", "futures", "tokio", "tokio-util", "arrow/io_ipc_write_async", "polars-error/regex"] +parquet = ["polars-parquet", "polars-parquet/compression"] +async = [ + "async-trait", + "futures", + "tokio", + "tokio-util", + "arrow/io_ipc_write_async", + "polars-error/regex", + "polars-parquet?/async", +] cloud = ["object_store", "async", "polars-error/object_store", "url"] aws = ["object_store/aws", "cloud", "reqwest"] azure = ["object_store/azure", "cloud"] diff --git a/crates/polars-io/src/export.rs b/crates/polars-io/src/export.rs index accd86716cb6..cb167988061a 100644 --- a/crates/polars-io/src/export.rs +++ b/crates/polars-io/src/export.rs @@ -1,4 +1,4 @@ #[cfg(feature = "parquet")] -pub use arrow::io::parquet::read::statistics::Statistics as ParquetStatistics; +pub use polars_parquet::read::statistics::Statistics as ParquetStatistics; #[cfg(feature = "parquet")] -pub use arrow::io::parquet::read::statistics::*; +pub use polars_parquet::read::statistics::*; diff --git a/crates/polars-io/src/parquet/async_impl.rs b/crates/polars-io/src/parquet/async_impl.rs index 989208ad65f5..fb22f3ac498a 100644 --- a/crates/polars-io/src/parquet/async_impl.rs +++ b/crates/polars-io/src/parquet/async_impl.rs @@ -3,8 +3,6 @@ use std::borrow::Cow; use std::ops::Range; use std::sync::Arc; -use arrow::io::parquet::read::{self as parquet2_read, RowGroupMetaData}; -use arrow::io::parquet::write::FileMetaData; use bytes::Bytes; use futures::future::try_join_all; use object_store::path::Path as ObjectPath; @@ -14,6 +12,8 @@ use polars_core::datatypes::PlHashMap; use polars_core::error::{to_compute_err, PolarsResult}; use polars_core::prelude::*; use polars_core::schema::Schema; +use polars_parquet::read::{self as parquet2_read, RowGroupMetaData}; +use polars_parquet::write::FileMetaData; use smartstring::alias::String as SmartString; use super::cloud::{build_object_store, CloudLocation, CloudReader}; diff --git a/crates/polars-io/src/parquet/mmap.rs b/crates/polars-io/src/parquet/mmap.rs index cb162478cbf2..94c1c9911c12 100644 --- a/crates/polars-io/src/parquet/mmap.rs +++ b/crates/polars-io/src/parquet/mmap.rs @@ -1,11 +1,11 @@ use arrow::datatypes::Field; -use arrow::io::parquet::read::{ - column_iter_to_arrays, get_field_columns, ArrayIter, BasicDecompressor, ColumnChunkMetaData, - PageReader, -}; use bytes::Bytes; #[cfg(feature = "async")] use polars_core::datatypes::PlHashMap; +use polars_parquet::read::{ + column_iter_to_arrays, get_field_columns, ArrayIter, BasicDecompressor, ColumnChunkMetaData, + PageReader, +}; use super::*; diff --git a/crates/polars-io/src/parquet/mod.rs b/crates/polars-io/src/parquet/mod.rs index 1e9b4da863dc..9f056e547b6f 100644 --- a/crates/polars-io/src/parquet/mod.rs +++ b/crates/polars-io/src/parquet/mod.rs @@ -22,7 +22,7 @@ mod read; mod read_impl; mod write; -use arrow::io::parquet::write::FileMetaData; +pub use polars_parquet::write::FileMetaData; pub use read::*; pub use write::{BrotliLevel, GzipLevel, ZstdLevel, *}; diff --git a/crates/polars-io/src/parquet/predicates.rs b/crates/polars-io/src/parquet/predicates.rs index fe4b8636ec88..72950b5cec63 100644 --- a/crates/polars-io/src/parquet/predicates.rs +++ b/crates/polars-io/src/parquet/predicates.rs @@ -1,6 +1,6 @@ -use arrow::io::parquet::read::statistics::{deserialize, Statistics}; -use arrow::io::parquet::read::RowGroupMetaData; use polars_core::prelude::*; +use polars_parquet::read::statistics::{deserialize, Statistics}; +use polars_parquet::read::RowGroupMetaData; use crate::predicates::{BatchStats, ColumnStats, PhysicalIoExpr}; diff --git a/crates/polars-io/src/parquet/read.rs b/crates/polars-io/src/parquet/read.rs index c09bc65a39cc..ca31fce016e5 100644 --- a/crates/polars-io/src/parquet/read.rs +++ b/crates/polars-io/src/parquet/read.rs @@ -1,10 +1,10 @@ use std::io::{Read, Seek}; use std::sync::Arc; -use arrow::io::parquet::read; -use arrow::io::parquet::write::FileMetaData; use polars_core::prelude::*; use polars_core::utils::accumulate_dataframes_vertical_unchecked; +use polars_parquet::read; +use polars_parquet::write::FileMetaData; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; diff --git a/crates/polars-io/src/parquet/read_impl.rs b/crates/polars-io/src/parquet/read_impl.rs index eecaa59549fa..8f3dabe772c9 100644 --- a/crates/polars-io/src/parquet/read_impl.rs +++ b/crates/polars-io/src/parquet/read_impl.rs @@ -5,11 +5,11 @@ use std::ops::{Deref, Range}; use std::sync::Arc; use arrow::array::new_empty_array; -use arrow::io::parquet::read; -use arrow::io::parquet::read::{ArrayIter, FileMetaData, RowGroupMetaData}; use polars_core::prelude::*; use polars_core::utils::{accumulate_dataframes_vertical, split_df}; use polars_core::POOL; +use polars_parquet::read; +use polars_parquet::read::{ArrayIter, FileMetaData, RowGroupMetaData}; use rayon::prelude::*; use super::mmap::ColumnStore; diff --git a/crates/polars-io/src/parquet/write.rs b/crates/polars-io/src/parquet/write.rs index 71d5876cd31c..9a0677faf4a2 100644 --- a/crates/polars-io/src/parquet/write.rs +++ b/crates/polars-io/src/parquet/write.rs @@ -3,11 +3,11 @@ use std::io::Write; use arrow::array::Array; use arrow::chunk::Chunk; use arrow::datatypes::{DataType as ArrowDataType, PhysicalType}; -use arrow::io::parquet::read::ParquetError; -use arrow::io::parquet::write::{self, DynIter, DynStreamingIterator, Encoding, FileWriter, *}; use polars_core::prelude::*; use polars_core::utils::{accumulate_dataframes_vertical_unchecked, split_df}; use polars_core::POOL; +use polars_parquet::read::ParquetError; +use polars_parquet::write::{self, DynIter, DynStreamingIterator, Encoding, FileWriter, *}; use rayon::prelude::*; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; diff --git a/crates/polars-lazy/Cargo.toml b/crates/polars-lazy/Cargo.toml index 3870ff3e3fc8..4d32f0fbb42e 100644 --- a/crates/polars-lazy/Cargo.toml +++ b/crates/polars-lazy/Cargo.toml @@ -38,7 +38,7 @@ version_check = { workspace = true } [features] nightly = ["polars-core/nightly", "polars-pipe?/nightly", "polars-plan/nightly"] streaming = ["chunked_ids", "polars-pipe", "polars-plan/streaming", "polars-ops/chunked_ids"] -parquet = ["polars-core/parquet", "polars-io/parquet", "polars-plan/parquet", "polars-pipe?/parquet"] +parquet = ["polars-io/parquet", "polars-plan/parquet", "polars-pipe?/parquet"] async = [ "polars-plan/async", "polars-io/cloud", diff --git a/crates/polars-lazy/src/physical_plan/executors/scan/parquet.rs b/crates/polars-lazy/src/physical_plan/executors/scan/parquet.rs index b6a20ddfe0f9..dda996d1e326 100644 --- a/crates/polars-lazy/src/physical_plan/executors/scan/parquet.rs +++ b/crates/polars-lazy/src/physical_plan/executors/scan/parquet.rs @@ -2,8 +2,8 @@ use std::path::{Path, PathBuf}; use polars_core::config::{concurrent_download_limit, verbose}; use polars_core::utils::accumulate_dataframes_vertical; -use polars_core::utils::arrow::io::parquet::read::FileMetaData; use polars_io::cloud::CloudOptions; +use polars_io::parquet::FileMetaData; use polars_io::{is_cloud_url, RowCount}; use super::*; diff --git a/crates/polars-parquet/Cargo.toml b/crates/polars-parquet/Cargo.toml new file mode 100644 index 000000000000..673d000740ac --- /dev/null +++ b/crates/polars-parquet/Cargo.toml @@ -0,0 +1,44 @@ +[package] +name = "polars-parquet" +version = { workspace = true } +authors = [ + "Jorge C. Leitao ", + "Apache Arrow ", + "Ritchie Vink ", +] +edition = { workspace = true } +homepage = { workspace = true } +license-file = "./LICENSE" +repository = { workspace = true } +description = "polars parquet IO" + +[dependencies] +ahash = { workspace = true } +arrow = { workspace = true, features = ["io_ipc"] } +base64 = { workspace = true } +ethnum = { workspace = true } +fallible-streaming-iterator = { workspace = true, optional = true } +futures = { workspace = true, optional = true } +num-traits = { workspace = true } +parquet2 = { workspace = true, optional = true, default-features = true, features = ["async"] } +polars-error = { workspace = true, features = ["parquet2"] } +simdutf8 = { workspace = true } + +[features] +bloom_filter = ["parquet2/bloom_filter"] +async = ["futures"] + +compression = [ + "zstd", + "gzip", + "snappy", + "lz4", + "brotli", +] + +# compression backends +zstd = ["parquet2/zstd"] +snappy = ["parquet2/snappy"] +gzip = ["parquet2/gzip"] +lz4 = ["parquet2/lz4"] +brotli = ["parquet2/brotli"] diff --git a/crates/polars-parquet/LICENSE b/crates/polars-parquet/LICENSE new file mode 100644 index 000000000000..a4b4b70523c3 --- /dev/null +++ b/crates/polars-parquet/LICENSE @@ -0,0 +1,196 @@ +Some of the code in this crate is subject to the Apache 2 license below, as it +was taken from the arrow2 Rust crate in October 2023. Later changes are subject +to the MIT license in ../../LICENSE. + + + + Apache License + Version 2.0, January 2004 + https://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + Copyright 2020-2022 Jorge C. Leitão + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/crates/polars-arrow/src/io/parquet/mod.rs b/crates/polars-parquet/src/arrow/mod.rs similarity index 81% rename from crates/polars-arrow/src/io/parquet/mod.rs rename to crates/polars-parquet/src/arrow/mod.rs index ae26b76b52b9..1ccb35dfeccf 100644 --- a/crates/polars-arrow/src/io/parquet/mod.rs +++ b/crates/polars-parquet/src/arrow/mod.rs @@ -1,5 +1,3 @@ -//! APIs to read from and write to Parquet format. - pub mod read; pub mod write; diff --git a/crates/polars-arrow/src/io/parquet/read/README.md b/crates/polars-parquet/src/arrow/read/README.md similarity index 100% rename from crates/polars-arrow/src/io/parquet/read/README.md rename to crates/polars-parquet/src/arrow/read/README.md diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/README.md b/crates/polars-parquet/src/arrow/read/deserialize/README.md similarity index 100% rename from crates/polars-arrow/src/io/parquet/read/deserialize/README.md rename to crates/polars-parquet/src/arrow/read/deserialize/README.md diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/binary/basic.rs b/crates/polars-parquet/src/arrow/read/deserialize/binary/basic.rs similarity index 99% rename from crates/polars-arrow/src/io/parquet/read/deserialize/binary/basic.rs rename to crates/polars-parquet/src/arrow/read/deserialize/binary/basic.rs index d615077f4efc..902fa69d0031 100644 --- a/crates/polars-arrow/src/io/parquet/read/deserialize/binary/basic.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/binary/basic.rs @@ -1,6 +1,10 @@ use std::collections::VecDeque; use std::default::Default; +use arrow::array::{Array, BinaryArray, Utf8Array}; +use arrow::bitmap::MutableBitmap; +use arrow::datatypes::{DataType, PhysicalType}; +use arrow::offset::Offset; use parquet2::deserialize::SliceFilteredIter; use parquet2::encoding::{delta_length_byte_array, hybrid_rle, Encoding}; use parquet2::page::{split_buffer, DataPage, DictPage}; @@ -13,10 +17,6 @@ use super::super::utils::{ }; use super::super::{utils, Pages}; use super::utils::*; -use crate::array::{Array, BinaryArray, Utf8Array}; -use crate::bitmap::MutableBitmap; -use crate::datatypes::{DataType, PhysicalType}; -use crate::offset::Offset; #[derive(Debug)] pub(super) struct Required<'a> { diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/binary/dictionary.rs b/crates/polars-parquet/src/arrow/read/deserialize/binary/dictionary.rs similarity index 94% rename from crates/polars-arrow/src/io/parquet/read/deserialize/binary/dictionary.rs rename to crates/polars-parquet/src/arrow/read/deserialize/binary/dictionary.rs index 06abcfea80e4..d4c91dbc5d72 100644 --- a/crates/polars-arrow/src/io/parquet/read/deserialize/binary/dictionary.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/binary/dictionary.rs @@ -1,5 +1,9 @@ use std::collections::VecDeque; +use arrow::array::{Array, BinaryArray, DictionaryArray, DictionaryKey, Utf8Array}; +use arrow::bitmap::MutableBitmap; +use arrow::datatypes::{DataType, PhysicalType}; +use arrow::offset::Offset; use parquet2::page::DictPage; use polars_error::PolarsResult; @@ -7,11 +11,7 @@ use super::super::dictionary::*; use super::super::utils::MaybeNext; use super::super::Pages; use super::utils::{Binary, SizedBinaryIter}; -use crate::array::{Array, BinaryArray, DictionaryArray, DictionaryKey, Utf8Array}; -use crate::bitmap::MutableBitmap; -use crate::datatypes::{DataType, PhysicalType}; -use crate::io::parquet::read::deserialize::nested_utils::{InitNested, NestedState}; -use crate::offset::Offset; +use crate::arrow::read::deserialize::nested_utils::{InitNested, NestedState}; /// An iterator adapter over [`Pages`] assumed to be encoded as parquet's dictionary-encoded binary representation #[derive(Debug)] diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/binary/mod.rs b/crates/polars-parquet/src/arrow/read/deserialize/binary/mod.rs similarity index 100% rename from crates/polars-arrow/src/io/parquet/read/deserialize/binary/mod.rs rename to crates/polars-parquet/src/arrow/read/deserialize/binary/mod.rs diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/binary/nested.rs b/crates/polars-parquet/src/arrow/read/deserialize/binary/nested.rs similarity index 97% rename from crates/polars-arrow/src/io/parquet/read/deserialize/binary/nested.rs rename to crates/polars-parquet/src/arrow/read/deserialize/binary/nested.rs index 5deca22bd1c5..37c0a35006f6 100644 --- a/crates/polars-arrow/src/io/parquet/read/deserialize/binary/nested.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/binary/nested.rs @@ -1,5 +1,9 @@ use std::collections::VecDeque; +use arrow::array::Array; +use arrow::bitmap::MutableBitmap; +use arrow::datatypes::DataType; +use arrow::offset::Offset; use parquet2::encoding::Encoding; use parquet2::page::{split_buffer, DataPage, DictPage}; use parquet2::schema::Repetition; @@ -10,11 +14,7 @@ use super::super::utils; use super::super::utils::MaybeNext; use super::basic::{deserialize_plain, finish, Dict, ValuesDictionary}; use super::utils::*; -use crate::array::Array; -use crate::bitmap::MutableBitmap; -use crate::datatypes::DataType; -use crate::io::parquet::read::Pages; -use crate::offset::Offset; +use crate::arrow::read::Pages; #[derive(Debug)] enum State<'a> { diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/binary/utils.rs b/crates/polars-parquet/src/arrow/read/deserialize/binary/utils.rs similarity index 99% rename from crates/polars-arrow/src/io/parquet/read/deserialize/binary/utils.rs rename to crates/polars-parquet/src/arrow/read/deserialize/binary/utils.rs index 961268db2beb..24540caaab97 100644 --- a/crates/polars-arrow/src/io/parquet/read/deserialize/binary/utils.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/binary/utils.rs @@ -1,5 +1,6 @@ +use arrow::offset::{Offset, Offsets}; + use super::super::utils::Pushable; -use crate::offset::{Offset, Offsets}; /// [`Pushable`] for variable length binary data. #[derive(Debug)] diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/boolean/basic.rs b/crates/polars-parquet/src/arrow/read/deserialize/boolean/basic.rs similarity index 98% rename from crates/polars-arrow/src/io/parquet/read/deserialize/boolean/basic.rs rename to crates/polars-parquet/src/arrow/read/deserialize/boolean/basic.rs index da7575f2c0f9..413cfd15da35 100644 --- a/crates/polars-arrow/src/io/parquet/read/deserialize/boolean/basic.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/boolean/basic.rs @@ -1,5 +1,9 @@ use std::collections::VecDeque; +use arrow::array::BooleanArray; +use arrow::bitmap::utils::BitmapIter; +use arrow::bitmap::MutableBitmap; +use arrow::datatypes::DataType; use parquet2::deserialize::SliceFilteredIter; use parquet2::encoding::Encoding; use parquet2::page::{split_buffer, DataPage, DictPage}; @@ -11,10 +15,6 @@ use super::super::utils::{ FilteredOptionalPageValidity, MaybeNext, OptionalPageValidity, }; use super::super::{utils, Pages}; -use crate::array::BooleanArray; -use crate::bitmap::utils::BitmapIter; -use crate::bitmap::MutableBitmap; -use crate::datatypes::DataType; #[derive(Debug)] struct Values<'a>(BitmapIter<'a>); diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/boolean/mod.rs b/crates/polars-parquet/src/arrow/read/deserialize/boolean/mod.rs similarity index 100% rename from crates/polars-arrow/src/io/parquet/read/deserialize/boolean/mod.rs rename to crates/polars-parquet/src/arrow/read/deserialize/boolean/mod.rs diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/boolean/nested.rs b/crates/polars-parquet/src/arrow/read/deserialize/boolean/nested.rs similarity index 97% rename from crates/polars-arrow/src/io/parquet/read/deserialize/boolean/nested.rs rename to crates/polars-parquet/src/arrow/read/deserialize/boolean/nested.rs index 6bb139b683ed..d3a8c0b305c4 100644 --- a/crates/polars-arrow/src/io/parquet/read/deserialize/boolean/nested.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/boolean/nested.rs @@ -1,5 +1,9 @@ use std::collections::VecDeque; +use arrow::array::BooleanArray; +use arrow::bitmap::utils::BitmapIter; +use arrow::bitmap::MutableBitmap; +use arrow::datatypes::DataType; use parquet2::encoding::Encoding; use parquet2::page::{split_buffer, DataPage, DictPage}; use parquet2::schema::Repetition; @@ -8,10 +12,6 @@ use polars_error::PolarsResult; use super::super::nested_utils::*; use super::super::utils::MaybeNext; use super::super::{utils, Pages}; -use crate::array::BooleanArray; -use crate::bitmap::utils::BitmapIter; -use crate::bitmap::MutableBitmap; -use crate::datatypes::DataType; // The state of a `DataPage` of `Boolean` parquet boolean type #[allow(clippy::large_enum_variant)] diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/dictionary/mod.rs b/crates/polars-parquet/src/arrow/read/deserialize/dictionary/mod.rs similarity index 98% rename from crates/polars-arrow/src/io/parquet/read/deserialize/dictionary/mod.rs rename to crates/polars-parquet/src/arrow/read/deserialize/dictionary/mod.rs index b545293e37a4..acb795eb04d4 100644 --- a/crates/polars-arrow/src/io/parquet/read/deserialize/dictionary/mod.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/dictionary/mod.rs @@ -2,6 +2,9 @@ mod nested; use std::collections::VecDeque; +use arrow::array::{Array, DictionaryArray, DictionaryKey, PrimitiveArray}; +use arrow::bitmap::MutableBitmap; +use arrow::datatypes::DataType; use parquet2::deserialize::SliceFilteredIter; use parquet2::encoding::hybrid_rle::HybridRleDecoder; use parquet2::encoding::Encoding; @@ -13,9 +16,6 @@ use super::utils::{ FilteredOptionalPageValidity, MaybeNext, OptionalPageValidity, }; use super::Pages; -use crate::array::{Array, DictionaryArray, DictionaryKey, PrimitiveArray}; -use crate::bitmap::MutableBitmap; -use crate::datatypes::DataType; // The state of a `DataPage` of `Primitive` parquet primitive type #[derive(Debug)] diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/dictionary/nested.rs b/crates/polars-parquet/src/arrow/read/deserialize/dictionary/nested.rs similarity index 98% rename from crates/polars-arrow/src/io/parquet/read/deserialize/dictionary/nested.rs rename to crates/polars-parquet/src/arrow/read/deserialize/dictionary/nested.rs index 89f3d63770b9..03aded4b2b97 100644 --- a/crates/polars-arrow/src/io/parquet/read/deserialize/dictionary/nested.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/dictionary/nested.rs @@ -1,5 +1,8 @@ use std::collections::VecDeque; +use arrow::array::{Array, DictionaryArray, DictionaryKey}; +use arrow::bitmap::MutableBitmap; +use arrow::datatypes::DataType; use parquet2::encoding::hybrid_rle::HybridRleDecoder; use parquet2::encoding::Encoding; use parquet2::page::{DataPage, DictPage, Page}; @@ -10,9 +13,6 @@ use super::super::super::Pages; use super::super::nested_utils::*; use super::super::utils::{dict_indices_decoder, not_implemented, MaybeNext, PageState}; use super::finish_key; -use crate::array::{Array, DictionaryArray, DictionaryKey}; -use crate::bitmap::MutableBitmap; -use crate::datatypes::DataType; // The state of a required DataPage with a boolean physical type #[derive(Debug)] diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs b/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/basic.rs similarity index 98% rename from crates/polars-arrow/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs rename to crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/basic.rs index b9c5625ab347..e51122eec41e 100644 --- a/crates/polars-arrow/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/basic.rs @@ -1,5 +1,8 @@ use std::collections::VecDeque; +use arrow::array::FixedSizeBinaryArray; +use arrow::bitmap::MutableBitmap; +use arrow::datatypes::DataType; use parquet2::deserialize::SliceFilteredIter; use parquet2::encoding::{hybrid_rle, Encoding}; use parquet2::page::{split_buffer, DataPage, DictPage}; @@ -13,9 +16,6 @@ use super::super::utils::{ }; use super::super::Pages; use super::utils::FixedSizeBinary; -use crate::array::FixedSizeBinaryArray; -use crate::bitmap::MutableBitmap; -use crate::datatypes::DataType; pub(super) type Dict = Vec; diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/fixed_size_binary/dictionary.rs b/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/dictionary.rs similarity index 94% rename from crates/polars-arrow/src/io/parquet/read/deserialize/fixed_size_binary/dictionary.rs rename to crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/dictionary.rs index 914d0b516fda..346f092fab84 100644 --- a/crates/polars-arrow/src/io/parquet/read/deserialize/fixed_size_binary/dictionary.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/dictionary.rs @@ -1,15 +1,15 @@ use std::collections::VecDeque; +use arrow::array::{Array, DictionaryArray, DictionaryKey, FixedSizeBinaryArray}; +use arrow::bitmap::MutableBitmap; +use arrow::datatypes::DataType; use parquet2::page::DictPage; use polars_error::PolarsResult; use super::super::dictionary::*; use super::super::utils::MaybeNext; use super::super::Pages; -use crate::array::{Array, DictionaryArray, DictionaryKey, FixedSizeBinaryArray}; -use crate::bitmap::MutableBitmap; -use crate::datatypes::DataType; -use crate::io::parquet::read::deserialize::nested_utils::{InitNested, NestedState}; +use crate::arrow::read::deserialize::nested_utils::{InitNested, NestedState}; /// An iterator adapter over [`Pages`] assumed to be encoded as parquet's dictionary-encoded binary representation #[derive(Debug)] diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/fixed_size_binary/mod.rs b/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/mod.rs similarity index 100% rename from crates/polars-arrow/src/io/parquet/read/deserialize/fixed_size_binary/mod.rs rename to crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/mod.rs diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/fixed_size_binary/nested.rs b/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/nested.rs similarity index 93% rename from crates/polars-arrow/src/io/parquet/read/deserialize/fixed_size_binary/nested.rs rename to crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/nested.rs index bafbbde63c6e..11a1e3f044a7 100644 --- a/crates/polars-arrow/src/io/parquet/read/deserialize/fixed_size_binary/nested.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/nested.rs @@ -1,5 +1,8 @@ use std::collections::VecDeque; +use arrow::array::FixedSizeBinaryArray; +use arrow::bitmap::MutableBitmap; +use arrow::datatypes::DataType; use parquet2::encoding::Encoding; use parquet2::page::{DataPage, DictPage}; use parquet2::schema::Repetition; @@ -7,15 +10,12 @@ use polars_error::PolarsResult; use super::super::utils::{not_implemented, MaybeNext, PageState}; use super::utils::FixedSizeBinary; -use crate::array::FixedSizeBinaryArray; -use crate::bitmap::MutableBitmap; -use crate::datatypes::DataType; -use crate::io::parquet::read::deserialize::fixed_size_binary::basic::{ +use crate::arrow::read::deserialize::fixed_size_binary::basic::{ finish, Dict, Optional, OptionalDictionary, Required, RequiredDictionary, }; -use crate::io::parquet::read::deserialize::nested_utils::{next, NestedDecoder}; -use crate::io::parquet::read::deserialize::utils::Pushable; -use crate::io::parquet::read::{InitNested, NestedState, Pages}; +use crate::arrow::read::deserialize::nested_utils::{next, NestedDecoder}; +use crate::arrow::read::deserialize::utils::Pushable; +use crate::arrow::read::{InitNested, NestedState, Pages}; #[derive(Debug)] enum State<'a> { diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/fixed_size_binary/utils.rs b/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/utils.rs similarity index 100% rename from crates/polars-arrow/src/io/parquet/read/deserialize/fixed_size_binary/utils.rs rename to crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary/utils.rs diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/mod.rs b/crates/polars-parquet/src/arrow/read/deserialize/mod.rs similarity index 89% rename from crates/polars-arrow/src/io/parquet/read/deserialize/mod.rs rename to crates/polars-parquet/src/arrow/read/deserialize/mod.rs index 987e455f4a34..3b8281373c96 100644 --- a/crates/polars-arrow/src/io/parquet/read/deserialize/mod.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/mod.rs @@ -11,6 +11,9 @@ mod simple; mod struct_; mod utils; +use arrow::array::{Array, DictionaryKey, FixedSizeListArray, ListArray, MapArray}; +use arrow::datatypes::{DataType, Field, IntervalUnit}; +use arrow::offset::Offsets; use parquet2::read::get_page_iterator as _get_page_iterator; use parquet2::schema::types::PrimitiveType; use simple::page_iter_to_arrays; @@ -18,9 +21,6 @@ use simple::page_iter_to_arrays; pub use self::nested_utils::{init_nested, InitNested, NestedArrayIter, NestedState}; pub use self::struct_::StructIterator; use super::*; -use crate::array::{Array, DictionaryKey, FixedSizeListArray, ListArray, MapArray}; -use crate::datatypes::{DataType, Field, IntervalUnit}; -use crate::offset::Offsets; /// Creates a new iterator of compressed pages. pub fn get_page_iterator( @@ -112,15 +112,15 @@ pub fn create_map( fn is_primitive(data_type: &DataType) -> bool { matches!( data_type.to_physical_type(), - crate::datatypes::PhysicalType::Primitive(_) - | crate::datatypes::PhysicalType::Null - | crate::datatypes::PhysicalType::Boolean - | crate::datatypes::PhysicalType::Utf8 - | crate::datatypes::PhysicalType::LargeUtf8 - | crate::datatypes::PhysicalType::Binary - | crate::datatypes::PhysicalType::LargeBinary - | crate::datatypes::PhysicalType::FixedSizeBinary - | crate::datatypes::PhysicalType::Dictionary(_) + arrow::datatypes::PhysicalType::Primitive(_) + | arrow::datatypes::PhysicalType::Null + | arrow::datatypes::PhysicalType::Boolean + | arrow::datatypes::PhysicalType::Utf8 + | arrow::datatypes::PhysicalType::LargeUtf8 + | arrow::datatypes::PhysicalType::Binary + | arrow::datatypes::PhysicalType::LargeBinary + | arrow::datatypes::PhysicalType::FixedSizeBinary + | arrow::datatypes::PhysicalType::Dictionary(_) ) } @@ -153,7 +153,7 @@ where /// Returns the number of (parquet) columns that a [`DataType`] contains. pub fn n_columns(data_type: &DataType) -> usize { - use crate::datatypes::PhysicalType::*; + use arrow::datatypes::PhysicalType::*; match data_type.to_physical_type() { Null | Boolean | Primitive(_) | Binary | FixedSizeBinary | LargeBinary | Utf8 | Dictionary(_) | LargeUtf8 => 1, diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/nested.rs b/crates/polars-parquet/src/arrow/read/deserialize/nested.rs similarity index 99% rename from crates/polars-arrow/src/io/parquet/read/deserialize/nested.rs rename to crates/polars-parquet/src/arrow/read/deserialize/nested.rs index 603ecc819519..6741c05ee852 100644 --- a/crates/polars-arrow/src/io/parquet/read/deserialize/nested.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/nested.rs @@ -1,12 +1,12 @@ +use arrow::array::PrimitiveArray; +use arrow::datatypes::{DataType, Field}; +use arrow::match_integer_type; use ethnum::I256; use parquet2::schema::types::PrimitiveType; use polars_error::polars_bail; use super::nested_utils::{InitNested, NestedArrayIter}; use super::*; -use crate::array::PrimitiveArray; -use crate::datatypes::{DataType, Field}; -use crate::match_integer_type; /// Converts an iterator of arrays to a trait object returning trait objects #[inline] @@ -48,8 +48,8 @@ pub fn columns_to_iter_recursive<'a, I: 'a>( where I: Pages, { - use crate::datatypes::PhysicalType::*; - use crate::datatypes::PrimitiveType::*; + use arrow::datatypes::PhysicalType::*; + use arrow::datatypes::PrimitiveType::*; Ok(match field.data_type().to_physical_type() { Null => { diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/nested_utils.rs b/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs similarity index 99% rename from crates/polars-arrow/src/io/parquet/read/deserialize/nested_utils.rs rename to crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs index 482d5117a7da..f27841f7f5b4 100644 --- a/crates/polars-arrow/src/io/parquet/read/deserialize/nested_utils.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs @@ -1,5 +1,7 @@ use std::collections::VecDeque; +use arrow::array::Array; +use arrow::bitmap::MutableBitmap; use parquet2::encoding::hybrid_rle::HybridRleDecoder; use parquet2::page::{split_buffer, DataPage, DictPage, Page}; use parquet2::read::levels::get_bit_width; @@ -8,8 +10,6 @@ use polars_error::PolarsResult; use super::super::Pages; pub use super::utils::Zip; use super::utils::{DecodedState, MaybeNext, PageState}; -use crate::array::Array; -use crate::bitmap::MutableBitmap; /// trait describing deserialized repetition and definition levels pub trait Nested: std::fmt::Debug + Send + Sync { diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/null/mod.rs b/crates/polars-parquet/src/arrow/read/deserialize/null/mod.rs similarity index 95% rename from crates/polars-arrow/src/io/parquet/read/deserialize/null/mod.rs rename to crates/polars-parquet/src/arrow/read/deserialize/null/mod.rs index ca051e81243f..ad6227a45f2e 100644 --- a/crates/polars-arrow/src/io/parquet/read/deserialize/null/mod.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/null/mod.rs @@ -1,11 +1,11 @@ mod nested; +use arrow::array::NullArray; +use arrow::datatypes::DataType; pub(super) use nested::NestedIter; use parquet2::page::Page; use super::super::{ArrayIter, Pages}; -use crate::array::NullArray; -use crate::datatypes::DataType; /// Converts [`Pages`] to an [`ArrayIter`] pub fn iter_to_arrays<'a, I>( @@ -54,6 +54,8 @@ where #[cfg(test)] mod tests { + use arrow::array::NullArray; + use arrow::datatypes::DataType; use parquet2::encoding::Encoding; use parquet2::error::Error as ParquetError; use parquet2::metadata::Descriptor; @@ -62,8 +64,6 @@ mod tests { use polars_error::*; use super::iter_to_arrays; - use crate::array::NullArray; - use crate::datatypes::DataType; #[test] fn limit() { diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/null/nested.rs b/crates/polars-parquet/src/arrow/read/deserialize/null/nested.rs similarity index 95% rename from crates/polars-arrow/src/io/parquet/read/deserialize/null/nested.rs rename to crates/polars-parquet/src/arrow/read/deserialize/null/nested.rs index dc3f0fe8364b..0da98d13f17d 100644 --- a/crates/polars-arrow/src/io/parquet/read/deserialize/null/nested.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/null/nested.rs @@ -1,13 +1,13 @@ use std::collections::VecDeque; +use arrow::array::NullArray; +use arrow::datatypes::DataType; use parquet2::page::{DataPage, DictPage}; use polars_error::PolarsResult; use super::super::nested_utils::*; use super::super::{utils, Pages}; -use crate::array::NullArray; -use crate::datatypes::DataType; -use crate::io::parquet::read::deserialize::utils::DecodedState; +use crate::arrow::read::deserialize::utils::DecodedState; impl<'a> utils::PageState<'a> for usize { fn len(&self) -> usize { diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/primitive/basic.rs b/crates/polars-parquet/src/arrow/read/deserialize/primitive/basic.rs similarity index 98% rename from crates/polars-arrow/src/io/parquet/read/deserialize/primitive/basic.rs rename to crates/polars-parquet/src/arrow/read/deserialize/primitive/basic.rs index 7b08ed5ca176..acd6ea5ae785 100644 --- a/crates/polars-arrow/src/io/parquet/read/deserialize/primitive/basic.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/primitive/basic.rs @@ -1,5 +1,9 @@ use std::collections::VecDeque; +use arrow::array::MutablePrimitiveArray; +use arrow::bitmap::MutableBitmap; +use arrow::datatypes::DataType; +use arrow::types::NativeType; use parquet2::deserialize::SliceFilteredIter; use parquet2::encoding::{hybrid_rle, Encoding}; use parquet2::page::{split_buffer, DataPage, DictPage}; @@ -9,10 +13,6 @@ use polars_error::PolarsResult; use super::super::utils::{get_selected_rows, FilteredOptionalPageValidity, OptionalPageValidity}; use super::super::{utils, Pages}; -use crate::array::MutablePrimitiveArray; -use crate::bitmap::MutableBitmap; -use crate::datatypes::DataType; -use crate::types::NativeType; #[derive(Debug)] pub(super) struct FilteredRequiredValues<'a> { diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/primitive/dictionary.rs b/crates/polars-parquet/src/arrow/read/deserialize/primitive/dictionary.rs similarity index 96% rename from crates/polars-arrow/src/io/parquet/read/deserialize/primitive/dictionary.rs rename to crates/polars-parquet/src/arrow/read/deserialize/primitive/dictionary.rs index 07cefe799151..6f476cbafe79 100644 --- a/crates/polars-arrow/src/io/parquet/read/deserialize/primitive/dictionary.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/primitive/dictionary.rs @@ -1,5 +1,9 @@ use std::collections::VecDeque; +use arrow::array::{Array, DictionaryArray, DictionaryKey, PrimitiveArray}; +use arrow::bitmap::MutableBitmap; +use arrow::datatypes::DataType; +use arrow::types::NativeType; use parquet2::page::DictPage; use parquet2::types::NativeType as ParquetNativeType; use polars_error::PolarsResult; @@ -9,10 +13,6 @@ use super::super::nested_utils::{InitNested, NestedState}; use super::super::utils::MaybeNext; use super::super::Pages; use super::basic::deserialize_plain; -use crate::array::{Array, DictionaryArray, DictionaryKey, PrimitiveArray}; -use crate::bitmap::MutableBitmap; -use crate::datatypes::DataType; -use crate::types::NativeType; fn read_dict(data_type: DataType, op: F, dict: &DictPage) -> Box where diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/primitive/integer.rs b/crates/polars-parquet/src/arrow/read/deserialize/primitive/integer.rs similarity index 97% rename from crates/polars-arrow/src/io/parquet/read/deserialize/primitive/integer.rs rename to crates/polars-parquet/src/arrow/read/deserialize/primitive/integer.rs index 3aeebac95219..8472a54bda3d 100644 --- a/crates/polars-arrow/src/io/parquet/read/deserialize/primitive/integer.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/primitive/integer.rs @@ -1,5 +1,9 @@ use std::collections::VecDeque; +use arrow::array::MutablePrimitiveArray; +use arrow::bitmap::MutableBitmap; +use arrow::datatypes::DataType; +use arrow::types::NativeType; use num_traits::AsPrimitive; use parquet2::deserialize::SliceFilteredIter; use parquet2::encoding::delta_bitpacked::Decoder; @@ -11,13 +15,9 @@ use polars_error::{to_compute_err, PolarsResult}; use super::super::{utils, Pages}; use super::basic::{finish, PrimitiveDecoder, State as PrimitiveState}; -use crate::array::MutablePrimitiveArray; -use crate::bitmap::MutableBitmap; -use crate::datatypes::DataType; -use crate::io::parquet::read::deserialize::utils::{ +use crate::arrow::read::deserialize::utils::{ get_selected_rows, FilteredOptionalPageValidity, OptionalPageValidity, }; -use crate::types::NativeType; /// The state of a [`DataPage`] of an integer parquet type (i32 or i64) #[derive(Debug)] diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/primitive/mod.rs b/crates/polars-parquet/src/arrow/read/deserialize/primitive/mod.rs similarity index 100% rename from crates/polars-arrow/src/io/parquet/read/deserialize/primitive/mod.rs rename to crates/polars-parquet/src/arrow/read/deserialize/primitive/mod.rs diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/primitive/nested.rs b/crates/polars-parquet/src/arrow/read/deserialize/primitive/nested.rs similarity index 98% rename from crates/polars-arrow/src/io/parquet/read/deserialize/primitive/nested.rs rename to crates/polars-parquet/src/arrow/read/deserialize/primitive/nested.rs index 7e515d23ec2d..3b87d3fcde92 100644 --- a/crates/polars-arrow/src/io/parquet/read/deserialize/primitive/nested.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/primitive/nested.rs @@ -1,5 +1,9 @@ use std::collections::VecDeque; +use arrow::array::PrimitiveArray; +use arrow::bitmap::MutableBitmap; +use arrow::datatypes::DataType; +use arrow::types::NativeType; use parquet2::encoding::Encoding; use parquet2::page::{DataPage, DictPage}; use parquet2::schema::Repetition; @@ -9,10 +13,6 @@ use polars_error::PolarsResult; use super::super::nested_utils::*; use super::super::{utils, Pages}; use super::basic::{deserialize_plain, Values, ValuesDictionary}; -use crate::array::PrimitiveArray; -use crate::bitmap::MutableBitmap; -use crate::datatypes::DataType; -use crate::types::NativeType; // The state of a `DataPage` of `Primitive` parquet primitive type #[allow(clippy::large_enum_variant)] diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/simple.rs b/crates/polars-parquet/src/arrow/read/deserialize/simple.rs similarity index 99% rename from crates/polars-arrow/src/io/parquet/read/deserialize/simple.rs rename to crates/polars-parquet/src/arrow/read/deserialize/simple.rs index 7ab2b35e0735..119b569116f5 100644 --- a/crates/polars-arrow/src/io/parquet/read/deserialize/simple.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/simple.rs @@ -1,3 +1,7 @@ +use arrow::array::{Array, DictionaryKey, MutablePrimitiveArray, PrimitiveArray}; +use arrow::datatypes::{DataType, IntervalUnit, TimeUnit}; +use arrow::match_integer_type; +use arrow::types::{days_ms, i256, NativeType}; use ethnum::I256; use parquet2::schema::types::{ PhysicalType, PrimitiveLogicalType, PrimitiveType, TimeUnit as ParquetTimeUnit, @@ -7,10 +11,6 @@ use polars_error::{polars_bail, PolarsResult}; use super::super::{ArrayIter, Pages}; use super::{binary, boolean, fixed_size_binary, null, primitive}; -use crate::array::{Array, DictionaryKey, MutablePrimitiveArray, PrimitiveArray}; -use crate::datatypes::{DataType, IntervalUnit, TimeUnit}; -use crate::match_integer_type; -use crate::types::{days_ms, i256, NativeType}; /// Converts an iterator of arrays to a trait object returning trait objects #[inline] diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/struct_.rs b/crates/polars-parquet/src/arrow/read/deserialize/struct_.rs similarity index 95% rename from crates/polars-arrow/src/io/parquet/read/deserialize/struct_.rs rename to crates/polars-parquet/src/arrow/read/deserialize/struct_.rs index 1dcd46d1f965..d47654733e4b 100644 --- a/crates/polars-arrow/src/io/parquet/read/deserialize/struct_.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/struct_.rs @@ -1,8 +1,8 @@ +use arrow::array::{Array, StructArray}; +use arrow::datatypes::{DataType, Field}; use polars_error::PolarsResult; use super::nested_utils::{NestedArrayIter, NestedState}; -use crate::array::{Array, StructArray}; -use crate::datatypes::{DataType, Field}; /// An iterator adapter over [`NestedArrayIter`] assumed to be encoded as Struct arrays pub struct StructIterator<'a> { diff --git a/crates/polars-arrow/src/io/parquet/read/deserialize/utils.rs b/crates/polars-parquet/src/arrow/read/deserialize/utils.rs similarity index 99% rename from crates/polars-arrow/src/io/parquet/read/deserialize/utils.rs rename to crates/polars-parquet/src/arrow/read/deserialize/utils.rs index 1f54a7369a94..8e3eb98147c9 100644 --- a/crates/polars-arrow/src/io/parquet/read/deserialize/utils.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/utils.rs @@ -1,5 +1,7 @@ use std::collections::VecDeque; +use arrow::bitmap::utils::BitmapIter; +use arrow::bitmap::MutableBitmap; use parquet2::deserialize::{ FilteredHybridEncoded, FilteredHybridRleDecoderIter, HybridDecoderBitmapIter, HybridEncoded, }; @@ -10,8 +12,6 @@ use parquet2::schema::Repetition; use polars_error::{polars_err, to_compute_err, PolarsError, PolarsResult}; use super::super::Pages; -use crate::bitmap::utils::BitmapIter; -use crate::bitmap::MutableBitmap; pub fn not_implemented(page: &DataPage) -> PolarsError { let is_optional = page.descriptor.primitive_type.field_info.repetition == Repetition::Optional; diff --git a/crates/polars-arrow/src/io/parquet/read/file.rs b/crates/polars-parquet/src/arrow/read/file.rs similarity index 98% rename from crates/polars-arrow/src/io/parquet/read/file.rs rename to crates/polars-parquet/src/arrow/read/file.rs index 4b986758206b..109011175c7b 100644 --- a/crates/polars-arrow/src/io/parquet/read/file.rs +++ b/crates/polars-parquet/src/arrow/read/file.rs @@ -1,13 +1,13 @@ use std::io::{Read, Seek}; +use arrow::array::Array; +use arrow::chunk::Chunk; +use arrow::datatypes::Schema; use parquet2::indexes::FilteredPage; use polars_error::PolarsResult; use super::{RowGroupDeserializer, RowGroupMetaData}; -use crate::array::Array; -use crate::chunk::Chunk; -use crate::datatypes::Schema; -use crate::io::parquet::read::read_columns_many; +use crate::arrow::read::read_columns_many; /// An iterator of [`Chunk`]s coming from row groups of a parquet file. /// diff --git a/crates/polars-arrow/src/io/parquet/read/indexes/binary.rs b/crates/polars-parquet/src/arrow/read/indexes/binary.rs similarity index 91% rename from crates/polars-arrow/src/io/parquet/read/indexes/binary.rs rename to crates/polars-parquet/src/arrow/read/indexes/binary.rs index f8eeebc60a57..83de6a6f525a 100644 --- a/crates/polars-arrow/src/io/parquet/read/indexes/binary.rs +++ b/crates/polars-parquet/src/arrow/read/indexes/binary.rs @@ -1,10 +1,10 @@ +use arrow::array::{Array, BinaryArray, PrimitiveArray, Utf8Array}; +use arrow::datatypes::{DataType, PhysicalType}; +use arrow::trusted_len::TrustedLen; use parquet2::indexes::PageIndex; use polars_error::{to_compute_err, PolarsResult}; use super::ColumnPageStatistics; -use crate::array::{Array, BinaryArray, PrimitiveArray, Utf8Array}; -use crate::datatypes::{DataType, PhysicalType}; -use crate::trusted_len::TrustedLen; pub fn deserialize( indexes: &[PageIndex>], diff --git a/crates/polars-arrow/src/io/parquet/read/indexes/boolean.rs b/crates/polars-parquet/src/arrow/read/indexes/boolean.rs similarity index 92% rename from crates/polars-arrow/src/io/parquet/read/indexes/boolean.rs rename to crates/polars-parquet/src/arrow/read/indexes/boolean.rs index 70977197d103..5c809673eba1 100644 --- a/crates/polars-arrow/src/io/parquet/read/indexes/boolean.rs +++ b/crates/polars-parquet/src/arrow/read/indexes/boolean.rs @@ -1,7 +1,7 @@ +use arrow::array::{BooleanArray, PrimitiveArray}; use parquet2::indexes::PageIndex; use super::ColumnPageStatistics; -use crate::array::{BooleanArray, PrimitiveArray}; pub fn deserialize(indexes: &[PageIndex]) -> ColumnPageStatistics { ColumnPageStatistics { diff --git a/crates/polars-arrow/src/io/parquet/read/indexes/fixed_len_binary.rs b/crates/polars-parquet/src/arrow/read/indexes/fixed_len_binary.rs similarity index 91% rename from crates/polars-arrow/src/io/parquet/read/indexes/fixed_len_binary.rs rename to crates/polars-parquet/src/arrow/read/indexes/fixed_len_binary.rs index 26002e5857d5..c6cede8dd466 100644 --- a/crates/polars-arrow/src/io/parquet/read/indexes/fixed_len_binary.rs +++ b/crates/polars-parquet/src/arrow/read/indexes/fixed_len_binary.rs @@ -1,10 +1,10 @@ +use arrow::array::{Array, FixedSizeBinaryArray, MutableFixedSizeBinaryArray, PrimitiveArray}; +use arrow::datatypes::{DataType, PhysicalType, PrimitiveType}; +use arrow::trusted_len::TrustedLen; +use arrow::types::{i256, NativeType}; use parquet2::indexes::PageIndex; use super::ColumnPageStatistics; -use crate::array::{Array, FixedSizeBinaryArray, MutableFixedSizeBinaryArray, PrimitiveArray}; -use crate::datatypes::{DataType, PhysicalType, PrimitiveType}; -use crate::trusted_len::TrustedLen; -use crate::types::{i256, NativeType}; pub fn deserialize(indexes: &[PageIndex>], data_type: DataType) -> ColumnPageStatistics { ColumnPageStatistics { diff --git a/crates/polars-arrow/src/io/parquet/read/indexes/mod.rs b/crates/polars-parquet/src/arrow/read/indexes/mod.rs similarity index 98% rename from crates/polars-arrow/src/io/parquet/read/indexes/mod.rs rename to crates/polars-parquet/src/arrow/read/indexes/mod.rs index 040edc77ba5e..1abd34c5a968 100644 --- a/crates/polars-arrow/src/io/parquet/read/indexes/mod.rs +++ b/crates/polars-parquet/src/arrow/read/indexes/mod.rs @@ -16,12 +16,12 @@ mod primitive; use std::collections::VecDeque; use std::io::{Read, Seek}; +use arrow::array::{Array, UInt64Array}; +use arrow::datatypes::{DataType, Field, PhysicalType, PrimitiveType}; pub use parquet2::indexes::{FilteredPage, Interval}; use polars_error::{polars_bail, PolarsResult}; use super::get_field_pages; -use crate::array::{Array, UInt64Array}; -use crate::datatypes::{DataType, Field, PhysicalType, PrimitiveType}; /// Page statistics of an Arrow field. #[derive(Debug, PartialEq)] @@ -63,7 +63,7 @@ pub struct ColumnPageStatistics { /// # Error /// This function errors iff the value is not deserializable to arrow (e.g. invalid utf-8) fn deserialize( - indexes: &mut VecDeque<&Box>, + indexes: &mut VecDeque<&dyn ParquetIndex>, data_type: DataType, ) -> PolarsResult { match data_type.to_physical_type() { @@ -278,7 +278,7 @@ pub fn read_columns_indexes( .iter() .map(|field| { let indexes = get_field_pages(chunks, &indexes, &field.name); - let mut indexes = indexes.into_iter().collect(); + let mut indexes = indexes.into_iter().map(|boxed| boxed.as_ref()).collect(); deserialize(&mut indexes, field.data_type.clone()) }) diff --git a/crates/polars-arrow/src/io/parquet/read/indexes/primitive.rs b/crates/polars-parquet/src/arrow/read/indexes/primitive.rs similarity index 97% rename from crates/polars-arrow/src/io/parquet/read/indexes/primitive.rs rename to crates/polars-parquet/src/arrow/read/indexes/primitive.rs index 90e52e4a4aaf..fd551c35a2b0 100644 --- a/crates/polars-arrow/src/io/parquet/read/indexes/primitive.rs +++ b/crates/polars-parquet/src/arrow/read/indexes/primitive.rs @@ -1,13 +1,13 @@ +use arrow::array::{Array, MutablePrimitiveArray, PrimitiveArray}; +use arrow::datatypes::{DataType, TimeUnit}; +use arrow::trusted_len::TrustedLen; +use arrow::types::{i256, NativeType}; use ethnum::I256; use parquet2::indexes::PageIndex; use parquet2::schema::types::{PrimitiveLogicalType, PrimitiveType, TimeUnit as ParquetTimeUnit}; use parquet2::types::int96_to_i64_ns; use super::ColumnPageStatistics; -use crate::array::{Array, MutablePrimitiveArray, PrimitiveArray}; -use crate::datatypes::{DataType, TimeUnit}; -use crate::trusted_len::TrustedLen; -use crate::types::{i256, NativeType}; #[inline] fn deserialize_int32>>( diff --git a/crates/polars-arrow/src/io/parquet/read/mod.rs b/crates/polars-parquet/src/arrow/read/mod.rs similarity index 93% rename from crates/polars-arrow/src/io/parquet/read/mod.rs rename to crates/polars-parquet/src/arrow/read/mod.rs index b1cf40115e9f..1a8cdc4c05b7 100644 --- a/crates/polars-arrow/src/io/parquet/read/mod.rs +++ b/crates/polars-parquet/src/arrow/read/mod.rs @@ -10,11 +10,14 @@ pub mod statistics; use std::io::{Read, Seek}; +use arrow::array::Array; +use arrow::types::{i256, NativeType}; pub use deserialize::{ column_iter_to_arrays, create_list, create_map, get_page_iterator, init_nested, n_columns, InitNested, NestedArrayIter, NestedState, StructIterator, }; pub use file::{FileReader, RowGroupReader}; +#[cfg(feature = "async")] use futures::{AsyncRead, AsyncSeek}; // re-exports of parquet2's relevant APIs pub use parquet2::{ @@ -39,9 +42,6 @@ use polars_error::PolarsResult; pub use row_group::*; pub use schema::{infer_schema, FileMetaData}; -use crate::array::Array; -use crate::types::{i256, NativeType}; - /// Trait describing a [`FallibleStreamingIterator`] of [`Page`] pub trait Pages: FallibleStreamingIterator + Send + Sync @@ -59,14 +59,15 @@ pub fn read_metadata(reader: &mut R) -> PolarsResult( reader: &mut R, ) -> PolarsResult { Ok(_read_metadata_async(reader).await?) } -fn convert_days_ms(value: &[u8]) -> crate::types::days_ms { - crate::types::days_ms( +fn convert_days_ms(value: &[u8]) -> arrow::types::days_ms { + arrow::types::days_ms( i32::from_le_bytes(value[4..8].try_into().unwrap()), i32::from_le_bytes(value[8..12].try_into().unwrap()), ) diff --git a/crates/polars-arrow/src/io/parquet/read/row_group.rs b/crates/polars-parquet/src/arrow/read/row_group.rs similarity index 98% rename from crates/polars-arrow/src/io/parquet/read/row_group.rs rename to crates/polars-parquet/src/arrow/read/row_group.rs index ff9689eb87dd..24c7c9c64d40 100644 --- a/crates/polars-arrow/src/io/parquet/read/row_group.rs +++ b/crates/polars-parquet/src/arrow/read/row_group.rs @@ -1,15 +1,15 @@ use std::io::{Read, Seek}; +use arrow::array::Array; +use arrow::chunk::Chunk; +use arrow::datatypes::Field; use parquet2::indexes::FilteredPage; use parquet2::metadata::ColumnChunkMetaData; use parquet2::read::{BasicDecompressor, IndexedPageReader, PageMetaData, PageReader}; use polars_error::PolarsResult; use super::{ArrayIter, RowGroupMetaData}; -use crate::array::Array; -use crate::chunk::Chunk; -use crate::datatypes::Field; -use crate::io::parquet::read::column_iter_to_arrays; +use crate::arrow::read::column_iter_to_arrays; /// An [`Iterator`] of [`Chunk`] that (dynamically) adapts a vector of iterators of [`Array`] into /// an iterator of [`Chunk`]. diff --git a/crates/polars-arrow/src/io/parquet/read/schema/convert.rs b/crates/polars-parquet/src/arrow/read/schema/convert.rs similarity index 99% rename from crates/polars-arrow/src/io/parquet/read/schema/convert.rs rename to crates/polars-parquet/src/arrow/read/schema/convert.rs index e7c70840b6d2..549eaf654d1d 100644 --- a/crates/polars-arrow/src/io/parquet/read/schema/convert.rs +++ b/crates/polars-parquet/src/arrow/read/schema/convert.rs @@ -1,12 +1,12 @@ //! This module has entry points, [`parquet_to_arrow_schema`] and the more configurable [`parquet_to_arrow_schema_with_options`]. +use arrow::datatypes::{DataType, Field, IntervalUnit, TimeUnit}; use parquet2::schema::types::{ FieldInfo, GroupConvertedType, GroupLogicalType, IntegerType, ParquetType, PhysicalType, PrimitiveConvertedType, PrimitiveLogicalType, PrimitiveType, TimeUnit as ParquetTimeUnit, }; use parquet2::schema::Repetition; -use crate::datatypes::{DataType, Field, IntervalUnit, TimeUnit}; -use crate::io::parquet::read::schema::SchemaInferenceOptions; +use crate::arrow::read::schema::SchemaInferenceOptions; /// Converts [`ParquetType`]s to a [`Field`], ignoring parquet fields that do not contain /// any physical column. @@ -398,11 +398,11 @@ pub(crate) fn to_data_type( #[cfg(test)] mod tests { + use arrow::datatypes::{DataType, Field, TimeUnit}; use parquet2::metadata::SchemaDescriptor; use polars_error::*; use super::*; - use crate::datatypes::{DataType, Field, TimeUnit}; #[test] fn test_flat_primitives() -> PolarsResult<()> { diff --git a/crates/polars-arrow/src/io/parquet/read/schema/metadata.rs b/crates/polars-parquet/src/arrow/read/schema/metadata.rs similarity index 95% rename from crates/polars-arrow/src/io/parquet/read/schema/metadata.rs rename to crates/polars-parquet/src/arrow/read/schema/metadata.rs index ec1efe4f5376..c3056cd63597 100644 --- a/crates/polars-arrow/src/io/parquet/read/schema/metadata.rs +++ b/crates/polars-parquet/src/arrow/read/schema/metadata.rs @@ -1,11 +1,11 @@ +use arrow::datatypes::{Metadata, Schema}; +use arrow::io::ipc::read::deserialize_schema; use base64::engine::general_purpose; use base64::Engine as _; pub use parquet2::metadata::KeyValue; use polars_error::{polars_bail, PolarsResult}; use super::super::super::ARROW_SCHEMA_META_KEY; -use crate::datatypes::{Metadata, Schema}; -use crate::io::ipc::read::deserialize_schema; /// Reads an arrow schema from Parquet's file metadata. Returns `None` if no schema was found. /// # Errors diff --git a/crates/polars-arrow/src/io/parquet/read/schema/mod.rs b/crates/polars-parquet/src/arrow/read/schema/mod.rs similarity index 98% rename from crates/polars-arrow/src/io/parquet/read/schema/mod.rs rename to crates/polars-parquet/src/arrow/read/schema/mod.rs index 945b4459f3fa..2d4c8b5da54f 100644 --- a/crates/polars-arrow/src/io/parquet/read/schema/mod.rs +++ b/crates/polars-parquet/src/arrow/read/schema/mod.rs @@ -1,5 +1,5 @@ //! APIs to handle Parquet <-> Arrow schemas. -use crate::datatypes::{Schema, TimeUnit}; +use arrow::datatypes::{Schema, TimeUnit}; mod convert; mod metadata; diff --git a/crates/polars-arrow/src/io/parquet/read/statistics/binary.rs b/crates/polars-parquet/src/arrow/read/statistics/binary.rs similarity index 89% rename from crates/polars-arrow/src/io/parquet/read/statistics/binary.rs rename to crates/polars-parquet/src/arrow/read/statistics/binary.rs index e0a307d8095d..925d81176e2b 100644 --- a/crates/polars-arrow/src/io/parquet/read/statistics/binary.rs +++ b/crates/polars-parquet/src/arrow/read/statistics/binary.rs @@ -1,9 +1,8 @@ +use arrow::array::{MutableArray, MutableBinaryArray}; +use arrow::offset::Offset; use parquet2::statistics::{BinaryStatistics, Statistics as ParquetStatistics}; use polars_error::PolarsResult; -use crate::array::{MutableArray, MutableBinaryArray}; -use crate::offset::Offset; - pub(super) fn push( from: Option<&dyn ParquetStatistics>, min: &mut dyn MutableArray, diff --git a/crates/polars-arrow/src/io/parquet/read/statistics/boolean.rs b/crates/polars-parquet/src/arrow/read/statistics/boolean.rs similarity index 92% rename from crates/polars-arrow/src/io/parquet/read/statistics/boolean.rs rename to crates/polars-parquet/src/arrow/read/statistics/boolean.rs index 202ee69061de..23a5504124ce 100644 --- a/crates/polars-arrow/src/io/parquet/read/statistics/boolean.rs +++ b/crates/polars-parquet/src/arrow/read/statistics/boolean.rs @@ -1,8 +1,7 @@ +use arrow::array::{MutableArray, MutableBooleanArray}; use parquet2::statistics::{BooleanStatistics, Statistics as ParquetStatistics}; use polars_error::PolarsResult; -use crate::array::{MutableArray, MutableBooleanArray}; - pub(super) fn push( from: Option<&dyn ParquetStatistics>, min: &mut dyn MutableArray, diff --git a/crates/polars-arrow/src/io/parquet/read/statistics/dictionary.rs b/crates/polars-parquet/src/arrow/read/statistics/dictionary.rs similarity index 90% rename from crates/polars-arrow/src/io/parquet/read/statistics/dictionary.rs rename to crates/polars-parquet/src/arrow/read/statistics/dictionary.rs index 4548efd5d92d..f07c7d0f7df5 100644 --- a/crates/polars-arrow/src/io/parquet/read/statistics/dictionary.rs +++ b/crates/polars-parquet/src/arrow/read/statistics/dictionary.rs @@ -1,9 +1,9 @@ +use arrow::array::*; +use arrow::datatypes::{DataType, PhysicalType}; +use arrow::match_integer_type; use polars_error::PolarsResult; use super::make_mutable; -use crate::array::*; -use crate::datatypes::{DataType, PhysicalType}; -use crate::match_integer_type; #[derive(Debug)] pub struct DynMutableDictionary { @@ -33,7 +33,7 @@ impl MutableArray for DynMutableDictionary { self.inner.len() } - fn validity(&self) -> Option<&crate::bitmap::MutableBitmap> { + fn validity(&self) -> Option<&arrow::bitmap::MutableBitmap> { self.inner.validity() } diff --git a/crates/polars-arrow/src/io/parquet/read/statistics/fixlen.rs b/crates/polars-parquet/src/arrow/read/statistics/fixlen.rs similarity index 97% rename from crates/polars-arrow/src/io/parquet/read/statistics/fixlen.rs rename to crates/polars-parquet/src/arrow/read/statistics/fixlen.rs index 8a6322a7343f..b5e03eaa38f2 100644 --- a/crates/polars-arrow/src/io/parquet/read/statistics/fixlen.rs +++ b/crates/polars-parquet/src/arrow/read/statistics/fixlen.rs @@ -1,11 +1,11 @@ +use arrow::array::*; +use arrow::types::{days_ms, i256}; use ethnum::I256; use parquet2::statistics::{FixedLenStatistics, Statistics as ParquetStatistics}; use polars_error::PolarsResult; use super::super::{convert_days_ms, convert_i128}; -use crate::array::*; -use crate::io::parquet::read::convert_i256; -use crate::types::{days_ms, i256}; +use crate::arrow::read::convert_i256; pub(super) fn push_i128( from: Option<&dyn ParquetStatistics>, diff --git a/crates/polars-arrow/src/io/parquet/read/statistics/list.rs b/crates/polars-parquet/src/arrow/read/statistics/list.rs similarity index 93% rename from crates/polars-arrow/src/io/parquet/read/statistics/list.rs rename to crates/polars-parquet/src/arrow/read/statistics/list.rs index d88fd1c6606d..f8da5408060d 100644 --- a/crates/polars-arrow/src/io/parquet/read/statistics/list.rs +++ b/crates/polars-parquet/src/arrow/read/statistics/list.rs @@ -1,9 +1,9 @@ +use arrow::array::*; +use arrow::datatypes::DataType; +use arrow::offset::Offsets; use polars_error::PolarsResult; use super::make_mutable; -use crate::array::*; -use crate::datatypes::DataType; -use crate::offset::Offsets; #[derive(Debug)] pub struct DynMutableListArray { @@ -32,7 +32,7 @@ impl MutableArray for DynMutableListArray { self.inner.len() } - fn validity(&self) -> Option<&crate::bitmap::MutableBitmap> { + fn validity(&self) -> Option<&arrow::bitmap::MutableBitmap> { self.inner.validity() } diff --git a/crates/polars-arrow/src/io/parquet/read/statistics/map.rs b/crates/polars-parquet/src/arrow/read/statistics/map.rs similarity index 89% rename from crates/polars-arrow/src/io/parquet/read/statistics/map.rs rename to crates/polars-parquet/src/arrow/read/statistics/map.rs index 5dc51d9382f6..06bcbf42f90e 100644 --- a/crates/polars-arrow/src/io/parquet/read/statistics/map.rs +++ b/crates/polars-parquet/src/arrow/read/statistics/map.rs @@ -1,8 +1,8 @@ +use arrow::array::{Array, MapArray, MutableArray}; +use arrow::datatypes::DataType; use polars_error::PolarsResult; use super::make_mutable; -use crate::array::{Array, MapArray, MutableArray}; -use crate::datatypes::DataType; #[derive(Debug)] pub struct DynMutableMapArray { @@ -31,7 +31,7 @@ impl MutableArray for DynMutableMapArray { self.inner.len() } - fn validity(&self) -> Option<&crate::bitmap::MutableBitmap> { + fn validity(&self) -> Option<&arrow::bitmap::MutableBitmap> { None } diff --git a/crates/polars-arrow/src/io/parquet/read/statistics/mod.rs b/crates/polars-parquet/src/arrow/read/statistics/mod.rs similarity index 99% rename from crates/polars-arrow/src/io/parquet/read/statistics/mod.rs rename to crates/polars-parquet/src/arrow/read/statistics/mod.rs index c1ebfc1020db..ada51dbc2d39 100644 --- a/crates/polars-arrow/src/io/parquet/read/statistics/mod.rs +++ b/crates/polars-parquet/src/arrow/read/statistics/mod.rs @@ -2,6 +2,10 @@ use std::collections::VecDeque; use std::sync::Arc; +use arrow::array::*; +use arrow::datatypes::{DataType, Field, IntervalUnit, PhysicalType}; +use arrow::types::i256; +use arrow::with_match_primitive_type; use ethnum::I256; use parquet2::metadata::RowGroupMetaData; use parquet2::schema::types::{ @@ -14,11 +18,6 @@ use parquet2::statistics::{ use parquet2::types::int96_to_i64_ns; use polars_error::{polars_bail, PolarsResult}; -use crate::array::*; -use crate::datatypes::{DataType, Field, IntervalUnit, PhysicalType}; -use crate::types::i256; -use crate::with_match_primitive_type; - mod binary; mod boolean; mod dictionary; diff --git a/crates/polars-arrow/src/io/parquet/read/statistics/null.rs b/crates/polars-parquet/src/arrow/read/statistics/null.rs similarity index 93% rename from crates/polars-arrow/src/io/parquet/read/statistics/null.rs rename to crates/polars-parquet/src/arrow/read/statistics/null.rs index 319d8a4b4d09..913193b4aae1 100644 --- a/crates/polars-arrow/src/io/parquet/read/statistics/null.rs +++ b/crates/polars-parquet/src/arrow/read/statistics/null.rs @@ -1,7 +1,6 @@ +use arrow::array::*; use polars_error::PolarsResult; -use crate::array::*; - pub(super) fn push(min: &mut dyn MutableArray, max: &mut dyn MutableArray) -> PolarsResult<()> { let min = min.as_mut_any().downcast_mut::().unwrap(); let max = max.as_mut_any().downcast_mut::().unwrap(); diff --git a/crates/polars-arrow/src/io/parquet/read/statistics/primitive.rs b/crates/polars-parquet/src/arrow/read/statistics/primitive.rs similarity index 96% rename from crates/polars-arrow/src/io/parquet/read/statistics/primitive.rs rename to crates/polars-parquet/src/arrow/read/statistics/primitive.rs index 5d8a88aa9668..ecfa2e18972f 100644 --- a/crates/polars-arrow/src/io/parquet/read/statistics/primitive.rs +++ b/crates/polars-parquet/src/arrow/read/statistics/primitive.rs @@ -1,12 +1,11 @@ +use arrow::array::*; +use arrow::datatypes::TimeUnit; +use arrow::types::NativeType; use parquet2::schema::types::{PrimitiveLogicalType, TimeUnit as ParquetTimeUnit}; use parquet2::statistics::{PrimitiveStatistics, Statistics as ParquetStatistics}; use parquet2::types::NativeType as ParquetNativeType; use polars_error::PolarsResult; -use crate::array::*; -use crate::datatypes::TimeUnit; -use crate::types::NativeType; - pub fn timestamp(logical_type: Option<&PrimitiveLogicalType>, time_unit: TimeUnit, x: i64) -> i64 { let unit = if let Some(PrimitiveLogicalType::Timestamp { unit, .. }) = logical_type { unit diff --git a/crates/polars-arrow/src/io/parquet/read/statistics/struct_.rs b/crates/polars-parquet/src/arrow/read/statistics/struct_.rs similarity index 90% rename from crates/polars-arrow/src/io/parquet/read/statistics/struct_.rs rename to crates/polars-parquet/src/arrow/read/statistics/struct_.rs index c45ac3dede94..0383d0bf5057 100644 --- a/crates/polars-arrow/src/io/parquet/read/statistics/struct_.rs +++ b/crates/polars-parquet/src/arrow/read/statistics/struct_.rs @@ -1,8 +1,8 @@ +use arrow::array::{Array, MutableArray, StructArray}; +use arrow::datatypes::DataType; use polars_error::PolarsResult; use super::make_mutable; -use crate::array::{Array, MutableArray, StructArray}; -use crate::datatypes::DataType; #[derive(Debug)] pub struct DynMutableStructArray { @@ -33,7 +33,7 @@ impl MutableArray for DynMutableStructArray { self.inner[0].len() } - fn validity(&self) -> Option<&crate::bitmap::MutableBitmap> { + fn validity(&self) -> Option<&arrow::bitmap::MutableBitmap> { None } diff --git a/crates/polars-arrow/src/io/parquet/read/statistics/utf8.rs b/crates/polars-parquet/src/arrow/read/statistics/utf8.rs similarity index 91% rename from crates/polars-arrow/src/io/parquet/read/statistics/utf8.rs rename to crates/polars-parquet/src/arrow/read/statistics/utf8.rs index cf5f1dd15318..a716e8d22b8a 100644 --- a/crates/polars-arrow/src/io/parquet/read/statistics/utf8.rs +++ b/crates/polars-parquet/src/arrow/read/statistics/utf8.rs @@ -1,9 +1,8 @@ +use arrow::array::{MutableArray, MutableUtf8Array}; +use arrow::offset::Offset; use parquet2::statistics::{BinaryStatistics, Statistics as ParquetStatistics}; use polars_error::PolarsResult; -use crate::array::{MutableArray, MutableUtf8Array}; -use crate::offset::Offset; - pub(super) fn push( from: Option<&dyn ParquetStatistics>, min: &mut dyn MutableArray, diff --git a/crates/polars-arrow/src/io/parquet/write/binary/basic.rs b/crates/polars-parquet/src/arrow/write/binary/basic.rs similarity index 97% rename from crates/polars-arrow/src/io/parquet/write/binary/basic.rs rename to crates/polars-parquet/src/arrow/write/binary/basic.rs index 7da923802d92..ee2cb022e699 100644 --- a/crates/polars-arrow/src/io/parquet/write/binary/basic.rs +++ b/crates/polars-parquet/src/arrow/write/binary/basic.rs @@ -1,3 +1,6 @@ +use arrow::array::{Array, BinaryArray}; +use arrow::bitmap::Bitmap; +use arrow::offset::Offset; use parquet2::encoding::{delta_bitpacked, Encoding}; use parquet2::page::DataPage; use parquet2::schema::types::PrimitiveType; @@ -5,10 +8,7 @@ use parquet2::statistics::{serialize_statistics, BinaryStatistics, ParquetStatis use polars_error::{polars_bail, PolarsResult}; use super::super::{utils, WriteOptions}; -use crate::array::{Array, BinaryArray}; -use crate::bitmap::Bitmap; -use crate::io::parquet::read::schema::is_nullable; -use crate::offset::Offset; +use crate::arrow::read::schema::is_nullable; pub(crate) fn encode_plain( array: &BinaryArray, diff --git a/crates/polars-arrow/src/io/parquet/write/binary/mod.rs b/crates/polars-parquet/src/arrow/write/binary/mod.rs similarity index 100% rename from crates/polars-arrow/src/io/parquet/write/binary/mod.rs rename to crates/polars-parquet/src/arrow/write/binary/mod.rs diff --git a/crates/polars-arrow/src/io/parquet/write/binary/nested.rs b/crates/polars-parquet/src/arrow/write/binary/nested.rs similarity index 87% rename from crates/polars-arrow/src/io/parquet/write/binary/nested.rs rename to crates/polars-parquet/src/arrow/write/binary/nested.rs index 1dbb3e4060f0..d72917df942d 100644 --- a/crates/polars-arrow/src/io/parquet/write/binary/nested.rs +++ b/crates/polars-parquet/src/arrow/write/binary/nested.rs @@ -1,3 +1,5 @@ +use arrow::array::{Array, BinaryArray}; +use arrow::offset::Offset; use parquet2::encoding::Encoding; use parquet2::page::DataPage; use parquet2::schema::types::PrimitiveType; @@ -5,10 +7,8 @@ use polars_error::PolarsResult; use super::super::{nested, utils, WriteOptions}; use super::basic::{build_statistics, encode_plain}; -use crate::array::{Array, BinaryArray}; -use crate::io::parquet::read::schema::is_nullable; -use crate::io::parquet::write::Nested; -use crate::offset::Offset; +use crate::arrow::read::schema::is_nullable; +use crate::arrow::write::Nested; pub fn array_to_page( array: &BinaryArray, diff --git a/crates/polars-arrow/src/io/parquet/write/boolean/basic.rs b/crates/polars-parquet/src/arrow/write/boolean/basic.rs similarity index 97% rename from crates/polars-arrow/src/io/parquet/write/boolean/basic.rs rename to crates/polars-parquet/src/arrow/write/boolean/basic.rs index 7269a18d2288..c18a9b0bfb24 100644 --- a/crates/polars-arrow/src/io/parquet/write/boolean/basic.rs +++ b/crates/polars-parquet/src/arrow/write/boolean/basic.rs @@ -1,3 +1,4 @@ +use arrow::array::*; use parquet2::encoding::hybrid_rle::bitpacked_encode; use parquet2::encoding::Encoding; use parquet2::page::DataPage; @@ -8,8 +9,7 @@ use parquet2::statistics::{ use polars_error::PolarsResult; use super::super::{utils, WriteOptions}; -use crate::array::*; -use crate::io::parquet::read::schema::is_nullable; +use crate::arrow::read::schema::is_nullable; fn encode(iterator: impl Iterator, buffer: &mut Vec) -> PolarsResult<()> { // encode values using bitpacking diff --git a/crates/polars-arrow/src/io/parquet/write/boolean/mod.rs b/crates/polars-parquet/src/arrow/write/boolean/mod.rs similarity index 100% rename from crates/polars-arrow/src/io/parquet/write/boolean/mod.rs rename to crates/polars-parquet/src/arrow/write/boolean/mod.rs diff --git a/crates/polars-arrow/src/io/parquet/write/boolean/nested.rs b/crates/polars-parquet/src/arrow/write/boolean/nested.rs similarity index 89% rename from crates/polars-arrow/src/io/parquet/write/boolean/nested.rs rename to crates/polars-parquet/src/arrow/write/boolean/nested.rs index c25a4da1704b..3ee9cfba328f 100644 --- a/crates/polars-arrow/src/io/parquet/write/boolean/nested.rs +++ b/crates/polars-parquet/src/arrow/write/boolean/nested.rs @@ -1,3 +1,4 @@ +use arrow::array::{Array, BooleanArray}; use parquet2::encoding::Encoding; use parquet2::page::DataPage; use parquet2::schema::types::PrimitiveType; @@ -5,9 +6,8 @@ use polars_error::PolarsResult; use super::super::{nested, utils, WriteOptions}; use super::basic::{build_statistics, encode_plain}; -use crate::array::{Array, BooleanArray}; -use crate::io::parquet::read::schema::is_nullable; -use crate::io::parquet::write::Nested; +use crate::arrow::read::schema::is_nullable; +use crate::arrow::write::Nested; pub fn array_to_page( array: &BooleanArray, diff --git a/crates/polars-arrow/src/io/parquet/write/dictionary.rs b/crates/polars-parquet/src/arrow/write/dictionary.rs similarity index 97% rename from crates/polars-arrow/src/io/parquet/write/dictionary.rs rename to crates/polars-parquet/src/arrow/write/dictionary.rs index 1f59ab075037..e996e78770d5 100644 --- a/crates/polars-arrow/src/io/parquet/write/dictionary.rs +++ b/crates/polars-parquet/src/arrow/write/dictionary.rs @@ -1,3 +1,6 @@ +use arrow::array::{Array, DictionaryArray, DictionaryKey}; +use arrow::bitmap::{Bitmap, MutableBitmap}; +use arrow::datatypes::DataType; use parquet2::encoding::hybrid_rle::encode_u32; use parquet2::encoding::Encoding; use parquet2::page::{DictPage, Page}; @@ -17,11 +20,8 @@ use super::primitive::{ }; use super::utf8::{build_statistics as utf8_build_statistics, encode_plain as utf8_encode_plain}; use super::{nested, Nested, WriteOptions}; -use crate::array::{Array, DictionaryArray, DictionaryKey}; -use crate::bitmap::{Bitmap, MutableBitmap}; -use crate::datatypes::DataType; -use crate::io::parquet::read::schema::is_nullable; -use crate::io::parquet::write::{slice_nested_leaf, utils}; +use crate::arrow::read::schema::is_nullable; +use crate::arrow::write::{slice_nested_leaf, utils}; fn serialize_def_levels_simple( validity: Option<&Bitmap>, diff --git a/crates/polars-arrow/src/io/parquet/write/file.rs b/crates/polars-parquet/src/arrow/write/file.rs similarity index 98% rename from crates/polars-arrow/src/io/parquet/write/file.rs rename to crates/polars-parquet/src/arrow/write/file.rs index 7c9f60b33fb5..b0fad55b1e1d 100644 --- a/crates/polars-arrow/src/io/parquet/write/file.rs +++ b/crates/polars-parquet/src/arrow/write/file.rs @@ -1,12 +1,12 @@ use std::io::Write; +use arrow::datatypes::Schema; use parquet2::metadata::{KeyValue, SchemaDescriptor}; use parquet2::write::{RowGroupIter, WriteOptions as FileWriteOptions}; use polars_error::{PolarsError, PolarsResult}; use super::schema::schema_to_metadata_key; use super::{to_parquet_schema, ThriftFileMetaData, WriteOptions}; -use crate::datatypes::Schema; /// Attaches [`Schema`] to `key_value_metadata` pub fn add_arrow_schema( diff --git a/crates/polars-arrow/src/io/parquet/write/fixed_len_bytes.rs b/crates/polars-parquet/src/arrow/write/fixed_len_bytes.rs similarity index 96% rename from crates/polars-arrow/src/io/parquet/write/fixed_len_bytes.rs rename to crates/polars-parquet/src/arrow/write/fixed_len_bytes.rs index b0216d2d7761..0531f66ffa4d 100644 --- a/crates/polars-arrow/src/io/parquet/write/fixed_len_bytes.rs +++ b/crates/polars-parquet/src/arrow/write/fixed_len_bytes.rs @@ -1,3 +1,5 @@ +use arrow::array::{Array, FixedSizeBinaryArray, PrimitiveArray}; +use arrow::types::i256; use parquet2::encoding::Encoding; use parquet2::page::DataPage; use parquet2::schema::types::PrimitiveType; @@ -6,9 +8,7 @@ use polars_error::PolarsResult; use super::binary::ord_binary; use super::{utils, WriteOptions}; -use crate::array::{Array, FixedSizeBinaryArray, PrimitiveArray}; -use crate::io::parquet::read::schema::is_nullable; -use crate::types::i256; +use crate::arrow::read::schema::is_nullable; pub(crate) fn encode_plain(array: &FixedSizeBinaryArray, is_optional: bool, buffer: &mut Vec) { // append the non-null values diff --git a/crates/polars-arrow/src/io/parquet/write/mod.rs b/crates/polars-parquet/src/arrow/write/mod.rs similarity index 98% rename from crates/polars-arrow/src/io/parquet/write/mod.rs rename to crates/polars-parquet/src/arrow/write/mod.rs index 5b6e8c16baad..a1db4a4f4147 100644 --- a/crates/polars-arrow/src/io/parquet/write/mod.rs +++ b/crates/polars-parquet/src/arrow/write/mod.rs @@ -2,7 +2,7 @@ //! //! # Arrow/Parquet Interoperability //! As of [parquet-format v2.9](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md) -//! there are Arrow [DataTypes](crate::datatypes::DataType) which do not have a parquet +//! there are Arrow [DataTypes](arrow::datatypes::DataType) which do not have a parquet //! representation. These include but are not limited to: //! * `DataType::Timestamp(TimeUnit::Second, _)` //! * `DataType::Int64` @@ -22,10 +22,14 @@ mod pages; mod primitive; mod row_group; mod schema; +#[cfg(feature = "async")] mod sink; mod utf8; mod utils; +use arrow::array::*; +use arrow::datatypes::*; +use arrow::types::{days_ms, i256, NativeType}; pub use nested::{num_values, write_rep_and_def}; pub use pages::{to_leaves, to_nested, to_parquet_leaves}; pub use parquet2::compression::{BrotliLevel, CompressionOptions, GzipLevel, ZstdLevel}; @@ -43,10 +47,6 @@ pub use parquet2::write::{ pub use parquet2::{fallible_streaming_iterator, FallibleStreamingIterator}; pub use utils::write_def_levels; -use crate::array::*; -use crate::datatypes::*; -use crate::types::{days_ms, i256, NativeType}; - /// Currently supported options to write to parquet #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct WriteOptions { @@ -60,16 +60,16 @@ pub struct WriteOptions { pub data_pagesize_limit: Option, } +use arrow::compute::aggregate::estimated_bytes_size; +use arrow::match_integer_type; pub use file::FileWriter; pub use pages::{array_to_columns, Nested}; use polars_error::{polars_bail, PolarsResult}; pub use row_group::{row_group_iter, RowGroupIterator}; pub use schema::to_parquet_type; +#[cfg(feature = "async")] pub use sink::FileSink; -use crate::compute::aggregate::estimated_bytes_size; -use crate::match_integer_type; - /// returns offset and length to slice the leaf values pub fn slice_nested_leaf(nested: &[Nested]) -> (usize, usize) { // find the deepest recursive dremel structure as that one determines how many values we must @@ -808,7 +808,7 @@ fn transverse_recursive T + Clone>( map: F, encodings: &mut Vec, ) { - use crate::datatypes::PhysicalType::*; + use arrow::datatypes::PhysicalType::*; match data_type.to_physical_type() { Null | Boolean | Primitive(_) | Binary | FixedSizeBinary | LargeBinary | Utf8 | Dictionary(_) | LargeUtf8 => encodings.push(map(data_type)), diff --git a/crates/polars-arrow/src/io/parquet/write/nested/def.rs b/crates/polars-parquet/src/arrow/write/nested/def.rs similarity index 99% rename from crates/polars-arrow/src/io/parquet/write/nested/def.rs rename to crates/polars-parquet/src/arrow/write/nested/def.rs index 02947dd5bef9..15e7322d393b 100644 --- a/crates/polars-arrow/src/io/parquet/write/nested/def.rs +++ b/crates/polars-parquet/src/arrow/write/nested/def.rs @@ -1,8 +1,9 @@ +use arrow::bitmap::Bitmap; +use arrow::offset::Offset; + use super::super::pages::{ListNested, Nested}; use super::rep::num_values; use super::to_length; -use crate::bitmap::Bitmap; -use crate::offset::Offset; trait DebugIter: Iterator + std::fmt::Debug {} diff --git a/crates/polars-arrow/src/io/parquet/write/nested/mod.rs b/crates/polars-parquet/src/arrow/write/nested/mod.rs similarity index 99% rename from crates/polars-arrow/src/io/parquet/write/nested/mod.rs rename to crates/polars-parquet/src/arrow/write/nested/mod.rs index e764b58af103..4fed334a820f 100644 --- a/crates/polars-arrow/src/io/parquet/write/nested/mod.rs +++ b/crates/polars-parquet/src/arrow/write/nested/mod.rs @@ -1,6 +1,7 @@ mod def; mod rep; +use arrow::offset::Offset; use parquet2::encoding::hybrid_rle::encode_u32; use parquet2::read::levels::get_bit_width; use parquet2::write::Version; @@ -8,7 +9,6 @@ use polars_error::PolarsResult; pub use rep::num_values; use super::Nested; -use crate::offset::Offset; fn write_levels_v1) -> PolarsResult<()>>( buffer: &mut Vec, diff --git a/crates/polars-arrow/src/io/parquet/write/nested/rep.rs b/crates/polars-parquet/src/arrow/write/nested/rep.rs similarity index 100% rename from crates/polars-arrow/src/io/parquet/write/nested/rep.rs rename to crates/polars-parquet/src/arrow/write/nested/rep.rs diff --git a/crates/polars-arrow/src/io/parquet/write/pages.rs b/crates/polars-parquet/src/arrow/write/pages.rs similarity index 98% rename from crates/polars-arrow/src/io/parquet/write/pages.rs rename to crates/polars-parquet/src/arrow/write/pages.rs index 417563bac57f..46698d083f96 100644 --- a/crates/polars-arrow/src/io/parquet/write/pages.rs +++ b/crates/polars-parquet/src/arrow/write/pages.rs @@ -1,16 +1,16 @@ use std::fmt::Debug; +use arrow::array::{Array, ListArray, MapArray, StructArray}; +use arrow::bitmap::Bitmap; +use arrow::datatypes::PhysicalType; +use arrow::offset::{Offset, OffsetsBuffer}; use parquet2::page::Page; use parquet2::schema::types::{ParquetType, PrimitiveType as ParquetPrimitiveType}; use parquet2::write::DynIter; use polars_error::{polars_bail, PolarsResult}; use super::{array_to_pages, Encoding, WriteOptions}; -use crate::array::{Array, ListArray, MapArray, StructArray}; -use crate::bitmap::Bitmap; -use crate::datatypes::PhysicalType; -use crate::io::parquet::read::schema::is_nullable; -use crate::offset::{Offset, OffsetsBuffer}; +use crate::arrow::read::schema::is_nullable; #[derive(Debug, Clone, PartialEq)] pub struct ListNested { @@ -256,14 +256,14 @@ pub fn array_to_columns + Send + Sync>( #[cfg(test)] mod tests { + use arrow::array::*; + use arrow::bitmap::Bitmap; + use arrow::datatypes::*; use parquet2::schema::types::{GroupLogicalType, PrimitiveConvertedType, PrimitiveLogicalType}; use parquet2::schema::Repetition; use super::super::{FieldInfo, ParquetPhysicalType, ParquetPrimitiveType}; use super::*; - use crate::array::*; - use crate::bitmap::Bitmap; - use crate::datatypes::*; #[test] fn test_struct() { diff --git a/crates/polars-arrow/src/io/parquet/write/primitive/basic.rs b/crates/polars-parquet/src/arrow/write/primitive/basic.rs similarity index 96% rename from crates/polars-arrow/src/io/parquet/write/primitive/basic.rs rename to crates/polars-parquet/src/arrow/write/primitive/basic.rs index 2982e4911dc3..81d7b5cba943 100644 --- a/crates/polars-arrow/src/io/parquet/write/primitive/basic.rs +++ b/crates/polars-parquet/src/arrow/write/primitive/basic.rs @@ -1,3 +1,5 @@ +use arrow::array::{Array, PrimitiveArray}; +use arrow::types::NativeType; use parquet2::encoding::delta_bitpacked::encode; use parquet2::encoding::Encoding; use parquet2::page::DataPage; @@ -7,10 +9,8 @@ use parquet2::types::NativeType as ParquetNativeType; use polars_error::{polars_bail, PolarsResult}; use super::super::{utils, WriteOptions}; -use crate::array::{Array, PrimitiveArray}; -use crate::io::parquet::read::schema::is_nullable; -use crate::io::parquet::write::utils::ExactSizedIter; -use crate::types::NativeType; +use crate::arrow::read::schema::is_nullable; +use crate::arrow::write::utils::ExactSizedIter; pub(crate) fn encode_plain( array: &PrimitiveArray, diff --git a/crates/polars-arrow/src/io/parquet/write/primitive/mod.rs b/crates/polars-parquet/src/arrow/write/primitive/mod.rs similarity index 100% rename from crates/polars-arrow/src/io/parquet/write/primitive/mod.rs rename to crates/polars-parquet/src/arrow/write/primitive/mod.rs diff --git a/crates/polars-arrow/src/io/parquet/write/primitive/nested.rs b/crates/polars-parquet/src/arrow/write/primitive/nested.rs similarity index 88% rename from crates/polars-arrow/src/io/parquet/write/primitive/nested.rs rename to crates/polars-parquet/src/arrow/write/primitive/nested.rs index 231e1da46609..a5cb2229de6f 100644 --- a/crates/polars-arrow/src/io/parquet/write/primitive/nested.rs +++ b/crates/polars-parquet/src/arrow/write/primitive/nested.rs @@ -1,3 +1,5 @@ +use arrow::array::{Array, PrimitiveArray}; +use arrow::types::NativeType as ArrowNativeType; use parquet2::encoding::Encoding; use parquet2::page::DataPage; use parquet2::schema::types::PrimitiveType; @@ -7,10 +9,8 @@ use polars_error::PolarsResult; use super::super::{nested, utils, WriteOptions}; use super::basic::{build_statistics, encode_plain}; -use crate::array::{Array, PrimitiveArray}; -use crate::io::parquet::read::schema::is_nullable; -use crate::io::parquet::write::Nested; -use crate::types::NativeType as ArrowNativeType; +use crate::arrow::read::schema::is_nullable; +use crate::arrow::write::Nested; pub fn array_to_page( array: &PrimitiveArray, diff --git a/crates/polars-arrow/src/io/parquet/write/row_group.rs b/crates/polars-parquet/src/arrow/write/row_group.rs similarity index 98% rename from crates/polars-arrow/src/io/parquet/write/row_group.rs rename to crates/polars-parquet/src/arrow/write/row_group.rs index 5b1c170ff9f7..6d2269c178b9 100644 --- a/crates/polars-arrow/src/io/parquet/write/row_group.rs +++ b/crates/polars-parquet/src/arrow/write/row_group.rs @@ -1,3 +1,6 @@ +use arrow::array::Array; +use arrow::chunk::Chunk; +use arrow::datatypes::Schema; use parquet2::error::Error as ParquetError; use parquet2::schema::types::ParquetType; use parquet2::write::Compressor; @@ -8,9 +11,6 @@ use super::{ array_to_columns, to_parquet_schema, DynIter, DynStreamingIterator, Encoding, RowGroupIter, SchemaDescriptor, WriteOptions, }; -use crate::array::Array; -use crate::chunk::Chunk; -use crate::datatypes::Schema; /// Maps a [`Chunk`] and parquet-specific options to an [`RowGroupIter`] used to /// write to parquet diff --git a/crates/polars-arrow/src/io/parquet/write/schema.rs b/crates/polars-parquet/src/arrow/write/schema.rs similarity index 98% rename from crates/polars-arrow/src/io/parquet/write/schema.rs rename to crates/polars-parquet/src/arrow/write/schema.rs index 716e854c4e39..89fa6c7ef99a 100644 --- a/crates/polars-arrow/src/io/parquet/write/schema.rs +++ b/crates/polars-parquet/src/arrow/write/schema.rs @@ -1,3 +1,5 @@ +use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; +use arrow::io::ipc::write::{default_ipc_fields, schema_to_bytes}; use base64::engine::general_purpose; use base64::Engine as _; use parquet2::metadata::KeyValue; @@ -9,9 +11,7 @@ use parquet2::schema::Repetition; use polars_error::{polars_bail, PolarsResult}; use super::super::ARROW_SCHEMA_META_KEY; -use crate::datatypes::{DataType, Field, Schema, TimeUnit}; -use crate::io::ipc::write::{default_ipc_fields, schema_to_bytes}; -use crate::io::parquet::write::decimal_length_from_precision; +use crate::arrow::write::decimal_length_from_precision; pub fn schema_to_metadata_key(schema: &Schema) -> KeyValue { let serialized_schema = schema_to_bytes(schema, &default_ipc_fields(&schema.fields)); diff --git a/crates/polars-arrow/src/io/parquet/write/sink.rs b/crates/polars-parquet/src/arrow/write/sink.rs similarity index 96% rename from crates/polars-arrow/src/io/parquet/write/sink.rs rename to crates/polars-parquet/src/arrow/write/sink.rs index d8d2734ce461..16ffd4176d1d 100644 --- a/crates/polars-arrow/src/io/parquet/write/sink.rs +++ b/crates/polars-parquet/src/arrow/write/sink.rs @@ -2,6 +2,9 @@ use std::pin::Pin; use std::task::Poll; use ahash::AHashMap; +use arrow::array::Array; +use arrow::chunk::Chunk; +use arrow::datatypes::Schema; use futures::future::BoxFuture; use futures::{AsyncWrite, AsyncWriteExt, FutureExt, Sink, TryFutureExt}; use parquet2::metadata::KeyValue; @@ -10,9 +13,6 @@ use polars_error::{polars_bail, to_compute_err, PolarsError, PolarsResult}; use super::file::add_arrow_schema; use super::{Encoding, SchemaDescriptor, WriteOptions}; -use crate::array::Array; -use crate::chunk::Chunk; -use crate::datatypes::Schema; /// Sink that writes array [`chunks`](Chunk) as a Parquet file. /// @@ -51,7 +51,7 @@ where ) } - let parquet_schema = crate::io::parquet::write::to_parquet_schema(&schema)?; + let parquet_schema = crate::arrow::write::to_parquet_schema(&schema)?; let created_by = Some("Arrow2 - Native Rust implementation of Arrow".to_string()); let writer = FileStreamer::new( writer, @@ -124,7 +124,7 @@ where } let this = self.get_mut(); if let Some(mut writer) = this.writer.take() { - let rows = crate::io::parquet::write::row_group_iter( + let rows = crate::arrow::write::row_group_iter( item, this.encodings.clone(), this.parquet_schema.fields().to_vec(), diff --git a/crates/polars-arrow/src/io/parquet/write/utf8/basic.rs b/crates/polars-parquet/src/arrow/write/utf8/basic.rs similarity index 96% rename from crates/polars-arrow/src/io/parquet/write/utf8/basic.rs rename to crates/polars-parquet/src/arrow/write/utf8/basic.rs index 76cf4cd73cd1..cb64dfd561f5 100644 --- a/crates/polars-arrow/src/io/parquet/write/utf8/basic.rs +++ b/crates/polars-parquet/src/arrow/write/utf8/basic.rs @@ -1,3 +1,5 @@ +use arrow::array::{Array, Utf8Array}; +use arrow::offset::Offset; use parquet2::encoding::Encoding; use parquet2::page::DataPage; use parquet2::schema::types::PrimitiveType; @@ -6,9 +8,7 @@ use polars_error::{polars_bail, PolarsResult}; use super::super::binary::{encode_delta, ord_binary}; use super::super::{utils, WriteOptions}; -use crate::array::{Array, Utf8Array}; -use crate::io::parquet::read::schema::is_nullable; -use crate::offset::Offset; +use crate::arrow::read::schema::is_nullable; pub(crate) fn encode_plain( array: &Utf8Array, diff --git a/crates/polars-arrow/src/io/parquet/write/utf8/mod.rs b/crates/polars-parquet/src/arrow/write/utf8/mod.rs similarity index 100% rename from crates/polars-arrow/src/io/parquet/write/utf8/mod.rs rename to crates/polars-parquet/src/arrow/write/utf8/mod.rs diff --git a/crates/polars-arrow/src/io/parquet/write/utf8/nested.rs b/crates/polars-parquet/src/arrow/write/utf8/nested.rs similarity index 88% rename from crates/polars-arrow/src/io/parquet/write/utf8/nested.rs rename to crates/polars-parquet/src/arrow/write/utf8/nested.rs index 163317eff96f..a0a8640dde9f 100644 --- a/crates/polars-arrow/src/io/parquet/write/utf8/nested.rs +++ b/crates/polars-parquet/src/arrow/write/utf8/nested.rs @@ -1,3 +1,5 @@ +use arrow::array::{Array, Utf8Array}; +use arrow::offset::Offset; use parquet2::encoding::Encoding; use parquet2::page::DataPage; use parquet2::schema::types::PrimitiveType; @@ -5,10 +7,8 @@ use polars_error::PolarsResult; use super::super::{nested, utils, WriteOptions}; use super::basic::{build_statistics, encode_plain}; -use crate::array::{Array, Utf8Array}; -use crate::io::parquet::read::schema::is_nullable; -use crate::io::parquet::write::Nested; -use crate::offset::Offset; +use crate::arrow::read::schema::is_nullable; +use crate::arrow::write::Nested; pub fn array_to_page( array: &Utf8Array, diff --git a/crates/polars-arrow/src/io/parquet/write/utils.rs b/crates/polars-parquet/src/arrow/write/utils.rs similarity index 99% rename from crates/polars-arrow/src/io/parquet/write/utils.rs rename to crates/polars-parquet/src/arrow/write/utils.rs index 1014a704f499..c7424e6e6f4d 100644 --- a/crates/polars-arrow/src/io/parquet/write/utils.rs +++ b/crates/polars-parquet/src/arrow/write/utils.rs @@ -1,3 +1,4 @@ +use arrow::bitmap::Bitmap; use parquet2::compression::CompressionOptions; use parquet2::encoding::hybrid_rle::encode_bool; use parquet2::encoding::Encoding; @@ -8,7 +9,6 @@ use parquet2::statistics::ParquetStatistics; use polars_error::PolarsResult; use super::{Version, WriteOptions}; -use crate::bitmap::Bitmap; fn encode_iter_v1>(buffer: &mut Vec, iter: I) -> PolarsResult<()> { buffer.extend_from_slice(&[0; 4]); diff --git a/crates/polars-parquet/src/lib.rs b/crates/polars-parquet/src/lib.rs new file mode 100644 index 000000000000..4b64c583ce23 --- /dev/null +++ b/crates/polars-parquet/src/lib.rs @@ -0,0 +1,3 @@ +#![allow(clippy::len_without_is_empty)] +pub mod arrow; +pub use arrow::{read, write}; diff --git a/crates/polars-pipe/src/executors/sources/parquet.rs b/crates/polars-pipe/src/executors/sources/parquet.rs index b18293041a5b..d8039a37f25f 100644 --- a/crates/polars-pipe/src/executors/sources/parquet.rs +++ b/crates/polars-pipe/src/executors/sources/parquet.rs @@ -3,10 +3,9 @@ use std::path::PathBuf; use std::sync::Arc; use polars_core::error::*; -use polars_core::utils::arrow::io::parquet::read::FileMetaData; use polars_core::POOL; use polars_io::cloud::CloudOptions; -use polars_io::parquet::{BatchedParquetReader, ParquetReader}; +use polars_io::parquet::{BatchedParquetReader, FileMetaData, ParquetReader}; use polars_io::pl_async::get_runtime; use polars_io::prelude::materialize_projection; #[cfg(feature = "async")] diff --git a/crates/polars-plan/Cargo.toml b/crates/polars-plan/Cargo.toml index 52f5f3ed595f..660b6875a148 100644 --- a/crates/polars-plan/Cargo.toml +++ b/crates/polars-plan/Cargo.toml @@ -17,6 +17,7 @@ polars-core = { workspace = true, features = ["lazy", "zip_with", "random"] } polars-ffi = { workspace = true, optional = true } polars-io = { workspace = true, features = ["lazy"] } polars-ops = { workspace = true, features = ["zip_with"] } +polars-parquet = { workspace = true, optional = true } polars-time = { workspace = true, optional = true } polars-utils = { workspace = true } @@ -51,7 +52,7 @@ serde = [ "polars-ops/serde", ] streaming = [] -parquet = ["polars-core/parquet", "polars-io/parquet"] +parquet = ["polars-io/parquet", "polars-parquet"] async = ["polars-io/async"] cloud = ["async", "polars-io/cloud"] ipc = ["polars-io/ipc"] diff --git a/crates/polars-plan/src/logical_plan/file_scan.rs b/crates/polars-plan/src/logical_plan/file_scan.rs index 0a3c66831927..2d664e86ea3f 100644 --- a/crates/polars-plan/src/logical_plan/file_scan.rs +++ b/crates/polars-plan/src/logical_plan/file_scan.rs @@ -1,5 +1,5 @@ #[cfg(feature = "parquet")] -use arrow::io::parquet::write::FileMetaData; +use polars_parquet::write::FileMetaData; use super::*; diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index 518ff3adb523..ab98a9dc0d85 100644 --- a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -57,7 +57,7 @@ serde-lazy = [ "polars-io/serde", "polars-ops/serde", ] -parquet = ["polars-io", "polars-core/parquet", "polars-lazy?/parquet", "polars-io/parquet", "polars-sql?/parquet"] +parquet = ["polars-io", "polars-lazy?/parquet", "polars-io/parquet", "polars-sql?/parquet"] async = ["polars-lazy?/async"] cloud = ["polars-lazy?/cloud", "polars-io/cloud"] cloud_write = ["cloud", "polars-lazy?/cloud_write"] diff --git a/py-polars/Cargo.lock b/py-polars/Cargo.lock index 5514892a096d..1c5c1d9140c6 100644 --- a/py-polars/Cargo.lock +++ b/py-polars/Cargo.lock @@ -1605,14 +1605,12 @@ dependencies = [ "arrow-format", "atoi", "avro-schema", - "base64", "bytemuck", "chrono", "chrono-tz", "dyn-clone", "either", "ethnum", - "fallible-streaming-iterator", "foreign_vec", "futures", "getrandom", @@ -1621,7 +1619,6 @@ dependencies = [ "lz4", "multiversion", "num-traits", - "parquet2", "polars-error", "rustc_version", "serde", @@ -1711,6 +1708,7 @@ dependencies = [ "polars-core", "polars-error", "polars-json", + "polars-parquet", "polars-time", "polars-utils", "rayon", @@ -1803,6 +1801,21 @@ dependencies = [ "version_check", ] +[[package]] +name = "polars-parquet" +version = "0.34.2" +dependencies = [ + "ahash", + "base64", + "ethnum", + "futures", + "num-traits", + "parquet2", + "polars-arrow", + "polars-error", + "simdutf8", +] + [[package]] name = "polars-pipe" version = "0.34.2" @@ -1842,6 +1855,7 @@ dependencies = [ "polars-ffi", "polars-io", "polars-ops", + "polars-parquet", "polars-time", "polars-utils", "pyo3", @@ -1948,6 +1962,7 @@ dependencies = [ "polars-error", "polars-lazy", "polars-ops", + "polars-parquet", "polars-plan", "pyo3", "pyo3-built", diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index a62f7f916052..a186dc8e8708 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -13,6 +13,7 @@ polars-core = { path = "../crates/polars-core", default-features = false, featur polars-error = { path = "../crates/polars-error" } polars-lazy = { path = "../crates/polars-lazy", default-features = false, features = ["python"] } polars-ops = { path = "../crates/polars-ops", default-features = false, features = ["convert_index"] } +polars-parquet = { path = "../crates/polars-parquet", default-features = false, optional = true } polars-plan = { path = "../crates/polars-plan", default-features = false } ahash = ">=0.8.5" @@ -102,7 +103,7 @@ dtype-i16 = [] dtype-u8 = [] dtype-u16 = [] avro = ["polars/avro"] -parquet = ["polars/parquet"] +parquet = ["polars/parquet", "polars-parquet"] ipc = ["polars/ipc"] ipc_streaming = ["polars/ipc_streaming"] is_in = ["polars/is_in"] diff --git a/py-polars/src/functions/io.rs b/py-polars/src/functions/io.rs index b42b96d64898..c7551971bbef 100644 --- a/py-polars/src/functions/io.rs +++ b/py-polars/src/functions/io.rs @@ -28,7 +28,7 @@ pub fn read_ipc_schema(py: Python, py_f: PyObject) -> PyResult { #[cfg(feature = "parquet")] #[pyfunction] pub fn read_parquet_schema(py: Python, py_f: PyObject) -> PyResult { - use polars_core::export::arrow::io::parquet::read::{infer_schema, read_metadata}; + use polars_parquet::read::{infer_schema, read_metadata}; let metadata = match get_either_file(py_f, false)? { EitherRustPythonFile::Rust(mut r) => read_metadata(&mut r).map_err(PyPolarsErr::from)?,