From c91e0786c53ab0662834e690cdb9709a337f7e7c Mon Sep 17 00:00:00 2001 From: Gijs Burghoorn Date: Sun, 4 Aug 2024 10:21:42 +0200 Subject: [PATCH] feat: Add boolean Parquet HybridRle encoding (#18022) --- crates/polars-io/src/parquet/write/writer.rs | 1 + .../src/arrow/write/boolean/basic.rs | 50 +++++++++++++------ crates/polars-parquet/src/arrow/write/mod.rs | 9 ++-- 3 files changed, 42 insertions(+), 18 deletions(-) diff --git a/crates/polars-io/src/parquet/write/writer.rs b/crates/polars-io/src/parquet/write/writer.rs index 32b104306aa6..3129421e21d7 100644 --- a/crates/polars-io/src/parquet/write/writer.rs +++ b/crates/polars-io/src/parquet/write/writer.rs @@ -146,6 +146,7 @@ fn encoding_map(data_type: &ArrowDataType) -> Encoding { | PhysicalType::LargeUtf8 | PhysicalType::Utf8View | PhysicalType::BinaryView => Encoding::RleDictionary, + PhysicalType::Boolean => Encoding::Rle, PhysicalType::Primitive(dt) => { use arrow::types::PrimitiveType::*; match dt { diff --git a/crates/polars-parquet/src/arrow/write/boolean/basic.rs b/crates/polars-parquet/src/arrow/write/boolean/basic.rs index b2ef5ca92148..0735ba2f4d6c 100644 --- a/crates/polars-parquet/src/arrow/write/boolean/basic.rs +++ b/crates/polars-parquet/src/arrow/write/boolean/basic.rs @@ -1,9 +1,9 @@ use arrow::array::*; -use polars_error::PolarsResult; +use polars_error::{polars_bail, PolarsResult}; use super::super::{utils, WriteOptions}; use crate::arrow::read::schema::is_nullable; -use crate::parquet::encoding::hybrid_rle::bitpacked_encode; +use crate::parquet::encoding::hybrid_rle::{self, bitpacked_encode}; use crate::parquet::encoding::Encoding; use crate::parquet::page::DataPage; use crate::parquet::schema::types::PrimitiveType; @@ -23,25 +23,41 @@ pub(super) fn encode_plain( is_optional: bool, buffer: &mut Vec, ) -> PolarsResult<()> { - if is_optional { - let iter = array.non_null_values_iter().take( - array - .validity() - .as_ref() - .map(|x| x.len() - x.unset_bits()) - .unwrap_or_else(|| array.len()), - ); - encode(iter, buffer) + if is_optional && array.validity().is_some() { + encode(array.non_null_values_iter(), buffer) } else { - let iter = array.values().iter(); - encode(iter, buffer) + encode(array.values().iter(), buffer) } } +pub(super) fn encode_hybrid_rle( + array: &BooleanArray, + is_optional: bool, + buffer: &mut Vec, +) -> PolarsResult<()> { + buffer.extend_from_slice(&[0; 4]); + let start = buffer.len(); + + if is_optional && array.validity().is_some() { + hybrid_rle::encode(buffer, array.non_null_values_iter(), 1)?; + } else { + hybrid_rle::encode(buffer, array.values().iter(), 1)?; + } + + let length = buffer.len() - start; + + // write the first 4 bytes as length + let length = (length as i32).to_le_bytes(); + (0..4).for_each(|i| buffer[start - 4 + i] = length[i]); + + Ok(()) +} + pub fn array_to_page( array: &BooleanArray, options: WriteOptions, type_: PrimitiveType, + encoding: Encoding, ) -> PolarsResult { let is_optional = is_nullable(&type_.field_info); @@ -58,7 +74,11 @@ pub fn array_to_page( let definition_levels_byte_length = buffer.len(); - encode_plain(array, is_optional, &mut buffer)?; + match encoding { + Encoding::Plain => encode_plain(array, is_optional, &mut buffer)?, + Encoding::Rle => encode_hybrid_rle(array, is_optional, &mut buffer)?, + other => polars_bail!(nyi = "Encoding boolean as {other:?}"), + } let statistics = if options.has_statistics() { Some(build_statistics(array, &options.statistics)) @@ -76,7 +96,7 @@ pub fn array_to_page( statistics, type_, options, - Encoding::Plain, + encoding, ) } diff --git a/crates/polars-parquet/src/arrow/write/mod.rs b/crates/polars-parquet/src/arrow/write/mod.rs index 9022bab0e2c9..950e5fc16837 100644 --- a/crates/polars-parquet/src/arrow/write/mod.rs +++ b/crates/polars-parquet/src/arrow/write/mod.rs @@ -361,9 +361,12 @@ pub fn array_to_page_simple( let data_type = array.data_type(); match data_type.to_logical_type() { - ArrowDataType::Boolean => { - boolean::array_to_page(array.as_any().downcast_ref().unwrap(), options, type_) - }, + ArrowDataType::Boolean => boolean::array_to_page( + array.as_any().downcast_ref().unwrap(), + options, + type_, + encoding, + ), // casts below MUST match the casts done at the metadata (field -> parquet type). ArrowDataType::UInt8 => { return primitive::array_to_page_integer::(