Skip to content

Commit

Permalink
feat: Add boolean Parquet HybridRle encoding (#18022)
Browse files Browse the repository at this point in the history
  • Loading branch information
coastalwhite authored Aug 4, 2024
1 parent b1cb91e commit c91e078
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 18 deletions.
1 change: 1 addition & 0 deletions crates/polars-io/src/parquet/write/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ fn encoding_map(data_type: &ArrowDataType) -> Encoding {
| PhysicalType::LargeUtf8
| PhysicalType::Utf8View
| PhysicalType::BinaryView => Encoding::RleDictionary,
PhysicalType::Boolean => Encoding::Rle,
PhysicalType::Primitive(dt) => {
use arrow::types::PrimitiveType::*;
match dt {
Expand Down
50 changes: 35 additions & 15 deletions crates/polars-parquet/src/arrow/write/boolean/basic.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
use arrow::array::*;
use polars_error::PolarsResult;
use polars_error::{polars_bail, PolarsResult};

use super::super::{utils, WriteOptions};
use crate::arrow::read::schema::is_nullable;
use crate::parquet::encoding::hybrid_rle::bitpacked_encode;
use crate::parquet::encoding::hybrid_rle::{self, bitpacked_encode};
use crate::parquet::encoding::Encoding;
use crate::parquet::page::DataPage;
use crate::parquet::schema::types::PrimitiveType;
Expand All @@ -23,25 +23,41 @@ pub(super) fn encode_plain(
is_optional: bool,
buffer: &mut Vec<u8>,
) -> PolarsResult<()> {
if is_optional {
let iter = array.non_null_values_iter().take(
array
.validity()
.as_ref()
.map(|x| x.len() - x.unset_bits())
.unwrap_or_else(|| array.len()),
);
encode(iter, buffer)
if is_optional && array.validity().is_some() {
encode(array.non_null_values_iter(), buffer)
} else {
let iter = array.values().iter();
encode(iter, buffer)
encode(array.values().iter(), buffer)
}
}

/// Appends the boolean values of `array` to `buffer` using the hybrid RLE
/// encoder, preceded by a 4-byte little-endian length prefix.
///
/// Layout written to `buffer`:
/// 1. four bytes holding the byte length of the encoded payload (little endian),
/// 2. the payload produced by `hybrid_rle::encode`.
///
/// When `is_optional` is set and the array carries a validity bitmap, the values
/// come from `array.non_null_values_iter()` (presumably only the non-null slots);
/// otherwise every entry of `array.values()` is encoded.
///
/// # Errors
///
/// Propagates any error returned by `hybrid_rle::encode`. In that case the
/// length prefix is left as zeros and `buffer` may contain a partial payload.
pub(super) fn encode_hybrid_rle(
    array: &BooleanArray,
    is_optional: bool,
    buffer: &mut Vec<u8>,
) -> PolarsResult<()> {
    // Reserve room for the length prefix; it is patched once the payload size is known.
    let prefix_offset = buffer.len();
    buffer.extend_from_slice(&[0u8; 4]);
    let payload_offset = buffer.len();

    // The trailing `1` is presumably the bit width (booleans need a single bit).
    let skip_nulls = is_optional && array.validity().is_some();
    if skip_nulls {
        hybrid_rle::encode(buffer, array.non_null_values_iter(), 1)?;
    } else {
        hybrid_rle::encode(buffer, array.values().iter(), 1)?;
    }

    // NOTE: `as i32` keeps only the low 32 bits, so a payload larger than the
    // i32 range would be recorded with a wrapped length.
    let payload_len = (buffer.len() - payload_offset) as i32;
    buffer[prefix_offset..payload_offset].copy_from_slice(&payload_len.to_le_bytes());

    Ok(())
}

pub fn array_to_page(
array: &BooleanArray,
options: WriteOptions,
type_: PrimitiveType,
encoding: Encoding,
) -> PolarsResult<DataPage> {
let is_optional = is_nullable(&type_.field_info);

Expand All @@ -58,7 +74,11 @@ pub fn array_to_page(

let definition_levels_byte_length = buffer.len();

encode_plain(array, is_optional, &mut buffer)?;
match encoding {
Encoding::Plain => encode_plain(array, is_optional, &mut buffer)?,
Encoding::Rle => encode_hybrid_rle(array, is_optional, &mut buffer)?,
other => polars_bail!(nyi = "Encoding boolean as {other:?}"),
}

let statistics = if options.has_statistics() {
Some(build_statistics(array, &options.statistics))
Expand All @@ -76,7 +96,7 @@ pub fn array_to_page(
statistics,
type_,
options,
Encoding::Plain,
encoding,
)
}

Expand Down
9 changes: 6 additions & 3 deletions crates/polars-parquet/src/arrow/write/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -361,9 +361,12 @@ pub fn array_to_page_simple(
let data_type = array.data_type();

match data_type.to_logical_type() {
ArrowDataType::Boolean => {
boolean::array_to_page(array.as_any().downcast_ref().unwrap(), options, type_)
},
ArrowDataType::Boolean => boolean::array_to_page(
array.as_any().downcast_ref().unwrap(),
options,
type_,
encoding,
),
// casts below MUST match the casts done at the metadata (field -> parquet type).
ArrowDataType::UInt8 => {
return primitive::array_to_page_integer::<u8, i32>(
Expand Down

0 comments on commit c91e078

Please sign in to comment.