From d681c3e034284a22ad499d1843ed7481978c5630 Mon Sep 17 00:00:00 2001 From: Eduard Karacharov Date: Tue, 18 Jun 2024 21:07:58 +0300 Subject: [PATCH] faster iterators for nullable columns --- arrow-row/src/fixed.rs | 76 +++++++++++++++++------------------------- arrow-row/src/lib.rs | 12 +++---- 2 files changed, 36 insertions(+), 52 deletions(-) diff --git a/arrow-row/src/fixed.rs b/arrow-row/src/fixed.rs index 461d9cd9c1c0..3d9920708f9b 100644 --- a/arrow-row/src/fixed.rs +++ b/arrow-row/src/fixed.rs @@ -18,9 +18,10 @@ use crate::array::PrimitiveArray; use crate::null_sentinel; use arrow_array::builder::BufferBuilder; -use arrow_array::{Array, ArrowPrimitiveType, BooleanArray, FixedSizeBinaryArray}; +use arrow_array::{ArrowPrimitiveType, BooleanArray, FixedSizeBinaryArray}; use arrow_buffer::{ - bit_util, i256, ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer, + bit_util, i256, ArrowNativeType, BooleanBuffer, Buffer, IntervalDayTime, IntervalMonthDayNano, + MutableBuffer, NullBuffer, }; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{DataType, SortOptions}; @@ -216,22 +217,20 @@ where /// /// - 1 byte `0` if null or `1` if valid /// - bytes of [`FixedLengthEncoding`] -pub fn encode( +pub fn encode( data: &mut [u8], offsets: &mut [usize], - array: &PrimitiveArray, + values: &[T], + nulls: &NullBuffer, opts: SortOptions, -) where - T::Native: FixedLengthEncoding, -{ - let mut offset_idx = 1; - for maybe_val in array { - let offset = &mut offsets[offset_idx]; - let end_offset = *offset + T::Native::ENCODED_LEN; - if let Some(val) = maybe_val { +) { + for (value_idx, is_valid) in nulls.iter().enumerate() { + let offset = &mut offsets[value_idx + 1]; + let end_offset = *offset + T::ENCODED_LEN; + if is_valid { let to_write = &mut data[*offset..end_offset]; to_write[0] = 1; - let mut encoded = val.encode(); + let mut encoded = values[value_idx].encode(); if opts.descending { // Flip bits to reverse order encoded.as_mut().iter_mut().for_each(|v| *v = !*v) @@ -241,26 +240,20 @@ pub fn encode( data[*offset] = null_sentinel(opts); } *offset = end_offset; - offset_idx += 1; } } /// Encoding for non-nullable primitive arrays. /// Iterates directly over the `values`, and skips NULLs-checking. -pub fn encode_not_null( +pub fn encode_not_null( data: &mut [u8], offsets: &mut [usize], - array: &PrimitiveArray, + values: &[T], opts: SortOptions, -) where - T::Native: FixedLengthEncoding, -{ - assert!(!array.is_nullable()); - - let mut offset_idx = 1; - for val in array.values() { - let offset = &mut offsets[offset_idx]; - let end_offset = *offset + T::Native::ENCODED_LEN; +) { + for (value_idx, val) in values.iter().enumerate() { + let offset = &mut offsets[value_idx + 1]; + let end_offset = *offset + T::ENCODED_LEN; let to_write = &mut data[*offset..end_offset]; to_write[0] = 1; @@ -272,7 +265,6 @@ pub fn encode_not_null( to_write[1..].copy_from_slice(encoded.as_ref()); *offset = end_offset; - offset_idx += 1; } } @@ -280,20 +272,20 @@ pub fn encode_not_null( /// /// - 1 byte `0` if null or `1` if valid /// - bytes of [`FixedLengthEncoding`] -pub fn encode_bool( +pub fn encode_boolean( data: &mut [u8], offsets: &mut [usize], - array: &BooleanArray, + values: &BooleanBuffer, + nulls: &NullBuffer, opts: SortOptions, ) { - let mut offset_idx = 1; - for maybe_val in array { - let offset = &mut offsets[offset_idx]; + for (idx, is_valid) in nulls.iter().enumerate() { + let offset = &mut offsets[idx + 1]; let end_offset = *offset + bool::ENCODED_LEN; - if let Some(val) = maybe_val { + if is_valid { let to_write = &mut data[*offset..end_offset]; to_write[0] = 1; - let mut encoded = val.encode(); + let mut encoded = values.value(idx).encode(); if opts.descending { // Flip bits to reverse order encoded.as_mut().iter_mut().for_each(|v| *v = !*v) @@ -303,23 +295,19 @@ pub fn encode_bool( data[*offset] = null_sentinel(opts); } *offset = end_offset; - offset_idx += 1; } } /// Encoding for non-nullable boolean arrays. /// Iterates directly over `values`, and skips NULLs-checking. -pub fn encode_bool_not_null( +pub fn encode_boolean_not_null( data: &mut [u8], offsets: &mut [usize], - array: &BooleanArray, + values: &BooleanBuffer, opts: SortOptions, ) { - assert!(!array.is_nullable()); - - let mut offset_idx = 1; - for val in array.values() { - let offset = &mut offsets[offset_idx]; + for (value_idx, val) in values.iter().enumerate() { + let offset = &mut offsets[value_idx + 1]; let end_offset = *offset + bool::ENCODED_LEN; let to_write = &mut data[*offset..end_offset]; @@ -332,7 +320,6 @@ pub fn encode_bool_not_null( to_write[1..].copy_from_slice(encoded.as_ref()); *offset = end_offset; - offset_idx += 1; } } @@ -343,9 +330,7 @@ pub fn encode_fixed_size_binary( opts: SortOptions, ) { let len = array.value_length() as usize; - let mut offset_idx = 1; - for maybe_val in array { - let offset = &mut offsets[offset_idx]; + for (offset, maybe_val) in offsets.iter_mut().skip(1).zip(array.iter()) { let end_offset = *offset + len + 1; if let Some(val) = maybe_val { let to_write = &mut data[*offset..end_offset]; @@ -359,7 +344,6 @@ pub fn encode_fixed_size_binary( data[*offset] = null_sentinel(opts); } *offset = end_offset; - offset_idx += 1; } } diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 8e67858fa5af..935db80385bb 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -1147,18 +1147,18 @@ fn encode_column( Encoder::Stateless => { downcast_primitive_array! { column => { - if column.is_nullable(){ - fixed::encode(data, offsets, column, opts) + if let Some(nulls) = column.nulls().filter(|n| n.null_count() > 0){ + fixed::encode(data, offsets, column.values(), nulls, opts) } else { - fixed::encode_not_null(data, offsets, column, opts) + fixed::encode_not_null(data, offsets, column.values(), opts) } } DataType::Null => {} DataType::Boolean => { - if column.is_nullable(){ - fixed::encode_bool(data, offsets, column.as_boolean(), opts) + if let Some(nulls) = column.nulls().filter(|n| n.null_count() > 0){ + fixed::encode_boolean(data, offsets, column.as_boolean().values(), nulls, opts) } else { - fixed::encode_bool_not_null(data, offsets, column.as_boolean(), opts) + fixed::encode_boolean_not_null(data, offsets, column.as_boolean().values(), opts) } } DataType::Binary => {