From fd00ee6baa562ad24280135c7d9cd23e249dbf45 Mon Sep 17 00:00:00 2001 From: Gijs Burghoorn Date: Mon, 5 Aug 2024 10:54:00 +0200 Subject: [PATCH] perf: Zerocopy buffers for `FixedSizeBinary` to `BinaryView` cast (#18043) --- crates/polars-arrow/src/array/binview/view.rs | 55 ++++++++++++---- .../src/compute/cast/binary_to.rs | 65 ++++++++++++++++++- 2 files changed, 105 insertions(+), 15 deletions(-) diff --git a/crates/polars-arrow/src/array/binview/view.rs b/crates/polars-arrow/src/array/binview/view.rs index 4437c6e9ce86..e2ee058092d7 100644 --- a/crates/polars-arrow/src/array/binview/view.rs +++ b/crates/polars-arrow/src/array/binview/view.rs @@ -62,15 +62,15 @@ impl View { unsafe { std::mem::transmute(self) } } - /// Create a new inline view + /// Create a new inline view without verifying the length /// - /// # Panics + /// # Safety /// - /// Panics if the `bytes.len() > View::MAX_INLINE_SIZE`. + /// It needs to hold that `bytes.len() <= View::MAX_INLINE_SIZE`. #[inline] - pub fn new_inline(bytes: &[u8]) -> Self { + pub unsafe fn new_inline_unchecked(bytes: &[u8]) -> Self { debug_assert!(bytes.len() <= u32::MAX as usize); - assert!(bytes.len() as u32 <= Self::MAX_INLINE_SIZE); + debug_assert!(bytes.len() as u32 <= Self::MAX_INLINE_SIZE); let mut view = Self { length: bytes.len() as u32, @@ -92,18 +92,47 @@ impl View { view } + /// Create a new inline view + /// + /// # Panics + /// + /// Panics if the `bytes.len() > View::MAX_INLINE_SIZE`. + #[inline] + pub fn new_inline(bytes: &[u8]) -> Self { + assert!(bytes.len() as u32 <= Self::MAX_INLINE_SIZE); + unsafe { Self::new_inline_unchecked(bytes) } + } + + /// Create a new inline view + /// + /// # Safety + /// + /// It needs to hold that `bytes.len() > View::MAX_INLINE_SIZE`. + #[inline] + pub unsafe fn new_noninline_unchecked(bytes: &[u8], buffer_idx: u32, offset: u32) -> Self { + debug_assert!(bytes.len() <= u32::MAX as usize); + debug_assert!(bytes.len() as u32 > View::MAX_INLINE_SIZE); + + // SAFETY: The invariant of this function guarantees that this is safe. + let prefix = unsafe { u32::from_le_bytes(bytes[0..4].try_into().unwrap_unchecked()) }; + Self { + length: bytes.len() as u32, + prefix, + buffer_idx, + offset, + } + } + #[inline] pub fn new_from_bytes(bytes: &[u8], buffer_idx: u32, offset: u32) -> Self { debug_assert!(bytes.len() <= u32::MAX as usize); - if bytes.len() as u32 <= Self::MAX_INLINE_SIZE { - Self::new_inline(bytes) - } else { - Self { - length: bytes.len() as u32, - prefix: u32::from_le_bytes(bytes[0..4].try_into().unwrap()), - buffer_idx, - offset, + // SAFETY: We verify the invariant with the outer if statement + unsafe { + if bytes.len() as u32 <= Self::MAX_INLINE_SIZE { + Self::new_inline_unchecked(bytes) + } else { + Self::new_noninline_unchecked(bytes, buffer_idx, offset) } } } diff --git a/crates/polars-arrow/src/compute/cast/binary_to.rs b/crates/polars-arrow/src/compute/cast/binary_to.rs index fa53a58f8bbb..57feef1baa72 100644 --- a/crates/polars-arrow/src/compute/cast/binary_to.rs +++ b/crates/polars-arrow/src/compute/cast/binary_to.rs @@ -1,7 +1,10 @@ +use std::sync::Arc; + use polars_error::PolarsResult; use super::CastOptionsImpl; use crate::array::*; +use crate::buffer::Buffer; use crate::datatypes::ArrowDataType; use crate::offset::{Offset, Offsets}; use crate::types::NativeType; @@ -179,8 +182,66 @@ pub fn fixed_size_binary_binary( } pub fn fixed_size_binary_to_binview(from: &FixedSizeBinaryArray) -> BinaryViewArray { - let mutable = MutableBinaryViewArray::from_values_iter(from.values_iter()); - mutable.freeze().with_validity(from.validity().cloned()) + let datatype = <[u8] as ViewType>::DATA_TYPE; + + // Fast path: all the views are inlineable + if from.size() <= View::MAX_INLINE_SIZE as usize { + // @NOTE: There is something with the code-generation of `View::new_inline_unchecked` that + // prevents it from properly SIMD-ing this loop. It insists on memcpying while it should + // know that the size is really small. Dispatching over the `from.size()` and making it + // constant does make loop SIMD, but it does not actually speed anything up and the code it + // generates is still horrible. + // + // This is really slow, and I don't think it has to be. + + // SAFETY: We checked that slice.len() <= View::MAX_INLINE_SIZE before + let views = from + .values_iter() + .map(|slice| unsafe { View::new_inline_unchecked(slice) }) + .collect::>(); + return BinaryViewArray::try_new(datatype, views, Arc::default(), from.validity().cloned()) + .unwrap(); + } + + const MAX_BYTES_PER_BUFFER: usize = u32::MAX as usize; + + let size = from.size(); + let num_bytes = from.len() * size; + let num_buffers = num_bytes.div_ceil(MAX_BYTES_PER_BUFFER); + assert!(num_buffers < u32::MAX as usize); + + let num_elements_per_buffer = MAX_BYTES_PER_BUFFER / size; + // This is NOT equal to MAX_BYTES_PER_BUFFER because of integer division + let split_point = num_elements_per_buffer * size; + + // This is zero-copy for the buffer since split just increases the the data since + let mut buffer = from.values().clone(); + let mut buffers = Vec::with_capacity(num_buffers); + for _ in 0..num_buffers - 1 { + let slice; + (slice, buffer) = buffer.split_at(split_point); + buffers.push(slice); + } + buffers.push(buffer); + + let mut iter = from.values_iter(); + let iter = iter.by_ref(); + let mut views = Vec::with_capacity(from.len()); + for buffer_idx in 0..num_buffers { + views.extend( + iter.take(num_elements_per_buffer) + .enumerate() + .map(|(i, slice)| { + // SAFETY: We checked that slice.len() > View::MAX_INLINE_SIZE before + unsafe { + View::new_noninline_unchecked(slice, buffer_idx as u32, (i * size) as u32) + } + }), + ); + } + let views = views.into(); + + BinaryViewArray::try_new(datatype, views, buffers.into(), from.validity().cloned()).unwrap() } /// Conversion of binary