From eba9a18fc601660737bca6401bd9fd90272e255e Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Tue, 7 Jan 2025 09:14:02 +0100 Subject: [PATCH] fix: Soundness when loading Parquet string statistics Fixes #20533. --- crates/polars-arrow/src/array/binview/mod.rs | 11 ++++++++++- .../src/arrow/read/statistics/binview.rs | 4 ++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/crates/polars-arrow/src/array/binview/mod.rs b/crates/polars-arrow/src/array/binview/mod.rs index 70804fb6c981..ebc2a6f03437 100644 --- a/crates/polars-arrow/src/array/binview/mod.rs +++ b/crates/polars-arrow/src/array/binview/mod.rs @@ -50,8 +50,9 @@ pub trait ViewType: Sealed + 'static + PartialEq + AsRef<Self> { type Owned: Debug + Clone + Sync + Send + AsRef<Self>; /// # Safety - /// The caller must ensure `index < self.len()`. + /// The caller must ensure that `slice` is a valid view. unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self; + fn from_bytes(slice: &[u8]) -> Option<&Self>; fn to_bytes(&self) -> &[u8]; @@ -70,6 +71,10 @@ impl ViewType for str { unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self { std::str::from_utf8_unchecked(slice) } + #[inline(always)] + fn from_bytes(slice: &[u8]) -> Option<&Self> { + std::str::from_utf8(slice).ok() + } #[inline(always)] fn to_bytes(&self) -> &[u8] { @@ -93,6 +98,10 @@ impl ViewType for [u8] { unsafe fn from_bytes_unchecked(slice: &[u8]) -> &Self { slice } + #[inline(always)] + fn from_bytes(slice: &[u8]) -> Option<&Self> { + Some(slice) + } #[inline(always)] fn to_bytes(&self) -> &[u8] { diff --git a/crates/polars-parquet/src/arrow/read/statistics/binview.rs b/crates/polars-parquet/src/arrow/read/statistics/binview.rs index d78b45004fb8..f89f401c733f 100644 --- a/crates/polars-parquet/src/arrow/read/statistics/binview.rs +++ b/crates/polars-parquet/src/arrow/read/statistics/binview.rs @@ -19,11 +19,11 @@ pub(super) fn push<T: ViewType + ?Sized>( min.push(from.and_then(|s| { let opt_b = s.min_value.as_deref(); - unsafe { opt_b.map(|b| 
T::from_bytes_unchecked(b)) } + opt_b.and_then(T::from_bytes) })); max.push(from.and_then(|s| { let opt_b = s.max_value.as_deref(); - unsafe { opt_b.map(|b| T::from_bytes_unchecked(b)) } + opt_b.and_then(T::from_bytes) })); Ok(())