From db20a812454e8b9e9ec099fc41564ea7c17c0e3f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 17 Jan 2025 16:56:10 -0500 Subject: [PATCH] Add `simdutf8` feature to make `simdutf8` optional, consolidate `check_valid_utf8` (#6979) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add `simdutf8` feature * Consolidate check utf8 * Publicly doc and export * fmt * Update parquet/src/util/utf8.rs Co-authored-by: Daniël Heres * enable by default --------- Co-authored-by: Daniël Heres --- .github/workflows/parquet.yml | 2 + parquet/Cargo.toml | 6 +- parquet/README.md | 20 ++++--- .../src/arrow/array_reader/byte_view_array.rs | 12 +--- parquet/src/arrow/buffer/offset_buffer.rs | 10 +--- parquet/src/lib.rs | 3 + parquet/src/util/mod.rs | 2 + parquet/src/util/utf8.rs | 57 +++++++++++++++++++ 8 files changed, 83 insertions(+), 29 deletions(-) create mode 100644 parquet/src/util/utf8.rs diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml index 19503fde7991..4c46fde198bd 100644 --- a/.github/workflows/parquet.yml +++ b/.github/workflows/parquet.yml @@ -97,6 +97,8 @@ jobs: run: cargo check -p parquet --no-default-features - name: Check compilation --no-default-features --features arrow run: cargo check -p parquet --no-default-features --features arrow + - name: Check compilation --no-default-features --features simdutf8 + run: cargo check -p parquet --no-default-features --features simdutf8 - name: Check compilation --no-default-features --all-features run: cargo check -p parquet --all-features - name: Check compilation --all-targets diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index c14c0e1d34c4..54992d864d85 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -69,7 +69,7 @@ paste = { version = "1.0" } half = { version = "2.1", default-features = false, features = ["num-traits"] } sysinfo = { version = "0.33.0", optional = true, default-features = false, features = ["system"] } crc32fast = {
version = "1.4.2", optional = true, default-features = false } -simdutf8 = { version = "0.1.5"} +simdutf8 = { version = "0.1.5", optional = true, default-features = false } [dev-dependencies] base64 = { version = "0.22", default-features = false, features = ["std"] } @@ -98,7 +98,7 @@ zstd-sys = { version = ">=2.0.0, <2.0.14", default-features = false } all-features = true [features] -default = ["arrow", "snap", "brotli", "flate2", "lz4", "zstd", "base64"] +default = ["arrow", "snap", "brotli", "flate2", "lz4", "zstd", "base64", "simdutf8"] # Enable lz4 lz4 = ["lz4_flex"] # Enable arrow reader/writer APIs @@ -121,6 +121,8 @@ zstd = ["dep:zstd", "zstd-sys"] sysinfo = ["dep:sysinfo"] # Verify 32-bit CRC checksum when decoding parquet pages crc = ["dep:crc32fast"] +# Enable SIMD UTF-8 validation +simdutf8 = ["dep:simdutf8"] [[example]] diff --git a/parquet/README.md b/parquet/README.md index 9ff1d921d692..1224e52f3f5a 100644 --- a/parquet/README.md +++ b/parquet/README.md @@ -51,17 +51,21 @@ major releases may contain breaking API changes. 
The `parquet` crate provides the following features which may be enabled in your `Cargo.toml`: -- `arrow` (default) - support for reading / writing [`arrow`](https://crates.io/crates/arrow) arrays to / from parquet -- `async` - support `async` APIs for reading parquet -- `json` - support for reading / writing `json` data to / from parquet -- `brotli` (default) - support for parquet using `brotli` compression -- `flate2` (default) - support for parquet using `gzip` compression -- `lz4` (default) - support for parquet using `lz4` compression -- `zstd` (default) - support for parquet using `zstd` compression -- `snap` (default) - support for parquet using `snappy` compression +- `arrow` (default) - support for reading / writing [`arrow`] arrays to / from Parquet +- `async` - support `async` APIs for reading Parquet +- `json` - support for reading / writing `json` data to / from Parquet +- `brotli` (default) - support for Parquet using `brotli` compression +- `flate2` (default) - support for Parquet using `gzip` compression +- `lz4` (default) - support for Parquet using `lz4` compression +- `zstd` (default) - support for Parquet using `zstd` compression +- `snap` (default) - support for Parquet using `snappy` compression - `cli` - parquet [CLI tools](https://github.com/apache/arrow-rs/tree/main/parquet/src/bin) - `crc` - enables functionality to automatically verify checksums of each page (if present) when decoding - `experimental` - Experimental APIs which may change, even between minor releases +- `simdutf8` (default) - Use the [`simdutf8`] crate for SIMD-accelerated UTF-8 validation + +[`arrow`]: https://crates.io/crates/arrow +[`simdutf8`]: https://crates.io/crates/simdutf8 ## Parquet Feature Status diff --git a/parquet/src/arrow/array_reader/byte_view_array.rs b/parquet/src/arrow/array_reader/byte_view_array.rs index 00627ad612ea..8df659060040 100644 --- a/parquet/src/arrow/array_reader/byte_view_array.rs +++ b/parquet/src/arrow/array_reader/byte_view_array.rs @@ 
-27,6 +27,7 @@ use crate::data_type::Int32Type; use crate::encodings::decoding::{Decoder, DeltaBitPackDecoder}; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; +use crate::util::utf8::check_valid_utf8; use arrow_array::{builder::make_view, ArrayRef}; use arrow_buffer::Buffer; use arrow_data::ByteView; @@ -681,17 +682,6 @@ impl ByteViewArrayDecoderDelta { } } -/// Check that `val` is a valid UTF-8 sequence -pub fn check_valid_utf8(val: &[u8]) -> Result<()> { - match simdutf8::basic::from_utf8(val) { - Ok(_) => Ok(()), - Err(_) => { - let e = simdutf8::compat::from_utf8(val).unwrap_err(); - Err(general_err!("encountered non UTF-8 data: {}", e)) - } - } -} - #[cfg(test)] mod tests { use arrow_array::StringViewArray; diff --git a/parquet/src/arrow/buffer/offset_buffer.rs b/parquet/src/arrow/buffer/offset_buffer.rs index 8dfb859612cb..5051dce12b37 100644 --- a/parquet/src/arrow/buffer/offset_buffer.rs +++ b/parquet/src/arrow/buffer/offset_buffer.rs @@ -18,6 +18,7 @@ use crate::arrow::buffer::bit_util::iter_set_bits_rev; use crate::arrow::record_reader::buffer::ValuesBuffer; use crate::errors::{ParquetError, Result}; +use crate::util::utf8::check_valid_utf8; use arrow_array::{make_array, ArrayRef, OffsetSizeTrait}; use arrow_buffer::{ArrowNativeType, Buffer}; use arrow_data::ArrayDataBuilder; @@ -117,14 +118,7 @@ impl OffsetBuffer { /// /// [`Self::try_push`] can perform this validation check on insertion pub fn check_valid_utf8(&self, start_offset: usize) -> Result<()> { - match simdutf8::basic::from_utf8(&self.values.as_slice()[start_offset..]) { - Ok(_) => Ok(()), - Err(_) => { - let e = simdutf8::compat::from_utf8(&self.values.as_slice()[start_offset..]) - .unwrap_err(); - Err(general_err!("encountered non UTF-8 data: {}", e)) - } - } + check_valid_utf8(&self.values.as_slice()[start_offset..]) } /// Converts this into an [`ArrayRef`] with the provided `data_type` and `null_buffer` diff --git a/parquet/src/lib.rs 
b/parquet/src/lib.rs index 3b63845e709c..3ca0dbe98791 100644 --- a/parquet/src/lib.rs +++ b/parquet/src/lib.rs @@ -131,6 +131,9 @@ pub mod data_type; pub use self::encodings::{decoding, encoding}; experimental!(#[macro_use] mod util); + +pub use util::utf8; + #[cfg(feature = "arrow")] pub mod arrow; pub mod column; diff --git a/parquet/src/util/mod.rs b/parquet/src/util/mod.rs index dfa1285afcf2..1431132473e9 100644 --- a/parquet/src/util/mod.rs +++ b/parquet/src/util/mod.rs @@ -19,8 +19,10 @@ pub mod bit_util; mod bit_pack; pub(crate) mod interner; + #[cfg(any(test, feature = "test_common"))] pub(crate) mod test_common; +pub mod utf8; #[cfg(any(test, feature = "test_common"))] pub use self::test_common::page_util::{ diff --git a/parquet/src/util/utf8.rs b/parquet/src/util/utf8.rs new file mode 100644 index 000000000000..2a537b5e53de --- /dev/null +++ b/parquet/src/util/utf8.rs @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`check_valid_utf8`] validation function +use crate::errors::{ParquetError, Result}; + +/// Check that `val` is a valid UTF-8 sequence. +/// +/// If the `simdutf8` feature is enabled, this function will use +/// SIMD-accelerated validation from the [`simdutf8`] crate. 
Otherwise, it will use +/// [`std::str::from_utf8`]. +/// +/// # Errors +/// +/// Returns `ParquetError::General` with a message compatible with [`std::str::from_utf8`] on failure. +/// +/// # Example +/// ``` +/// use parquet::utf8::check_valid_utf8; +/// assert!(check_valid_utf8(b"hello").is_ok()); +/// assert!(check_valid_utf8(b"hello \xF0\x9F\x98\x8E").is_ok()); +/// // invalid UTF-8 +/// assert!(check_valid_utf8(b"hello \xF0\x9F\x98").is_err()); +/// ``` +/// +/// [`simdutf8`]: https://crates.io/crates/simdutf8 +#[inline(always)] +pub fn check_valid_utf8(val: &[u8]) -> Result<()> { + #[cfg(feature = "simdutf8")] + match simdutf8::basic::from_utf8(val) { + Ok(_) => Ok(()), + Err(_) => { + // Use simdutf8::compat to return details about the decoding error + let e = simdutf8::compat::from_utf8(val).unwrap_err(); + Err(general_err!("encountered non UTF-8 data: {}", e)) + } + } + #[cfg(not(feature = "simdutf8"))] + match std::str::from_utf8(val) { + Ok(_) => Ok(()), + Err(e) => Err(general_err!("encountered non UTF-8 data: {}", e)), + } +}