Skip to content

Commit

Permalink
Add simdutf8 feature to make simdutf8 optional, consolidate `chec…
Browse files Browse the repository at this point in the history
…k_valid_utf8` (#6979)

* Add `simd8tf8` feature

* Consolidate check utf8

* Publically doc and export

* fmt

* Update parquet/src/util/utf8.rs

Co-authored-by: Daniël Heres <danielheres@gmail.com>

* enable by default

---------

Co-authored-by: Daniël Heres <danielheres@gmail.com>
  • Loading branch information
alamb and Dandandan authored Jan 17, 2025
1 parent 9cd7018 commit db20a81
Show file tree
Hide file tree
Showing 8 changed files with 83 additions and 29 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/parquet.yml
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ jobs:
run: cargo check -p parquet --no-default-features
- name: Check compilation --no-default-features --features arrow
run: cargo check -p parquet --no-default-features --features arrow
- name: Check compilation --no-default-features --features simdutf8
run: cargo check -p parquet --no-default-features --features simdutf8
- name: Check compilation --no-default-features --all-features
run: cargo check -p parquet --all-features
- name: Check compilation --all-targets
Expand Down
6 changes: 4 additions & 2 deletions parquet/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ paste = { version = "1.0" }
half = { version = "2.1", default-features = false, features = ["num-traits"] }
sysinfo = { version = "0.33.0", optional = true, default-features = false, features = ["system"] }
crc32fast = { version = "1.4.2", optional = true, default-features = false }
simdutf8 = { version = "0.1.5"}
simdutf8 = { version = "0.1.5", optional = true, default-features = false }

[dev-dependencies]
base64 = { version = "0.22", default-features = false, features = ["std"] }
Expand Down Expand Up @@ -98,7 +98,7 @@ zstd-sys = { version = ">=2.0.0, <2.0.14", default-features = false }
all-features = true

[features]
default = ["arrow", "snap", "brotli", "flate2", "lz4", "zstd", "base64"]
default = ["arrow", "snap", "brotli", "flate2", "lz4", "zstd", "base64", "simdutf8"]
# Enable lz4
lz4 = ["lz4_flex"]
# Enable arrow reader/writer APIs
Expand All @@ -121,6 +121,8 @@ zstd = ["dep:zstd", "zstd-sys"]
sysinfo = ["dep:sysinfo"]
# Verify 32-bit CRC checksum when decoding parquet pages
crc = ["dep:crc32fast"]
# Enable SIMD UTF-8 validation
simdutf8 = ["dep:simdutf8"]


[[example]]
Expand Down
20 changes: 12 additions & 8 deletions parquet/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,17 +51,21 @@ major releases may contain breaking API changes.

The `parquet` crate provides the following features which may be enabled in your `Cargo.toml`:

- `arrow` (default) - support for reading / writing [`arrow`](https://crates.io/crates/arrow) arrays to / from parquet
- `async` - support `async` APIs for reading parquet
- `json` - support for reading / writing `json` data to / from parquet
- `brotli` (default) - support for parquet using `brotli` compression
- `flate2` (default) - support for parquet using `gzip` compression
- `lz4` (default) - support for parquet using `lz4` compression
- `zstd` (default) - support for parquet using `zstd` compression
- `snap` (default) - support for parquet using `snappy` compression
- `arrow` (default) - support for reading / writing [`arrow`] arrays to / from Parquet
- `async` - support `async` APIs for reading Parquet
- `json` - support for reading / writing `json` data to / from Parquet
- `brotli` (default) - support for Parquet using `brotli` compression
- `flate2` (default) - support for Parquet using `gzip` compression
- `lz4` (default) - support for Parquet using `lz4` compression
- `zstd` (default) - support for Parquet using `zstd` compression
- `snap` (default) - support for Parquet using `snappy` compression
- `cli` - parquet [CLI tools](https://github.com/apache/arrow-rs/tree/main/parquet/src/bin)
- `crc` - enables functionality to automatically verify checksums of each page (if present) when decoding
- `experimental` - Experimental APIs which may change, even between minor releases
- `simdutf8` (default) - Use the [`simdutf8`] crate for SIMD-accelerated UTF-8 validation

[`arrow`]: https://crates.io/crates/arrow
[`simdutf8`]: https://crates.io/crates/simdutf8

## Parquet Feature Status

Expand Down
12 changes: 1 addition & 11 deletions parquet/src/arrow/array_reader/byte_view_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ use crate::data_type::Int32Type;
use crate::encodings::decoding::{Decoder, DeltaBitPackDecoder};
use crate::errors::{ParquetError, Result};
use crate::schema::types::ColumnDescPtr;
use crate::util::utf8::check_valid_utf8;
use arrow_array::{builder::make_view, ArrayRef};
use arrow_buffer::Buffer;
use arrow_data::ByteView;
Expand Down Expand Up @@ -681,17 +682,6 @@ impl ByteViewArrayDecoderDelta {
}
}

/// Check that `val` is a valid UTF-8 sequence
pub fn check_valid_utf8(val: &[u8]) -> Result<()> {
match simdutf8::basic::from_utf8(val) {
Ok(_) => Ok(()),
Err(_) => {
let e = simdutf8::compat::from_utf8(val).unwrap_err();
Err(general_err!("encountered non UTF-8 data: {}", e))
}
}
}

#[cfg(test)]
mod tests {
use arrow_array::StringViewArray;
Expand Down
10 changes: 2 additions & 8 deletions parquet/src/arrow/buffer/offset_buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
use crate::arrow::buffer::bit_util::iter_set_bits_rev;
use crate::arrow::record_reader::buffer::ValuesBuffer;
use crate::errors::{ParquetError, Result};
use crate::util::utf8::check_valid_utf8;
use arrow_array::{make_array, ArrayRef, OffsetSizeTrait};
use arrow_buffer::{ArrowNativeType, Buffer};
use arrow_data::ArrayDataBuilder;
Expand Down Expand Up @@ -117,14 +118,7 @@ impl<I: OffsetSizeTrait> OffsetBuffer<I> {
///
/// [`Self::try_push`] can perform this validation check on insertion
pub fn check_valid_utf8(&self, start_offset: usize) -> Result<()> {
match simdutf8::basic::from_utf8(&self.values.as_slice()[start_offset..]) {
Ok(_) => Ok(()),
Err(_) => {
let e = simdutf8::compat::from_utf8(&self.values.as_slice()[start_offset..])
.unwrap_err();
Err(general_err!("encountered non UTF-8 data: {}", e))
}
}
check_valid_utf8(&self.values.as_slice()[start_offset..])
}

/// Converts this into an [`ArrayRef`] with the provided `data_type` and `null_buffer`
Expand Down
3 changes: 3 additions & 0 deletions parquet/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,9 @@ pub mod data_type;
pub use self::encodings::{decoding, encoding};

experimental!(#[macro_use] mod util);

pub use util::utf8;

#[cfg(feature = "arrow")]
pub mod arrow;
pub mod column;
Expand Down
2 changes: 2 additions & 0 deletions parquet/src/util/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@
pub mod bit_util;
mod bit_pack;
pub(crate) mod interner;

#[cfg(any(test, feature = "test_common"))]
pub(crate) mod test_common;
pub mod utf8;

#[cfg(any(test, feature = "test_common"))]
pub use self::test_common::page_util::{
Expand Down
57 changes: 57 additions & 0 deletions parquet/src/util/utf8.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! [`check_valid_utf8`] validation function
use crate::errors::{ParquetError, Result};

/// Check that `val` is a valid UTF-8 sequence.
///
/// If the `simdutf8` feature is enabled, this function will use
/// SIMD-accelerated validation from the [`simdutf8`] crate. Otherwise, it will use
/// [`std::str::from_utf8`].
///
/// # Errors
///
/// Returns `Err::General` with a message compatible with [`std::str::from_utf8`] on failure.
///
/// # Example
/// ```
/// use parquet::utf8::check_valid_utf8;
/// assert!(check_valid_utf8(b"hello").is_ok());
/// assert!(check_valid_utf8(b"hello \xF0\x9F\x98\x8E").is_ok());
/// // invalid UTF-8
/// assert!(check_valid_utf8(b"hello \xF0\x9F\x98").is_err());
/// ```
///
/// [`simdutf8`]: https://crates.io/crates/simdutf8
#[inline(always)]
pub fn check_valid_utf8(val: &[u8]) -> Result<()> {
#[cfg(feature = "simdutf8")]
match simdutf8::basic::from_utf8(val) {
Ok(_) => Ok(()),
Err(_) => {
// Use simdutf8::compat to return details about the decoding error
let e = simdutf8::compat::from_utf8(val).unwrap_err();
Err(general_err!("encountered non UTF-8 data: {}", e))
}
}
#[cfg(not(feature = "simdutf8"))]
match std::str::from_utf8(val) {
Ok(_) => Ok(()),
Err(e) => Err(general_err!("encountered non UTF-8 data: {}", e)),
}
}

0 comments on commit db20a81

Please sign in to comment.