Skip to content

Commit

Permalink
Faster parquet utf8 validation using simdutf8 (apache#6668)
Browse files Browse the repository at this point in the history
* Faster utf8 validation

* Move dependency

---------

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
  • Loading branch information
2 people authored and svencowart committed Jan 14, 2025
1 parent e3a290f commit fb2852b
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 4 deletions.
1 change: 1 addition & 0 deletions parquet/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ paste = { version = "1.0" }
half = { version = "2.1", default-features = false, features = ["num-traits"] }
sysinfo = { version = "0.33.0", optional = true, default-features = false, features = ["system"] }
crc32fast = { version = "1.4.2", optional = true, default-features = false }
simdutf8 = { version = "0.1.5"}

[dev-dependencies]
base64 = { version = "0.22", default-features = false, features = ["std"] }
Expand Down
7 changes: 5 additions & 2 deletions parquet/src/arrow/array_reader/byte_view_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -683,9 +683,12 @@ impl ByteViewArrayDecoderDelta {

/// Check that `val` is a valid UTF-8 sequence
///
/// Returns `Ok(())` when `val` validates as UTF-8. Otherwise returns an error built with
/// `general_err!` whose message is "encountered non UTF-8 data: " followed by the
/// validator's error.
pub fn check_valid_utf8(val: &[u8]) -> Result<()> {
match std::str::from_utf8(val) {
match simdutf8::basic::from_utf8(val) {
Ok(_) => Ok(()),
Err(e) => Err(general_err!("encountered non UTF-8 data: {}", e)),
Err(_) => {
// The error from the `basic` validator is discarded; the same bytes are validated a
// second time with the `compat` validator and its error is the one reported.
// NOTE(review): `unwrap_err` panics if `compat` were to accept input that `basic`
// rejected — presumably the two validators always agree; confirm against simdutf8 docs.
let e = simdutf8::compat::from_utf8(val).unwrap_err();
Err(general_err!("encountered non UTF-8 data: {}", e))
}
}
}

Expand Down
8 changes: 6 additions & 2 deletions parquet/src/arrow/buffer/offset_buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,13 @@ impl<I: OffsetSizeTrait> OffsetBuffer<I> {
///
/// [`Self::try_push`] can perform this validation check on insertion
pub fn check_valid_utf8(&self, start_offset: usize) -> Result<()> {
// Only the bytes of `self.values` from `start_offset` to the end are validated; the slice
// expression panics if `start_offset` exceeds the length of the values buffer.
match std::str::from_utf8(&self.values.as_slice()[start_offset..]) {
match simdutf8::basic::from_utf8(&self.values.as_slice()[start_offset..]) {
Ok(_) => Ok(()),
Err(e) => Err(general_err!("encountered non UTF-8 data: {}", e)),
Err(_) => {
// The error from the `basic` validator is discarded; the same byte range is validated a
// second time with the `compat` validator and its error is the one reported.
// NOTE(review): `unwrap_err` panics if `compat` were to accept input that `basic`
// rejected — presumably the two validators always agree; confirm against simdutf8 docs.
let e = simdutf8::compat::from_utf8(&self.values.as_slice()[start_offset..])
.unwrap_err();
Err(general_err!("encountered non UTF-8 data: {}", e))
}
}
}

Expand Down

0 comments on commit fb2852b

Please sign in to comment.