Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Fix: native hashed-hash for MutableDictionaryArray #1564

Merged
merged 4 commits into from
Sep 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 65 additions & 17 deletions src/array/dictionary/value_map.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::borrow::Borrow;
use std::fmt::{self, Debug};
use std::hash::{BuildHasher, Hash, Hasher};
use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher};

use hashbrown::hash_map::RawEntryMut;
use hashbrown::HashMap;
Expand All @@ -15,10 +15,54 @@

use super::DictionaryKey;

/// Hasher for pre-hashed values; similar to `hash_hasher` but with native endianness.
///
/// We know that we'll only use it for `u64` values, so we can avoid endian conversion.
///
/// Invariant: hash of a u64 value is always equal to itself.
#[derive(Copy, Clone, Default)]
pub struct PassthroughHasher(u64);

impl Hasher for PassthroughHasher {
#[inline]
fn write_u64(&mut self, value: u64) {
self.0 = value;
}

fn write(&mut self, _: &[u8]) {
unreachable!();
}

Check warning on line 34 in src/array/dictionary/value_map.rs

View check run for this annotation

Codecov / codecov/patch

src/array/dictionary/value_map.rs#L32-L34

Added lines #L32 - L34 were not covered by tests

#[inline]
fn finish(&self) -> u64 {
self.0
}
}

#[derive(Clone)]

Check warning on line 42 in src/array/dictionary/value_map.rs

View check run for this annotation

Codecov / codecov/patch

src/array/dictionary/value_map.rs#L42

Added line #L42 was not covered by tests
pub struct Hashed<K> {
hash: u64,
key: K,
}

#[inline]
fn ahash_hash<T: Hash + ?Sized>(value: &T) -> u64 {
let mut hasher = BuildHasherDefault::<ahash::AHasher>::default().build_hasher();
value.hash(&mut hasher);
hasher.finish()
}

impl<K> Hash for Hashed<K> {
#[inline]
fn hash<H: Hasher>(&self, state: &mut H) {
self.hash.hash(state)
}
}

#[derive(Clone)]
pub struct ValueMap<K: DictionaryKey, M: MutableArray> {
values: M,
map: HashMap<K, ()>, // NB: *only* use insert_hashed_nocheck() and no other hashmap API
pub values: M,
pub map: HashMap<Hashed<K>, (), BuildHasherDefault<PassthroughHasher>>, // NB: *only* use insert_hashed_nocheck() and no other hashmap API
}

impl<K: DictionaryKey, M: MutableArray> ValueMap<K, M> {
Expand All @@ -39,22 +83,28 @@
M: Indexable,
M::Type: Eq + Hash,
{
let mut map = HashMap::with_capacity(values.len());
let mut map = HashMap::<Hashed<K>, _, _>::with_capacity_and_hasher(
values.len(),
BuildHasherDefault::<PassthroughHasher>::default(),
);
for index in 0..values.len() {
let key = K::try_from(index).map_err(|_| Error::Overflow)?;
// safety: we only iterate within bounds
let value = unsafe { values.value_unchecked_at(index) };
let mut hasher = map.hasher().build_hasher();
value.borrow().hash(&mut hasher);
let hash = hasher.finish();
match map.raw_entry_mut().from_hash(hash, |_| true) {
let hash = ahash_hash(value.borrow());
match map.raw_entry_mut().from_hash(hash, |item| {
// safety: invariant of the struct, it's always in bounds since we maintain it
let stored_value = unsafe { values.value_unchecked_at(item.key.as_usize()) };
stored_value.borrow() == value.borrow()
}) {
RawEntryMut::Occupied(_) => {
return Err(Error::InvalidArgumentError(
"duplicate value in dictionary values array".into(),
))
}
RawEntryMut::Vacant(entry) => {
entry.insert_hashed_nocheck(hash, key, ()); // NB: don't use .insert() here!
// NB: don't use .insert() here!
entry.insert_hashed_nocheck(hash, Hashed { hash, key }, ());
}
}
}
Expand Down Expand Up @@ -91,24 +141,22 @@
V: AsIndexed<M>,
M::Type: Eq + Hash,
{
let mut hasher = self.map.hasher().build_hasher();
value.as_indexed().hash(&mut hasher);
let hash = hasher.finish();

let hash = ahash_hash(value.as_indexed());
Ok(
match self.map.raw_entry_mut().from_hash(hash, |key| {
match self.map.raw_entry_mut().from_hash(hash, |item| {
// safety: we've already checked (the inverse) when we pushed it, so it should be ok?
let index = unsafe { key.as_usize() };
let index = unsafe { item.key.as_usize() };
// safety: invariant of the struct, it's always in bounds since we maintain it
let stored_value = unsafe { self.values.value_unchecked_at(index) };
stored_value.borrow() == value.as_indexed()
}) {
RawEntryMut::Occupied(entry) => *entry.key(),
RawEntryMut::Occupied(entry) => entry.key().key,
RawEntryMut::Vacant(entry) => {
let index = self.values.len();
let key = K::try_from(index).map_err(|_| Error::Overflow)?;
entry.insert_hashed_nocheck(hash, key, ()); // NB: don't use .insert() here!
entry.insert_hashed_nocheck(hash, Hashed { hash, key }, ()); // NB: don't use .insert() here!
push(&mut self.values, value)?;
debug_assert_eq!(self.values.len(), index + 1);
key
}
},
Expand Down
17 changes: 17 additions & 0 deletions tests/it/array/dictionary/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,3 +150,20 @@ fn test_push_utf8_ex() {
fn test_push_i64_ex() {
test_push_ex::<MutablePrimitiveArray<i64>, _>(vec![10, 20, 30, 20], |i| 1000 + i as i64);
}

#[test]
fn test_big_dict() {
let n = 10;
let strings = (0..10).map(|i| i.to_string()).collect::<Vec<_>>();
let mut arr = MutableDictionaryArray::<u8, MutableUtf8Array<i32>>::new();
for s in &strings {
arr.try_push(Some(s)).unwrap();
}
assert_eq!(arr.values().len(), n);
for _ in 0..10_000 {
for s in &strings {
arr.try_push(Some(s)).unwrap();
}
}
assert_eq!(arr.values().len(), n);
}
Loading