From 1fe0f9457449b7a82da58a65d6fe02c9b7b6dcd5 Mon Sep 17 00:00:00 2001 From: Ivan Smirnov Date: Tue, 5 Sep 2023 19:19:16 +0100 Subject: [PATCH 1/4] chore: add another test for MutableDictionaryArray --- tests/it/array/dictionary/mutable.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/it/array/dictionary/mutable.rs b/tests/it/array/dictionary/mutable.rs index 9570339893..a7845114d9 100644 --- a/tests/it/array/dictionary/mutable.rs +++ b/tests/it/array/dictionary/mutable.rs @@ -150,3 +150,20 @@ fn test_push_utf8_ex() { fn test_push_i64_ex() { test_push_ex::, _>(vec![10, 20, 30, 20], |i| 1000 + i as i64); } + +#[test] +fn test_big_dict() { + let n = 10; + let strings = (0..10).map(|i| i.to_string()).collect::>(); + let mut arr = MutableDictionaryArray::>::new(); + for s in &strings { + arr.try_push(Some(s)).unwrap(); + } + assert_eq!(arr.values().len(), n); + for _ in 0..10_000 { + for s in &strings { + arr.try_push(Some(s)).unwrap(); + } + } + assert_eq!(arr.values().len(), n); +} From 79f4b61506267be6fede372057bd9ac16543ca3a Mon Sep 17 00:00:00 2001 From: Ivan Smirnov Date: Tue, 5 Sep 2023 19:19:45 +0100 Subject: [PATCH 2/4] fix: switch ValueMap to hashed-hash layout --- src/array/dictionary/value_map.rs | 90 +++++++++++++++++++++++++------ 1 file changed, 74 insertions(+), 16 deletions(-) diff --git a/src/array/dictionary/value_map.rs b/src/array/dictionary/value_map.rs index 35b59aaa2a..44b1075422 100644 --- a/src/array/dictionary/value_map.rs +++ b/src/array/dictionary/value_map.rs @@ -1,6 +1,6 @@ use std::borrow::Borrow; use std::fmt::{self, Debug}; -use std::hash::{BuildHasher, Hash, Hasher}; +use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher}; use hashbrown::hash_map::RawEntryMut; use hashbrown::HashMap; @@ -15,10 +15,61 @@ use crate::{ use super::DictionaryKey; +/// Hasher for pre-hashed values; similar to `hash_hasher` but with native endianness. +/// +/// We know that we'll only use it for `u64` values, so we can avoid endian conversion. +/// +/// Invariant: hash of a u64 value is always equal to itself. +#[derive(Copy, Clone, Default)] +pub struct HashHasherNative(u64); + +impl Hasher for HashHasherNative { + #[inline] + fn write(&mut self, value: &[u8]) { + #[cfg(target_endian = "little")] + { + for (index, item) in value.iter().enumerate().take(8) { + self.0 ^= u64::from(*item) << (index * 8); + } + } + #[cfg(target_endian = "big")] + { + for (index, item) in value.iter().rev().enumerate().take(8) { + self.0 ^= u64::from(*item) << (index * 8); + } + } + } + + #[inline] + fn finish(&self) -> u64 { + self.0 + } +} + +#[derive(Clone)] +pub struct Hashed { + hash: u64, + key: K, +} + +#[inline] +fn ahash_hash(value: &T) -> u64 { + let mut hasher = BuildHasherDefault::::default().build_hasher(); + value.hash(&mut hasher); + hasher.finish() +} + +impl Hash for Hashed { + #[inline] + fn hash(&self, state: &mut H) { + self.hash.hash(state) + } +} + #[derive(Clone)] pub struct ValueMap { - values: M, - map: HashMap, // NB: *only* use insert_hashed_nocheck() and no other hashmap API + pub values: M, + pub map: HashMap, (), BuildHasherDefault>, // NB: *only* use insert_hashed_nocheck() and no other hashmap API } impl ValueMap { @@ -39,22 +90,29 @@ impl ValueMap { M: Indexable, M::Type: Eq + Hash, { - let mut map = HashMap::with_capacity(values.len()); + let mut map = HashMap::, _, _>::with_capacity_and_hasher( + values.len(), + BuildHasherDefault::::default(), + ); for index in 0..values.len() { let key = K::try_from(index).map_err(|_| Error::Overflow)?; // safety: we only iterate within bounds let value = unsafe { values.value_unchecked_at(index) }; - let mut hasher = map.hasher().build_hasher(); - value.borrow().hash(&mut hasher); - let hash = hasher.finish(); - match map.raw_entry_mut().from_hash(hash, |_| true) { + let hash = ahash_hash(value.borrow()); + let hash_hash = hash; // NOTE: we can do this ONLY due to the invariant of HashHasherNative + match map.raw_entry_mut().from_hash(hash_hash, |item| { + // safety: invariant of the struct, it's always in bounds since we maintain it + let stored_value = unsafe { values.value_unchecked_at(item.key.as_usize()) }; + stored_value.borrow() == value.borrow() + }) { RawEntryMut::Occupied(_) => { return Err(Error::InvalidArgumentError( "duplicate value in dictionary values array".into(), )) } RawEntryMut::Vacant(entry) => { - entry.insert_hashed_nocheck(hash, key, ()); // NB: don't use .insert() here! + entry.insert_hashed_nocheck(hash_hash, Hashed { hash, key }, ()); + // NB: don't use .insert() here! } } } @@ -91,24 +149,24 @@ impl ValueMap { V: AsIndexed, M::Type: Eq + Hash, { - let mut hasher = self.map.hasher().build_hasher(); - value.as_indexed().hash(&mut hasher); - let hash = hasher.finish(); + let hash = ahash_hash(value.as_indexed()); + let hash_hash = hash; // NOTE: we can do this ONLY due to the invariant of HashHasherNative Ok( - match self.map.raw_entry_mut().from_hash(hash, |key| { + match self.map.raw_entry_mut().from_hash(hash_hash, |item| { // safety: we've already checked (the inverse) when we pushed it, so it should be ok? - let index = unsafe { key.as_usize() }; + let index = unsafe { item.key.as_usize() }; // safety: invariant of the struct, it's always in bounds since we maintain it let stored_value = unsafe { self.values.value_unchecked_at(index) }; stored_value.borrow() == value.as_indexed() }) { - RawEntryMut::Occupied(entry) => *entry.key(), + RawEntryMut::Occupied(entry) => entry.key().key, RawEntryMut::Vacant(entry) => { let index = self.values.len(); let key = K::try_from(index).map_err(|_| Error::Overflow)?; - entry.insert_hashed_nocheck(hash, key, ()); // NB: don't use .insert() here! + entry.insert_hashed_nocheck(hash_hash, Hashed { hash, key }, ()); // NB: don't use .insert() here! push(&mut self.values, value)?; + assert_eq!(self.values.len(), index + 1); key } }, From a801819a1b03ca87ee13ac4cc6c9399cf7380fcb Mon Sep 17 00:00:00 2001 From: Ivan Smirnov Date: Wed, 6 Sep 2023 11:19:54 +0100 Subject: [PATCH 3/4] fix: update ValueMap to use passthrough-hasher --- src/array/dictionary/value_map.rs | 38 ++++++++++++------------------- 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/src/array/dictionary/value_map.rs b/src/array/dictionary/value_map.rs index 44b1075422..8fe0a0cb87 100644 --- a/src/array/dictionary/value_map.rs +++ b/src/array/dictionary/value_map.rs @@ -21,23 +21,16 @@ use super::DictionaryKey; /// /// Invariant: hash of a u64 value is always equal to itself. #[derive(Copy, Clone, Default)] -pub struct HashHasherNative(u64); +pub struct PassthroughHasher(u64); -impl Hasher for HashHasherNative { +impl Hasher for PassthroughHasher { #[inline] - fn write(&mut self, value: &[u8]) { - #[cfg(target_endian = "little")] - { - for (index, item) in value.iter().enumerate().take(8) { - self.0 ^= u64::from(*item) << (index * 8); - } - } - #[cfg(target_endian = "big")] - { - for (index, item) in value.iter().rev().enumerate().take(8) { - self.0 ^= u64::from(*item) << (index * 8); - } - } + fn write_u64(&mut self, value: u64) { + self.0 = value; + } + + fn write(&mut self, _: &[u8]) { + unreachable!(); } #[inline] @@ -69,7 +62,7 @@ impl Hash for Hashed { #[derive(Clone)] pub struct ValueMap { pub values: M, - pub map: HashMap, (), BuildHasherDefault>, // NB: *only* use insert_hashed_nocheck() and no other hashmap API + pub map: HashMap, (), BuildHasherDefault>, // NB: *only* use insert_hashed_nocheck() and no other hashmap API } impl ValueMap { @@ -92,15 +85,14 @@ impl ValueMap { { let mut map = HashMap::, _, _>::with_capacity_and_hasher( values.len(), - BuildHasherDefault::::default(), + BuildHasherDefault::::default(), ); for index in 0..values.len() { let key = K::try_from(index).map_err(|_| Error::Overflow)?; // safety: we only iterate within bounds let value = unsafe { values.value_unchecked_at(index) }; let hash = ahash_hash(value.borrow()); - let hash_hash = hash; // NOTE: we can do this ONLY due to the invariant of HashHasherNative - match map.raw_entry_mut().from_hash(hash_hash, |item| { + match map.raw_entry_mut().from_hash(hash, |item| { // safety: invariant of the struct, it's always in bounds since we maintain it let stored_value = unsafe { values.value_unchecked_at(item.key.as_usize()) }; stored_value.borrow() == value.borrow() @@ -111,8 +103,8 @@ impl ValueMap { )) } RawEntryMut::Vacant(entry) => { - entry.insert_hashed_nocheck(hash_hash, Hashed { hash, key }, ()); // NB: don't use .insert() here! + entry.insert_hashed_nocheck(hash, Hashed { hash, key }, ()); } } } @@ -150,10 +142,8 @@ impl ValueMap { M::Type: Eq + Hash, { let hash = ahash_hash(value.as_indexed()); - let hash_hash = hash; // NOTE: we can do this ONLY due to the invariant of HashHasherNative - Ok( - match self.map.raw_entry_mut().from_hash(hash_hash, |item| { + match self.map.raw_entry_mut().from_hash(hash, |item| { // safety: we've already checked (the inverse) when we pushed it, so it should be ok? let index = unsafe { item.key.as_usize() }; // safety: invariant of the struct, it's always in bounds since we maintain it @@ -164,7 +154,7 @@ impl ValueMap { RawEntryMut::Vacant(entry) => { let index = self.values.len(); let key = K::try_from(index).map_err(|_| Error::Overflow)?; - entry.insert_hashed_nocheck(hash_hash, Hashed { hash, key }, ()); // NB: don't use .insert() here! + entry.insert_hashed_nocheck(hash, Hashed { hash, key }, ()); // NB: don't use .insert() here! push(&mut self.values, value)?; assert_eq!(self.values.len(), index + 1); key From 4ba4f1faa307f07d2a314a0812ddb59e0ee05030 Mon Sep 17 00:00:00 2001 From: Ivan Smirnov Date: Wed, 6 Sep 2023 13:45:22 +0100 Subject: [PATCH 4/4] fix: assert_eq->debug_assert_eq in ValueMap --- src/array/dictionary/value_map.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/array/dictionary/value_map.rs b/src/array/dictionary/value_map.rs index 8fe0a0cb87..eb0f8790ca 100644 --- a/src/array/dictionary/value_map.rs +++ b/src/array/dictionary/value_map.rs @@ -156,7 +156,7 @@ impl ValueMap { let key = K::try_from(index).map_err(|_| Error::Overflow)?; entry.insert_hashed_nocheck(hash, Hashed { hash, key }, ()); // NB: don't use .insert() here! push(&mut self.values, value)?; - assert_eq!(self.values.len(), index + 1); + debug_assert_eq!(self.values.len(), index + 1); key } },