Implement a runtime-creation-sized Bloom Filter (#5)

# Description

- Adds a bloom filter implementation `runtime_sized::BloomFilter`
- Splits out bloom filter implementations across modules
- Implements rejection sampling in `HashIndexIterator` (has no effect if the size is a power of two)

## Link to issue

Fixes #4

## Type of change

- [x] New feature (non-breaking change that adds functionality)
- [x] Refactor (non-breaking change that updates existing functionality)
- [x] Breaking change (fix or feature that would cause existing functionality to not work as expected)
- [x] This change requires a documentation update
- [x] Comments have been added/updated

## Test plan (required)

- [x] ~~Needs more tests. Consider this somewhat WIP.~~
wnfs-wg · Jul 31, 2023 · fe077c7 · fe077c7 · github-actions · Jul 31, 2023
1 parent ae5bda4
commit fe077c7
Show file tree

Hide file tree

Showing 8 changed files with 892 additions and 367 deletions.
diff --git a/deterministic-bloom-benches/benches/bloom_filter_benchmark.rs b/deterministic-bloom-benches/benches/bloom_filter_benchmark.rs
@@ -1,5 +1,5 @@
 use criterion::{criterion_group, criterion_main, Criterion};
-use deterministic_bloom::BloomFilter;
+use deterministic_bloom::const_size::BloomFilter;
 use rand::Rng;
 
 pub fn add_benchmark(crit: &mut Criterion) {

diff --git a/deterministic-bloom-wasm/src/lib.rs b/deterministic-bloom-wasm/src/lib.rs
@@ -5,7 +5,7 @@
 //! Wasm/JS bindings for [BloomFilter]
 
 use derive_more::{From, Into};
-use deterministic_bloom::BloomFilter;
+use deterministic_bloom::const_size::BloomFilter;
 use std::boxed::Box;
 use wasm_bindgen::prelude::{wasm_bindgen, JsError};
 
@@ -175,9 +175,9 @@ macro_rules! gen_bloom {
         }
 
         impl TryFrom<Vec<u8>> for $name {
-            type Error = deterministic_bloom::Error;
+            type Error = deterministic_bloom::common::Error;
 
-            fn try_from(vec: Vec<u8>) -> Result<Self, deterministic_bloom::Error> {
+            fn try_from(vec: Vec<u8>) -> Result<Self, deterministic_bloom::common::Error> {
                 <BloomFilter<$n, $k>>::try_from(vec).map($name::from)
             }
         }

diff --git a/deterministic-bloom/proptest-regressions/lib.txt b/deterministic-bloom/proptest-regressions/lib.txt
@@ -0,0 +1,8 @@
+# Seeds for failure cases proptest has generated in the past. It is
+# automatically read and these particular cases re-run before any
+# novel cases are generated.
+#
+# It is recommended to check this file in to source control so that
+# everyone who runs the test benefits from these saved cases.
+cc 291d42539f7fb8127033c08ad7c09dba8c1b74a026ba9af55d7c1cbfb3673e80 # shrinks to input = _BloomParamsFprCalcRoundTripsArgs { bloom_bytes: 1, n_elems: 1, fpr: 0.4123907350873679 }
+cc a4e3f11f5e30029214f9bca6d33a195cbeeeac10f7c128da8f2124cc5ac1920b # shrinks to input = _BloomParamsFprCalcRoundTripsArgs { n_elems: 548, fpr: 0.8964377231312348 }
diff --git a/deterministic-bloom/src/common.rs b/deterministic-bloom/src/common.rs
@@ -0,0 +1,218 @@
+use std::{f64::consts::LN_2, fmt::Debug};
+use xxhash_rust::xxh3;
+
+/// An iterator that generates indices into some bloom filter based on deterministic hashing of the specified item.
+///
+/// # Examples
+///
+/// ```
+/// use deterministic_bloom::const_size::BloomFilter;
+///
+/// let filter = BloomFilter::<256, 30>::default();
+/// let indices = filter.hash_indices(&[0xF5u8; 32]);
+/// let indices = indices.collect::<Vec<_>>();
+///
+/// assert_eq!(indices.len(), 30);
+/// ```
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct HashIndexIterator<'a, T: AsRef<[u8]>> {
+    item: &'a T,
+    bit_size: usize,
+    index: u64,
+}
+
+/// Optimal bloom parameters for some false positive rate at a maximum number of
+/// elements added, or for some byte size with target element count, etc.
+///
+/// Captures the bloom filter byte size needed as well as the number of hash function
+/// evaluations needed per item to insert.
+///
+/// To construct this, use
+/// - [`BloomParams::new_from_fpr`] for constructing this from a given false positive rate and desired capacity,
+/// - similarly [`BloomParams::new_from_fpr_po2`], but with power-of-two sizes,
+/// - [`BloomParams::new_from_size`] for constructing from desired size and capacity.
+#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug)]
+pub struct BloomParams {
+    /// size of the bloom filter in bytes, non-zero
+    pub byte_size: usize,
+    /// hashing functions used/number of bits set per element, non-zero
+    pub k_hashes: usize,
+}
+
+/// Errors for [BloomFilter] operations.
+#[derive(thiserror::Error, miette::Diagnostic, Debug)]
+pub enum Error {
+    /// Report a size mismatch when importing a Bloom filter from a [Vec].
+    #[error("Cannot convert vector to BloomFilter: expected {expected}, but got {actual}")]
+    #[diagnostic(url(docsrs))]
+    VectorImportSizeMismatch {
+        /// The expected size in the [BloomFilter].
+        expected: usize,
+
+        /// The actual size of the [Vec].
+        actual: usize,
+    },
+}
+
+//------------------------------------------------------------------------------
+// Implementations
+//------------------------------------------------------------------------------
+
+impl<'a, T: AsRef<[u8]>> HashIndexIterator<'a, T> {
+    /// Creates a new iterator.
+    pub fn new(item: &'a T, bit_size: usize) -> Self {
+        Self {
+            item,
+            index: 0,
+            bit_size,
+        }
+    }
+}
+
+impl<T: AsRef<[u8]>> Iterator for HashIndexIterator<'_, T> {
+    type Item = usize;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let bit_size_po2 = self.bit_size.next_power_of_two();
+        loop {
+            let hash = xxh3::xxh3_64_with_seed(self.item.as_ref(), self.index) as usize;
+            self.index += 1;
+
+            // Rejection sampling for non-power-of-two bit sizes
+            let value = hash % bit_size_po2;
+            if value < self.bit_size {
+                return Some(value);
+            }
+        }
+    }
+}
+
+impl BloomParams {
+    /// Construct optimal bloom parameters for the given maximum number of elements
+    /// that the bloom filter will hold as well as the approximate
+    /// false positive rate it should have at that capacity.
+    ///
+    /// `n_elems` must be non-zero, and `fpr` must be between 0 and 1, exclusive.
+    ///
+    /// This will generate non-power-of-two sizes for bloom filters.
+    /// For a variant that generates power-of-two (po2) sizes, see [`BloomParams::new_from_fpr_po2`].
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use deterministic_bloom::common::BloomParams;
+    ///
+    /// // figure out bloom parameters for 47 elements with a one in a billion false positive rate:
+    /// let params = BloomParams::new_from_fpr(47, 1.0 / 1_000_000_000.0);
+    /// assert_eq!(params, BloomParams {
+    ///     byte_size: 254,
+    ///     k_hashes: 30,
+    /// })
+    /// ```
+    pub fn new_from_fpr(n_elems: u64, fpr: f64) -> Self {
+        let byte_size = Self::optimal_byte_size(n_elems, fpr);
+        let k_hashes = Self::optimal_k_hashes(byte_size * 8, n_elems);
+
+        Self {
+            byte_size,
+            k_hashes,
+        }
+    }
+
+    /// Construct optimal bloom parameters for given capacity `n_elems` and false positive rate,
+    /// where the target size will always be a power-of-two.
+    ///
+    /// `n_elems` must be non-zero, and `fpr` must be between 0.0 and 1.0, exclusive.
+    ///
+    /// It is often desirable to go for power-of-two sizes, since that simplifies generating
+    /// bit indices by not requiring rejection sampling.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use deterministic_bloom::common::BloomParams;
+    ///
+    /// // Generate some bloom parameters
+    /// let params = BloomParams::new_from_fpr_po2(1_000_000, 0.0001);
+    /// assert_eq!(params.byte_size, params.byte_size.next_power_of_two());
+    /// ```
+    pub fn new_from_fpr_po2(n_elems: u64, fpr: f64) -> Self {
+        let byte_size = Self::optimal_byte_size(n_elems, fpr).next_power_of_two();
+        let k_hashes = Self::optimal_k_hashes(byte_size * 8, n_elems);
+
+        Self {
+            byte_size,
+            k_hashes,
+        }
+    }
+
+    /// Construct optimal bloom parameters for given bloom filter `byte_size` and capacity `n_elems`.
+    pub fn new_from_size(byte_size: usize, n_elems: u64) -> Self {
+        Self {
+            byte_size,
+            k_hashes: Self::optimal_k_hashes(byte_size * 8, n_elems),
+        }
+    }
+
+    /// Compute the approximate false positive rate at `n_elems`.
+    /// `n_elems` must be non-zero.
+    ///
+    /// Returns the false positive rate as a number between 0.0 and 1.0.
+    pub fn false_positive_rate_at(&self, n_elems: u64) -> f64 {
+        debug_assert!(n_elems != 0);
+
+        let k = self.k_hashes as f64;
+        let ki = self.k_hashes as i32;
+        let m = (self.byte_size * 8) as f64;
+        let n = n_elems as f64;
+
+        // see https://hur.st/bloomfilter/
+        (1.0 - (-k / (m / n)).exp()).powi(ki)
+    }
+
+    fn optimal_byte_size(n_elems: u64, fpr: f64) -> usize {
+        debug_assert!(n_elems != 0);
+        debug_assert!(fpr > 0.0 && fpr < 1.0);
+
+        let n = n_elems as f64;
+        let bit_size = n * fpr.ln() / -(LN_2 * LN_2);
+        (bit_size / 8.0).ceil() as usize
+    }
+
+    fn optimal_k_hashes(bloom_bits: usize, n_elems: u64) -> usize {
+        debug_assert!(bloom_bits != 0);
+        debug_assert!(n_elems != 0);
+
+        let m = bloom_bits as f64;
+        let n = n_elems as f64;
+        let k_hashes = ((m / n) * LN_2).ceil() as usize;
+        std::cmp::max(k_hashes, 1)
+    }
+}
+
+#[cfg(test)]
+mod proptests {
+    use super::BloomParams;
+    use proptest::prop_assert;
+    use test_strategy::proptest;
+
+    #[proptest(cases = 10_000)]
+    fn bloom_params_fpr_calc_round_trips(
+        #[strategy(100u64..1_000_000)] n_elems: u64,
+        #[strategy(0.0..0.1)] fpr: f64,
+    ) {
+        if fpr == 0.0 {
+            return Ok(());
+        }
+
+        let params = BloomParams::new_from_fpr(n_elems, fpr);
+        let fpr_computed = params.false_positive_rate_at(n_elems);
+
+        // The computed FPR can differ from the target FPR due to
+        // rounding errors and the fact that only multiple-of-8
+        // bloom sizes are allowed.
+        let fpr_diff = (fpr_computed - fpr).abs();
+        // We're fine if it's within a 15% margin of error of the target FPR.
+        prop_assert!(fpr_diff < fpr * 0.15);
+    }
+}