Skip to content

Commit

Permalink
Implement a runtime-creation-sized Bloom Filter (#5)
Browse files Browse the repository at this point in the history
# Description

- Adds a bloom filter implementation `runtime_sized::BloomFilter`
- Splits out bloom filter implementations across modules
- Implements rejection sampling in `HashIndexIterator` (has no effect if
the size is a power of two)

## Link to issue

Fixes #4 

## Type of change

- [x] New feature (non-breaking change that adds functionality)
- [x] Refactor (non-breaking change that updates existing functionality)
- [x] Breaking change (fix or feature that would cause existing
functionality to not work as expected)
- [x] This change requires a documentation update
- [x] Comments have been added/updated

## Test plan (required)

- [x] ~~Needs more tests. Consider this somewhat WIP.~~
  • Loading branch information
matheus23 authored Jul 31, 2023
1 parent ae5bda4 commit fe077c7
Show file tree
Hide file tree
Showing 8 changed files with 892 additions and 367 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use criterion::{criterion_group, criterion_main, Criterion};
use deterministic_bloom::BloomFilter;
use deterministic_bloom::const_size::BloomFilter;
use rand::Rng;

pub fn add_benchmark(crit: &mut Criterion) {
Expand Down
6 changes: 3 additions & 3 deletions deterministic-bloom-wasm/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
//! Wasm/JS bindings for [BloomFilter]

use derive_more::{From, Into};
use deterministic_bloom::BloomFilter;
use deterministic_bloom::const_size::BloomFilter;
use std::boxed::Box;
use wasm_bindgen::prelude::{wasm_bindgen, JsError};

Expand Down Expand Up @@ -175,9 +175,9 @@ macro_rules! gen_bloom {
}

impl TryFrom<Vec<u8>> for $name {
type Error = deterministic_bloom::Error;
type Error = deterministic_bloom::common::Error;

fn try_from(vec: Vec<u8>) -> Result<Self, deterministic_bloom::Error> {
fn try_from(vec: Vec<u8>) -> Result<Self, deterministic_bloom::common::Error> {
<BloomFilter<$n, $k>>::try_from(vec).map($name::from)
}
}
Expand Down
8 changes: 8 additions & 0 deletions deterministic-bloom/proptest-regressions/lib.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Seeds for failure cases proptest has generated in the past. It is
# automatically read and these particular cases re-run before any
# novel cases are generated.
#
# It is recommended to check this file in to source control so that
# everyone who runs the test benefits from these saved cases.
cc 291d42539f7fb8127033c08ad7c09dba8c1b74a026ba9af55d7c1cbfb3673e80 # shrinks to input = _BloomParamsFprCalcRoundTripsArgs { bloom_bytes: 1, n_elems: 1, fpr: 0.4123907350873679 }
cc a4e3f11f5e30029214f9bca6d33a195cbeeeac10f7c128da8f2124cc5ac1920b # shrinks to input = _BloomParamsFprCalcRoundTripsArgs { n_elems: 548, fpr: 0.8964377231312348 }
218 changes: 218 additions & 0 deletions deterministic-bloom/src/common.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
use std::{f64::consts::LN_2, fmt::Debug};
use xxhash_rust::xxh3;

/// An iterator that generates indices into some bloom filter based on deterministic hashing of specified item.
///
/// # Examples
///
/// ```
/// use deterministic_bloom::const_size::BloomFilter;
///
/// let filter = BloomFilter::<256, 30>::default();
/// let indices = filter.hash_indices(&[0xF5u8; 32]);
/// let indices = indices.collect::<Vec<_>>();
///
/// assert_eq!(indices.len(), 30);
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HashIndexIterator<'a, T: AsRef<[u8]>> {
item: &'a T,
bit_size: usize,
index: u64,
}

/// Optimal bloom parameters for some false positive rate at a maximum number of
/// elements added, or for some byte size with target element count, etc.
///
/// Captures the bloom filter byte size needed as well as the number of hash function
/// evaluations needed per item to insert.
///
/// To construct this, use
/// - [`BloomParams::new_from_fpr`] for constructing this from a given false positive rate and desired capacity,
/// - similarly [`BloomParams::new_from_fpr_po2`], but with power-of-two sizes,
/// - [`BloomParams::new_from_size`] for constructing from desired size and capacity.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug)]
pub struct BloomParams {
/// size of the bloom filter in bytes, non-zero
pub byte_size: usize,
/// hashing functions used/number of bits set per element, non-zero
pub k_hashes: usize,
}

/// Errors for [BloomFilter] operations.
#[derive(thiserror::Error, miette::Diagnostic, Debug)]
pub enum Error {
/// Report a size mismatch when importing a Bloom filter from a [Vec].
#[error("Cannot convert vector to BloomFilter: expected {expected}, but got {actual}")]
#[diagnostic(url(docsrs))]
VectorImportSizeMismatch {
/// The expected size in the [BloomFilter].
expected: usize,

/// The actual size of the [Vec].
actual: usize,
},
}

//------------------------------------------------------------------------------
// Implementations
//------------------------------------------------------------------------------

impl<'a, T: AsRef<[u8]>> HashIndexIterator<'a, T> {
/// Creates a new iterator.
pub fn new(item: &'a T, bit_size: usize) -> Self {
Self {
item,
index: 0,
bit_size,
}
}
}

impl<T: AsRef<[u8]>> Iterator for HashIndexIterator<'_, T> {
type Item = usize;

fn next(&mut self) -> Option<Self::Item> {
let bit_size_po2 = self.bit_size.next_power_of_two();
loop {
let hash = xxh3::xxh3_64_with_seed(self.item.as_ref(), self.index) as usize;
self.index += 1;

// Rejection sampling for non-power-of-two bit sizes
let value = hash % bit_size_po2;
if value < self.bit_size {
return Some(value);
}
}
}
}

impl BloomParams {
/// Construct optimal bloom parameters for given number maximum elements
/// that the bloom filter will hold as well as the approximate
/// false positive rate it should have at that capacity.
///
/// `n_elems` must be non-zero, and `fpr` must be between 0 and 1, exclusive.
///
/// This will generate non-power-of-two sizes for bloom filters.
/// For a variant that power-of-two (po2) sizes, see [`BloomParams::new_from_fpr_po2`].
///
/// # Example
///
/// ```
/// use deterministic_bloom::common::BloomParams;
///
/// // figure out bloom parameters for 47 elements with a one in a billion false positive rate:
/// let params = BloomParams::new_from_fpr(47, 1.0 / 1_000_000_000.0);
/// assert_eq!(params, BloomParams {
/// byte_size: 254,
/// k_hashes: 30,
/// })
/// ```
pub fn new_from_fpr(n_elems: u64, fpr: f64) -> Self {
let byte_size = Self::optimal_byte_size(n_elems, fpr);
let k_hashes = Self::optimal_k_hashes(byte_size * 8, n_elems);

Self {
byte_size,
k_hashes,
}
}

/// Construct optimal bloom parameters for given capacity `n_elems` and false positive rate,
/// where the target size will always be a power-of-two.
///
/// `n_elems` must be non-zero, and `fpr` must be between 0.0 and 1.0, exclusive.
///
/// It is often desirable to go for power-of-two sizes, since that simplifies generating
/// bit indices by not requiring rejection sampling.
///
/// # Example
///
/// ```
/// use deterministic_bloom::common::BloomParams;
///
/// // Generate some bloom parameters
/// let params = BloomParams::new_from_fpr_po2(1_000_000, 0.0001);
/// assert_eq!(params.byte_size, params.byte_size.next_power_of_two());
/// ```
pub fn new_from_fpr_po2(n_elems: u64, fpr: f64) -> Self {
let byte_size = Self::optimal_byte_size(n_elems, fpr).next_power_of_two();
let k_hashes = Self::optimal_k_hashes(byte_size * 8, n_elems);

Self {
byte_size,
k_hashes,
}
}

/// Construct optimal bloom parameters for given bloom filter `byte_size` and capacity `n_elems`.
pub fn new_from_size(byte_size: usize, n_elems: u64) -> Self {
Self {
byte_size,
k_hashes: Self::optimal_k_hashes(byte_size * 8, n_elems),
}
}

/// Compute the approximate false positive rate at `n_elems`.
/// `n_elems` must be non-zero.
///
/// Returns the false positive rate as a number between 0.0 and 1.0.
pub fn false_positive_rate_at(&self, n_elems: u64) -> f64 {
debug_assert!(n_elems != 0);

let k = self.k_hashes as f64;
let ki = self.k_hashes as i32;
let m = (self.byte_size * 8) as f64;
let n = n_elems as f64;

// see https://hur.st/bloomfilter/
(1.0 - (-k / (m / n)).exp()).powi(ki)
}

fn optimal_byte_size(n_elems: u64, fpr: f64) -> usize {
debug_assert!(n_elems != 0);
debug_assert!(fpr > 0.0 && fpr < 1.0);

let n = n_elems as f64;
let bit_size = n * fpr.ln() / -(LN_2 * LN_2);
(bit_size / 8.0).ceil() as usize
}

fn optimal_k_hashes(bloom_bits: usize, n_elems: u64) -> usize {
debug_assert!(bloom_bits != 0);
debug_assert!(n_elems != 0);

let m = bloom_bits as f64;
let n = n_elems as f64;
let k_hashes = ((m / n) * LN_2).ceil() as usize;
std::cmp::max(k_hashes, 1)
}
}

#[cfg(test)]
mod proptests {
use super::BloomParams;
use proptest::prop_assert;
use test_strategy::proptest;

#[proptest(cases = 10_000)]
fn bloom_params_fpr_calc_round_trips(
#[strategy(100u64..1_000_000)] n_elems: u64,
#[strategy(0.0..0.1)] fpr: f64,
) {
if fpr == 0.0 {
return Ok(());
}

let params = BloomParams::new_from_fpr(n_elems, fpr);
let fpr_computed = params.false_positive_rate_at(n_elems);

// The computed FPR can differ from the target FPR due to
// rounding errors and the fact that only multiple-of-8
// bloom sizes are allowed.
let fpr_diff = (fpr_computed - fpr).abs();
// We're fine if it's within 15% of a margin-of-error.
prop_assert!(fpr_diff < fpr * 0.15);
}
}
Loading

1 comment on commit fe077c7

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Performance Alert ⚠️

Possible performance regression was detected for benchmark 'Rust Benchmark'.
Benchmark result of this commit is worse than the previous benchmark result exceeding threshold 2.

Benchmark suite Current: fe077c7 Previous: ae5bda4 Ratio
add 1544 ns/iter (± 58) 535 ns/iter (± 1) 2.89

This comment was automatically generated by workflow using github-action-benchmark.

CC: @expede

Please sign in to comment.