From 9f4448b15b591ca0bb85c9be5eaa80a5be82e446 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 23 Aug 2024 13:20:18 -0400 Subject: [PATCH] always train in bulk --- benches/compress.rs | 105 +++++++------------------- examples/file_compressor.rs | 7 +- examples/round_trip.rs | 2 +- fuzz/fuzz_targets/fuzz_compress.rs | 2 +- fuzz/fuzz_targets/fuzz_train.rs | 2 +- src/builder.rs | 115 ++++++++++++----------------- src/lib.rs | 15 +--- src/lossy_pht.rs | 46 +++--------- tests/correctness.rs | 15 ++-- 9 files changed, 102 insertions(+), 207 deletions(-) diff --git a/benches/compress.rs b/benches/compress.rs index 6499a34..c27a0c1 100644 --- a/benches/compress.rs +++ b/benches/compress.rs @@ -63,7 +63,7 @@ fn bench_dbtext(c: &mut Criterion) { let mut text = String::new(); let lines: Vec<&[u8]> = { - let mut file = File::open("benches/data/wikipedia").unwrap(); + let mut file = File::open(path).unwrap(); file.read_to_string(&mut text).unwrap(); text.lines().map(|line| line.as_bytes()).collect() @@ -71,13 +71,13 @@ fn bench_dbtext(c: &mut Criterion) { group.bench_function("train-and-compress", |b| { b.iter(|| { - let compressor = Compressor::train_bulk(&lines); + let compressor = Compressor::train(&lines); let _ = std::hint::black_box(compressor.compress_bulk(std::hint::black_box(&lines))); }); }); - let compressor = Compressor::train_bulk(&lines); + let compressor = Compressor::train(&lines); group.throughput(Throughput::Bytes( lines.iter().map(|l| l.len() as u64).sum::(), )); @@ -89,6 +89,24 @@ fn bench_dbtext(c: &mut Criterion) { }); group.finish(); + + // Report the compression factor for this dataset. + let uncompressed_size = lines.iter().map(|l| l.len()).sum::(); + let compressor = Compressor::train(&lines); + + // Show the symbols + for code in 256..compressor.symbol_table().len() { + let symbol = compressor.symbol_table()[code]; + let code = code - 256; + println!("symbol[{code}] = {symbol:?}"); + } + + let compressed = compressor.compress_bulk(&lines); + let compressed_size = compressed.iter().map(|l| l.len()).sum::(); + let ratio = 100.0 * (compressed_size as f64) / (uncompressed_size as f64); + println!( + "compressed {name} {uncompressed_size} => {compressed_size}B ({ratio}% of original)" + ) } run_dataset_bench( @@ -111,81 +129,14 @@ fn bench_dbtext(c: &mut Criterion) { "benches/data/urls", c, ); -} - -fn bench_tpch_comments(c: &mut Criterion) { - let mut group = c.benchmark_group("tpch"); - - group.bench_function("train-only", |b| { - b.iter(|| { - let mut file = File::open("/Users/aduffy/code/cwi-fsst/build/comments").unwrap(); - let mut text = String::new(); - file.read_to_string(&mut text).unwrap(); - - let lines: Vec<&str> = text.lines().collect(); - let lines_sliced: Vec<&[u8]> = lines.iter().map(|s| s.as_bytes()).collect(); - - let _ = - std::hint::black_box(Compressor::train_bulk(std::hint::black_box(&lines_sliced))); - // let _ = std::hint::black_box(compressor.compress_bulk(&lines_sliced)); - }); - }); - - let mut file = File::open("/Users/aduffy/code/cwi-fsst/build/comments").unwrap(); - let mut text = String::new(); - file.read_to_string(&mut text).unwrap(); - - let lines: Vec<&str> = text.lines().collect(); - let lines_sliced: Vec<&[u8]> = lines.iter().map(|s| s.as_bytes()).collect(); - - let compressor = Compressor::train_bulk(&lines_sliced); - - group.throughput(Throughput::Bytes( - lines.iter().map(|l| l.len() as u64).sum::(), - )); - group.bench_function("compress-only", |b| { - b.iter(|| { - let _ = std::hint::black_box(compressor.compress_bulk(&lines_sliced)); - }); - }); - group.bench_function("train-and-compress", |b| { - b.iter(|| { - let mut file = File::open("/Users/aduffy/code/cwi-fsst/build/comments").unwrap(); - let mut text = String::new(); - file.read_to_string(&mut text).unwrap(); - - let lines: Vec<&str> = text.lines().collect(); - let lines_sliced: Vec<&[u8]> = lines.iter().map(|s| s.as_bytes()).collect(); - - let compressor = Compressor::train_bulk(&lines_sliced); - let _ = std::hint::black_box(compressor.compress_bulk(&lines_sliced)); - }); - }); - - group.finish(); - - let mut file = File::open("/Users/aduffy/code/cwi-fsst/build/comments").unwrap(); - let mut text = String::new(); - file.read_to_string(&mut text).unwrap(); - - let lines: Vec<&str> = text.lines().collect(); - let lines_sliced: Vec<&[u8]> = lines.iter().map(|s| s.as_bytes()).collect(); - let mut lines_total = Vec::new(); - for slice in &lines_sliced { - lines_total.extend_from_slice(slice); - } - - let compressor = Compressor::train_bulk(&lines_sliced); - let compressed = compressor.compress(&lines_total); - - println!( - "compressed {} => {} ({}%)", - lines_total.len(), - compressed.len(), - 100.0 * (compressed.len() as f64) / (lines_total.len() as f64), - ) + run_dataset_bench( + "dbtext/urls", + "https://raw.githubusercontent.com/cwida/fsst/4e188a/paper/dbtext/urls", + "benches/data/urls", + c, + ); } -criterion_group!(compress_bench, bench_tpch_comments, bench_dbtext); +criterion_group!(compress_bench, bench_dbtext); criterion_main!(compress_bench); diff --git a/examples/file_compressor.rs b/examples/file_compressor.rs index 1e3a58d..2c52649 100644 --- a/examples/file_compressor.rs +++ b/examples/file_compressor.rs @@ -37,12 +37,11 @@ fn main() { let mut output = File::create(output_path).unwrap(); - let compressor = Compressor::train_bulk(&lines); + let compressor = Compressor::train(&lines); let mut compressed_size = 0; for text in lines { - let compressed = compressor.compress(&text); - compressed_size += compressed.len(); - output.write(&compressed).unwrap(); + let compressed = compressor.compress(text); + compressed_size += output.write(&compressed).unwrap(); } println!( diff --git a/examples/round_trip.rs b/examples/round_trip.rs index 038b932..7044f72 100644 --- a/examples/round_trip.rs +++ b/examples/round_trip.rs @@ -7,7 +7,7 @@ use fsst::Compressor; fn main() { // Train on a sample. let sample = "the quick brown fox jumped over the lazy dog"; - let trained = Compressor::train(sample.as_bytes()); + let trained = Compressor::train(&vec![sample.as_bytes()]); let compressed = trained.compress(sample.as_bytes()); println!("compressed: {} => {}", sample.len(), compressed.len()); // decompress now diff --git a/fuzz/fuzz_targets/fuzz_compress.rs b/fuzz/fuzz_targets/fuzz_compress.rs index a871293..50e9d31 100644 --- a/fuzz/fuzz_targets/fuzz_compress.rs +++ b/fuzz/fuzz_targets/fuzz_compress.rs @@ -4,7 +4,7 @@ use libfuzzer_sys::fuzz_target; fuzz_target!(|data: &[u8]| { let compressor = - fsst::Compressor::train("the quick brown fox jumped over the lazy dog".as_bytes()); + fsst::Compressor::train(&vec![b"the quick brown fox jumped over the lazy dog"]); let compress = compressor.compress(data); let decompress = compressor.decompressor().decompress(&compress); assert_eq!(&decompress, data); diff --git a/fuzz/fuzz_targets/fuzz_train.rs b/fuzz/fuzz_targets/fuzz_train.rs index 5d3dada..18581e1 100644 --- a/fuzz/fuzz_targets/fuzz_train.rs +++ b/fuzz/fuzz_targets/fuzz_train.rs @@ -3,5 +3,5 @@ use libfuzzer_sys::fuzz_target; fuzz_target!(|data: &[u8]| { - let _ = fsst::Compressor::train(data); + let _ = fsst::Compressor::train(&vec![data]); }); diff --git a/src/builder.rs b/src/builder.rs index 6c612b1..bc6f452 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -206,9 +206,9 @@ impl Counter { /// /// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf #[cfg(not(miri))] -const MAX_GENERATIONS: usize = 5; +const GENERATIONS: [usize; 5] = [8usize, 38, 68, 98, 128]; #[cfg(miri)] -const MAX_GENERATIONS: usize = 2; +const GENERATIONS: [usize; 3] = [8usize, 38, 128]; const FSST_SAMPLETARGET: usize = 1 << 14; const FSST_SAMPLEMAX: usize = 1 << 15; @@ -220,6 +220,7 @@ const FSST_SAMPLELINE: usize = 512; /// returned slices are pointers into the `sample_buf`. /// /// SAFETY: sample_buf must be >= FSST_SAMPLEMAX bytes long. Providing something less may cause unexpected failures. +#[allow(clippy::ptr_arg)] fn make_sample<'a, 'b: 'a>(sample_buf: &'a mut Vec, str_in: &Vec<&'b [u8]>) -> Vec<&'a [u8]> { debug_assert!( sample_buf.capacity() >= FSST_SAMPLEMAX, @@ -239,13 +240,13 @@ fn make_sample<'a, 'b: 'a>(sample_buf: &'a mut Vec, str_in: &Vec<&'b [u8]>) while sample_buf_offset < sample_lim { sample_rnd = fsst_hash(sample_rnd); - let mut line_nr = sample_rnd % str_in.len(); + let mut line_nr = (sample_rnd as usize) % str_in.len(); // Find the first non-empty chunk starting at line_nr, wrapping around if // necessary. // // TODO: this will loop infinitely if there are no non-empty lines in the sample - while str_in[line_nr].len() == 0 { + while str_in[line_nr].is_empty() { if line_nr == str_in.len() { line_nr = 0; } @@ -254,10 +255,9 @@ fn make_sample<'a, 'b: 'a>(sample_buf: &'a mut Vec, str_in: &Vec<&'b [u8]>) let line = str_in[line_nr]; let chunks = 1 + ((line.len() - 1) / FSST_SAMPLELINE); sample_rnd = fsst_hash(sample_rnd); - let chunk = FSST_SAMPLELINE * (sample_rnd % chunks); + let chunk = FSST_SAMPLELINE * ((sample_rnd as usize) % chunks); let len = FSST_SAMPLELINE.min(line.len() - chunk); - // println!("extending sample with chunk str_in[{line_nr}][{chunk}...len={len}]"); sample_buf.extend_from_slice(&str_in[line_nr][chunk..chunk + len]); @@ -273,7 +273,11 @@ fn make_sample<'a, 'b: 'a>(sample_buf: &'a mut Vec, str_in: &Vec<&'b [u8]>) sample } -fn fsst_hash(value: usize) -> usize { +/// Hash function used in various components of the library. +/// +/// This is equivalent to the FSST_HASH macro from the C++ implementation. +#[inline] +pub(crate) fn fsst_hash(value: u64) -> u64 { (value * 2971215073) ^ (value >> 15) } @@ -307,56 +311,24 @@ impl Compressor { /// code). /// /// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf - pub fn train(corpus: impl AsRef<[u8]>) -> Self { + pub fn train(values: &Vec<&[u8]>) -> Self { + let mut counters = Counter::new(); let mut compressor = Compressor::default(); - // TODO(aduffy): handle truncating/sampling if corpus > requires sample size. - let sample = corpus.as_ref(); - if sample.is_empty() { - return compressor; - } - - // Make the sample for each iteration. - // - // The sample is just a vector of slices, so we don't actually have to move anything around. - let mut counter = Counter::new(); - for _generation in 0..(MAX_GENERATIONS - 1) { - compressor.compress_count(sample, &mut counter); - compressor.optimize(&counter, 128); - counter.clear(); + if values.is_empty() { + return compressor; } - compressor.compress_count(sample, &mut counter); - compressor.optimize(&counter, 128); - - compressor - } - - /// Train on a collection of samples. - pub fn train_bulk(values: &Vec<&[u8]>) -> Self { let mut sample_memory = Vec::with_capacity(FSST_SAMPLEMAX); let sample = make_sample(&mut sample_memory, values); - - let mut counters = Counter::new(); - let mut compressor = Compressor::default(); - - for sample_frac in [8usize, 38, 68, 98, 128] { - // let mut skips = 0; - for i in 0..sample.len() { - if sample_frac < 128 { - if fsst_hash(i) & 127 > sample_frac { - // skips += 1; - continue; - } + for sample_frac in GENERATIONS { + for (i, line) in sample.iter().enumerate() { + if sample_frac < 128 && ((fsst_hash(i as u64) & 127) as usize) > sample_frac { + continue; } - compressor.compress_count(sample[i], &mut counters); + compressor.compress_count(line, &mut counters); } - // println!( - // "sampleFrac={sample_frac} -- skipped {} of {}", - // skips, - // sample.len() - // ); compressor.optimize(&counters, sample_frac); counters.clear(); @@ -403,6 +375,11 @@ impl Compressor { counter.record_count1(code_u16); if prev_code != MAX_CODE { counter.record_count2(prev_code, code_u16); + // Also record the first byte of the next code + let first_byte_code = + self.symbols[code_u16 as usize].first_byte() as u16; + counter.record_count1(first_byte_code); + counter.record_count2(prev_code, first_byte_code); } prev_code = code_u16; } @@ -494,9 +471,9 @@ impl Compressor { // From the c++ impl: // "improves both compression speed (less candidates), but also quality!!" - if count < 5 * sample_frac / 128 { - continue; - } + // if count < (5 * sample_frac / 128) { + // continue; + // } let mut gain = count * symbol1_len; // NOTE: use heuristic from C++ implementation to boost the gain of single-byte symbols. @@ -512,7 +489,7 @@ impl Compressor { }); } - // Skip on last round, or when symbol cannot be extended. + // Skip merges on last round, or when symbol cannot be extended. if sample_frac >= 128 || symbol1_len == 8 { continue; } @@ -552,19 +529,19 @@ impl Compressor { // // Note that because of the lossy hash table, we won't accidentally // save the same ASCII character twice into the table. - // if include_ascii { - // for character in - // " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ[](){}:?/<>".bytes() - // { - // if n_symbols == 255 { - // break; - // } - - // if self.insert(Symbol::from_u8(character)) { - // n_symbols += 1 - // } - // } - // } + if sample_frac < 128 { + for character in + " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ[](){}:?/<>".bytes() + { + if n_symbols == 255 { + break; + } + + if self.insert(Symbol::from_u8(character)) { + n_symbols += 1 + } + } + } } } @@ -613,11 +590,13 @@ mod test { #[test] fn test_builder() { // Train a Compressor on the toy string - let text = "hello world"; - let table = Compressor::train(text.as_bytes()); + let text = b"hello hello hello hello"; + + // count of 5 is the cutoff for including a symbol in the table. + let table = Compressor::train(&vec![text, text, text, text, text]); // Use the table to compress a string, see the values - let compressed = table.compress(text.as_bytes()); + let compressed = table.compress(text); // Ensure that the compressed string has no escape bytes assert!(compressed.iter().all(|b| *b != ESCAPE_CODE)); diff --git a/src/lib.rs b/src/lib.rs index f8e472f..132919c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -105,19 +105,6 @@ impl Symbol { } } -#[cfg(test)] -mod test { - use crate::Symbol; - - #[test] - fn test_concat() { - let symbola = Symbol::from_u8(b'a'); - let symbolb = Symbol::from_u8(b'b'); - let symbolab = symbola.concat(symbolb); - assert_eq!(&symbolab.0.to_le_bytes()[0..symbolab.len()], b"ab"); - } -} - impl Debug for Symbol { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { let slice = &self.0.to_le_bytes()[0..self.len()]; @@ -557,7 +544,7 @@ impl Compressor { #[inline] pub(crate) fn advance_8byte_word(word: u64, bytes: usize) -> u64 { - // shift the word off the right-end, because little endian means the first + // shift the word off the low-end, because little endian means the first // char is stored in the LSB. // // Note that even though this looks like it branches, Rust compiles this to a diff --git a/src/lossy_pht.rs b/src/lossy_pht.rs index e420d86..6afc9cc 100644 --- a/src/lossy_pht.rs +++ b/src/lossy_pht.rs @@ -1,5 +1,6 @@ use std::fmt::Debug; +use crate::builder::fsst_hash; use crate::CodeMeta; use crate::Symbol; use crate::MAX_CODE; @@ -81,56 +82,33 @@ impl LossyPHT { /// True if the symbol was inserted into the table, false if it was rejected due to collision. pub(crate) fn insert(&mut self, symbol: Symbol, code: u8) -> bool { let prefix_3bytes = symbol.as_u64() & 0xFF_FF_FF; - let slot = self.hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1); - - unsafe { - let entry = self.slots.as_mut_ptr().add(slot); - if (*entry).code.extended_code() != MAX_CODE { - // in-use - false - } else { - // unused - (*entry).symbol = symbol; - (*entry).code = CodeMeta::new_symbol(code, symbol); - (*entry).ignored_bits = (64 - 8 * symbol.len()) as u16; - true - } + let slot = fsst_hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1); + let entry = &mut self.slots[slot]; + if !entry.is_unused() { + false + } else { + entry.symbol = symbol; + entry.code = CodeMeta::new_symbol(code, symbol); + entry.ignored_bits = (64 - 8 * symbol.len()) as u16; + true } - - // let entry = &mut self.slots[slot]; - // if !entry.is_unused() { - // false - // } else { - // entry.symbol = symbol; - // entry.code = CodeMeta::new_symbol(code, symbol); - // entry.ignored_bits = (64 - 8 * symbol.len()) as u16; - // true - // } } /// Remove the symbol from the hashtable, if it exists. pub(crate) fn remove(&mut self, symbol: Symbol) { let prefix_3bytes = symbol.as_u64() & 0xFF_FF_FF; - let slot = self.hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1); + let slot = fsst_hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1); self.slots[slot].code = CodeMeta::EMPTY; } #[inline] pub(crate) fn lookup(&self, word: u64) -> &TableEntry { let prefix_3bytes = word & 0xFF_FF_FF; - let slot = self.hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1); + let slot = fsst_hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1); // SAFETY: the slot is guaranteed to between 0...(HASH_TABLE_SIZE - 1). unsafe { self.slots.get_unchecked(slot) } } - - /// Hash a value to find the bucket it belongs in. - /// - /// The particular hash function comes from the code listing of Algorithm 4 of the FSST paper. - #[inline] - fn hash(&self, value: u64) -> u64 { - (value * 2971215073) ^ (value >> 15) - } } impl Default for LossyPHT { diff --git a/tests/correctness.rs b/tests/correctness.rs index bded628..946170f 100644 --- a/tests/correctness.rs +++ b/tests/correctness.rs @@ -1,6 +1,6 @@ #![cfg(test)] -use fsst::Compressor; +use fsst::{Compressor, Symbol}; static PREAMBLE: &str = r#" When in the Course of human events, it becomes necessary for one people to dissolve @@ -16,7 +16,7 @@ static ART_OF_WAR: &str = include_str!("./fixtures/art_of_war.txt"); #[test] fn test_basic() { // Roundtrip the declaration - let trained = Compressor::train(PREAMBLE); + let trained = Compressor::train(&vec![PREAMBLE.as_bytes()]); let compressed = trained.compress(PREAMBLE.as_bytes()); let decompressed = trained.decompressor().decompress(&compressed); assert_eq!(decompressed, PREAMBLE.as_bytes()); @@ -24,7 +24,7 @@ fn test_basic() { #[test] fn test_train_on_empty() { - let trained = Compressor::train(""); + let trained = Compressor::train(&vec![]); // We can still compress with it, but the symbols are going to be empty. let compressed = trained.compress("the quick brown fox jumped over the lazy dog".as_bytes()); assert_eq!( @@ -35,7 +35,8 @@ fn test_train_on_empty() { #[test] fn test_one_byte() { - let empty = Compressor::train(&[0x01]); + let mut empty = Compressor::default(); + empty.insert(Symbol::from_u8(0x01)); let compressed = empty.compress(&[0x01]); assert_eq!(compressed, vec![0u8]); @@ -46,7 +47,7 @@ fn test_one_byte() { #[test] fn test_zeros() { let training_data: Vec = vec![0, 1, 2, 3, 4, 0]; - let trained = Compressor::train(&training_data); + let trained = Compressor::train(&vec![&training_data]); let compressed = trained.compress(&[4, 0]); assert_eq!(trained.decompressor().decompress(&compressed), &[4, 0]); } @@ -56,7 +57,7 @@ fn test_zeros() { fn test_large() { let corpus: Vec = DECLARATION.bytes().cycle().take(10_240).collect(); - let trained = Compressor::train(&corpus); + let trained = Compressor::train(&vec![&corpus]); let massive: Vec = DECLARATION .bytes() .cycle() @@ -69,7 +70,7 @@ fn test_large() { #[test] fn test_chinese() { - let trained = Compressor::train(ART_OF_WAR.as_bytes()); + let trained = Compressor::train(&vec![ART_OF_WAR.as_bytes()]); assert_eq!( ART_OF_WAR.as_bytes(), trained