From 9f4448b15b591ca0bb85c9be5eaa80a5be82e446 Mon Sep 17 00:00:00 2001
From: Andrew Duffy <andreweduffy@gmail.com>
Date: Fri, 23 Aug 2024 13:20:18 -0400
Subject: [PATCH] always train in bulk

---
 benches/compress.rs                | 105 +++++++-------------------
 examples/file_compressor.rs        |   7 +-
 examples/round_trip.rs             |   2 +-
 fuzz/fuzz_targets/fuzz_compress.rs |   2 +-
 fuzz/fuzz_targets/fuzz_train.rs    |   2 +-
 src/builder.rs                     | 115 ++++++++++++-----------------
 src/lib.rs                         |  15 +---
 src/lossy_pht.rs                   |  46 +++---------
 tests/correctness.rs               |  15 ++--
 9 files changed, 102 insertions(+), 207 deletions(-)
diff --git a/benches/compress.rs b/benches/compress.rs
index 6499a34..c27a0c1 100644
--- a/benches/compress.rs
+++ b/benches/compress.rs
@@ -63,7 +63,7 @@ fn bench_dbtext(c: &mut Criterion) {
 
         let mut text = String::new();
         let lines: Vec<&[u8]> = {
-            let mut file = File::open("benches/data/wikipedia").unwrap();
+            let mut file = File::open(path).unwrap();
             file.read_to_string(&mut text).unwrap();
 
             text.lines().map(|line| line.as_bytes()).collect()
@@ -71,13 +71,13 @@ fn bench_dbtext(c: &mut Criterion) {
 
         group.bench_function("train-and-compress", |b| {
             b.iter(|| {
-                let compressor = Compressor::train_bulk(&lines);
+                let compressor = Compressor::train(&lines);
                 let _ =
                     std::hint::black_box(compressor.compress_bulk(std::hint::black_box(&lines)));
             });
         });
 
-        let compressor = Compressor::train_bulk(&lines);
+        let compressor = Compressor::train(&lines);
         group.throughput(Throughput::Bytes(
             lines.iter().map(|l| l.len() as u64).sum::<u64>(),
         ));
@@ -89,6 +89,24 @@ fn bench_dbtext(c: &mut Criterion) {
         });
 
         group.finish();
+
+        // Report the compression factor for this dataset.
+        let uncompressed_size = lines.iter().map(|l| l.len()).sum::<usize>();
+        let compressor = Compressor::train(&lines);
+
+        // Show the symbols
+        for code in 256..compressor.symbol_table().len() {
+            let symbol = compressor.symbol_table()[code];
+            let code = code - 256;
+            println!("symbol[{code}] = {symbol:?}");
+        }
+
+        let compressed = compressor.compress_bulk(&lines);
+        let compressed_size = compressed.iter().map(|l| l.len()).sum::<usize>();
+        let ratio = 100.0 * (compressed_size as f64) / (uncompressed_size as f64);
+        println!(
+            "compressed {name} {uncompressed_size} => {compressed_size}B ({ratio}% of original)"
+        )
     }
 
     run_dataset_bench(
@@ -111,81 +129,14 @@ fn bench_dbtext(c: &mut Criterion) {
         "benches/data/urls",
         c,
     );
-}
-
-fn bench_tpch_comments(c: &mut Criterion) {
-    let mut group = c.benchmark_group("tpch");
-
-    group.bench_function("train-only", |b| {
-        b.iter(|| {
-            let mut file = File::open("/Users/aduffy/code/cwi-fsst/build/comments").unwrap();
-            let mut text = String::new();
-            file.read_to_string(&mut text).unwrap();
-
-            let lines: Vec<&str> = text.lines().collect();
-            let lines_sliced: Vec<&[u8]> = lines.iter().map(|s| s.as_bytes()).collect();
-
-            let _ =
-                std::hint::black_box(Compressor::train_bulk(std::hint::black_box(&lines_sliced)));
-            // let _ = std::hint::black_box(compressor.compress_bulk(&lines_sliced));
-        });
-    });
-
-    let mut file = File::open("/Users/aduffy/code/cwi-fsst/build/comments").unwrap();
-    let mut text = String::new();
-    file.read_to_string(&mut text).unwrap();
-
-    let lines: Vec<&str> = text.lines().collect();
-    let lines_sliced: Vec<&[u8]> = lines.iter().map(|s| s.as_bytes()).collect();
-
-    let compressor = Compressor::train_bulk(&lines_sliced);
-
-    group.throughput(Throughput::Bytes(
-        lines.iter().map(|l| l.len() as u64).sum::<u64>(),
-    ));
-    group.bench_function("compress-only", |b| {
-        b.iter(|| {
-            let _ = std::hint::black_box(compressor.compress_bulk(&lines_sliced));
-        });
-    });
 
-    group.bench_function("train-and-compress", |b| {
-        b.iter(|| {
-            let mut file = File::open("/Users/aduffy/code/cwi-fsst/build/comments").unwrap();
-            let mut text = String::new();
-            file.read_to_string(&mut text).unwrap();
-
-            let lines: Vec<&str> = text.lines().collect();
-            let lines_sliced: Vec<&[u8]> = lines.iter().map(|s| s.as_bytes()).collect();
-
-            let compressor = Compressor::train_bulk(&lines_sliced);
-            let _ = std::hint::black_box(compressor.compress_bulk(&lines_sliced));
-        });
-    });
-
-    group.finish();
-
-    let mut file = File::open("/Users/aduffy/code/cwi-fsst/build/comments").unwrap();
-    let mut text = String::new();
-    file.read_to_string(&mut text).unwrap();
-
-    let lines: Vec<&str> = text.lines().collect();
-    let lines_sliced: Vec<&[u8]> = lines.iter().map(|s| s.as_bytes()).collect();
-    let mut lines_total = Vec::new();
-    for slice in &lines_sliced {
-        lines_total.extend_from_slice(slice);
-    }
-
-    let compressor = Compressor::train_bulk(&lines_sliced);
-    let compressed = compressor.compress(&lines_total);
-
-    println!(
-        "compressed {} => {} ({}%)",
-        lines_total.len(),
-        compressed.len(),
-        100.0 * (compressed.len() as f64) / (lines_total.len() as f64),
-    )
+    run_dataset_bench(
+        "dbtext/urls",
+        "https://raw.githubusercontent.com/cwida/fsst/4e188a/paper/dbtext/urls",
+        "benches/data/urls",
+        c,
+    );
 }
 
-criterion_group!(compress_bench, bench_tpch_comments, bench_dbtext);
+criterion_group!(compress_bench, bench_dbtext);
 criterion_main!(compress_bench);
diff --git a/examples/file_compressor.rs b/examples/file_compressor.rs
index 1e3a58d..2c52649 100644
--- a/examples/file_compressor.rs
+++ b/examples/file_compressor.rs
@@ -37,12 +37,11 @@ fn main() {
 
     let mut output = File::create(output_path).unwrap();
 
-    let compressor = Compressor::train_bulk(&lines);
+    let compressor = Compressor::train(&lines);
     let mut compressed_size = 0;
     for text in lines {
-        let compressed = compressor.compress(&text);
-        compressed_size += compressed.len();
-        output.write(&compressed).unwrap();
+        let compressed = compressor.compress(text);
+        compressed_size += output.write(&compressed).unwrap();
     }
 
     println!(
diff --git a/examples/round_trip.rs b/examples/round_trip.rs
index 038b932..7044f72 100644
--- a/examples/round_trip.rs
+++ b/examples/round_trip.rs
@@ -7,7 +7,7 @@ use fsst::Compressor;
 fn main() {
     // Train on a sample.
     let sample = "the quick brown fox jumped over the lazy dog";
-    let trained = Compressor::train(sample.as_bytes());
+    let trained = Compressor::train(&vec![sample.as_bytes()]);
     let compressed = trained.compress(sample.as_bytes());
     println!("compressed: {} => {}", sample.len(), compressed.len());
     // decompress now
diff --git a/fuzz/fuzz_targets/fuzz_compress.rs b/fuzz/fuzz_targets/fuzz_compress.rs
index a871293..50e9d31 100644
--- a/fuzz/fuzz_targets/fuzz_compress.rs
+++ b/fuzz/fuzz_targets/fuzz_compress.rs
@@ -4,7 +4,7 @@ use libfuzzer_sys::fuzz_target;
 
 fuzz_target!(|data: &[u8]| {
     let compressor =
-        fsst::Compressor::train("the quick brown fox jumped over the lazy dog".as_bytes());
+        fsst::Compressor::train(&vec![b"the quick brown fox jumped over the lazy dog"]);
     let compress = compressor.compress(data);
     let decompress = compressor.decompressor().decompress(&compress);
     assert_eq!(&decompress, data);
diff --git a/fuzz/fuzz_targets/fuzz_train.rs b/fuzz/fuzz_targets/fuzz_train.rs
index 5d3dada..18581e1 100644
--- a/fuzz/fuzz_targets/fuzz_train.rs
+++ b/fuzz/fuzz_targets/fuzz_train.rs
@@ -3,5 +3,5 @@
 use libfuzzer_sys::fuzz_target;
 
 fuzz_target!(|data: &[u8]| {
-    let _ = fsst::Compressor::train(data);
+    let _ = fsst::Compressor::train(&vec![data]);
 });
diff --git a/src/builder.rs b/src/builder.rs
index 6c612b1..bc6f452 100644
--- a/src/builder.rs
+++ b/src/builder.rs
@@ -206,9 +206,9 @@ impl Counter {
 ///
 /// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf
 #[cfg(not(miri))]
-const MAX_GENERATIONS: usize = 5;
+const GENERATIONS: [usize; 5] = [8usize, 38, 68, 98, 128];
 #[cfg(miri)]
-const MAX_GENERATIONS: usize = 2;
+const GENERATIONS: [usize; 3] = [8usize, 38, 128];
 
 const FSST_SAMPLETARGET: usize = 1 << 14;
 const FSST_SAMPLEMAX: usize = 1 << 15;
@@ -220,6 +220,7 @@ const FSST_SAMPLELINE: usize = 512;
 /// returned slices are pointers into the `sample_buf`.
 ///
 /// SAFETY: sample_buf must be >= FSST_SAMPLEMAX bytes long. Providing something less may cause unexpected failures.
+#[allow(clippy::ptr_arg)]
 fn make_sample<'a, 'b: 'a>(sample_buf: &'a mut Vec<u8>, str_in: &Vec<&'b [u8]>) -> Vec<&'a [u8]> {
     debug_assert!(
         sample_buf.capacity() >= FSST_SAMPLEMAX,
@@ -239,13 +240,13 @@ fn make_sample<'a, 'b: 'a>(sample_buf: &'a mut Vec<u8>, str_in: &Vec<&'b [u8]>)
 
     while sample_buf_offset < sample_lim {
         sample_rnd = fsst_hash(sample_rnd);
-        let mut line_nr = sample_rnd % str_in.len();
+        let mut line_nr = (sample_rnd as usize) % str_in.len();
 
         // Find the first non-empty chunk starting at line_nr, wrapping around if
         // necessary.
         //
         // TODO: this will loop infinitely if there are no non-empty lines in the sample
-        while str_in[line_nr].len() == 0 {
+        while str_in[line_nr].is_empty() {
             if line_nr == str_in.len() {
                 line_nr = 0;
             }
@@ -254,10 +255,9 @@ fn make_sample<'a, 'b: 'a>(sample_buf: &'a mut Vec<u8>, str_in: &Vec<&'b [u8]>)
         let line = str_in[line_nr];
         let chunks = 1 + ((line.len() - 1) / FSST_SAMPLELINE);
         sample_rnd = fsst_hash(sample_rnd);
-        let chunk = FSST_SAMPLELINE * (sample_rnd % chunks);
+        let chunk = FSST_SAMPLELINE * ((sample_rnd as usize) % chunks);
 
         let len = FSST_SAMPLELINE.min(line.len() - chunk);
-        // println!("extending sample with chunk str_in[{line_nr}][{chunk}...len={len}]");
 
         sample_buf.extend_from_slice(&str_in[line_nr][chunk..chunk + len]);
 
@@ -273,7 +273,11 @@ fn make_sample<'a, 'b: 'a>(sample_buf: &'a mut Vec<u8>, str_in: &Vec<&'b [u8]>)
     sample
 }
 
-fn fsst_hash(value: usize) -> usize {
+/// Hash function used in various components of the library.
+///
+/// This is equivalent to the FSST_HASH macro from the C++ implementation.
+#[inline]
+pub(crate) fn fsst_hash(value: u64) -> u64 {
     (value * 2971215073) ^ (value >> 15)
 }
 
@@ -307,56 +311,24 @@ impl Compressor {
     /// code).
     ///
     /// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf
-    pub fn train(corpus: impl AsRef<[u8]>) -> Self {
+    pub fn train(values: &Vec<&[u8]>) -> Self {
+        let mut counters = Counter::new();
         let mut compressor = Compressor::default();
-        // TODO(aduffy): handle truncating/sampling if corpus > requires sample size.
-        let sample = corpus.as_ref();
-        if sample.is_empty() {
-            return compressor;
-        }
-
-        // Make the sample for each iteration.
-        //
-        // The sample is just a vector of slices, so we don't actually have to move anything around.
 
-        let mut counter = Counter::new();
-        for _generation in 0..(MAX_GENERATIONS - 1) {
-            compressor.compress_count(sample, &mut counter);
-            compressor.optimize(&counter, 128);
-            counter.clear();
+        if values.is_empty() {
+            return compressor;
         }
 
-        compressor.compress_count(sample, &mut counter);
-        compressor.optimize(&counter, 128);
-
-        compressor
-    }
-
-    /// Train on a collection of samples.
-    pub fn train_bulk(values: &Vec<&[u8]>) -> Self {
         let mut sample_memory = Vec::with_capacity(FSST_SAMPLEMAX);
         let sample = make_sample(&mut sample_memory, values);
-
-        let mut counters = Counter::new();
-        let mut compressor = Compressor::default();
-
-        for sample_frac in [8usize, 38, 68, 98, 128] {
-            // let mut skips = 0;
-            for i in 0..sample.len() {
-                if sample_frac < 128 {
-                    if fsst_hash(i) & 127 > sample_frac {
-                        // skips += 1;
-                        continue;
-                    }
+        for sample_frac in GENERATIONS {
+            for (i, line) in sample.iter().enumerate() {
+                if sample_frac < 128 && ((fsst_hash(i as u64) & 127) as usize) > sample_frac {
+                    continue;
                 }
 
-                compressor.compress_count(sample[i], &mut counters);
+                compressor.compress_count(line, &mut counters);
             }
-            // println!(
-            //     "sampleFrac={sample_frac} -- skipped {} of {}",
-            //     skips,
-            //     sample.len()
-            // );
 
             compressor.optimize(&counters, sample_frac);
             counters.clear();
@@ -403,6 +375,11 @@ impl Compressor {
                         counter.record_count1(code_u16);
                         if prev_code != MAX_CODE {
                             counter.record_count2(prev_code, code_u16);
+                            // Also record the first byte of the next code
+                            let first_byte_code =
+                                self.symbols[code_u16 as usize].first_byte() as u16;
+                            counter.record_count1(first_byte_code);
+                            counter.record_count2(prev_code, first_byte_code);
                         }
                         prev_code = code_u16;
                     }
@@ -494,9 +471,9 @@ impl Compressor {
 
             // From the c++ impl:
             // "improves both compression speed (less candidates), but also quality!!"
-            if count < 5 * sample_frac / 128 {
-                continue;
-            }
+            // if count < (5 * sample_frac / 128) {
+            //     continue;
+            // }
 
             let mut gain = count * symbol1_len;
             // NOTE: use heuristic from C++ implementation to boost the gain of single-byte symbols.
@@ -512,7 +489,7 @@ impl Compressor {
                 });
             }
 
-            // Skip on last round, or when symbol cannot be extended.
+            // Skip merges on last round, or when symbol cannot be extended.
             if sample_frac >= 128 || symbol1_len == 8 {
                 continue;
             }
@@ -552,19 +529,19 @@ impl Compressor {
         //
         // Note that because of the lossy hash table, we won't accidentally
         // save the same ASCII character twice into the table.
-        // if include_ascii {
-        //     for character in
-        //         " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ[](){}:?/<>".bytes()
-        //     {
-        //         if n_symbols == 255 {
-        //             break;
-        //         }
-
-        //         if self.insert(Symbol::from_u8(character)) {
-        //             n_symbols += 1
-        //         }
-        //     }
-        // }
+        if sample_frac < 128 {
+            for character in
+                " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ[](){}:?/<>".bytes()
+            {
+                if n_symbols == 255 {
+                    break;
+                }
+
+                if self.insert(Symbol::from_u8(character)) {
+                    n_symbols += 1
+                }
+            }
+        }
     }
 }
 
@@ -613,11 +590,13 @@ mod test {
     #[test]
     fn test_builder() {
         // Train a Compressor on the toy string
-        let text = "hello world";
-        let table = Compressor::train(text.as_bytes());
+        let text = b"hello hello hello hello";
+
+        // count of 5 is the cutoff for including a symbol in the table.
+        let table = Compressor::train(&vec![text, text, text, text, text]);
 
         // Use the table to compress a string, see the values
-        let compressed = table.compress(text.as_bytes());
+        let compressed = table.compress(text);
 
         // Ensure that the compressed string has no escape bytes
         assert!(compressed.iter().all(|b| *b != ESCAPE_CODE));
diff --git a/src/lib.rs b/src/lib.rs
index f8e472f..132919c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -105,19 +105,6 @@ impl Symbol {
     }
 }
 
-#[cfg(test)]
-mod test {
-    use crate::Symbol;
-
-    #[test]
-    fn test_concat() {
-        let symbola = Symbol::from_u8(b'a');
-        let symbolb = Symbol::from_u8(b'b');
-        let symbolab = symbola.concat(symbolb);
-        assert_eq!(&symbolab.0.to_le_bytes()[0..symbolab.len()], b"ab");
-    }
-}
-
 impl Debug for Symbol {
     fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
         let slice = &self.0.to_le_bytes()[0..self.len()];
@@ -557,7 +544,7 @@ impl Compressor {
 
 #[inline]
 pub(crate) fn advance_8byte_word(word: u64, bytes: usize) -> u64 {
-    // shift the word off the right-end, because little endian means the first
+    // shift the word off the low-end, because little endian means the first
     // char is stored in the LSB.
     //
     // Note that even though this looks like it branches, Rust compiles this to a
diff --git a/src/lossy_pht.rs b/src/lossy_pht.rs
index e420d86..6afc9cc 100644
--- a/src/lossy_pht.rs
+++ b/src/lossy_pht.rs
@@ -1,5 +1,6 @@
 use std::fmt::Debug;
 
+use crate::builder::fsst_hash;
 use crate::CodeMeta;
 use crate::Symbol;
 use crate::MAX_CODE;
@@ -81,56 +82,33 @@ impl LossyPHT {
     /// True if the symbol was inserted into the table, false if it was rejected due to collision.
     pub(crate) fn insert(&mut self, symbol: Symbol, code: u8) -> bool {
         let prefix_3bytes = symbol.as_u64() & 0xFF_FF_FF;
-        let slot = self.hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1);
-
-        unsafe {
-            let entry = self.slots.as_mut_ptr().add(slot);
-            if (*entry).code.extended_code() != MAX_CODE {
-                // in-use
-                false
-            } else {
-                // unused
-                (*entry).symbol = symbol;
-                (*entry).code = CodeMeta::new_symbol(code, symbol);
-                (*entry).ignored_bits = (64 - 8 * symbol.len()) as u16;
-                true
-            }
+        let slot = fsst_hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1);
+        let entry = &mut self.slots[slot];
+        if !entry.is_unused() {
+            false
+        } else {
+            entry.symbol = symbol;
+            entry.code = CodeMeta::new_symbol(code, symbol);
+            entry.ignored_bits = (64 - 8 * symbol.len()) as u16;
+            true
         }
-
-        // let entry = &mut self.slots[slot];
-        // if !entry.is_unused() {
-        //     false
-        // } else {
-        //     entry.symbol = symbol;
-        //     entry.code = CodeMeta::new_symbol(code, symbol);
-        //     entry.ignored_bits = (64 - 8 * symbol.len()) as u16;
-        //     true
-        // }
     }
 
     /// Remove the symbol from the hashtable, if it exists.
     pub(crate) fn remove(&mut self, symbol: Symbol) {
         let prefix_3bytes = symbol.as_u64() & 0xFF_FF_FF;
-        let slot = self.hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1);
+        let slot = fsst_hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1);
         self.slots[slot].code = CodeMeta::EMPTY;
     }
 
     #[inline]
     pub(crate) fn lookup(&self, word: u64) -> &TableEntry {
         let prefix_3bytes = word & 0xFF_FF_FF;
-        let slot = self.hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1);
+        let slot = fsst_hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1);
 
         // SAFETY: the slot is guaranteed to between 0...(HASH_TABLE_SIZE - 1).
         unsafe { self.slots.get_unchecked(slot) }
     }
-
-    /// Hash a value to find the bucket it belongs in.
-    ///
-    /// The particular hash function comes from the code listing of Algorithm 4 of the FSST paper.
-    #[inline]
-    fn hash(&self, value: u64) -> u64 {
-        (value * 2971215073) ^ (value >> 15)
-    }
 }
 
 impl Default for LossyPHT {
diff --git a/tests/correctness.rs b/tests/correctness.rs
index bded628..946170f 100644
--- a/tests/correctness.rs
+++ b/tests/correctness.rs
@@ -1,6 +1,6 @@
 #![cfg(test)]
 
-use fsst::Compressor;
+use fsst::{Compressor, Symbol};
 
 static PREAMBLE: &str = r#"
 When in the Course of human events, it becomes necessary for one people to dissolve
@@ -16,7 +16,7 @@ static ART_OF_WAR: &str = include_str!("./fixtures/art_of_war.txt");
 #[test]
 fn test_basic() {
     // Roundtrip the declaration
-    let trained = Compressor::train(PREAMBLE);
+    let trained = Compressor::train(&vec![PREAMBLE.as_bytes()]);
     let compressed = trained.compress(PREAMBLE.as_bytes());
     let decompressed = trained.decompressor().decompress(&compressed);
     assert_eq!(decompressed, PREAMBLE.as_bytes());
@@ -24,7 +24,7 @@ fn test_basic() {
 
 #[test]
 fn test_train_on_empty() {
-    let trained = Compressor::train("");
+    let trained = Compressor::train(&vec![]);
     // We can still compress with it, but the symbols are going to be empty.
     let compressed = trained.compress("the quick brown fox jumped over the lazy dog".as_bytes());
     assert_eq!(
@@ -35,7 +35,8 @@ fn test_train_on_empty() {
 
 #[test]
 fn test_one_byte() {
-    let empty = Compressor::train(&[0x01]);
+    let mut empty = Compressor::default();
+    empty.insert(Symbol::from_u8(0x01));
 
     let compressed = empty.compress(&[0x01]);
     assert_eq!(compressed, vec![0u8]);
@@ -46,7 +47,7 @@ fn test_one_byte() {
 #[test]
 fn test_zeros() {
     let training_data: Vec<u8> = vec![0, 1, 2, 3, 4, 0];
-    let trained = Compressor::train(&training_data);
+    let trained = Compressor::train(&vec![&training_data]);
     let compressed = trained.compress(&[4, 0]);
     assert_eq!(trained.decompressor().decompress(&compressed), &[4, 0]);
 }
@@ -56,7 +57,7 @@ fn test_zeros() {
 fn test_large() {
     let corpus: Vec<u8> = DECLARATION.bytes().cycle().take(10_240).collect();
 
-    let trained = Compressor::train(&corpus);
+    let trained = Compressor::train(&vec![&corpus]);
     let massive: Vec<u8> = DECLARATION
         .bytes()
         .cycle()
@@ -69,7 +70,7 @@ fn test_large() {
 
 #[test]
 fn test_chinese() {
-    let trained = Compressor::train(ART_OF_WAR.as_bytes());
+    let trained = Compressor::train(&vec![ART_OF_WAR.as_bytes()]);
     assert_eq!(
         ART_OF_WAR.as_bytes(),
         trained