From 99423a64543f837e78b842f2ff21dd6334633905 Mon Sep 17 00:00:00 2001 From: Xiangzhuang Shen Date: Mon, 27 May 2024 11:13:04 +0800 Subject: [PATCH 1/2] add: changelog --- CHANGELOG.md | 2 ++ Cargo.toml | 3 +-- src/ngram.rs | 54 +++++++++------------------------------------------- 3 files changed, 12 insertions(+), 47 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a427ad5..f714104 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed +- `ngram` bench use counter lib's function rather than the truly used function. ### Changed - Use AHash in ngram module diff --git a/Cargo.toml b/Cargo.toml index 24f5b88..0012c26 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,10 +15,9 @@ name = "bleuscore" crate-type = ["cdylib", "rlib"] [dependencies] -cached = "0.50.0" +cached = "0.51.3" regex = "1.10.4" lazy_static = "1.4.0" -counter = "0.5.7" rayon = "1.10.0" ahash = "0.8.11" diff --git a/src/ngram.rs b/src/ngram.rs index 260d932..18f2573 100644 --- a/src/ngram.rs +++ b/src/ngram.rs @@ -1,5 +1,4 @@ use ahash::AHashMap; -use counter::Counter; /// Here the tokens' type is `&[String]` rather than `&Vec<String>` /// to fix `clippy::not_unsafe_ptr_arg_deref` error. 
@@ -17,24 +16,10 @@ pub fn get_token_ngram_counter(tokens: &[String], max_order: usize) -> AHashMap< count_map } -/// TODO: change to use Counter to count ngram -#[allow(dead_code)] -fn get_ngram_counter(line: &str, max_order: usize) -> Counter<&str> { - let mut counts: Counter<&str> = Counter::new(); - for order in 1..=max_order { - for start_index in 0..(line.len().saturating_sub(order - 1)) { - // println!("line: {}, start_index: {}, order: {}", line, start_index, order); - let ngram = &line[start_index..(start_index + order)]; - // println!("ngram: {}", ngram); - counts[&ngram] += 1; - } - } - counts -} #[cfg(test)] mod test { - use crate::ngram::{get_ngram_counter, get_token_ngram_counter}; + use crate::ngram::get_token_ngram_counter; #[test] fn test_get_token_ngram_short() { @@ -69,47 +54,26 @@ mod test { assert_eq!(counter.len(), 9); } - - #[test] - fn test_get_ngram_short() { - let counter = get_ngram_counter("ab", 4); - assert_eq!(counter[&"a"], 1); - assert_eq!(counter[&"b"], 1); - assert_eq!(counter[&"ab"], 1); - } - - #[test] - fn test_get_ngram_long() { - let counter = get_ngram_counter("aabc", 4); - assert_eq!(counter[&"a"], 2); - assert_eq!(counter[&"b"], 1); - assert_eq!(counter[&"c"], 1); - assert_eq!(counter[&"d"], 0); - - assert_eq!(counter[&"aa"], 1); - assert_eq!(counter[&"ab"], 1); - assert_eq!(counter[&"bc"], 1); - assert_eq!(counter[&"ac"], 0); - - assert_eq!(counter[&"aab"], 1); - assert_eq!(counter[&"aabc"], 1); - } } #[cfg(test)] mod benchmark { - use crate::ngram::get_ngram_counter; + use crate::ngram::get_token_ngram_counter; use test::Bencher; #[bench] fn bench_ngram(b: &mut Bencher) { - let line = "aabc"; - let max_order = 4; + let tokens: Vec<String> = vec![ + "a".to_string(), + "a".to_string(), + "b".to_string(), + "c".to_string(), + ]; let max_order = 4; let iter_num: usize = 100; b.iter(|| { std::hint::black_box(for _ in 1..=iter_num { - get_ngram_counter(line, max_order); + get_token_ngram_counter(&tokens, max_order); }); }); } From 
8f16ecac3fea0901ce88cf7c862c682aeaba8191 Mon Sep 17 00:00:00 2001 From: Xiangzhuang Shen Date: Mon, 27 May 2024 11:13:51 +0800 Subject: [PATCH 2/2] fix: cargo fmt --- src/ngram.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ngram.rs b/src/ngram.rs index 18f2573..0b02193 100644 --- a/src/ngram.rs +++ b/src/ngram.rs @@ -16,7 +16,6 @@ pub fn get_token_ngram_counter(tokens: &[String], max_order: usize) -> AHashMap< count_map } - #[cfg(test)] mod test { use crate::ngram::get_token_ngram_counter; @@ -68,7 +67,8 @@ mod benchmark { "a".to_string(), "b".to_string(), "c".to_string(), - ]; let max_order = 4; + ]; + let max_order = 4; let iter_num: usize = 100; b.iter(|| {