add: sacrebleu benchmark
shenxiangzhuang committed Apr 23, 2024
1 parent c6167a5 commit d03b522
Showing 6 changed files with 27 additions and 3 deletions.
9 changes: 9 additions & 0 deletions README.md
@@ -48,4 +48,13 @@ print(results)
## Benchmark

### Simple
We use the demo data shown in the quick start for this simple benchmark.
See [benchmark/simple](./benchmark/simple) for the benchmark source code.

[//]: # (https://app.warp.dev/block/Mt8BOS3rllMuryMkcI4Gr5)
![img.png](asset/benchmark/simple.png)


Note that bleuscore gets the same result as huggingface evaluate, while sacrebleu gets a different result.
(The reason may be related to implementation details in sacrebleu.)
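
For reference, a minimal sketch of the huggingface evaluate side of the comparison, using the same demo data (the actual contents of `simple/hf_evaluate.py` may differ slightly):

```python
import evaluate

# Demo data from the quick start.
predictions = ["hello there general kenobi", "foo bar foobar"]
references = [
    ["hello there general kenobi", "hello there !"],
    ["foo bar foobar"],
]

# huggingface evaluate groups references per sentence.
bleu = evaluate.load("bleu")
results = bleu.compute(predictions=predictions, references=references)
print(results)
```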

Binary file modified asset/benchmark/simple.png
5 changes: 4 additions & 1 deletion benchmark/bench.sh
@@ -1 +1,4 @@
hyperfine --warmup 5 --runs 100 "python simple/rs_bleuscore.py" "python simple/hf_evaluate.py"
hyperfine --warmup 5 --runs 100 \
"python simple/rs_bleuscore.py" \
"python simple/sacre_bleu.py" \
"python simple/hf_evaluate.py"
11 changes: 11 additions & 0 deletions benchmark/simple/sacre_bleu.py
@@ -0,0 +1,11 @@
from sacrebleu.metrics import BLEU


predictions = ["hello there general kenobi", "foo bar foobar"]
# NOTE: these references are grouped per sentence, matching huggingface
# evaluate's layout; sacrebleu's corpus_score expects one list per
# reference stream instead, which may explain the score difference
# noted in the README.
references = [
    ["hello there general kenobi", "hello there !"],
    ["foo bar foobar"]
]

bleu = BLEU(smooth_method="none", max_ngram_order=4, tokenize="13a")
results = bleu.corpus_score(predictions, references)
print(results)
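
Note that sacrebleu documents `corpus_score` references as one list per reference *stream* rather than per sentence, transposed relative to the layout above. A sketch of that layout for the same data, assuming sacrebleu 2.x accepts `None` padding for a missing reference:

```python
from sacrebleu.metrics import BLEU

predictions = ["hello there general kenobi", "foo bar foobar"]
# One list per reference stream; None marks a missing reference.
references = [
    ["hello there general kenobi", "foo bar foobar"],
    ["hello there !", None],
]

bleu = BLEU(smooth_method="none", max_ngram_order=4, tokenize="13a")
print(bleu.corpus_score(predictions, references))
```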
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -37,7 +37,7 @@ Homepage = 'https://github.com/shenxiangzhuang/bleuscore'
Source = 'https://github.com/shenxiangzhuang/bleuscore'

[project.optional-dependencies]
test = ["pytest", "pytest-sugar", "hypothesis", "evaluate"]
test = ["pytest", "pytest-sugar", "hypothesis", "evaluate", "sacrebleu"]
lint = ["black", "ruff~=0.3.7"]
#docs = []
#dev = []
3 changes: 2 additions & 1 deletion src/bleu.rs
@@ -14,7 +14,8 @@ pub struct BleuScore {
pub reference_length: usize,
}

/// compute the BLEU score with `Tokenizer13a` as default tokenizer
/// compute the BLEU score with `Tokenizer13a` as default tokenizer.
/// The implementation is based on [huggingface/evaluate](https://github.com/huggingface/evaluate/blob/main/metrics/bleu/bleu.py).
pub fn compute_score(
references: Vec<Vec<String>>,
predictions: Vec<String>,
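
For completeness, a hypothetical sketch of how `simple/rs_bleuscore.py` might drive this function from Python; the `compute` entry point and its keyword arguments are assumptions, not confirmed by this diff, so check the published bleuscore API:

```python
# Hypothetical: `compute` and its keywords are assumed, not taken from this diff.
from bleuscore import compute

predictions = ["hello there general kenobi", "foo bar foobar"]
references = [
    ["hello there general kenobi", "hello there !"],
    ["foo bar foobar"],
]

results = compute(predictions=predictions, references=references,
                  max_order=4, smooth=True)
print(results)
```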
