add: ngram counter (#5)

shenxiangzhuang · Apr 18, 2024 · 60b1ff0 · 60b1ff0
1 parent 31f5a13
commit 60b1ff0
Show file tree

Hide file tree

Showing 4 changed files with 150 additions and 0 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -12,6 +12,7 @@ crate-type = ["cdylib"]
 cached = "0.49.3"
 regex = "1.10.4"
 lazy_static = "1.4.0"
+counter = "0.5.7"
 
 [dependencies.pyo3]
 version = "0.21.1"

diff --git a/benchmark/py_bleu.py b/benchmark/py_bleu.py
@@ -0,0 +1,112 @@
+# Copyright 2017 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Python implementation of BLEU and smooth-BLEU.
+
+This module provides a Python implementation of BLEU and smooth-BLEU.
+Smooth BLEU is computed following the method outlined in the paper:
+Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
+evaluation metrics for machine translation. COLING 2004.
+"""
+
+import collections
+import math
+
+
+def _get_ngrams(segment, max_order):
+    """Extracts all n-grams upto a given maximum order from an input segment.
+  
+    Args:
+      segment: text segment from which n-grams will be extracted.
+      max_order: maximum length in tokens of the n-grams returned by this
+          methods.
+  
+    Returns:
+      The Counter containing all n-grams upto max_order in segment
+      with a count of how many times each n-gram occurred.
+    """
+    ngram_counts = collections.Counter()
+    for order in range(1, max_order + 1):
+        for i in range(0, len(segment) - order + 1):
+            ngram = tuple(segment[i:i+order])
+            ngram_counts[ngram] += 1
+    return ngram_counts
+
+
+def compute_bleu(reference_corpus, translation_corpus, max_order=4,
+                 smooth=False):
+    """Computes BLEU score of translated segments against one or more references.
+  
+    Args:
+      reference_corpus: list of lists of references for each translation. Each
+          reference should be tokenized into a list of tokens.
+      translation_corpus: list of translations to score. Each translation
+          should be tokenized into a list of tokens.
+      max_order: Maximum n-gram order to use when computing BLEU score.
+      smooth: Whether or not to apply Lin et al. 2004 smoothing.
+  
+    Returns:
+      3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram
+      precisions and brevity penalty.
+    """
+    matches_by_order = [0] * max_order
+    possible_matches_by_order = [0] * max_order
+    reference_length = 0
+    translation_length = 0
+    for (references, translation) in zip(reference_corpus,
+                                         translation_corpus):
+        reference_length += min(len(r) for r in references)
+        translation_length += len(translation)
+
+        merged_ref_ngram_counts = collections.Counter()
+        for reference in references:
+            merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
+        translation_ngram_counts = _get_ngrams(translation, max_order)
+        overlap = translation_ngram_counts & merged_ref_ngram_counts
+        for ngram in overlap:
+            matches_by_order[len(ngram)-1] += overlap[ngram]
+        for order in range(1, max_order+1):
+            possible_matches = len(translation) - order + 1
+            if possible_matches > 0:
+                possible_matches_by_order[order-1] += possible_matches
+
+    precisions = [0] * max_order
+    for i in range(0, max_order):
+        if smooth:
+            precisions[i] = ((matches_by_order[i] + 1.) /
+                             (possible_matches_by_order[i] + 1.))
+        else:
+            if possible_matches_by_order[i] > 0:
+                precisions[i] = (float(matches_by_order[i]) /
+                                 possible_matches_by_order[i])
+            else:
+                precisions[i] = 0.0
+
+    if min(precisions) > 0:
+        p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
+        geo_mean = math.exp(p_log_sum)
+    else:
+        geo_mean = 0
+
+    ratio = float(translation_length) / reference_length
+
+    if ratio > 1.0:
+        bp = 1.
+    else:
+        bp = math.exp(1 - 1. / ratio)
+
+    bleu = geo_mean * bp
+
+    return (bleu, precisions, bp, ratio, translation_length, reference_length)
diff --git a/src/lib.rs b/src/lib.rs
@@ -1,4 +1,5 @@
 mod tokenizer;
+mod ngram;
 
 use pyo3::prelude::*;
 use crate::tokenizer::Tokenizer;

diff --git a/src/ngram.rs b/src/ngram.rs
@@ -0,0 +1,36 @@
+use counter::Counter;
+
+
+/// Counts every character n-gram of `line` whose length is between 1 and
+/// `max_order` characters (inclusive).
+///
+/// Each key of the returned counter is a sub-slice of `line`; its value is the
+/// number of positions at which that n-gram starts in `line`.
+///
+/// Orders longer than the number of characters in `line` contribute nothing,
+/// so an empty `line` or a `max_order` of 0 yields an empty counter instead of
+/// panicking. N-grams are cut on `char` boundaries, so non-ASCII input is
+/// handled without slicing through a multi-byte character.
+pub fn get_ngram_counter(line: &str, max_order: usize) -> Counter<&str> {
+    let mut counts: Counter<&str> = Counter::new();
+
+    // Byte offset of every char boundary, including the end of the string, so
+    // that `boundaries[i]..boundaries[i + k]` always spans exactly k chars.
+    let boundaries: Vec<usize> = line
+        .char_indices()
+        .map(|(offset, _)| offset)
+        .chain(std::iter::once(line.len()))
+        .collect();
+    let char_count = boundaries.len() - 1;
+
+    // Clamp the order to the line length: `len - order` must never underflow.
+    for order in 1..=max_order.min(char_count) {
+        for window in boundaries.windows(order + 1) {
+            let ngram = &line[window[0]..window[order]];
+            counts[&ngram] += 1;
+        }
+    }
+    counts
+}
+
+
+#[cfg(test)]
+mod test {
+    use crate::ngram::get_ngram_counter;
+
+    /// Checks the counts reported for the sample line "aabc" with orders 1
+    /// through 4, including n-grams that do not occur and are expected to
+    /// read back as zero.
+    #[test]
+    fn test_get_ngram() {
+        let ngram_counts = get_ngram_counter("aabc", 4);
+
+        let expected_counts = [
+            // order 1
+            ("a", 2),
+            ("b", 1),
+            ("c", 1),
+            ("d", 0),
+            // order 2
+            ("aa", 1),
+            ("ab", 1),
+            ("bc", 1),
+            ("ac", 0),
+            // orders 3 and 4
+            ("aab", 1),
+            ("aabc", 1),
+        ];
+
+        for &(ngram, expected) in expected_counts.iter() {
+            assert_eq!(ngram_counts[&ngram], expected);
+        }
+    }
+}