Skip to content

Commit

Permalink
add: ngram counter (#5)
Browse files Browse the repository at this point in the history
  • Loading branch information
shenxiangzhuang authored Apr 18, 2024
1 parent 31f5a13 commit 60b1ff0
Show file tree
Hide file tree
Showing 4 changed files with 150 additions and 0 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ crate-type = ["cdylib"]
cached = "0.49.3"
regex = "1.10.4"
lazy_static = "1.4.0"
counter = "0.5.7"

[dependencies.pyo3]
version = "0.21.1"
Expand Down
112 changes: 112 additions & 0 deletions benchmark/py_bleu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Python implementation of BLEU and smooth-BLEU.
This module provides a Python implementation of BLEU and smooth-BLEU.
Smooth BLEU is computed following the method outlined in the paper:
Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
evaluation metrics for machine translation. COLING 2004.
"""

import collections
import math


def _get_ngrams(segment, max_order):
"""Extracts all n-grams upto a given maximum order from an input segment.
Args:
segment: text segment from which n-grams will be extracted.
max_order: maximum length in tokens of the n-grams returned by this
methods.
Returns:
The Counter containing all n-grams upto max_order in segment
with a count of how many times each n-gram occurred.
"""
ngram_counts = collections.Counter()
for order in range(1, max_order + 1):
for i in range(0, len(segment) - order + 1):
ngram = tuple(segment[i:i+order])
ngram_counts[ngram] += 1
return ngram_counts


def compute_bleu(reference_corpus, translation_corpus, max_order=4,
                 smooth=False):
    """Computes BLEU score of translated segments against one or more references.

    Args:
      reference_corpus: list of lists of references for each translation. Each
        reference should be tokenized into a list of tokens.
      translation_corpus: list of translations to score. Each translation
        should be tokenized into a list of tokens.
      max_order: Maximum n-gram order to use when computing BLEU score.
      smooth: Whether or not to apply Lin et al. 2004 smoothing.

    Returns:
      6-Tuple (bleu, precisions, bp, ratio, translation_length,
      reference_length) where:
        bleu: the BLEU score (geometric mean of precisions times bp).
        precisions: list with one n-gram precision per order.
        bp: the brevity penalty.
        ratio: translation_length / reference_length. When reference_length
          is 0 this is 0.0 for an empty translation and float("inf")
          otherwise, instead of raising ZeroDivisionError.
        translation_length: total number of tokens over all translations.
        reference_length: sum over segments of the shortest reference length.
    """
    matches_by_order = [0] * max_order
    possible_matches_by_order = [0] * max_order
    reference_length = 0
    translation_length = 0
    for (references, translation) in zip(reference_corpus,
                                         translation_corpus):
        # The shortest reference of each segment contributes to the
        # reference length.
        reference_length += min(len(r) for r in references)
        translation_length += len(translation)

        # Counter union keeps, per n-gram, the maximum count seen in any
        # single reference.
        merged_ref_ngram_counts = collections.Counter()
        for reference in references:
            merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
        translation_ngram_counts = _get_ngrams(translation, max_order)
        # Counter intersection clips each translation n-gram count to the
        # reference count.
        overlap = translation_ngram_counts & merged_ref_ngram_counts
        for ngram in overlap:
            matches_by_order[len(ngram) - 1] += overlap[ngram]
        for order in range(1, max_order + 1):
            possible_matches = len(translation) - order + 1
            if possible_matches > 0:
                possible_matches_by_order[order - 1] += possible_matches

    precisions = [0] * max_order
    for i in range(0, max_order):
        if smooth:
            precisions[i] = ((matches_by_order[i] + 1.) /
                             (possible_matches_by_order[i] + 1.))
        else:
            if possible_matches_by_order[i] > 0:
                precisions[i] = (float(matches_by_order[i]) /
                                 possible_matches_by_order[i])
            else:
                precisions[i] = 0.0

    if min(precisions) > 0:
        p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
        geo_mean = math.exp(p_log_sum)
    else:
        geo_mean = 0

    # Guard the division: an empty corpus or empty references give a
    # reference length of zero.
    if reference_length > 0:
        ratio = float(translation_length) / reference_length
    elif translation_length > 0:
        ratio = float("inf")
    else:
        ratio = 0.0

    if ratio > 1.0:
        bp = 1.
    elif ratio > 0:
        bp = math.exp(1 - 1. / ratio)
    else:
        # Empty translation: exp(1 - 1/ratio) tends to 0 as ratio -> 0+, and
        # evaluating it directly would divide by zero.
        bp = 0.

    bleu = geo_mean * bp

    return (bleu, precisions, bp, ratio, translation_length, reference_length)
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
mod tokenizer;
mod ngram;

use pyo3::prelude::*;
use crate::tokenizer::Tokenizer;
Expand Down
36 changes: 36 additions & 0 deletions src/ngram.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
use counter::Counter;


/// Counts every character n-gram of `line` for orders `1..=max_order`.
///
/// N-grams are contiguous substrings measured in characters (not bytes), so
/// multi-byte UTF-8 text is sliced only on character boundaries. Orders longer
/// than the number of characters in `line` contribute nothing; an empty `line`
/// or `max_order == 0` therefore yields an empty counter instead of panicking.
///
/// The returned counter borrows its keys from `line`.
pub fn get_ngram_counter(line: &str, max_order: usize) -> Counter<&str> {
    let mut counts: Counter<&str> = Counter::new();

    // Byte offset of every character boundary, including the end of the
    // string, so that `boundaries[i]..boundaries[i + order]` is always a valid
    // slice covering exactly `order` characters.
    let boundaries: Vec<usize> = line
        .char_indices()
        .map(|(offset, _)| offset)
        .chain(std::iter::once(line.len()))
        .collect();
    let char_count = boundaries.len() - 1;

    // Capping the order at `char_count` avoids the `usize` underflow that
    // `line.len() - order` would hit when `max_order` exceeds the input length.
    for order in 1..=max_order.min(char_count) {
        for window in boundaries.windows(order + 1) {
            let ngram = &line[window[0]..window[order]];
            counts[&ngram] += 1;
        }
    }
    counts
}


#[cfg(test)]
mod test {
    use crate::ngram::get_ngram_counter;

    /// Verifies the counts reported for `"aabc"` across all n-gram orders,
    /// including n-grams that never occur and must report a count of zero.
    #[test]
    fn test_get_ngram() {
        let counter = get_ngram_counter("aabc", 4);

        // (n-gram, expected number of occurrences), grouped by order.
        let expected_counts = [
            // unigrams
            ("a", 2),
            ("b", 1),
            ("c", 1),
            ("d", 0),
            // bigrams
            ("aa", 1),
            ("ab", 1),
            ("bc", 1),
            ("ac", 0),
            // trigram and 4-gram
            ("aab", 1),
            ("aabc", 1),
        ];

        for &(ngram, expected) in expected_counts.iter() {
            assert_eq!(
                counter[&ngram],
                expected,
                "unexpected count for n-gram {:?}",
                ngram
            );
        }
    }
}

0 comments on commit 60b1ff0

Please sign in to comment.