Implement deterministic option for rouge metric
PiperOrigin-RevId: 541002628
T5 Team authored and t5-copybara committed Jun 16, 2023
1 parent 6b000ae commit 258fd30
Showing 6 changed files with 2,117 additions and 14 deletions.
65 changes: 51 additions & 14 deletions t5/evaluation/metrics.py
@@ -73,33 +73,37 @@ def bleu(targets, predictions, tokenizer="intl"):
   return {"bleu": bleu_score.score}
 
 
-def rouge(targets,
-          predictions,
-          score_keys=("rouge1", "rouge2", "rougeLsum"),
-          **kwargs):
-  """Computes rouge score.
+def _prepare_summary_rouge(summary):
+  # Make sure the summary is not bytes-type
+  # Add newlines between sentences so that rougeLsum is computed correctly.
+  summary = summary.replace(" . ", " .\n")
+  return summary
+
+
+def rouge(
+    targets,
+    predictions,
+    score_keys=("rouge1", "rouge2", "rougeLsum"),
+    **kwargs,
+):
+  """Computes rouge score nondeterministically using the bootstrap.
 
   Args:
     targets: list of strings
     predictions: list of strings
     score_keys: list of strings with the keys to compute.
     **kwargs: additional keyword arguments for RougeScorer.
 
   Returns:
     dict with score_key: rouge score across all targets and predictions
   """
 
   scorer = rouge_scorer.RougeScorer(rouge_types=score_keys, **kwargs)
   aggregator = scoring.BootstrapAggregator()
 
-  def _prepare_summary(summary):
-    # Make sure the summary is not bytes-type
-    # Add newlines between sentences so that rougeLsum is computed correctly.
-    summary = summary.replace(" . ", " .\n")
-    return summary
-
   for prediction, target in zip(predictions, targets):
-    target = _prepare_summary(target)
-    prediction = _prepare_summary(prediction)
+    target = _prepare_summary_rouge(target)
+    prediction = _prepare_summary_rouge(prediction)
     aggregator.add_scores(scorer.score(target=target, prediction=prediction))
   result = aggregator.aggregate()
   for key in score_keys:
@@ -113,6 +117,40 @@ def _prepare_summary(summary):
   return {key: result[key].mid.fmeasure*100 for key in score_keys}
 
 
+def rouge_mean(
+    targets,
+    predictions,
+    score_keys=("rouge1", "rouge2", "rougeLsum"),
+    **kwargs,
+):
+  """Computes rouge score deterministically (no bootstrap).
+
+  Args:
+    targets: list of strings
+    predictions: list of strings
+    score_keys: list of strings with the keys to compute
+    **kwargs: additional keyword arguments for RougeScorer.
+
+  Returns:
+    dict with score_key: rouge score across all targets and predictions
+  """
+
+  scorer = rouge_scorer.RougeScorer(rouge_types=score_keys, **kwargs)
+  count = 0
+  sum_scores = collections.defaultdict(float)
+  for prediction, target in zip(predictions, targets):
+    target = _prepare_summary_rouge(target)
+    prediction = _prepare_summary_rouge(prediction)
+    scores = scorer.score(target=target, prediction=prediction)
+    count += 1
+    for k, v in scores.items():
+      sum_scores[k] += v.fmeasure
+  if count == 0:
+    raise ValueError("Predictions and targets must both have nonzero length")
+  result = {k: v / count for k, v in sum_scores.items()}
+  return {key: result[key] * 100 for key in score_keys}
+
+
 def span_squad(targets, predictions):
   """Computes SQuAD metrics for span prediction tasks.
@@ -735,4 +773,3 @@ def merge(self, other: "ShardedSquad") -> "ShardedSquad":
 
   def compute(self):
     return {"f1": self.f1, "em": self.em}
-
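For orientation, here is a minimal usage sketch of the two variants this commit leaves in metrics.py (the example strings are hypothetical):

from t5.evaluation import metrics

targets = ["the cat sat on the mat .", "it was sunny today ."]
predictions = ["the cat sat on a mat .", "it was rainy today ."]

# Bootstrap-aggregated mid f-measures; the BootstrapAggregator
# resamples per-example scores, so results vary across runs.
print(metrics.rouge(targets, predictions))

# Plain average of per-example f-measures; deterministic for fixed inputs.
print(metrics.rouge_mean(targets, predictions))

Both return a dict keyed by score_keys with values scaled to 0-100.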
8 changes: 8 additions & 0 deletions t5/evaluation/metrics_test.py
@@ -49,12 +49,20 @@ def test_same_rouge(self):
     self.assertDictClose(
         metrics.rouge([ref, ref], [ref, ref]),
         {"rouge1": 100, "rouge2": 100, "rougeLsum": 100})
+    self.assertDictClose(
+        metrics.rouge_mean([ref, ref], [ref, ref]),
+        {"rouge1": 100, "rouge2": 100, "rougeLsum": 100},
+    )
 
   def test_different_rouge(self):
     ref = "this is a string"
     self.assertDictClose(
         metrics.rouge([ref, ref], ["", ""]),
         {"rouge1": 0, "rouge2": 0, "rougeLsum": 0})
+    self.assertDictClose(
+        metrics.rouge_mean([ref, ref], ["", ""]),
+        {"rouge1": 0, "rouge2": 0, "rougeLsum": 0},
+    )
 
   def test_same_squad(self):
     ref = "this is a string"
57 changes: 57 additions & 0 deletions t5/evaluation/scoring_test.py
@@ -0,0 +1,57 @@
+# Copyright 2023 The T5 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for equivalent scores between different t5.evaluation.metrics."""
+
+import json
+import os
+
+from absl.testing import absltest
+from t5.evaluation import metrics
+from t5.evaluation import test_utils
+
+# Delta for matching rouge values between the different scorers.
+_DELTA = 0.5
+
+_TESTDATA_PREFIX = os.path.join(os.path.dirname(__file__), "testdata")
+
+_LARGE_TARGETS_FILE = os.path.join(_TESTDATA_PREFIX, "target_large.txt")
+
+_LARGE_PREDICTIONS_FILE = os.path.join(_TESTDATA_PREFIX, "prediction_large.txt")
+
+_EXPECTED_RESULTS_FILE = os.path.join(
+    _TESTDATA_PREFIX, "expected_bootstrap_results.json"
+)
+
+
+class ScoringTest(test_utils.BaseMetricsTest):
+
+  def setUp(self):
+    super(ScoringTest, self).setUp()
+    with open(_LARGE_TARGETS_FILE, "r") as f:
+      self.targets = f.readlines()
+    with open(_LARGE_PREDICTIONS_FILE, "r") as f:
+      self.predictions = f.readlines()
+    with open(_EXPECTED_RESULTS_FILE, "r") as f:
+      self.expected_bootstrap_result = json.load(f)
+
+  def test_rouge_variants(self):
+    mean_result = metrics.rouge_mean(self.targets, self.predictions)
+    self.assertDictClose(
+        mean_result, self.expected_bootstrap_result, delta=_DELTA
+    )
+
+
+if __name__ == "__main__":
+  absltest.main()
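The test above pins the deterministic mean to the stored bootstrap results within _DELTA = 0.5 ROUGE points on a large sample. For a single key, rouge_mean reduces to a plain average of per-example f-measures; a hand-rolled sketch using the rouge_score package (which metrics.py already imports as rouge_scorer):

from rouge_score import rouge_scorer

def mean_rouge1(targets, predictions):
  # Score each (target, prediction) pair and average the f-measures.
  # No resampling is involved, so repeated runs give identical results.
  # (metrics.rouge_mean additionally newline-splits sentences, which
  # only affects rougeLsum, not rouge1.)
  scorer = rouge_scorer.RougeScorer(rouge_types=["rouge1"])
  fmeasures = [
      scorer.score(target=t, prediction=p)["rouge1"].fmeasure
      for t, p in zip(targets, predictions)
  ]
  return 100 * sum(fmeasures) / len(fmeasures)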
1 change: 1 addition & 0 deletions t5/evaluation/testdata/expected_bootstrap_results.json
@@ -0,0 +1 @@
{"rouge1": 38.517523467350486, "rouge2": 22.1524010821563, "rougeLsum": 38.362646331618514}
The remaining two changed files, t5/evaluation/testdata/target_large.txt and t5/evaluation/testdata/prediction_large.txt (the large test fixtures referenced by scoring_test.py), did not render in this view.
