Implement deterministic option for rouge metric
PiperOrigin-RevId: 541002628
T5 Team authored and t5-copybara committed Jun 16, 2023
1 parent 6b000ae commit 258fd30
Showing 6 changed files with 2,117 additions and 14 deletions.
65 changes: 51 additions & 14 deletions t5/evaluation/metrics.py
@@ -73,33 +73,37 @@ def bleu(targets, predictions, tokenizer="intl"):
   return {"bleu": bleu_score.score}
 
 
-def rouge(targets,
-          predictions,
-          score_keys=("rouge1", "rouge2", "rougeLsum"),
-          **kwargs):
-  """Computes rouge score.
+def _prepare_summary_rouge(summary):
+  # Make sure the summary is not bytes-type
+  # Add newlines between sentences so that rougeLsum is computed correctly.
+  summary = summary.replace(" . ", " .\n")
+  return summary
+
+
+def rouge(
+    targets,
+    predictions,
+    score_keys=("rouge1", "rouge2", "rougeLsum"),
+    **kwargs,
+):
+  """Computes rouge score nondeterministically using the bootstrap.
 
   Args:
     targets: list of strings
     predictions: list of strings
     score_keys: list of strings with the keys to compute.
     **kwargs: additional keyword arguments for RougeScorer.
 
   Returns:
     dict with score_key: rouge score across all targets and predictions
   """
 
   scorer = rouge_scorer.RougeScorer(rouge_types=score_keys, **kwargs)
   aggregator = scoring.BootstrapAggregator()
 
-  def _prepare_summary(summary):
-    # Make sure the summary is not bytes-type
-    # Add newlines between sentences so that rougeLsum is computed correctly.
-    summary = summary.replace(" . ", " .\n")
-    return summary
-
   for prediction, target in zip(predictions, targets):
-    target = _prepare_summary(target)
-    prediction = _prepare_summary(prediction)
+    target = _prepare_summary_rouge(target)
+    prediction = _prepare_summary_rouge(prediction)
     aggregator.add_scores(scorer.score(target=target, prediction=prediction))
   result = aggregator.aggregate()
   for key in score_keys:
@@ -113,6 +117,40 @@ def _prepare_summary(summary):
   return {key: result[key].mid.fmeasure*100 for key in score_keys}
 
 
+def rouge_mean(
+    targets,
+    predictions,
+    score_keys=("rouge1", "rouge2", "rougeLsum"),
+    **kwargs,
+):
+  """Computes rouge score deterministically (no bootstrap).
+
+  Args:
+    targets: list of strings
+    predictions: list of strings
+    score_keys: list of strings with the keys to compute
+    **kwargs: additional keyword arguments for RougeScorer.
+
+  Returns:
+    dict with score_key: rouge score across all targets and predictions
+  """
+
+  scorer = rouge_scorer.RougeScorer(rouge_types=score_keys, **kwargs)
+  count = 0
+  sum_scores = collections.defaultdict(float)
+  for prediction, target in zip(predictions, targets):
+    target = _prepare_summary_rouge(target)
+    prediction = _prepare_summary_rouge(prediction)
+    scores = scorer.score(target=target, prediction=prediction)
+    count += 1
+    for k, v in scores.items():
+      sum_scores[k] += v.fmeasure
+  if count == 0:
+    raise ValueError("Predictions and targets must both have nonzero length")
+  result = {k: v / count for k, v in sum_scores.items()}
+  return {key: result[key] * 100 for key in score_keys}
+
+
 def span_squad(targets, predictions):
   """Computes SQuAD metrics for span prediction tasks.
@@ -735,4 +773,3 @@ def merge(self, other: "ShardedSquad") -> "ShardedSquad":
 
   def compute(self):
     return {"f1": self.f1, "em": self.em}
-
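For orientation, here is a minimal usage sketch of the two variants this commit leaves in metrics.py (the example strings are hypothetical):

from t5.evaluation import metrics

targets = ["the cat sat on the mat .", "it was sunny today ."]
predictions = ["the cat sat on a mat .", "it was rainy today ."]

# Bootstrap-aggregated mid f-measures; the BootstrapAggregator
# resamples per-example scores, so results vary across runs.
print(metrics.rouge(targets, predictions))

# Plain average of per-example f-measures; deterministic for fixed inputs.
print(metrics.rouge_mean(targets, predictions))

Both return a dict keyed by score_keys with values scaled to 0-100.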
8 changes: 8 additions & 0 deletions t5/evaluation/metrics_test.py
@@ -49,12 +49,20 @@ def test_same_rouge(self):
     self.assertDictClose(
         metrics.rouge([ref, ref], [ref, ref]),
         {"rouge1": 100, "rouge2": 100, "rougeLsum": 100})
+    self.assertDictClose(
+        metrics.rouge_mean([ref, ref], [ref, ref]),
+        {"rouge1": 100, "rouge2": 100, "rougeLsum": 100},
+    )
 
   def test_different_rouge(self):
     ref = "this is a string"
     self.assertDictClose(
         metrics.rouge([ref, ref], ["", ""]),
         {"rouge1": 0, "rouge2": 0, "rougeLsum": 0})
+    self.assertDictClose(
+        metrics.rouge_mean([ref, ref], ["", ""]),
+        {"rouge1": 0, "rouge2": 0, "rougeLsum": 0},
+    )
 
   def test_same_squad(self):
     ref = "this is a string"
57 changes: 57 additions & 0 deletions t5/evaluation/scoring_test.py
@@ -0,0 +1,57 @@
+# Copyright 2023 The T5 Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for equivalent scores between different t5.evaluation.metrics."""
+
+import json
+import os
+
+from absl.testing import absltest
+from t5.evaluation import metrics
+from t5.evaluation import test_utils
+
+# Delta for matching rouge values between the different scorers.
+_DELTA = 0.5
+
+_TESTDATA_PREFIX = os.path.join(os.path.dirname(__file__), "testdata")
+
+_LARGE_TARGETS_FILE = os.path.join(_TESTDATA_PREFIX, "target_large.txt")
+
+_LARGE_PREDICTIONS_FILE = os.path.join(_TESTDATA_PREFIX, "prediction_large.txt")
+
+_EXPECTED_RESULTS_FILE = os.path.join(
+    _TESTDATA_PREFIX, "expected_bootstrap_results.json"
+)
+
+
+class ScoringTest(test_utils.BaseMetricsTest):
+
+  def setUp(self):
+    super(ScoringTest, self).setUp()
+    with open(_LARGE_TARGETS_FILE, "r") as f:
+      self.targets = f.readlines()
+    with open(_LARGE_PREDICTIONS_FILE, "r") as f:
+      self.predictions = f.readlines()
+    with open(_EXPECTED_RESULTS_FILE, "r") as f:
+      self.expected_bootstrap_result = json.load(f)
+
+  def test_rouge_variants(self):
+    mean_result = metrics.rouge_mean(self.targets, self.predictions)
+    self.assertDictClose(
+        mean_result, self.expected_bootstrap_result, delta=_DELTA
+    )
+
+
+if __name__ == "__main__":
+  absltest.main()
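The test above pins the deterministic mean to the stored bootstrap results within _DELTA = 0.5 ROUGE points on a large sample. For a single key, rouge_mean reduces to a plain average of per-example f-measures; a hand-rolled sketch using the rouge_score package (which metrics.py already imports as rouge_scorer):

from rouge_score import rouge_scorer

def mean_rouge1(targets, predictions):
  # Score each (target, prediction) pair and average the f-measures.
  # No resampling is involved, so repeated runs give identical results.
  # (metrics.rouge_mean additionally newline-splits sentences, which
  # only affects rougeLsum, not rouge1.)
  scorer = rouge_scorer.RougeScorer(rouge_types=["rouge1"])
  fmeasures = [
      scorer.score(target=t, prediction=p)["rouge1"].fmeasure
      for t, p in zip(targets, predictions)
  ]
  return 100 * sum(fmeasures) / len(fmeasures)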
1 change: 1 addition & 0 deletions t5/evaluation/testdata/expected_bootstrap_results.json
@@ -0,0 +1 @@
{"rouge1": 38.517523467350486, "rouge2": 22.1524010821563, "rougeLsum": 38.362646331618514}
The remaining two changed files, t5/evaluation/testdata/target_large.txt and t5/evaluation/testdata/prediction_large.txt (the large test fixtures referenced by scoring_test.py), did not render in this view.
