sari_hook.py

# coding=utf-8
# Copyright 2019 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""SARI score for evaluating paraphrasing and other text generation models.

The score is introduced in the following paper:

   Optimizing Statistical Machine Translation for Text Simplification
   Wei Xu, Courtney Napoles, Ellie Pavlick, Quanze Chen and Chris Callison-Burch
   In Transactions of the Association for Computational Linguistics (TACL) 2015
   http://cs.jhu.edu/~napoles/res/tacl2016-optimizing.pdf

This implementation has two differences with the GitHub [1] implementation:
  (1) Define 0/0=1 instead of 0 to give higher scores for predictions that match
      a target exactly.
  (2) Fix an alleged bug [2] in the deletion score computation.

[1] https://github.com/cocoxu/simplification/blob/master/SARI.py
    (commit 0210f15)
[2] https://github.com/cocoxu/simplification/issues/6
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections

import numpy as np
# import tensorflow as tf

# The paper that intoduces the SARI score uses only the precision of the deleted
# tokens (i.e. beta=0). To give more emphasis on recall, you may set, e.g.,
# beta=1.
BETA_FOR_SARI_DELETION_F_MEASURE = 0


def _get_ngram_counter(ids, n):
  """Get a Counter with the ngrams of the given ID list.

  Args:
    ids: np.array or a list corresponding to a single sentence
    n: n-gram size

  Returns:
    collections.Counter with ID tuples as keys and 1s as values.
  """
  # Remove zero IDs used to pad the sequence.
  ids = [token_id for token_id in ids if token_id != 0]
  ngram_list = [tuple(ids[i:i + n]) for i in range(len(ids) + 1 - n)]
  ngrams = set(ngram_list)
  counts = collections.Counter()
  for ngram in ngrams:
    counts[ngram] = 1
  return counts


def _get_fbeta_score(true_positives, selected, relevant, beta=1):
  """Compute Fbeta score.

  Args:
    true_positives: Number of true positive ngrams.
    selected: Number of selected ngrams.
    relevant: Number of relevant ngrams.
    beta: 0 gives precision only, 1 gives F1 score, and Inf gives recall only.

  Returns:
    Fbeta score.
  """
  precision = 1
  if selected > 0:
    precision = true_positives / selected
  if beta == 0:
    return precision
  recall = 1
  if relevant > 0:
    recall = true_positives / relevant
  if precision > 0 and recall > 0:
    beta2 = beta * beta
    return (1 + beta2) * precision * recall / (beta2 * precision + recall)
  else:
    return 0


def get_addition_score(source_counts, prediction_counts, target_counts):
  """Compute the addition score (Equation 4 in the paper)."""
  added_to_prediction_counts = prediction_counts - source_counts
  true_positives = sum((added_to_prediction_counts & target_counts).values())
  selected = sum(added_to_prediction_counts.values())
  # Note that in the paper the summation is done over all the ngrams in the
  # output rather than the ngrams in the following set difference. Since the
  # former does not make as much sense we compute the latter, which is also done
  # in the GitHub implementation.
  relevant = sum((target_counts - source_counts).values())
  return _get_fbeta_score(true_positives, selected, relevant)


def get_keep_score(source_counts, prediction_counts, target_counts):
  """Compute the keep score (Equation 5 in the paper)."""
  source_and_prediction_counts = source_counts & prediction_counts
  source_and_target_counts = source_counts & target_counts
  true_positives = sum((source_and_prediction_counts &
                        source_and_target_counts).values())
  selected = sum(source_and_prediction_counts.values())
  relevant = sum(source_and_target_counts.values())
  return _get_fbeta_score(true_positives, selected, relevant)


def get_deletion_score(source_counts, prediction_counts, target_counts, beta=0):
  """Compute the deletion score (Equation 6 in the paper)."""
  source_not_prediction_counts = source_counts - prediction_counts
  source_not_target_counts = source_counts - target_counts
  true_positives = sum((source_not_prediction_counts &
                        source_not_target_counts).values())
  selected = sum(source_not_prediction_counts.values())
  relevant = sum(source_not_target_counts.values())
  return _get_fbeta_score(true_positives, selected, relevant, beta=beta)


def get_sari_score(source_ids, prediction_ids, list_of_targets,
                   max_gram_size=4, beta_for_deletion=0):
  """Compute the SARI score for a single prediction and one or more targets.

  Args:
    source_ids: a list / np.array of SentencePiece IDs
    prediction_ids: a list / np.array of SentencePiece IDs
    list_of_targets: a list of target ID lists / np.arrays
    max_gram_size: int. largest n-gram size we care about (e.g. 3 for unigrams,
        bigrams, and trigrams)
    beta_for_deletion: beta for deletion F score.

  Returns:
    the SARI score and its three components: add, keep, and deletion scores
  """
  addition_scores = []
  keep_scores = []
  deletion_scores = []
  for n in range(1, max_gram_size + 1):
    source_counts = _get_ngram_counter(source_ids, n)
    prediction_counts = _get_ngram_counter(prediction_ids, n)
    # All ngrams in the targets with count 1.
    target_counts = collections.Counter()
    # All ngrams in the targets with count r/num_targets, where r is the number
    # of targets where the ngram occurs.
    weighted_target_counts = collections.Counter()
    num_nonempty_targets = 0
    for target_ids_i in list_of_targets:
      target_counts_i = _get_ngram_counter(target_ids_i, n)
      if target_counts_i:
        weighted_target_counts += target_counts_i
        num_nonempty_targets += 1
    for gram in weighted_target_counts.keys():
      weighted_target_counts[gram] /= num_nonempty_targets
      target_counts[gram] = 1
    keep_scores.append(get_keep_score(source_counts, prediction_counts,
                                      weighted_target_counts))
    deletion_scores.append(get_deletion_score(source_counts, prediction_counts,
                                              weighted_target_counts,
                                              beta_for_deletion))
    addition_scores.append(get_addition_score(source_counts, prediction_counts,
                                              target_counts))

  avg_keep_score = sum(keep_scores) / max_gram_size
  avg_addition_score = sum(addition_scores) / max_gram_size
  avg_deletion_score = sum(deletion_scores) / max_gram_size
  sari = (avg_keep_score + avg_addition_score + avg_deletion_score) / 3.0
  return sari, avg_keep_score, avg_addition_score, avg_deletion_score


# def get_sari(source_ids, prediction_ids, target_ids, max_gram_size=4):
#   """Computes the SARI scores from the given source, prediction and targets.
#
#   Args:
#     source_ids: A 2D tf.Tensor of size (batch_size , sequence_length)
#     prediction_ids: A 2D tf.Tensor of size (batch_size, sequence_length)
#     target_ids: A 3D tf.Tensor of size (batch_size, number_of_targets,
#         sequence_length)
#     max_gram_size: int. largest n-gram size we care about (e.g. 3 for unigrams,
#         bigrams, and trigrams)
#
#   Returns:
#     A 4-tuple of 1D float Tensors of size (batch_size) for the SARI score and
#         the keep, addition and deletion scores.
#   """
#
#   def get_sari_numpy(source_ids, prediction_ids, target_ids):
#     """Iterate over elements in the batch and call the SARI function."""
#     sari_scores = []
#     keep_scores = []
#     add_scores = []
#     deletion_scores = []
#     # Iterate over elements in the batch.
#     for source_ids_i, prediction_ids_i, target_ids_i in zip(
#         source_ids, prediction_ids, target_ids):
#       sari, keep, add, deletion = get_sari_score(
#           source_ids_i, prediction_ids_i, target_ids_i, max_gram_size,
#           BETA_FOR_SARI_DELETION_F_MEASURE)
#       sari_scores.append(sari)
#       keep_scores.append(keep)
#       add_scores.append(add)
#       deletion_scores.append(deletion)
#     return (np.asarray(sari_scores), np.asarray(keep_scores),
#             np.asarray(add_scores), np.asarray(deletion_scores))
#
#   sari, keep, add, deletion = tf.py_func(
#       get_sari_numpy,
#       [source_ids, prediction_ids, target_ids],
#       [tf.float64, tf.float64, tf.float64, tf.float64])
#   return sari, keep, add, deletion
#
#
# def sari_score(predictions, labels, features, **unused_kwargs):
#   """Computes the SARI scores from the given source, prediction and targets.
#
#   An approximate SARI scoring method since we do not glue word pieces or
#   decode the ids and tokenize the output. By default, we use ngram order of 4.
#   Also, this does not have beam search.
#
#   Args:
#     predictions: tensor, model predictions.
#     labels: tensor, gold output.
#     features: dict, containing inputs.
#
#   Returns:
#     sari: int, approx sari score
#   """
#   if "inputs" not in features:
#     raise ValueError("sari_score requires inputs feature")
#
#   # Convert the inputs and outputs to a [batch_size, sequence_length] tensor.
#   inputs = tf.squeeze(features["inputs"], axis=[-1, -2])
#   outputs = tf.to_int32(tf.argmax(predictions, axis=-1))
#   outputs = tf.squeeze(outputs, axis=[-1, -2])
#
#   # Convert the labels to a [batch_size, 1, sequence_length] tensor.
#   labels = tf.squeeze(labels, axis=[-1, -2])
#   labels = tf.expand_dims(labels, axis=1)
#
#   score, _, _, _ = get_sari(inputs, outputs, labels)
#   return score, tf.constant(1.0)