This repository contains the data for our report "Is ChatGPT a Good NLG Evaluator? A Preliminary Study".
For the data itself, please refer to data/*.json.
from correlations import sample_level_correlation_summeval, dataset_level_correlation_summeval
for aspect in ['coherence', 'relevance', 'consistency', 'fluency']:
for aspect in ['coherence', 'relevance', 'consistency', 'fluency']:
from correlations import sample_level_correlation_openmeva, dataset_level_correlation_openmeva
from correlations import dataset_level_correlation_bagel
for aspect in ['informativeness', 'naturalness', 'quality']:
prompt_summeval_coherence = """Score the following news summarization given the corresponding news with respect to coherence with one to five stars, where one star means "incoherence" and five stars means "perfect coherence". Note that coherence measures the quality of all sentences collectively, to the fit together and sound naturally. Consider the quality of the summary as a whole.
News: %s
Summary: %s
""" % (article, generated_summ)
prompt_summeval_relevance = """Score the following news summarization given the corresponding news with respect to relevance with one to five stars, where one star means "irrelevance" and five stars means "perfect relevance". Note that relevance measures how well the summary captures the key points of the article. Consider whether all and only the important aspects are contained in the summary.
News: %s
Summary: %s
""" % (article, generated_summ)
prompt_summeval_consistency = """Score the following news summarization given the corresponding news with respect to consistency with one to five stars, where one star means "inconsistency" and five stars means "perfect consistency". Note that consistency measures whether the facts in the summary are consistent with the facts in the original article. Consider whether the summary does reproduce all facts accurately and does not make up untrue information.
News: %s
Summary: %s
""" % (article, generated_summ)
prompt_summeval_fluency = """Score the following news summarization given the corresponding news with respect to fluency with one to five stars, where one star means "disfluency" and five stars means "perfect fluency". Note that fluency measures the quality of individual sentences, are they well-written and grammatically correct. Consider the quality of individual sentences.
News: %s
Summary: %s
""" % (article, generated_summ)
prompt_openmeva = """Score the following storyline given the beginning of the story with one to five stars.
Where one star means "Nonsense",
two stars mean "The storyline has some connections with the beginning, but is not understandable",
three stars mean "The storyline has some connections with the beginning and is understandable",
four stars mean "The storyline is consistent with the beginning and possibly involves a few grammar mistakes",
and five stars mean "Perfect storyline and grammar".
The beginning of the story: %s
Storyline: %s
""" % (story_beginning, generated_storyline)
prompt_bagel_informativeness = """Score the following natural text given the corresponding reference with respect to informativeness with one to five stars, where one star means "uninformative" and five stars means "perfect informativeness". Note that informativeness is defined as whether it contains all the information in the reference.
The reference: %s
The natural text: %s
""" % (reference, sys_summ)
prompt_bagel_naturalness = """Score the following natural text given the corresponding structured information with one to five stars, where one star means "unnaturalness" and five stars means "perfect naturalness".
The structured information: %s
The natural text: %s
""" % (src, sys_summ)
prompt_bagel_quality = """Score the following natural text given the corresponding structured information with one to five stars, where one star means "nonsense or no core meaning preserved" and five stars means "perfect core meaning and grammar".
The structured information: %s
The natural text: %s
""" % (src, sys_summ)
Please cite our work if you find it useful.
Part of this code is inspired by BARTScore and OpenMEVA:
- Some of the baseline metric results are provided by BARTScore.
- Other baseline metric results are calculated with the standard implementations of OpenMEVA and BARTScore.
Prompts are inspired by the paper "Large Language Models Are State-of-the-Art Evaluators of Translation Quality".