-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathevaluate.py
60 lines (49 loc) · 2.15 KB
/
evaluate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import json
from collections import defaultdict
from pathlib import Path
import fire
import nltk
import pandas as pd
import rouge
import torch
from bert_score import BERTScorer
from ctc import CTC
def _mean_novel_ngram_ratios(hyp, src, max_n=4):
    """Return the mean percentage of hypothesis n-grams that are absent from the source.

    For each hypothesis and each order ``n`` in ``1..max_n`` the ratio is
    ``100 * (hypothesis n-grams not found in any source text) / (hypothesis n-grams)``.
    Texts are lower-cased and tokenized with ``nltk.word_tokenize``.

    A hypothesis with fewer than ``n`` tokens has no n-grams of that order, so
    its ratio is undefined; it is left out of that order's mean rather than
    raising ``ZeroDivisionError``.

    Args:
        hyp: Hypothesis strings.
        src: For each hypothesis, an iterable of source strings
            (presumably a list; verify against the test-file producer).
        max_n: Largest n-gram order to evaluate.

    Returns:
        Dict mapping ``"novel-{n}gram"`` to the mean ratio, or ``nan`` when no
        hypothesis has any n-gram of that order.
    """
    ratios = defaultdict(list)
    for h, s in zip(hyp, src):
        tokenized_src = [nltk.word_tokenize(x.lower()) for x in s]
        tokenized_hyp = nltk.word_tokenize(h.lower())
        for n in range(1, max_n + 1):
            hyp_ngrams = list(nltk.ngrams(tokenized_hyp, n=n))
            if not hyp_ngrams:
                # Too short to contain an n-gram of this order: ratio is undefined.
                continue
            src_ngrams = {ng for x in tokenized_src for ng in nltk.ngrams(x, n=n)}
            novel = sum(ng not in src_ngrams for ng in hyp_ngrams)
            ratios[f"novel-{n}gram"].append(100 * novel / len(hyp_ngrams))

    means = {}
    for n in range(1, max_n + 1):
        values = ratios[f"novel-{n}gram"]
        means[f"novel-{n}gram"] = sum(values) / len(values) if values else float("nan")
    return means


@torch.no_grad()
def run(hyp_file, test_file):
    """Score generated hypotheses against a test set and save the metrics as JSON.

    Computes ROUGE-1/2/L, BERTScore (roberta-large) precision/recall/F, the
    novel 1- to 4-gram ratios of each hypothesis with respect to its sources,
    and the two values returned by ``CTC`` (stored as ``consistency`` and
    ``relevance``). The scores are printed and written to
    ``<hyp_file>_score.json``.

    Args:
        hyp_file: Path to a text file with one hypothesis per line, or a
            directory, in which case the first file matching ``test*.hypo``
            inside it is used.
        test_file: Path to a JSON file holding a list of objects, each with a
            ``"ref"`` entry (list of reference strings) and a ``"src"`` entry
            (iterable of source strings).

    Raises:
        FileNotFoundError: If ``hyp_file`` is a directory that contains no
            ``test*.hypo`` file.
    """
    hyp_file = Path(hyp_file)
    if hyp_file.is_dir():
        # Use a default so a missing file gives a clear error, not a bare StopIteration.
        found = next(hyp_file.glob("test*.hypo"), None)
        if found is None:
            raise FileNotFoundError(f"no 'test*.hypo' file found in {hyp_file}")
        hyp_file = found
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Context managers close the handles promptly; the encoding is explicit so
    # results do not depend on the platform's default locale.
    with open(hyp_file, encoding="utf-8") as f:
        hyp = [x.strip() for x in f]
    with open(test_file, encoding="utf-8") as f:
        test = json.load(f)
    ref = [[r.strip() for r in x["ref"]] for x in test]
    src = [x["src"] for x in test]

    print("ROUGE")
    evaluator = rouge.Rouge(metrics=["rouge-n", "rouge-l"], max_n=2, limit_length=False, apply_avg=True,
                            stemming=True, ensure_compatibility=True)
    scores = evaluator.get_scores(hyp, ref).items()
    # Flatten {metric: {stat: value}} into {"metric_stat": value}.
    scores = {"_".join((metric, k)): v for metric, vs in scores for k, v in vs.items()}

    print("BERTScore")
    bert_scorer = BERTScorer(model_type="roberta-large", lang="en", device=device)
    # score() yields three tensors, labelled here p, r and f in that order.
    for key, val in zip("prf", bert_scorer.score(hyp, ref)):
        scores[f"bert_score-roberta_{key}"] = val.mean().item()

    scores.update(_mean_novel_ngram_ratios(hyp, src))

    ctc = CTC()
    # Each call returns a pair; zip(*) regroups the pairs into two per-example tuples.
    consistency, relevance = zip(*(ctc(s, h, r) for s, h, r in zip(src, hyp, ref)))
    scores["consistency"] = sum(consistency) / len(consistency)
    scores["relevance"] = sum(relevance) / len(relevance)

    scores = pd.Series(scores)
    print(scores)
    scores.to_json(str(hyp_file) + "_score.json")
# Script entry point: python-fire exposes run() as a command-line interface,
# taking hyp_file and test_file from the command-line arguments.
if __name__ == "__main__":
    fire.Fire(run)