evaluate.py
import json
import math
import os
import sys

from utils.span_selection_utils import *
from utils.program_generation_utils import *

def evaluate_program_result(pred_prog, gold_prog):
    '''
    Execution accuracy: execute both the predicted and the gold program and
    compare the results. For program-generation questions,
    execution accuracy = exact match = F1.
    '''
    invalid_flag, exe_res = eval_program(pred_prog)
    gold = program_tokenization(gold_prog)
    invalid_flag, exe_gold_res = eval_program(gold)
    if invalid_flag:
        # the gold program should always execute; print it if it does not
        print(gold)
    if exe_res == exe_gold_res:
        exe_acc = 1
    else:
        exe_acc = 0
    return exe_acc, exe_acc
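
# Illustrative usage (a sketch, not part of the original script): assuming
# program_tokenization/eval_program behave as imported above, i.e. eval_program
# takes a token list and returns (invalid_flag, result):
#
#   pred = program_tokenization("divide(100, 4)")   # hypothetical program
#   em, f1 = evaluate_program_result(pred, "divide(100, 4)")
#   # em == f1 == 1 when both programs execute to the same result
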
def evaluate_span_program_result(span_ans, prog_ans):
    '''
    Compare a span-selection answer against an executed program result.
    If the span parses as a number, compare numerically with a small
    tolerance; otherwise fall back to the span-selection string metrics.
    '''
    span_ans = str(span_ans)
    if str_to_num(span_ans) != "n/a":
        span_ans = str_to_num(span_ans)
        # absolute tolerance scales with the magnitude of the answers, capped at 0.1
        if math.isclose(prog_ans, span_ans, abs_tol=min(abs(min(prog_ans, span_ans) / 1000), 0.1)):
            exact_match, f1 = 1, 1
        else:
            exact_match, f1 = 0, 0
    else:
        exact_match, f1 = get_span_selection_metrics(span_ans, str(prog_ans))
    return exact_match, f1
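
# A minimal sketch of how the two branches above behave, assuming str_to_num
# returns a float for numeric spans and "n/a" otherwise (hypothetical inputs):
#
#   evaluate_span_program_result("0.25", 0.25)        # numeric: compared with
#                                                     # math.isclose -> (1, 1)
#   evaluate_span_program_result("net income", 0.25)  # non-numeric: falls back
#                                                     # to get_span_selection_metrics
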
def combine_predictions(span_selection_json_in, program_generation_json_in, test_file_json_in, output_dir):
    '''
    Merge the span-selection and program-generation prediction files into a
    single prediction file, ordered as in the original dev/test file.
    '''
    span_selection_data = json.load(open(span_selection_json_in))
    program_generation_data = json.load(open(program_generation_json_in))
    orig_data = json.load(open(test_file_json_in))

    prediction_dict = {}
    for example in span_selection_data + program_generation_data:
        uid = example["uid"]
        pred_ans = example["predicted_ans"]
        pred_program = example["predicted_program"]
        if uid in prediction_dict:
            print(f"uid {uid} already in prediction_dict")
        else:
            prediction_dict[uid] = {
                "uid": uid,
                "predicted_ans": pred_ans,
                "predicted_program": pred_program
            }

    output_data = []
    for example in orig_data:
        output_data.append(prediction_dict[example["uid"]])

    mode = "dev" if "dev" in test_file_json_in else "test"
    output_file = os.path.join(output_dir, f"{mode}_predictions.json")
    json.dump(output_data, open(output_file, "w"), indent=4)
    print(f"{mode}: Combined {len(span_selection_data)} examples from the span selection output and "
          f"{len(program_generation_data)} examples from the program generation output. "
          f"The predictions are written to {output_file}")
    return prediction_dict
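
# The combined file written above is a JSON list with one entry per example in
# the original dev/test file, each of the form
# {"uid": ..., "predicted_ans": ..., "predicted_program": ...}.
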
def evaluation_prediction_result(span_selection_json_in, program_generation_json_in, test_file_json_in, output_dir):
    '''
    Combine the two prediction files and, for the dev set, score them against
    the gold annotations. Test-set predictions must be submitted to CodaLab.
    '''
    exact_match_total, f1_total = 0, 0
    prediction_dict = combine_predictions(span_selection_json_in, program_generation_json_in, test_file_json_in, output_dir)
    if "test" in test_file_json_in:
        print("Please submit the test prediction file to CodaLab to get the results")
        return

    orig_data = json.load(open(test_file_json_in))
    num_examples = len(orig_data)
    for example in orig_data:
        uid = example["uid"]
        pred = prediction_dict[uid]
        gold_prog = example["qa"]["program"]
        gold_ans = example["qa"]["answer"]
        # both program generation
        if pred["predicted_program"] and gold_prog:
            exact_acc, f1_acc = evaluate_program_result(pred["predicted_program"], gold_prog)
        # both span selection
        elif not pred["predicted_program"] and not gold_prog:
            exact_acc, f1_acc = get_span_selection_metrics(pred["predicted_ans"], gold_ans)
        # gold is program generation, pred is span selection
        elif not pred["predicted_program"] and gold_prog:
            exact_acc, f1_acc = evaluate_span_program_result(span_ans=pred["predicted_ans"], prog_ans=gold_ans)
        # gold is span selection, pred is program generation
        elif pred["predicted_program"] and not gold_prog:
            exact_acc, f1_acc = evaluate_span_program_result(span_ans=gold_ans, prog_ans=pred["predicted_ans"])
        exact_match_total += exact_acc
        f1_total += f1_acc

    exact_match_score, f1_score = exact_match_total / num_examples, f1_total / num_examples
    print(f"Exact Match Score: {exact_match_score}, F1 Score: {f1_score}")
    return exact_match_score, f1_score
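
# Scoring summary for the branches above: when prediction and gold are the same
# question type, the type-specific metric is used directly; when they differ,
# the span-style answer is compared against the executed program result via
# evaluate_span_program_result.
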
if __name__ == '__main__':
    test_path = sys.argv[1]
    if "dev" in test_path:
        mode = "dev"
    elif "test" in test_path:
        mode = "test"
    else:
        raise ValueError("Cannot recognize the file name")

    output_dir = "output"
    span_selection_dir = "span_selection_output"
    program_generation_dir = "program_generation_output"
    span_selection_json_in = os.path.join(output_dir, span_selection_dir, f"{mode}_predictions.json")
    program_generation_json_in = os.path.join(output_dir, program_generation_dir, f"{mode}_predictions.json")
    test_file_json_in = os.path.join("dataset", f"{mode}.json")
    prediction_output_dir = os.path.join(output_dir, "final_predictions")
    os.makedirs(prediction_output_dir, exist_ok=True)
    evaluation_prediction_result(span_selection_json_in, program_generation_json_in, test_file_json_in, prediction_output_dir)
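
# Example invocation (assumes the directory layout hard-coded above:
# dataset/{dev,test}.json plus the two per-model prediction files under
# output/):
#
#   python evaluate.py dataset/dev.json
#
# For the test split the script only writes the combined prediction file, which
# is then submitted to CodaLab for scoring.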