-
Notifications
You must be signed in to change notification settings - Fork 0
/
workflow.py
157 lines (134 loc) · 6.93 KB
/
workflow.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import answerSelection
import generatorRules
import generatorTransformer
import distractorsNER
import distractorsWordNet
import distractorsDBpedia
import distractorsGlove
import distractorsTransformer
import pdfText
from corefResol import spacyExperimentalCoref
from questionRanking import summarizeTF_IDF
from distractorRanking import sortDistractorsGPT2
from questionRanking import scoreGPT2
from loggerLog import *
def allSteps(text, answerOption, questionOption, distractorOption, maxNumberQuestions):
    """Run the whole question-generation pipeline on ``text``.

    Stages, in order: coreference resolution, TF-IDF summarization, answer
    selection, question generation, reduction to one question per sentence,
    distractor generation, and final formatting through ``outputAPI``.
    Each stage is best-effort: if it raises, the exception is logged and the
    pipeline continues with whatever the previous stages produced.

    All options are strings:
      answerOption        "0" named entities, "1" noun chunks,
                          "2" transformer; "default" behaves like "0".
      questionOption      "0" rules, "1" prepend transformer,
                          "2" prototype transformer, "3" e2e;
                          "default" behaves like "2".
      distractorOption    "1" NER, "2" WordNet, "3" DBpedia, "4" GloVe,
                          "5" transformer, "6" all five; "default" is GloVe.
      maxNumberQuestions  "default" (50 is passed to the summarizer) or a
                          value accepted by ``int()``.

    Returns the value produced by ``outputAPI`` for the surviving questions.
    """
    penalty = 0.5
    howMany = 5

    # COREFERENCE RESOLUTION
    try:
        logger.info("Performing coreference resolution")
        text = spacyExperimentalCoref(text)
    except Exception as e:
        logger.error("Exception while performing coreference resolution (spacyExperimentalCoref): %s", e)

    # SUMMARY
    try:
        logger.info("Performing text summarization")
        limit = 50 if maxNumberQuestions == "default" else int(maxNumberQuestions)
        text = summarizeTF_IDF(text, limit)
    except Exception as e:
        logger.error("Exception while performing text summarization (summarizeTF_IDF): %s", e)

    # ANSWER SELECTION
    # An unrecognised option leaves ``output`` empty.
    output = []
    try:
        logger.info("Performing answer selection")
        if answerOption == "default":
            answerMode = 0
        elif answerOption in ("0", "1", "2"):
            answerMode = int(answerOption)
        else:
            answerMode = None
        if answerMode is not None:
            output = answerSelection.excerptAnswers(text, answerMode)
    except Exception as e:
        logger.error("Exception while performing answer selection (answerSelection.excerptAnswers): %s", e)

    # QUESTION GENERATION
    # An unrecognised option passes ``output`` through unchanged.
    try:
        logger.info("Performing question generation")
        if questionOption == "0":
            output = generatorRules.getQuestions(output)
        elif questionOption == "default":
            output = generatorTransformer.getQuestions(output, 2)
        elif questionOption in ("1", "2", "3"):
            output = generatorTransformer.getQuestions(output, int(questionOption))
    except Exception as e:
        logger.error("Exception while performing question generation: %s", e)

    # Keep a single question per sentence before looking for distractors.
    try:
        output = questionSentence(output)
    except Exception as e:
        logger.error("Exception in function questionSentence in allSteps (workflow.py): %s", e)

    # DISTRACTOR GENERATION
    # One try block wraps the whole loop, so the first failure stops
    # distractor generation for all remaining answers.
    try:
        logger.info("Performing distractor selection")
        for sentenceItem in output:
            for entry in sentenceItem["answers"]:
                questions = entry["questions"]
                if not questions:
                    continue
                answer = entry["answer"]
                if distractorOption in ("1", "6"):
                    entry["distractorsNER"] = distractorsNER.getDistractors(
                        answer, entry["label"], text, penalty, howMany)
                if distractorOption in ("2", "6"):
                    entry["distractorsWordNet"] = distractorsWordNet.getDistractors(
                        answer, penalty, howMany)
                if distractorOption in ("3", "6"):
                    entry["distractorsDBpedia"] = distractorsDBpedia.getDistractors(
                        answer, howMany, penalty)
                if distractorOption in ("4", "6"):
                    entry["distractorsGlove"] = distractorsGlove.getDistractors(
                        answer, howMany, penalty)
                if distractorOption in ("5", "6"):
                    entry["distractorsTransformer"] = distractorsTransformer.getDistractors(
                        text, questions[0], answer, howMany)
                if distractorOption == "default":  # default: GloVe
                    entry["distractorsGlove"] = distractorsGlove.getDistractors(
                        answer, howMany, penalty)
    except Exception as e:
        logger.error("Exception while performing distractor selection: %s", e)

    collected = []
    collected.extend(output)
    finalOutput = outputAPI(collected)
    logger.info("Returning %s questions", len(finalOutput))
    return finalOutput
def allStepsPDF(filename, answerSelection, questionGeneration, distractorSelection, maxNumberQuestions):
    """Extract the text of a PDF and run ``allSteps`` on it.

    ``filename`` is handed to ``pdfText.readPdf``; the remaining arguments
    are forwarded unchanged to ``allSteps`` as its answer, question,
    distractor and max-questions options.

    Returns the result of ``allSteps``, or an empty list when the PDF cannot
    be read.

    Note: the ``answerSelection`` parameter shadows the module of the same
    name inside this function; the module is not used here.
    """
    try:
        logger.info("Reading PDF")
        text = pdfText.readPdf(filename)
    except Exception as e:
        logger.error("Exception in pdfText.readPdf in allStepsPDF (workflow.py): %s", e)
        # There is no text to work on. Falling through would reference the
        # unbound ``text`` and raise UnboundLocalError, so report "no
        # questions" instead.
        return []
    return allSteps(text, answerSelection, questionGeneration, distractorSelection, maxNumberQuestions)
def outputAPI(originalOutput):
    """Flatten pipeline output into the question list returned to callers.

    ``originalOutput`` is an iterable of dicts that each hold an "answers"
    list. Every answer that has at least one question yields one dict::

        {"question": <first question>,
         "answers": {"correct_answers": [<answer>],
                     "wrong_answers": [<ranked unique distractors>]}}

    Distractors are gathered from whichever of the ``distractors*`` keys are
    present, de-duplicated, and ordered by the keys of the mapping returned
    by ``sortDistractorsGPT2``. Answers without any question are skipped.
    """
    distractorKeys = (
        "distractorsNER",
        "distractorsWordNet",
        "distractorsDBpedia",
        "distractorsGlove",
        "distractorsTransformer",
    )
    newOutput = []
    for item in originalOutput:
        for answer in item["answers"]:
            questions = answer.get("questions")
            if not questions:
                # Without a question there is nothing to present; indexing
                # [0] here would raise and discard every other result.
                continue
            question = questions[0]

            wrongAnswers = []
            for key in distractorKeys:
                # A missing key or an empty/None value contributes nothing.
                wrongAnswers += answer.get(key) or []
            # The same distractor can come from several generators. Keep one
            # copy of each, in first-seen order, so the input to the ranker
            # does not depend on hash ordering.
            wrongAnswers = list(dict.fromkeys(wrongAnswers))
            # SORT DISTRACTORS
            wrongAnswers = list(sortDistractorsGPT2(question, wrongAnswers).keys())

            newOutput.append({
                "question": question,
                "answers": {
                    "correct_answers": [answer["answer"]],
                    "wrong_answers": wrongAnswers,
                },
            })
    return newOutput
# limit to only one question per sentence
def questionSentence(originalOutput):
    """Reduce every sentence to its single best-scoring answer/question pair.

    For each item of ``originalOutput``, answers without questions are
    ignored. Every other answer has its "questions" list trimmed in place to
    the first question, and "<question> <answer>" is scored with
    ``scoreGPT2``. The answer with the lowest score is kept; on ties the
    earlier one wins. Items with no usable answer are dropped.

    Returns a new list of ``{'sentence': ..., 'answers': [best]}`` dicts.
    """
    kept = []
    for entry in originalOutput:
        winner = None
        lowest = None
        for candidate in entry['answers']:
            if len(candidate['questions']) == 0:
                continue
            firstQuestion = candidate['questions'][0]
            # Trim in place: later stages only use the first question.
            candidate['questions'] = [firstQuestion]
            score = scoreGPT2(firstQuestion + " " + candidate['answer'])
            if winner is None or score < lowest:
                winner, lowest = candidate, score
        if winner is not None:
            kept.append({'sentence': entry['sentence'], 'answers': [winner]})
    return kept