import numpy as np
import torch

# Tokenizers and models are passed in by the caller, so nothing from
# transformers needs to be imported at module level.

def masked_model_predict(INPUT_TEXT, TOP_N, tokenizer, model):
    '''
    Get the top N predictions of the model for the LAST word of the sentence,
    as well as their corresponding probabilities.
    This function is only compatible with masked models (e.g. BertForMaskedLM).

    Parameters
    -----------
    INPUT_TEXT : str
        Input sentence. Must include "<mask>".
        e.g. "She could tell he was mad by the sound of her <mask>."
    TOP_N : int
        Top N predictions to return.
    tokenizer : transformers tokenizer object
    model : transformers model object
        e.g. pretrained_weights = 'bert-base-uncased'
             tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
             model = BertForMaskedLM.from_pretrained(pretrained_weights)

    Returns
    -----------
    out_preds : list of tuples
        Top N predictions and their corresponding probabilities.
    '''
    model.eval()  # set the module to evaluation mode
    mask_token = tokenizer.mask_token
    # replace the generic "<mask>" placeholder with the model's own mask token
    INPUT_TEXT = INPUT_TEXT.replace('<mask>', mask_token)
    input_ids = tokenizer.encode(INPUT_TEXT, return_tensors="pt")
    mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]
    with torch.no_grad():  # no gradients needed for inference
        token_logits = model(input_ids)[0]
    mask_token_logits = token_logits[0, mask_token_index, :]
    probs = torch.softmax(mask_token_logits, dim=1)
    top_N_tokens_idx = torch.topk(mask_token_logits, TOP_N, dim=1).indices[0].tolist()
    top_N_tokens = [tokenizer.decode([token]).strip() for token in top_N_tokens_idx]
    top_N_probs = probs[0, top_N_tokens_idx].tolist()
    out_preds = list(zip(top_N_tokens, top_N_probs))
    return out_preds
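
# A minimal usage sketch for masked_model_predict, kept as a comment so that
# importing this module has no side effects (assumes network access to
# download the pretrained 'bert-base-uncased' weights):
#
#   from transformers import BertTokenizer, BertForMaskedLM
#   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#   model = BertForMaskedLM.from_pretrained('bert-base-uncased')
#   preds = masked_model_predict(
#       "She could tell he was mad by the sound of her <mask>.",
#       5, tokenizer, model)
#   # preds is a list of (token, probability) tuples, most likely first
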
def masked_model_get_distribution(INPUT_TEXT, proba_at, TOP_N, tokenizer, model):
    '''
    Varies slightly from masked_model_predict(): no mask is added; instead,
    name the word at which predictions are required using proba_at.
    This was written to easily get the distribution across the vocabulary for
    any word in the sentence (set TOP_N to the vocabulary size for the full
    distribution), although it works exactly like masked_model_predict().
    This function is only compatible with masked models (e.g. BertForMaskedLM).

    Parameters
    -----------
    INPUT_TEXT : str
        Input sentence. Must NOT include "<mask>".
        e.g. "She could tell he was mad by the sound of her voice."
    proba_at : str
        The word at which to get the probabilities.
        e.g. 'sound'
    TOP_N : int
        Top N predictions to return.
    tokenizer : transformers tokenizer object
    model : transformers model object
        e.g. pretrained_weights = 'bert-base-uncased'
             tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
             model = BertForMaskedLM.from_pretrained(pretrained_weights)

    Returns
    -----------
    top_N_probs : list of floats
        Probabilities of the top N predictions.
    top_N_tokens : list of str
        The top N predicted tokens.
    '''
    model.eval()  # set the module to evaluation mode
    input_ids = tokenizer.encode(INPUT_TEXT, return_tensors="pt")
    # Locate the query word in the encoded sentence. encode() wraps the word in
    # special tokens ([CLS] ... [SEP]), so index 1 is the word's first sub-token;
    # this assumes proba_at is tokenized as a single sub-word.
    query_token_index = torch.where(input_ids == tokenizer.encode(proba_at)[1])[1]
    with torch.no_grad():  # no gradients needed for inference
        token_logits = model(input_ids)[0]
    query_token_logits = token_logits[0, query_token_index, :]
    probs = torch.softmax(query_token_logits, dim=1)
    top_N_tokens_idx = torch.topk(query_token_logits, TOP_N, dim=1).indices[0].tolist()
    top_N_tokens = [tokenizer.decode([token]).strip() for token in top_N_tokens_idx]
    top_N_probs = probs[0, top_N_tokens_idx].tolist()  # top-N slice of the distribution
    return top_N_probs, top_N_tokens
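
# A minimal usage sketch for masked_model_get_distribution, kept as a comment
# so that importing this module has no side effects (same model objects as in
# the sketch above; proba_at must appear in the sentence as a single sub-word):
#
#   probs, tokens = masked_model_get_distribution(
#       "She could tell he was mad by the sound of her voice.",
#       'sound', 10, tokenizer, model)
#   # probs[i] is the model's probability of tokens[i] at the 'sound' position
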
def autoreg_model_predict(INPUT_TEXT, TOP_N, tokenizer, model, padding=True):
    '''
    Get the top N predictions of the model for the last word of the sentence,
    as well as their corresponding probabilities.
    Use this function with causal/unidirectional language models (e.g. GPT, GPT-2).

    Parameters
    -----------
    INPUT_TEXT : str
        Input sentence. No mask needed; the model predicts the last (missing) word.
        e.g. 'I went to the bakery to buy some'
    TOP_N : int
        Top N predictions to return.
    tokenizer : transformers tokenizer object
    model : transformers model object
        e.g. pretrained_weights = 'gpt2'
             tokenizer = GPT2Tokenizer.from_pretrained(pretrained_weights)
             model = GPT2LMHeadModel.from_pretrained(pretrained_weights)
    padding : bool
        Add bos and eos padding.
        - For GPT set to True
        - For GPT-2 set to False

    Returns
    -----------
    out_preds : list of tuples
        Top N predictions and their corresponding probabilities.
    '''
    model.eval()  # set the module to evaluation mode
    # -------- First, get the top N predictions
    input_ids = torch.tensor([tokenizer.encode(INPUT_TEXT)])
    with torch.no_grad():
        outputs = model(input_ids=input_ids)
        predictions = outputs[0]
    # get the N most likely next sub-words
    idx_top_N_tokens = torch.topk(predictions[0, -1, :], TOP_N).indices.tolist()
    top_N_tokens = [tokenizer.decode([token]).strip() for token in idx_top_N_tokens]
    # -------- Second, calculate their probabilities
    # The decoded tokens were stripped of their leading space, so re-insert it
    # before appending each candidate to the sentence.
    probabilities = []
    if padding:
        bos = tokenizer.bos_token
        eos = tokenizer.eos_token
        sentences = [bos + ' ' + INPUT_TEXT + ' ' + token + ' ' + eos for token in top_N_tokens]
    else:
        sentences = [INPUT_TEXT + ' ' + token + ' ' + '<|endoftext|>' for token in top_N_tokens]
    for sentence in sentences:
        input_ids = torch.tensor([tokenizer.encode(sentence)])
        with torch.no_grad():
            outputs = model(input_ids=input_ids)
            logits = outputs[0][0]
        probs = torch.softmax(logits, dim=1)
        index = len(input_ids[0]) - 2  # position of the candidate token (the last is eos)
        token_id = input_ids[0][index]
        # probability of the candidate given the preceding context
        probability = probs[index - 1][token_id].item()
        probabilities.append(probability)
    # -------- Format into a list of tuples
    out_preds = list(zip(top_N_tokens, probabilities))
    return out_preds
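
# A minimal usage sketch for autoreg_model_predict with GPT-2, kept as a
# comment so that importing this module has no side effects (assumes network
# access to download the pretrained 'gpt2' weights):
#
#   from transformers import GPT2Tokenizer, GPT2LMHeadModel
#   tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
#   model = GPT2LMHeadModel.from_pretrained('gpt2')
#   preds = autoreg_model_predict('I went to the bakery to buy some',
#                                 5, tokenizer, model, padding=False)
#   # preds is a list of (token, probability) tuples for the next word
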
def autoreg_model_get_distribution(INPUT_TEXT, proba_at, TOP_N, tokenizer, model, padding=True):
    '''
    Varies slightly from autoreg_model_predict(): pass the full sentence and
    name the word at which predictions are required using proba_at.
    This was written to easily get the distribution across the vocabulary for
    any word in the sentence, although it works exactly like autoreg_model_predict().
    Use this function with causal/unidirectional language models (e.g. GPT, GPT-2).

    Parameters
    -----------
    INPUT_TEXT : str
        Input sentence. No mask; write the full sentence.
        e.g. 'I went to the bakery to buy some bread.'
    proba_at : str
        The word at which to get the probabilities.
        e.g. 'bakery'
    TOP_N : int
        Top N predictions to return.
    tokenizer : transformers tokenizer object
    model : transformers model object
        e.g. pretrained_weights = 'gpt2'
             tokenizer = GPT2Tokenizer.from_pretrained(pretrained_weights)
             model = GPT2LMHeadModel.from_pretrained(pretrained_weights)
    padding : bool
        Add bos and eos padding.
        - For GPT set to True
        - For GPT-2 set to False

    Returns
    -----------
    probabilities : list of floats
        Probabilities of the top N predictions at the proba_at position.
    top_N_tokens : list of str
        The top N predicted tokens at the proba_at position.
    '''
    # --------- First, truncate the input just before the proba_at word. This is
    # done because the model predicts the word that follows the given context.
    # Split the input into separate words
    INPUT_TEXT = INPUT_TEXT.split(' ')
    # Detach trailing periods into their own tokens,
    # e.g. ['dog', 'ate', 'bone.'] -> ['dog', 'ate', 'bone', '.']
    INPUT_TEXT_clean = []
    for w in INPUT_TEXT:
        if '.' in w:
            w = w.strip('.')
            INPUT_TEXT_clean.append(w)
            INPUT_TEXT_clean.append('.')
        else:
            INPUT_TEXT_clean.append(w)
    INPUT_TEXT_clean = np.array(INPUT_TEXT_clean)  # as numpy array
    split_here = np.where(INPUT_TEXT_clean == proba_at)[0][0]  # first occurrence of proba_at
    # keep only the words before proba_at, overwrite INPUT_TEXT
    INPUT_TEXT = ' '.join(INPUT_TEXT_clean[:split_here])
    model.eval()  # set the module to evaluation mode
    # -------- Second, get the top N predictions
    input_ids = torch.tensor([tokenizer.encode(INPUT_TEXT)])
    with torch.no_grad():
        outputs = model(input_ids=input_ids)
        predictions = outputs[0]
    # get the N most likely next sub-words
    idx_top_N_tokens = torch.topk(predictions[0, -1, :], TOP_N).indices.tolist()
    top_N_tokens = [tokenizer.decode([token]).strip() for token in idx_top_N_tokens]
    # -------- Third, calculate their probabilities
    # The decoded tokens were stripped of their leading space, so re-insert it
    # before appending each candidate to the sentence.
    probabilities = []
    if padding:
        bos = tokenizer.bos_token
        eos = tokenizer.eos_token
        sentences = [bos + ' ' + INPUT_TEXT + ' ' + token + ' ' + eos for token in top_N_tokens]
    else:
        sentences = [INPUT_TEXT + ' ' + token + ' ' + '<|endoftext|>' for token in top_N_tokens]
    for sentence in sentences:
        input_ids = torch.tensor([tokenizer.encode(sentence)])
        with torch.no_grad():
            outputs = model(input_ids=input_ids)
            logits = outputs[0][0]
        probs = torch.softmax(logits, dim=1)
        index = len(input_ids[0]) - 2  # position of the candidate token (the last is eos)
        token_id = input_ids[0][index]
        # probability of the candidate given the preceding context
        probability = probs[index - 1][token_id].item()
        probabilities.append(probability)
    return probabilities, top_N_tokens
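
# A minimal usage sketch for autoreg_model_get_distribution, kept as a comment
# so that importing this module has no side effects (same GPT-2 objects as in
# the sketch above; the sentence is truncated just before proba_at internally):
#
#   probs, tokens = autoreg_model_get_distribution(
#       'I went to the bakery to buy some bread.',
#       'bakery', 10, tokenizer, model, padding=False)
#   # probs[i] is the model's probability of tokens[i] at the 'bakery' position
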
def xlnet_model_predict(INPUT_TEXT, TOP_N, tokenizer, model, PADDING_TEXT=None):
    '''
    Get the top N predictions of the model for the last word of the sentence,
    as well as their corresponding probabilities.
    Use this function with permutation models (e.g. XLNet).
    XL models require random padding text (PADDING_TEXT) for the permutation to work.

    Parameters
    -----------
    INPUT_TEXT : str
        Input sentence.
    TOP_N : int
        Top N predictions to return.
    tokenizer : transformers tokenizer object
    model : transformers model object
        e.g. pretrained_weights = 'xlnet-base-cased'
             tokenizer = XLNetTokenizer.from_pretrained(pretrained_weights)
             model = XLNetLMHeadModel.from_pretrained(pretrained_weights)
    PADDING_TEXT : str
        Random padding text necessary for the function to work.
        If None, uses the default padding text defined in the body of this function.

    Returns
    -----------
    out_preds : list of tuples
        Top N predictions and their corresponding probabilities.
    '''
    # set the default padding text
    if PADDING_TEXT is None:
        PADDING_TEXT = '''In 1991, the remains of Russian Tsar Nicholas II and his family
        (except for Alexei and Maria) are discovered.
        The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
        remainder of the story. 1883 Western Siberia,
        a young Grigori Rasputin is asked by his father and a group of men to perform magic.
        Rasputin has a vision and denounces one of the men as a horse thief. Although his
        father initially slaps him for making such an accusation, Rasputin watches as the
        man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
        the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
        with people, even a bishop, begging for his blessing. <eod>'''
    model.eval()  # set the module to evaluation mode
    bos = tokenizer.bos_token
    eos = tokenizer.eos_token
    mask_token = tokenizer.mask_token
    INPUT_TEXT = bos + ' ' + INPUT_TEXT + mask_token + ' ' + eos
    tokenize_input = tokenizer.tokenize(PADDING_TEXT + INPUT_TEXT)
    tokenize_text = tokenizer.tokenize(INPUT_TEXT)
    words = []
    lp = []
    # Predict each position of the input sentence in turn; only the mask
    # position (second to last, just before eos) is used for the output.
    for max_word_id in range(len(tokenize_input) - len(tokenize_text), len(tokenize_input)):
        input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
        # hide the current position and everything after it from the attention
        perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
        perm_mask[:, :, max_word_id:] = 1.0
        # predict only the current position
        target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)
        target_mapping[0, 0, max_word_id] = 1.0
        with torch.no_grad():
            outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
        # output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
        next_token_logits = outputs[0]
        predicted_prob = torch.softmax(next_token_logits[0], dim=1)
        tokens_id = torch.topk(next_token_logits[0], TOP_N, dim=1).indices[0].tolist()
        words.append([tokenizer.decode([token]).strip() for token in tokens_id])
        lp.append(predicted_prob[0][tokens_id].tolist())
    # index -2 is the mask position (the last position is eos)
    out_preds = list(zip(words[-2], lp[-2]))
    return out_preds
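
# A minimal usage sketch for xlnet_model_predict, kept as a comment so that
# importing this module has no side effects (assumes network access to
# download the pretrained 'xlnet-base-cased' weights):
#
#   from transformers import XLNetTokenizer, XLNetLMHeadModel
#   tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
#   model = XLNetLMHeadModel.from_pretrained('xlnet-base-cased')
#   preds = xlnet_model_predict('I went to the bakery to buy some ',
#                               5, tokenizer, model)
#   # preds is a list of (token, probability) tuples for the final word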