-
Notifications
You must be signed in to change notification settings - Fork 0
/
eval_twists.py
608 lines (485 loc) · 22.4 KB
/
eval_twists.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
'''Evaluate different adaptions to the sentence representation'''
import train
import config
import model as m
import numpy as np
import torch
import analyse_repr.analyse as analyse
import embeddingholder
import mydataloader
import sys
import os
import matplotlib.pyplot as plt
color_palette = [
'#e6194b',
'#3cb44b',
'#ffe119',
'#0082c8',
'#f58231',
'#911eb4',
'#46f0f0',
'#000000',
'#f032e6',
'#d2f53c',
'#fabebe',
'#008080',
'#aa6e28',
'#800000',
'#808000',
'#000080',
'#808080',
'#e6beff',
'#aaffc3',
'#fffac8',
'#ff0000',
'#00ff00',
'#0000ff',
'#ff00ff',
'#234567',
'#00ffff'
]
from torch.utils.data import DataLoader
import torch.autograd as autograd
from train import CollocateBatchWithSents
from docopt import docopt
def run_twists(classifier, data_train, data_dev, padding_token, twister_queue):
'''
Runs a series of twisted models, specified in twister_queue. The twisted model
gets evaluated on train and dev data.
:param classifier model to use
:param data_train train data
:param data_dev dev data
:param padding_token padding token from embeddings
:param twister_queue [(name-to-print, twister)] to run the twisted models.
'''
# Load model, data, embeddings
for name, twister in twister_queue:
print(name)
print('Accuracy (train):', train.evaluate(classifier, data_train, 32, padding_token, twister=twister))
print('Accuracy (dev):', train.evaluate(classifier, data_dev, 32, padding_token, twister=twister))
print('')
sys.stdout.flush()
# out: train acc, dev acc, train acc delta, dev acc delta
def mirror(v, mirror_point):
'''
Mirrors the value on the mirror point, s.t. the distance the point is below mirror_point
will be the distance the resulting point is above mirror_point and vice versa.
:param v value to mirror
:param mirror_point this is the point to relate to when mirroring.
'''
diff = np.absolute(mirror_point - v)
if v < mirror_point:
return mirror_point + diff
else:
return mirror_point - diff
def flip_dimension(dim, repr, a_set):
'''
Flip the dimensional value by mirroring it in the middle between min_value and max_value.
'''
mirror_point = a_set.min[dim] + np.absolute(a_set.max[dim] - a_set.min[dim]) / 2
to_flip = repr[:,dim].data
flipped = m.cuda_wrap(torch.FloatTensor([mirror(v, mirror_point) for v in to_flip]))
repr[:,dim] = flipped
return repr
def eval_mf(classifier, data_train, data_dev, padding_token, a_set):
def flip_fn(rep, typ, tools):
a_set = tools[0]
flip_premise = tools[1]
flip_hyp = tools[2]
# Flip premise representations
if typ == 'premise':
for flip_premise_idx in flip_premise:
rep = flip_dimension(flip_premise_idx, rep, a_set)
# Flip hypothesis represnetations
if typ == 'hypothesis':
for flip_hyp_idx in flip_hyp:
rep = flip_dimension(flip_hyp_idx, rep, a_set)
return rep
# invert male dimensions of premise
t_p_m1 = m.ModelTwister(flip_fn, (a_set, [602], []))
t_p_m2 = m.ModelTwister(flip_fn, (a_set, [602, 199], []))
t_p_m3 = m.ModelTwister(flip_fn, (a_set, [602, 199, 280], []))
t_p_m4 = m.ModelTwister(flip_fn, (a_set, [602, 199, 280, 89], []))
# invert male dimensions of hypothesis
t_h_m1 = m.ModelTwister(flip_fn, (a_set, [], [602]))
t_h_m2 = m.ModelTwister(flip_fn, (a_set, [], [602, 199]))
t_h_m3 = m.ModelTwister(flip_fn, (a_set, [], [602, 199, 280]))
t_h_m4 = m.ModelTwister(flip_fn, (a_set, [], [602, 199, 280, 89]))
# invert male dimensions of premise and hypothesis
t_ph_m1 = m.ModelTwister(flip_fn, (a_set, [602], [602]))
t_ph_m2 = m.ModelTwister(flip_fn, (a_set, [602, 199], [602, 199]))
t_ph_m3 = m.ModelTwister(flip_fn, (a_set, [602, 199, 280], [602, 199, 280]))
t_ph_m4 = m.ModelTwister(flip_fn, (a_set, [602, 199, 280, 89], [602, 199, 280, 89]))
# invert female dimensions of premise
t_p_f1 = m.ModelTwister(flip_fn, (a_set, [1730], []))
t_p_f2 = m.ModelTwister(flip_fn, (a_set, [1730, 845], []))
t_p_f3 = m.ModelTwister(flip_fn, (a_set, [1730, 845, 311], []))
t_p_f4 = m.ModelTwister(flip_fn, (a_set, [1730, 845, 311, 609], []))
# invert female dimensions of hypothesis
t_h_f1 = m.ModelTwister(flip_fn, (a_set, [], [1730]))
t_h_f2 = m.ModelTwister(flip_fn, (a_set, [], [1730, 845]))
t_h_f3 = m.ModelTwister(flip_fn, (a_set, [], [1730, 845, 311]))
t_h_f4 = m.ModelTwister(flip_fn, (a_set, [], [1730, 845, 311, 609]))
# invert female dimensions of premise and hypothesis
t_ph_f1 = m.ModelTwister(flip_fn, (a_set, [1730], [1730]))
t_ph_f2 = m.ModelTwister(flip_fn, (a_set, [1730, 845], [1730, 845]))
t_ph_f3 = m.ModelTwister(flip_fn, (a_set, [1730, 845, 311], [1730, 845, 311]))
t_ph_f4 = m.ModelTwister(flip_fn, (a_set, [1730, 845, 311, 609], [1730, 845, 311, 609]))
# invert male&female dimensions of premise
t_p_mf1 = m.ModelTwister(flip_fn, (a_set, [602, 1730], []))
t_p_mf2 = m.ModelTwister(flip_fn, (a_set, [602, 199, 1730, 845], []))
t_p_mf3 = m.ModelTwister(flip_fn, (a_set, [602, 199, 280, 1730, 845, 311], []))
t_p_mf4 = m.ModelTwister(flip_fn, (a_set, [602, 199, 280, 89, 1730, 845, 311, 609], []))
# invert male&female dimensions of hypothesis
t_h_mf1 = m.ModelTwister(flip_fn, (a_set, [], [602, 1730]))
t_h_mf2 = m.ModelTwister(flip_fn, (a_set, [], [602, 199, 1730, 845]))
t_h_mf3 = m.ModelTwister(flip_fn, (a_set, [], [602, 199, 280, 1730, 845, 311]))
t_h_mf4 = m.ModelTwister(flip_fn, (a_set, [], [602, 199, 280, 89, 1730, 845, 311, 609]))
# invert male&female dimensions of premise and hypothesis
t_ph_mf1 = m.ModelTwister(flip_fn, (a_set, [602, 1730], [602, 1730]))
t_ph_mf2 = m.ModelTwister(flip_fn, (a_set, [602, 199, 1730, 845], [602, 199, 1730, 845]))
t_ph_mf3 = m.ModelTwister(flip_fn, (a_set, [602, 199, 280, 1730, 845, 311], [602, 199, 280, 1730, 845, 311]))
t_ph_mf4 = m.ModelTwister(flip_fn, (a_set, [602, 199, 280, 89, 1730, 845, 311, 609], [602, 199, 280, 89, 1730, 845, 311, 609]))
twister_queue = [
('No twist', None),
('Invert male(1) in premise', t_p_m1), ('Invert male(2) in premise', t_p_m2), ('Invert male(3) in premise', t_p_m3), ('Invert male(4) in premise', t_p_m4),
('Invert male(1) in hyp', t_h_m1), ('Invert male(2) in hyp', t_h_m2), ('Invert male(3) in hyp', t_h_m3), ('Invert male(4) in hyp', t_h_m4),
('Invert male(1) in both', t_ph_m1), ('Invert male(2) in both', t_ph_m2), ('Invert male(3) in both', t_ph_m3), ('Invert male(4) in both', t_ph_m4),
('Invert female(1) in premise', t_p_f1), ('Invert female(2) in premise', t_p_f2), ('Invert female(3) in premise', t_p_f3), ('Invert female(4) in premise', t_p_f4),
('Invert female(1) in hyp', t_h_f1), ('Invert female(2) in hyp', t_h_f2), ('Invert female(3) in hyp', t_h_f3), ('Invert female(4) in hyp', t_h_f4),
('Invert female(1) in both', t_ph_f1), ('Invert female(2) in both', t_ph_f2), ('Invert female(3) in both', t_ph_f3), ('Invert female(4) in both', t_ph_f4),
('Invert mf(1) in premise', t_p_mf1), ('Invert mf(2) in premise', t_p_mf2), ('Invert mf(3) in premise', t_p_mf3), ('Invert mf(4) in premise', t_p_mf4),
('Invert mf(1) in hyp', t_h_mf1), ('Invert mf(2) in hyp', t_h_mf2), ('Invert mf(3) in hyp', t_h_mf3), ('Invert mf(4) in hyp', t_h_mf4),
('Invert mf(1) in both', t_ph_mf1), ('Invert mf(2) in both', t_ph_mf2), ('Invert mf(3) in both', t_ph_mf3), ('Invert mf(4) in both', t_ph_mf4)
]
t2_p_m1 = m.ModelTwister(flip_fn, (a_set, [89], []))
t2_p_m2 = m.ModelTwister(flip_fn, (a_set, [89, 280], []))
t2_p_m3 = m.ModelTwister(flip_fn, (a_set, [89, 280, 199], []))
t2_p_m4 = m.ModelTwister(flip_fn, (a_set, [89, 280, 199, 602], []))
twister_queue2 = [
('Invert male(1) in premise', t2_p_m1), ('Invert male(2) in premise', t2_p_m2), ('Invert male(3) in premise', t2_p_m3), ('Invert male(4) in premise', t2_p_m4),
]
run_twists(classifier, data_train, data_dev, padding_token, twister_queue2)
#evaluate.evaluate(model_path, data_path, embeddings_path, twister = twister)
def eval_outlier(classifier, data_train, data_dev, padding_token, a_set):
vals = [(i, a_set.mean[i], a_set.sd[i], a_set.min[i], a_set.max[i],
max([np.absolute(a_set.mean[i] - a_set.max[i]), np.absolute(a_set.mean[i] - a_set.max[i])])) for i in range(len(a_set.max))]
vals = [(i, mean, sd, vmin, vmax, meandiff, meandiff / sd) for i, mean, sd, vmin, vmax, meandiff in vals]
def crop_val(v, vmin, vmax):
if v > vmax:
return vmax
elif v < vmin:
return vmin
else:
return v
def crop_fn(rep, typ, tools):
means = tools[0]
sd = tools[1]
factor = tools[2]
batch_size = rep.size()[0]
dimens = rep.size()[1]
for dim in range(dimens):
vmin = means[dim] - factor * sd[dim]
vmax = means[dim] + factor * sd[dim]
cropped_vals = m.cuda_wrap(torch.FloatTensor([crop_val(v, vmin, vmax) for v in rep[:, dim].data]))
rep[:, dim] = cropped_vals
return rep
twister_queue = [
('outlier=15SD', m.ModelTwister(crop_fn, (a_set.mean, a_set.sd, 15))),
('outlier=12SD', m.ModelTwister(crop_fn, (a_set.mean, a_set.sd, 12))),
('outlier=10SD', m.ModelTwister(crop_fn, (a_set.mean, a_set.sd, 10))),
('outlier=9SD', m.ModelTwister(crop_fn, (a_set.mean, a_set.sd, 9))),
('outlier=8SD', m.ModelTwister(crop_fn, (a_set.mean, a_set.sd, 8))),
('outlier=6SD', m.ModelTwister(crop_fn, (a_set.mean, a_set.sd, 6))),
('outlier=5SD', m.ModelTwister(crop_fn, (a_set.mean, a_set.sd, 5)))
]
run_twists(classifier, data_train, data_dev, padding_token, twister_queue)
def find_mf_misclassified_sents(classifier, data_train, data_dev, padding_token, a_set):
'''
Find all samples that have normally been classified correctly and now are classified wrong,
when inverting male/female dimensions.
'''
IDX_GOLD = 6
IDX_PREDICTED = 7
IDX_PREDICTED_INV = 8
def flip_fn(rep, typ, tools):
a_set = tools[0]
flip_indizes = tools[1]
if typ == 'hypothesis':
for idx in flip_indizes:
rep = flip_dimension(idx, rep, a_set)
return rep
# do this for train/dev:
experiment_name = './analyses/invert_4m4f_'
twist = m.ModelTwister(flip_fn, (a_set, [602, 199, 280, 89, 1730, 845, 311, 609]))
#twist = m.ModelTwister(flip_fn, (a_set, [1, 2, 3, 4, 5, 6, 7, 8]))
for name, data_set in [('train', data_train), ('dev', data_dev)]:
print('# Load data:', name)
loader = [
DataLoader(chunk, drop_last = False, batch_size=1, shuffle=False, collate_fn=CollocateBatchWithSents(padding_token))
for chunk in data_set
]
correct_before_correct_after = []
correct_before_incorrect_after = []
incorrect_before_correct_after = []
incorrect_before_incorrect_after = []
for chunk in loader:
print('classify new chunk')
# classify normally
for batch_p, batch_h, batch_lbl, batch_sent_p, batch_sent_h in chunk:
label_scores_normal, act_indizes, reprs = classifier(
autograd.Variable(m.cuda_wrap(batch_p), requires_grad=False),
autograd.Variable(m.cuda_wrap(batch_h), requires_grad=False),
output_sent_info = True)
_, predicted_indizes_normal = torch.max(label_scores_normal, dim=1)
label_scores_normal = label_scores_normal.data
activation_p = act_indizes[0].data
activation_h = act_indizes[1].data
reprs_p = reprs[0].data
reprs_h = reprs[1].data
# classify with m/f dimensions inverted
label_scores_inverted = classifier(
autograd.Variable(m.cuda_wrap(batch_p), requires_grad=False),
autograd.Variable(m.cuda_wrap(batch_h), requires_grad=False),
twister = twist).data
_, predicted_indizes_inverted = torch.max(label_scores_inverted, dim=1)
# check for gold - classified_normally - classified_inverted
amount = label_scores_normal.size()[0]
# (premise repr, premise act, premise words, hyp repr, hyp act, hyp words, lbl gold, lbl predicted normal, lbl predicted inversed, confidence normal, confidence inverse)
processed_data = [(
reprs_p[i].cpu(), activation_p[i].cpu(), batch_sent_p[i],
reprs_h[i].cpu(), activation_h[i].cpu(), batch_sent_h[i],
batch_lbl.cpu()[i], predicted_indizes_normal[i].data.cpu()[0], predicted_indizes_inverted.cpu()[i],
label_scores_normal.cpu()[i], label_scores_inverted.cpu()[i]
) for i in range(amount)]
# sort
correct_before_correct_after += [d for d in processed_data if d[IDX_GOLD] == d[IDX_PREDICTED] and d[IDX_GOLD] == d[IDX_PREDICTED_INV]]
correct_before_incorrect_after += [d for d in processed_data if d[IDX_GOLD] == d[IDX_PREDICTED] and d[IDX_GOLD] != d[IDX_PREDICTED_INV]]
incorrect_before_correct_after += [d for d in processed_data if d[IDX_GOLD] != d[IDX_PREDICTED] and d[IDX_GOLD] == d[IDX_PREDICTED_INV]]
incorrect_before_incorrect_after += [d for d in processed_data if d[IDX_GOLD] != d[IDX_PREDICTED] and d[IDX_GOLD] != d[IDX_PREDICTED_INV]]
# write to file
print('Write to file.')
file_name = experiment_name + name
out_arr = [
('correct_correct', correct_before_correct_after),
('correct_incorrect', correct_before_incorrect_after),
('incorrect_correct', incorrect_before_correct_after),
('incorrect_incorrect', incorrect_before_incorrect_after)
]
def stringify_arr(arr):
return ' '.join([str(v) for v in arr]) + '\n'
for appendix, samples in out_arr:
with open(file_name + '-' + appendix + '.txt', 'w') as f_out:
for p_rep, p_act, p_words, h_rep, h_act, h_words, gold, predicted, predicted_inv, scores, scores_inv in samples:
f_out.write(stringify_arr(p_words))
f_out.write(stringify_arr(p_act))
f_out.write(stringify_arr(p_rep))
f_out.write(stringify_arr(h_words))
f_out.write(stringify_arr(h_act))
f_out.write(stringify_arr(h_rep))
f_out.write(stringify_arr([gold, predicted, predicted_inv]))
f_out.write(stringify_arr(scores))
f_out.write(stringify_arr(scores_inv))
def create_misclassification_dict(labels):
md = dict()
for lbl_gold in labels:
md[lbl_gold] = dict()
for lbl_predicted_normal in labels:
md[lbl_gold][lbl_predicted_normal] = dict()
for lbl_predicted_twisted in labels:
md[lbl_gold][lbl_predicted_normal][lbl_predicted_twisted] = []
return md
def eval_output_twist(classifier, experiment_name, twister, dataset, padding_token, labels=[0,1,2]):
'''
This will evaluate and output the results of a twist done to the models sentence representation.
All results will be stored in s specific folder, containing the counting results and samples for
each category <gold-label> - <predicted-label-normal> <predicted-label-twisted>
:param experiment_name will result in the folder name where everything will be stored.
:param twister how to change the sentence representation
:param dataset The data that gets evaluated
'''
print('# Load data')
sys.stdout.flush()
loader = [
DataLoader(chunk, drop_last = False, batch_size=1, shuffle=False, collate_fn=CollocateBatchWithSents(padding_token))
for chunk in dataset
]
print('# Start training')
sys.stdout.flush()
classification_dict = create_misclassification_dict(labels)
for chunk in loader:
print('# Train new chunk')
sys.stdout.flush()
for batch_p, batch_h, batch_lbl, batch_sent_p, batch_sent_h in chunk:
# Get label scores
label_scores_normal, act_indizes, reprs = classifier(
autograd.Variable(m.cuda_wrap(batch_p), requires_grad=False),
autograd.Variable(m.cuda_wrap(batch_h), requires_grad=False),
output_sent_info = True)
_, predicted_indizes_normal = torch.max(label_scores_normal, dim=1)
label_scores_normal = label_scores_normal.data
label_scores_twisted = classifier(
autograd.Variable(m.cuda_wrap(batch_p), requires_grad=False),
autograd.Variable(m.cuda_wrap(batch_h), requires_grad=False),
twister = twister).data
_, predicted_indizes_twisted = torch.max(label_scores_twisted, dim=1)
gold_label = batch_lbl.cpu()[0]
predicted_normal_label = predicted_indizes_normal.data.cpu()[0]
predicted_twisted_label = predicted_indizes_twisted.cpu()[0]
sample = (' '.join(batch_sent_p[0]), ' '.join(batch_sent_h[0]))
classification_dict[gold_label][predicted_normal_label][predicted_twisted_label].append(sample)
# write out samples
directory = './analyses/' + experiment_name
if not os.path.exists(directory):
os.makedirs(directory)
count_results = []
for lbl_gold in classification_dict.keys():
for lbl_pred in classification_dict[lbl_gold].keys():
for lbl_pred_twist in classification_dict[lbl_gold][lbl_pred].keys():
name_gold = mydataloader.index_to_tag[lbl_gold]
name_pred = mydataloader.index_to_tag[lbl_pred]
name_pred_twisted = mydataloader.index_to_tag[lbl_pred_twist]
name = '-'.join([name_gold, name_pred, name_pred_twisted])
count_results.append((name, len(classification_dict[lbl_gold][lbl_pred][lbl_pred_twist])))
# write samples
with open(directory + '/' + name + '.txt', 'w') as f_out:
for p, h in classification_dict[lbl_gold][lbl_pred][lbl_pred_twist]:
f_out.write(p + '\n')
f_out.write(h + '\n')
# write our stats
with open(directory + '/results.txt', 'w') as f_out:
for name, cnt in count_results:
f_out.write(name + ' ' + str(cnt) + '\n')
def eval_no_negative_values(appendix, classifier, data, padding_token):
def change_repr(rep, typ, tools):
mask = tools
new_rep = autograd.Variable(m.cuda_wrap(torch.max(rep.data, tools)))
return new_rep
experiment_name = 'No negative values' + appendix
mask = m.cuda_wrap(torch.zeros(1, classifier.dimen_sent_repr).float())
twister = m.ModelTwister(change_repr, mask)
eval_output_twist(classifier, experiment_name, twister, data, padding_token)
def plot_correct_incorrect_bar(x_labels, misclassification_dict, title, block=True):
num_groups = len(x_labels)
# how many classified as label as array for plotting
amounts = dict()
for lbl in x_labels:
amounts[lbl] = [misclassification_dict[l][lbl] for l in x_labels]
# how many of gold label for acc
amounts_gold = dict()
for lbl in x_labels:
amounts_gold[lbl] = sum([misclassification_dict[lbl][l] for l in x_labels])
# calculate accuracy
amount_data = [amounts_gold[lbl] for lbl in x_labels]
correct = [amounts[lbl][i] for i, lbl in enumerate(x_labels)]
print(title)
print(amount_data)
print(correct)
accuracy = round(sum(correct) / sum(amount_data) * 100, 2)
lbl_accuracies = [round(correct[i] / amount_data[i] * 100, 2) for i in range(len(x_labels))]
fig, ax = plt.subplots()
index = np.arange(num_groups)
colors = [color_palette[i] for i in index]
bar_width = .1
for i, lbl in enumerate(x_labels):
plt.bar(index + i * bar_width, amounts[lbl], bar_width, color=color_palette[i], label=lbl)
plt.xlabel('Gold label')
plt.ylabel('# samples')
x_labels = [lbl + ' (' + str(lbl_accuracies[i]) + ')' for i, lbl in enumerate(x_labels)]
plt.xticks(index + bar_width, x_labels)
plt.title(title + ' (' + str(accuracy) + ')')
plt.legend(title='Classified as:', bbox_to_anchor=(0,1.12,1,0.2), loc="lower left",
mode="expand", borderaxespad=0, ncol=3)
plt.subplots_adjust(top=0.8)
plt.show(block=block)
def plot_results(output_file):
def create_simple_dict(labels):
d = dict()
for lbl_gold in labels:
d[lbl_gold] = dict()
for lbl_pred in labels:
d[lbl_gold][lbl_pred] = 0
return d
with open(output_file) as f_in:
data = [line.strip().split(' ') for line in f_in.readlines()]
data = [(v[0].split('-'), int(v[1])) for v in data]
data = [(lbl[0], lbl[1], lbl[2], amount) for lbl, amount in data]
x_labels = mydataloader.index_to_tag
# Plot original model
d = create_simple_dict(x_labels)
for lbl_gold, lbl_pred, _, amount in data:
d[lbl_gold][lbl_pred] += amount
plot_correct_incorrect_bar(x_labels, d, 'Normal model', block=False)
# Plot twisted model
d = create_simple_dict(x_labels)
for lbl_gold, _, lbl_pred, amount in data:
d[lbl_gold][lbl_pred] += amount
plot_correct_incorrect_bar(x_labels, d, 'Changed representation model')
def eval_male_female_to_neutral_plural(appendix, classifier, data, padding_token):
def change_repr(rep, typ, tools):
gender_dims = tools[0]
genderless_plural_dims = tools[1]
# get min of gender dims
rep_vals = rep[0]
min_gender = torch.min((torch.Tensor([rep_vals[dim].data[0] for dim in gender_dims])))
# change plural dims appropriately
copied_rep = autograd.Variable(rep.data.clone().copy_(rep.data))
for plural_dim in genderless_plural_dims:
new_val = torch.max(m.cuda_wrap(torch.Tensor((rep_vals[plural_dim].data[0], min_gender))))
copied_rep[:,plural_dim] = new_val
return copied_rep
experiment_name = 'match mf to genderless combined' + appendix
twister = m.ModelTwister(change_repr, ([602, 199, 280, 89, 1730, 845, 311, 609], [1449, 821]))
#twister = m.ModelTwister(change_repr, ([3,4,5,6,7], [1,2]))
eval_output_twist(classifier, experiment_name, twister, data, padding_token)
mapper = dict()
mapper['mf'] = eval_mf
mapper['ol'] = eval_outlier
mapper['misclassified_sents_mf'] = find_mf_misclassified_sents
output_mapper = dict()
output_mapper['mfn'] = eval_male_female_to_neutral_plural
output_mapper['noneg'] = eval_no_negative_values
def main():
args = docopt("""Evaluate on given dataset in terms of accuracy.
Usage:
eval_twist.py normal <model> <data_train> <data_dev> <statpath> <type> [<embeddings>]
eval_twist.py output <model> <data> <type> <appendix>
eval_twist.py plot <output>
<model> = Path to trained model
<data> = Path to data to test model with
<embeddings> = New embedding file to use unknown words from
""")
if args['plot'] != None:
return plot_results(args['<output>'])
model_path = args['<model>']
data_path_train = args['<data_train>']
data_path_dev = args['<data_dev>']
data_path_single = args['<data>']
embeddings_path = args['<embeddings>']
eval_type = args['<type>']
stat_path = args['<statpath>']
appendix = args['<appendix>']
print('# Loading model ...')
if embeddings_path == None:
embedding_holder = embeddingholder.EmbeddingHolder(config.PATH_WORD_EMBEDDINGS)
else:
embedding_holder = embeddingholder.EmbeddingHolder(embeddings_path)
classifier, _ = m.load_model(model_path, embedding_holder=embedding_holder)
m.cuda_wrap(classifier)
classifier.eval()
if args['output'] != None:
data = mydataloader.get_dataset_chunks(data_path_single, embedding_holder, chunk_size=32*400, include_sent=True)
output_mapper[eval_type](appendix, classifier, data, embedding_holder.padding())
else:
a_set = analyse.AnalyseSet(stat_path)
include_sent = False
if eval_type in ['misclassified_sents_mf']:
include_sent = True
print('# Loading data train ...')
data_train = mydataloader.get_dataset_chunks(data_path_train, embedding_holder, chunk_size=32*400, mark_as='', include_sent=include_sent)
print('# Loading data dev ...')
data_dev = mydataloader.get_dataset_chunks(data_path_dev, embedding_holder, chunk_size=32*400, mark_as='', include_sent=include_sent)
print('# Evaluate ...')
mapper[eval_type](classifier, data_train, data_dev, embedding_holder.padding(), a_set)
if __name__ == '__main__':
main()