-
Notifications
You must be signed in to change notification settings - Fork 56
/
prepare_data.py
116 lines (90 loc) · 4.37 KB
/
prepare_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import numpy as np
from keras.utils.np_utils import to_categorical
import json
import h5py
import os
from constants import *
def right_align(seq, lengths):
    """Shift each row's leading non-padding prefix to the right edge.

    Given a 2-D token matrix `seq` where row i holds `lengths[i]` real
    tokens followed by padding, return a same-shaped float array whose
    padding sits on the left instead (zeros fill the vacated columns).
    """
    rows, width = np.shape(seq)
    aligned = np.zeros((rows, width))
    for row in range(rows):
        n = lengths[row]
        # Copy the first n tokens of this row into its last n slots.
        aligned[row][width - n:width] = seq[row][0:n]
    return aligned
def read_data(data_limit):
    """Load training inputs and one-hot answer targets.

    Args:
        data_limit: keep only the first `data_limit` training examples.

    Returns:
        (train_X, train_y) where train_X is [image_features, questions]
        (questions right-aligned) and train_y is a one-hot answer matrix.
    """
    print("Reading Data...")
    # Open read-only and close deterministically; the original leaked both
    # handles and relied on h5py's old implicit-mode default.
    with h5py.File(data_img, 'r') as img_file, \
         h5py.File(data_prepo, 'r') as ques_file:
        img_feats = np.array(img_file['images_train'])
        img_pos_train = ques_file['img_pos_train'][:data_limit]
        # img_pos entries are 1-based indices into images_train.
        train_img_data = np.array([img_feats[pos - 1, :] for pos in img_pos_train])
        # L2-normalize each image feature row; broadcasting replaces the
        # original hard-coded tile over 4096 columns, so any feature
        # dimension works.
        norms = np.sqrt(np.sum(train_img_data * train_img_data, axis=1))
        train_img_data = train_img_data / norms[:, np.newaxis]
        # Shift question padding to the left side.
        ques_train = np.array(ques_file['ques_train'])[:data_limit, :]
        ques_length_train = np.array(ques_file['ques_length_train'])[:data_limit]
        ques_train = right_align(ques_train, ques_length_train)
        train_X = [train_img_data, ques_train]
        # NOTE should've constructed one-hots using exhaustive list of answers,
        # cause some answers may not be in dataset. To temporarily rectify this,
        # all those answer indices is set to 1 in validation set.
        # (Slice AFTER to_categorical: the number of columns is inferred from
        # the max label, so slicing first could shrink the one-hot width.)
        train_y = to_categorical(ques_file['answers'])[:data_limit, :]
    return train_X, train_y
def get_val_data():
    """Build validation inputs plus single- and multi-answer targets.

    Returns:
        val_X: [image_features, right_aligned_questions] for the test split.
        abs_val_y: one-hot matrix of each question's multiple-choice answer.
        multi_val_y: per-question list of acceptable answer indices taken
            from the individual annotator answers.
    """
    # NOTE(review): both h5py files are opened without a mode and never
    # closed; newer h5py versions require an explicit mode — worth fixing.
    img_data = h5py.File(data_img)
    ques_data = h5py.File(data_prepo)
    metadata = get_metadata()
    with open(val_annotations_path, 'r') as an_file:
        annotations = json.loads(an_file.read())
    img_data = np.array(img_data['images_test'])
    img_pos_train = ques_data['img_pos_test']
    # img_pos entries are 1-based indices into images_test.
    train_img_data = np.array([img_data[_-1,:] for _ in img_pos_train])
    # L2-normalize each image feature row (feature dim assumed 4096 here).
    tem = np.sqrt(np.sum(np.multiply(train_img_data, train_img_data), axis=1))
    train_img_data = np.divide(train_img_data, np.transpose(np.tile(tem,(4096,1))))
    ques_train = np.array(ques_data['ques_test'])
    ques_length_train = np.array(ques_data['ques_length_test'])
    ques_train = right_align(ques_train, ques_length_train)
    # Convert all last index to 0, coz embeddings were made that way :/
    # (12602 is the out-of-vocabulary/last token id — TODO confirm against
    # the preprocessing that built the embeddings.)
    for _ in ques_train:
        if 12602 in _:
            _[_==12602] = 0
    val_X = [train_img_data, ques_train]
    # Invert metadata's index->answer map to answer->index for lookups.
    ans_to_ix = {str(ans):int(i) for i,ans in metadata['ix_to_ans'].items()}
    ques_annotations = {}
    for _ in annotations['annotations']:
        # Answers missing from the training vocabulary (None) or mapped to
        # the overflow index 1000 are coerced to index 1 (see NOTE in
        # read_data about non-exhaustive answer one-hots).
        idx = ans_to_ix.get(_['multiple_choice_answer'].lower())
        _['multiple_choice_answer_idx'] = 1 if idx in [None, 1000] else idx
        ques_annotations[_['question_id']] = _
    abs_val_y = [ques_annotations[ques_id]['multiple_choice_answer_idx'] for ques_id in ques_data['question_id_test']]
    abs_val_y = to_categorical(np.array(abs_val_y))
    # For each question, the set of indices of all annotator-given answers.
    multi_val_y = [list(set([ans_to_ix.get(_['answer'].lower()) for _ in ques_annotations[ques_id]['answers']])) for ques_id in ques_data['question_id_test']]
    for i,_ in enumerate(multi_val_y):
        # Apply the same None/1000 -> 1 coercion to the multi-answer sets.
        multi_val_y[i] = [1 if ans in [None, 1000] else ans for ans in _]
    return val_X, abs_val_y, multi_val_y
def get_metadata():
    """Load preprocessing metadata and invert its word-index map.

    Returns the parsed metadata dict with 'ix_to_word' replaced by a
    word -> int(index) mapping (the JSON stores index -> word; the
    inverted map is kept under the original key name).
    """
    # Use a context manager; the original json.load(open(...)) leaked the
    # file handle.
    with open(data_prepo_meta, 'r') as meta_file:
        meta_data = json.load(meta_file)
    meta_data['ix_to_word'] = {str(word): int(i) for i, word in meta_data['ix_to_word'].items()}
    return meta_data
def prepare_embeddings(num_words, embedding_dim, metadata):
    """Build (or load from cache) the GloVe embedding matrix.

    Args:
        num_words: vocabulary size (number of rows in the matrix).
        embedding_dim: embedding vector length (number of columns).
        metadata: dict from get_metadata(); its 'ix_to_word' maps
            word -> row index.

    Returns:
        (num_words, embedding_dim) float array; rows for words absent
        from GloVe stay all-zero.
    """
    # Reuse the cached matrix if a previous run already built it.
    # Explicit read-only mode: the original relied on h5py's old implicit
    # default.
    if os.path.exists(embedding_matrix_filename):
        with h5py.File(embedding_matrix_filename, 'r') as f:
            return np.array(f['embedding_matrix'])

    print("Embedding Data...")
    # (Removed dead code: the original also read train_questions_path into
    # a `texts` list that was never used.)

    # Parse the GloVe text file: each line is "<word> <d floats>".
    embeddings_index = {}
    with open(glove_path, 'r') as glove_file:
        for line in glove_file:
            values = line.split()
            word = values[0]
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')

    embedding_matrix = np.zeros((num_words, embedding_dim))
    word_index = metadata['ix_to_word']
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # Cache the result so subsequent runs skip the GloVe parse.
    with h5py.File(embedding_matrix_filename, 'w') as f:
        f.create_dataset('embedding_matrix', data=embedding_matrix)
    return embedding_matrix