data.py
import random
from collections import namedtuple
import pickle


class Vocabulary(dict):
    """
    Bi-directional look-up dictionary for the vocabulary: maps token ids to
    tokens and tokens back to their ids.

    Args:
        vocabulary_file_name (str): path to a file with one "<id> <token>" pair per line
    """
    def __init__(self, vocabulary_file_name):
        with open(vocabulary_file_name) as vocabulary_file:
            for line in vocabulary_file:
                key, value = line.split()
                self[int(key)] = value
        # reserve id 0 for the padding token
        self[0] = '<PAD>'

    def __setitem__(self, key, value):
        # store both directions, refusing duplicates in either one
        if key in self:
            raise Exception('Repeat key', key)
        if value in self:
            raise Exception('Repeat value', value)
        dict.__setitem__(self, key, value)
        dict.__setitem__(self, value, key)

    def __delitem__(self, key):
        dict.__delitem__(self, self[key])
        dict.__delitem__(self, key)

    def __len__(self):
        # each entry is stored twice (id -> token and token -> id)
        return dict.__len__(self) // 2
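
# Illustrative usage only (not part of the original module); it assumes
# './data/vocab_all.txt' follows the "<id> <token>" per-line format read above:
#
#     vocab = Vocabulary('./data/vocab_all.txt')
#     vocab[1]        # -> token with id 1
#     vocab['the']    # -> id of token 'the', if present in the file
#     vocab[0]        # -> '<PAD>'
#     len(vocab)      # number of id/token pairs, not raw dict entries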


class QAData():
    """
    Load the train/predict/test data.
    """
    def __init__(self):
        self.vocabulary = Vocabulary("./data/vocab_all.txt")
        self.dec_timesteps = 150
        self.enc_timesteps = 150
        self.answers = pickle.load(open("./data/answers.pkl", 'rb'))
        self.training_set = pickle.load(open("./data/train.pkl", 'rb'))

    def pad(self, data, length):
        """
        Pad (or truncate) each sequence in data to the given length.

        Args:
            data (list): list of question or answer token-id sequences
            length (int): desired sequence length
        """
        from keras.preprocessing.sequence import pad_sequences
        return pad_sequences(data, maxlen=length, padding='post',
                             truncating='post', value=0)

    def get_training_data(self):
        """
        Return training questions with their good and randomly sampled bad answers.
        """
        questions = []
        good_answers = []
        for qa in self.training_set:
            # repeat the question once per ground-truth answer
            questions.extend([qa['question']] * len(qa['answers']))
            good_answers.extend([self.answers[i] for i in qa['answers']])
        # pad the questions and answers to fixed lengths
        questions = self.pad(questions, self.enc_timesteps)
        good_answers = self.pad(good_answers, self.dec_timesteps)
        # negative examples: answers drawn at random from the answer pool
        bad_answers = self.pad(random.sample(list(self.answers.values()), len(good_answers)), self.dec_timesteps)
        return questions, good_answers, bad_answers

    def process_data(self, d):
        """
        Process the prediction data.
        """
        indices = d['good'] + d['bad']
        answers = self.pad([self.answers[i] for i in indices], self.dec_timesteps)
        question = self.pad([d['question']] * len(indices), self.enc_timesteps)
        return indices, answers, question

    def process_test_data(self, question, answers):
        """
        Process the test data: convert tokens to ids and pad.
        """
        answer_unpadded = []
        for answer in answers:
            print(answer.split(' '))
            answer_unpadded.append([self.vocabulary[word] for word in answer.split(' ')])
        answers = self.pad(answer_unpadded, self.dec_timesteps)
        question = self.pad([[self.vocabulary[word] for word in question.split(' ')]] * len(answers), self.enc_timesteps)
        return answers, question
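

# Minimal usage sketch (assumes the pickled data files referenced above exist
# under ./data/ and that Keras is installed); not part of the original module.
if __name__ == '__main__':
    qa_data = QAData()
    questions, good_answers, bad_answers = qa_data.get_training_data()
    # Each array is padded to a fixed length: questions to enc_timesteps (150),
    # answers to dec_timesteps (150); row i pairs a question with one of its
    # ground-truth answers and a randomly sampled negative answer.
    print(questions.shape, good_answers.shape, bad_answers.shape)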