"""
Various utility functions for loading data and performing other common operations.
Some of this code is based on https://github.com/ChunML/seq2seq/blob/master/seq2seq_utils.py
"""
import os
from collections import defaultdict, Counter, OrderedDict

import numpy as np
from nltk import FreqDist
from scipy.special import logsumexp

import keras.utils
import keras.backend as K
from keras.layers import Layer
from keras.preprocessing import sequence
from keras.preprocessing.text import text_to_word_sequence

# Special tokens
EXTRA_SYMBOLS = ['<PAD>', '<START>', '<UNK>', '<EOS>']
DIR = os.path.dirname(os.path.realpath(__file__))
def load_words(source, vocab_size=10000, limit=None, max_length=None):
"""
Loads sentences (or other natural language sequences) from a text file. Assumes a single sequence per line.
:param source: Text file to read from
    :param vocab_size: Maximum size of the vocabulary, including the special symbols. If there are more unique words
        than this, only the most frequent words are retained and the rest are replaced by the <UNK> symbol.
    :param limit: If not None, only the first "limit" characters are read. Useful for debugging on large corpora.
    :param max_length: If not None, any sentence containing more words than this is removed.
    :return: (1) A list of lists of integers representing the encoded sentences, (2) a dict from strings to ints
        representing the mapping from words to indices, (3) a list of strings representing the mapping from indices
        to words.
"""
    # Reading raw text from the source file
    with open(source, 'r') as f:
        x_data = f.read()
    print('raw data read')
if limit is not None:
x_data = x_data[:limit]
# Splitting raw text into array of sequences
    x = [text_to_word_sequence(line) for line in x_data.split('\n') if len(line) > 0]
if max_length is not None:
x = [s for s in x if len(s) <= max_length]
    # Creating the vocabulary set with the most common words (leaving room for the special symbols)
dist = FreqDist(np.hstack(x))
x_vocab = dist.most_common(vocab_size - len(EXTRA_SYMBOLS))
    # Creating an array of words from the vocabulary set; we will use this array as the index-to-word mapping
    i2w = [word[0] for word in x_vocab]
    # Prepending the special symbols (<PAD>, <START>, <UNK>, <EOS>) to the array
    i2w = EXTRA_SYMBOLS + i2w
    # Creating the word-to-index dictionary from the array created above
    w2i = {word: ix for ix, word in enumerate(i2w)}
# Converting each word to its index value
for i, sentence in enumerate(x):
for j, word in enumerate(sentence):
if word in w2i:
x[i][j] = w2i[word]
else:
x[i][j] = w2i['<UNK>']
return x, w2i, i2w
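# A minimal usage sketch for load_words (the file path and keyword values below are illustrative, not part of this
# module):
#
#   x, w2i, i2w = load_words('data/sentences.txt', vocab_size=5000, max_length=50)
#   print(' '.join(i2w[ix] for ix in x[0]))   # decode the first sentence back into words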
def load_characters(source, length=None, limit=None):
    """
    Reads a text file as a stream of characters. The stream is cut into chunks of equal size.
    :param source: The text file to read
    :param length: The size of the chunks. If None, the stream is split on line-ends and the resulting sequences
        have variable length.
    :param limit: If not None, only the first "limit" characters are read. Useful for debugging on large corpora.
    :return: (1) A list of lists of integers representing the encoded character sequences, (2) a dict from characters
        to ints representing the mapping from characters to indices, (3) a list of characters representing the
        mapping from indices to characters.
    """
    # Reading raw text from the source file
    with open(source, 'r') as f:
        x_data = f.read()
    print('raw data read')
if limit is not None:
x_data = x_data[:limit]
# Splitting raw text into array of sequences
if length is None:
x = [list(line) for line in x_data.split('\n') if len(line) > 0]
else:
x = [list(chunk) for chunk in chunks(x_data, length)]
    # Collecting the set of all characters that occur in the data
chars = set()
for line in x:
for char in line:
chars.add(char)
    # Creating an array of characters from the vocabulary set; we will use this array as the index-to-character mapping
    i2c = list(chars)
    # Prepending the special symbols (<PAD>, <START>, <UNK>, <EOS>) to the array
    i2c = EXTRA_SYMBOLS + i2c
    # Creating the character-to-index dictionary from the array created above
    c2i = {char: ix for ix, char in enumerate(i2c)}
    # Converting each character to its index value
    for i, sentence in enumerate(x):
        for j, char in enumerate(sentence):
            if char in c2i:
                x[i][j] = c2i[char]
            else:
                x[i][j] = c2i['<UNK>']
return x, c2i, i2c
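# A minimal usage sketch for the character loader (the file path and chunk length are illustrative):
#
#   x, c2i, i2c = load_characters('data/text.txt', length=100)
#   print(''.join(i2c[ix] for ix in x[0]))   # decode the first chunk back into characters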
def process_data(word_sentences, max_len, word_to_ix):
    """
    One-hot encodes a list of index sequences into a tensor of shape (num_sentences, max_len, vocab_size).
    :param word_sentences: A list of integer sequences (each word already converted to its index)
    :param max_len: The sequence length of the output tensor
    :param word_to_ix: The word-to-index dictionary (only its size is used)
    :return: A binary tensor of shape (len(word_sentences), max_len, len(word_to_ix))
    """
    # Vectorizing each element in each sequence
sequences = np.zeros((len(word_sentences), max_len, len(word_to_ix)))
for i, sentence in enumerate(word_sentences):
for j, word in enumerate(sentence):
sequences[i, j, word] = 1.
return sequences
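# A usage sketch, assuming "x" and "w2i" come from load_words as above; the padding length of 20 is illustrative:
#
#   padded = sequence.pad_sequences(x, maxlen=20, padding='post')
#   targets = process_data(padded, max_len=20, word_to_ix=w2i)   # shape: (len(x), 20, len(w2i))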
def batch_pad(x, batch_size, min_length=3, add_eos=False, extra_padding=0):
"""
Takes a list of integer sequences, sorts them by lengths and pads them so that sentences in each batch have the
same length.
:param x:
:return: A list of tensors containing equal-length sequences padded to the length of the longest sequence in the batch
"""
x = sorted(x, key=lambda l : len(l))
if add_eos:
eos = EXTRA_SYMBOLS.index('<EOS>')
x = [sent + [eos,] for sent in x]
batches = []
start = 0
while start < len(x):
end = start + batch_size
if end > len(x):
end = len(x)
batch = x[start:end]
mlen = max([len(l) + extra_padding for l in batch])
if mlen >= min_length:
batch = sequence.pad_sequences(batch, maxlen=mlen, dtype='int32', padding='post', truncating='post')
batches.append(batch)
start += batch_size
print('max length per batch: ', [max([len(l) for l in batch]) for batch in batches])
return batches
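# A usage sketch, assuming "x" comes from load_words; the batch size is illustrative:
#
#   batches = batch_pad(x, batch_size=32, add_eos=True)
#   # each element of "batches" is an int32 matrix of shape (<= 32, longest sequence length in that batch)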
def to_categorical(batch, num_classes):
"""
Converts a batch of length-padded integer sequences to a one-hot encoded sequence
:param batch:
:param num_classes:
:return:
"""
b, l = batch.shape
out = np.zeros((b, l, num_classes))
for i in range(b):
        seq = batch[i, :]
out[i, :, :] = keras.utils.to_categorical(seq, num_classes=num_classes)
return out
def chunks(l, n):
"""Yield successive n-sized chunks from l."""
for i in range(0, len(l), n):
yield l[i:i + n]
def sample(preds, temperature=1.0):
"""
Sample an index from a probability vector
:param preds:
:param temperature:
:return:
"""
preds = np.asarray(preds).astype('float64')
if temperature == 0.0:
return np.argmax(preds)
preds = np.log(preds) / temperature
exp_preds = np.exp(preds)
preds = exp_preds / np.sum(exp_preds)
probas = np.random.multinomial(1, preds, 1)
return np.argmax(probas)
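# A usage sketch: picking the next token from a language model's softmax output. The model call and indexing are
# illustrative; any probability vector over the vocabulary works.
#
#   probs = model.predict(batch)[0, -1, :]
#   next_token = sample(probs, temperature=0.8)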
def sample_logits(preds, temperature=1.0):
"""
Sample an index from a logit vector.
:param preds:
:param temperature:
:return:
"""
preds = np.asarray(preds).astype('float64')
if temperature == 0.0:
return np.argmax(preds)
preds = preds / temperature
preds = preds - logsumexp(preds)
choice = np.random.choice(len(preds), 1, p=np.exp(preds))
return choice
class KLLayer(Layer):
"""
Identity transform layer that adds KL divergence
to the final model loss.
During training, call
K.set_value(kl_layer.weight, new_value)
to scale the KL loss term.
based on:
http://tiao.io/posts/implementing-variational-autoencoders-in-keras-beyond-the-quickstart-tutorial/
"""
    def __init__(self, weight=None, *args, **kwargs):
self.is_placeholder = True
self.weight = weight
super().__init__(*args, **kwargs)
def call(self, inputs):
mu, log_var = inputs
kl_batch = - .5 * K.sum(1 + log_var -
K.square(mu) -
K.exp(log_var), axis=-1)
loss = K.mean(kl_batch)
if self.weight is not None:
loss = loss * self.weight
self.add_loss(loss, inputs=inputs)
return inputs
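# A usage sketch for KLLayer (the layer names and "latent_dim" are illustrative, not part of this module). The layer
# is an identity on (mu, log_var) that registers the KL term as an extra loss; the weight can be annealed with
# K.set_value during training:
#
#   kl_weight = K.variable(0.0)
#   z_mean, z_log_var = KLLayer(weight=kl_weight)([z_mean, z_log_var])
#   ...
#   K.set_value(kl_weight, 0.5)   # e.g. from a training callback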
class Sample(Layer):
"""
Performs sampling step
"""
def __init__(self, *args, **kwargs):
self.is_placeholder = True
super().__init__(*args, **kwargs)
def call(self, inputs):
mu, log_var, eps = inputs
z = K.exp(.5 * log_var) * eps + mu
return z
def compute_output_shape(self, input_shape):
shape_mu, _, _ = input_shape
return shape_mu
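# A usage sketch for the reparameterization trick (the "eps" input, "Input" layer and "latent_dim" are illustrative);
# "eps" is a standard-normal noise tensor supplied by the caller:
#
#   eps = Input(shape=(latent_dim,))
#   z = Sample()([z_mean, z_log_var, eps])   # z = mu + exp(0.5 * log_var) * eps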
def interpolate(start, end, steps):
    """
    Linearly interpolates between two vectors, returning a matrix of shape (steps + 2, dim) whose first row is
    "start" and whose last row is "end".
    """
    result = np.zeros((steps + 2, start.shape[0]))
for i, d in enumerate(np.linspace(0,1, steps+2)):
result[i, :] = start * (1-d) + end * d
return result
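# A usage sketch: interpolating between two latent vectors (the names "z_a" and "z_b" are illustrative):
#
#   path = interpolate(z_a, z_b, steps=4)   # shape (6, latent_dim); first row is z_a, last row is z_b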
class OrderedCounter(Counter, OrderedDict):
'Counter that remembers the order elements are first encountered'
def __repr__(self):
return '%s(%r)' % (self.__class__.__name__, OrderedDict(self))
def __reduce__(self):
return self.__class__, (OrderedDict(self),)
def idx2word(idx, i2w, pad_idx):
    """
    Decodes a batch of index sequences back into space-separated strings, stopping at the first padding symbol.
    Note that "i2w" is expected to be a dictionary keyed by *string* indices, and each word id is expected to be a
    scalar tensor (it is converted with .item()).
    """
    sent_str = [str()] * len(idx)
for i, sent in enumerate(idx):
for word_id in sent:
if word_id == pad_idx:
break
sent_str[i] += i2w[str(word_id.item())] + " "
sent_str[i] = sent_str[i].strip()
return sent_str
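# A usage sketch for idx2word (the variable names are illustrative); note that the "i2w" argument here is a dict
# keyed by string indices, unlike the list returned by load_words:
#
#   sentences = idx2word(samples, i2w=i2w_dict, pad_idx=w2i['<PAD>'])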