#!/usr/bin/env python3
"""
Module to manage creation of input features/labels and Embeddings
"""
from abc import ABC, abstractmethod
from gensim.models import KeyedVectors
from numpy import array as nparray
from numpy import random, zeros
from tqdm import tqdm
import pymagnitude as magnitude


class Embeddings(ABC):
    """
    Abstract Base Class for Word Embeddings.
    Subclasses must implement the polymorphic method embeddings() to produce
    Embeddings from a list of tokens.
    """

    padding_marker = '__PADDING__'

    def __init__(self, dimensions):
        """
        :param int dimensions: Dimensions of the Word Embedding Vectors
        :return: Embeddings Object
        """
        self.dimensions = dimensions
        super().__init__()

    @abstractmethod
    def embeddings(self, tokens):
        """
        Take a list of tokens and return a list of Word Embeddings of the same size.
        :param list tokens: List of tokens to transform into Embeddings
        """
        raise NotImplementedError


class DummyEmbeddings(Embeddings):
    """
    Generates random numpy arrays as embeddings for development.
    """

    def embeddings(self, tokens):
        """
        Take a list of tokens and return a list of dummy Word Embeddings of the same size.
        :param list tokens: List of tokens to transform into Embeddings
        :return: List of numpy arrays with the given dimensions
        """
        return_list = []
        for token in tokens:
            if token == Embeddings.padding_marker:
                # Add zero vector for padding
                return_list.append(zeros(self.dimensions))
            else:
                # Add random vector
                return_list.append(random.rand(self.dimensions))
        return return_list
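

# Illustrative usage (added note): DummyEmbeddings lets the rest of the
# pipeline run without any pretrained vector files. Padding tokens map to
# zero vectors, all other tokens to random vectors:
#
#     dummy = DummyEmbeddings(300)
#     vecs = dummy.embeddings(['hello', Embeddings.padding_marker])
#     # vecs[0].shape == (300,); vecs[1] is all zeros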


class Word2Vec(Embeddings):
    """
    Handles the GoogleNews Word2Vec vectors.
    https://code.google.com/archive/p/word2vec/
    """

    def __init__(self, filepath='source/GoogleNews-vectors-negative300.bin', dimensions=300):
        """
        Load the pretrained Word2Vec vectors.
        :param string filepath: Path to gensim KeyedVectors file as *.bin
        :param int dimensions: Dimensions of the vectors (to generate zeros for padding)
        """
        super().__init__(dimensions)
        self.word2vec = KeyedVectors.load_word2vec_format(filepath, binary=True)

    def embeddings(self, tokens):
        """
        Transform a list of tokens into a list of embeddings.
        :param list tokens: List of tokens to transform into Embeddings
        :return: List of Word2Vec embeddings for the given tokens
        """
        return_list = []
        for token in tokens:
            if token == Embeddings.padding_marker:
                # Add zero vector for padding
                return_list.append(zeros(self.dimensions))
            elif token in self.word2vec:
                return_list.append(self.word2vec[token])
            else:
                # Unknown token: add a random vector
                return_list.append(random.rand(self.dimensions))
        return return_list


class Magnitudes(Embeddings):
    """
    Handles the Embeddings using pymagnitude.
    https://github.com/plasticityai/magnitude
    """

    def __init__(self, filepath='source/wiki-news-300d-1M-subword.magnitude', dimensions=300):
        """
        Load the pretrained Embeddings.
        :param string filepath: Path to pymagnitude file as *.magnitude
        :param int dimensions: Dimensions of the vectors (to generate zeros for padding)
        """
        super().__init__(dimensions)
        self.filepath = filepath
        self.vectors = magnitude.Magnitude(filepath)

    def embeddings(self, tokens):
        """
        Transform a list of tokens into a list of embeddings.
        :param list tokens: List of tokens to transform into Embeddings
        :return: List of embeddings for the given tokens
        """
        return_list = []
        for token in tokens:
            if token == Embeddings.padding_marker:
                # Add zero vector for padding
                return_list.append(zeros(self.dimensions))
            else:
                # pymagnitude handles out-of-vocabulary tokens itself by
                # querying for the most similar known word
                return_list.append(self.vectors.query(token))
        return return_list


def chunks(lst, max_len):
    """
    Break a list into chunks of size max_len.
    :param list lst: List of elements
    :param int max_len: Maximum length of the chunks
    :return: Generator yielding lists of at most max_len elements from the original list
    """
    for idx in range(0, len(lst), max_len):
        yield lst[idx:idx + max_len]
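

# Illustrative example (added note): chunks() is a generator, so materialize
# it with list(). For instance, list(chunks([1, 2, 3, 4, 5], 2)) gives
# [[1, 2], [3, 4], [5]] -- the final chunk may be shorter than max_len.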


def slice_it(list_of_lists, max_len=50):
    """
    Slices a list of lists into elements of at most max_len size.
    :param list list_of_lists: List of lists
    :param int max_len: Maximum length of the list elements
    :return: List of lists, with elements of at most max_len size
    """
    slices = []
    for elem in list_of_lists:
        slices.extend(chunks(elem, max_len))
    return slices


def add_padding(tokens, max_len=50, pad_value='__PADDING__'):
    """
    Pad a list of tokens to max_len length, using the given pad_value to
    produce dummy tokens for the padding. Lists longer than max_len are cut off.
    :param list tokens: List of tokens to add padding to
    :param int max_len: Maximum length of the new padded list
    :param string pad_value: Value to use for padding
    :return: List of tokens with padding at the end
    """
    if len(tokens) <= max_len:
        # Append pad_value to the end of a short token list
        return tokens + [pad_value] * (max_len - len(tokens))
    # Cut off if the sentence is too long
    return tokens[0:max_len]
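

# Illustrative example (added note): with max_len=4,
# add_padding(['a', 'b'], max_len=4) returns
# ['a', 'b', '__PADDING__', '__PADDING__'], while
# add_padding(['a', 'b', 'c', 'd', 'e'], max_len=4) is cut off to
# ['a', 'b', 'c', 'd'].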


def compile_input_and_labels_for_sentence(sentence, Vectors, max_len=50):
    """
    Pad the tokens, labels and POS tags of a sentence and transform the
    tokens into embeddings.
    :param list sentence: Sentence as a list of (token, label, pos) tuples
    :param Embeddings Vectors: Embeddings instance to generate embeddings from
    :param int max_len: Maximum length of the padded sentence
    :return: Tuple containing:
        x_inputs as list of Embeddings for the given sentence,
        y_labels as list of labels for the given sentence,
        z_pos as list of POS tags for the given sentence
    """
    # Unpack the tuples and pad each sequence to a fixed length
    padded_sentence_tokens = add_padding([token[0] for token in sentence], max_len=max_len)
    padded_sentence_labels = add_padding([label[1] for label in sentence], pad_value=-1, max_len=max_len)
    padded_sentence_postag = add_padding([pos[2] for pos in sentence], pad_value='PAD', max_len=max_len)
    x_inputs = Vectors.embeddings(padded_sentence_tokens)
    y_labels = padded_sentence_labels
    z_pos = padded_sentence_postag
    return x_inputs, y_labels, z_pos
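

# Illustrative example (added note; the tuple layout is inferred from the
# indexing above): a sentence is expected as a list of (token, label, pos)
# tuples, e.g.
#
#     sentence = [('The', 0, 'DET'), ('cat', 0, 'NOUN'), ('sat', 1, 'VERB')]
#     x, y, z = compile_input_and_labels_for_sentence(sentence, DummyEmbeddings(300))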


def generate_input_and_labels(sentences, Vectors, max_len=50):
    """
    Take a list of sentences and return the input data x (embeddings),
    the corresponding labels y and the POS tags z.
    :param list sentences: List of sentences as lists of tuples
    :param Embeddings Vectors: Embeddings instance to generate embeddings from
    :param int max_len: Maximum length of the padded sentences
    :return: Tuple containing:
        numpy array x, list of lists containing sentences as embeddings,
        numpy array y, list of lists containing labels for the sentences in x,
        list z, list of lists containing POS tags for the sentences in x
    """
    list_of_x = []
    list_of_y = []
    list_of_z = []
    # Break up long sentences into smaller chunks
    sliced_sentences = slice_it(sentences, max_len)
    for sentence in tqdm(sliced_sentences):
        x, y, z = compile_input_and_labels_for_sentence(sentence, Vectors, max_len=max_len)
        list_of_x.append(x)
        list_of_y.append(y)
        list_of_z.append(z)
    return nparray(list_of_x), nparray(list_of_y), list_of_z
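

# Minimal end-to-end sketch (added; assumes no pretrained vector files are
# available, so DummyEmbeddings stands in for Word2Vec/Magnitudes):
if __name__ == '__main__':
    example_sentences = [
        [('The', 0, 'DET'), ('cat', 0, 'NOUN'), ('sat', 1, 'VERB')],
        [('Dogs', 0, 'NOUN'), ('bark', 1, 'VERB')],
    ]
    x, y, z = generate_input_and_labels(example_sentences, DummyEmbeddings(300))
    # Two sentences, each padded to 50 tokens of 300-dimensional embeddings
    print(x.shape)  # (2, 50, 300)
    print(y.shape)  # (2, 50)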