forked from JJJavaLai/POSTagging
-
Notifications
You must be signed in to change notification settings - Fork 0
/
EmbeddingGloVe.py
81 lines (66 loc) · 2.48 KB
/
EmbeddingGloVe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import pickle
import os
import numpy as np
# Width of every embedding vector handled by this script.
EMBEDDING_DIMENSION = 300

# V1: the pickled GloVe table (indexed by word further down the script).
# NOTE: pickle.load executes arbitrary code from the file; only use it on
# pickles produced by this project.
with open('PickledData/Glove.pkl', 'rb') as glove_file:
    V1 = pickle.load(glove_file)
print('Total %s word vectors. \n' % len(V1))

# pickled_data supplies the three word collections used below:
#   train_words      - train word set
#   validation_words - validation word set
#   test_words       - test word set
with open('PickledData/pickled_data.pkl', 'rb') as data_file:
    pickled_data = pickle.load(data_file)
train_words, validation_words, test_words = (
    pickled_data[split_key]
    for split_key in ("train_words", "validation_words", "test_words")
)
def get_embedding_weights(vocabulary, dimension=None):
    """Stack the vectors of a word -> vector mapping into one 2-D array.

    Args:
        vocabulary: Mapping whose values are embedding vectors. Row ``i`` of
            the result holds the value of the ``i``-th key in the mapping's
            iteration order.
        dimension: Number of columns of the result. When omitted, the
            module-level ``EMBEDDING_DIMENSION`` is used.

    Returns:
        A NumPy array of shape ``(len(vocabulary), dimension)``. An empty
        mapping yields an array with zero rows.

    Raises:
        ValueError: If a vector cannot be assigned to a row of length
            ``dimension`` (raised by NumPy on the row assignment).
    """
    if dimension is None:
        dimension = EMBEDDING_DIMENSION
    # np.empty leaves the buffer uninitialised; every row is overwritten below.
    embedding_weights = np.empty((len(vocabulary), dimension))
    for row, vector in enumerate(vocabulary.values()):
        embedding_weights[row] = vector
    return embedding_weights
print("Original length of GloVe vocabulary: {} \n".format(len(V1)))

# Number of training words that have no vector in V1. Starts at 0 so the
# figure printed below is the actual count (it previously started at 1 and
# over-reported by one).
OOV1_counters = 0
train_vocabulary_size = len(train_words)

# V2 maps every training word to a vector: the V1 vector when one exists,
# otherwise a fresh uniform-random array of shape (1, EMBEDDING_DIMENSION).
# NOTE(review): V2 contains training words only, not all of V1, despite the
# "V2 = V1 + OOV1" wording of the message below.
V2 = {}
for word in train_words:
    if word in V1:
        V2[word] = V1[word]
    else:
        OOV1_counters += 1
        V2[word] = np.random.uniform(size=(1, EMBEDDING_DIMENSION))
print("Find {} words in OOV1".format(OOV1_counters))

# One row per V2 entry, in V2's insertion order.
embedding_weights_V2 = get_embedding_weights(V2)
print("V2 = V1 + OOV1, shape of V2: {} \n".format(embedding_weights_V2.shape))
# Number of validation words that have no entry in V2. Starts at 0 so the
# figure printed below is the actual count (it previously started at 1 and
# over-reported by one).
OOV2_counters = 0

# V3 maps every validation word to a vector: the V2 vector when one exists,
# otherwise a fresh uniform-random array of shape (1, EMBEDDING_DIMENSION).
# NOTE(review): only V2 is consulted here, and V2 is built from the training
# words. A validation word that is absent from train therefore gets a random
# vector even if V1 has one for it -- confirm this is intended.
V3 = {}
for word in validation_words:
    if word in V2:
        V3[word] = V2[word]
    else:
        OOV2_counters += 1
        V3[word] = np.random.uniform(size=(1, EMBEDDING_DIMENSION))
print("Find {} words in OOV2".format(OOV2_counters))

# One row per V3 entry, in V3's insertion order.
embedding_weights_V3 = get_embedding_weights(V3)
print("V3 = V1 + OOV1 + OOV2, shape of V3: {} \n".format(embedding_weights_V3.shape))
# Number of test words that have no entry in V3. Starts at 0 so the figure
# printed below is the actual count (it previously started at 1 and
# over-reported by one).
OOV3_counters = 0

# V4 maps every test word to a vector: the V3 vector when one exists,
# otherwise a fresh uniform-random array of shape (1, EMBEDDING_DIMENSION).
# NOTE(review): only V3 is consulted here, and V3 is built from the
# validation words. A test word that is absent from validation therefore gets
# a random vector even if V1 or V2 has one for it -- confirm this is intended.
V4 = {}
for word in test_words:
    if word in V3:
        V4[word] = V3[word]
    else:
        OOV3_counters += 1
        V4[word] = np.random.uniform(size=(1, EMBEDDING_DIMENSION))
print("Find {} words in OOV3".format(OOV3_counters))

# One row per V4 entry, in V4's insertion order.
embedding_weights_V4 = get_embedding_weights(V4)
print("V4 = V1 + OOV1 + OOV2 + OOV3, shape of V4: {} \n".format(embedding_weights_V4.shape))
# Create the output directory on first use, announcing it when that happens.
output_dir_missing = not os.path.exists('PickledData/')
if output_dir_missing:
    print('MAKING DIRECTORY PickledData/ to save pickled preprocessed data')
    os.makedirs('PickledData/')

# Write each embedding matrix to its own pickle file, in V2, V3, V4 order.
pickle_targets = (
    ('PickledData/V2.pkl', embedding_weights_V2),
    ('PickledData/V3.pkl', embedding_weights_V3),
    ('PickledData/V4.pkl', embedding_weights_V4),
)
for target_path, weight_matrix in pickle_targets:
    with open(target_path, 'wb') as out_file:
        pickle.dump(weight_matrix, out_file)