load_embedding.py
from gensim import models
import tensorflow as tf
import numpy as np


def load_embedding(session, vocab, emb, path, dim_embedding, vocab_size):
    '''
      session        TensorFlow session object
      vocab          A dictionary mapping token strings to vocabulary IDs
      emb            Embedding tensor of shape vocab_size x dim_embedding
      path           Path to the embedding file
      dim_embedding  Dimensionality of the external embedding
      vocab_size     Number of tokens in the vocabulary
    '''
    print("Loading external embeddings from %s" % path)
    model = models.KeyedVectors.load_word2vec_format(path, binary=False)

    external_embedding = np.zeros(shape=(vocab_size, dim_embedding))
    matches = 0
    for tok, idx in vocab.items():
        if tok in model.vocab:  # model.vocab is the gensim < 4.0 API; gensim >= 4.0 uses model.key_to_index
            external_embedding[idx] = model[tok]
            matches += 1
        else:
            print("%s not in embedding file" % tok)
            # Tokens missing from the embedding file get a small random initialization.
            external_embedding[idx] = np.random.uniform(low=-0.25, high=0.25, size=dim_embedding)

    print("%d words out of %d could be loaded" % (matches, vocab_size))

    pretrained_embeddings = tf.placeholder(tf.float32, [None, None])
    assign_op = emb.assign(pretrained_embeddings)
    session.run(assign_op, {pretrained_embeddings: external_embedding})  # here, embeddings are actually set
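A minimal usage sketch follows, assuming a TensorFlow 1.x graph-mode session; the vocabulary, dimensions, and the embedding file name `wordembeddings.word2vec` are hypothetical and not part of the original file.

if __name__ == "__main__":
    # Illustrative vocabulary and sizes (assumptions for demonstration only).
    vocab = {"the": 0, "cat": 1, "sat": 2}
    vocab_size, dim_embedding = len(vocab), 100

    # Embedding variable to be overwritten with the pretrained vectors.
    emb = tf.get_variable("embedding", shape=[vocab_size, dim_embedding], dtype=tf.float32)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        load_embedding(session, vocab, emb,
                       path="wordembeddings.word2vec",  # hypothetical embedding file
                       dim_embedding=dim_embedding,
                       vocab_size=vocab_size)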