-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathWord2Vec_Generator.py
executable file
·85 lines (74 loc) · 2.63 KB
/
Word2Vec_Generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import torch
from torch.autograd import Variable
import numpy as np
import torch.functional as F
import torch.nn.functional as F
import pandas as pd
#
# features = pd.read_csv('feature_bow.csv',header=None) #
# corpus = features.values.reshape(-1)
# #
corpus = [
'he is a king',
'she is a queen',
'he is a man',
'she is a woman',
'warsaw is poland capital',
'berlin is germany capital',
'paris is france capital',
]
# print(corpus.values)
def tokenize_corpus(corpus):
tokens = [x.split() for x in corpus]
return tokens
tokenized_corpus = tokenize_corpus(corpus)
vocabulary = []
for sentence in tokenized_corpus:
for token in sentence:
if token not in vocabulary:
vocabulary.append(token)
word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}
vocabulary_size = len(vocabulary)
window_size = 2
idx_pairs = []
# for each sentence
for sentence in tokenized_corpus:
indices = [word2idx[word] for word in sentence]
# for each word, threated as center word
for center_word_pos in range(len(indices)):
# for each window position
for w in range(-window_size, window_size + 1):
context_word_pos = center_word_pos + w
# make soure not jump out sentence
if context_word_pos < 0 or context_word_pos >= len(indices) or center_word_pos == context_word_pos:
continue
context_word_idx = indices[context_word_pos]
idx_pairs.append((indices[center_word_pos], context_word_idx))
idx_pairs = np.array(idx_pairs) # it will be useful to have this as numpy array
def get_input_layer(word_idx):
x = torch.zeros(vocabulary_size).float()
x[word_idx] = 1.0
return x
embedding_dims = 5
W1 = Variable(torch.randn(embedding_dims, vocabulary_size).float(), requires_grad=True)
W2 = Variable(torch.randn(vocabulary_size, embedding_dims).float(), requires_grad=True)
num_epochs = 101
learning_rate = 0.001
for epo in range(num_epochs):
loss_val = 0
for data, target in idx_pairs:
x = Variable(get_input_layer(data)).float()
y_true = Variable(torch.from_numpy(np.array([target])).long())
z1 = torch.matmul(W1, x)
z2 = torch.matmul(W2, z1)
log_softmax = F.log_softmax(z2, dim=0)
loss = F.nll_loss(log_softmax.view(1, -1), y_true)
loss_val += loss.item()
loss.backward()
W1.data -= learning_rate * W1.grad.data
W2.data -= learning_rate * W2.grad.data
W1.grad.data.zero_()
W2.grad.data.zero_()
if epo % 10 == 0:
print(f'Loss at epo {epo}: {loss_val / len(idx_pairs)}')