-
Notifications
You must be signed in to change notification settings - Fork 0
/
run.py
74 lines (60 loc) · 2.23 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/env python
import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time
from word2vec import *
from sgd import *
# Check Python Version
import sys
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5
# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)
# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10
# Context size
C = 5
# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)
startTime=time.time()
wordVectors = np.concatenate(
((np.random.rand(nWords, dimVectors) - 0.5) /
dimVectors, np.zeros((nWords, dimVectors))),
axis=0)
wordVectors = sgd(
lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C,
negSamplingLossAndGradient),
wordVectors, 0.3, 40000, None, True, PRINT_EVERY=10)
# Note that normalization is not called here. This is not a bug,
# normalizing during training loses the notion of length.
print("sanity check: cost at convergence should be around or below 10")
print("training took %d seconds" % (time.time() - startTime))
# concatenate the input and output word vectors
wordVectors = np.concatenate(
(wordVectors[:nWords,:], wordVectors[nWords:,:]),
axis=0)
visualizeWords = [
"great", "cool", "brilliant", "wonderful", "well", "amazing",
"worth", "sweet", "enjoyable", "boring", "bad", "dumb",
"annoying", "female", "male", "queen", "king", "man", "woman", "rain", "snow",
"hail", "coffee", "tea"]
visualizeIdx = [tokens[word] for word in visualizeWords]
visualizeVecs = wordVectors[visualizeIdx, :]
temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
U,S,V = np.linalg.svd(covariance)
coord = temp.dot(U[:,0:2])
for i in range(len(visualizeWords)):
plt.text(coord[i,0], coord[i,1], visualizeWords[i],
bbox=dict(facecolor='green', alpha=0.1))
plt.xlim((np.min(coord[:,0]), np.max(coord[:,0])))
plt.ylim((np.min(coord[:,1]), np.max(coord[:,1])))
plt.savefig('word_vectors.png')