-
Notifications
You must be signed in to change notification settings - Fork 76
/
Copy pathsentence2vec_demo.py
91 lines (79 loc) · 3.36 KB
/
sentence2vec_demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/python3
#
# Copyright 2016-2022 Peter de Vocht
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from sentence2vec import Word, Sentence, sentence_to_vec
# normalised inner product (cosine similarity) of two vectors
def inner_product(v1, v2):
    """Return the cosine similarity of *v1* and *v2*, rounded to 4 decimals.

    The dot product of the two vectors is divided by the product of their
    Euclidean lengths, so the result is the normalised inner product rather
    than the raw one.

    Args:
        v1: sequence of numbers.
        v2: sequence of numbers.

    Returns:
        The similarity rounded to four decimal places, or 0.0 when the
        vectors differ in length or when either one has zero length
        (including two empty vectors).
    """
    # vectors of different dimension are treated as having no similarity
    if len(v1) != len(v2):
        return 0.0

    dot = 0.0
    norm_sq_1 = 0.0
    norm_sq_2 = 0.0
    for a, b in zip(v1, v2):
        norm_sq_1 += a * a
        norm_sq_2 += b * b
        dot += a * b

    denominator = math.sqrt(norm_sq_1) * math.sqrt(norm_sq_2)
    # a zero-length vector has no direction; avoid dividing by zero
    if denominator == 0.0:
        return 0.0
    return round(dot / denominator, 4)
if __name__ == '__main__':
    glove_file = 'glove/glove.6B.100d.txt'
    embedding_size = 100  # dimension of glove, must match glove_file size (50 = 50d, 100 = 100d, etc)

    # Load the GloVe set from file: each usable line is a word followed by
    # embedding_size space-separated floats.  The encoding is given explicitly
    # because the GloVe vocabulary contains non-ASCII tokens and the platform
    # default encoding is not guaranteed to be UTF-8.
    glove_dict = {}
    with open(glove_file, 'rt', encoding='utf-8') as reader:
        for line in reader:
            fields = line.strip().split(' ')
            # skip any line that does not carry exactly embedding_size values
            if len(fields) != embedding_size + 1:
                continue
            glove_dict[fields[0]] = [float(item) for item in fields[1:]]

    # load some simple sentences for testing similarities between;
    # one sentence per non-blank line, tokenised on single spaces
    sentences = []
    with open('semantic_test_text.txt', encoding='utf-8') as reader:
        for line in reader:
            text = line.strip()
            if text:
                sentences.append(text.split(' '))

    # convert the above sentences to lists of Word objects using the GloVe
    # vectors (lookup is case-insensitive, the original casing is kept as text)
    sentence_list = []
    for sentence in sentences:
        word_list = [
            Word(word, glove_dict[word.lower()])
            for word in sentence
            if word.lower() in glove_dict  # ignore OOVs
        ]
        if word_list:  # did we find any words (not an empty set)
            sentence_list.append(Sentence(word_list))

    # apply single sentence word embedding
    sentence_vector_lookup = {}
    sentence_vectors = sentence_to_vec(sentence_list, embedding_size)  # all vectors converted together
    # only build the lookup when there is exactly one vector per sentence
    if len(sentence_vectors) == len(sentence_list):
        for sentence, vector in zip(sentence_list, sentence_vectors):
            # map: text of the sentence -> vector
            sentence_vector_lookup[str(sentence)] = vector

    # go through each sentence and compare it with each other sentence and find the best match
    for text1, vector1 in sentence_vector_lookup.items():
        best_match = ''
        # NOTE: starting at 0.0 means only strictly positive similarities
        # count; a sentence with no positive score reports an empty match
        best_score = 0.0
        for text2, vector2 in sentence_vector_lookup.items():
            if text2 == text1:  # only when different
                continue
            match = inner_product(vector1, vector2)
            if match > best_score:
                best_score = match
                best_match = text2
        print("best match \"{}\" => \"{}\" (score {})".format(text1, best_match, str(best_score)))