forked from shimech/boston_terrier
-
Notifications
You must be signed in to change notification settings - Fork 0
/
text_rank_w2v.py
46 lines (36 loc) · 1.36 KB
/
text_rank_w2v.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# -*- coding:utf-8 -*-
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
import math
import numpy
from gensim.models import word2vec
# Shared word2vec model, loaded once at import time from a path relative to the
# current working directory. It is read by
# TextRankSummarizer2._rate_sentences_edge and updated/re-saved by add_vocab().
w2v_model = word2vec.Word2Vec.load("models/update.bin")
class TextRankSummarizer2(TextRankSummarizer):
    """TextRank summarizer variant that scores sentence pairs with word2vec.

    Defines ``_rate_sentences_edge`` (presumably replacing the base-class
    version -- confirm against ``TextRankSummarizer``) so that word pairs
    known to the module-level ``w2v_model`` contribute their embedding
    similarity, while pairs containing an out-of-vocabulary word fall back
    to exact-match counting.
    """

    @staticmethod
    def _rate_sentences_edge(words1, words2):
        """Return the edge weight between two sentences.

        Args:
            words1: Sized iterable of the words in the first sentence.
            words2: Sized iterable of the words in the second sentence.

        Returns:
            ``0.0`` when the accumulated pairwise score is exactly zero
            (this includes either sentence being empty). Otherwise the
            score divided by ``log(len(words1)) + log(len(words2))``, or
            the unnormalised score when that normaliser is close to zero.

        NOTE(review): embedding similarities can be negative, so the
        returned weight is not guaranteed to be non-negative -- confirm the
        ranking step tolerates that.
        """
        # Loop-invariant lookup; also the object that owns ``similarity``.
        vectors = w2v_model.wv

        rank = 0
        for w1 in words1:
            for w2 in words2:
                if w1 in vectors and w2 in vectors:
                    # Query the keyed vectors directly: the model-level
                    # ``similarity`` shortcut is deprecated and was removed
                    # in gensim 4.
                    rank += vectors.similarity(w1, w2)
                else:
                    # At least one word is unknown to the model: count an
                    # exact match as 1, anything else as 0.
                    rank += int(w1 == w2)

        if rank == 0:
            return 0.0

        # A non-zero rank means both loops ran at least once, so neither
        # sentence is empty and both logarithms below are defined.
        norm = math.log(len(words1)) + math.log(len(words2))
        if numpy.isclose(norm, 0.):
            # Both sentences consist of a single word, so dividing by the
            # normaliser is impossible. ``rank`` is a similarity score here
            # and may be any float, not just 0 or 1, so it is returned as is
            # rather than asserted to be 0 or 1.
            return rank * 1.0
        return rank / norm
def add_vocab(inputText):
    """Extend the shared word2vec model with a new corpus and save it.

    Args:
        inputText: Path of the corpus file, handed to
            ``word2vec.Text8Corpus``.

    Side effects:
        Updates the vocabulary of the module-level ``w2v_model`` in place,
        trains it on the new corpus, and overwrites ``models/update.bin``
        with the result.
    """
    sentences = word2vec.Text8Corpus(inputText)
    w2v_model.build_vocab(sentences, update=True)

    # ``iter`` is the legacy name of the epoch count; it is deprecated in
    # favour of ``epochs`` and no longer exists in gensim 4. Prefer the new
    # attribute and only fall back for models that predate it.
    epochs = getattr(w2v_model, "epochs", None)
    if epochs is None:
        epochs = w2v_model.iter

    w2v_model.train(
        sentences,
        total_examples=w2v_model.corpus_count,
        epochs=epochs,
    )
    w2v_model.save("models/update.bin")
# When executed as a script, fold the corpus file "corps.txt" into the saved
# model via add_vocab().
if __name__ == "__main__":
    add_vocab("corps.txt")