# summarizer_data_utils.py
import os
import time
import re
import html
from collections import Counter
import nltk
import numpy as np


def preprocess_sentence(text, keep_most=False):
    """
    Helper function to remove HTML artifacts, unnecessary spaces
    and punctuation.
    Args:
        text: String.
        keep_most: Boolean. If True, keep letters, digits and the
                   characters %!?.,:()/; if False, keep only letters
                   and digits.
    Returns:
        The processed text.
    """
    text = text.lower()
    text = fixup(text)
    text = re.sub(r"<br />", " ", text)
    if keep_most:
        text = re.sub(r"[^a-z0-9%!?.,:()/]", " ", text)
    else:
        text = re.sub(r"[^a-z0-9]", " ", text)
    # collapse the runs of spaces introduced by the substitutions above
    text = re.sub(r" +", " ", text)
    text = text.strip()
    return text
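
# Illustrative behaviour (hypothetical inputs, shown as comments so the
# module stays import-safe):
#   preprocess_sentence("Great movie!<br />10/10, would watch again.")
#   -> 'great movie 10 10 would watch again'
#   preprocess_sentence("Great movie!<br />10/10, would watch again.", keep_most=True)
#   -> 'great movie! 10/10, would watch again.'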


def fixup(x):
    """
    Cleans up common text artifacts and partially stripped HTML entities
    (e.g. '#39;' -> "'", 'amp;' -> '&') and collapses repeated spaces.
    e.g. fixup("it#39;s a movie amp; a half") -> "it's a movie & a half"
    """
    re1 = re.compile(r' +')
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>', 'u_n').replace(' @.@ ', '.').replace(
        ' @-@ ', '-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))


def preprocess(text, keep_most=False):
    """
    Splits the text into sentences, then preprocesses
    and tokenizes each sentence.
    Args:
        text: String containing one or more sentences.
        keep_most: Boolean. If True, keep letters, digits and basic
                   punctuation; if False, keep only letters and digits.
    Returns:
        A flat list of tokens for the whole text.
    """
    tokenized = []
    for sentence in nltk.sent_tokenize(text):
        sentence = preprocess_sentence(sentence, keep_most)
        tokenized.extend(nltk.word_tokenize(sentence))
    return tokenized
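
# nltk.sent_tokenize/word_tokenize require the 'punkt' tokenizer models;
# if they are missing, run nltk.download('punkt') once. Illustrative call:
#   preprocess("Great movie. Would watch again!")
#   -> ['great', 'movie', 'would', 'watch', 'again']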


def preprocess_texts_and_summaries(texts,
                                   summaries,
                                   keep_most=False):
    """
    Iterates over the given lists of texts and summaries and tokenizes
    each one using the preprocess() function, while counting up all the
    words that occur in the texts and summaries.
    Returns:
        - processed texts
        - processed summaries
        - a list of (word, count) pairs for all unique words,
          sorted by count in descending order.
    """
    start_time = time.time()
    processed_texts = []
    processed_summaries = []
    words = []
    for text in texts:
        text = preprocess(text, keep_most)
        words.extend(text)
        processed_texts.append(text)
    for summary in summaries:
        summary = preprocess(summary, keep_most)
        words.extend(summary)
        processed_summaries.append(summary)
    words_counted = Counter(words).most_common()
    print('Processing Time: ', time.time() - start_time)
    return processed_texts, processed_summaries, words_counted
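
# Typical usage on a toy corpus (hypothetical data):
#   texts = ["Great movie. Would watch again!"]
#   summaries = ["great fun"]
#   p_texts, p_summaries, counts = preprocess_texts_and_summaries(texts, summaries)
#   counts -> [('great', 2), ('movie', 1), ('would', 1), ...]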


def create_word_inds_dicts(words_counted,
                           specials=None,
                           min_occurences=0):
    """
    Creates lookup dicts from word to index and back.
    Returns the lookup dicts and a list of the words that were
    left out because they occur too rarely.
    """
    missing_words = []
    word2ind = {}
    ind2word = {}
    i = 0
    # special tokens (e.g. <PAD>, <UNK>, <SOS>, <EOS>) get the lowest indices
    if specials is not None:
        for sp in specials:
            word2ind[sp] = i
            ind2word[i] = sp
            i += 1
    for (word, count) in words_counted:
        if count >= min_occurences:
            word2ind[word] = i
            ind2word[i] = word
            i += 1
        else:
            missing_words.append(word)
    return word2ind, ind2word, missing_words
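
# convert_sentence()/convert_to_inds() below look up the '<UNK>', '<EOS>'
# and '<SOS>' tokens, so they should be passed as specials here, e.g.
# (the exact token set and threshold are the caller's choice):
#   word2ind, ind2word, missing = create_word_inds_dicts(
#       counts, specials=['<PAD>', '<UNK>', '<SOS>', '<EOS>'], min_occurences=2)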


def convert_sentence(review, word2ind):
    """
    Converts the given sentence (list of tokens) to the integer indices
    in word2ind. Out-of-vocabulary words are mapped to '<UNK>' and
    collected separately.
    """
    inds = []
    unknown_words = []
    for word in review:
        if word in word2ind:
            inds.append(int(word2ind[word]))
        else:
            inds.append(int(word2ind['<UNK>']))
            unknown_words.append(word)
    return inds, unknown_words
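
# e.g. with word2ind = {'<UNK>': 1, 'great': 4, 'movie': 5} (hypothetical):
#   convert_sentence(['great', 'movie', 'cast'], word2ind)
#   -> ([4, 5, 1], ['cast'])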


def convert_to_inds(inputs, word2ind, eos=False, sos=False):
    """
    Converts a list of tokenized sentences to lists of indices,
    optionally appending an '<EOS>' and/or prepending an '<SOS>' token.
    Also returns the set of all unknown words that were encountered.
    """
    converted_input = []
    all_unknown_words = set()
    for inp in inputs:
        converted_inp, unknown_words = convert_sentence(inp, word2ind)
        if eos:
            converted_inp.append(word2ind['<EOS>'])
        if sos:
            converted_inp.insert(0, word2ind['<SOS>'])
        converted_input.append(converted_inp)
        all_unknown_words.update(unknown_words)
    return converted_input, all_unknown_words
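
# In a seq2seq summarizer the target summaries are typically converted
# with the start/end markers while the input texts are not, e.g.
# (hypothetical variable names):
#   text_inds, _ = convert_to_inds(p_texts, word2ind)
#   summary_inds, _ = convert_to_inds(p_summaries, word2ind, eos=True, sos=True)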


def convert_inds_to_text(inds, ind2word):
    """ Converts the given indices back to a list of words. """
    words = [ind2word[ind] for ind in inds]
    return words


def load_pretrained_embeddings(path):
    """
    Loads pretrained embeddings from a text file (e.g. GloVe format)
    and stores each embedding in a dictionary keyed by its word.
    """
    embeddings = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split(' ')
            word = values[0]
            embedding_vector = np.array(values[1:], dtype='float32')
            embeddings[word] = embedding_vector
    return embeddings
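
# The file is expected to contain one embedding per line: the word,
# then its vector components, space-separated (hypothetical excerpt):
#   the 0.418 0.249 -0.412 ...
#   movie 0.308 0.172 -0.233 ...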


def create_and_save_embedding_matrix(word2ind,
                                     pretrained_embeddings_path,
                                     save_path,
                                     embedding_dim=300):
    """
    Creates an embedding matrix with one row per word in word2ind.
    If a word is in the pretrained embeddings, its pretrained vector
    is used; otherwise the row is initialized randomly.
    """
    pretrained_embeddings = load_pretrained_embeddings(pretrained_embeddings_path)
    embedding_matrix = np.zeros((len(word2ind), embedding_dim), dtype=np.float32)
    for word, i in word2ind.items():
        if word in pretrained_embeddings:
            embedding_matrix[i] = pretrained_embeddings[word]
        else:
            # no pretrained vector for this word: initialize uniformly at random
            embedding_matrix[i] = np.random.uniform(-1.0, 1.0, embedding_dim)
    save_dir = os.path.dirname(save_path)
    if save_dir and not os.path.exists(save_dir):
        os.makedirs(save_dir)
    np.save(save_path, embedding_matrix)
    return embedding_matrix
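

# Minimal end-to-end sketch of the utilities above (toy data, purely
# illustrative; requires the nltk 'punkt' models):
if __name__ == '__main__':
    texts = ["Great movie. Would watch again!",
             "Terrible plot, but the cast was great."]
    summaries = ["great fun", "weak plot"]
    p_texts, p_summaries, counts = preprocess_texts_and_summaries(texts, summaries)
    word2ind, ind2word, missing = create_word_inds_dicts(
        counts, specials=['<PAD>', '<UNK>', '<SOS>', '<EOS>'])
    text_inds, _ = convert_to_inds(p_texts, word2ind)
    summary_inds, _ = convert_to_inds(p_summaries, word2ind, eos=True, sos=True)
    print(convert_inds_to_text(text_inds[0], ind2word))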