rest_helper.py
import re

import nltk
import numpy as np
from nltk.corpus import stopwords

# Requires the NLTK "punkt" and "stopwords" data packages:
#   nltk.download("punkt"); nltk.download("stopwords")
class MyTokenizer:
    """Splits each document into sentences, then into word tokens."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        transformed_X = []
        for document in X:
            tokenized_doc = []
            for sent in nltk.sent_tokenize(document):
                tokenized_doc += nltk.word_tokenize(sent)
            transformed_X.append(np.array(tokenized_doc))
        # dtype=object keeps the ragged per-document token arrays intact;
        # plain np.array() on ragged input raises an error in recent NumPy.
        return np.array(transformed_X, dtype=object)

    def fit_transform(self, X, y=None):
        return self.transform(X)
class MeanEmbeddingVectorizer(object):
    """Represents each document as the mean of its word2vec word vectors."""

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # If a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors.
        # (wv.vector_size replaces the wv.syn0 attribute removed in gensim 4.)
        self.dim = word2vec.wv.vector_size

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = MyTokenizer().fit_transform(X)
        return np.array([
            np.mean([self.word2vec.wv[w] for w in words if w in self.word2vec.wv]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

    def fit_transform(self, X, y=None):
        return self.transform(X)
def preprocess(raw_text):
    """Lowercase, strip punctuation, and drop English stopwords."""
    no_punc = re.sub("[^a-zA-Z0-9]", " ", raw_text)
    words = no_punc.lower().split()
    stops = set(stopwords.words("english"))
    meaningful_words = " ".join([w for w in words if w not in stops])
    return meaningful_words
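
# A minimal usage sketch of the helpers above, assuming a gensim Word2Vec model
# trained on the same corpus. The model, the sample documents, and the chosen
# hyperparameters (vector_size=50, min_count=1) are illustrative assumptions,
# not part of the original file.
if __name__ == "__main__":
    from gensim.models import Word2Vec

    docs = [
        "The quick brown fox jumps over the lazy dog.",
        "A fast auburn fox leapt over a sleepy hound.",
    ]

    # Clean the raw text, then tokenize it for Word2Vec training.
    cleaned = [preprocess(d) for d in docs]
    tokens = MyTokenizer().fit_transform(cleaned)
    model = Word2Vec(sentences=[list(t) for t in tokens],
                     vector_size=50, min_count=1)

    # Turn each document into a single averaged embedding.
    vectorizer = MeanEmbeddingVectorizer(model)
    doc_vectors = vectorizer.fit_transform(cleaned)
    print(doc_vectors.shape)  # (2, 50)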