remove_words.py
74 lines (57 loc) · 1.81 KB
# Build data/corpus/<dataset>.clean.txt by removing stop words and rare words
# from the raw corpus, then report document length statistics.
import sys

import nltk
from nltk.corpus import stopwords

from utils import clean_str, clean_str_sst, loadWord2Vec

if len(sys.argv) < 2:
    sys.exit("Use: python remove_words.py <dataset>")

dataset = sys.argv[1]

# SST corpora use a dedicated cleaning function; all other datasets use clean_str
if 'SST' in dataset:
    func = clean_str_sst
else:
    func = clean_str

# optional second argument: minimum word frequency to keep (default 5)
try:
    least_freq = int(sys.argv[2])
except (IndexError, ValueError):
    least_freq = 5
    print('using default least word frequency = 5')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
print(stop_words)

# read the raw corpus; one document per line
doc_content_list = []
with open('data/corpus/' + dataset + '.txt', 'rb') as f:
    for line in f.readlines():
        doc_content_list.append(line.strip().decode('latin1'))

# count word frequencies so that rare words can be removed later
word_freq = {}
for doc_content in doc_content_list:
    temp = func(doc_content)
    words = temp.split()
    for word in words:
        if word in word_freq:
            word_freq[word] += 1
        else:
            word_freq[word] = 1
# rebuild each document; for the mr and SST datasets every word is kept,
# otherwise stop words and words below the frequency threshold are dropped
clean_docs = []
for doc_content in doc_content_list:
    temp = func(doc_content)
    words = temp.split()
    doc_words = []
    for word in words:
        if dataset == 'mr' or 'SST' in dataset:
            doc_words.append(word)
        elif word not in stop_words and word_freq[word] >= least_freq:
            doc_words.append(word)
    doc_str = ' '.join(doc_words).strip()
    clean_docs.append(doc_str)
# write the cleaned corpus, one document per line
clean_corpus_str = '\n'.join(clean_docs)
with open('data/corpus/' + dataset + '.clean.txt', 'w') as f:
    f.write(clean_corpus_str)

# report document length statistics for the cleaned corpus
len_list = []
with open('data/corpus/' + dataset + '.clean.txt', 'r') as f:
    for line in f.readlines():
        if line == '\n':
            continue
        temp = line.strip().split()
        len_list.append(len(temp))

print('min_len : ' + str(min(len_list)))
print('max_len : ' + str(max(len_list)))
print('average_len : ' + str(sum(len_list) / len(len_list)))
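
# Usage sketch (the invocations below are illustrative, not taken from the repo docs):
# the script reads data/corpus/<dataset>.txt and writes data/corpus/<dataset>.clean.txt,
# so with the 'mr' corpus in place a run might look like
#   python remove_words.py mr        # uses the default least word frequency of 5
#   python remove_words.py mr 10     # keeps only words appearing at least 10 times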