-
Notifications
You must be signed in to change notification settings - Fork 0
/
Tokenize.py
37 lines (29 loc) · 841 Bytes
/
Tokenize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import sys
import os
from os.path import isfile, join
from Tokenizer import Tokenizer
import timeit
from multiprocessing import Pool
# Single Tokenizer instance shared by tok(); constructed once at import time
# with negate=False and html_special=False (the meaning of both flags is
# defined by the project's Tokenizer class).
t = Tokenizer(negate=False, html_special=False)
# Accumulator for the per-file tokenization results; main() extends it and
# the script entry point reads it.
TOKENS = []
def tok(file):
    """Read one UTF-8 text file and return its tokenization.

    Args:
        file: Path of the file to read.

    Returns:
        Whatever the module-level tokenizer ``t`` returns from
        ``t.tokenize`` for the file's full contents.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the contents are not valid UTF-8.
    """
    # The context manager closes the handle even when read() or tokenize()
    # raises, which the manual open()/close() pair did not.
    with open(file, 'r', encoding='utf8') as handle:
        return t.tokenize(handle.read())
def st():
    """Tokenize the files of every command-line folder one after another.

    Folder names come from ``sys.argv[1:]`` with duplicates removed. Only
    regular files directly inside each folder are passed to ``tok``; the
    results are discarded.
    """
    for directory in set(sys.argv[1:]):
        # Build the complete per-folder path list before tokenizing anything.
        candidates = (join(directory, entry) for entry in os.listdir(directory))
        paths = list(filter(isfile, candidates))
        for path in paths:
            tok(path)
def main(processes=8):
    """Tokenize the files of every command-line folder in parallel.

    Folder names come from ``sys.argv[1:]``; duplicates are dropped and the
    first occurrence keeps its position. Every regular file directly inside
    those folders is passed to ``tok`` in a worker process, and the per-file
    results are appended to the module-level ``TOKENS`` list in the same
    order as the collected file paths.

    Args:
        processes: Number of worker processes to start.

    Raises:
        OSError: If a folder cannot be listed.
    """
    # dict.fromkeys de-duplicates like set() but keeps argv order, so the
    # position of a file's result in TOKENS does not change with the string
    # hash seed from one run to the next.
    folders = dict.fromkeys(sys.argv[1:])
    files = [
        join(folder, name)
        for folder in folders
        for name in os.listdir(folder)
        if isfile(join(folder, name))
    ]
    if not files:
        # Nothing to tokenize: do not start worker processes for no work.
        return
    # The pool is created here rather than read from a global that only
    # exists when the file runs as a script, and the context manager shuts
    # the workers down once the blocking map() has returned every result.
    with Pool(processes) as pool:
        TOKENS.extend(pool.map(tok, files))


if __name__ == '__main__':
    main()
    print(len(TOKENS))
    # Spot-check one result; skip it instead of raising IndexError when
    # fewer files were processed than the sample index requires.
    sample_index = 24999
    if len(TOKENS) > sample_index:
        print(TOKENS[sample_index])