-
Notifications
You must be signed in to change notification settings - Fork 35
/
Copy pathtasks.py
43 lines (31 loc) · 1.02 KB
/
tasks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# tasks.py
import collections
import json
import os
import sys
import uuid
from pathlib import Path
from nltk.corpus import stopwords
# English stop words from NLTK, kept in a set for fast membership checks.
COMMON_WORDS = set(stopwords.words("english"))

# Directory that contains this file (resolved to an absolute path).
BASE_DIR = Path(__file__).resolve(strict=True).parent

# Input texts are read from DATA_DIR; results are written to OUTPUT_DIR.
DATA_DIR = BASE_DIR / "data"
OUTPUT_DIR = BASE_DIR / "output"
def save_file(filename, data):
    """Write *data* to a uniquely named text file inside ``OUTPUT_DIR``.

    The file is named ``<filename>_<random hex>.txt``. The random part comes
    from ``uuid.uuid4()``, so repeated calls with the same *filename* do not
    overwrite each other.

    Args:
        filename: Prefix used for the output file name.
        data: Text to write to the file.
    """
    target_dir = Path(OUTPUT_DIR)
    # Create the output directory on first use rather than failing with
    # FileNotFoundError when it has not been created yet.
    target_dir.mkdir(parents=True, exist_ok=True)

    target = target_dir / f"{filename}_{uuid.uuid4().hex}.txt"
    # Explicit encoding so the result does not depend on the platform default.
    with open(target, "w", encoding="utf-8") as handle:
        handle.write(data)
def get_word_counts(filename):
    """Count words in a data file and save the 20 most frequent as JSON.

    The file ``DATA_DIR/<filename>`` is read line by line and split on
    whitespace (punctuation is not stripped). Tokens whose lower-cased form
    is in ``COMMON_WORDS`` are skipped. The top 20 remaining tokens and their
    counts are serialized with ``json.dumps`` and passed to ``save_file``,
    then a message with the current process id is printed.

    Args:
        filename: Name of the file inside ``DATA_DIR`` to process; also used
            as the output file name prefix.
    """
    wordcount = collections.Counter()

    # get counts
    with open(Path(DATA_DIR).joinpath(filename), "r") as f:
        for line in f:
            # Compare the lower-cased token against the stop-word set so that
            # capitalized forms such as "The" or "And" are filtered as well;
            # the token itself is counted with its original casing.
            wordcount.update(
                word for word in line.split() if word.lower() not in COMMON_WORDS
            )

    # save file
    save_file(filename, json.dumps(dict(wordcount.most_common(20))))

    proc = os.getpid()
    print(f"Processed {filename} with process id: {proc}")
if __name__ == "__main__":
    # Expect the name of a file inside DATA_DIR as the first argument; exit
    # with a usage message instead of an IndexError traceback when missing.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} <filename>")
    get_word_counts(sys.argv[1])