run_01_analyze_dataset.py
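"""Analyze the JSON news-article dataset under news_resources/.

For each subdirectory, load every article, filter and stem its query and
text terms, and compute the document count plus the average query and text
lengths; per-directory and overall statistics are written as JSON under
output/dataset_statistics/.
"""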
import json
import os
from bson import json_util
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from snowballstemmer import TurkishStemmer
from utils import get_filtered_query_terms, get_filtered_article_text, print_model_into_file


class DatasetAnalyzer:
    def __init__(self):
        self.stop = stopwords.words('turkish')
        self.stemmer = SnowballStemmer('english')  # NLTK's SnowballStemmer does not support Turkish.
        # self.stemmer = TurkishStemmer()  # alternative from the snowballstemmer package (note: it exposes stemWord(), not stem()).
        self.json_files = {}
        self.statistics = {}
        self.total_statistics = {
            'total_number_of_documents': 0,
            'total_average_query_length': 0,
            'total_average_text_length': 0,
        }

    def analyze(self, path):
        for root, dirs, files in os.walk(path):
            if root == path:
                continue  # skip the top-level directory; only its subdirectories hold articles.
            print('files in ' + root + ': ' + ', '.join(files))
            self.json_files[root] = []
            self.statistics[root] = {}
            query_lengths = 0
            text_lengths = 0
            for file in files:
                if file.endswith('.json'):
                    fullpath = os.path.join(root, file)
                    with open(fullpath, encoding='utf-8') as f:
                        article = json.load(f, object_hook=json_util.object_hook)  # load the article.
                    self.json_files[root].append(article)
                    filtered_query_terms = get_filtered_query_terms(self.stemmer, self.stop, article)  # filtered query terms.
                    text_filtered_terms = get_filtered_article_text(self.stemmer, self.stop, article)  # filtered text terms.
                    query_lengths += len(filtered_query_terms)  # accumulate query lengths.
                    text_lengths += len(text_filtered_terms)  # accumulate text lengths.
            number_of_documents = len(self.json_files[root])
            if number_of_documents == 0:
                continue  # avoid a division by zero for directories without JSON files.
            self.statistics[root]['number_of_documents'] = number_of_documents
            self.statistics[root]['average_query_length'] = query_lengths / number_of_documents
            self.statistics[root]['average_text_length'] = text_lengths / number_of_documents
            self.total_statistics['total_number_of_documents'] += number_of_documents
            self.total_statistics['total_average_query_length'] += self.statistics[root]['average_query_length']
            self.total_statistics['total_average_text_length'] += self.statistics[root]['average_text_length']
        # The totals are means of the per-directory means.
        self.total_statistics['total_average_query_length'] /= len(self.json_files)
        self.total_statistics['total_average_text_length'] /= len(self.json_files)
        return self.json_files, self.statistics, self.total_statistics


INPUT_DIRECTORY = "news_resources/"
OUTPUT_DIRECTORY = "output/"

datasetAnalyzer = DatasetAnalyzer()
json_files, statistics, total_statistics = datasetAnalyzer.analyze(INPUT_DIRECTORY)

print_model_into_file(statistics, os.path.join(OUTPUT_DIRECTORY, "dataset_statistics", "statistics.json"))
print_model_into_file(total_statistics, os.path.join(OUTPUT_DIRECTORY, "dataset_statistics", "total_statistics.json"))
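
# ---------------------------------------------------------------------------
# For reference, a minimal sketch of what the helpers imported from utils
# might look like, inferred only from how they are called above. Everything
# here is an assumption: the real utils module may use a different article
# schema (the 'query' and 'text' fields below are hypothetical) and a
# different filtering pipeline.
#
# def get_filtered_query_terms(stemmer, stop, article):
#     # Hypothetical: stem the query tokens and drop Turkish stopwords.
#     terms = article['query'].lower().split()
#     return [stemmer.stem(t) for t in terms if t not in stop]
#
# def get_filtered_article_text(stemmer, stop, article):
#     # Hypothetical: the same filtering applied to the article body.
#     terms = article['text'].lower().split()
#     return [stemmer.stem(t) for t in terms if t not in stop]
#
# def print_model_into_file(model, filepath):
#     # Hypothetical: dump a dict as JSON, creating the parent directory first.
#     os.makedirs(os.path.dirname(filepath), exist_ok=True)
#     with open(filepath, 'w', encoding='utf-8') as f:
#         json.dump(model, f, ensure_ascii=False, indent=2, default=json_util.default)
# ---------------------------------------------------------------------------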