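"""process_docs.py

Entry points for the document pipeline: preprocess_docs() cleans the raw
GDELT documents in batches, and analyse_docs() runs BERTopic-based topic
modelling over the cleaned corpus.
"""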
import os
import pickle as pkl
from typing import Any

import pandas as pd
import shutup
from bertopic.representation import BaseRepresentation
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from tqdm import tqdm
from umap import UMAP

import config.config as cfg
from berts.configurableBERT import analyse_bert as conf_bert
from berts.ultrafastBERT import analyse_bert as pre_bert  # noqa: F401 (imported but unused in this module)
from preprocessing.preprocessing import clean, save_preprocessed_as_text

# Suppress third-party warning noise.
shutup.please()


def preprocess_docs(
        chunk_size=10000,
        duplicates=True,
        emojis=True,
        urls=True,
        hashtags=True,
        hashtags_content=True,
        ats=True,
        ats_content=True,
        punctuation=True,
        digits=True,
        stopwords=True,
        stopwords_lang_codes=None,
        stopwords_custom=None,
        min_doc_length=5,
        duplicate_cleanup=True):
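    """Clean the raw GDELT documents in batches and write the results back.

    Loads pickled documents from cfg.gdelt_out() in chunks of `chunk_size`,
    runs preprocessing.clean() with the given switches, stores the cleaned
    text on each document, and appends it to preprocessed.txt.
    """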
    # Avoid mutable default arguments.
    if stopwords_custom is None:
        stopwords_custom = []
    if stopwords_lang_codes is None:
        stopwords_lang_codes = ["en"]
    if os.path.isfile("preprocessed.txt"):
        reply = input("Your current state of cleaned texts will be overwritten. Continue? (y/n) ")
        if reply == "y":
            os.remove("preprocessed.txt")
            print("Deleted preprocessed.txt")
        else:
            exit(69)
print("Building Dataframe in batches of {} docs...".format(chunk_size))
documents = os.listdir(cfg.gdelt_out())
file_chunks = [documents[i:i + chunk_size] for i in range(0, len(documents), chunk_size)]
c = 1
for file_chunk in file_chunks:
print("-"*16)
print("Batch {} of {}".format(c, len(file_chunks)))
print("-" * 16)
loop = tqdm(file_chunk)
docs = []
files = []
corrupted_files = 0
        for doc_file in loop:
            with open(os.path.join(cfg.gdelt_out(), doc_file), "rb") as d:
                try:
                    document = pkl.load(d)
                    # Keep only documents whose main content was extracted.
                    if document.main_content_present():
                        docs.append(document)
                        files.append(doc_file)
                except EOFError:
                    # Truncated pickle: count it and show the tally in the progress bar.
                    corrupted_files += 1
                    loop.set_postfix_str("Corrupted files: {}".format(corrupted_files))
        chunk_df = pd.DataFrame([vars(doc) for doc in docs])
        chunk_df["filename"] = files
        cleaned_df = clean(chunk_df,
                           duplicates=duplicates,
                           emojis=emojis,
                           urls=urls,
                           hashtags=hashtags,
                           hashtags_content=hashtags_content,
                           ats=ats,
                           ats_content=ats_content,
                           punctuation=punctuation,
                           digits=digits,
                           stopwords=stopwords,
                           stopwords_lang_codes=stopwords_lang_codes,
                           stopwords_custom=stopwords_custom,
                           min_doc_length=min_doc_length,
                           duplicate_cleanup=duplicate_cleanup)
print("Applying cleaned text to chunk of documents...")
for filename, cleaned_content in zip(
cleaned_df["filename"].values.tolist(), cleaned_df["main_content"].values.tolist()
):
with open(os.path.join(cfg.gdelt_out(), filename), "rb") as d:
document = pkl.load(d)
d.close()
document.cleaned_content = cleaned_content
document.save_document()
print("Done, now saving to preprocessing file...")
save_preprocessed_as_text(cleaned_df)
c += 1


def analyse_docs(
        BERT_key=None,
        river_app=False,
        river_conf=None,
        language: str = "multilingual",
        top_n_words: int = 10,
        n_gram_range: tuple[int, int] = (1, 1),
        min_topic_size: int = 10,
        nr_topics: int | str | None = None,
        low_memory: bool = False,
        calculate_probabilities: bool = False,
        seed_topic_list: list[list[str]] | None = None,
        zeroshot_topic_list: list[str] | None = None,
        zeroshot_min_similarity: float = .7,
        embedding_model: Any = None,
        umap_model: UMAP | None = None,
        hdbscan_model: HDBSCAN | None = None,
        vectorizer_model: CountVectorizer | None = None,
        ctfidf_model: TfidfTransformer | None = None,
        representation_model: BaseRepresentation | None = None,
        verbose: bool = True):
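    """Run BERTopic-style topic modelling over the preprocessed corpus.

    Thin wrapper that forwards every parameter to
    berts.configurableBERT.analyse_bert (imported as conf_bert).
    """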
    conf_bert(
        pretrained_model=BERT_key,
        river_app=river_app,
        river_conf=river_conf,
        language=language,
        top_n_words=top_n_words,
        n_gram_range=n_gram_range,
        min_topic_size=min_topic_size,
        nr_topics=nr_topics,
        low_memory=low_memory,
        calculate_probabilities=calculate_probabilities,
        seed_topic_list=seed_topic_list,
        zeroshot_topic_list=zeroshot_topic_list,
        zeroshot_min_similarity=zeroshot_min_similarity,
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        representation_model=representation_model,
        verbose=verbose)


if __name__ == '__main__':
    pass
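    # Minimal usage sketch (an assumed workflow, not part of the original file);
    # the model name and stopword codes below are illustrative placeholders.
    # preprocess_docs(chunk_size=5000, stopwords_lang_codes=["en", "de"])
    # analyse_docs(BERT_key="paraphrase-multilingual-MiniLM-L12-v2",
    #              nr_topics="auto", calculate_probabilities=True)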