clean_dataset.py
import concurrent.futures
import re

import numpy as np
import pandas as pd
from pebble import ProcessPool

from so_tokenizers.SOTokenizer import ark_twokenize, stokenizer
from so_tokenizers.utils import get_tag_list, TOKEN_SEP, just_text

# Note: concurrent.futures, ProcessPool, and stokenizer are only needed by the
# disabled SOTokenizer code path in main() below.

def clean_title(title):
    """Tokenize a question title with the ark-twokenize tokenizer."""
    try:
        return TOKEN_SEP.join(ark_twokenize.tokenizeRawTweetText(title))
        # TODO: doesn't work on titles
        # return stokenizer.tokenize(title)
    except Exception as e:
        print("Error tokenizing title:", title, e)
        return None

def clean_tags(tag_list):
    """Parse the raw tag string into a list of tags."""
    return get_tag_list(tag_list)

def clean_body(body):
    """Strip HTML tags and file paths from a question body, then tokenize it."""
    html_tag = re.compile(r'<.*?>')
    text = re.sub(html_tag, '', body)
    file_path = re.compile(r'^(.+)/([^/]+)$')
    text = re.sub(file_path, '', text)

    def keep_token(token):
        # Keep only tokens that do not look like URLs.
        return all(term not in token for term in ["https", ":/", "//"])

    tokens = ark_twokenize.tokenizeRawTweetText(text)
    tokens = list(filter(keep_token, tokens))
    return TOKEN_SEP.join(tokens)

def main():
    np.random.seed(123)
    df = pd.read_csv('data/SO_dataset_full.csv')
    df['Tags'] = df['Tags'].apply(clean_tags)
    df['Title'] = df['Title'].apply(clean_title)
    df['Body'] = df['Body'].apply(lambda x: clean_body(just_text(x)))
    # TODO: not used anymore, SOTokenizer fails on several rows
    # for index, r in df.iterrows():
    #     text = r['Body']
    #     cleaned_text = just_text(text)
    #     tokenized_body = clean_body(cleaned_text)
    #     with ProcessPool() as pool:
    #         future = pool.schedule(stokenizer.tokenize, args=[cleaned_text])
    #         # If still running at the timeout, the worker process is
    #         # terminated and a new process starts consuming the next task.
    #         try:
    #             result_text = future.result(timeout=0.5)
    #         except IndexError:
    #             result_text = ""
    #             future.cancel()
    #         except concurrent.futures.TimeoutError:
    #             print("this took too long...", index)
    #             result_text = ""
    #             future.cancel()
    df = df.replace("", np.nan)
    df = df.dropna(subset=['Body'])
    df.to_csv("data/so_dataset_cleaned.csv", index=False)


if __name__ == '__main__':
    main()