-
Notifications
You must be signed in to change notification settings - Fork 9
/
parse.py
95 lines (79 loc) · 2.44 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env python
import sys
import os
import json
import unwiki
import xmltodict
from xml.dom.minidom import parse
from io import StringIO
from html.parser import HTMLParser
from unidecode import unidecode
from langdetect import detect
# Maps a page title to the list of quotes extracted from that page;
# filled in by handle() and dumped to JSON at the end of the script.
quotesObject = {}

# Usage: parse.py <input file> [cutoff] [language]
if len(sys.argv) < 2:
    print("You must specify an input file.", file=sys.stderr)
    # Exit non-zero so shells and calling scripts can detect the usage error.
    sys.exit(1)

# Optional 2nd argument: quotes whose cleaned length is >= this value are
# dropped by writeQuotes(). Defaults to 100.
cutoffArg = int(sys.argv[2]) if len(sys.argv) > 2 else 100

# Optional 3rd argument: language code that detect() must return for a quote
# to be kept, or "all" to skip language filtering. Defaults to "en".
langArg = str(sys.argv[3]) if len(sys.argv) > 3 else "en"
def _language_matches(text):
    """Return True if *text* passes the module-level ``langArg`` filter."""
    if langArg == "all":
        return True
    # Imported here so the function works without touching the file's import
    # block; langdetect itself is already a dependency of this script.
    from langdetect.lang_detect_exception import LangDetectException
    try:
        return detect(text) == langArg
    except LangDetectException:
        # Text with no detectable features (e.g. digits or punctuation only):
        # skip just this line instead of aborting the whole page.
        return False


def writeQuotes(content):
    """Extract cleaned quotes from the lines of one wiki page.

    Only bullet lines (starting with ``"* "``) located after a ``==Quotes==``
    or ``== Quotes ==`` heading and before the next level-2 heading are
    considered. Each candidate is stripped of wiki markup, HTML tags,
    non-ASCII characters, escaped apostrophes and double quotes, has its
    whitespace collapsed, and loses its leading bullet.

    Args:
        content: iterable of strings, one per line of page text.

    Returns:
        A list of cleaned quote strings that are non-empty, contain no
        ``"://"``, are shorter than ``cutoffArg`` characters, and match
        ``langArg`` (or any language when ``langArg == "all"``).
    """
    quoteList = []
    in_quotes_section = False
    for line in content:
        # A level-2 heading ("==X==", not "===X===") closes the section.
        # Slicing instead of indexing keeps a bare "==" line from raising
        # IndexError.
        if line.startswith('==') and line[2:3] != "=":
            in_quotes_section = False
        if in_quotes_section and line.startswith('* '):
            cleaned_line = unwiki.loads(line)  # Remove wiki markup
            cleaned_line = strip_tags(cleaned_line)  # Remove HTML tags
            cleaned_line = unidecode(cleaned_line)  # Convert unicode to ASCII
            cleaned_line = cleaned_line.replace("\\'", "")  # Remove escaped apostrophes
            cleaned_line = cleaned_line.replace('\"', '')  # Remove double quotes
            # Collapse runs of whitespace; the result must be assigned back,
            # otherwise this step is a no-op.
            cleaned_line = ' '.join(cleaned_line.split())
            cleaned_line = cleaned_line[2:]  # Remove bullet point
            if (cleaned_line
                    and "://" not in cleaned_line
                    and len(cleaned_line) < cutoffArg
                    and _language_matches(cleaned_line)):
                quoteList.append(cleaned_line)
        # Checked last so the heading line itself re-opens the section after
        # the level-2 test above has closed it.
        if line == '==Quotes==' or line == '== Quotes ==':
            in_quotes_section = True
    return quoteList
def handle(_, value):
    """Per-item callback for ``xmltodict.parse``: collect quotes from one page.

    Args:
        _: path argument supplied by xmltodict; unused.
        value: parsed item; ``value['revision']['text']`` and
            ``value['title']`` are read.

    Returns:
        Always True, so that parsing continues with the next item.
    """
    try:
        # NOTE(review): the split is on the two-character sequence backslash
        # + "n", not on a newline. Presumably str() here yields a repr of a
        # mapping in which newlines appear escaped -- confirm against the
        # structure xmltodict produces for the dump.
        raw_text = str(value['revision']['text'])
        quoteList = writeQuotes(raw_text.split('\\n'))
        if quoteList:
            quotesObject[str(value['title'])] = quoteList
    except Exception as e:
        # Best-effort per page: one bad page must not abort the whole dump,
        # but report it instead of hiding the failure completely.
        title = value.get('title', '<unknown>') if isinstance(value, dict) else '<unknown>'
        print("Skipping page {!r}: {!r}".format(title, e), file=sys.stderr)
    return True
class MLStripper(HTMLParser):
    """HTML parser that throws away markup and accumulates only text data."""

    def __init__(self):
        # The base constructor already resets the parser state, so no
        # separate reset() call is needed after it.
        super().__init__(convert_charrefs=True)
        self.strict = False
        # Buffer receiving every text chunk reported by the parser.
        self.text = StringIO()

    def handle_data(self, d):
        """Append a chunk of text found between tags to the buffer."""
        self.text.write(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return self.text.getvalue()
def strip_tags(html):
    """Return the text content of *html* with all markup removed.

    Args:
        html: string that may contain HTML tags and character references.

    Returns:
        The concatenated text data found by the parser.
    """
    s = MLStripper()
    s.feed(html)
    # feed() buffers incomplete data; with convert_charrefs enabled, trailing
    # text containing an unterminated "&" (e.g. "Q&A") is held back in case
    # more input follows. close() forces that buffered text to be processed
    # so it is not silently dropped.
    s.close()
    return s.get_data()
# Parse the input in xmltodict's streaming mode: handle() is invoked for each
# item at depth 2 and fills quotesObject. The context manager makes sure the
# input file is closed once parsing finishes or fails.
with open(str(sys.argv[1]), "rb") as infile:
    xmltodict.parse(infile, item_depth=2, item_callback=handle)

# Write the collected quotes to data/quotes-<cutoff>-<language>.json,
# creating the output directory on first use.
os.makedirs('data', exist_ok=True)
with open('data/quotes-' + str(cutoffArg) + '-' + str(langArg) + '.json', 'w') as outfile:
    json.dump(quotesObject, outfile, sort_keys=True, indent=4, ensure_ascii=True)