-
Notifications
You must be signed in to change notification settings - Fork 1
/
export_parser.py
executable file
·103 lines (93 loc) · 3.38 KB
/
export_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/python
import urllib2
from BeautifulSoup import BeautifulSoup
import re
import string
import os
import sys
# Export endpoint; "%s" is replaced by the underscore-separated page name.
wikipedia_base_url = "http://en.wikipedia.org/wiki/Special:Export/%s"

# Markup-stripping passes as (compiled pattern, replacement) pairs.
# _strip_markup() applies them strictly in this order: the specific link and
# template forms must be removed before the generic brace/bracket clean-up
# at the end gets a chance to tear them apart.
_MARKUP_PASSES = (
    # "{{" + any one character + "ite ... }}" (presumably {{cite ...}}).
    (re.compile(r'\{{2}.ite.*?\}{2}\n?', re.DOTALL | re.MULTILINE), ''),
    # Spans delimited by "&lt"/"&gt"-style entities (presumably escaped tags).
    (re.compile(r"&.t.*?&.t;\n?"), ''),
    # [[Image...]] / [[File...]] links, optionally containing a nested link.
    (re.compile(r"\[\[Image.+?(\[\[[\w\s\.]*?\]\])?.*\]\]\n?"), ''),
    (re.compile(r"\[\[File.+?(\[\[[\w\s\.]*?\]\])?.*\]\]\n?"), ''),
    # [[xx:...]] links with a 2-10 character lowercase/hyphen prefix
    # (presumably interlanguage links).
    (re.compile(r'\[\[[a-z\-]{2,10}\:.*\]\]\n?'), ''),
    # {| ... |} blocks.
    (re.compile(r'\{\|.*?\|\}\n?', re.DOTALL | re.MULTILINE), ''),
    # {{ ... }} blocks.
    (re.compile(r'\{\{.*?\}\}\n?', re.DOTALL | re.MULTILINE), ''),
    # Anything still enclosed in single braces.
    (re.compile(r'\{.*?\}\n?', re.DOTALL | re.MULTILINE), ''),
    # Bare "http:" URLs.
    (re.compile(r"http\:[^\s|^\]]*"), ''),
    # [[Category:...]] links, case-insensitively.
    (re.compile(r'\[\[Category\:.*\]\]\n?', re.IGNORECASE), ''),
    # "[[target|label" -> "[[label": keep only the visible link label.
    (re.compile(r"\[\[[^\]]*?\|"), '[['),
    # Leftover bullets, brackets, quote runs, entities and braces.
    (re.compile(r"\* |\[ ?| ?\]|\'{2,3}|\&.{2,4}\;|\{|\}"), ''),
)

# Heading lines whose title consists only of word characters and whitespace.
_HEADING_RE = re.compile(r"==[\w\s]*?==")
# Tokeniser for cleaned text: every chunk ends at the next "==".
_CHUNK_RE = re.compile(r"=?=?.*?==", re.DOTALL | re.MULTILINE)
# Plain [[links]] (after piped labels have been dropped).
_LINK_RE = re.compile(r"\[\[([\w\s]*)\]\]")


def _page_name(title):
    """Return *title* with spaces replaced by underscores."""
    return title.replace(' ', '_')


def _fetch_wikitext(opener, page_name):
    """Download the export of *page_name* and return its wikitext.

    Returns None when the parsed export exposes no usable text node.
    """
    response = opener.open(wikipedia_base_url % page_name)
    try:
        soup = BeautifulSoup(response)
    finally:
        # The parsed tree is all that is needed; release the connection.
        response.close()
    # NOTE(review): assumes ``soup.text`` is the <text> tag, or None when the
    # export has none (BeautifulSoup 3 attribute lookup) -- confirm.
    if not soup.text or not soup.text.contents:
        return None
    return soup.text.contents[0]


def _redirect_target(wikitext):
    """Return the page name a redirect page points at, or None.

    A page counts as a redirect when it contains "REDIRECT" and at least one
    [[link]]; the first link is the target. Only the link text is returned,
    so other lines of the redirect page cannot leak into the page name.
    """
    if "REDIRECT" not in wikitext:
        return None
    link = re.search(r"\[\[([^\]]*)\]\]", wikitext)
    if link is None:
        return None
    return _page_name(link.group(1))


def _fetch_following_redirect(opener, page_name):
    """Fetch *page_name*, following at most one redirect.

    Returns a ``(page_name, wikitext)`` pair where ``page_name`` is the
    redirect target if one was followed; ``wikitext`` is None when the page
    (or its redirect target) has no text.
    """
    wikitext = _fetch_wikitext(opener, page_name)
    if wikitext is None:
        return page_name, None
    target = _redirect_target(wikitext)
    if target is not None:
        # Fetch the redirect *target* (not the page we started from).
        page_name = target
        wikitext = _fetch_wikitext(opener, page_name)
    return page_name, wikitext


def _linked_pages(wikitext):
    """Return the page names of all plain [[links]] found in *wikitext*."""
    # "[[target|label]]" -> "[[target]]" so that the link target is kept.
    unpiped = re.sub(r"\|[^\]]*?\]\]", ']]', wikitext)
    return [_page_name(title) for title in _LINK_RE.findall(unpiped)]


def _section_titles(wikitext):
    """Return the output file titles for *wikitext*: "Abstract" + headings.

    Empty titles (produced when the heading pattern matches a bare "====")
    are dropped because they cannot name a file.
    """
    titles = ["Abstract"]
    for heading in _HEADING_RE.findall(wikitext):
        title = heading.strip("=").strip(' ')
        if title:
            titles.append(title)
    return titles


def _strip_markup(wikitext):
    """Return *wikitext* with every pass of _MARKUP_PASSES applied in order."""
    for pattern, replacement in _MARKUP_PASSES:
        wikitext = pattern.sub(replacement, wikitext)
    return wikitext


def _write_section(dir_name, title, paragraphs):
    """Write *paragraphs* (UTF-8 byte strings) to the file for *title*."""
    filename = title.replace(' ', '_').replace(',', '.2C')
    with open(dir_name + filename, 'w') as section_file:
        section_file.write("\n".join(paragraphs) + "\n")


def _write_sections(dir_name, titles, cleaned):
    """Split *cleaned* text at the headings in *titles*; one file per title.

    Text before the first heading goes to the first title ("Abstract").
    Each recognised heading closes the current file and starts the next one;
    the last section, including text after the final "==", is written when
    the input is exhausted.
    """
    pending = iter(titles)
    filename = next(pending)
    search_tag = next(pending, '')
    para_list = []
    tail_start = 0
    for chunk in _CHUNK_RE.finditer(cleaned):
        tail_start = chunk.end()
        body = chunk.group().strip("=")
        # NOTE(review): substring test, so a paragraph that merely mentions
        # the next heading's title is also treated as that heading.
        # An empty search_tag means no headings are left; without the guard
        # '' would match every chunk.
        if search_tag and search_tag in body:
            _write_section(dir_name, filename, para_list)
            para_list = []
            filename = search_tag
            search_tag = next(pending, '')
        else:
            para_list.append(body.encode("utf-8"))
    # The tokeniser only yields text that is followed by "==", so whatever
    # trails the last heading has to be collected separately.
    tail = cleaned[tail_start:].strip("=")
    if tail.strip():
        para_list.append(tail.encode("utf-8"))
    _write_section(dir_name, filename, para_list)


def _export_page(dir_name, wikitext):
    """Create *dir_name* if needed and write *wikitext*'s sections into it."""
    if not os.path.isdir(dir_name):
        # makedirs: a page name may itself contain "/" path separators.
        os.makedirs(dir_name)
    _write_sections(dir_name, _section_titles(wikitext),
                    _strip_markup(wikitext))


def main():
    """Export one Wikipedia article and every article it links to.

    Usage: export_parser.py ARTICLE_TITLE

    The article is written section by section into ./<Article>/, and each
    linked article into ./<Article>/<Linked_article>/. Exits with status 22
    on a wrong argument count and 1 when the article has no text.
    """
    if len(sys.argv) != 2:
        print("Incorrect arguments")
        sys.exit(22)

    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'lookingGlass/0.1')]

    base_dir_name, base_text = _fetch_following_redirect(
        opener, _page_name(sys.argv[1]))
    if base_text is None:
        print("Article text not found")
        sys.exit(1)

    base_path = "./" + base_dir_name + "/"
    _export_page(base_path, base_text)

    # Names already exported; guards against self-links and repeated links
    # so that no page is downloaded or written twice.
    seen = set()
    seen.add(base_dir_name)
    for linked in _linked_pages(base_text):
        if linked in seen:
            continue
        seen.add(linked)
        dir_name, wikitext = _fetch_following_redirect(opener, linked)
        if wikitext is None:
            continue
        if dir_name != linked:
            # Followed a redirect: skip targets that were exported already.
            if dir_name in seen:
                continue
            seen.add(dir_name)
        _export_page(base_path + dir_name + "/", wikitext)


if __name__ == "__main__":
    main()