wikipedia.py
import json
import os
import time
import urllib.parse
import urllib.request


class Wikipedia:
    ''' Minimal client for fetching random Wikipedia articles via the MediaWiki API '''

    def __init__(self, subdomain="en", summary=True):
        self.subdomain = subdomain
        self.summary = summary

    def random_article_titles(self, num_of_articles=20):
        ''' Returns titles of random Wikipedia articles '''
        url = "https://" + self.subdomain + ".wikipedia.org/w/api.php?format=json&action=query&list=random&rnnamespace=0&rnlimit=" + str(num_of_articles)
        json_doc = urllib.request.urlopen(url).read().decode(encoding="utf-8", errors="ignore")
        parsed = json.loads(json_doc)
        titles = []
        for article in parsed["query"]["random"]:
            titles.append(article["title"])
        return titles

    def get(self, titles):
        ''' Returns full or summarized Wikipedia articles specified by their titles '''
        if titles is None or len(titles) < 1:
            return None
        articles_dict = dict()
        if self.summary:
            # Intro summaries can be requested for many titles in a single call.
            titles_string = "|".join(titles)
            url = "https://" + self.subdomain + ".wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exlimit=max&explaintext&redirects&exintro&titles=" + urllib.parse.quote_plus(titles_string)
            json_doc = urllib.request.urlopen(url).read().decode(encoding="utf-8", errors="ignore")
            parsed = json.loads(json_doc)
            pages = parsed["query"]["pages"]
            for i in pages:
                page = pages[i]
                # Round-trip through UTF-8 to drop characters that cannot be encoded;
                # pages that could not be resolved carry no "extract" field.
                title = page["title"].encode(encoding="utf-8", errors="ignore").decode(encoding="utf-8")
                content = page.get("extract", "").encode(encoding="utf-8", errors="ignore").decode(encoding="utf-8")
                articles_dict[title] = content
        else:
            # Full plain-text articles are fetched one title per request.
            for title in titles:
                url = "https://" + self.subdomain + ".wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exlimit=max&explaintext&redirects&titles=" + urllib.parse.quote_plus(title)
                json_doc = urllib.request.urlopen(url).read().decode("utf-8", errors="ignore")
                parsed = json.loads(json_doc)
                pages = parsed["query"]["pages"]
                for i in pages:
                    page = pages[i]
                    title = page["title"].encode(encoding="utf-8", errors="ignore").decode(encoding="utf-8")
                    content = page.get("extract", "").encode(encoding="utf-8", errors="ignore").decode(encoding="utf-8")
                    articles_dict[title] = content
        return articles_dict

    def crawl(self, time_limit):
        ''' Crawls Wikipedia for the specified amount of time in seconds '''
        try:
            os.mkdir("./data/")
        except FileExistsError:
            pass
        start = time.time()
        while True:
            titles = self.random_article_titles()
            articles = self.get(titles)
            for title in articles:
                # Keep only alphanumeric characters so the title forms a safe file name.
                clean_title = "".join(c for c in title if c.isalnum())
                path = "./data/" + clean_title + ".txt"
                if os.path.exists(path):
                    continue
                with open(path, "wt", encoding="utf-8") as f:
                    f.write(articles[title])
            end = time.time()
            if end - start > time_limit:
                break
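

# Illustrative usage, not part of the original file: a minimal sketch of how the
# class above might be driven, assuming the defaults (English Wikipedia, intro
# summaries only) and a 30-second crawl chosen purely for the example.
if __name__ == "__main__":
    wiki = Wikipedia(subdomain="en", summary=True)
    # Fetch a few random titles and print the start of each summary.
    titles = wiki.random_article_titles(num_of_articles=5)
    articles = wiki.get(titles)
    for title, text in articles.items():
        print(title, "->", text[:80].replace("\n", " "))
    # Save random article summaries under ./data/ for roughly 30 seconds.
    wiki.crawl(time_limit=30)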