#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import collections
import re
from itertools import groupby

import mwclient

from theobot import password
from theobot import bot
from theobot import spellcheck_awb
# from theobot import spellcheck  # _awb covers this

# CC-BY-SA Theopolisme

def main():
    global site
    site = mwclient.Site('en.wikipedia.org')
    site.login(password.username, password.password)

    global CLEANUPTEMPLATES
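    # Flat list of every page in Category:Cleanup templates (including
    # subcategories); cleanup() checks template names against it.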
    CLEANUPTEMPLATES = listpages(site.Categories['Cleanup templates'])
    print "Got cleanup templates."

    # process_article(site.Pages['Earth: Final Conflict'])  # debugging

    gan = site.Pages['Wikipedia:Good article nominations']
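    # Each nomination is listed as "# {{GANentry|1=Article title|...}}",
    # so this captures the title from the template's first parameter.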
    articles = re.findall(r"""#\s*{{GANentry\|1=(.*?)\|""", gan.edit(), flags=re.U)
    # Only process ten articles, to keep this demo's output small; reports are
    # grouped by the first letter of the article title.
    done = 0
    alphabet = collections.defaultdict(list)
    for article in articles:
        if done < 10:
            alphabet[article[0].upper()].append(process_article(site.Pages[article]))
            done += 1
        else:
            break
    for letter, value in alphabet.items():
        print "Saving {}".format(letter)
        text = '\n'.join(value)
        site.Pages["User:Theo's Little Bot/GAN/" + letter].save(text, "[[WP:BOT|Bot]]: Updating [[WP:GAN|GAN]] report")

def listpages(category):
    """Recursively collect the names of all pages in a category and its subcategories."""
    results = []
    for page in category:
        if page.namespace == 14:  # 14 is the category namespace
            results += listpages(page)
        else:
            results.append(page.name)
    return results

def process_article(page):
    print "\n\n=====\nProcessing {}".format(page.name)
    text = page.edit()
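    # (In the mwclient releases this bot was written against, Page.edit()
    # returns the page wikitext; newer mwclient exposes this as Page.text().)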
results = """== [[{0}]] ==\n'''Generated ~~~~~ by [[User:Theo's Little Bot|]]'''""".format(page.name)
## ALERTS
results += alerts(text)
## CLEANUP TEMPLATES
results += cleanup(text)
## IMAGES
results += images(text,page)
print results
return results

def alerts(text):
    results = "\n\n=== Alerts ==="
    alerts = []

    # at least one citation per paragraph
    def paragraph(lines):
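        """Yield each paragraph of ``lines`` as a single string, splitting on blank lines."""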
        for group_separator, line_iteration in groupby(lines.splitlines(True), key=unicode.isspace):
            if not group_separator:
                yield ''.join(line_iteration)
    paratext = re.sub(r""".*?==""", "==", text, count=1, flags=re.DOTALL)  # the lead does not need citations
    for p in paragraph(paratext):
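        # Skip headings, categories, images, bare templates, HTML tags, and list/table markup.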
        if not re.search(r"""(==|Category:|\[\[File:|\[\[Image:|{{Authority control|{{.*}}|^<.*?>|^;|^\|)""", p.strip()):
            if not re.search(r"""(sfn|\<ref)""", p) and len(p) > 100:  # only look at paragraphs longer than 100 chars
                alerts.append("\n* Lacking a citation in the paragraph beginning {{{{xt|{}...}}}}".format(p[:70]))

    # Check for commonly misspelled words.
    print "Checking for misspelled words..."
    sp = []
    # from [[Wikipedia:Lists of common misspellings/For machines]]
    # * disabled because _awb covers this, below
    # for spell_tuple in spellcheck.Misspellings(text).check():
    #     sp.append("\"{0}\" (line {1})".format(spell_tuple[1], spell_tuple[0]))
    # from [[Wikipedia:AutoWikiBrowser/Typos]]
    for line, typos in spellcheck_awb.typos(text).items():
        for typo, correction in typos:
            typoindex = text.find(typo)
            if typoindex == -1:
                continue  # regex-based match; the literal string may not appear in the raw text
            start = max(typoindex - 20, 0)
            context = text[start:typoindex + 20].replace('\n', ' ')
            sp.append("\"{0}\" → \"{1}\" ({{{{xt|<nowiki>...{2}...</nowiki>}}}})".format(typo, correction, context))
    if sp:
        alerts.append("\n* Possible typo(s) or misspelling(s) detected: " + ', '.join(sp))

    if alerts:
        results += ''.join(alerts)
    else:
        results += "\n''There are no alerts for this page.''"
    return results

def cleanup(text):
    def process_template(templatename):
        """Return True if the template (following redirects) is a maintenance template."""
        template = site.Pages['Template:' + templatename]
        if not template.redirect:
            return template.name in CLEANUPTEMPLATES
        # Follow the redirect, stripping the namespace prefix so it is not doubled above.
        target = re.findall(r"""\[\[(.*?)\]\]""", template.edit(), flags=re.U)[0]
        return process_template(target.split(':', 1)[-1])
results = "\n\n=== Cleanup templates on page ==="
    templates = re.findall(r"""\{\{(.*?)(?:\|.*?\}\}|\}\})""", text, flags=re.U)
    taggedtemp = []
    for templatename in list(set(templates)):
        if process_template(templatename):
            taggedtemp.append(templatename)

    if taggedtemp:
        for templatename in taggedtemp:
            results += "\n*{{tlx|" + templatename + "}}"
    else:
        results += "\n''The bot found no cleanup templates on this page.''"
    return results

def images(page):
    results = "\n\n=== Images used on page ==="
    imageresults = []
    for image in page.images():
        tempstring = ""
        original_description = image.edit()
        if original_description != "":
            # Expand templates on the local description page so license markers
            # can be matched as plain (lowercased) text.
            description = site.expandtemplates(text=original_description).lower()
            if description.find("non-free use") != -1:
                tempstring += " {{red|'''The image is non-free.'''}}"
            # This snippet is the expanded header of the standard non-free use
            # rationale template; it must be lowercased to match `description`.
            if description.find("""width: 15%; font-size: 12pt">non-free media information and [[wikipedia:non-free use rationale guideline|use rationale]]""") != -1:
                tempstring += " A recognized non-free use rationale was found."
            if description.find("imbox-license") != -1:
                tempstring += " A license was provided for the image."
            if description.find("public domain") != -1:
                tempstring += " The image is said to be in the public domain."
            if tempstring == "":
                tempstring += " No information was found for the image."
        else:
            # No local description page: assume the file is hosted on Commons.
            tempstring += " The image is hosted on Commons, so assuming freely licensed."
tempstring = "*[[:{}]]:".format(image.name) + tempstring
imageresults.append(tempstring)
if len(imageresults) > 0:
results += '\n' + '\n'.join(imageresults)
else:
results += "\n''The bot found no images on this page.''"
return results

if __name__ == '__main__':
    main()