-
Notifications
You must be signed in to change notification settings - Fork 0
/
wiktionary.py
126 lines (109 loc) · 4.91 KB
/
wiktionary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# -*- coding: utf-8 -*-
import sys
import re
import argparse
from lxml import etree
from pathlib2 import Path
import bz2
import urllib2
import codecs
#############################################
# Python 2 Unicode compatibility shims. None of this is part of the actual
# dump processing; it only changes how the interpreter handles text.
#
# Without this, encoding issues cause errors: setdefaultencoding switches the
# codec used for implicit str <-> unicode conversions to UTF-8.
# reload(sys) has to come first because setdefaultencoding is removed from the
# sys module during interpreter start-up and only reappears after a reload.
reload(sys)
sys.setdefaultencoding('utf-8')
# Windows console compatibility.
# NOTE(review): uniconsole is an optional third-party module; presumably it
# patches console I/O as a side effect of being imported -- confirm.
if sys.platform == "win32":
    try:
        import uniconsole
    except ImportError:
        sys.exc_clear()  # could be just pass, of course
    else:
        del uniconsole  # reduce pollution, not needed anymore
#############################################
def is_ascii(s):
    """Return True when every character of ``s`` is in the 7-bit ASCII range.

    ``s`` may be a UTF-8 encoded byte string or an already decoded text
    string. An empty string counts as ASCII.

    Raises UnicodeDecodeError if ``s`` is a byte string that is not valid
    UTF-8.
    """
    # Only byte strings need decoding; decoding text would force an implicit
    # re-encode that depends on the interpreter's default codec.
    if isinstance(s, bytes):
        s = s.decode('utf-8')
    return all(ord(char) < 128 for char in s)
def is_latin(s):
    """Return True when every character of ``s`` is a Latin-range code point.

    A character qualifies when its code point is below U+0250 or lies in
    U+1E00..U+1EFF. ``s`` may be a UTF-8 encoded byte string or an already
    decoded text string. An empty string counts as Latin.

    Raises UnicodeDecodeError if ``s`` is a byte string that is not valid
    UTF-8.
    """
    # Only byte strings need decoding; decoding text would force an implicit
    # re-encode that depends on the interpreter's default codec.
    if isinstance(s, bytes):
        s = s.decode('utf-8')
    return all(
        ord(char) < 0x250 or 0x1E00 <= ord(char) < 0x1F00
        for char in s
    )
# incomplete, but helps remove single letters from results
# Single characters treated as bare alphabet letters. The set is not
# exhaustive: it holds the Latin capitals, a subset of Cyrillic letters and
# the Arabic letters. It is built once so each lookup is a constant-time
# membership test.
_SINGLE_LETTERS = frozenset(
    u"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    u"АБЦДЕФГХИЙКЛМНОПЯРСТУЖВЬЫЗз"
    u"ابتثجحخدذرزسشصضطظعغفقكلمنهوي"
)


def is_letter(s):
    """Return True when ``s`` is exactly one character from the letter set.

    ``s`` may be a UTF-8 encoded byte string or an already decoded text
    string. Anything longer than one character, and the empty string, is not
    a letter.

    Raises UnicodeDecodeError if ``s`` is a byte string that is not valid
    UTF-8.
    """
    # Compare text with text: decode byte strings, leave text untouched.
    if isinstance(s, bytes):
        s = s.decode('utf-8')
    return len(s) == 1 and s in _SINGLE_LETTERS
def download_file(url, file_name):
    """Download ``url`` to ``file_name`` while showing progress on stdout.

    The body is streamed in fixed-size chunks so the whole download is never
    held in memory. A percentage is shown when the server reports a non-zero
    Content-Length; otherwise only the running byte count is shown.

    Errors raised by ``urllib2.urlopen`` or by file I/O propagate to the
    caller; the HTTP response and the output file are closed either way.
    """
    block_sz = 8192
    response = urllib2.urlopen(url)
    try:
        # The header may be absent (e.g. chunked transfer); 0 means "unknown".
        length_headers = response.info().getheaders("Content-Length")
        file_size = int(length_headers[0]) if length_headers else 0
        print("Downloading: %s Bytes: %s" % (file_name, file_size or "unknown"))

        file_size_dl = 0
        with open(file_name, 'wb') as out_file:
            while True:
                chunk = response.read(block_sz)
                if not chunk:
                    break
                file_size_dl += len(chunk)
                out_file.write(chunk)

                if file_size:
                    status = r"%10d [%3.2f%%]" % (
                        file_size_dl, file_size_dl * 100. / file_size)
                else:
                    status = r"%10d" % file_size_dl
                # Backspaces move the cursor back so the next status line
                # overwrites this one in place.
                sys.stdout.write(status + chr(8) * len(status))
                sys.stdout.flush()
    finally:
        response.close()
# Command-line interface. The parsed options end up in the module-level
# ``args`` namespace that the rest of the script reads.
parser = argparse.ArgumentParser(
    description='Parse Wiktionary dump for multilingual words')
# const="en" makes a bare "--lang" (no value) fall back to the default
# edition instead of None, which would otherwise be formatted into the dump
# file names and download URL as the text "None".
parser.add_argument(
    "--lang", nargs="?", default="en", const="en",
    help="language code of the Wiktionary edition to process (default: en)")
parser.add_argument(
    "-r", "--redownload", action="store_true",
    help="download and extract the dump again even if it already exists")
parser.add_argument(
    "--onlyascii", action="store_true",
    help="keep only titles consisting entirely of ASCII characters")
parser.add_argument(
    "--nonascii", action="store_true",
    help="skip titles consisting entirely of ASCII characters")
parser.add_argument(
    "--nonlatin", action="store_true",
    help="skip titles consisting entirely of Latin-script characters")
parser.add_argument(
    "--noletters", action="store_true",
    help="skip titles that are a single alphabet letter")
args = parser.parse_args()
# Location of the extracted XML dump for the selected Wiktionary edition.
Path("dumps").mkdir(exist_ok=True)
DUMP_FILE_PATH = "dumps/%swiktionary-latest-pages-articles.xml" % args.lang

# Fetch and unpack the dump when a fresh copy was requested or when no
# extracted copy is present yet.
if args.redownload or not Path(DUMP_FILE_PATH).is_file():
    print("Downloading latest %s-wiktionary database dump..." % args.lang)
    compressed_dump_path = "dumps/%swiktionary-latest-pages-articles.xml.bz2" % args.lang
    url = "https://dumps.wikimedia.org/%swiktionary/latest/%swiktionary-latest-pages-articles.xml.bz2" % (args.lang, args.lang)
    download_file(url, compressed_dump_path)

    print("Dump downloaded. Extracting...")
    # Decompress in 100 KiB pieces so the archive is never fully in memory.
    with open(DUMP_FILE_PATH, 'wb') as dump_file:
        with bz2.BZ2File(compressed_dump_path, 'rb') as dump_bz2:
            while True:
                chunk = dump_bz2.read(100 * 1024)
                if chunk == b'':
                    break
                dump_file.write(chunk)
    print("Dump extracted.")
# Level-2 wiki headings ("==Name=="), one per language section of an entry.
# Compiled once here instead of once per page.
_LANG_HEADING_RE = re.compile(r'(^|\s)==.*[^=]==(\s|$)')


def _is_wanted_title(title):
    """Return True when ``title`` passes the namespace and command-line filters."""
    # Titles containing ":" (namespaced pages) are never counted.
    if title is None or ":" in title:
        return False
    if args.nonascii and is_ascii(title):
        return False
    if args.noletters and is_letter(title):
        return False
    if args.onlyascii and not is_ascii(title):
        return False
    if args.nonlatin and is_latin(title):
        return False
    return True


# Maps each accepted page title to the number of language headings found in
# its text.
articles = dict()
count = 0
# "{*}" is lxml's namespace wildcard: the export schema version is part of the
# namespace URI, so a fixed version would silently match no pages at all in
# dumps produced with a different schema version.
for event, element in etree.iterparse(DUMP_FILE_PATH, tag="{*}page"):
    title = element.findtext("{*}title")
    if _is_wanted_title(title) and element.find("{*}redirect") is None:
        revision = element.find("{*}revision")
        text = revision.findtext("{*}text") if revision is not None else None
        # A missing revision or text node is treated as an empty page.
        articles[title] = len(_LANG_HEADING_RE.findall(text or u""))
    count += 1
    if count % 10000 == 0:
        print('%d articles processed' % count)
    # Free the page just handled, then drop the already-processed siblings
    # that the root element would otherwise keep referencing for the whole
    # run.
    element.clear()
    while element.getprevious() is not None:
        del element.getparent()[0]
# Write the 10000 titles with the most language sections, best first, to
# results/<lang>-topwords.txt. Titles with equal counts share a rank, and the
# next distinct count resumes at its position in the list (1, 2, 2, 4, ...).
Path("results").mkdir(exist_ok=True)
# The with-block guarantees the file is flushed and closed even if a write
# fails part-way through.
with codecs.open("results/%s-topwords.txt" % args.lang, "w", "utf-8") as f:
    print("Writing results to file...")
    top_articles = sorted(articles, key=articles.get, reverse=True)[:10000]
    rank = 1
    prev_num_langs = None
    for position, article in enumerate(top_articles, 1):
        num_langs = articles[article]
        if num_langs != prev_num_langs:
            rank = position
            prev_num_langs = num_langs
        f.write("#%d\t%s: %d languages\n" % (rank, article, num_langs))
print("Done.")