-
-
Notifications
You must be signed in to change notification settings - Fork 1
/
wotdbot_extract_words.py
46 lines (34 loc) · 1.34 KB
/
wotdbot_extract_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#!/usr/bin/env python
"""
Extract [Finnish] words from English Wiktionary.

Download enwiktionary-20140118-pages-articles-multistream.xml.bz2 (484.5M)
or newer and unzip. Then run:

    python wotdbot_extract_words.py > data/finnish.txt

For other languages, change LANGUAGE_NAME and LANGUAGE_CODE below (and
DUMP_FILENAME if a different dump is used).
"""
import re

# Unzipped English Wiktionary dump, read from the current directory.
DUMP_FILENAME = "enwiktionary-20140118-pages-articles-multistream.xml"

# Language name as written on translation lines ("* Finnish: ...") and the
# language code used inside the {{t|..}} / {{t+|..}} translation templates.
LANGUAGE_NAME = "Finnish"
LANGUAGE_CODE = "fi"


def extract_word(line, language=LANGUAGE_NAME, code=LANGUAGE_CODE):
    """Return the first translated word on a translation line, or None.

    See https://en.wiktionary.org/wiki/Help:Translations

    Only lines containing "* <language>: " are considered. On such a line,
    the text following ``t|<code>|`` or ``t+|<code>|`` in the first
    double-curly-bracket template is returned. For example:

        * Finnish: {{t+|fi|muukalainen}}, {{t+|fi|ulkomaalainen}}
        ->
        muukalainen

    The template may or may not have a plus ({{t+|fi| or {{t|fi|), and may
    carry extra arguments such as grammatical gender, like the "|p" in:

        * Finnish: {{t+|fi|tee}}, {{t|fi|teenlehdet|p}}

    Args:
        line: One line of text from the dump.
        language: Language name as it appears after the bullet.
        code: Language code used inside the translation template.

    Returns:
        The matched word, or None when the line is not a translation line
        for the language or its first template is not a t/t+ template with
        the given code.
    """
    if "* " + language + ": " not in line:
        return None

    # [^{]* skips everything before the first template, so only the first
    # template on the line is examined. [+]* makes the plus optional; the
    # capture stops at "|" or "}" so trailing template arguments are dropped.
    pattern = r"[^{]*\{\{t[+]*\|" + re.escape(code) + r"\|([^}|]+)"
    match = re.match(pattern, line)
    if match is None:
        return None
    return match.group(1)


def extract_words(lines, language=LANGUAGE_NAME, code=LANGUAGE_CODE):
    """Collect the words extracted from an iterable of lines.

    Args:
        lines: Any iterable of text lines (an open file works).
        language: Passed through to extract_word().
        code: Passed through to extract_word().

    Returns:
        A set of words; using a set ensures no duplicates.
    """
    words = set()
    for line in lines:
        word = extract_word(line, language, code)
        if word is not None:
            words.add(word)
    return words


def main():
    """Read the dump named by DUMP_FILENAME and print one word per line."""
    # NOTE(review): the file is opened with the platform default encoding,
    # as before. The dump is presumably UTF-8; confirm and consider an
    # explicit encoding if this is only ever run under Python 3.
    with open(DUMP_FILENAME) as infile:
        words = extract_words(infile)

    for word in words:
        print(word)


if __name__ == "__main__":
    main()
# End of file