-
Notifications
You must be signed in to change notification settings - Fork 1
/
make_dictionary.py
executable file
·50 lines (40 loc) · 1.33 KB
/
make_dictionary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/usr/bin/env python
"""
This script requires a dump of the wiktionary for the language in question.
Check the README.md or run the following command.
$ make dictionary.db
"""
import dbm
import gzip
import sys
def process_page_row(row, dictionary):
    """Record the word held in one parsed row, if the row qualifies.

    Args:
        row: Sequence of field strings for a single row. Only index 1 and
            index 2 are read.
        dictionary: Mapping that supports item assignment; the normalised
            word is stored as a key with the value ``'1'``.

    Rows whose field 1 is not the string ``'0'`` are ignored. For the rest,
    field 2 is lower-cased, stripped of surrounding whitespace, has its
    underscores turned into spaces and its single quotes removed; the result
    is stored in ``dictionary`` and echoed to stdout.
    """
    # NOTE(review): field 1 is presumably the page namespace and field 2 the
    # quoted page title - confirm against the dump's `page` table schema.
    if row[1] != '0':
        return

    normalized = row[2].lower().strip()
    normalized = normalized.replace('_', ' ')
    normalized = normalized.replace("'", "")

    dictionary[normalized] = '1'
    print(normalized)
def split_sql_insert(line):
    """Break a long sql insert into a list of row strings.

    Args:
        line: One line of the dump, expected to look like
            "INSERT INTO `page` VALUES (...),(...);" with or without a
            trailing line ending.

    Returns:
        A list with one string per value tuple. Each string is the raw text
        found between that tuple's parentheses; the fields inside it are not
        split here.
    """
    # Remove the line ending first so the closing ");" is recognised on a
    # final line that has no newline and on lines ending in "\r\n".
    statement = line.rstrip("\r\n")

    # NOTE: the split is purely textual, so a value that itself contains the
    # sequence "),(" would be cut in two.
    value_list = statement.split("),(")
    value_list[0] = value_list[0].replace("INSERT INTO `page` VALUES (", "")
    if value_list[-1].endswith(");"):
        value_list[-1] = value_list[-1][:-2]
    return value_list
def parse_sql(filename, dictionary):
    """Feed every row of a gzipped SQL dump to ``process_page_row``.

    Args:
        filename: Path to the gzip-compressed SQL dump.
        dictionary: Store that is handed unchanged to ``process_page_row``
            for every row.

    Lines that do not start with ``INSERT`` are skipped. A row whose
    processing raises is reported on stdout and skipped, so one bad row does
    not abort the whole import.
    """
    # Decode explicitly instead of relying on the platform's locale encoding.
    # NOTE(review): the dump is presumed to be UTF-8 - confirm for your source.
    with gzip.open(filename, 'rt', encoding='utf-8') as infile:
        # Iterate the file lazily; reading every line up front would hold the
        # whole decompressed dump in memory.
        for line in infile:
            if not line.startswith('INSERT'):
                continue
            for row in split_sql_insert(line):
                try:
                    # NOTE(review): a plain comma split also cuts inside a
                    # quoted field that itself contains a comma.
                    process_page_row(row.split(','), dictionary)
                except Exception as e:
                    # Best effort: report the row's failure and carry on.
                    print(f"Exception {e}")
def main(filename, db_path='dictionary.db'):
    """Build the dictionary database from a gzipped SQL dump.

    Args:
        filename: Path to the gzipped SQL dump handed to ``parse_sql``.
        db_path: Name of the dbm database to open. Defaults to
            ``'dictionary.db'``, the name that used to be hard-coded.
    """
    # Flag 'c' opens the database for reading and writing and creates it when
    # missing; the ``with`` block closes it once parsing ends or fails.
    with dbm.open(db_path, 'c') as dictionary:
        parse_sql(filename, dictionary)
if __name__ == "__main__":
    # Exit with a usage message instead of an IndexError traceback when the
    # dump path is missing. Extra arguments are still ignored, as before.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} PAGE_DUMP.sql.gz")
    filename = sys.argv[1]
    main(filename)