-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_romaji_to_kana.py
78 lines (65 loc) · 1.82 KB
/
get_romaji_to_kana.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
"""
Helper script that scrapes the Wikipedia "Romanization of Japanese" page
and writes a romaji-to-kana conversion table to conversion.json.
"""
import json
import requests
from bs4 import BeautifulSoup
from lxml import html
# Page that is scraped for the kana/romaji chart.
url = 'https://en.wikipedia.org/wiki/Romanization_of_Japanese'

# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops us from parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
wiki_text = response.text

soup = BeautifulSoup(wiki_text, features='lxml')
table = soup.find_all('table', class_='wikitable')
if len(table) < 2:
    # The script relies on the second "wikitable" on the page; fail with a
    # readable message if the page layout no longer provides it.
    raise IndexError(
        f'expected at least 2 wikitable tables at {url}, found {len(table)}'
    )
conv_tbl = table[1]

# Maps a romaji spelling to its kana string.
conv = {}
trs = conv_tbl.tbody.find_all('tr')
for tr in trs:
    tds = tr.find_all('td')
    if not tds:
        # Rows without <td> cells (e.g. header rows) carry no data.
        continue
    kana = tds[0].get_text().strip()
    # Cell 0 is the kana; cell 1 is skipped (presumably the katakana form --
    # verify against the page); every remaining cell is a romaji spelling.
    for cell in tds[2:]:
        romaji = cell.get_text().strip()
        if romaji.startswith("n-n"):
            # The syllabic "n" row: register it as "nn" and as "n" followed
            # by a space instead of the raw cell text.
            conv['nn'] = kana
            conv['n '] = kana
            continue
        # First occurrence wins, so later rows for obsolete characters
        # (such as we / wi) cannot override an existing spelling.
        if romaji in conv:
            continue
        conv[romaji] = kana

# Add small tsu (U+3063).
conv['ltsu'] = '\u3063'

# Four-letter spellings that are aliases of an existing three-letter entry.
four_letters = {
    'shya': 'sha', 'shyu': 'shu', 'shyo': 'sho',
    'chya': 'cha', 'chyu': 'chu', 'chyo': 'cho',
}
for key, val in four_letters.items():
    if val not in conv:
        print(f'{val} conv key not found.')
        continue
    conv[key] = conv[val]

# Hand-written entries built from U+3094, U+3075, U+3046 or U+3044 plus a
# small kana. They are applied last and replace any entry with the same key
# that was scraped from the table.
extras = {
    'va': '\u3094\u3041',
    'vi': '\u3094\u3043',
    'vu': '\u3094',
    've': '\u3094\u3047',
    'vo': '\u3094\u3049',
    'vya': '\u3094\u3083',
    'vyu': '\u3094\u3085',
    'vyo': '\u3094\u3087',
    'fya': '\u3075\u3083',
    'fyu': '\u3075\u3085',
    'fyo': '\u3075\u3087',
    'fa': '\u3075\u3041',
    'fi': '\u3075\u3043',
    'fe': '\u3075\u3047',
    'fo': '\u3075\u3049',
    'wu': '\u3046',
    'wi': '\u3046\u3043',
    'we': '\u3046\u3047',
    'yi': '\u3044',
    'ye': '\u3044\u3047',
}
conv.update(extras)

# json.dumps escapes non-ASCII characters by default, so the file content is
# plain ASCII; the explicit encoding just removes any dependence on the
# platform default.
with open('conversion.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(conv, indent=4))