forked from life4/homoglyphs
-
Notifications
You must be signed in to change notification settings - Fork 4
/
generate.py
88 lines (73 loc) · 2.68 KB
/
generate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""
Stolen and a little bit refactored from:
https://github.com/vhf/confusable_homoglyphs/blob/master/confusable_homoglyphs/cli.py
"""
import re
from collections import defaultdict
from urllib.request import urlopen
import json
from pathlib import Path
import sys
# Output directory for the generated JSON files: the first command-line
# argument when one is supplied, otherwise the default "homoglyphs" folder.
homoglyphs_dir = sys.argv[1] if len(sys.argv) > 1 else "homoglyphs"
path = Path(homoglyphs_dir)
def generate_categories():
    """Generate ``categories.json`` from the Unicode ``Scripts.txt`` file.

    Downloads ``Scripts.txt``, extracts every code point (or code point
    range) together with its script name, and writes the result to
    ``categories.json`` inside the module-level ``path`` directory.

    The written JSON object has two keys:

    * ``points`` -- ``[first, last, ALIAS]`` entries sorted ascending, where
      ``first``/``last`` are integer code points (equal for a single code
      point) and ``ALIAS`` is the upper-cased script name.
    * ``aliases`` -- sorted list of the distinct upper-cased script names.

    Lines of the downloaded file that do not match the expected layout are
    skipped. Errors from the download or from opening the output file are
    not caught here.
    """
    # inspired by https://gist.github.com/anonymous/2204527
    # Groups: (1) first code point, (2) optional end of a "from..to" range,
    # (3) the script name that precedes the trailing "# <word>" comment.
    rex = re.compile(
        r'([0-9A-F]+)(?:\.\.([0-9A-F]+))?\W+(\w+)\s*#\s*\w+',
        re.UNICODE,
    )
    # NOTE(review): fetched over plain FTP; consider an HTTPS mirror of the
    # same file if the FTP endpoint becomes unavailable.
    url = 'ftp://ftp.unicode.org/Public/UNIDATA/Scripts.txt'
    # Close the connection deterministically once the payload is read
    # instead of leaving the response object to the garbage collector.
    with urlopen(url) as response:
        content = response.read().decode('utf-8').split('\n')

    points = []
    aliases = set()
    for line in content:
        match = rex.search(line)
        if match is None:
            continue
        # default='' mirrors re.findall, which yields '' for an absent group.
        code_point_range_from, code_point_range_to, alias = match.groups(default='')
        alias = alias.upper()
        aliases.add(alias)
        points.append((
            int(code_point_range_from, 16),
            # A single code point has no range end: reuse the start.
            int(code_point_range_to or code_point_range_from, 16),
            alias,
        ))
    points.sort()

    # json.dumps escapes non-ASCII by default, so the payload is pure ASCII;
    # the explicit encoding just keeps the file independent of the locale.
    with (path / 'categories.json').open('w', encoding='utf-8') as stream:
        data = {'points': points, 'aliases': sorted(aliases)}
        stream.write(json.dumps(data, indent=2, sort_keys=True))
def generate_confusables():
    """Generate ``confusables.json`` from the Unicode ``confusables.txt`` file.

    Downloads ``confusables.txt``, reads the two characters shown in the
    ``( a → b )`` part of each mapping line, and records them as confusable
    with each other (in both directions). The result is written to
    ``confusables.json`` inside the module-level ``path`` directory as a JSON
    object mapping each character to the sorted list of characters it can be
    confused with.

    Lines that do not match the expected layout are skipped, as are lines
    whose two captured characters are identical. Errors from the download or
    from opening the output file are not caught here.
    """
    url = 'ftp://ftp.unicode.org/Public/security/latest/confusables.txt'
    # Close the connection deterministically once the payload is read
    # instead of leaving the response object to the garbage collector.
    with urlopen(url) as response:
        lines = response.read().decode('utf-8').split('\n')

    # Named groups: ``char1``/``char2`` are the two glyphs inside the
    # parenthesised "( a → b )" comment; ``steps`` optionally captures the
    # intermediate mapping chain after the tab.
    rex = re.compile(
        r'[0-9A-F ]+\s+;\s*[0-9A-F ]+\s+;\s*\w+\s*#'
        r'\*?\s*\( (?P<char1>.+) → (?P<char2>.+) \) '
        r'.+ → .+\t'
        r'#(?:\s→(?P<steps>.+)→)?',
        re.UNICODE,
    )

    confusables_matrix = defaultdict(set)
    for line in lines:
        match = rex.search(line)
        if match is None:
            continue
        # Only the two end characters are linked; the ``steps`` group is
        # captured by the pattern but not added to the table.
        char1, char2 = match.group('char1', 'char2')
        if char1 == char2:
            continue
        # The relation is symmetric, so store it under both characters.
        confusables_matrix[char1].add(char2)
        confusables_matrix[char2].add(char1)

    # Sets are not JSON-serialisable; sorted lists also keep output stable.
    table = {char: sorted(others) for char, others in confusables_matrix.items()}
    # json.dumps escapes non-ASCII by default, so the payload is pure ASCII;
    # the explicit encoding just keeps the file independent of the locale.
    with (path / 'confusables.json').open('w', encoding='utf-8') as stream:
        stream.write(json.dumps(table, indent=2, sort_keys=True))
if __name__ == "__main__":
    # Script entry point: only the confusables table is regenerated here;
    # the categories step below is currently switched off.
    # generate_categories()
    generate_confusables()