-
Notifications
You must be signed in to change notification settings - Fork 0
/
character_classes.py
87 lines (74 loc) · 4.22 KB
/
character_classes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# Character classes, used as building blocks for alphabets.
# Each name below defines a broad category of symbols, for easy construction of alphabets:
# * all chars in the same sublist map to the same code
# * the char that comes first in a sublist is taken as that sublist's representative
space_charset = [chr(0x20)]  # U+0020 (plain space) is the only member
# Latin alphabet classes: the ten digits, then the letters A-Z, each as an
# upper-case entry followed by its lower-case entry.
# An entry is either a bare string or a sublist; within a sublist all chars map
# to the same code and the first char (the plain letter) is the representative.
# Besides accented letters, some sublists also carry look-alike signs
# (eg. '£' under 'L', '°' under 'o', 'ß' under 'S').
latin_charset = [
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
['A','Á','Â','Ã','Ä','Å','Æ','Ă','Ą','À', 'Ā'], ['a','á','â','ã','ä','å','æ','ă','ą','à','ā'],
'B', 'b',
['C','Ç','Ć','Ĉ','Ċ','Č'], ['c','ç','ć','ĉ','ċ','č'],
['D','Ð','Ď','Đ'], ['d','ð','ď','đ'],
['E','È','É','Ê','Ë','Ē','Ĕ','Ė','Ę','Ě'], ['e','è','é','ê','ë','ē','ĕ','ė','ę','ě'],
'F', 'f',
['G','Ĝ','Ğ','Ġ','Ģ'], ['g','ĝ','ğ','ġ','ģ','ḡ'],
['H','Ĥ','Ħ'], ['h','ĥ','ħ'],
['I','Ì','Í','Î','Ï','Ĩ','Ī','Ĭ','Į','İ','IJ'], ['i','ì','í','î','ï','ĩ','ī','ĭ','į','ı','ij'],
['J','Ĵ', 'Ɉ'], ['j','ĵ','ɉ'],
['K','Ķ'], ['k','ķ','ĸ'],
['L','Ĺ','Ļ','Ľ','Ŀ','Ł','£'], ['l','ĺ','ļ','ľ','ŀ','ł'],
'M', 'm',
['N','Ñ','Ń','Ņ','Ň','Ŋ'], ['n','ñ','ń','ņ','ň','ʼn','ŋ'],
['O','Ò','Ó','Ô','Õ','Ö','Ō','Ŏ','Ő','Œ'], ['o','ò','ó','ô','õ','ö','ō','ŏ','ő','œ','°'],
['P','Ꝑ','Ꝓ'], ['p','ꝑ','ꝓ'],
['Q','Ꝗ','Ꝙ'], ['q','ꝗ','ꝙ'],
['R','Ŕ','Ŗ','Ř','Ꝛ','Ꝝ','ꝶ'], ['r','ŕ','ŗ','ř','ˀ','ꝛ','ꝝ','ꝵ'],
['S','Ś','Ŝ','Ş','Š','ß'], ['s','ś','ŝ','ş','š','ſ','ẜ','\uf2f7'], # the last lower-case entry ('\uf2f7') is the German schilling sign
['T','Ţ','Ť','Ŧ'], ['t','ţ','ť','ŧ','ꝷ'],
['U','Ù','Ú','Û','Ü','Ũ','Ū','Ŭ','Ů','Ű','Ų'], ['u','ù','ú','û','ü','ũ','ū','ŭ','ů','ű','ų'],
['V','Ꝟ'], ['v','ꝟ'],
['W','Ŵ','Ẅ'], ['w','ŵ','ẅ'],
'X', 'x',
['Y','Ȳ','Ý','Ÿ','Ŷ','Ẏ'], ['y','ȳ','ý','ÿ','ŷ','ẏ','ẙ'],
['Z','Ź','Ż','Ž','Ẑ', 'Ƶ', 'Ʒ', 'Ȥ'], ['z','ź','ż','ž', 'ẑ', 'ƶ', 'ʒ', 'ȥ'],
]
# Punctuation classes, one sublist per line; each sublist collapses onto its
# first member.
punctuation_charset = [
    ['.', '✳'],
    [',', ';', ':'],
    ['-', '¬', '—', '='],
    ['¶', '§'],
]
# Combining superscript letters, all in one class and listed in the
# alphabetical order of the letter each one depicts: a c d e h i m o r t u v x.
superscript_charset = [[
    chr(code_point)
    for code_point in (
        0x0363, 0x0368, 0x0369, 0x0364, 0x036A, 0x0365, 0x036B,
        0x0366, 0x036C, 0x036D, 0x0367, 0x036E, 0x036F,
    )
]]
# Assorted diacritics: marks used in combination with another letter (not to be
# confused with the accented, pre-composed characters). Possible treatments:
#
# * when the combination stands for itself:
#
#   + substitute the equivalent pre-composed character (not always possible, nor useful)
#   + drop the diacritic mark from the sample (essentially the same result as
#     mapping accented characters onto their base representative)
#
# * when the combination is an abbreviation, i.e. stands for (base letter + string):
#
#   + substitute an abbreviation representative (eg. '*'), if sticking with a 'basic' transcription
#   + expand the abbreviation (not always possible, nor relevant)
#
# Everything sits in one sublist: "'" and 'ʼ', then the marks U+0300..U+0315
# (placed above), then U+0321, U+0322, U+0327, U+0328 and U+1DD0 (attached below).
diacritic_charset = [[
    "'", "ʼ",
    *map(chr, range(0x300, 0x316)),
    '\u0321', '\u0322', '\u0327', '\u0328', '\u1dd0',
]]
# Brackets, slashes and the vertical bar share a single class, headed by '('.
parenthesis_charset = [list('()[]{}/\\|')]
# Abbreviation signs: conventional stand-ins for a whole string of characters,
# i.e. more than a mere 'decorated' letter.
# The list is flat (no sublists), so every sign is an entry of its own.
abbreviation_charset = [ '*', 'ƺ','Ꝯ','ꝯ','Ꝫ','ꝫ','ȝ','₰','ꝰ','ꝭ','&','₎','Ꜿ','כּ' ]
# Hebrew: every code point from U+0591 through U+05F4, each one its own entry.
hebrew_charset = list(map(chr, range(0x0591, 0x05f5)))
# Greek: the run starting at small alpha (U+03B1..U+03E0) comes first, followed
# by the run starting at capital alpha (U+0391..U+03AF); each char is its own entry.
greek_charset = [
    *map(chr, range(0x03b1, 0x03e1)),
    *map(chr, range(0x0391, 0x03b0)),
]
# Master list: the entries of every charset, concatenated in this fixed order.
all_charsets = [
    *space_charset,
    *latin_charset,
    *punctuation_charset,
    *superscript_charset,
    *diacritic_charset,
    *parenthesis_charset,
    *abbreviation_charset,
    *hebrew_charset,
    *greek_charset,
]
# check for duplicates:
from libs import list_utils as lu
import itertools
from collections import Counter
# Sanity check run at import time: no symbol may be listed more than once
# across all charsets.
all_chars = lu.flatten(all_charsets)
if len(set(all_chars)) < len(all_chars):
    # Keep only the (symbol, count) pairs whose count differs from 1; sorting
    # first makes the report come out in code-point order.
    duplicates = [
        pair for pair in Counter(sorted(all_chars)).items() if pair[1] != 1
    ]
    raise ValueError(f"Duplicate characters in the input sublists: {duplicates}")