-
Notifications
You must be signed in to change notification settings - Fork 0
/
charNameConvert.py
executable file
·921 lines (788 loc) · 33.8 KB
/
charNameConvert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
#!/usr/bin/env python3
#
# charNameConvert.py
# 2022-06-15: Written by Steven J. DeRose.
#
import sys
import os
import codecs
import re
from enum import IntEnum
from subprocess import check_output, CalledProcessError
import xml.dom.minidom
from xml.dom.minidom import Node
from collections import defaultdict
from typing import Dict, List # , Union, IO,
import logging
# Logger shared by everything in this file.
lg = logging.getLogger("charNameConvert")


def fatal(msg:str) -> None:
    """Log `msg` at CRITICAL level, then terminate the program.

    The process exits with status 1 (not the default 0), so that a calling
    shell or script can tell that the run failed.

    Raises:
        SystemExit: always.
    """
    lg.critical(msg)
    sys.exit(1)
# Descriptive metadata about this script (title, rights, dates, license...).
# The "modified" date doubles as the version string: it is copied into
# __version__ below, which is what the --version option reports.
__metadata__ = {
"title" : "charNameConvert",
"description" : "Interface Rahtz's huge table of tex/xml/afii/etc char names.",
"rightsHolder" : "Steven J. DeRose",
"creator" : "http://viaf.org/viaf/50334488",
"type" : "http://purl.org/dc/dcmitype/Software",
"language" : "Python 3.7",
"created" : "2022-06-15",
"modified" : "2022-06-15",
"publisher" : "http://github.com/sderose",
"license" : "https://creativecommons.org/licenses/by-sa/3.0/"
}
__version__ = __metadata__["modified"]
descr = """
=Name=
charNameConvert: Interface Rahtz's huge table of tex/xml/afii/etc char names.
=Description=
Provide an interface to information about special character
names as defined by many organizations. These
are based on (imho, superb) data assembled by my old friend Sebastian Rahtz,
David Carlisle, and others [https://www.w3.org/Math/characters/unicode.xml].
You can get a chart of equivalents, or do conversions from/to the various
representations, as well as literal Unicode characters. In the case of HTML or XML,
decimal and hexadecimal character references are also available, either as
the preferred representation, or as the fallback when no named entity is available.
==Usage from code==
The API (currently) only handles conversion of code points and bare names.
It doesn't know anything about HTML "&", TEX "\\", etc. So if you want to recognize
or transform references in a particular document/file syntax, you'll need to do
that on top. One possible example, to go from TEX to HTML special characters,
such as \\phi to &phgr;:
from charNameConvert import charNameConvert
cnc = charNameConvert(os.environ["HOME"]+"/myStuff/unicode.xml")
cmap:Dict = cnc.getMap("latex", "html4")
def fixChar(m):
if "\\"+m.group(1) in cmap: return "&" + cmap[m.group(1) + ";"]
return m
for rec in sys.stdin.readlines():
rec = re.sub(r"\\\\(\\w+)(?![\\[\\{])", fixChar, rec)
print rec
Note that latex, varlatex, IEEE, AMS, and Springer
include backslashes (and sometimes more),
such as achieving Unicode MATHEMATICAL BOLD via \\mathbf{X}.
Entities, however, are stored with no "&" or ";" (but those are adding
during conversion).
==Usage on the command line==
You can get a conversion chart with --chart (as TeX definitions with --oformat texdefs),
or convert an actual file with:
charNameConvert.py --frCode [name1] --toCode [name2] [options] [files]
This will convert stdin by replacing any character references in the input,
which are encoded by the syntax rules of system [name1],
to their equivalent in the --to standard. For example:
charNameConvert.py --frCode LaTeX --to html
would change \\phi to &phgr;, among many others.
You can omit --frCode, in which case literal characters outside the normal
ASCII range will be affected. The input character set defaults to utf-8,
but can be changed with --iencoding.
Similarly, if you omit --toCode, characters whose encoding is recognized in the
input, will be written out as literals in the output.
I expect to add options to handle numeric character references as well.
The input and output formats supported are (for U+2022 BULLET as example):
* latex -- \\textbullet
* xml -- •
* xml10 -- •
* xml16 -- •
* literal -- [an encoded character per se]
* slashu == \\uFFFF as used in various languages
The "xml" cases are slightly special, because the source data only includes
the name, not the "&" and ";". So these are recognized and generated
specially. Also,
you can specify --fallback for what to do if a character is recognized, but
does not exist in the requested output format. It can be one of:
* "unchanged"
* "literal"
* "xml10"
* "xml16" (this is the default)
* "slashu"
For example, a character with no html named entity, will be written as &#x___;,
where "___" is the hex character code, if you specify --fallback xml10.
==Names of supported character standards==
The source-names used here are mostly the same as in the source data (sample below).
But note that different SGML or XML
schemas can define their own entity names, which may use the same name in
different ways. Therefore, we use the names of the entity sets (such as "html4-symbol")
rather than just "entity":
* afii
* latex
* mathlatex
* APS
* ACS
* Wolfram
* html4
* isopub
* mmlalias (this seems to have many duplicates)
* description (this is generally the full unicode name)
Particular font position is also available, via two properties:
* fontname
* fontpos="1"
Finally, you can specify these forms:
* slashu -- the hex Unicode code point, like \\uFFFF
* url -- this is not yet supported, but will %-escape the UTF-8 bytes.
For output, \\U000FFFFF is used for Unicode code points too large for \\uFFFF.
If you set --short, \\xFF will be used when possible.
=Source format=
After some preliminaries, the data consists of a long series of elements like this:
<character id="U02022" dec="8226" mode="mixed" type="binaryop">
<afii>EB6E</afii>
<latex>\textbullet </latex>
<mathlatex>\bullet</mathlatex>
<APS>bull</APS>
<ACS>bbull</ACS>
<Wolfram>Bullet</Wolfram>
<entity id="bull" set="html4-symbol">
<desc>bullet = black small circle</desc>
</entity>
<entity id="bull" set="8879-isopub">
<desc>/bullet B: =round bullet, filled</desc>
</entity>
<entity id="bullet" set="mmlalias">
<desc>alias ISOPUB bull</desc>
</entity>
<font name="hlcra" pos="1"/>
<description>BULLET</description>
</character>
The data sometimes has entries like this, which are presently ignored:
<surrogate mathvariant="sans-serif-bold-italic" ref="U003D6"/>
=Related commands and data=
The source data on character names is from [https://www.w3.org/Math/characters/unicode.xml].
A copy is also available at [https://github.com/sderose/Charsets.git/blob/master/unicode.xml].
There is an enormous collection of TeX character sets at
[https://ctan.org/pkg/comprehensive].
=Known bugs and Limitations=
The --entitySets feature, for giving a prioritized list of which sets to try for
translating (in or out), is unfinished, as is an equivalent feature for multiple
latex libraries.
Several entity sets define multiple names for a single character, such as
'entity.9573-13-isoamsr' defining 'smile' and 'ssmile' both to U+2323 (at least
according to the source data I have). Only the first is kept. This should be no
serious problem for output, but the others won't be recognized on input.
Some entries are for a combination of characters, as shown below. These generate
a warning during loading (except with -q), and are discarded.
<character id="U0003C-020D2" dec="60-8402" type="other" mode="unknown">
<entity id="nvlt" set="9573-13-isoamsn">
<desc>not, vert, less-than</desc>
</entity>
<description unicode="combination">LESS-THAN SIGN with vertical line</description>
</character>
Some information in the source is not used, such as the "type" and "mode" attributes
of characters.
Various integrity checks are done, such as making sure the hex and decimal code points
match for each character. This is usually done by a bare assertion, so if they ever
come up you'll have to look in the data or code to see what happened.
I haven't done anything special for surrogate, mmlextra, Elsevier, or bmp.
I've found one apparent error: U027FA has a 'font' entry for name 'hlcry'
that specifies position '40)'. It is discarded.
=To do=
Implement priority of latex, varlatex, mathlaex, mathvariant. Similar to
how XML entity sets work.
Add a feature to take any XML entity set(s), and write out TeX definitions to
make them available with the same names.... E.g.:
\\def{\\bgr}{^^^^03b2} % GREEK SMALL LETTER BETA
Bonus points: Make &bgr; work directly in TeX....
=History=
* 2022-06-15: Written by Steven J. DeRose.
=Rights=
Copyright 2022-06-15 by Steven J. DeRose. This work is licensed under a
Creative Commons Attribution-Share-alike 3.0 unported license.
See [http://creativecommons.org/licenses/by-sa/3.0/] for more information.
For the most recent version, see [http://www.derose.net/steve/utilities]
or [https://github.com/sderose].
=Options=
"""
###############################################################################
# Standard formats we know about.
# Probably just add the entity-sets to these, and ditch the enum.
#
class CharStd(IntEnum): # TODO: Drop
    """Integer ids for the character standards / properties known here.

    Most member names match property names used as keys elsewhere in this
    file ("latex", "afii", "description", ...). Slated for removal (see the
    TODO above); nothing else in the code shown here refers to this enum.
    """
    slashu = 0 # Special
    afii = 1
    latex = 2
    mathlatex = 3 # ?
    varlatex = 4 # ?
    ACS = 5
    AIP = 6
    APS = 7
    IEEE = 8
    Springer = 9
    Wolfram = 10
    html4 = 11 # Special
    isopub = 12 # Special
    mmlalias = 13 # Special -- duplicates
    surrogate = 14
    Elsevier = 15
    bmp = 16
    prop = 17
    font = 18 # Special (tuple)
    description = 19
# The names for different charset standards or expressions. These are
# mostly element type names in the source, with the corresponding name
# or string as their text content. However:
# "entity" is repeatable and has a set-name and ent name;
# "slashu" is custom, to hold the \\u or similar value
# "description" is just that.
# "font", not yet supported, has to be a tuple.
#
# First item: says whether to show by default in tostring(), --chart,...
# First item: says whether to show by default in tostring(), --chart,...
# (setDisplayProps() collects the names whose first item is true.)
INCL = True
EXCL = False
# Second item: TODO What were these?
# NOTE(review): F and P are bound to the same value (1), so code cannot
# currently tell them apart; their intended meaning is not recorded here.
F = P = 1
# Third item: What kind of value?
# TODO: What to do with cases that are not in Unicode?
# Note that many entries below put the builtin `str` in this slot instead of
# one of these integer kinds.
OTHER = 0
NAME = 1 # For ones that are just a name, like entity.xxx
LATEX = 2 # For ones that take a LaTeX string as value
FONT = 3 # A font name plus position in that font
# Table of the property names this script knows, each mapped to a tuple
# laid out as described above. loadData() accepts a child element of
# <character> only if its name is a key here (apart from "entity", "font"
# and "description", which it handles specially).
# Fourth item: a count, stored as a string -- presumably how often the name
# occurs in the source data (TODO confirm). "literal" has no fourth item.
propNames = {
# (EXCL, ?, type, freq),
"ACS": (EXCL, F, str, "61", ),
"AIP": (EXCL, F, str, "394", ),
"AMS": (EXCL, F, str, "526", ),
"APS": (EXCL, F, str, "463", ),
"Elsevier": (EXCL, F, str, "745", ),
"IEEE": (EXCL, F, str, "223", ),
"Springer": (EXCL, F, str, "30", ),
"Wolfram": (EXCL, F, str, "695", ),
"afii": (EXCL, F, str, "1170", ),
"bmp": (EXCL, F, str, "24", ),
#"character": (EXCL, F, str, "5646", ),
"charlist": (EXCL, P, str, "1", ), # ?
"comment": (EXCL, P, str, "210", ),
"desc": (EXCL, P, str, "3974", ), # ?
"description": (INCL, P, str, "5646", ),
"elsrender": (EXCL, F, str, "50", ),
"entity": (EXCL, F, NAME, "3975", ), # SPECIAL
"entitygroups": (EXCL, P, str, "1", ), # ?
"font": (EXCL, P, FONT, "560", ),
"group": (EXCL, P, OTHER, "5", ),
"latex": (INCL, F, LATEX, "2480", ),
"literal": (EXCL, F, str, ), # SPECIAL
"mathlatex": (INCL, F, LATEX, "198", ),
"mathvariant": (INCL, F, LATEX, "14", ),
"set": (EXCL, F, OTHER, "56", ),
"surrogate": (EXCL, F, str, "1016", ),
"varlatex": (INCL, F, LATEX, "18", ),
"xref": (EXCL, P, OTHER, "63", ),
#"@image": (EXCL, F, str, "1442", ),
#"@mode": (EXCL, F, str, "4321", ),
#"@type": (EXCL, F, str, "4321", ),
}
###############################################################################
# The entity sets mentioned in the source data. To translate these, specify
# --frCode or --toCode "xml" and then give a list of these with --entitySets.
# TODO: Provide way to scan for outright conflicts over the same name?
#
# Every entity set named in the source data. Only the keys are used by the
# code in this file (as the list of valid set names, in this order); each
# value is a "=" placeholder.
entitySetMap = {
    "8879-isoamsa": "=", # freq 56
    "8879-isoamsb": "=", # freq 42
    "8879-isoamsc": "=", # freq 10
    "8879-isoamsn": "=", # freq 59
    "8879-isoamso": "=", # freq 18
    "8879-isoamsr": "=", # freq 84
    "8879-isobox": "=", # freq 40
    "8879-isocyr1": "=", # freq 67
    "8879-isocyr2": "=", # freq 26
    "8879-isodia": "=", # freq 14
    "8879-isogrk1": "=", # freq 49
    "8879-isogrk2": "=", # freq 20
    "8879-isogrk3": "=", # freq 43
    "8879-isogrk4": "=", # freq 43
    "8879-isolat1": "=", # freq 62
    "8879-isolat2": "=", # freq 121
    "8879-isonum": "=", # freq 76
    "8879-isopub": "=", # freq 84
    "8879-isotech": "=", # freq 62
    #
    "9573-13-isoamsa": "=", # freq 146
    "9573-13-isoamsb": "=", # freq 119
    "9573-13-isoamsc": "=", # freq 22
    "9573-13-isoamsn": "=", # freq 90
    "9573-13-isoamso": "=", # freq 52
    "9573-13-isoamsr": "=", # freq 180
    "9573-13-isogrk3": "=", # freq 43
    "9573-13-isogrk4": "=", # freq 43
    "9573-13-isomfrk": "=", # freq 52
    "9573-13-isomopf": "=", # freq 26
    "9573-13-isomscr": "=", # freq 52
    "9573-13-isotech": "=", # freq 161
    #
    "html4-lat1": "=", # freq 96
    "html4-special": "=", # freq 32
    "html4-symbol": "=", # freq 124
    #
    "ISOAMSA": "=", # freq 10
    "ISOAMSC": "=", # freq 1 # Is this right? TODO
    "ISOAMSO": "=", # freq 1 # Is this right? TODO
    "ISOAMSR": "=", # freq 4
    #
    "ISObox": "=", # freq 40
    #
    "ISOCYR1": "=", # freq 67
    "ISOCYR2": "=", # freq 26
    #
    "ISODIA": "=", # freq 9
    #
    "ISOGRK1": "=", # freq 49
    "ISOGRK2": "=", # freq 20
    "ISOGRK3": "=", # freq 2 # Is this right? TODO
    #
    "ISOLAT2": "=", # freq 117 # I guess lat1 is redundant w/ html4?
    #
    "ISOPUB": "=", # freq 66
    #
    "ISOTECH": "=", # freq 1 # Is this right? TODO
    #
    "mmlalias": "=", # freq 548
    "mmlextra": "=", # freq 107
    #
    "predefined": "=", # freq 5 # XML predefined set, useful w/ --fallback.
    #
    "STIX": "=", # freq 688
}

# Provide shorthand for set of entity sets. See expandEntitySets(),
# which is called from processOptions().
# Each group name is a prefix: it stands for every entity set whose name
# starts with it. "html49" is kept only so existing invocations still parse;
# no set name starts with it, so it is treated as a misspelling of "html4".
entitySetGroups = [
    "8879", "9573", "html4", "html49", "ISOAMS", "ISOCYR", "ISOGRK", "mml"
]

# Deprecated group spellings -> the prefix they really mean.
_entitySetGroupAliases = {
    "html49": "html4",
}

# Similar for LaTeX sets
texGroups = [
    "latex", "varlatex", "mathlatex", "mathvariant"
]


def expandEntitySets(eSets:list):
    """Expand group shorthands in a list of entity-set names.

    Args:
        eSets: entity-set names and/or group names, in priority order.
            None is treated like an empty list.

    Returns:
        A new list in which each exact set name is kept as-is, and each
        group name is replaced by all the sets in entitySetMap whose names
        start with that group's prefix (in entitySetMap order).

    Raises:
        AssertionError: if an item is neither a known set nor a known group.
            Raised explicitly, so the check survives `python -O`.
    """
    newList = []
    for es in (eSets or []):
        if (es in entitySetMap):
            newList.append(es)
        elif (es in entitySetGroups):
            prefix = _entitySetGroupAliases.get(es, es)
            newList.extend(cand for cand in entitySetMap if cand.startswith(prefix))
        else:
            raise AssertionError("Unknown entity set or group '%s'." % (es))
    return newList
###############################################################################
#
class CharStdInfo:
    """Data on one Unicode character, as expressed in a ton of systems.

    self.names maps the name of a standard or property ("latex", "afii",
    "description", "entity.<setName>", ...) to this character's name or
    command in it. "slashu" and "literal" are always present; __init__
    sets them.
    """
    NOT_AVAILABLE = "--"        # Show for missing code in --chart.
    MAX_CODE_POINT = 0x10FFFF   # Largest value chr() accepts.

    # Entity sets that getXML() searches when no --entitySets were chosen.
    DEFAULT_ENTITY_SETS = ( "html4-lat1", "html4-special", "html4-symbol" )

    @staticmethod
    def _option(name:str, default=None):
        """Return command-line option `name`, or `default` if it is unset.

        The module-level `args` namespace only exists when this file runs
        as a script. Looking it up through globals() keeps the class usable
        when imported as a library, where `args` was never created.
        """
        value = getattr(globals().get("args"), name, None)
        return default if value is None else value

    def __init__(self, codePoint:int):
        """Start a record for `codePoint`, with its "slashu" and "literal" forms.

        Raises:
            AssertionError: if codePoint is outside 0..MAX_CODE_POINT.
        """
        assert codePoint >= 0 and codePoint <= CharStdInfo.MAX_CODE_POINT
        self.codePoint = codePoint
        # .names may be a character name (as in XML) or a whole LaTeX command.
        self.names = {}
        self.names["slashu"] = self.getSlash(
            codePoint, bool(self._option("short", False)))
        self.names["literal"] = chr(codePoint)

    def addStd(self, whichStd:str, value:str):
        """Record `value` as this character's name under `whichStd`.

        The first value stored for a given name wins.

        Returns:
            True if stored; False if `whichStd` already had a value (in
            which case nothing changes, and the duplicate is logged unless
            --quiet is set).
        """
        if (whichStd in self.names):
            if (not self._option("quiet", False)):
                lg.info("Duplicate prop %s for U+%05x.", whichStd, self.codePoint)
            return False
        self.names[whichStd] = value
        return True

    def tostring(self, include:Dict=None, compact:bool=True) -> str:
        """Return a printable summary of this character's names.

        Args:
            include: the names to show, in order. Any iterable of names
                works (a dict contributes its keys). Defaults to the keys
                of propNames. "description" is always reported separately,
                and names this character lacks are skipped.
            compact: if true, make one XML-like <code .../> element;
                otherwise one "name: value" line per name.
        """
        if (include is None): include = propNames
        # A character without a description shows the NOT_AVAILABLE marker.
        desc = self.names.get("description", CharStdInfo.NOT_AVAILABLE)
        buf = ""
        for stdName in include:
            if (stdName == "description"):
                continue
            if (stdName == "html4"):
                curName = self.getXML()
            elif (stdName in self.names):
                curName = self.names[stdName]
            else:
                continue
            if (compact):
                buf += " %s=\"%s\"" % (stdName, curName)
            else:
                buf += " %-12s: %s\n" % (stdName, curName)
        if (compact):
            buf = "<code n=\"0x%05x\" desc=\"%s\"\n %s />" % (self.codePoint, desc, buf)
        else:
            buf = "U+%05x: \n%s" % (self.codePoint, buf)
        return buf

    def getXML(self):
        """Find, assemble, and return an XML entity reference to this
        character. This depends on which entity set(s) are chosen:
        an explicit "html4" name if one was stored, else the first hit among
        the --entitySets (or, with none chosen, the html4 sets).
        If no named entity is found among the chosen sets, a fallback is
        generated per --fallback (default "xml16"), to a numeric character
        reference, a backslash code, or the literal character.

        Raises:
            AssertionError: if the --fallback value is not recognized.
        """
        codePoint = self.codePoint
        x = self.names.get("html4")
        if (x is None):
            x = self.findEntity(
                self._option("entitySets", CharStdInfo.DEFAULT_ENTITY_SETS))
        if (x is not None): return "&%s;" % (x)

        fallback = self._option("fallback", "xml16")
        # NOTE(review): this yields the word "unchanged" itself, not the
        # original text -- looks like a placeholder; confirm intent.
        if (fallback == "unchanged"): return "unchanged"
        if (fallback == "xml10"): return "&#%04d;" % (codePoint)
        # Hexadecimal references need the "x" marker; without it the digits
        # would be read as a decimal code point.
        if (fallback == "xml16"): return "&#x%04x;" % (codePoint)
        if (fallback == "literal"): return chr(codePoint) # not checking gt lt etc.
        if (fallback == "slashu"):
            return self.getSlash(codePoint, bool(self._option("short", False)))
        raise AssertionError("Bad --fallback value '%s'." % (fallback))

    def findEntity(self, eSets:list) -> str:
        """Search the sequence of selected entity sets for the first one that
        has this character, and return the entity name (None if none does).
        """
        for eSet in eSets:
            if ("entity."+eSet in self.names): return self.names["entity."+eSet]
        return None

    def findAllEntities(self, eSets:list) -> List:
        """Search the sequence of selected entity sets and return a list of
        pairs, each of (entitySetName, entityName).
        """
        found = []
        for eSet in eSets:
            if ("entity."+eSet in self.names):
                found.append( (eSet, self.names["entity."+eSet]) )
        return found

    @staticmethod
    def getSlash(codePoint:int, short:bool=False):
        """Return the backslash escape for `codePoint`: \\xFF form when
        `short` is set and it fits, else \\uFFFF, else \\UFFFFFFFF.
        """
        if (short and codePoint <= 0xFF): return "\\x%02x" % (codePoint)
        if (codePoint <= 0xFFFF): return "\\u%04x" % (codePoint)
        else: return "\\U%08x" % (codePoint)
###############################################################################
#
class charNameConvert():
    """Gather information from Sebastian Rahtz et al's great DB,
    mapping character expressions across various representations.

    Constructing an instance loads the whole table into self.charDict,
    a dict from code point (int) to a CharStdInfo.
    TODO: Finish the entity and font mappings.
    """
    def __init__(self, path:str=None):
        """Load the character table.

        Args:
            path: location of unicode.xml. Defaults to
                ~/.strfchr/unicode.xml. If no file is there, a download
                from self.sourceUrl is attempted first.
        """
        super().__init__()
        self.sourceUrl = "https://www.w3.org/Math/characters/unicode.xml"
        self.charDict = {} # codepoint: CharStdInfo
        self.nCombinations = 0
        self.displayProps = []
        if (path is None):
            # expanduser() also copes with environments where HOME is unset.
            self.path = os.path.join(
                os.path.expanduser("~"), ".strfchr", "unicode.xml")
        else:
            self.path = path
        self.setDisplayProps()
        self.loadData()

    @staticmethod
    def _option(name:str, default=None):
        """Return command-line option `name`, or `default` if it is unset.

        The module-level `args` namespace only exists when this file runs
        as a script. Looking it up through globals() keeps the class usable
        when imported as a library, where `args` was never created.
        """
        value = getattr(globals().get("args"), name, None)
        return default if value is None else value

    def setDisplayProps(self, dp:List=None) -> None:
        """Make a list of all the properties to include (TODO: Option for this?)
        With no (or an empty) `dp`, use every name in propNames whose first
        tuple item is true.
        TODO: Add the entity.xxx ones?
        """
        if (dp):
            self.displayProps = dp
        else:
            self.displayProps = [ k for k, v in propNames.items() if v[0] ]

    def downloadData(self) -> None:
        """Fetch the source data to self.path with curl, unless a file is
        already there. Failure is logged, not raised; loadData() will then
        fail when it tries to parse the missing file.
        """
        if (not os.path.exists(self.path)):
            try:
                targetDir = os.path.dirname(self.path)
                if (targetDir): os.makedirs(targetDir, exist_ok=True)
                # No shell is involved, so a ">>" redirection would just be
                # passed to curl as an argument; have curl write the file.
                check_output([ "curl", "--silent", "--show-error", "--fail",
                    "--location", "--output", self.path, self.sourceUrl ])
            except (CalledProcessError, OSError) as e:
                # OSError covers curl not being installed, or an unwritable
                # target directory.
                lg.warning("Download attempt failed: %s", e)
        if (not os.path.exists(self.path)):
            lg.critical("Could not download data from '%s'.", self.sourceUrl)

    def loadData(self, incl:Dict=None) -> None:
        """Parse the XML at self.path and (re)build self.charDict.

        Entries for character combinations (a "-" in the id) are counted in
        self.nCombinations and skipped. For each remaining <character>, every
        child element becomes a name on its CharStdInfo: <entity> is stored
        as "entity.<setName>" (the mmlalias set is skipped), <font> as
        "<name> <pos>", and the rest under their element name if that is a
        key of propNames.

        Args:
            incl: names to show when dumping each character; this is only
                done with --verbose given at least twice.

        Raises:
            AssertionError: on several integrity problems in the data, such
                as a malformed id or hex/decimal code points that disagree.
        """
        self.downloadData()
        quiet = bool(self._option("quiet", False))
        verbose = int(self._option("verbose", 0))

        xdom = xml.dom.minidom.parse(self.path)
        charList = xdom.documentElement
        assert charList.nodeName == "charlist"
        lg.info("charList child count: %d", len(charList.childNodes))

        nChars = 0
        self.charDict = {}
        for charEl in charList.childNodes:
            if (charEl.nodeName != "character"): continue
            idVal = charEl.getAttribute("id")
            dec = charEl.getAttribute("dec")
            lg.info("loading char %5s (d%06s)", idVal, dec)

            if ("-" in idVal):
                # A sequence of several characters, like "U0003C-020D2".
                if (not quiet):
                    descNode = self.getChild(charEl, "description")
                    desc = self.getText(descNode) if descNode else "???"
                    lg.info("Combination character ignored, id '%s' (%s).", idVal, desc)
                self.nCombinations += 1
                continue

            try:
                assert re.match(r"U[0-9a-f]{5,5}$", idVal, re.I)
                assert dec.isdecimal()
                dec = int(dec, 10)
                assert int(idVal[1:], 16) == dec
            except ValueError as e:
                lg.warning("ValueError (idVal '%s', dec '%s') in:\n%s\n%s",
                    idVal, dec, charEl.toprettyxml() if verbose else "", e)
                continue

            ci = CharStdInfo(dec)
            self.charDict[dec] = ci
            nChars += 1

            for propEl in charEl.childNodes:
                if (propEl.nodeType == xml.dom.Node.TEXT_NODE
                    and propEl.data.strip() == ""): continue
                assert propEl.nodeType == xml.dom.Node.ELEMENT_NODE
                prop = propEl.nodeName
                val = self.getText(propEl)
                if (prop == "entity"):
                    # Move the entity-set name to our property name
                    # (not real happy with this approach...)
                    eSet = propEl.getAttribute("set")
                    if (eSet == "mmlalias"): continue # Avoid duplicates for now: TODO
                    prop = "entity." + eSet
                    val = propEl.getAttribute("id")
                    rc = ci.addStd(prop, val)
                elif (prop == "font"):
                    nam = propEl.getAttribute("name")
                    pos = propEl.getAttribute("pos")
                    try:
                        assert int(pos) >= 0 and int(pos) < 0x1FFFF
                    except (AssertionError, ValueError):
                        # One known error in data, 'hlcry' -> pos '40)'.
                        lg.warning("font: @pos '%s' bad for name '%s':\n%s",
                            pos, nam, self.maybeXml(charEl))
                        continue
                    val = nam + " " + pos
                    rc = ci.addStd("font", val)
                elif (prop == "description"):
                    rc = ci.addStd("description", val)
                elif (prop in propNames):
                    rc = ci.addStd(prop, val)
                else:
                    if (not quiet):
                        lg.warning("Unexpected property spec '%s'.", prop)
                    continue
                if (not rc):
                    lg.info("******* Problem adding prop '%s', val '%s' in:\n%s",
                        prop, val, self.maybeXml(charEl))

            if (verbose > 1 and incl and len(incl) > 0):
                lg.info(ci.tostring(include=incl))

        lg.info("Char defs loaded: %d (%d non-Unicode combinations ignored).",
            nChars, self.nCombinations)
        assert len(self.charDict) == nChars

    def findConflicts(self):
        """Search all loaded info for cases where the same name (such as an XML
        entity or a LaTeX command name) is used for multiple different characters.
        Don't complain if the *same* name is declared in more than one set, though.

        A name starting with a backslash is compared without it, and is left
        out altogether if a non-word character follows the command name.
        Prints one line per conflicting name, and returns how many there were.
        """
        codePointsByCodename = defaultdict(list)
        for cp in sorted(self.charDict):
            for std, codename in self.charDict[cp].names.items():
                if (codename.startswith("\\")):
                    if (re.match(r"\\\w+\W", codename)): continue
                    codename = codename[1:]
                codePointsByCodename[codename].append( (std,cp) )

        # Now we have an inverted table
        conflictCount = 0
        for codename in sorted(codePointsByCodename):
            pairList = codePointsByCodename[codename]
            uniqueCodePoints = { p[1] for p in pairList }
            if (len(uniqueCodePoints) < 2): continue
            buf = ", ".join([ ("%s:U+%04x" % p) for p in pairList ])
            print("%s: %s" % (codename, buf))
            conflictCount += 1
        return conflictCount

    def getMap(self, fr, to) -> Dict:
        """Create a dict mapping codePoint -> (fr, to) for all pairs known.
        Characters with no `fr` name are skipped silently; those with a `fr`
        name but no `to` name are left out too, and their total is logged
        as one warning.
        TODO: Need to handle the multiple-entity-sets and multiple LaTeX sets cases!
        """
        newMap = {}
        targetMissing = 0
        for codePoint, charStdInfo in self.charDict.items():
            names = charStdInfo.names
            if (fr not in names):
                continue
            if (to not in names):
                targetMissing += 1
            else:
                newMap[codePoint] = (names[fr], names[to])
        if (targetMissing):
            lg.warning("%d characters in '%s' not mappable to '%s'.",
                targetMissing, fr, to)
        return newMap

    @staticmethod
    def getText(node:Node) -> str:
        """Return all the text under `node`, concatenated in document order."""
        if (node.nodeType == Node.TEXT_NODE): return node.data
        return "".join(charNameConvert.getText(ch) for ch in node.childNodes)

    @staticmethod
    def getChild(node:Node, ename:str) -> Node:
        """Return the first child of `node` named `ename`, or None."""
        for ch in node.childNodes:
            if (ch.nodeName == ename): return ch
        return None

    @staticmethod
    def maybeXml(node:Node):
        """Return `node` pretty-printed as XML, with blank lines squeezed out."""
        #if (not args.verbose): return ""
        return re.sub(r"\n\s*\n+", "\n", node.toprettyxml(), flags=re.M)
###############################################################################
#
def doChart(frCode:str, toCode:str, oformat:str="texdefs", incl:List=None) -> None:
    """Print a listing of every character that has both a `frCode` and a
    `toCode` name, in code point order.

    A LaTeX preamble and a comment naming the two codes are printed first
    (for either format), and a closing TeX comment last.

    Args:
        frCode: name of the source standard (a key as used by getMap()).
        toCode: name of the target standard; for "texdefs" it had better
            be TeX-like.
        oformat: "chart" prints each character's tostring(); "texdefs"
            prints one definition per character, with the source name and
            the description in a trailing comment.
        incl: accepted but not used yet.

    Raises:
        AssertionError: if `oformat` is unknown (and there is at least one
            character to print).
        KeyError: if the environment variable sjdUtilsDir is not set.

    Is this better sorted by codePoint or fromString?
    """
    lg.info("Starting chart, '%s' to '%s'.", frCode, toCode)
    cnc = charNameConvert(os.environ["sjdUtilsDir"] + "/CharSets/unicode.xml")
    lg.info("%d chars loaded.", len(cnc.charDict))
    cnmap:Dict = cnc.getMap(frCode, toCode)

    opener = """
\\documentclass{article}
\\usepackage[english]{babel}
\\usepackage[letterpaper,top=2cm,bottom=2cm,left=3cm,right=3cm,marginparwidth=1.75cm]{geometry}
\\usepackage{amsmath}
\\usepackage{graphicx}
\\usepackage[colorlinks=true, allcolors=blue]{hyperref}
\\title{List of LaTeX special characters}
\\author{charNameConvert.py}
"""
    print(opener)
    print("%% Definitions: frCode '%s', toCode '%s'.\n%%" % (frCode, toCode))

    # Collect all the character pairs
    for codePoint in sorted(cnmap):
        fromString, toString = cnmap[codePoint]
        charInfo = cnc.charDict[codePoint]
        if (oformat == "chart"):
            print(charInfo.tostring())
        elif (oformat == "texdefs"): # toCode better be TeX-like
            # TeX caret notation is four carets plus exactly four hex
            # digits, or six carets plus exactly six (for code points
            # beyond U+FFFF).
            if (codePoint <= 0xFFFF):
                caretCode = "^^^^%04x" % (codePoint)
            else:
                caretCode = "^^^^^^%06x" % (codePoint)
            # NOTE(review): plain TeX writes \def\name{...}, without braces
            # around the name; the existing layout is kept as-is.
            texPart = "\\def{%s}{%s} " % (toString, caretCode)
            cmtPart = "'%s' = %s" % (fromString, charInfo.names.get("description", ""))
            print("%-40s %% %s" % (texPart, cmtPart))
        else:
            assert False, "Unknown oformat '%s'." % (oformat)

    print("""
% -30-
""")
def doFindConflicts():
    """Stand-in for --findConflicts: the feature is not hooked up yet, so
    this only logs a critical message saying so.
    """
    lg.critical("findConflicts() not yet finished.")
    #findConflicts()
# Source name -> target name, for the --frCode/--toCode pair in effect.
# Built by doOneFile() the first time it is needed, then reused.
cmap = None

# Name -> number of times it was seen but not found in cmap. Reset at the
# start of each file.
notFound = defaultdict(int)


def doOneFile(path:str) -> int:
    """Read and deal with one individual file: copy it to stdout, replacing
    each backslash-plus-word-characters name (like \\phi) that has a known
    --toCode equivalent.

    Only names of that backslash form are recognized in the input.

    Args:
        path: file to read; if empty or None, stdin is read instead.

    Returns:
        The number of records (lines) written; 0 if the file could not be
        opened.

    Raises:
        KeyError: if the environment variable sjdUtilsDir is not set.
    """
    global cmap, notFound
    notFound = defaultdict(int)

    if (not path):
        # Prompt on stderr, so it cannot end up in the converted output.
        if (sys.stdin.isatty()): print("Waiting on STDIN...", file=sys.stderr)
        fh = sys.stdin
    else:
        try:
            fh = codecs.open(path, "rb", encoding=args.iencoding)
        except IOError as e:
            lg.info("Cannot open '%s':\n %s", path, e)
            return 0

    nRecs = 0
    try:
        if (cmap is None):
            # Parse the (large) table once, not once per input file.
            cnc = charNameConvert(os.environ["sjdUtilsDir"] + "/CharSets/unicode.xml")
            # getMap() is keyed by code point; fixChar() needs to look up
            # the source *name*, so re-key. Source names may carry stray
            # whitespace (the data has entries like "\textbullet "), which
            # the regex below never captures. First definition wins.
            cmap = {}
            for frName, toName in cnc.getMap(args.frCode, args.toCode).values():
                cmap.setdefault(frName.strip(), toName)

        for rec in fh:
            # Each record keeps its own line ending, so don't add another.
            sys.stdout.write(re.sub(r"(\\\w+)(?!\w)", fixChar, rec))
            nRecs += 1
    finally:
        if (fh is not sys.stdin): fh.close()

    if (len(notFound) > 0):
        lg.warning("Some characters not mapped in '%s'.", path)
        # TODO: Option to print the list
    return nRecs


def fixChar(m) -> str:
    """Replacement callback for re.sub(): return the mapped form of the name
    captured in group 1 of match `m`. If there is none, count the name in
    notFound and return it unchanged.
    """
    name = m.group(1)
    if (cmap and name in cmap):
        return cmap[name]
    notFound[name] += 1
    return name
###############################################################################
# Main
#
if __name__ == "__main__":
    import argparse

    # Valid values for --entitySets / --includeCode: every entity set name,
    # plus the group shorthands.
    esChoices = list(entitySetMap.keys())
    esChoices.extend(entitySetGroups)

    def processOptions() -> argparse.Namespace:
        """Define and parse the command-line options; expand any entity-set
        group shorthands given with --entitySets; return the namespace.
        """
        try:
            from BlockFormatter import BlockFormatter
            parser = argparse.ArgumentParser(
                description=descr, formatter_class=BlockFormatter)
        except ImportError:
            parser = argparse.ArgumentParser(description=descr)

        parser.add_argument(
            "--iencoding", type=str, metavar="E", default="utf-8",
            help="Assume this character coding for input. Default: utf-8.")
        parser.add_argument(
            "--chart", action="store_true",
            help="Output a chart in TEX, of the --frCode/--toCode equivalents.")
        parser.add_argument(
            "--compact", action="store_true",
            help="With --chart, use a one-line per codepoint format.")
        parser.add_argument(
            "--entitySets", "-e", type=str, action="append", choices=esChoices,
            help="Which entity sets to check, in order (repeatable).")
        parser.add_argument(
            "--fallback", type=str, default="xml16",
            choices=[ "literal", "xml10", "xml16", "unchanged", "slashu" ],
            help="With --toCode xml, if no xml named entity if available, use this form.")
        parser.add_argument(
            "--findConflicts", action="store_true",
            help="Search for any name defined to multiple Unicode codepoints.")
        parser.add_argument(
            "--frCode", type=str, default="literal",
            choices=[ "literal", "xml", "xml10", "xml16", "latex", "slashu" ],
            help="Input expresses special characters in this format.")
        parser.add_argument(
            "--includeCode", action="append", type=str, choices=esChoices,
            help="With --chart, include these encodings. Repeatable (ordered).")
        # No fixed default here: an unset --oencoding follows --iencoding
        # (see below), as the help text says.
        parser.add_argument(
            "--oencoding", type=str, metavar="E", default=None,
            help="Use this character coding for output. Default: iencoding.")
        parser.add_argument(
            "--oformat", "--outputFormat", "--output-format",
            type=str, choices=[ "texdefs", "chart" ], default="chart",
            help="With --chart, what output layout to generate.")
        parser.add_argument(
            "--quiet", "-q", action="store_true",
            help="Suppress most messages.")
        parser.add_argument(
            "--short", action="store_true",
            help="With --toCode slashu, use \\xFF form when possible.")
        parser.add_argument(
            "--toCode", type=str, default="latex",
            choices=[ "literal", "xml", "xml10", "xml16", "latex", "slashu" ],
            help="Output in this format.")
        parser.add_argument(
            "--unicode", action="store_const", dest="iencoding",
            const="utf8", help="Assume utf-8 for input files.")
        parser.add_argument(
            "--verbose", "-v", action="count", default=0,
            help="Add more messages (repeatable).")
        parser.add_argument(
            "--version", action="version", version=__version__,
            help="Display version information, then exit.")

        parser.add_argument(
            "files", type=str, nargs=argparse.REMAINDER,
            help="Path(s) to input file(s)")

        args0 = parser.parse_args()

        # Support shorthand for groups, like all 8879, etc.
        if (args0.entitySets):
            args0.entitySets = expandEntitySets(args0.entitySets)
        return(args0)

    ###########################################################################
    #
    args = processOptions()
    if (args.iencoding and not args.oencoding):
        args.oencoding = args.iencoding
    if (args.oencoding):
        # https://stackoverflow.com/questions/4374455/
        # sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
        sys.stdout.reconfigure(encoding=args.oencoding)

    if (args.chart):
        doChart(args.frCode, args.toCode, oformat=args.oformat, incl=args.includeCode)
        sys.exit()

    if (args.findConflicts):
        doFindConflicts()
        sys.exit()

    if (len(args.files) == 0):
        lg.info("charNameConvert.py: No files specified....")
        doOneFile(None)
    else:
        for path0 in args.files:
            doOneFile(path0)

    if (not args.quiet):
        # Report the number of file arguments (0 when stdin was read).
        lg.info("charNameConvert.py: Done, %d files.\n", len(args.files))