-
Notifications
You must be signed in to change notification settings - Fork 4
/
utf8ienc.dtx
2292 lines (2276 loc) · 88.4 KB
/
utf8ienc.dtx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
% \iffalse meta-comment
%
% Copyright (C) 1993-2021
% The LaTeX Project and any individual authors listed elsewhere
% in this file.
%
% This file is part of the LaTeX base system.
% -------------------------------------------
%
% It may be distributed and/or modified under the
% conditions of the LaTeX Project Public License, either version 1.3c
% of this license or (at your option) any later version.
% The latest version of this license is in
% https://www.latex-project.org/lppl.txt
% and version 1.3c or later is part of all distributions of LaTeX
% version 2008 or later.
%
% This file has the LPPL maintenance status "maintained".
%
% The list of all files belonging to the LaTeX base distribution is
% given in the file `manifest.txt'. See also `legal.txt' for additional
% information.
%
% The list of derived (unpacked) files belonging to the distribution
% and covered by LPPL is defined by the unpacking scripts (with
% extension .ins) which are part of the distribution.
%
% \fi
%
% \iffalse
%<*driver>
% Documentation driver: typesets this file with the ltxdoc class.
\documentclass{ltxdoc}
\usepackage[latin1,utf8]{inputenc}% force utf8 to be re-loaded, to get version info
% Fetch the version and date recorded for utf8.def; the date line below prints them.
\GetFileInfo{utf8.def}
\title{Providing some UTF-8 support via \texttt{inputenc}}
\date{\fileversion\space\filedate{} printed \today}
\author{%
David Carlisle \and Frank Mittelbach \and
Chris Rowley\thanks{Borrowing heavily from tables by Sebastian Rahtz; some table
and code cleanup by Javier Bezos}}
\begin{document}
\MaintainedByLaTeXTeam{latex}
\maketitle
\tableofcontents
% Read this same file again, this time as documentation.
\DocInput{utf8ienc.dtx}
\end{document}
%</driver>
% \fi
%
% \newpage
%
% \section{Introduction}
%
%
% \subsection{Background and general stuff}
%
% For many reasons what this package provides is a long way from any
% type of `Unicode compliance'.
%
% In stark contrast to 8-bit character sets, with 16 or more bits it can
% easily be very inefficient to support the full range.\footnote{In
% fact, \LaTeX's current 8-bit support does not go so far as to make
% all 8-bit characters into valid input.} Moreover, useful support of
% character input by a typesetting system overwhelmingly means finding
% an acceptable visual representation of a sequence of characters and
% this, for \LaTeX{}, means having available a suitably encoded 8-bit
% font.
%
% Unfortunately it is not possible to predict exactly what valid UTF-8
% octet sequences will appear in a particular file so it is best to
% make all the unsupported but valid sequences produce a reasonably
% clear and noticeable error message.
%
% There are two directions from which to approach the question of what
% to load. One is to specify the ranges of Unicode characters that will
% result in some sensible typesetting; this requires the provider to
% ensure that suitable fonts are loaded and that these input characters
% generate the correct typesetting via the encodings of those fonts. The
% other is to inspect the font encodings to be used and use these to
% define which input Unicode characters should be supported.
%
% For Western European languages, at least, going in either direction
% leads to many straightforward decisions and a few that are more
% subjective. In both cases some of the specifications are \TeX{}
% specific whilst most are independent of the particular typesetting
% software in use.
%
% As we have argued elsewhere, \LaTeX{} needs to refer to characters via
% `seven-bit-text' names and, so far, these have been chosen by
% reference to historical sources such as Plain \TeX{} or Adobe encoding
% descriptions. It is unclear whether this ad hoc naming structure should
% simply be extended or whether it would be useful to
% supplement it with standardised internal Unicode character names such as
% one or more of the following:\footnote{Burkhard und Holger Mittelbach
% spielen mit mir! Sie haben etwas hier geschrieben.}
%
% \begin{verbatim}
% \ltxutwochar <4 hex digits>
%
% \ltxuchar {<hex digits>}
% B H U R R R
%
% \ltxueightchartwo <2 utf8 octets as 8-bit char tokens>
% \ltxueightcharthree <3 utf8 octets ...>
% \ltxueightcharfour <4 utf8 octets ...>
% \end{verbatim}
%
%
% \subsection{More specific stuff}
%
% In addition to setting up the mechanism for reading UTF-8 characters
% and specifying the \LaTeX-level support available, this package
% contains support for some default historically expected \TeX-related
% characters and some example `Unicode definition files' for standard
% font encodings.
%
%
% \subsection{Notes}
%
% This package does not support Unicode combining characters as \TeX{}
% is not really equipped to make this possible.
%
% No attempt is made to be useful beyond Latin, and maybe Cyrillic,
% for European languages (as of now).
%
%
% \subsection{Basic operation of the code}
%
% The \texttt{inputenc} package makes the upper 8-bit characters active and
% assigns to all of them an error message. It then waits for the
% input encoding files to change this set-up. Similarly, whenever
% |\inputencoding| is encountered in a document, first the upper
% 8-bit characters are set back to produce an error and then the
% definitions for the new input encoding are loaded, changing some of the
% previous settings.
%
% The 8-bit input encodings currently supported by \texttt{inputenc}
% all use declarations such as |\DeclareInputText| and the like to map an
% 8-bit number to some \LaTeX{} internal form, e.g.~to |\"a|.
%
% The situation when supporting UTF-8 as the input encoding is
% different, however. Here we only have to set up the actions of
% those 8-bit numbers that can be the first octet in a UTF-8
% representation of a Unicode character. But we cannot simply set
% this to some internal \LaTeX{} form since the Unicode character
% consists of more than one octet; instead we have to define this
% starting octet to parse the right number of further octets that
% together form the UTF-8 representation of some Unicode character.
%
% Therefore when switching to \texttt{utf8} within the
% \texttt{inputenc} framework the characters with numbers (hex)
% from \texttt{"C2} to \texttt{"DF} are defined to parse for a
% second octet following, the characters from \texttt{"E0} to
% \texttt{"EF} are defined to parse for two more octets and finally
% the characters from \texttt{"F0} to \texttt{"F4} are defined to
% parse for three additional octets. These additional octets are
% always in the range \texttt{"80} to \texttt{"BF}.
%
% Thus, when such a character is encountered in the document (so
% long as expansion is not prohibited) a defined number of
% additional octets (8-bit characters) are read and from them a
% unique control sequence name is immediately constructed.
%
% This control sequence is either defined (good) or undefined
% (likely); in the latter case the user gets an error message
% saying that this UTF-8 sequence (or, better, Unicode character)
% is not supported.
%
% If the control sequence is set up to do something useful then it will
% expand to a \LaTeX{} internal form: e.g.~for the utf8 sequence of
% two octets \texttt{"C3 "A4} we get |\"a| as the
% internal form which then, depending on the font encoding,
% eventually resolves to the single glyph `latin-a-umlaut' or to
% the composite glyph `latin-a with an umlaut accent'.
%
% These mappings from (UTF-8 encoded) Unicode characters to \LaTeX{}
% internal forms are made indirectly. The code below provides a
% declaration |\DeclareUnicodeCharacter| which maps Unicode numbers
% (as hexadecimal) to \LaTeX{} internal forms.
%
% This mapping needs to be set up only once so it is done at
% |\begin{document}| by looking at the list of font encodings that
% are loaded by the document and providing mappings related to
% those font encodings whenever these are available. Thus at most
% only those Unicode characters that can be represented by the glyphs
% available in these encodings will be defined.
%
% Technically this is done by loading one file per encoding,
% if available, that is supposed to provide the necessary mapping
% information.
%
%
% \StopEventually{}
%
%
%
%
% \section{Coding}
%
% \subsection{Housekeeping}
%
% The usual introductory bits and pieces:
%
% \begin{macrocode}
% Each guarded line below is written out only when the docstrip option
% named in its guard is selected, so a generated file gets the
% \ProvidesFile line carrying its own name.  The last line supplies the
% bracketed date/version text that follows it; it is written unless the
% option utf8-2018 is selected.
%<utf8>\ProvidesFile{utf8.def}
%<test>\ProvidesFile{utf8-test.tex}
%<+lcy> \ProvidesFile{lcyenc.dfu}
%<+ly1> \ProvidesFile{ly1enc.dfu}
%<+oms> \ProvidesFile{omsenc.dfu}
%<+ot1> \ProvidesFile{ot1enc.dfu}
%<+ot2> \ProvidesFile{ot2enc.dfu}
%<+t1> \ProvidesFile{t1enc.dfu}
%<+t2a> \ProvidesFile{t2aenc.dfu}
%<+t2b> \ProvidesFile{t2benc.dfu}
%<+t2c> \ProvidesFile{t2cenc.dfu}
%<+ts1> \ProvidesFile{ts1enc.dfu}
%<+x2> \ProvidesFile{x2enc.dfu}
%<+all> \ProvidesFile{utf8enc.dfu}
%<-utf8-2018> [2021/06/21 v1.2n UTF-8 support]
% \end{macrocode}
%
% \begin{macrocode}
%<*utf8>
% \end{macrocode}
% This is a temporary fix for the e-p\TeX{} / e-up\TeX{} engines that do not yet
% have a |\ifincsname| primitive. Once this is available the extra file will
% be dropped.
% \changes{v1.2h}{2019/07/09}{Temp rollback fix for e-pTeX}
% \begin{macrocode}
% Engine check.  If \ifincsname is unknown, utf8-2018.def is input and
% \@firstofone keeps the \endinput that follows the conditional, so the
% remainder of this file is not read.  Otherwise \@gobble removes that
% \endinput and reading continues below.
\ifx\ifincsname\@undefined % old e-pTeX or e-upTeX engines
\input utf8-2018.def
\expandafter\@firstofone
\else
\expandafter\@gobble
\fi
\endinput
% \end{macrocode}
%
% \begin{macrocode}
% Internal macro names below contain @, so make it a letter.
\makeatletter
% \end{macrocode}
% We restore the |\catcode| of space (which is set to ignore in
% \texttt{inputenc}) while reading \texttt{.def} files. Otherwise
% we would need to explicitly use |\space| all over the place in
% error and log messages. The value assigned is whatever
% |\saved@space@catcode| holds; that macro is not defined in this file.
% \changes{v1.1d}{2004/05/08}{Explicitly set catcode of space}
% \begin{macrocode}
\catcode`\ \saved@space@catcode
% \end{macrocode}
%
%
%
% \subsection{Parsing UTF-8 input}
%
% A UTF-8 char (that is not actually a 7-bit char, i.e.~a single
% octet) is parsed as follows: each starting octet is an active
% \TeX{} character token; each of these is defined below to be a
% macro with one to three arguments nominally (depending on the
% starting octet). It calls one of |\UTFviii@two@octets|,
% |\UTFviii@three@octets|, or |\UTFviii@four@octets| which then
% actually picks up the remaining octets as the argument(s).
%
% \begin{itemize}
% \item When typesetting we pick up the necessary number of additional
% octets, check if they form a command that \LaTeX{} knows about
% (via \cs{csname} \texttt{u8:}\cs{string}
% \verb=#1=\cs{string} \verb=#2...=\cs{endcsname}) and if so use that
% for typesetting. \cs{string} is needed as the octets may (all?) be
% active and we want the literal values in the name.
%
% \item If the UTF-8 character is going to be part of a label, then it is
% essentially becoming part of some csname and with the
% test \cs{ifincsname} we can find this out. If so, we render the whole
% sequence of octets harmless by using \cs{string} too when the
% starting octet executes (\cs{UTF@...@octets@string}).
%
% \item Another possible case is that \cs{protect} does \emph{not} have the
% meaning of \cs{@typeset@protect}. In that case we may do a \cs{write} or we
% may do a \cs{protected@edef} or \ldots{} In all such cases we want to keep the
% sequence of octets unchanged, but we can't use \cs{string} this time, since at
% least in the case of \cs{protected@edef} the result may later be
% typeset after all (in fact that is quite likely) and so at that
% point the starting octet needs to be an active character again
% (the others could be stringified). So for this case we use \cs{noexpand}
% (\cs{UTF@...@octets@noexpand}).
% \end{itemize}
%
% \begin{macro}{\UTFviii@two@octets}
% \changes{v1.2a}{2018/03/24}{Macros made `\cs{long} for improved error messages}%
% Putting that all together the code for a start octet of a two
% byte sequence would then look like this:
% \begin{macrocode}
% Dispatcher run by a two-octet lead byte.  Depending on context it
% places at most one helper in front of \UTFviii@two@octets@combine:
%   - inside \csname (\ifincsname true): \UTF@two@octets@string;
%   - else, when \protect is not \@typeset@protect:
%     \UTF@two@octets@noexpand;
%   - else nothing, so \UTFviii@two@octets@combine itself runs.
% The \expandafter chains remove the pending \else/\fi tokens before the
% helper reads its arguments.
\long\def\UTFviii@two@octets{%
\ifincsname
\expandafter \UTF@two@octets@string
\else
\ifx \protect\@typeset@protect \else
\expandafter\expandafter\expandafter \UTF@two@octets@noexpand
\fi
\fi
\UTFviii@two@octets@combine
}
% \end{macrocode}
% \cs{ifincsname} is tested first because that can be true even if we
% are otherwise doing typesetting. If this is the case we use
% \cs{string} on the whole octet
% sequence. \cs{UTF@two@octets@string} not only does this but also
% gets rid of the command \cs{UTFviii@two@octets@combine} in the input
% stream by picking it up as a first argument and dropping it.
%
% If this is not the case and we are doing typesetting (i.e.,
% \cs{protect} is \cs{@typeset@protect}), then we execute
% \cs{UTFviii@two@octets@combine} which picks up all octets and typesets
% the character (or generates an error if it doesn't know how to
% typeset it).
%
% However, if we are not doing typesetting, then we execute the
% command \cs{UTF@two@octets@noexpand} which works like
% \cs{UTF@two@octets@string} but uses \cs{noexpand} instead of
% \cs{string}. This way the sequence is temporarily rendered harmless,
% e.g., would display as is or stays put inside a
% \cs{protected@edef}. But if the result is later reused the
% starting octet is still active and so will be able to construct
% the UTF-8 character again.
% \end{macro}
%
%
% \begin{macro}{\UTFviii@three@octets}
% \changes{v1.2a}{2018/03/24}{Macros made `\cs{long} for improved error messages}%
% \begin{macro}{\UTFviii@four@octets}
% \changes{v1.2a}{2018/03/24}{Macros made `\cs{long} for improved
% error messages}
% The definitions for the other starting octets
% are the same except that they pick up more octets after them.
% \begin{macrocode}
% Dispatcher run by a three-octet lead byte; same three-way choice as
% the two-octet version: \UTF@three@octets@string inside \csname,
% \UTF@three@octets@noexpand when \protect is not \@typeset@protect,
% and otherwise \UTFviii@three@octets@combine directly.
\long\def\UTFviii@three@octets{%
\ifincsname
\expandafter \UTF@three@octets@string
\else
\ifx \protect\@typeset@protect \else
\expandafter\expandafter\expandafter \UTF@three@octets@noexpand
\fi
\fi
\UTFviii@three@octets@combine
}
% \end{macrocode}
% \begin{macrocode}
% Dispatcher run by a four-octet lead byte: \UTF@four@octets@string
% inside \csname, \UTF@four@octets@noexpand when \protect is not
% \@typeset@protect, and otherwise \UTFviii@four@octets@combine directly.
\long\def\UTFviii@four@octets{%
\ifincsname
\expandafter \UTF@four@octets@string
\else
\ifx \protect\@typeset@protect \else
\expandafter\expandafter\expandafter \UTF@four@octets@noexpand
\fi
\fi
\UTFviii@four@octets@combine
}
% \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\UTF@two@octets@noexpand}
% \begin{macro}{\UTF@three@octets@noexpand}
% \begin{macro}{\UTF@four@octets@noexpand}
% These temporarily prevent the active chars from expanding.
% \begin{macrocode}
% In each helper #1 is the ...@combine token left behind by the
% dispatcher and is dropped; #2 is the lead octet and #3, #4 (where
% present) are the following octets.  The final \noexpand has no operand
% in the replacement text: it acts on the next token in the input, i.e.
% the last octet of the sequence.
\long\def\UTF@two@octets@noexpand#1#2{\noexpand#2\noexpand}
\long\def\UTF@three@octets@noexpand#1#2#3{\noexpand#2\noexpand#3\noexpand}
\long\def\UTF@four@octets@noexpand#1#2#3#4{\noexpand#2\noexpand#3\noexpand#4\noexpand}
% \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \begin{macro}{\UTF@two@octets@string}
% \begin{macro}{\UTF@three@octets@string}
% \begin{macro}{\UTF@four@octets@string}
% And the same with \cs{string} for use in \cs{csname} constructions.
% \begin{macrocode}
% Same argument layout as the noexpand helpers: #1 (the ...@combine
% token) is dropped, #2 onwards are octets, and the final \string acts on
% the last octet, which is still waiting in the input.
\long\def\UTF@two@octets@string#1#2{\string#2\string}
\long\def\UTF@three@octets@string#1#2#3{\string#2\string#3\string}
\long\def\UTF@four@octets@string#1#2#3#4{\string#2\string#3\string#4\string}
% \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \begin{macro}{\UTFviii@two@octets@combine}
% \begin{macro}{\UTFviii@three@octets@combine}
% \begin{macro}{\UTFviii@four@octets@combine}
% From the arguments a control sequence with a name of the form
% \verb=u8:#1#2...= is constructed where the |#i| ($i>1$) are the
% arguments and |#1| is the starting octet (as a \TeX{} active character
% token). Since some or even all of these characters are active
% we need to use |\string| when building
% the \cs{csname}.
%
% The \cs{csname} thus constructed can of course be undefined but to
% avoid producing an unhelpful low-level undefined command error we
% pass it to |\UTFviii@defined| which is responsible for producing
% a more sensible error message (not yet done!!). If, however, it is
% defined we simply execute the thing (which should then expand to
% an encoding specific internal \LaTeX{} form).
% \begin{macrocode}
% The \expandafter forms the \csname before \UTFviii@defined reads it, so
% \UTFviii@defined receives a single control sequence token as argument.
\long\def\UTFviii@two@octets@combine#1#2{\expandafter
\UTFviii@defined\csname u8:\string#1\string#2\endcsname}
% \end{macrocode}
% The three-octet variant builds the name from three stringified octets.
% \begin{macrocode}
\long\def\UTFviii@three@octets@combine#1#2#3{\expandafter
\UTFviii@defined\csname u8:\string#1\string#2\string#3\endcsname}
% \end{macrocode}
% The four-octet variant builds the name from four stringified octets.
% \begin{macrocode}
\long\def\UTFviii@four@octets@combine#1#2#3#4{\expandafter
\UTFviii@defined\csname u8:\string#1\string#2\string#3\string#4\endcsname}
% \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
%
%
% \begin{macro}{\UTFviii@defined}
% This tests whether its argument is different from |\relax|: if it
% is not, it raises an error (either `not set up for use with LaTeX'
% or `invalid UTF-8 byte sequence'); otherwise it gets
% the |\fi| out of the way (in case the command has arguments) and
% executes it.
% \begin{macrocode}
% #1 is the control sequence built by one of the ...@combine macros.
\def\UTFviii@defined#1{%
\ifx#1\relax
% \end{macrocode}
% Test if the sequence is invalid UTF-8 or valid UTF-8 but without
% a \LaTeX\ definition. |\UTFviii@checkseq| leaves nothing behind for a
% well-formed sequence, in which case |\if| compares the two |\relax|
% tokens and is true; for a malformed sequence it leaves a |1|, which
% makes the test false.
% \begin{macrocode}
\if\relax\expandafter\UTFviii@checkseq\string#1\relax\relax
% \end{macrocode}
% The endline character has a special definition within the
% inputenc package (it is gobbling spaces). For this reason we
% can't produce multiline strings without some precaution.
% \changes{v1.1b}{2004/02/09}{No newlines allowed in error messages}
% \changes{v1.1g}{2005/09/27}{Real spaces do not show up so use \cs{space}}
% \changes{v1.1o}{2015/08/28}{Show Unicode number of character in hex}
% \changes{v1.2a}{2018/03/24}{Error message improved for non-UTF-8 sequences}%
% \changes{v1.2m}{2021/06/08}{Normalize various error messages as kernel errors}
% \begin{macrocode}
\UTFviii@undefined@err{#1}%
% \end{macrocode}
% Otherwise the octets do not form valid UTF-8. The message shows the
% stringified name with its first four characters removed by
% |\@gobblefour| (normally the escape character followed by |u8:|), so
% that only the octets remain.
% \begin{macrocode}
\else
\@latex@error{Invalid UTF-8 byte sequence (\expandafter
\@gobblefour\string#1)}%
\UTFviii@invalid@help
\fi
% \end{macrocode}
% The character is defined: the |\expandafter| removes the |\fi| before
% the command is executed.
% \begin{macrocode}
\else\expandafter
#1%
\fi
}
% \end{macrocode}
% \end{macro}
%
% \begin{macro}{\UTFviii@invalid@err}
% \begin{macro}{\UTFviii@invalid@help}
% \changes{v1.2a}{2018/03/24}{Macro added}%
% \changes{v1.2f}{2018/10/05}{Show invalid byte in hex}%
% \begin{macrocode}
% #1 is the offending byte as a single character token; its character
% code is passed to \UTFviii@hexnumber (defined elsewhere) for display,
% and \UTFviii@invalid@help supplies the help text.
\def\UTFviii@invalid@err#1{%
\@latex@error{Invalid UTF-8 byte "\UTFviii@hexnumber{`#1}}%
\UTFviii@invalid@help}
% \end{macrocode}
%
% \begin{macrocode}
% Help text shared by the `invalid byte' and `invalid byte sequence'
% errors.  The \noexpand tokens keep the command names from being
% expanded when the message is built.
\def\UTFviii@invalid@help{%
The document does not appear to be in UTF-8 encoding.\MessageBreak
Try adding \noexpand\UseRawInputEncoding as the first line of the file\MessageBreak
or specify an encoding such as \noexpand\usepackage[latin1]{inputenc}\MessageBreak
in the document preamble.\MessageBreak
Alternatively, save the file in UTF-8 using your editor or another tool}
% \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\UTFviii@undefined@err}
% \changes{v1.2a}{2018/03/24}{Macro added}%
% \begin{macrocode}
% Error for a character that has no LaTeX definition.  #1 is stringified
% and handed to \UTFviii@splitcsname (defined elsewhere), with \relax as
% end marker, to produce the text shown for the character.
\def\UTFviii@undefined@err#1{%
\@latex@error{Unicode character \expandafter
\UTFviii@splitcsname\string#1\relax
\MessageBreak
not set up for use with LaTeX}%
{You may provide a definition with\MessageBreak
\noexpand\DeclareUnicodeCharacter}%
}
% \end{macrocode}
% \end{macro}
%
%
% \begin{macro}{\UTFviii@checkseq}
% \begin{macro}{\UTFviii@check@continue}
% \changes{v1.2a}{2018/03/24}{Macro added}%
% Check that the csname consists of a valid UTF-8 sequence.
% \begin{macrocode}
% Works on a stringified name: #1 takes everything up to the colon, #2 is
% the first octet and #3 the token after it.  A `1' is left in the input
% when the sequence is malformed, nothing otherwise:
%   - #2 below "80: valid only if #3 is already the \relax end marker;
%   - #2 from "80 to "BF: a continuation byte cannot start a sequence;
%   - otherwise #3 and what follows are checked by \UTFviii@check@continue.
\def\UTFviii@checkseq#1:#2#3{%
\ifnum`#2<"80 %
\ifx\relax#3\else1\fi
\else
\ifnum`#2<"C0 %
1 %
\else
\expandafter\expandafter\expandafter\UTFviii@check@continue
\expandafter\expandafter\expandafter#3%
\fi
\fi}
% \end{macrocode}
%
% \begin{macrocode}
% Reads one token at a time until \relax is found.  Each token whose
% character code lies outside "80 to "BF leaves a `1' in the input; the
% \expandafter removes the \fi before the macro calls itself again.
\def\UTFviii@check@continue#1{%
\ifx\relax#1%
\else
\ifnum`#1<"80 1\else\ifnum`#1>"BF 1\fi\fi
\expandafter\UTFviii@check@continue
\fi
}
% \end{macrocode}
% \end{macro}
% \end{macro}
%
%
% \begin{macro}{\UTFviii@loop}
% This bit of code derived from \texttt{xmltex} defines the active
% character corresponding to starting octets to call |\UTFviii@two@octets|
% etc as appropriate.
% The starting octet itself is passed directly as the first
% argument, the others are picked up later en route.
%
% The |\UTFviii@loop| loops through the numbers starting at
% |\count@| and ending at |\@tempcnta|${} - 1$, each time executing
% the code in |\UTFviii@tmp|.
%
% All this is done in a group so that temporary catcode changes
% etc.~vanish after everything is set up.
%
% \begin{macrocode}
% Inside this group ~ is active and " has catcode 12 (so it can
% introduce hexadecimal numbers).
\begingroup
\catcode`\~13
\catcode`\"12
% \end{macrocode}
% Each pass of the loop sets the |\uccode| of |~| to the current value of
% |\count@| and then runs the contents of |\UTFviii@tmp| under
% |\uppercase|, so that every |~| in it becomes the active character with
% that code. The loop stops once |\count@| reaches |\@tempcnta|.
% \begin{macrocode}
\def\UTFviii@loop{%
\uccode`\~\count@
\uppercase\expandafter{\UTFviii@tmp}%
\advance\count@\@ne
\ifnum\count@<\@tempcnta
\expandafter\UTFviii@loop
\fi}
% \end{macrocode}
%
% Handle the single byte control characters.
% \changes{v1.2a}{2018/03/24}{Loop over C0 controls added}%
% C0 controls are valid UTF-8 but are defined to give the ``Character not
% defined'' error. They may be defined with |\DeclareUnicodeCharacter|.
% \begin{macrocode}
% Each active control character is globally defined to call
% \UTFviii@undefined@err with a colon followed by the stringified
% character.  Three runs of the loop cover the codes 1-8, 11 and 14-31;
% the upper limit in \@tempcnta is exclusive.
\def\UTFviii@tmp{\xdef~{\noexpand\UTFviii@undefined@err{:\string~}}}
% 0 ^^@ null (not covered: the first run starts at 1)
\count@"1
\@tempcnta9
% 9 ^^I tab (skipped)
% 10 ^^J nl (skipped)
\UTFviii@loop
% 11 only
\count@11
\@tempcnta12
\UTFviii@loop
% 12 ^^L (skipped)
% 13 ^^M (skipped)
\count@14
\@tempcnta32
\UTFviii@loop
% \end{macrocode}
%
%
% Bytes with leading bits |10| are not valid UTF-8 starting bytes
% \begin{macrocode}
% This run covers "80 up to and including "C1: each of these active
% bytes raises \UTFviii@invalid@err on itself.
\count@"80
\@tempcnta"C2
\def\UTFviii@tmp{\xdef~{\noexpand\UTFviii@invalid@err\string~}}
\UTFviii@loop
% \end{macrocode}
%
% Setting up 2-byte UTF-8 (lead bytes \texttt{"C2} to \texttt{"DF}): The
% starting byte is passed as an
% active character so that it can be reprocessed later!
% \begin{macrocode}
\count@"C2
\@tempcnta"E0
\def\UTFviii@tmp{\xdef~{\noexpand\UTFviii@two@octets\noexpand~}}
\UTFviii@loop
% \end{macrocode}
%
% Setting up 3-byte UTF-8 (lead bytes \texttt{"E0} to \texttt{"EF}):
% \begin{macrocode}
\count@"E0
\@tempcnta"F0
\def\UTFviii@tmp{\xdef~{\noexpand\UTFviii@three@octets\noexpand~}}
\UTFviii@loop
% \end{macrocode}
%
% Setting up 4-byte UTF-8 (lead bytes \texttt{"F0} to \texttt{"F4}):
% \changes{v1.2e}{2018/09/28}{Fix "F4 lead byte}%
% \begin{macrocode}
\count@"F0
\@tempcnta"F5
\def\UTFviii@tmp{\xdef~{\noexpand\UTFviii@four@octets\noexpand~}}
\UTFviii@loop
% \end{macrocode}
%
% Bytes above F4 are not valid UTF-8 starting bytes as they would encode
% numbers beyond the Unicode range.
% \begin{macrocode}
\count@"F5
\@tempcnta"100
\def\UTFviii@tmp{\xdef~{\noexpand\UTFviii@invalid@err\string~}}
\UTFviii@loop
% \end{macrocode}
% The group is closed again; the active character definitions were made
% with |\xdef| and therefore remain in force.
% \begin{macrocode}
\endgroup
% \end{macrocode}
%
% \end{macro}
%
% For this case we must disable the warning generated by
% \texttt{inputenc} if it doesn't see any new |\DeclareInputText|
% commands.
% \begin{macrocode}
\@inpenc@test
% \end{macrocode}
%
%
% If this file (\texttt{utf8.def}) is not being read while setting
% up \texttt{inputenc}, i.e.~in the preamble, but when
% |\inputencoding| is called somewhere within the document, we do not
% need to input the specific Unicode mappings again. We therefore
% stop reading the file at this point. The test used is whether
% |\@begindocumenthook| is undefined; in that case |\makeatother| is
% issued before reading stops.
% \begin{macrocode}
\ifx\@begindocumenthook\@undefined
\makeatother
% \end{macrocode}
% The |\fi| must be on the same line as |\endinput| or else it will
% never be seen!
% \begin{macrocode}
\endinput \fi
% \end{macrocode}
%
%
% \subsection{Mapping Unicode codes to \LaTeX{} internal forms}
%
%
% \begin{macro}{\DeclareUnicodeCharacter}
% The |\DeclareUnicodeCharacter| declaration defines a mapping from
% a Unicode character code point to a \LaTeX{} internal form. The first
% argument is the Unicode number as hexadecimal digits and the second is
% the actual \LaTeX{} internal form.
%
% We start by making sure that some characters have the right
% |\catcode| when they are used in the definitions below.
% \begin{macrocode}
% Open a group and fix the category codes of the characters that the
% definitions below use literally: the characters " < . , ; ! get
% catcode 12 and ~ gets catcode 13 (active).  The definitions that follow
% are made with \gdef.
\begingroup
\catcode`\"=12
\catcode`\<=12
\catcode`\.=12
\catcode`\,=12
\catcode`\;=12
\catcode`\!=12
\catcode`\~=13
% \end{macrocode}
%
% \begin{macrocode}
% #1: code point as hexadecimal digits (read with a leading " into \count@)
% #2: the LaTeX internal form to associate with that code point
\gdef\DeclareUnicodeCharacter#1#2{%
\count@"#1\relax
\wlog{ \space\space defining Unicode char U+#1 (decimal \the\count@)}%
\begingroup
% \end{macrocode}
% Next we do the parsing of the number stored in |\count@| and assign the
% result to |\UTFviii@tmp|. Actually all this could be done in-line,
% the macro |\parse@XML@charref| is only there to extend this code
% to parsing Unicode numbers in other contexts one day (perhaps).
% \begin{macrocode}
\parse@XML@charref
% \end{macrocode}
%
% Here is an example of what is happening, for the pair \texttt{"C2 "A3}
% (which is the utf8 representation for the character \textsterling{}).
% After |\parse@XML@charref| we have, stored in |\UTFviii@tmp|, a
% single command with two character tokens as arguments:
% \begin{quote}
% [$t_{C2}$ and $t_{A3}$ are the characters corresponding to these
% two octets]\\
% |\UTFviii@two@octets| $t_{\rm C2}t_{\rm A3}$
% \end{quote}
% what we actually need to produce is a definition of the form
% \begin{quote}
% |\def\u8:|$t_{\rm C2}$$t_{\rm A3}$ |{|\textit{\LaTeX{} internal form}|}|\,.
% \end{quote}
% So here we temporarily redefine the prefix commands
% |\UTFviii@two@octets|, etc.~to
% generate the csname that we wish to define; the |\string|s are
% added in case these tokens are still active.
% \begin{macrocode}
\def\UTFviii@two@octets##1##2{\csname u8:##1\string##2\endcsname}%
\def\UTFviii@three@octets##1##2##3{\csname u8:##1%
\string##2\string##3\endcsname}%
\def\UTFviii@four@octets##1##2##3##4{\csname u8:##1%
\string##2\string##3\string##4\endcsname}%
% \end{macrocode}
% Now we simply:-) need to use the right number of |\expandafter|s to
% finally construct the definition: expanding |\UTFviii@tmp| once to get
% its contents, a second time to replace the prefix command by its
% |\csname| expansion, and a third time to turn the expansion into
% a csname after which the |\gdef| finally gets applied.
% We add an irrelevant |\IeC| and braces around the definition, in
% order to avoid any space after the command being gobbled up
% when the text is written out to an auxiliary file (see
% \texttt{inputenc} for further details).
% \begin{macrocode}
\expandafter\expandafter\expandafter
\expandafter\expandafter\expandafter
\expandafter
\gdef\UTFviii@tmp{\IeC{#2}}%
\endgroup
}
% \end{macrocode}
% \end{macro}
%
%
% \begin{macro}{\parse@XML@charref}
% This macro parses a Unicode number (decimal) and returns its
% UTF-8 representation as a sequence of non-active \TeX{} character
% tokens. In the
% original code it had two arguments delimited by \texttt{;}; here,
% however, we supply the Unicode number implicitly.
% \begin{macrocode}
% Takes no arguments: the code point is read from \count@ and the result
% is stored in \UTFviii@tmp.
\gdef\parse@XML@charref{%
% \end{macrocode}
% We need to keep a few things local, mainly the |\uccode|'s that
% are set up below. However, the group originally used here is
% actually unnecessary since we call this macro only within another
% group; but it will be important to restore the group if this
% macro gets used for other purposes.
% \begin{macrocode}
% \begingroup
% \end{macrocode}
% The original code from \texttt{xmltex} supported the convention that a
% Unicode slot number could be given either as a decimal or as a
% hexadecimal (by starting with \texttt{x}). We do not do this so
% this code is also removed. This could be reactivated if one
% wants to support document commands that accept Unicode numbers
% (but then the first case needs to be changed from an error
% message back to something more useful again).
% \begin{macrocode}
% \uppercase{\count@\if x\noexpand#1"\else#1\fi#2}\relax
% \end{macrocode}
% As |\count@| already contains the right value we make
% |\parse@XML@charref| work without arguments.
% \changes{v1.1g}{2005/09/27}{Real spaces do not show up so use \cs{space}}
% \changes{v1.2a}{2018/03/24}{Allow control characters if active}
% In the case of single byte UTF-8 sequences, only allow definition if
% the character is already active. The definition of |\UTFviii@tmp|
% looks slightly strange but is designed for the sequence of |\expandafter|
% in |\DeclareUnicodeCharacter|.
%
% NOTE(review): the test below is against \texttt{"A0}, so values from
% \texttt{"80} to \texttt{"9F} also take this single-character branch
% although their UTF-8 form has two octets --- confirm that this is
% intended.
%
% \begin{macrocode}
\ifnum\count@<"A0\relax
\ifnum\catcode\count@=13
\uccode`\~=\count@\uppercase{\def\UTFviii@tmp{\@empty\@empty~}}%
\else
\@latex@error{Cannot define non-active Unicode char value < 00A0}%
\@eha
\def\UTFviii@tmp{\UTFviii@tmp}%
\fi
% \end{macrocode}
% The code below is derived from \texttt{xmltex} and generates the UTF-8 byte sequence
% for the number in |\count@|. Each call of |\parse@UTFviii@a| produces one
% continuation octet (last octet first) and |\parse@UTFviii@b| produces
% the lead octet and assembles the result.
%
% The reverse operation (just used in error messages)
% has now been added as \cs{decode@UTFviii}.
% \begin{macrocode}
\else\ifnum\count@<"800\relax
\parse@UTFviii@a,%
\parse@UTFviii@b C\UTFviii@two@octets.,%
\else\ifnum\count@<"10000\relax
\parse@UTFviii@a;%
\parse@UTFviii@a,%
\parse@UTFviii@b E\UTFviii@three@octets.{,;}%
\else
% \end{macrocode}
%
% Test added here for out of range values, the 4-octet definitions are still set up
% so that |\DeclareUnicodeCharacter| does something sensible if the user scrolls
% past this error.
% \begin{macrocode}
\ifnum\count@>"10FFFF\relax
\@latex@error
{\UTFviii@hexnumber\count@\space too large for Unicode}%
{Values between 0 and 10FFFF are permitted}%
\fi
% \end{macrocode}
%
% \begin{macrocode}
\parse@UTFviii@a;%
\parse@UTFviii@a,%
\parse@UTFviii@a!%
\parse@UTFviii@b F\UTFviii@four@octets.{!,;}%
\fi
\fi
\fi
% \endgroup
}
% \end{macrocode}
% \end{macro}
%
% \begin{macro}{\parse@UTFviii@a}
%    Split the low six bits off the number in |\count@| and turn them
%    into a UTF-8 continuation octet. The argument |#1| is a
%    placeholder character: its |\uccode| is set to 128 plus the
%    remainder of |\count@| divided by 64, so that a later
%    |\uppercase| replaces the placeholder by that octet. On exit
%    |\count@| and |\@tempcntb| hold the quotient (the bits that still
%    have to be encoded) and |\@tempcnta| holds the octet value.
% \changes{v1.1b}{2004/02/09}{Space in the wrong place \cs{count @64}}
%    \begin{macrocode}
\gdef\parse@UTFviii@a#1{%
  \@tempcntb\count@
  \divide\@tempcntb 64
  \@tempcnta\@tempcntb
  \multiply\@tempcnta -64
  \advance\@tempcnta\count@
  \advance\@tempcnta 128
  \uccode`#1\@tempcnta
  \count@\@tempcntb}
% \end{macrocode}
% \end{macro}
%
% \begin{macro}{\parse@UTFviii@b}
%    Finish the byte sequence. |#1| is the first hex digit of the
%    lead octet marker (the callers pass |C|, |E| or |F|), so
%    |"#10| is added to what is left in |\count@| to obtain the lead
%    octet. That value becomes the |\uccode| of the placeholder
%    |#3|. |#2| is the command to put in front of the octets and |#4|
%    holds the placeholders whose |\uccode|s were already set by
%    |\parse@UTFviii@a|. The final |\uppercase| swaps all placeholders
%    for the real octets while |\UTFviii@tmp| is defined globally.
%    \begin{macrocode}
\gdef\parse@UTFviii@b#1#2#3#4{%
\advance\count@ "#10\relax
\uccode`#3\count@
\uppercase{\gdef\UTFviii@tmp{#2#3#4}}}
% \end{macrocode}
% \end{macro}
%
% \begin{macro}{\decode@UTFviii}
% \changes{v1.1o}{2015/08/28}{Macro added}
% In the reverse direction, take a sequence of octets (bytes)
% representing a character in UTF-8 and construct the Unicode number.
% The sequence is terminated by |\relax|.
%
% In this version, if the sequence is not valid UTF-8 you probably
% get a low level arithmetic error from |\numexpr| or stray characters
% at the end. Getting a better error message would be somewhat expensive.
% As the main use is for reporting characters in messages, this is done
% just using expansion, so |\numexpr| is used. A stub returning 0 is defined
% if |\numexpr| is not available.
% \begin{macrocode}
\ifx\numexpr\@undefined
%    \end{macrocode}
%
%    Stub used when |\numexpr| is not available. It takes the same
%    |\relax|-delimited argument as the real macro below, so the whole
%    octet sequence including the terminating |\relax| is removed from
%    the input and only |0| is left behind.
%    \begin{macrocode}
\gdef\decode@UTFviii#1\relax{0}
%    \end{macrocode}
%
%    \begin{macrocode}
\else
%    \end{macrocode}
%
%    If the input is malformed UTF-8 there may not be enough closing
%    |)|, so five extra ones are appended to make sure that some always
%    remain. |\numexpr| stops at the first |)| it cannot match, and
%    |\UTFviii@cleanup| then removes whatever is left over. This avoids
%    |\numexpr| parse errors while outputting a package error.
%    \begin{macrocode}
\gdef\decode@UTFviii#1\relax{%
  \expandafter\UTFviii@cleanup
    \the\numexpr\dec@de@UTFviii#1\relax)))))\@empty}
%    \end{macrocode}
%
%    Keep everything in front of the first remaining |)| (the digits
%    of the result) and drop the rest up to |\@empty|.
%    \begin{macrocode}
\gdef\UTFviii@cleanup#1)#2\@empty{#1}
%    \end{macrocode}
%
%    Build the |\numexpr| expression one octet at a time until
%    |\relax| is seen. A lead octet above |"EF|, |"DF| or |"BF| opens
%    four, three or two parentheses and contributes its payload bits;
%    every continuation octet (above |"7F|) closes one level,
%    multiplies by 64 and adds its low six bits; any other octet is
%    added as is.
%    \begin{macrocode}
\gdef\dec@de@UTFviii#1{%
\ifx\relax#1%
\else
  \ifnum`#1>"EF
    ((((`#1-"F0)%
  \else
    \ifnum`#1>"DF
      (((`#1-"E0)%
    \else
      \ifnum`#1>"BF
        ((`#1-"C0)%
      \else
        \ifnum`#1>"7F
        )*64+(`#1-"80)%
        \else
        +`#1 %
        \fi
      \fi
    \fi
  \fi
  \expandafter\dec@de@UTFviii
\fi}
%    \end{macrocode}
%
%    \begin{macrocode}
\fi
% \end{macrocode}
% \end{macro}
%
% \begin{macro}{\UTFviii@hexnumber}
% \changes{v1.1o}{2015/08/28}{Macro added}
% Convert a number to a sequence of uppercase hex digits.
% If |\numexpr| is not available, it returns its argument unchanged.
% \begin{macrocode}
\ifx\numexpr\@undefined
%    \end{macrocode}
%    Fallback without |\numexpr|: |\UTFviii@hexnumber| just returns
%    its argument and |\UTFviii@hexdigit| becomes an alias for
%    |\hexnumber@|. Both are |\let| globally; the |\global| prefix
%    must be followed by an assignment.
%    \begin{macrocode}
\global\let\UTFviii@hexnumber\@firstofone
\global\let\UTFviii@hexdigit\hexnumber@
%    \end{macrocode}
%    \begin{macrocode}
\else
%    \end{macrocode}
%    Expandable conversion: for values above 15 the macro first calls
%    itself on the quotient by 16 and then outputs the last digit.
%    As |\numexpr| division rounds to the nearest integer, 8 is
%    subtracted before dividing so that the result is the truncated
%    quotient. The remainder is not computed for 0, where the rounded
%    quotient would not be 0.
%    \begin{macrocode}
\gdef\UTFviii@hexnumber#1{%
  \ifnum#1>15 %
    \expandafter\UTFviii@hexnumber\expandafter{\the\numexpr(#1-8)/16\relax}%
  \fi
  \UTFviii@hexdigit{\numexpr#1\ifnum#1>0-((#1-8)/16)*16\fi\relax}%
}
%    \end{macrocode}
%
%    Almost but not quite |\hexnumber@|: the argument is evaluated
%    with |\numexpr| and mapped to one uppercase hex digit.
%    \begin{macrocode}
\gdef\UTFviii@hexdigit#1{\ifcase\numexpr#1\relax
  0\or1\or2\or3\or4\or5\or6\or7\or8\or9\or
  A\or B\or C\or D\or E\or F\fi}
%    \end{macrocode}
%
%    \begin{macrocode}
\fi
% \end{macrocode}
% \end{macro}
%
% \begin{macro}{\UTFviii@splitcsname}
% \changes{v1.1o}{2015/08/28}{Macro added}
% \begin{macro}{\UTFviii@hexcodepoint}
% \changes{v1.2e}{2018/09/28}{Macro added: Format codepoint properly}
%    |\UTFviii@splitcsname| splits a csname representing a Unicode
%    character and returns the character and the Unicode number in hex.
%    |\UTFviii@hexcodepoint| formats its numeric argument as |U+|
%    followed by the uppercase hex digits, padded with leading zeros
%    to at least four digits.
%    \begin{macrocode}
\gdef\UTFviii@hexcodepoint#1{U+%
  \ifnum#1<4096 0\ifnum#1<256 0\ifnum#1<16 0\fi\fi\fi
  \UTFviii@hexnumber{#1}%
}%
\gdef\UTFviii@splitcsname#1:#2\relax{%
%    \end{macrocode}
% \changes{v1.2b}{2018/03/26}{add percent as \cs{endlinechar} not -1 in the format}%
%    The argument is split at the first colon: |#1| (the part in front
%    of it) is discarded, |#2| (everything up to |\relax|) is output
%    followed by its code point in parentheses.
%    Need to pre-expand the argument to ensure cleanup in case of mal-formed UTF-8.
%    \begin{macrocode}
#2 (\expandafter\UTFviii@hexcodepoint\expandafter{%
\the\numexpr\decode@UTFviii#2\relax})%
}
% \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macrocode}
\endgroup
%    \end{macrocode}
%    The |\endgroup| above closes a group that was opened earlier in
%    the file; the definitions in front of it are made with |\gdef|
%    and therefore survive the end of that group.
%
%    \begin{macrocode}
\@onlypreamble\DeclareUnicodeCharacter
%    \end{macrocode}
%    These are preamble only as long as we don't support Unicode
%    charrefs in documents.
%    \begin{macrocode}
\@onlypreamble\parse@XML@charref
\@onlypreamble\parse@UTFviii@a
\@onlypreamble\parse@UTFviii@b
% \end{macrocode}
%
%
% \subsection{Loading Unicode mappings at begin document}
%
% The original plan was to set up the UTF-8 support at
% |\begin{document}|; but then any text characters used in the preamble
% (as people do even though advised against it) would fail in one way or