forked from w3c/i18n-glossary
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.html
610 lines (332 loc) · 86.3 KB
/
index.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
<!DOCTYPE html>
<html lang="en" dir="ltr">
<head>
<title>Internationalization Glossary</title>
<meta charset="utf-8"/>
<script src="https://www.w3.org/Tools/respec/respec-w3c" class="remove"></script>
<script class="remove">
// Settings object for the ReSpec script loaded above.
var respecConfig = {
  // Specification status (e.g. WD, LCWD, WG-NOTE, etc.). If in doubt use ED.
  specStatus: "ED",

  // Publication-history options, currently disabled:
  //   publishDate: "2021-07-08",
  //   previousPublishDate: "2019-04-16",
  //   previousMaturity: "WD",

  noRecTrack: true,
  shortName: "i18n-glossary",
  copyrightStart: "2021",
  edDraftURI: "https://w3c.github.io/i18n-glossary/",

  // If this is a LCWD, enable this and set the end of its review period:
  //   lcEnd: "2009-08-05",

  // People entries: add as many as you like; only "name" is required.
  // Template for an (unused) authors list:
  //   authors: [
  //     { name: "Person", mailto: "someone@example.com",
  //       company: "Invited Expert" },
  //   ],
  editors: [
    {
      name: "Richard Ishida",
      mailto: "ishida@w3.org",
      company: "W3C",
      w3cid: 3439
    },
    {
      name: "Addison Phillips",
      mailto: "addisonI18N@gmail.com",
      company: "Invited Expert",
      w3cid: 33573
    }
  ],

  // Source repository and owning group.
  github: "w3c/i18n-glossary",
  group: "i18n",
};
</script>
<!--
The ReSpec attribute 'data-include' loads the contents of the CSS files into the
'style' element provided when generating the output page.
-->
<style data-include="https://w3c.github.io/i18n-drafts/style/respec_2022.css"></style>
<style data-include="https://w3c.github.io/i18n-glossary/local.css"></style>
</head>
<body>
<div id="abstract">
<p>This document provides definitions for various terms related to W3C internationalization.</p>
</div>
<div id="sotd">
<p>We welcome comments on this document, but to make it easier to track them, please raise
separate issues for each comment, and point to the section
you are commenting on using a URL.</p>
</div>
<section>
<h2 id="introduction">Introduction</h2>
<p>This document can be pointed to for definitions of terms, or these definitions may be copied to other documents and slightly adapted.</p>
<p>The W3C Internationalization Working Group also uses <a href="https://www.unicode.org/glossary/" target="_blank">definitions provided by the Unicode Consortium</a>. For more information on how to use this glossary, see Appendix <a href="#how-to-use"></a>.</p>
</section>
<section>
<h2 id="links">Alphabetical links</h2>
<div id="link_list" spellcheck="false">
<a href="#a">A</a>
<a href="#b">B</a>
<a href="#c">C</a>
<a href="#d">D</a>
<a href="#e">E</a>
<a href="#f">F</a>
<a href="#g">G</a>
<a href="#h">H</a>
<a href="#i">I</a>
<a href="#j">J</a>
<a href="#k">K</a>
<a href="#l">L</a>
<a href="#m">M</a>
<a href="#n">N</a>
<a href="#o">O</a>
<a href="#p">P</a>
<a href="#r">R</a>
<a href="#s">S</a>
<a href="#t">T</a>
<a href="#u">U</a>
<a href="#v">V</a>
<a href="#w">W</a>
<a href="#z">Z</a>
</div>
</section>
<section id="glossary">
<h2>Glossary</h2>
<div class="letter_anchor" id="a">A</div>
<p><dfn class="lint-ignore export">Abjad</dfn>. A writing system in which consonants are indicated, but not short vowels. The term “abjad” is derived from the first four letters of the traditional order of the Arabic script: <i class="name">alef, beh, jeem, dal</i>. (See also the <a href="https://www.unicode.org/glossary/#abjad" target="_blank">Unicode definition</a> and <a href="https://www.unicode.org/versions/latest/ch06.pdf#G7382" target="_blank">Section 6.1, Writing Systems</a>.) Alternatives include [=abugida=], [=alphabet=] and [=syllabary=].</p>
<p><dfn class="lint-ignore export">Abugida</dfn>. A writing system in which consonants have an inherent vowel, and other vowels are indicated by associating the consonant with one or more combining marks and/or letters. The term “abugida” is derived from the first four letters of the Ethiopic script in the Semitic order: <i class="name">alf, bet, gaml, dant</i>. (See also the <a href="https://www.unicode.org/glossary/#abugida" target="_blank">Unicode definition</a> and <a href="https://www.unicode.org/versions/latest/ch06.pdf#G7382" target="_blank">Section 6.1, Writing Systems</a>.) Alternatives include [=abjad=], [=alphabet=] and [=syllabary=].</p>
<p><dfn class="lint-ignore export">Alphabet</dfn>. A writing system in which both consonants and vowels are indicated. The term “alphabet” is derived from the first two letters of the Greek script: <i class="name">alpha, beta</i>. (See also the <a href="https://www.unicode.org/glossary/#alphabet" target="_blank">Unicode definition</a> and <a href="https://www.unicode.org/versions/latest/ch06.pdf#G7382" target="_blank">Section 6.1, Writing Systems</a>.) Alternatives include [=abugida=], [=abjad=] and [=syllabary=].</p>
<p><dfn data-plurals="application internal identifiers" class="lint-ignore export">Application internal identifiers</dfn>. Identifiers defined by or assigned by a user in a [=vocabulary=] that is internal to the document format or protocol and not intended for human interaction. Such values are generally not [=localizable text=].</p>
<p><dfn data-lt="ASCII case-insensitive" class="lint-ignore export">ASCII case-insensitive matching</dfn>. <a href="https://infra.spec.whatwg.org/#ascii-case-insensitive">Defined in INFRA</a>, this compares two sequences of code points as if all ASCII code points in the range 0x41 to 0x5A (A to Z) were mapped to the corresponding code points in the range 0x61 to 0x7A (a to z), but other code points are not [=case-folded=]. ASCII case-insensitive matching can be required when a [=vocabulary=] is itself constrained to ASCII.</p>
<p><dfn class="lint-ignore export" data-lt="auto direction|auto base direction|auto paragraph direction">Auto (direction)</dfn>. A value used for the [=paragraph direction=] of textual data when the actual direction is unknown; it indicates that [=first-strong detection=] will be used to estimate the display of the text. See also [=LTR=] and [=RTL=].</p>
<a href="#links"><abbr title="Back to Alphabetical links">↑</abbr></a>
<div class="letter_anchor" id="b">B</div>
<p><dfn class="lint-ignore export">Base direction</dfn>. Determines the general arrangement and progression of content when [=bidirectional text=] is displayed. The [=Unicode Bidirectional Algorithm=] is primarily focused on arranging adjacent characters, based on character properties. Base direction works at a higher level, and dictates (a) the visual order and direction in which runs of strongly-typed LTR and RTL characters are displayed, and (b) where there are weakly-typed characters such as punctuation, the placement of those items relative to the other content.</p>
<p><dfn data-lt="basic multilingual plane|BMP" class="lint-ignore export">Basic Multilingual Plane (BMP)</dfn>. The first 65,536 code point positions in the Unicode character set are said to constitute the Basic Multilingual Plane. The BMP includes most of the more commonly used characters.</p>
<p><strong><em>Bidi algorithm</em></strong>, see [=Unicode Bidirectional Algorithm=].</p>
<p><dfn data-lt="bidirectional text|bidi text|bidi" class="lint-ignore export">Bidirectional text</dfn> (often referred to as "<em><strong>bidi text</strong></em>" for short). Text that mixes runs of both LTR and RTL text inline. It is common for right-to-left scripts, such as Arabic and Hebrew, to contain short runs of left-to-right text (most commonly in the Latin script), and several of the scripts that are predominantly right-to-left display numbers from left-to-right. Bidirectional text is the source of many of the difficulties when dealing with RTL scripts.</p>
<p><dfn data-lt="basic language range|Basic language range" class="lint-ignore export">Basic language range</dfn>. A [=language range=] consisting of a sequence of subtags separated by hyphens. That is, it is identical in appearance to a language tag.</p>
<p><dfn class="lint-ignore export">Bicameral</dfn>. <a href="https://unicode.org/glossary/#bicameral" target="_blank">Unicode definition</a>: <q>A script that distinguishes between two cases. (See <a href="https://unicode.org/glossary/#case" target="_blank">case</a>.)</q> Usually used to refer to scripts that have an upper- and lowercase distinction, such as many alphabetic scripts of European origin (Latin, Greek, Cyrillic).</p>
<p><dfn class="lint-ignore export" data-lt="bidi isolate">Bidirectional isolate</dfn>. A range of text, bounded by formatting characters or markup, that is treated by the [=Unicode Bidirectional Algorithm=] [[UAX9]] as directionally isolated from its surroundings. The entire range of text inside the isolate is treated by the surrounding text as if it were a single neutral character (such as <span class="codepoint" translate="no"><bdi lang="en"></bdi> <span class="uname">U+FFFC OBJECT REPLACEMENT CHARACTER</span>)</span>, and is assigned the corresponding display position in the surrounding text. Furthermore, the text inside the isolate has no effect on the ordering of the text outside it, and vice versa.</p>
<p><dfn class="lint-ignore export">Bidi isolation</dfn>. The use of [=bidi isolates=] in text in order to prevent the automatic rules of the Unicode Bidirectional Algorithm incorrectly ordering that content in relation to the surrounding text. For example, numbers following right-to-left text in memory are automatically positioned to the left of [=RTL=] text by the Bidi Algorithm, but sometimes need to appear to the right. Another example occurs when a list of RTL items occurs in a LTR sentence (and vice versa): the Bidi Algorithm will automatically assume that the order of items in the list should be "3 ,2 ,1", but actually what's needed is "1, 2, 3". In HTML, bidi isolation can be applied to a range of text by enclosing it in an element with a <code class="kw" translate="no">dir</code> attribute. In plain text there are Unicode formatting characters that can do the job. These mechanisms remove unwanted [=spillover effects=].</p>
<p><dfn class="lint-ignore">Block direction</dfn>. The initial base direction of a block of text, which resolves to either <em>left-to-right</em> or <em>right-to-left</em>. A block refers to a unit of text as a whole, such as a paragraph in a document or a string in a data file. The name "block" is chosen as a contrast to <em>inline direction</em>. Unicode calls this value the [=paragraph direction=].</p> <!-- This term not exported to prevent conflicts with 'block' related terminology. -->
<p><dfn class="lint-ignore">Block (Unicode)</dfn>. <a href="https://unicode.org/glossary/#block" target="_blank">Unicode definition</a>: <q>A grouping of characters within the Unicode encoding space used for organizing code charts. Each block is a uniquely named, continuous, non-overlapping range of code points, containing a multiple of 16 code points, and starting at a location that is a multiple of 16. A block may contain unassigned code points, which are reserved.</q> Note that a given script might be split across multiple blocks.</p>
<a href="#links"><abbr title="Back to Alphabetical links">↑</abbr></a>
<div class="letter_anchor" id="c">C</div>
<p><dfn data-lt="canonical Unicode locale identifier|canonical tag|canonical locale|Canonical Unicode locale identifier" data-plurals="canonical unicode locale identifiers" class="lint-ignore export">Canonical Unicode locale identifier</dfn>. A [=well-formed language tag=] resulting from the application of the [=Unicode locale identifier=] canonicalization rules found in [[UAX35]]. This process converts any [=valid=] [[BCP47]] [=language tag=] into a valid [=Unicode locale identifier=]. For example, deprecated subtags or irregular grandfathered tags are replaced with their preferred value from the [=IANA language subtag registry=].</p>
<p><dfn id="def_case_mapping" class="lint-ignore export">Case mapping</dfn>. The process of transforming characters to a specific case, such as UPPER, lower, or Titlecase. For those scripts that have a case distinction, Unicode defines a <em>default</em> UPPER, lower, and Titlecase character mapping for each Unicode code point. Case mapping, at first, appears simple. However, there are variations that need to be considered when treating the full range of Unicode in diverse languages.</p>
<p><dfn class="lint-ignore export" data-lt="case-folded">Case folding</dfn>. The process of making two texts which differ only in case identical for comparison purposes, that is, it is meant for the purpose of string matching. This is distinct from [=case mapping=], which is primarily meant for display purposes. As with the default case mappings, Unicode defines default case fold mappings ("case folding") for each Unicode code point. Unicode defines two forms of case folding.</p>
<p><dfn data-lt="case-sensitive|Case sensitive matching" class="lint-ignore export">Case sensitive matching</dfn>. A form of string matching in which [=code points=] are compared directly, with no [=case folding=].</p>
<p><dfn data-lt="character encoding|character encoding form" class="lint-ignore export">Character encoding</dfn> or, more formally, a <em>character encoding form</em>. The way a [=coded character set=] is mapped to bytes for manipulation in a computer. Commonly referred to as just the <span class="alt_name">encoding</span>. For examples and further descriptions see <a href="https://www.w3.org/International/articles/definitions-characters/index#charsets">Character encodings: Essential concepts</a>.</p>
<p><dfn class="lint-ignore export">Character set</dfn> or <span class="alt_name">repertoire</span>. The set of characters one might use for a particular purpose – be it those required to support Western European languages in computers, or those a Chinese child will learn at school in the third grade (nothing to do with computers).</p>
<p><dfn class="lint-ignore export">Circumgraph</dfn>. A single vowel code point that produces glyphs on more than one side of its consonant base. For example, in the Odia syllable <img src="img/circumgraph.svg" alt="କୋ" style="height:1.2rem;">, /ko/, the character <span class="codepoint" translate="no"><bdi lang="or">ୋ</bdi><span class="uname">U+0B4B ORIYA VOWEL SIGN O</span></span> produces separate glyphs on either side of the base consonant.</p>
<p><dfn class="lint-ignore">CJK</dfn>. An abbreviation for Chinese, Japanese, and Korean. Sometimes CJKV is used to include the Han characters used in Vietnamese.</p>
<p><dfn class="lint-ignore export">Coded character set</dfn>. A set of characters where a unique number has been assigned to each character. Units of a coded character set are known as [=code points=].</p>
<p><strong><em>CLDR</em></strong>, see [=Common Locale Data Repository=].</p>
<p><dfn class="lint-ignore export">Code point</dfn>. A code point value represents the position of a character in a coded character set. For example, the code point for the letter <span class="qchar">á</span> in the Unicode coded character set is 225 in decimal, or 0xE1 in hexadecimal notation. Hexadecimal notation is commonly used for referring to code points. See also [=Unicode code point=].</p>
<p><dfn class="lint-ignore export">Code unit</dfn>. The units of data used by a <a>character encoding</a> to encode or serialize characters into a programming language or other serialized form (such as a file). Common code units are 8-, 16-, and 32-bits in size. On the Web we are mostly concerned with <em>bytes</em>, which are technically <q>8-bit code units</q>. However, in JavaScript a <code>char</code> is a 16-bit code unit (related to the UTF-16 encoding of Unicode).</p>
<p><dfn data-lt="combining mark" class="lint-ignore export">Combining character</dfn>. Unicode characters such as accents, diacritics, Hebrew points, Arabic vowel signs, and Indic matras. They normally never appear alone unless they are being described, but are combined with a preceding base character. More than one combining character may be associated with the same base character. Many combining characters appear above or below or inside the base character, however some consume space along the baseline, either before or after the base character, and are referred to as <span class="name">spacing marks</span>, or <span class="name">spacing combining characters</span>. <a href="https://www.unicode.org/glossary/#combining_character" target="_blank">Unicode definition</a>: <q>A character with the General Category of Combining Mark (M). (See definition D52 in <a href="https://www.unicode.org/versions/latest/ch03.pdf#G30602" target="_blank">Section 3.6, Combination</a>.)</q></p>
<p><dfn data-lt="ccs" class="lint-ignore export">Combining character sequence</dfn>. <a href="https://www.unicode.org/glossary/#combining_character_sequence" target="_blank">Unicode definition</a>: <q>A maximal sequence of characters following the pattern <code>Base? (Combining_mark | ZWJ | ZWNJ)+</code>. Usually a base character that is a letter or digit, followed by one or more combining characters, zero width joiners, and/or zero width non-joiners.</q></p>
<p><strong><em>Combining mark</em></strong>. See [=Combining character=].</p>
<p><dfn data-lt="common locale data repository|CLDR|Common Locale Data Repository" class="lint-ignore export">Common Locale Data Repository</dfn> (or <em>CLDR</em>). The Common Locale Data Repository ([[CLDR]]) is a Unicode Consortium project that defines, collects, and curates sets of data needed to enable [=locales=] in systems or operating environments. CLDR data and its locale model are widely adopted, particularly in browsers.</p>
<p><dfn class="lint-ignore">Compatibility character</dfn>. <a href="https://www.unicode.org/glossary/#compatibility_character" target="_blank">Unicode definition</a>: <q>A character that would not have been encoded except for compatibility and round-trip convertibility with other standards. (See <a href="https://www.unicode.org/versions/latest/ch02.pdf#G11062" target="_blank">Section 2.3, Compatibility Characters</a>.)</q></p>
<p><dfn class="lint-ignore export">Composite message</dfn>. A single message, dynamically composed from more than one text string. The usual reason for creating composite messages is that one or more parts of the composite message will change according to the context. See <a href="https://www.w3.org/International/articles/composite-messages/">Working with Composite Messages</a>.</p>
<p><dfn class="lint-ignore export">Composite vowel</dfn>. A single vowel sound or diphthong that is represented by more than one code point from the available set of vowel marks, repurposed consonants, and diacritics.</p>
<p><dfn class="lint-ignore export">Conjunct</dfn>. A way of indicating consonant clusters, common in Brahmi-derived scripts, by visually merging or changing the glyphs for the sequence in some way. Conjunct behaviour is generally triggered in Unicode encoded text by adding a virama between the consonants.</p>
<p><dfn class="lint-ignore">Consonant cluster</dfn>. A sequence of consonants with no intervening vowels. See also [=conjunct=].</p>
<p><dfn class="lint-ignore export">Consumer</dfn>. When talking about strings on the Web, the W3C Internationalization group refers to a consumer as any process that receives natural language strings, either for display or processing.</p>
<p><dfn class="lint-ignore export">Cursive</dfn>. In the context of writing systems, this is applied to orthographies where letters are typically joined at the baseline (although some scripts have a few letters that only join on one side). Usually the font needs to support differences in glyph shape for the various joining contexts, which range from slight to radically different. Cursive scripts include Adlam, Arabic, Hanifi Rohingya, Mongolian, N'Ko, and Syriac. Letters in other scripts may also join, often at a hanging baseline, but they are not usually referred to as 'cursive', eg. Devanagari, Bengali, Gurmukhi, etc.</p>
<a href="#links"><abbr title="Back to Alphabetical links">↑</abbr></a>
<div class="letter_anchor" id="d">D</div>
<p><dfn class="lint-ignore export" data-lt="daylight saving|daylight saving time">Daylight Savings Time</dfn> (DST) or <dfn class="lint-ignore export">Summer Time</dfn>. An approach to setting times of the day that was adopted as a way of allowing people more sunlight hours in the evening. DST varies from country to country (not to mention locality-to-locality) and often has special one-off changes to accommodate special events. Not all regions observe DST: usually those closer to the equator do not need it. In converting times it is important to know when DST was introduced, and sometimes abandoned, for the local area, as well as on what dates DST starts and ends (which can vary from year to year). For example, Korea Standard Time and Japan Standard Time currently use the same [=zone offset=] and neither uses daylight saving. However, Japan abandoned DST in 1951, while South Korea used it last in 1988, so an application that tracks time values that reach back that far might need to track these time zones separately.</p>
<p><dfn class="lint-ignore export">Decomposed</dfn>. Decomposed text is usually the result of applying [=Unicode normalization form D=] (NFD), which splits Unicode characters into component parts, typically a base character plus one or more diacritics. However, a decomposed sequence of code points may also be intentionally (or unintentionally) used by a content author where a [=precomposed=] alternative exists.</p>
<p><dfn class="lint-ignore export">Dependent vowel</dfn>. <code class="kw" translate="no">Vowel_dependent</code> is one of the categories in the <code class="kw" translate="no">Indic_Syllabic_Category</code> property set (<a href="https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3AIndic_Syllabic_Category=Vowel_Dependent%3A%5D" target="_blank">see a list</a>). The Unicode Standard definition <a href="https://www.unicode.org/glossary/#dependent_vowel" target="_blank">says</a>: <q cite="https://www.unicode.org/glossary/#dependent_vowel">A symbol or sign that represents a vowel and that is attached or combined with another symbol, usually one that represents a consonant.</q> Dependent vowels are usually combining marks, but may also be letters (eg. in Thai, or New Tai Lue, which has no combining characters).</p>
<p><dfn class="lint-ignore export">Document character set</dfn>. The character set used for processing a document, regardless of what character encoding was used to store it. For XML and HTML (from version 4.0), this is always Unicode. This means that browsers convert all text to Unicode internally and the logical model describing how XML or HTML are processed is described in terms of the set of characters defined by Unicode.</p>
<a href="#links"><abbr title="Back to Alphabetical links">↑</abbr></a>
<div class="letter_anchor" id="e">E</div>
<p><dfn class="lint-ignore">European digits</dfn>. A term used by the Unicode Standard to refer to ASCII digits. <a href="https://www.unicode.org/glossary/#european_digits" target="_blank">Unicode definition</a>: <q>Forms of decimal digits first used in Europe and now used worldwide. Historically, these digits were derived from the Arabic digits; they are sometimes called “Arabic numerals,” but this nomenclature leads to confusion with the real Arabic-Indic digits. Also called "Western digits" and "Latin digits." See <a href="https://www.unicode.org/terminology/digits.html" target="_blank"> Terminology for Digits</a> for additional information on terminology related to digits.</q></p>
<p><dfn class="lint-ignore export">Extended grapheme cluster</dfn>. See <a>grapheme cluster</a>.</p>
<p><dfn class="lint-ignore export">Extended language range</dfn>. A [=language range=] consisting of a sequence of hyphen-separated subtags. In an extended language range, a subtag can either be a valid subtag or the wildcard subtag <q><code>*</code></q>, which matches any value.</p>
<a href="#links"><abbr title="Back to Alphabetical links">↑</abbr></a>
<div class="letter_anchor" id="f">F</div>
<p><dfn class="lint-ignore export">Featural syllabary</dfn>. A syllabic writing system where the syllable glyphs are not arbitrary shapes but instead show, usually in a regular way, phonological features that are part of the syllable they represent. Examples include Korean (where a syllabic character is made up of strokes representing the individual sounds of the syllable) and Canadian Syllabics (where the vowel part of the syllable is expressed by rotation of the syllable glyph). See also <a href="https://en.wikipedia.org/wiki/Featural_writing_system">Wikipedia</a>.</p>
<p><dfn class="lint-ignore export" data-lt="field-based time format">Field-based formats</dfn>. A time format that divides the date and/or time into separate field values such as year, month, day, hour, minute, second, etc. such as <samp>2016-06-11T05:10:16.590Z</samp>. Contrast this with an alternative way to express the same time, <samp>1465621816590</samp>, which is not field-based and is rather hard to read. Field-based times may or may not be tied to either <a href="#def_utc" class="termref">UTC</a> or the local time zone – or they may be indeterminate. Field-based times are also typically tied to a specific calendar (such as the Gregorian calendar). The formats described by the ISO 8601 standard are field-based.</p>
<p><dfn class="lint-ignore export">First-strong detection</dfn>. An algorithm that looks for the first strongly-directional character in a string (while ignoring embedded runs of isolated text), and then uses that to guess at the appropriate base direction for the string as a whole. Unicode code points are associated with properties relating to text direction: generally, letters in right-to-left scripts such as Arabic and Hebrew have a strong RTL direction, whereas Latin and Han characters have a strong LTR direction. Other characters, such as punctuation, only have a weak intrinsic directionality, and the actual directionality is determined according to the context in which they are found.</p>
<p><dfn class="lint-ignore export">Floating times</dfn>. Times that are not fixed to a specific [=incremental time=] value or time zone. When you apply time zone information to floating times they produce <em>a range of acceptable incremental time values</em>, because they represent a nominal time which is described in the same way in all time zones around the world. For example, Saturday 11 June 2016 happens to be the date of the British Queen's official 90th birthday. The specific time when 11th June starts or ends in Britain may actually be on Friday or Sunday in other countries, because their clocks are set differently, but the date of the event is always referred to as Saturday 11 June. Other examples of floating time events include the publication date for an issue of a newspaper, the date the Tokyo Olympics starts, the time the New Year starts, office hours set to "9 to 5" regardless of [=time zone=], and so on.</p>
<p><dfn class="lint-ignore export">Fullwidth</dfn>. <a href="https://www.unicode.org/glossary/#fullwidth" target="_blank">Unicode definition</a>: <q>Characters of East Asian character sets whose glyph image extends across the entire character display cell. In legacy character sets, fullwidth characters are normally encoded in two or three bytes. The Japanese term for fullwidth characters is zenkaku.</q> See also [=halfwidth=].</p>
<a href="#links"><abbr title="Back to Alphabetical links">↑</abbr></a>
<div class="letter_anchor" id="g">G</div>
<p><dfn class="lint-ignore">General category</dfn>. <a href="https://www.unicode.org/glossary/#general_category" target="_blank">Unicode definition</a>: <q>Partition of the characters into major classes such as letters, punctuation, and symbols, and further subclasses for each of the major classes. (See <a href="https://www.unicode.org/versions/latest/ch04.pdf#G124142" target="_blank">Section 4.5, General Category</a>.)</q></p>
<p><dfn class="lint-ignore export">Glyph</dfn>. The visual representation of a character when rendered by a particular font. In more complex orthographies a glyph may represent only a part of a character, or may represent more than one character. A font is a collection of glyph shapes, and different fonts or font rules can render a given character using a variety of different glyphs. For example, the letter 'a' can be represented using regular (a), bold (<b style="font-weight: bold;">a</b>), or italic (<i style="font-style: italic;">a</i>) glyphs.</p>
<p><dfn class="lint-ignore export">Grapheme</dfn>. A character or a sequence of characters in a visual representation of some text that a typical user would perceive as being a single unit (<q>character</q>). Graphemes are important for a number of text operations such as sorting or text selection, so it is necessary to be able to compute the boundaries between each user-perceived character. For more information about graphemes and grapheme clusters, with examples, see <a href="https://www.w3.org/International/articles/definitions-characters/index#characters">Character encodings: Essential concepts</a>.</p>
<p><dfn class="lint-ignore export">Grapheme cluster</dfn>. A grapheme cluster is defined by the Unicode Standard as the default mechanism for computing an approximation to [=graphemes=] (see <cite>Unicode Standard Annex #29: Text Segmentation</cite> [[UAX29]]). Two types of default grapheme cluster are defined. Unless otherwise noted, <q>grapheme cluster</q> in this document refers to an <q>extended default grapheme cluster</q>. (A discussion of grapheme clusters is also given in Section 2 of the <cite>Unicode Standard</cite>, [[Unicode]]. Cf. near the end of <a href="https://www.unicode.org/versions/latest/ch02.pdf" target="_blank">Section 2.11</a> in version 14.0 of The Unicode Standard.) Because different natural languages have different needs, grapheme clusters can also sometimes require tailoring. For example, a Slovak user might wish to treat the default pair of grapheme clusters "ch" as a single grapheme cluster. Note that the interaction between the language of string content and the end-user's preferences might be complex.</p>
<p><dfn class="lint-ignore export">Gregorian calendar</dfn>. The most widely used way of representing civil time. It is a solar calendar, with years usually consisting of 365 days, plus the concept of a "leap year". This adds an additional day every 4 years, except when the year is evenly divisible by 100 (unless the year is also evenly divisible by 400). There are numerous other calendars in use around the world, some of which are lunar calendars, some that are based on a different start date than the Gregorian calendar, and some that are reset each time a prominent person dies. Often these calendars are still used for religious purposes, but sometimes you will also find them being used in newspapers and emails, or for birth dates. There are technologies, such as ICU or Dojo, that support conversion between different calendaring systems.</p>
<a href="#links"><abbr title="Back to Alphabetical links">↑</abbr></a>
<div class="letter_anchor" id="h">H</div>
<p><dfn class="lint-ignore export">Halfwidth</dfn>. <a href="https://www.unicode.org/glossary/#halfwidth" target="_blank">Unicode definition</a>: <q>Characters of East Asian character sets whose glyph image occupies half of the character display cell. In legacy character sets, halfwidth characters are normally encoded in a single byte. The Japanese term for halfwidth characters is hankaku.</q> See also [=fullwidth=].</p>
<a href="#links"><abbr title="Back to Alphabetical links">↑</abbr></a>
<div class="letter_anchor" id="i">I</div>
<p><dfn class="lint-ignore export">Ideograph</dfn>. <a href="https://www.unicode.org/glossary/#ideograph" target="_blank">Unicode definition</a>: <q>(1) Any symbol that primarily denotes an idea or concept in contrast to a sound or pronunciation – for example, ♻, which denotes the concept of recycling by a series of bent arrows. (2) A generic term for the unit of writing of a logosyllabic writing system. In this sense, ideograph (or ideogram) is not systematically distinguished from logograph (or logogram). (3) A term commonly used to refer specifically to Han characters, equivalent to the Chinese, Japanese, or Korean terms also sometimes used: hànzì, kanji, or hanja.</q></p>
<p><dfn class="lint-ignore export">Ijam</dfn>. A diacritic in the Arabic script that is considered to be an integral part of a basic letter form, such as the dots in
<span class="codepoint" translate="no"><bdi lang="ar" dir="rtl">ث</bdi><span class="uname">U+062B ARABIC LETTER THEH</span></span>,
pronounced <span class="ipa">θ</span>. <strong>Unicode encodes letter+ijam combinations as atomic characters which are never given equivalent decompositions in the standard</strong>. Ijam generally take the form of one-, two-, three- or four-dot markings above or below the basic letter skeleton, although other diacritic forms occur, especially in extensions of the Arabic script in Central and South Asia and in Africa. For example, <span class="pre"><span class="codepoint" translate="no"><bdi lang="ar" dir="rtl">ۈ</bdi><span class="uname">U+06C8 ARABIC LETTER YU</span></span></span> shows a letter with ijam that represents the vowel <span class="ipa">y</span> in the Uighur orthography. See <a href="https://www.unicode.org/versions/Unicode13.0.0/ch09.pdf" target="_blank">Chapter 9</a> of the Unicode Standard. See also: [=tashkil=].</p>
<p><dfn class="lint-ignore export">Incremental time</dfn>. A way of representing time in computers that is based on a progression of fixed integer units that increase monotonically from a specific point in time (called the "epoch"). Java (and many other systems) count time as the number of milliseconds since midnight (00:00 a.m.) on January 1, 1970 in <a href="#def_utc" class="termref">UTC</a> (less all of the intervening leap seconds). Other systems use different units and/or epochs. For example, the incremental time for 11 June, 2016 at 6.10am BST in JavaScript is <samp>1465621816590</samp>. Most programming languages and operating environments provide or use incremental time for working with time values. However, incremental time is not usually seen directly by users, but is typically mapped to a [=field-based time format=] for interchange or for human consumption.</p>
<p><dfn class="lint-ignore export">Independent vowel</dfn>. Independent letters used to represent [=standalone vowel=] sounds. They are typically found in Brahmi-derived Indic scripts, at the beginning of a word or after a word-internal vowel.</p>
<p><dfn class="lint-ignore export">Inherent vowel</dfn>. A vowel sound that is automatically pronounced after a consonant letter, unless suppressed by either indicating another vowel, or using a character specifically designed to kill the vowel sound, or contextual rules. Inherent vowels are commonly found in Brahmi-derived Indic scripts. The sound of the inherent vowel varies by language.</p>
<p><dfn data-lt="I18N|internationalized|internationalisation" class="lint-ignore export">Internationalization</dfn>. The design and development of a product that is enabled for target audiences that vary in culture, region, or language. Internationalization is sometimes abbreviated <code>i18n</code> because there are eighteen letters between the "I" and the "N" in the English word.</p>
<p><dfn class="lint-ignore export">International preferences</dfn>. A user's particular set of language and formatting preferences and associated cultural conventions. Software can use these preferences to correctly process or present information exchanged with that user.</p>
<p><dfn data-lt="subtag registry|registry|lstr" class="lint-ignore export">IANA Language Subtag Registry</dfn>. A machine-readable text file available via IANA which contains a comprehensive list of all of the subtags valid in language tags. (Link: <a href="https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry">Registry</a>)</p>
<a href="#links"><abbr title="Back to Alphabetical links">↑</abbr></a>
<div class="letter_anchor" id="j">J</div>
<p><dfn class="lint-ignore export">Jamo</dfn>. The basic unit used to form Hangul syllables, representing vowels and consonants in Korean. In addition to code points for Jamo, Unicode encodes 11,172 Hangul <em>syllables</em>. These represent combinations of Jamo as single, pre-composed characters. In practice, such syllables are the main characters in actual use. (See also: <a href="https://www.unicode.org/faq/korean.html" target="_blank">Unicode Korean FAQ</a>)</p>
<a href="#links"><abbr title="Back to Alphabetical links">↑</abbr></a>
<div class="letter_anchor" id="k">K</div>
<p><dfn class="lint-ignore export">Kana</dfn>. <a href="https://unicode.org/glossary/#kana" target="_blank">Unicode definition</a>: <q cite="https://unicode.org/glossary/#kana">A collective term for the two syllabic scripts used (along with <a href="https://unicode.org/glossary/#kanji" target="_blank">kanji</a> and <a href="https://unicode.org/glossary/#romaji" target="_blank">romaji</a>) by the Japanese writing system. The two forms are <em><a href="https://unicode.org/glossary/#hiragana" target="_blank">hiragana</a></em> and <em> <a href="https://unicode.org/glossary/#katakana" target="_blank">katakana</a></em>.</q></p>
<a href="#links"><abbr title="Back to Alphabetical links">↑</abbr></a>
<div class="letter_anchor" id="l">L</div>
<p><dfn class="lint-ignore export">Language metadata</dfn>. When contrasted with the [=text-processing language=], this indicates the <em>intended use of the resource</em> as a whole. For example, such metadata may be used for searching for a relevant resource, for serving the right language version, for classification, etc. This type of language declaration differs from that of the text-processing language declaration in that (a) the value for such declarations may be more than one language subtag, and (b) the language value declared doesn't indicate which bits of a multilingual resource are in which language.</p>
<p><dfn class="lint-ignore export">Language tag extension</dfn>. A system of additional [[BCP47]] subtags introduced by a single letter or digit subtag registered with IANA and permitting additional types of language identification.</p>
<p><dfn class="lint-ignore export">Language negotiation</dfn>. Any process which selects or filters content based on language. Usually this implies selecting content in a single language (or falling back to some meaningful default language that is available) by finding the best matching values when several languages or locales [[LTLI]] are present in the content. Some common language negotiation algorithms include the Lookup algorithm in [[BCP47]] or the BestFitMatcher in [[ECMA-402]].</p>
<p><dfn class="lint-ignore export">Language priority list</dfn>. A collection of one or more [=language ranges=] identifying the user's language preferences for use in matching. As the name suggests, such lists are normally ordered or weighted according to the user's preferences. The HTTP [[RFC2616]] <code>Accept-Language</code> [[RFC3282]] header is an example of one kind of language priority list.</p>
<p><dfn data-lt="language-range" class="lint-ignore export">Language range</dfn>. A string similar in structure to a language tag that is used for "identifying sets of language tags that share specific attributes".</p>
<p><dfn class="lint-ignore export">Language subtag</dfn>. A sequence of ASCII letters or digits separated from other subtags by the hyphen-minus character and identifying a specific element of meaning within the overall [=language tag=]. In [[BCP47]], subtags can consist of upper or lowercase ASCII letters (the case carries no distinction) or ASCII digits. Subtags are limited to no more than eight characters (although additional length restrictions apply depending on the specific use of the subtag).
</p>
<p><dfn class="lint-ignore export">Language tag</dfn>. A string used as an identifier for a language, usually referring explicitly to a [[BCP47]] language tag. These language tags consist of one or more [=language subtags=].
</p>
<p><dfn class="lint-ignore export">Legacy character encodings</dfn>. Character encoding forms that do not encode the full repertoire of characters in the Unicode character set.</p>
<p><dfn class="lint-ignore export">Locale</dfn>. An identifier (such as a [=language tag=]) for a set of [=international preferences=]. Usually this identifier indicates the preferred language of the user and possibly includes other information, such as a geographic region (such as a country). A locale is passed in APIs or set in the operating environment to obtain culturally-affected behavior within a system or process.</p>
<p><dfn class="lint-ignore export">Locale-aware</dfn> (or <em>Enabled</em>). A system that can respond to changes in the [=locale=] with culturally and language-specific behavior or content. Generally, systems that are internationalized can support a wide range of [=locales=] in order to meet the [=international preferences=] of many kinds of users.</p>
<p><dfn class="lint-ignore export">Locale fallback</dfn>. The process of searching for translated content, locale data, or other resources by "falling back" from more-specific resources to more-general ones following a deterministic pattern.</p>
<p><dfn data-lt="locale neutral" class="lint-ignore export">Locale-neutral</dfn>. A [=non-linguistic field=] is said to be locale-neutral when it is stored or exchanged in a format that is not specifically appropriate for any given language, locale, or culture and which can be interpreted unambiguously for presentation in a [=locale-aware=] way.</p>
<p><dfn class="lint-ignore export">Localizable content</dfn>. Content that can be adapted to meet the needs of a particular language, culture, or region. It includes both [=localizable text=] and non-text content such as icons.</p>
<p><dfn class="lint-ignore export">Localizable text</dfn>. Refers to string content intended as human-readable text and <b>not</b> to any of the surrounding or embedded syntactic content that form part of the document structure. Note that syntactic content can have localizable text embedded in it, such as when an [[HTML]] <code class="kw">img</code> element has an <code class="kw">alt</code> attribute containing a description of the image. [[CHARMOD-NORM]] gives <a href="https://www.w3.org/TR/charmod-norm/#dfn-localizable-content">some additional examples</a>. See also [=localizable content=], [=syntactic content=], and [=natural language=].</p>
<p><dfn data-lt="localisation" class="lint-ignore export">Localization</dfn>. The tailoring of a system to the individual cultural expectations of a specific target market or group of individuals. Localization includes, but is not limited to, the translation of user-facing text and messages. Localization is sometimes abbreviated as <code>l10n</code> because there are ten letters between the "L" and the "N" in the English word. When a particular set of content and preferences corresponding to a specific set of international preferences is operationally available, then the system is said to be <em>localized</em>.</p>
<p><dfn class="lint-ignore export">Logical order</dfn>. Some scripts, in particular Arabic and Hebrew, are written from right to left. Text including characters from these scripts can run in both directions and is therefore called [=bidirectional text=]. The Unicode Standard
[[Unicode]] requires that characters be stored and interchanged in logical order, i.e. roughly corresponding to the order in which text is typed in via the keyboard or spoken (for a more detailed definition see [[Unicode]], Section 2.2). Logical ordering is important to ensure interoperability of data, and also benefits accessibility, searching, and collation.
</p>
<p><dfn class="lint-ignore export" data-lt="left-to-right">LTR</dfn>. Stands for "left-to-right" and refers to the inline base direction of left-to-right [[UAX9]]. This is the base text direction used by languages whose starting character progression begins on the left side of the page in horizontal text. It's used for scripts such as Latin, Cyrillic, Devanagari, and many others. See also [=RTL=] and [=auto (direction)=].</p>
<a href="#links"><abbr title="Back to Alphabetical links">↑</abbr></a>
<div class="letter_anchor" id="m">M</div>
<p><dfn class="lint-ignore export">Metadata</dfn>. Additional information about data. Key types of metadata for [=internationalization=] are [=language metadata=] and metadata to support [=bidirectional text=]. Metadata has a scope, e.g., a string or a set of strings. In absence of explicit metadata, defaults might apply, e.g. defaults for the [=base direction=] of a text.</p>
<p><dfn class="lint-ignore export">Mojibake</dfn> (<span lang="ja">文字化け</span>). Garbled or incorrectly rendered or processed characters, generally caused by using the wrong <a>character encoding</a> to interpret the bytes in a string or file. The word is Japanese in origin and is pronounced <q>/mo.d͡ʒi.ba.ke/</q>. For example, the word <q><span lang="ja">文字化け</span></q> encoded as <a>UTF-8</a> might be displayed as <q>æ–‡å&shy;—åŒ–ã‘</q> if viewed in an application that thinks (incorrectly) that the character encoding is <code>ISO-8859-1</code>.</p>
<a href="#links"><abbr title="Back to Alphabetical links">↑</abbr></a>
<div class="letter_anchor" id="n">N</div>
<p><dfn class="lint-ignore export">Natural Language</dfn> (sometimes just <em>language</em>). Refers to the spoken, written, or signed communications used by human beings. See also [=localizable text=] and [=syntactic content=].</p>
<p><dfn class="lint-ignore export">Non-linguistic Field</dfn>. Any element of a data structure not intended for the storage or interchange of natural language textual data. This includes non-string data types, such as booleans, numbers, dates, and so forth. It also includes strings, such as program or protocol internal identifiers. This document uses the term <em>field</em> as a shorthand for this concept.</p>
<p><dfn class="lint-ignore export" data-lt="normalisation|Unicode normalization">Normalization</dfn>. The process of removing alternate representations of equivalent sequences from textual data, to convert the data into a form that can be binary-compared for equivalence. In internationalization contexts this usually refers to applying one of the Unicode normalization forms, such as [=NFC=], to a string. For more info, see [[CHARMOD-NORM]] or [[UAX15]].</p>
<p><dfn class="lint-ignore export" data-lt="unicode normalization form c|nfc">Normalization Form C</dfn> (<strong>NFC</strong>). <a href="https://www.unicode.org/glossary/#normalization_form_c" target="_blank">Unicode definition</a>: <q cite="https://www.unicode.org/glossary/#normalization_form_c">A normalization form that erases any canonical differences, and generally produces a <strong>composed</strong> result. For example, a + umlaut is
converted to ä in this form. This form most closely matches legacy
usage. The formal definition is D120 in <a href="https://www.unicode.org/versions/latest/ch03.pdf#G49537" target="_blank">
Section 3.11, Normalization Forms</a>.</q> See also [=normalization=].</p>
<p><dfn class="lint-ignore export" data-lt="unicode normalization form d|nfd">Normalization Form D</dfn> (<strong>NFD</strong>). <a href="https://www.unicode.org/glossary/#normalization_form_d" target="_blank">Unicode definition</a>: <q cite="https://www.unicode.org/glossary/#normalization_form_d">A normalization form that erases any canonical differences, and produces a <strong>decomposed</strong> result. For example, ä is converted to a + umlaut in this form. This form is most often used in internal processing, such as in collation. The formal definition is D118 in <a href="https://www.unicode.org/versions/latest/ch03.pdf#G49537" target="_blank"> Section 3.11, Normalization Forms</a>.</q> See also [=normalization=].</p>
<p><dfn class="lint-ignore export" data-lt="unicode normalization form kc|nfkc">Normalization Form KC</dfn> (<strong>NFKC</strong>). <a href="https://unicode.org/glossary/#normalization_form_kc" target="_blank">Unicode definition</a>: <q cite="https://unicode.org/glossary/#normalization_form_kc">A normalization form that erases both canonical and compatibility differences, and generally produces a composed result: for example, the single dž character is converted to d + ž in this form. This form is commonly used in matching. The formal definition is D121 in <a href="https://www.unicode.org/versions/latest/ch03.pdf#G49537">Section 3.11, Normalization Forms</a>.</q> Note that compatibility decomposition removes meaning from the text that it is applied to. Some developers and specification authors find this normalization form attractive because it appears to bring together many strings that are logically similar, but NFKC has limited utility in actual practice and has side effects that confuse users. This definition is provided for completeness, but NFKC is not generally appropriate for use on the Web. See also [=normalization=].</p>
<p><dfn class="lint-ignore export" data-lt="unicode normalization form kd|nfkd">Normalization Form KD</dfn> (<strong>NFKD</strong>). <a href="https://unicode.org/glossary/#normalization_form_kd" target="_blank">Unicode definition</a>: <q cite="https://unicode.org/glossary/#normalization_form_kd">A normalization form that erases both canonical and compatibility differences, and produces a decomposed result: for example, the single dž character is converted to d + z + caron in this form. The formal definition is D119 in <a href="https://www.unicode.org/versions/latest/ch03.pdf#G49537">Section 3.11, Normalization Forms</a>.</q> Some developers and specification authors find this normalization form attractive because it appears to bring together many strings that are logically similar, but NFKD has limited utility in actual practice and has side effects that confuse users. This definition is provided for completeness, but NFKD is not generally appropriate for use on the Web. See also [=normalization=].</p>
<a href="#links"><abbr title="Back to Alphabetical links">↑</abbr></a>
<div class="letter_anchor" id="o">O</div>
<p><dfn class="lint-ignore export">Orthographic syllable</dfn>. A [=typographic character unit=] that includes one or more [=grapheme clusters=]. These are most commonly found in Brahmi-derived scripts (such as Devanagari, or Balinese) when forming conjuncts or stacks. They commonly demarcate sequences of characters that are different from phonetic syllables, and they may even span word boundaries. See also the <a href="https://unicode.org/glossary/#orthographic_syllable" target="_blank">definition</a> in the Unicode Standard.</p>
<a href="#links"><abbr title="Back to Alphabetical links">↑</abbr></a>
<div class="letter_anchor" id="p">P</div>
<p><dfn class="lint-ignore export">Paragraph direction</dfn>. The initial <a>base direction</a> of a paragraph or string, which resolves to either <em>left-to-right</em> or <em>right-to-left</em>. Nested embedding controls may be used to change the direction of an inline range of text, but the paragraph direction sets the starting point which the <a>Unicode Bidirectional Algorithm</a> uses to calculate the directions of the embedded levels. For more details, see <a href="https://www.unicode.org/reports/tr9/" target="_blank">Unicode Standard Annex #9, Unicode Bidirectional Algorithm</a> [[UAX9]], especially definitions BD2–BD5.<br><span class="see-also">See also the definition in the <a href="https://www.unicode.org/glossary/#paragraph_direction" target="_blank">Unicode Standard</a>.</span></p>
<p><dfn class="lint-ignore export" data-lt="percent encoding|%HH encoding|percent encoded">Percent-encoding</dfn>. Percent encoding is the escaping mechanism defined by URI [[RFC3986]] for the encoding of arbitrary byte values not otherwise permitted into a URI. For example, if a user wishes to include the character <span class="codepoint" translate="no"><bdi lang="und">/</bdi><span class="uname">U+002F SOLIDUS</span></span> in a URI, the byte <code>0x2F</code> is encoded as the character sequence <code>%2F</code>. If the user wishes to include the character <span class="codepoint" translate="no"><bdi lang="en">é</bdi><span class="uname">U+00E9 LATIN SMALL LETTER E WITH ACUTE</span></span> in a URI, the <a>UTF-8</a> byte sequence for this character (<code>0xC3 0xA9</code>) could be encoded as the sequence <code>%C3%A9</code>.</p>
<p><dfn class="lint-ignore export">Plane</dfn>. <a href="https://unicode.org/glossary/#plane" target="_blank">Unicode definition</a>: <q cite="https://unicode.org/glossary/#plane">A range of 65,536 (10000<sub>16</sub>) contiguous Unicode code points, where the first code point is an integer multiple of 65,536 (10000<sub>16</sub>). Planes are numbered from 0 to 16, with the number being the first code point of the plane divided by 65,536. Thus Plane 0 is U+0000..U+FFFF, Plane 1 is U+<b>1</b>0000..U+<b>1</b>FFFF, ..., and Plane 16 (10<sub>16</sub>) is U+<b>10</b>0000..<b>10</b>FFFF. (Note that ISO/IEC 10646 uses hexadecimal notation for the plane numbers—for example, Plane B instead of Plane 11).</q> See also [=Basic Multilingual Plane=].</p>
<p><dfn data-lt="pre-base" class="lint-ignore export">Pre-base vowel</dfn>. A pre-base (or prescript) vowel glyph is displayed before the consonant or orthographic syllable after which it is pronounced. If the vowel character is a combining mark, it is still typed and stored in pronunciation order, and the application will render it in the correct location. In some scripts, such as Thai, a pre-base vowel glyph is represented by a normal letter, which is typed and stored in the correct position relative to the base.</p>
<p><dfn class="lint-ignore export">Precomposed</dfn>. A precomposed character is one that can also be broken down into separate code points representing its component parts (decomposition). Typically this will include base characters with diacritics, such as accented Latin characters, or Indic characters with nuktas. [=Normalization Form C=] (NFC) produces precomposed characters from many [=decomposed=] sequences.</p>
<p><dfn class="lint-ignore export">Producer</dfn>. When talking about strings on the Web, the W3C Internationalization group refers to a producer as any process where natural language string data is created for later storage, processing, or interchange.</p>
<a href="#links"><abbr title="Back to Alphabetical links">↑</abbr></a>
<div class="letter_anchor" id="r">R</div>
<p><dfn class="lint-ignore export">Resource</dfn>. In the context of W3C Internationalization documents, a given document, file, or protocol "message" which includes both the [=localizable content=] as well as the [=syntactic content=] such as identifiers surrounding or containing it. For example, in an HTML document that also has some CSS and a few <code class="kw" translate="no">script</code> elements with embedded JavaScript, the entire HTML document, considered as a file, is a resource. This term is intentionally similar to the term 'resource' as used in [[RFC3986]], although here the term is applied loosely. </p>
<p><dfn class="lint-ignore export">Resource identifier</dfn>. A compact string of characters for identifying an abstract or physical <a class="termref">resource</a>. On the Web, this mostly means various types of Uniform Resource Identifiers (or <em>URIs</em>). For wire formats, [[RFC3986]] defines the structure and serialization. <em>Internationalized Resource Identifiers</em> (or <em>IRIs</em>) [[RFC3987]] describes how non-ASCII Unicode characters can be used in resource identifiers. The WHATWG [[URL]] specification describes how browsers handle IRIs and their mapping to URIs.</p>
<p><dfn class="lint-ignore export" data-lt="right-to-left">RTL</dfn>. Stands for "right-to-left" and refers to the inline base direction of right-to-left [[UAX9]]. This is the base text direction used by languages whose starting character progression begins on the right side of the page in horizontal text. It's used for a variety of scripts which include Arabic, Hebrew, N'Ko, Adlam, Thaana, and Syriac among others. See also [=LTR=] and [=auto (direction)=].</p>
<p><dfn class="lint-ignore export">Ruby</dfn>. A name for small (usually phonetic) annotations that are rendered alongside text. 'Ruby' is a British and Japanese printing term (often also called <i class="name">furigana</i> in Japan). Similar annotations are also used for Chinese, and sometimes Mongolian, and Korean.</p>
<a href="#links"><abbr title="Back to Alphabetical links">↑</abbr></a>
<div class="letter_anchor" id="s">S</div>
<p><strong><em>Scalar value</em></strong>, see [=Unicode scalar value=].</p>
<p><dfn class="lint-ignore">Script</dfn>. <a href="https://www.unicode.org/glossary/#script" target="_blank">Unicode definition</a>: <q>A collection of letters and other written signs used to represent textual information in one or more writing systems. For example, Russian is written with a subset of the Cyrillic script; Ukrainian is written with a different subset. The Japanese writing system uses several scripts.</q> See also [=writing system=].</p>
<p><dfn class="lint-ignore export">Serialization agreement</dfn>. When talking about strings on the Web, the W3C Internationalization group refers to serialization agreements as the common understanding between a producer and consumer about the serialization of string metadata: how it is to be understood, serialized, read, transmitted, removed, etc.</p>
<p><dfn class="lint-ignore export">Shaping</dfn>. Making context-sensitive changes to glyph shapes. Shaping may or may not occur at the same time as context-sensitive positioning of glyphs (such as higher diacritics over tall base characters).</p>
<p><dfn class="lint-ignore export">Spacing mark</dfn>. Combining characters that consume space along the baseline, either before or after the base character.</p>
<p><dfn class="lint-ignore export" data-lt="spillover effects">Spillover</dfn>. Errors in text presentation due to a lack of [=bidi isolation=]. When strings appear next to each other or when values are inserted into text without isolation, the [=bidi algorithm=] can visually rearrange the text in ways that make the text illegible. See the article <a href="https://www.w3.org/International/articles/inline-bidi-markup/index.en.html"><cite>Inline markup and bidirectional text in HTML</cite></a> for more info. See also: [=bidi isolation=].</p>
<p><dfn class="lint-ignore export">Standalone vowel</dfn>. Vowel sounds that are not immediately preceded by a consonant sound; they typically appear at the beginning or in the middle of a word. In Brahmi-derived scripts standalone vowels are often represented using [=independent vowel=] letters.</p>
<p><dfn class="lint-ignore export">String direction</dfn>. The overall direction of a specific string, which indicates the presentation order of string-internal directional runs. Strings transmitted inside various data structures are often inserted into a block (such as a paragraph). In such a case, the string direction is needed as part of the [=bidi isolation=] of the string.</p>
<p><dfn class="lint-ignore export" data-lt="supplementary code point">Supplementary character</dfn>. Beyond the [=Basic Multilingual Plane=] the Unicode character set also contains space for around a million additional code point positions. Characters in this latter range are referred to as supplementary characters. In the UTF-16 encoding, each [=supplementary character=] is encoded using a pair of [=surrogates=].</p>
<p><dfn class="lint-ignore" data-lt="surrogate">Surrogate code point</dfn>. <a href="https://www.unicode.org/glossary/#surrogate_code_point" target="_blank">Unicode definition</a>: <q>A Unicode code point in the range <code class="kw" translate="no">U+D800..U+DFFF</code>. Reserved for use by UTF-16, where a pair of surrogate code units (a high surrogate followed by a low surrogate) “stand in” for a [=supplementary code point=].</q> This term is also defined by [[INFRA]].</p>
<p><dfn class="lint-ignore export">Surrogate pair</dfn>. In the UTF-16 [=character encoding=] of Unicode, a sequence of two [=surrogate code points=], one from the range <code class="kw" translate="no">U+D800...U+DBFF</code> (a <em>high surrogate</em>) followed by one from the range <code class="kw" translate="no">U+DC00...U+DFFF</code> (a <em>low surrogate</em>). Each surrogate pair encodes a single [=supplementary code point=].</p>
<p><dfn class="lint-ignore export">Syllabary</dfn>. A type of writing system in which each symbol typically represents both a consonant and a vowel, or in some instances more than one consonant and a vowel. Usually there is also a set of symbols that represent [=standalone vowel=] sounds. Alternatives include [=abugida=], [=alphabet=] and [=abjad=].</p>
<p><dfn class="lint-ignore export">Syllable</dfn>. A model of syllable structure divides the syllable into an <em>onset</em> followed by a <em>rhyme</em>. The rhyme is typically composed of a <em>nucleus</em> and an optional <em>coda</em>. The nucleus is the most sonorous part of the syllable. A syllable always has a nucleus, but syllables may have no onset and/or coda (e.g. compare 'but', 'an', 'the', 'a').</p>
<p><dfn class="lint-ignore export">Syntactic content</dfn>. Any text in a document format or protocol that belongs to the structure of the format or protocol. This definition includes values that are typically thought of as "markup" but can also include other values, such as the name of a field in an HTTP header. Syntactic content consists of all of the characters that form the <em>structure</em> of a format or protocol. For example, <span class="qchar">&lt;</span> and <span class="qchar">&gt;</span> (as well as the element name and various attributes they surround) are part of the syntactic content in an HTML document. Syntactic content usually is defined by a specification or specifications and includes both the defined, reserved keywords for the given protocol or format as well as string tokens and identifiers that are defined by document authors to form the structure of the document (rather than the "content" of the document). See also [=localizable text=] and [=natural language=].</p>
<a href="#links"><abbr title="Back to Alphabetical links">↑</abbr></a>
<div class="letter_anchor" id="t">T</div>
<p><dfn class="lint-ignore export">Tashkil</dfn>. An Arabic script mark that indicates vocalization of text or other types of phonetic guide which indicate pronunciation, such as in <span class="codepoint" translate="no"><bdi lang="ar" dir="rtl">ثَ</bdi><span class="uname">U+062B ARABIC LETTER THEH</span> + <span class="uname">U+064E ARABIC FATHA</span></span>,
pronounced <span class="ipa">θa</span>. These include several subtypes: <span class="name">harakat</span> (short vowel marks), <span class="name">tanwin</span> (postnasalized, that is, an extra <span class="ipa">n</span> sound at the end of a noun marked by a double similar harakat), <span class="name">shaddah</span> (consonant gemination mark), and <span class="name">sukun</span> (to mark lack of a following vowel). <strong>A basic Arabic letter plus any of these types of marks is never encoded as an atomic, precomposed character, but must always be represented as a sequence of letter plus separate combining mark.</strong> For example, <span class="codepoint" translate="no"><bdi lang="ar" dir="rtl">هٰ</bdi><span class="uname">U+0647 ARABIC LETTER HEH</span> + <span class="uname">U+0670 ARABIC LETTER SUPERSCRIPT ALEF</span></span> pronounced <span class="ipa">ha</span>, is an example of a letter plus tashkil combination in Arabic (cf. the use of that diacritic in a precomposed Uighur letter). See <a href="https://www.unicode.org/versions/Unicode13.0.0/ch09.pdf" target="_blank">Chapter 9</a> of the Unicode Standard. See also [=ijam=].</p>
<p><dfn data-lt="text processing language" class="lint-ignore export">Text-processing language</dfn>. The language in which a specific range of text is actually written. This needs to be declared so that user agents or applications that manipulate the text, such as voice browsers, spell checkers, style processors, hyphenators, etc., can apply the appropriate rules to the text in question. So we are, by necessity, talking about associating a <em>single</em> language with a <em>specific</em> range of text. Contrast this with [=language metadata=].</p>
<p><dfn class="lint-ignore export">Time zone</dfn>. A set of rules for determining the local time (wall time) as it relates to incremental time (as used in most computing systems) for a particular geographical region, and vice versa. Time zone rules have to take into account [=zone offsets=] <em>plus</em> any [=daylight savings=] modifications to wall time that apply.</p>
<p><dfn class="lint-ignore export">Time zone identifiers</dfn>. Identifiers that allow you to refer to a particular difference from [=UTC=] that includes both [=zone offsets=] and [=daylight savings time=]. The most definitive reference for identifying sets of time zone rules is the <a href="https://www.iana.org/time-zones">TZ database</a> (also known as the Olson time zone database), which is used by systems such as various commercial UNIX operating systems, Linux, Java, CLDR, ICU, and many other systems and libraries. Other systems exist: for example, Microsoft Windows uses its own data set and identifiers. In the TZ database, time zones are given IDs that typically consist of a region and exemplar city. An exemplar city is a city in the time zone in question that should be well-known to people using the time zone. For example, the U.S. Pacific time zone has a TZ database ID of <code class="kw" translate="no">America/Los_Angeles</code>. The TZ database also supplies aliases for many IDs; for example, <code class="kw" translate="no">Asia/Ulan_Bator</code> is equivalent to <code class="kw" translate="no">Asia/Ulaanbaatar</code>. The Common Locale Data Repository (CLDR) can be used to provide a localized form for the IDs. Note that some systems, such as Apple's Mac OS, provide additional exemplar cities.</p>
<p><dfn class="lint-ignore export">Titlecase</dfn>. <a href="https://unicode.org/glossary/#titlecase">Unicode definition:</a> <em>Uppercased initial letter followed by lowercase letters in words. A casing convention often used in titles, headers, and entries, as exemplified in this glossary.</em> Note that titlecasing rules are language-sensitive. For more information, see <a data-cite="charmod-norm#definitionCaseFolding">Case Mapping and Case Folding</a> in [[[CHARMOD-NORM]]].</p>
<p><dfn class="lint-ignore export">Transcoder</dfn>. A process that converts
text between two character encodings. Most commonly in W3C internationalization documents it
refers to a process that converts from a [=legacy character encoding=] to a <a href="https://www.w3.org/TR/2005/REC-charmod-20050215/#Unicode_Encoding_Form">Unicode encoding form</a>, such as <a>UTF-8</a>.</p>
<p><dfn class="lint-ignore export">Transcription</dfn>. A <em>transcription</em> is likely to be more phonetically accurate than a [=transliteration=] (though usually still only reflects an approximation to the actual sound), but, in particular, it does not usually allow completely reversible conversions.</p>
<p><dfn class="lint-ignore export">Transliteration</dfn>. In a <em>transliteration</em> each native character is associated with an equivalent and unique Latin-script character. The transliteration may not accurately represent pronunciation, but does allow straightforward and reversible conversion between the two scripts. Compare with [=transcription=].</p>
<p><dfn class="lint-ignore">Typographic character unit</dfn>. <a href="https://www.w3.org/TR/css-text-4/#typographic-character-unit">CSS definition</a>: <q>the basic unit of text. Even within the realm of text layout, the relevant character unit depends on the operation. For example, line-breaking and letter-spacing will segment a sequence of Thai characters that include <span class="codepoint"><bdi lang="th"> ำ</bdi><span class="uname">U+0E33 THAI CHARACTER SARA AM</span></span> differently; or the behavior of a conjunct consonant in a script such as Devanagari may depend on the font in use. So the typographic character represents a unit of the writing system—such as a Latin alphabetic letter (including its diacritics), Hangul syllable, Chinese ideographic character, Myanmar syllable cluster—that is indivisible with respect to a particular typographic operation (line-breaking, first-letter effects, tracking, justification, vertical arrangement, etc.).</q></p>
<p><dfn class="lint-ignore">Typographic letter unit</dfn>. <a href="https://www.w3.org/TR/css-text-4/#typographic-letter-unit">CSS definition</a>: <q>a [=typographic character unit=] belonging to one of the Letter or Number [=general categories=].</q>
</p>
<a href="#links"><abbr title="Back to Alphabetical links">↑</abbr></a>
<div class="letter_anchor" id="u">U</div>
<p><dfn class="lint-ignore export">Unicameral</dfn> or <em>unicase</em>. <a href="https://unicode.org/glossary/#unicameral" target="_blank">Unicode definition</a>: <q>A script that has no case distinctions.</q></p>
<p><dfn data-lt="unicode bidi algorithm|bidi algorithm|UBA" class="lint-ignore export">Unicode Bidirectional Algorithm</dfn> or <em>Bidi algorithm</em>. The name for the rules described in the <a href="https://www.unicode.org/reports/tr9/" target="_blank">Unicode Standard Annex #9, “Unicode Bidirectional Algorithm”</a> [[UAX9]]. Those rules describe how inline [=bidirectional text=] should be rendered for scripts such as Arabic, Hebrew, Thaana, N'Ko, Adlam, etc. The effects of the bidi algorithm depend on the [=base direction=] and the directional properties of the characters to which it is applied.</p>
<p><dfn class="lint-ignore export">Unicode code point</dfn>. The numeric value assigned to each Unicode character. Unicode code points range from <code class="kw" translate="no">0</code> to <code class="kw" translate="no">0x10FFFF</code>. (See Section 4.1 in [[CHARMOD]] for a deeper discussion of character encoding terminology.) Unicode code points are denoted as <code class="kw" translate="no">U+<em>hhhh</em></code>, where <code class="kw" translate="no"><em>hhhh</em></code> is a sequence of at least four, and at most six hexadecimal digits. For example, the character <span class="codepoint"><bdi lang="en">€</bdi><span class="uname">U+20AC EURO SIGN</span></span> has the code point <span class="uname" translate="no">U+20AC</span>, while the character <span class="codepoint"><bdi lang="en">😺</bdi><span class="uname">U+1F63A SMILING CAT FACE WITH OPEN MOUTH</span></span> has the code point <span class="uname" translate="no">U+1F63A</span>.</p>
<p><dfn data-lt="unicode locale" class="lint-ignore export">Unicode Locale Identifier</dfn> or <em>Unicode Locale</em>. A [=language tag=] that follows the additional rules and restrictions on subtag choice defined in UTS #35 [[UAX35]]. Any valid Unicode locale identifier is also a [=valid=] [[BCP47]] [=language tag=], but a few [=valid language tags=] are not also valid Unicode locale identifiers.</p>
<p><dfn data-lt="scalar value|Unicode Scalar Value|USV" class="lint-ignore export">Unicode scalar value</dfn>. <a href="https://www.unicode.org/glossary/#unicode_scalar_value" target="_blank">Unicode definition</a>: <q>Any [=Unicode code point=] except high-surrogate and low-surrogate code points. In other words, the ranges of integers 0 to D7FF<sub>16</sub> and E000<sub>16</sub> to 10FFFF<sub>16</sub> inclusive. (See definition D76 in <a href="https://www.unicode.org/versions/latest/ch03.pdf#G7404" target="_blank">Section 3.9, Unicode Encoding Forms</a>.)</q></p>
<p><dfn data-lt="unicode" class="lint-ignore export">Universal Character Set</dfn> or <em>Unicode</em>. The character set or repertoire defined by the [[Unicode]] Standard and which includes all of the characters used to encode text, including historical or extinct writing systems as well as modern usage, private use, typesetting symbols, and other things, such as the emoji. Other [=character sets=] are subsets of Unicode.</p>
<p><dfn id="def_utc" data-lt="utc" class="lint-ignore export">Universal Coordinated Time (UTC)</dfn>. The basis for modern timekeeping. Among other things, it provides a common baseline for converting between [=incremental time=] and [=wall time=]. UTC is also known as GMT (Greenwich Mean Time). There are some subtle differences between the two, but none that the average person would notice.
The time zone offset for UTC is 0. UTC is often indicated in [=field-based formats=] using <code class="kw" translate="no">Z</code>.</p>
<p><dfn class="lint-ignore export">User-facing identifiers</dfn>. Identifiers defined by or assigned by a user in a [=vocabulary=] that is intended to be at least potentially visible to end-users (and thus is [=localizable text=]).</p>
<p><span class="alt_name">User-perceived character</span>. See [=grapheme=].</p>
<p><dfn class="lint-ignore export">User-supplied value</dfn>. Unreserved [=syntactic content=] in a [=vocabulary=] that is assigned by users, as distinct from reserved keywords in a given format or protocol. Users generally expect that their user-supplied values can be words or phrases in their preferred [=natural language=]. This is why [[CHARMOD]] recommends that "Specifications <em class="rfc2119">SHOULD NOT</em> arbitrarily exclude code points from the full range of Unicode code points from <span class="uname">U+0000</span> to <span class="uname">U+10FFFF</span> inclusive." [[CHARMOD-NORM]] gives <a href="https://www.w3.org/TR/charmod-norm/#dfn-user-supplied-value">some examples</a>. </p>
<!-- Do not export UTF-8 to avoid conflicts with Encoding -->
<p><dfn class="lint-ignore">UTF-8</dfn>. <a href="https://unicode.org/glossary/#UTF_8" target="_blank">Unicode definition</a>: <q>A multibyte encoding for text that represents each Unicode character with 1 to 4 bytes, and which is backward-compatible with ASCII. UTF-8 is the predominant form of [=Unicode=] in web pages. More technically: (1) The <a href="https://unicode.org/glossary/#utf_8_encoding_form" target="_blank"> UTF-8 encoding form</a>. (2) The <a href="https://unicode.org/glossary/#utf_8_encoding_scheme" target="_blank"> UTF-8 encoding scheme</a>. (3) “UCS Transformation Format 8,” defined in Annex D of ISO/IEC 10646:2003, technically equivalent to the definitions in the Unicode Standard.</q></p>
<!-- Do not export UTF-16 to avoid conflicts with Encoding -->
<p><dfn class="lint-ignore">UTF-16</dfn>. <a href="https://unicode.org/glossary/#UTF_16" target="_blank">Unicode definition</a>: <q>A multibyte encoding for text that represents each Unicode character with 2 or 4 bytes; it is not backward-compatible with ASCII. It is the internal form of [=Unicode=] in many programming languages, such as Java, C#, and JavaScript, and in many operating systems. More technically: (1) The <a href="https://unicode.org/glossary/#utf_16_encoding_form" target="_blank">UTF-16 encoding form</a>. (2) The <a href="https://unicode.org/glossary/#utf_16_encoding_scheme" target="_blank"> UTF-16 encoding scheme</a>. (3) "Transformation format for 16 planes of Group 00," defined in Annex C of ISO/IEC 10646:2003; technically equivalent to the definitions in the Unicode Standard.</q></p>
<a href="#links"><abbr title="Back to Alphabetical links">↑</abbr></a>
<div class="letter_anchor" id="v">V</div>
<p><dfn class="lint-ignore export" data-lt="valid">Valid language tag</dfn>. A language tag that is [=well-formed=] and which also conforms to the additional <a href="https://www.rfc-editor.org/rfc/rfc5646.html#section-2.2.9">conformance requirements</a> in [[BCP47]], notably that each of the subtags appears in the IANA Language Subtag Registry. Contrast this with [=well-formed=] language tags.</p>
<p><dfn class="lint-ignore export">Variation selector</dfn>. <a href="https://unicode.org/glossary/#variation_selector" target="_blank">Unicode definition</a>: <q>Any of three ranges of Unicode characters designated for use in defining a <i><a href="https://unicode.org/glossary/#variation_sequence">variation sequence</a></i>. Variation selectors in the range U+FE00..U+FE0F are known as <i>general-use</i> variation selectors and are used for <a href="https://unicode.org/glossary/#standardized_variation_sequence"><em>standardized variation sequences</em></a>. Two of these, U+FE0E and U+FE0F, have specialized functions when used with <i>emoji</i> base characters. Variation selectors in the range U+180B..U+180D are known as <i><a href="https://unicode.org/glossary/#mongolian_free_variation_selector">Mongolian Free Variation Selector</a></i>; their use is limited to standardized variation sequences for the Mongolian script. Variation selectors in the range U+E0100..U+E01EF are known as <i>ideographic</i> variation selectors and are used for <a href="https://unicode.org/glossary/#ideographic_variation_sequence"><em>ideographic variation sequences</em></a>. Variation selectors are all nonspacing combining marks (General_Category=Mn). They have no graphic shape of their own; instead they function to pick out a particular, defined subset of potential graphic presentations for the base character to which they are applied. All variation selectors are <i><a href="https://unicode.org/glossary/#default_ignorable">default ignorable</a></i> code points (DICP=Yes), meaning that if they are not interpretable in combination with their base character, they should be ignored for display, rather than shown with a nondisplayable glyph box, for example. See <a href="https://www.unicode.org/versions/latest/ch23.pdf#G19053">Section 23.4, Variation Selectors</a>. The term variation selector is sometimes abbreviated as 'VS'.</q></p>
<p><dfn class="lint-ignore export">Virama</dfn>. <a href="https://www.unicode.org/glossary/#virama" target="_blank">Unicode definition</a>: <q>From Sanskrit. The name of a sign used in many Indic and other Brahmi-derived scripts to suppress the inherent vowel of the consonant to which it is applied, thereby generating a dead consonant. (See <a href="https://www.unicode.org/versions/latest/ch12.pdf#G12284" target="_blank">Section 12.1, Devanagari</a>.) The sign varies in shape from script to script, and may be known by other names in various languages.</q> It may also be visible or hidden in consonant clusters, depending on the language and context. <a href="https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3AInSC%3DVirama%3A%5D&g=&i=" target="_blank">Used for</a> scripts such as Devanagari, Bengali, Tamil, Balinese, etc. <!--See also <a href="#invisiblestacker">invisible stacker</a>, and <a href="#purekiller">pure killer</a--></p>
<p><dfn class="lint-ignore export">Vocabulary</dfn>. The list of reserved keywords and/or rules for assigning [=user-supplied values=] (such as identifiers) in a format or protocol. This can include restrictions on range, order, or type of characters that can appear in different places. For example, HTML defines the names of its elements and attributes, as well as enumerated attribute values, which defines the "vocabulary" of HTML [=syntactic content=]. Another example would be ECMAScript, which restricts the range of characters that can appear at the start or in the body of an identifier or variable name. It applies different rules for other cases, such as to the values of string literals. Values within a vocabulary fall into two broad classes: those that are meant to be seen, read, or interacted with by humans (and thus might be expected to contain natural language text); and those that are application or protocol internal and not intended for human interaction.</p>
<a href="#links"><abbr title="Back to Alphabetical links">↑</abbr></a>
<div class="letter_anchor" id="w">W</div>
<p><dfn class="lint-ignore export">Wall time</dfn> or <dfn class="lint-ignore export">local time</dfn>. A moment in time that can be mapped to a specific point in [=incremental time=] if you apply any relevant time zone information, but it corresponds to what a person would recognise the time to be if they looked at a clock and/or calendar mounted on a wall in a particular place. So, for example, the time displayed by a computer in the UK may be <samp>Sat 11 Jun 06:10</samp>. By applying knowledge about how that time relates to [=UTC=] (in this case, adjusting by one hour to account for British Summer Time) it is possible to convert that to the incremental time <samp>1465621816590</samp>. It's also possible to convert that to a wall time in another location, such as San Francisco, where someone looking at their computer's time display at the same time would have seen <samp>Fri 10 Jun 22:10</samp>.</p>
<p><dfn class="lint-ignore export" data-lt="well-formed">Well-formed language tag</dfn>. A language tag that follows the grammar defined in [[BCP47]]. That is, it is structurally correct, consisting of ASCII letters and digit [=language subtags=] of the prescribed length, separated by hyphens. Contrast this with [=valid language tag=].</p>
<p><dfn class="lint-ignore">Word boundary</dfn> or <dfn class="lint-ignore export">Word</dfn>. The concept of 'word' is difficult to define in any language (see <a href="https://www.w3.org/International/articles/typography/linebreak.en#whatisword">What is a word?</a>). In these definitions, a word is an often vaguely-defined, but recognisable semantic unit that is typically smaller than a phrase and may comprise one or more syllables. Word boundaries are typically important for text operations such as line-breaking, and for prosodic and phonetic rules.</p>
<p><dfn class="lint-ignore">Writing system</dfn>. <a href="https://www.unicode.org/glossary/#writing_system" target="_blank">Unicode definition</a>: <q>A set of rules for using one or more scripts to write a particular language. Examples include the American English writing system, the British English writing system, the French writing system, and the Japanese writing system.</q> See also [=script=].</p>
<a href="#links"><abbr title="Back to Alphabetical links">↑</abbr></a>
<div class="letter_anchor" id="z">Z</div>
<p><dfn class="lint-ignore export">Zone offset</dfn>. An amount that is added to or subtracted from [=UTC=] based on the location of the event around the world relative to the prime meridian. Usually offsets are at one-hour intervals, but offsets can also include other differences, such as 30 or 45 minutes. A common way to express a zone offset in field-based formats is with +/- followed by the offset. So for example, Japan is 9 hours ahead of UTC, so you may see a time written as <samp>2016-06-11 05:10+09:00</samp>. Note, importantly, that the zone offset does not help you convert times to wall time where [=daylight savings time=] is in force.</p>
<a href="#links"><abbr title="Back to Alphabetical links">↑</abbr></a>
</section>
<section class="appendix" id="how-to-use">
<h3 id="howto">How to use this glossary</h3>
<p>If you are writing a W3C specification, you can use the terminology in this document directly by importing this document as a cross-reference source. This may be especially helpful when using <a>character encoding</a> or <a>locale</a>-related terms found here. Such terms are common to the needs of many specifications and often refer to other definitions found in this glossary.</p>
<p>This glossary is a Working Group NOTE, which means that the definitions found in it are, by definition, non-normative. In most cases, most specifications do not need the definition of terms found here to be formally normative. Using the definitions found here lends clarity when using specialized terminology, but doesn't affect requirements directly and in those cases copying these definitions into your specification won't add value to your readers.</p>
<p>However, you might find that your specification needs a normative dependency on a definition. In those cases, you should copy the definition to your own document (perhaps linking back to this glossary as a source). Slight adjustments can be made to the definitions to fit local circumstances. Please contact the Internationalization Working Group with any questions or concerns when doing this.</p>
<section id="infra-rel">
<h4>Relationship to the Infra Standard</h4>
<p>This document is meant to be used in conjunction with formal, normative definitions found in [[INFRA]]. When a term is defined in both documents, the [[INFRA]] version SHOULD be preferred. The definitions in both documents are maintained to be consistent with one another. Note that [[INFRA]]'s definitions may be used as normative.</p>
<p>The I18N Glossary has many more terms specific to internationalization than are to be found in Infra, including many terms useful in defining or discussing the handling of text.</p>
</section>
<section id="respec-how-to-use">
<h4>ReSpec</h4>
<p>If you are using <a href="https://respec.org">ReSpec</a>, you can import this glossary using the <kbd><a href="https://respec.org/docs/#xref">xref</a></kbd> keyword in your configuration block. Complete instructions can be found in the ReSpec documentation <a href="https://respec.org/docs/#references-0">here</a>.</p>
<p>For this document, the <code>xref</code> configuration will look something like this:</p>
<pre>
xref: [ "i18n-glossary" ],
</pre>
<p>Adding the above configuration allows you to write references to the terms found in this glossary using the normal ReSpec notation (<code>&lt;a&gt;</code> tags or <code>[​=term=​]</code> markup).</p>
<p>If you use a terminology definition inside of a normative statement, ReSpec will complain that you've used an informative reference inside of a normative statement. The I18N Glossary is maintained as a WG Note, rather than on the REC-track, so it is formally informative. Suppressing this warning is only necessary when you reference a term from a normative statement.</p>
<p>You can suppress the warning by including this directive in your configuration block:</p>
<pre>lint: { "informative-dfn": false },
</pre>
</section>
<section id="bikeshed-how">
<h4>Bikeshed</h4>
<p>If you are using <a href="https://github.com/speced/bikeshed">Bikeshed</a> to generate your specification, you can import this glossary using a <code translate="no">spec</code> directive that looks like this:</p>
<pre>spec: i18n-glossary; urlPrefix: https://www.w3.org/TR/i18n-glossary/</pre>
<p>All of the references in the glossary are exported, so you should be able to refer to terms in the glossary by using markup that looks like <code translate="yes">[=<em>term</em>=]</code>. In rare cases, you may need to include specific directives following the <code translate="no">spec</code> directive to ensure that the definition is found during document processing. Here is an example using the term <a>locale</a>:</p>
<pre>
spec: i18n-glossary; urlPrefix: https://www.w3.org/TR/i18n-glossary/
type: dfn
text: locale; url: locale
</pre>
</section>
</section>
</body>
</html>