|
80 | 80 | \end{note}
|
81 | 81 | If an input file is determined to be a UTF-8 file,
|
82 | 82 | then it shall be a well-formed UTF-8 code unit sequence and
|
83 |
| -it is decoded to produce a sequence of UCS scalar values |
84 |
| -that constitutes the sequence of elements of the translation character set. |
| 83 | +it is decoded to produce a sequence of Unicode scalar values. |
| 84 | +A sequence of translation character set elements is then formed |
| 85 | +by mapping each Unicode scalar value |
| 86 | +to the corresponding translation character set element. |
85 | 87 | In the resulting sequence,
|
86 | 88 | each pair of characters in the input sequence consisting of
|
87 | 89 | \unicode{000d}{carriage return} followed by \unicode{000a}{line feed},
|
|
242 | 244 | The \defnadj{translation}{character set} consists of the following elements:
|
243 | 245 | \begin{itemize}
|
244 | 246 | \item
|
245 |
| -each character named by ISO/IEC 10646, |
246 |
| -as identified by its unique UCS scalar value, and |
| 247 | +each abstract character assigned a code point in the Unicode codespace, and |
247 | 248 | \item
|
248 |
| -a distinct character for each UCS scalar value |
249 |
| -where no named character is assigned. |
| 249 | +a distinct character for each Unicode scalar value |
| 250 | +not assigned to an abstract character. |
250 | 251 | \end{itemize}
|
251 | 252 | \begin{note}
|
252 |
| -ISO/IEC 10646 code points are integers |
| 253 | +Unicode code points are integers |
253 | 254 | in the range $[0, \mathrm{10FFFF}]$ (hexadecimal).
|
254 | 255 | A surrogate code point is a value
|
255 | 256 | in the range $[\mathrm{D800}, \mathrm{DFFF}]$ (hexadecimal).
|
256 |
| -A UCS scalar value is any code point that is not a surrogate code point. |
| 257 | +A Unicode scalar value is any code point that is not a surrogate code point. |
257 | 258 | \end{note}
|
258 | 259 |
|
259 | 260 | \pnum
|
|
353 | 354 | \tcode{\textbackslash U} \grammarterm{hex-quad} \grammarterm{hex-quad}, or
|
354 | 355 | \tcode{\textbackslash u\{\grammarterm{simple-hexadecimal-digit-sequence}\}}
|
355 | 356 | designates the character in the translation character set
|
356 |
| -whose UCS scalar value is the hexadecimal number represented by |
| 357 | +whose Unicode scalar value is the hexadecimal number represented by |
357 | 358 | the sequence of \grammarterm{hexadecimal-digit}s
|
358 | 359 | in the \grammarterm{universal-character-name}.
|
359 |
| -The program is ill-formed if that number is not a UCS scalar value. |
| 360 | +The program is ill-formed if that number is not a Unicode scalar value. |
360 | 361 |
|
361 | 362 | \pnum
|
362 | 363 | A \grammarterm{universal-character-name}
|
363 | 364 | that is a \grammarterm{named-universal-character}
|
364 |
| -designates the character named by its \grammarterm{n-char-sequence}. |
365 |
| -A character is so named if the \grammarterm{n-char-sequence} is equal to |
366 |
| -\begin{itemize} |
367 |
| -\item |
368 |
| -the associated character name or associated character name alias |
369 |
| -specified in ISO/IEC 10646 subclause ``Code charts and lists of character names'' |
370 |
| -or |
371 |
| -\item |
372 |
| -the control code alias given in \tref{lex.charset.ucn}. |
| 365 | +designates the corresponding character |
| 366 | +in the Unicode Standard (chapter 4.8, ``Name'') |
| 367 | +if the \grammarterm{n-char-sequence} is equal |
| 368 | +to its character name or |
| 369 | +to one of its character name aliases of |
| 370 | +type ``control'', ``correction'', or ``alternate''; |
| 371 | +otherwise, the program is ill-formed. |
373 | 372 | \begin{note}
|
374 |
| -The aliases in \tref{lex.charset.ucn} are provided for control characters |
375 |
| -which otherwise have no associated character name or character name alias. |
376 |
| -These names are derived from |
| 373 | +These aliases are listed in |
377 | 374 | the Unicode Character Database's \tcode{NameAliases.txt}.
|
378 |
| -For historical reasons, control characters are formally unnamed. |
379 |
| -\end{note} |
380 |
| -\end{itemize} |
381 |
| -\begin{note} |
382 |
| -None of the associated character names, |
383 |
| -associated character name aliases, or |
384 |
| -control code aliases |
385 |
| -have leading or trailing spaces. |
| 375 | +None of these names or aliases have leading or trailing spaces. |
386 | 376 | \end{note}
|
387 | 377 |
|
388 |
| -\begin{multicolfloattable}{Control code aliases}{lex.charset.ucn}{ll} |
389 |
| -\unicode{0000}{null} \\ |
390 |
| -\unicode{0001}{start of heading} \\ |
391 |
| -\unicode{0002}{start of text} \\ |
392 |
| -\unicode{0003}{end of text} \\ |
393 |
| -\unicode{0004}{end of transmission} \\ |
394 |
| -\unicode{0005}{enquiry} \\ |
395 |
| -\unicode{0006}{acknowledge} \\ |
396 |
| -\unicode{0007}{alert} \\ |
397 |
| -\unicode{0008}{backspace} \\ |
398 |
| -\unicode{0009}{character tabulation} \\ |
399 |
| -\unicode{0009}{horizontal tabulation} \\ |
400 |
| -\unicode{000a}{line feed} \\ |
401 |
| -\unicode{000a}{new line} \\ |
402 |
| -\unicode{000a}{end of line} \\ |
403 |
| -\unicode{000b}{line tabulation} \\ |
404 |
| -\unicode{000b}{vertical tabulation} \\ |
405 |
| -\unicode{000c}{form feed} \\ |
406 |
| -\unicode{000d}{carriage return} \\ |
407 |
| -\unicode{000e}{shift out} \\ |
408 |
| -\unicode{000e}{locking-shift one} \\ |
409 |
| -\unicode{000f}{shift in} \\ |
410 |
| -\unicode{000f}{locking-shift zero} \\ |
411 |
| -\unicode{0010}{data link escape} \\ |
412 |
| -\unicode{0011}{device control one} \\ |
413 |
| -\unicode{0012}{device control two} \\ |
414 |
| -\unicode{0013}{device control three} \\ |
415 |
| -\unicode{0014}{device control four} \\ |
416 |
| -\unicode{0015}{negative acknowledge} \\ |
417 |
| -\unicode{0016}{synchronous idle} \\ |
418 |
| -\unicode{0017}{end of transmission block} \\ |
419 |
| -\unicode{0018}{cancel} \\ |
420 |
| -\unicode{0019}{end of medium} \\ |
421 |
| -\unicode{001a}{substitute} \\ |
422 |
| -\unicode{001b}{escape} \\ |
423 |
| -\unicode{001c}{information separator four} \\ |
424 |
| -\unicode{001c}{file separator} \\ |
425 |
| -\unicode{001d}{information separator three} \\ |
426 |
| -\unicode{001d}{group separator} \\ |
427 |
| -\unicode{001e}{information separator two} \\ |
428 |
| -\unicode{001e}{record separator} \\ |
429 |
| -\unicode{001f}{information separator one} \\ |
430 |
| -\unicode{001f}{unit separator} \\ |
431 |
| -\columnbreak |
432 |
| -\unicode{007f}{delete} \\ |
433 |
| -\unicode{0082}{break permitted here} \\ |
434 |
| -\unicode{0083}{no break here} \\ |
435 |
| -\unicode{0084}{index} \\ |
436 |
| -\unicode{0085}{next line} \\ |
437 |
| -\unicode{0086}{start of selected area} \\ |
438 |
| -\unicode{0087}{end of selected area} \\ |
439 |
| -\unicode{0088}{character tabulation set} \\ |
440 |
| -\unicode{0088}{horizontal tabulation set} \\ |
441 |
| -\unicode{0089}{character tabulation with justification} \\ |
442 |
| -\unicode{0089}{horizontal tabulation with justification} \\ |
443 |
| -\unicode{008a}{line tabulation set} \\ |
444 |
| -\unicode{008a}{vertical tabulation set} \\ |
445 |
| -\unicode{008b}{partial line forward} \\ |
446 |
| -\unicode{008b}{partial line down} \\ |
447 |
| -\unicode{008c}{partial line backward} \\ |
448 |
| -\unicode{008c}{partial line up} \\ |
449 |
| -\unicode{008d}{reverse line feed} \\ |
450 |
| -\unicode{008d}{reverse index} \\ |
451 |
| -\unicode{008e}{single shift two} \\ |
452 |
| -\unicode{008e}{single-shift-2} \\ |
453 |
| -\unicode{008f}{single shift three} \\ |
454 |
| -\unicode{008f}{single-shift-3} \\ |
455 |
| -\unicode{0090}{device control string} \\ |
456 |
| -\unicode{0091}{private use one} \\ |
457 |
| -\unicode{0091}{private use-1} \\ |
458 |
| -\unicode{0092}{private use two} \\ |
459 |
| -\unicode{0092}{private use-2} \\ |
460 |
| -\unicode{0093}{set transmit state} \\ |
461 |
| -\unicode{0094}{cancel character} \\ |
462 |
| -\unicode{0095}{message waiting} \\ |
463 |
| -\unicode{0096}{start of guarded area} \\ |
464 |
| -\unicode{0096}{start of protected area} \\ |
465 |
| -\unicode{0097}{end of guarded area} \\ |
466 |
| -\unicode{0097}{end of protected area} \\ |
467 |
| -\unicode{0098}{start of string} \\ |
468 |
| -\unicode{009a}{single character introducer} \\ |
469 |
| -\unicode{009b}{control sequence introducer} \\ |
470 |
| -\unicode{009c}{string terminator} \\ |
471 |
| -\unicode{009d}{operating system command} \\ |
472 |
| -\unicode{009e}{privacy message} \\ |
473 |
| -\unicode{009f}{application program command} \\ |
474 |
| -\end{multicolfloattable} |
475 |
| - |
476 | 378 | \pnum
|
477 | 379 | If a \grammarterm{universal-character-name} outside
|
478 | 380 | the \grammarterm{c-char-sequence}, \grammarterm{s-char-sequence}, or
|
|
491 | 393 | The \defnadj{basic literal}{character set} consists of
|
492 | 394 | all characters of the basic character set,
|
493 | 395 | plus the control characters specified in \tref{lex.charset.literal}.
|
494 |
| -\begin{note} |
495 |
| -The alias \uname{bell} for \ucode{0007} shown in ISO 10646 |
496 |
| -is ambiguous with \unicode{1f514}{bell}. |
497 |
| -\end{note} |
498 | 396 |
|
499 | 397 | \begin{floattable}{Additional control characters in the basic literal character set}{lex.charset.literal}{ll}
|
500 | 398 | \topline
|
|
544 | 442 | \indextext{UTF-16}%
|
545 | 443 | \indextext{UTF-32}%
|
546 | 444 | For a UTF-8, UTF-16, or UTF-32 literal,
|
547 |
| -the UCS scalar value |
| 445 | +the Unicode scalar value |
548 | 446 | corresponding to each character of the translation character set
|
549 |
| -is encoded as specified in ISO/IEC 10646 for the respective UCS encoding form. |
| 447 | +is encoded as specified in the Unicode Standard |
| 448 | +for the respective Unicode encoding form. |
550 | 449 | \indextext{character set|)}
|
551 | 450 |
|
552 | 451 | \rSec1[lex.pptoken]{Preprocessing tokens}
|
|
887 | 786 | \begin{bnf}
|
888 | 787 | \nontermdef{identifier-start}\br
|
889 | 788 | nondigit\br
|
890 |
| - \textnormal{an element of the translation character set of class XID_Start} |
| 789 | + \textnormal{an element of the translation character set with the Unicode property XID_Start} |
891 | 790 | \end{bnf}
|
892 | 791 |
|
893 | 792 | \begin{bnf}
|
894 | 793 | \nontermdef{identifier-continue}\br
|
895 | 794 | digit\br
|
896 | 795 | nondigit\br
|
897 |
| - \textnormal{an element of the translation character set of class XID_Continue} |
| 796 | + \textnormal{an element of the translation character set with the Unicode property XID_Continue} |
898 | 797 | \end{bnf}
|
899 | 798 |
|
900 | 799 | \begin{bnf}
|
|
913 | 812 | \pnum
|
914 | 813 | \indextext{name!length of}%
|
915 | 814 | \indextext{name}%
|
916 |
| -The character classes XID_Start and XID_Continue |
917 |
| -are Derived Core Properties as described by \UAX{44}. |
| 815 | +\begin{note} |
| 816 | +The character properties XID_Start and XID_Continue are Derived Core Properties |
| 817 | +as described by \UAX{44} of the Unicode Standard. |
918 | 818 | \begin{footnote}
|
919 | 819 | On systems in which linkers cannot accept extended
|
920 | 820 | characters, an encoding of the \grammarterm{universal-character-name} can be used in
|
|
925 | 825 | place a translation limit on significant characters for external
|
926 | 826 | identifiers.
|
927 | 827 | \end{footnote}
|
| 828 | +\end{note} |
928 | 829 | The program is ill-formed
|
929 | 830 | if an \grammarterm{identifier} does not conform to
|
930 |
| -Normalization Form C as specified in ISO/IEC 10646. |
| 831 | +Normalization Form C as specified in the Unicode Standard. |
931 | 832 | \begin{note}
|
932 | 833 | Identifiers are case-sensitive.
|
933 | 834 | \end{note}
|
|
2099 | 2000 | \impldef{code unit sequence for non-representable \grammarterm{string-literal}}
|
2100 | 2001 | code unit sequence is encoded.
|
2101 | 2002 | \begin{note}
|
2102 |
| -No character lacks representation in any of the UCS encoding forms. |
| 2003 | +No character lacks representation in any Unicode encoding form. |
2103 | 2004 | \end{note}
|
2104 | 2005 | When encoding a stateful character encoding,
|
2105 | 2006 | implementations should encode the first such sequence
|
|
0 commit comments