From d93354898e4908d4bdc4738cbf42dfb9a6671775 Mon Sep 17 00:00:00 2001 From: Jens Maurer Date: Wed, 15 Feb 2023 22:49:30 +0100 Subject: [PATCH] P2736R2 Referencing The Unicode Standard In [lex.name], the paper missed a change of the original term "character classes" to "character properties"; that change is included. Fixes NB FR 133, FR 013 (C++23 CD). --- source/back.tex | 18 ----- source/future.tex | 17 ++--- source/intro.tex | 24 +----- source/iostreams.tex | 4 +- source/lex.tex | 161 ++++++++-------------------------------- source/preprocessor.tex | 12 ++- source/utilities.tex | 19 ++--- 7 files changed, 56 insertions(+), 199 deletions(-) diff --git a/source/back.tex b/source/back.tex index 40954fd628..867ae7dfac 100644 --- a/source/back.tex +++ b/source/back.tex @@ -18,24 +18,6 @@ \chapter{Bibliography} Programming languages, their environments, and system software interfaces --- Floating-point extensions for C --- Part 3: Interchange and extended types} % Other international standards. -\item - %%% Format for the following entry is based on that specified at - %%% http://www.iec.ch/standardsdev/resources/draftingpublications/directives/principles/referencing.htm - The Unicode Consortium. Unicode Standard Annex, \UAX{29}, - \doccite{Unicode Text Segmentation} [online]. - Edited by Mark Davis. Revision 35; issued for Unicode 12.0.0. 2019-02-15 [viewed 2020-02-23]. - Available from: \url{http://www.unicode.org/reports/tr29/tr29-35.html} -\item - The Unicode Consortium. Unicode Standard Annex, \UAX{31}, - \doccite{Unicode Identifier and Pattern Syntax} [online]. - Edited by Mark Davis. Revision 33; issued for Unicode 13.0.0. - 2020-02-13 [viewed 2021-06-08]. - Available from: \url{https://www.unicode.org/reports/tr31/tr31-33.html} -\item - The Unicode Standard Version 14.0, - \doccite{Core Specification}. - Unicode Consortium, ISBN 978-1-936213-29-0, copyright \copyright 2021 Unicode, Inc. - Available from: \url{https://www.unicode.org/versions/Unicode14.0.0/UnicodeStandard-14.0.pdf} \item IANA Time Zone Database. Available from: \url{https://www.iana.org/time-zones} diff --git a/source/future.tex b/source/future.tex index d1a90c64b0..ba9fc37ef5 100644 --- a/source/future.tex +++ b/source/future.tex @@ -2045,6 +2045,10 @@ If \tcode{(Mode \& little_endian)}, the facet shall generate a multibyte sequence in little-endian order, as opposed to the default big-endian order. +\item + UCS-2 is the same encoding as UTF-16, + except that it encodes scalar values in the range + \ucode{0000}--\ucode{ffff} (Basic Multilingual Plane) only. \end{itemize} \pnum @@ -2055,8 +2059,7 @@ \begin{itemize} \item The facet shall convert between UTF-8 multibyte sequences - and UCS-2 or UTF-32 (depending on the size of \tcode{Elem}) - within the program. + and UCS-2 or UTF-32 (depending on the size of \tcode{Elem}). \item Endianness shall not affect how multibyte sequences are read or written. \item @@ -2071,8 +2074,7 @@ \begin{itemize} \item The facet shall convert between UTF-16 multibyte sequences - and UCS-2 or UTF-32 (depending on the size of \tcode{Elem}) - within the program. + and UCS-2 or UTF-32 (depending on the size of \tcode{Elem}). \item Multibyte sequences shall be read or written according to the \tcode{Mode} flag, as set out above. @@ -2095,13 +2097,6 @@ The multibyte sequences may be written as either a text or a binary file. \end{itemize} -\pnum -The encoding forms UTF-8, UTF-16, and UTF-32 are specified in ISO/IEC 10646. -The encoding form UCS-2 is specified in ISO/IEC 10646:2003. -\begin{footnote} -Cancelled and replaced by ISO/IEC 10646:2017. -\end{footnote} - \rSec1[depr.conversions]{Deprecated convenience conversion interfaces} \rSec2[depr.conversions.general]{General} diff --git a/source/intro.tex b/source/intro.tex index d8c059d829..c247ada66b 100644 --- a/source/intro.tex +++ b/source/intro.tex @@ -51,14 +51,6 @@ Operating System Interface (POSIX), Technical Corrigendum 1} \item ISO/IEC/IEEE 9945:2009/Cor 2:2017, \doccite{Information Technology --- Portable Operating System Interface (POSIX), Technical Corrigendum 2} -\item ISO/IEC 10646, \doccite{Information technology --- -Universal Coded Character Set (UCS)} -\item ISO/IEC 10646:2003, -\begin{footnote} -Cancelled and replaced by ISO/IEC 10646:2017. -\end{footnote} -\doccite{Information technology --- -Universal Multiple-Octet Coded Character Set (UCS)} \item ISO/IEC/IEEE 60559:2020, \doccite{Information technology --- Microprocessor Systems --- Floating-Point arithmetic} \item ISO 80000-2:2009, \doccite{Quantities and units --- @@ -75,14 +67,8 @@ Language Specification}, Standard Ecma-262, third edition, 1999. \item -The Unicode Consortium. -Unicode Standard Annex, \UAX{44}, \doccite{Unicode Character Database}. -Edited by Ken Whistler and Lauren\c{t}iu Iancu. -Available from: \url{http://www.unicode.org/reports/tr44/} -\item -The Unicode Consortium. -The Unicode Standard, \doccite{Derived Core Properties}. -Available from: \url{https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt} +The Unicode Consortium. \doccite{The Unicode Standard}. +Available from: \url{https://www.unicode.org/versions/latest/} \end{itemize} \pnum @@ -104,12 +90,6 @@ hereinafter called \defn{ECMA-262}. \indextext{references!normative|)} -\pnum -\begin{note} -References to ISO/IEC 10646:2003 are used only -to support deprecated features\iref{depr.locale.stdcvt}. -\end{note} - \rSec0[intro.defs]{Terms and definitions} \pnum diff --git a/source/iostreams.tex b/source/iostreams.tex index b8271e70db..f2066de816 100644 --- a/source/iostreams.tex +++ b/source/iostreams.tex @@ -6850,7 +6850,7 @@ if invoking the native Unicode API requires transcoding, implementations should substitute invalid code units with \unicode{fffd}{replacement character} per -The Unicode Standard Version 14.0 - Core Specification, Chapter 3.9. +the Unicode Standard, Chapter 3.9 \ucode{fffd} Substitution in Conversion. \end{itemdescr} \rSec3[ostream.unformatted]{Unformatted output functions} @@ -7786,7 +7786,7 @@ If invoking the native Unicode API requires transcoding, implementations should substitute invalid code units with \unicode{fffd}{replacement character} per -The Unicode Standard Version 14.0 - Core Specification, Chapter 3.9. +the Unicode Standard, Chapter 3.9 \ucode{fffd} Substitution in Conversion. \end{itemdescr} \indexlibraryglobal{vprint_nonunicode}% diff --git a/source/lex.tex b/source/lex.tex index 8647edfbee..022a2ac9ad 100644 --- a/source/lex.tex +++ b/source/lex.tex @@ -80,8 +80,10 @@ \end{note} If an input file is determined to be a UTF-8 file, then it shall be a well-formed UTF-8 code unit sequence and -it is decoded to produce a sequence of UCS scalar values -that constitutes the sequence of elements of the translation character set. +it is decoded to produce a sequence of Unicode scalar values. +A sequence of translation character set elements is then formed +by mapping each Unicode scalar value +to the corresponding translation character set element. In the resulting sequence, each pair of characters in the input sequence consisting of \unicode{000d}{carriage return} followed by \unicode{000a}{line feed}, @@ -244,18 +246,17 @@ The \defnadj{translation}{character set} consists of the following elements: \begin{itemize} \item -each character named by ISO/IEC 10646, -as identified by its unique UCS scalar value, and +each abstract character assigned a code point in the Unicode codespace, and \item -a distinct character for each UCS scalar value -where no named character is assigned. +a distinct character for each Unicode scalar value +not assigned to an abstract character. \end{itemize} \begin{note} -ISO/IEC 10646 code points are integers +Unicode code points are integers in the range $[0, \mathrm{10FFFF}]$ (hexadecimal). A surrogate code point is a value in the range $[\mathrm{D800}, \mathrm{DFFF}]$ (hexadecimal). -A UCS scalar value is any code point that is not a surrogate code point. +A Unicode scalar value is any code point that is not a surrogate code point. \end{note} \pnum @@ -355,126 +356,27 @@ \tcode{\textbackslash U} \grammarterm{hex-quad} \grammarterm{hex-quad}, or \tcode{\textbackslash u\{\grammarterm{simple-hexadecimal-digit-sequence}\}} designates the character in the translation character set -whose UCS scalar value is the hexadecimal number represented by +whose Unicode scalar value is the hexadecimal number represented by the sequence of \grammarterm{hexadecimal-digit}s in the \grammarterm{universal-character-name}. -The program is ill-formed if that number is not a UCS scalar value. +The program is ill-formed if that number is not a Unicode scalar value. \pnum A \grammarterm{universal-character-name} that is a \grammarterm{named-universal-character} -designates the character named by its \grammarterm{n-char-sequence}. -A character is so named if the \grammarterm{n-char-sequence} is equal to -\begin{itemize} -\item -the associated character name or associated character name alias -specified in ISO/IEC 10646 subclause ``Code charts and lists of character names'' -or -\item -the control code alias given in \tref{lex.charset.ucn}. +designates the corresponding character +in the Unicode Standard (chapter 4.8 Name) +if the \grammarterm{n-char-sequence} is equal +to its character name or +to one of its character name aliases of +type ``control'', ``correction'', or ``alternate''; +otherwise, the program is ill-formed. \begin{note} -The aliases in \tref{lex.charset.ucn} are provided for control characters -which otherwise have no associated character name or character name alias. -These names are derived from +These aliases are listed in the Unicode Character Database's \tcode{NameAliases.txt}. -For historical reasons, control characters are formally unnamed. -\end{note} -\end{itemize} -\begin{note} -None of the associated character names, -associated character name aliases, or -control code aliases -have leading or trailing spaces. +None of these names or aliases have leading or trailing spaces. \end{note} -\begin{multicolfloattable}{Control code aliases}{lex.charset.ucn}{ll} -\unicode{0000}{null} \\ -\unicode{0001}{start of heading} \\ -\unicode{0002}{start of text} \\ -\unicode{0003}{end of text} \\ -\unicode{0004}{end of transmission} \\ -\unicode{0005}{enquiry} \\ -\unicode{0006}{acknowledge} \\ -\unicode{0007}{alert} \\ -\unicode{0008}{backspace} \\ -\unicode{0009}{character tabulation} \\ -\unicode{0009}{horizontal tabulation} \\ -\unicode{000a}{line feed} \\ -\unicode{000a}{new line} \\ -\unicode{000a}{end of line} \\ -\unicode{000b}{line tabulation} \\ -\unicode{000b}{vertical tabulation} \\ -\unicode{000c}{form feed} \\ -\unicode{000d}{carriage return} \\ -\unicode{000e}{shift out} \\ -\unicode{000e}{locking-shift one} \\ -\unicode{000f}{shift in} \\ -\unicode{000f}{locking-shift zero} \\ -\unicode{0010}{data link escape} \\ -\unicode{0011}{device control one} \\ -\unicode{0012}{device control two} \\ -\unicode{0013}{device control three} \\ -\unicode{0014}{device control four} \\ -\unicode{0015}{negative acknowledge} \\ -\unicode{0016}{synchronous idle} \\ -\unicode{0017}{end of transmission block} \\ -\unicode{0018}{cancel} \\ -\unicode{0019}{end of medium} \\ -\unicode{001a}{substitute} \\ -\unicode{001b}{escape} \\ -\unicode{001c}{information separator four} \\ -\unicode{001c}{file separator} \\ -\unicode{001d}{information separator three} \\ -\unicode{001d}{group separator} \\ -\unicode{001e}{information separator two} \\ -\unicode{001e}{record separator} \\ -\unicode{001f}{information separator one} \\ -\unicode{001f}{unit separator} \\ -\columnbreak -\unicode{007f}{delete} \\ -\unicode{0082}{break permitted here} \\ -\unicode{0083}{no break here} \\ -\unicode{0084}{index} \\ -\unicode{0085}{next line} \\ -\unicode{0086}{start of selected area} \\ -\unicode{0087}{end of selected area} \\ -\unicode{0088}{character tabulation set} \\ -\unicode{0088}{horizontal tabulation set} \\ -\unicode{0089}{character tabulation with justification} \\ -\unicode{0089}{horizontal tabulation with justification} \\ -\unicode{008a}{line tabulation set} \\ -\unicode{008a}{vertical tabulation set} \\ -\unicode{008b}{partial line forward} \\ -\unicode{008b}{partial line down} \\ -\unicode{008c}{partial line backward} \\ -\unicode{008c}{partial line up} \\ -\unicode{008d}{reverse line feed} \\ -\unicode{008d}{reverse index} \\ -\unicode{008e}{single shift two} \\ -\unicode{008e}{single-shift-2} \\ -\unicode{008f}{single shift three} \\ -\unicode{008f}{single-shift-3} \\ -\unicode{0090}{device control string} \\ -\unicode{0091}{private use one} \\ -\unicode{0091}{private use-1} \\ -\unicode{0092}{private use two} \\ -\unicode{0092}{private use-2} \\ -\unicode{0093}{set transmit state} \\ -\unicode{0094}{cancel character} \\ -\unicode{0095}{message waiting} \\ -\unicode{0096}{start of guarded area} \\ -\unicode{0096}{start of protected area} \\ -\unicode{0097}{end of guarded area} \\ -\unicode{0097}{end of protected area} \\ -\unicode{0098}{start of string} \\ -\unicode{009a}{single character introducer} \\ -\unicode{009b}{control sequence introducer} \\ -\unicode{009c}{string terminator} \\ -\unicode{009d}{operating system command} \\ -\unicode{009e}{privacy message} \\ -\unicode{009f}{application program command} \\ -\end{multicolfloattable} - \pnum If a \grammarterm{universal-character-name} outside the \grammarterm{c-char-sequence}, \grammarterm{s-char-sequence}, or @@ -493,10 +395,6 @@ The \defnadj{basic literal}{character set} consists of all characters of the basic character set, plus the control characters specified in \tref{lex.charset.literal}. -\begin{note} -The alias \uname{bell} for \ucode{0007} shown in ISO 10646 -is ambiguous with \unicode{1f514}{bell}. -\end{note} \begin{floattable}{Additional control characters in the basic literal character set}{lex.charset.literal}{ll} \topline @@ -546,9 +444,10 @@ \indextext{UTF-16}% \indextext{UTF-32}% For a UTF-8, UTF-16, or UTF-32 literal, -the UCS scalar value +the Unicode scalar value corresponding to each character of the translation character set -is encoded as specified in ISO/IEC 10646 for the respective UCS encoding form. +is encoded as specified in the Unicode Standard +for the respective Unicode encoding form. \indextext{character set|)} \rSec1[lex.pptoken]{Preprocessing tokens} @@ -889,14 +788,14 @@ \begin{bnf} \nontermdef{identifier-start}\br nondigit\br - \textnormal{an element of the translation character set of class XID_Start} + \textnormal{an element of the translation character set with the Unicode property XID_Start} \end{bnf} \begin{bnf} \nontermdef{identifier-continue}\br digit\br nondigit\br - \textnormal{an element of the translation character set of class XID_Continue} + \textnormal{an element of the translation character set with the Unicode property XID_Continue} \end{bnf} \begin{bnf} @@ -915,8 +814,9 @@ \pnum \indextext{name!length of}% \indextext{name}% -The character classes XID_Start and XID_Continue -are Derived Core Properties as described by \UAX{44}. +\begin{note} +The character properties XID_Start and XID_Continue are Derived Core Properties +as described by \UAX{44} of the Unicode Standard. \begin{footnote} On systems in which linkers cannot accept extended characters, an encoding of the \grammarterm{universal-character-name} can be used in @@ -927,9 +827,10 @@ place a translation limit on significant characters for external identifiers. \end{footnote} +\end{note} The program is ill-formed if an \grammarterm{identifier} does not conform to -Normalization Form C as specified in ISO/IEC 10646. +Normalization Form C as specified in the Unicode Standard. \begin{note} Identifiers are case-sensitive. \end{note} @@ -2102,7 +2003,7 @@ \impldef{code unit sequence for non-representable \grammarterm{string-literal}} code unit sequence is encoded. \begin{note} -No character lacks representation in any of the UCS encoding forms. +No character lacks representation in any Unicode encoding form. \end{note} When encoding a stateful character encoding, implementations should encode the first such sequence diff --git a/source/preprocessor.tex b/source/preprocessor.tex index 5fe3d2d381..b2ac324acd 100644 --- a/source/preprocessor.tex +++ b/source/preprocessor.tex @@ -1895,13 +1895,11 @@ \item \indextext{__stdc_iso_10646__@\mname{STDC_ISO_10646}}% \mname{STDC_ISO_10646}\\ -An integer literal of the form \tcode{yyyymmL} (for example, -\tcode{199712L}). -If this symbol is defined, then every character in the Unicode required set, when -stored in an object of type \keyword{wchar_t}, has the same value as the code point -of that character. The \defn{Unicode required set} consists of all -the characters that are defined by ISO/IEC 10646, along with -all amendments and technical corrigenda as of the specified year and month. +An integer literal of the form \tcode{yyyymmL} +(for example, \tcode{199712L}). +Whether \mname{STDC_ISO_10646} is predefined and +if so, what its value is, +are \impldef{presence and value of \mname{STDC_ISO_10646}}. \item \indextext{__stdcpp_threads__@\mname{STDCPP_THREADS}}% diff --git a/source/utilities.tex b/source/utilities.tex index 85f4f73ba6..8f064fd063 100644 --- a/source/utilities.tex +++ b/source/utilities.tex @@ -14976,7 +14976,7 @@ a string is assumed to be in a locale-independent, \impldef{encoding assumption for \tcode{format} width computation} encoding. -Implementations should use a Unicode encoding +Implementations should use either UTF-8, UTF-16, or UTF-32, on platforms capable of displaying Unicode text in a terminal. \begin{note} This is the case for Windows @@ -14990,7 +14990,7 @@ \end{note} \pnum -For a string in a Unicode encoding, +For a string in UTF-8, UTF-16, or UTF-32, implementations should estimate the width of a string as the sum of estimated widths of the first code points in its extended grapheme clusters. @@ -15015,7 +15015,8 @@ The estimated width of other code points is 1. \pnum -For a string in a non-Unicode encoding, the width of a string is unspecified. +For a string in neither UTF-8, UTF-16, nor UTF-32, +the width of a string is unspecified. \pnum % FIXME: What if it's an arg-id? @@ -15027,7 +15028,7 @@ For string types, this field provides an upper bound for the estimated width of the prefix of the input string that is copied into the output. -For a string in a Unicode encoding, +For a string in UTF-8, UTF-16, or UTF-32, the formatter copies to the output the longest prefix of whole extended grapheme clusters whose estimated width is no greater than the precision. @@ -15930,15 +15931,15 @@ \begin{itemize} \item -\placeholder{CE} is a Unicode encoding and +\placeholder{CE} is UTF-8, UTF-16, or UTF-32 and \placeholder{C} corresponds to either -a UCS scalar value whose Unicode property \tcode{General_Category} +a Unicode scalar value whose Unicode property \tcode{General_Category} has a value in the groups \tcode{Separator} (\tcode{Z}) or \tcode{Other} (\tcode{C}) or to -a UCS scalar value which has the Unicode property \tcode{Grapheme_Extend=Yes}, -as described by table 12 of \UAX{44}, or +a Unicode scalar value with the Unicode property \tcode{Grapheme_Extend=Yes}, +as described by \UAX{44} of the Unicode Standard, or \item -\placeholder{CE} is not a Unicode encoding and +\placeholder{CE} is neither UTF-8, UTF-16, nor UTF-32 and \placeholder{C} is one of an implementation-defined set of separator or non-printable characters \end{itemize}