From d93354898e4908d4bdc4738cbf42dfb9a6671775 Mon Sep 17 00:00:00 2001
From: Jens Maurer <Jens.Maurer@gmx.net>
Date: Wed, 15 Feb 2023 22:49:30 +0100
Subject: [PATCH] P2736R2 Referencing The Unicode Standard

In [lex.name], the paper missed changing the original term
"character classes" to "character properties"; that change is
included here.

Fixes NB FR 133, FR 013 (C++23 CD).
---
 source/back.tex         |  18 -----
 source/future.tex       |  17 ++---
 source/intro.tex        |  24 +-----
 source/iostreams.tex    |   4 +-
 source/lex.tex          | 161 ++++++++--------------------------------
 source/preprocessor.tex |  12 ++-
 source/utilities.tex    |  19 ++---
 7 files changed, 56 insertions(+), 199 deletions(-)

diff --git a/source/back.tex b/source/back.tex
index 40954fd628..867ae7dfac 100644
--- a/source/back.tex
+++ b/source/back.tex
@@ -18,24 +18,6 @@ \chapter{Bibliography}
     Programming languages, their environments, and system software interfaces ---
     Floating-point extensions for C --- Part 3: Interchange and extended types}
 % Other international standards.
-\item
-  %%% Format for the following entry is based on that specified at
-  %%% http://www.iec.ch/standardsdev/resources/draftingpublications/directives/principles/referencing.htm
-  The Unicode Consortium. Unicode Standard Annex, \UAX{29},
-  \doccite{Unicode Text Segmentation} [online].
-  Edited by Mark Davis. Revision 35; issued for Unicode 12.0.0. 2019-02-15 [viewed 2020-02-23].
-  Available from: \url{http://www.unicode.org/reports/tr29/tr29-35.html}
-\item
-  The Unicode Consortium. Unicode Standard Annex, \UAX{31},
-  \doccite{Unicode Identifier and Pattern Syntax} [online].
-  Edited by Mark Davis. Revision 33; issued for Unicode 13.0.0.
-  2020-02-13 [viewed 2021-06-08].
-  Available from: \url{https://www.unicode.org/reports/tr31/tr31-33.html}
-\item
-  The Unicode Standard Version 14.0,
-  \doccite{Core Specification}.
-  Unicode Consortium, ISBN 978-1-936213-29-0, copyright \copyright 2021 Unicode, Inc.
-  Available from: \url{https://www.unicode.org/versions/Unicode14.0.0/UnicodeStandard-14.0.pdf}
 \item
   IANA Time Zone Database.
   Available from: \url{https://www.iana.org/time-zones}
diff --git a/source/future.tex b/source/future.tex
index d1a90c64b0..ba9fc37ef5 100644
--- a/source/future.tex
+++ b/source/future.tex
@@ -2045,6 +2045,10 @@
   If \tcode{(Mode \& little_endian)}, the facet shall generate a
   multibyte sequence in little-endian order,
   as opposed to the default big-endian order.
+\item
+  UCS-2 is the same encoding as UTF-16,
+  except that it encodes scalar values in the range
+  \ucode{0000}--\ucode{ffff} (Basic Multilingual Plane) only.
 \end{itemize}
 
 \pnum
@@ -2055,8 +2059,7 @@
 \begin{itemize}
 \item
   The facet shall convert between UTF-8 multibyte sequences
-  and UCS-2 or UTF-32 (depending on the size of \tcode{Elem})
-  within the program.
+  and UCS-2 or UTF-32 (depending on the size of \tcode{Elem}).
 \item
   Endianness shall not affect how multibyte sequences are read or written.
 \item
@@ -2071,8 +2074,7 @@
 \begin{itemize}
 \item
   The facet shall convert between UTF-16 multibyte sequences
-  and UCS-2 or UTF-32 (depending on the size of \tcode{Elem})
-  within the program.
+  and UCS-2 or UTF-32 (depending on the size of \tcode{Elem}).
 \item
   Multibyte sequences shall be read or written
   according to the \tcode{Mode} flag, as set out above.
@@ -2095,13 +2097,6 @@
   The multibyte sequences may be written as either a text or a binary file.
 \end{itemize}
 
-\pnum
-The encoding forms UTF-8, UTF-16, and UTF-32 are specified in ISO/IEC 10646.
-The encoding form UCS-2 is specified in ISO/IEC 10646:2003.
-\begin{footnote}
-Cancelled and replaced by ISO/IEC 10646:2017.
-\end{footnote}
-
 \rSec1[depr.conversions]{Deprecated convenience conversion interfaces}
 
 \rSec2[depr.conversions.general]{General}
diff --git a/source/intro.tex b/source/intro.tex
index d8c059d829..c247ada66b 100644
--- a/source/intro.tex
+++ b/source/intro.tex
@@ -51,14 +51,6 @@
 Operating System Interface (POSIX), Technical Corrigendum 1}
 \item ISO/IEC/IEEE 9945:2009/Cor 2:2017, \doccite{Information Technology --- Portable
 Operating System Interface (POSIX), Technical Corrigendum 2}
-\item ISO/IEC 10646, \doccite{Information technology ---
-Universal Coded Character Set (UCS)}
-\item ISO/IEC 10646:2003,
-\begin{footnote}
-Cancelled and replaced by ISO/IEC 10646:2017.
-\end{footnote}
-\doccite{Information technology ---
-Universal Multiple-Octet Coded Character Set (UCS)}
 \item ISO/IEC/IEEE 60559:2020, \doccite{Information technology ---
 Microprocessor Systems --- Floating-Point arithmetic}
 \item ISO 80000-2:2009, \doccite{Quantities and units ---
@@ -75,14 +67,8 @@
 Language Specification},
 Standard Ecma-262, third edition, 1999.
 \item
-The Unicode Consortium.
-Unicode Standard Annex, \UAX{44}, \doccite{Unicode Character Database}.
-Edited by Ken Whistler and Lauren\c{t}iu Iancu.
-Available from: \url{http://www.unicode.org/reports/tr44/}
-\item
-The Unicode Consortium.
-The Unicode Standard, \doccite{Derived Core Properties}.
-Available from: \url{https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt}
+The Unicode Consortium. \doccite{The Unicode Standard}.
+Available from: \url{https://www.unicode.org/versions/latest/}
 \end{itemize}
 
 \pnum
@@ -104,12 +90,6 @@
 hereinafter called \defn{ECMA-262}.
 \indextext{references!normative|)}
 
-\pnum
-\begin{note}
-References to ISO/IEC 10646:2003 are used only
-to support deprecated features\iref{depr.locale.stdcvt}.
-\end{note}
-
 \rSec0[intro.defs]{Terms and definitions}
 
 \pnum
diff --git a/source/iostreams.tex b/source/iostreams.tex
index b8271e70db..f2066de816 100644
--- a/source/iostreams.tex
+++ b/source/iostreams.tex
@@ -6850,7 +6850,7 @@
 if invoking the native Unicode API requires transcoding,
 implementations should substitute invalid code units
 with \unicode{fffd}{replacement character} per
-The Unicode Standard Version 14.0 - Core Specification, Chapter 3.9.
+the Unicode Standard, Chapter 3.9 \ucode{fffd} Substitution in Conversion.
 \end{itemdescr}
 
 \rSec3[ostream.unformatted]{Unformatted output functions}
@@ -7786,7 +7786,7 @@
 If invoking the native Unicode API requires transcoding,
 implementations should substitute invalid code units
 with \unicode{fffd}{replacement character} per
-The Unicode Standard Version 14.0 - Core Specification, Chapter 3.9.
+the Unicode Standard, Chapter 3.9 \ucode{fffd} Substitution in Conversion.
 \end{itemdescr}
 
 \indexlibraryglobal{vprint_nonunicode}%
diff --git a/source/lex.tex b/source/lex.tex
index 8647edfbee..022a2ac9ad 100644
--- a/source/lex.tex
+++ b/source/lex.tex
@@ -80,8 +80,10 @@
 \end{note}
 If an input file is determined to be a UTF-8 file,
 then it shall be a well-formed UTF-8 code unit sequence and
-it is decoded to produce a sequence of UCS scalar values
-that constitutes the sequence of elements of the translation character set.
+it is decoded to produce a sequence of Unicode scalar values.
+A sequence of translation character set elements is then formed
+by mapping each Unicode scalar value
+to the corresponding translation character set element.
 In the resulting sequence,
 each pair of characters in the input sequence consisting of
 \unicode{000d}{carriage return} followed by \unicode{000a}{line feed},
@@ -244,18 +246,17 @@
 The \defnadj{translation}{character set} consists of the following elements:
 \begin{itemize}
 \item
-each character named by ISO/IEC 10646,
-as identified by its unique UCS scalar value, and
+each abstract character assigned a code point in the Unicode codespace, and
 \item
-a distinct character for each UCS scalar value
-where no named character is assigned.
+a distinct character for each Unicode scalar value
+not assigned to an abstract character.
 \end{itemize}
 \begin{note}
-ISO/IEC 10646 code points are integers
+Unicode code points are integers
 in the range $[0, \mathrm{10FFFF}]$ (hexadecimal).
 A surrogate code point is a value
 in the range $[\mathrm{D800}, \mathrm{DFFF}]$ (hexadecimal).
-A UCS scalar value is any code point that is not a surrogate code point.
+A Unicode scalar value is any code point that is not a surrogate code point.
 \end{note}
 
 \pnum
@@ -355,126 +356,27 @@
 \tcode{\textbackslash U} \grammarterm{hex-quad} \grammarterm{hex-quad}, or
 \tcode{\textbackslash u\{\grammarterm{simple-hexadecimal-digit-sequence}\}}
 designates the character in the translation character set
-whose UCS scalar value is the hexadecimal number represented by
+whose Unicode scalar value is the hexadecimal number represented by
 the sequence of \grammarterm{hexadecimal-digit}s
 in the \grammarterm{universal-character-name}.
-The program is ill-formed if that number is not a UCS scalar value.
+The program is ill-formed if that number is not a Unicode scalar value.
 
 \pnum
 A \grammarterm{universal-character-name}
 that is a \grammarterm{named-universal-character}
-designates the character named by its \grammarterm{n-char-sequence}.
-A character is so named if the \grammarterm{n-char-sequence} is equal to
-\begin{itemize}
-\item
-the associated character name or associated character name alias
-specified in ISO/IEC 10646 subclause ``Code charts and lists of character names''
-or
-\item
-the control code alias given in \tref{lex.charset.ucn}.
+designates the corresponding character
+in the Unicode Standard (Chapter 4.8 Name)
+if the \grammarterm{n-char-sequence} is equal
+to its character name or
+to one of its character name aliases of
+type ``control'', ``correction'', or ``alternate'';
+otherwise, the program is ill-formed.
 \begin{note}
-The aliases in \tref{lex.charset.ucn} are provided for control characters
-which otherwise have no associated character name or character name alias.
-These names are derived from
+These aliases are listed in
 the Unicode Character Database's \tcode{NameAliases.txt}.
-For historical reasons, control characters are formally unnamed.
-\end{note}
-\end{itemize}
-\begin{note}
-None of the associated character names,
-associated character name aliases, or
-control code aliases
-have leading or trailing spaces.
+None of these names or aliases have leading or trailing spaces.
 \end{note}
 
-\begin{multicolfloattable}{Control code aliases}{lex.charset.ucn}{ll}
-\unicode{0000}{null} \\
-\unicode{0001}{start of heading} \\
-\unicode{0002}{start of text} \\
-\unicode{0003}{end of text} \\
-\unicode{0004}{end of transmission} \\
-\unicode{0005}{enquiry} \\
-\unicode{0006}{acknowledge} \\
-\unicode{0007}{alert} \\
-\unicode{0008}{backspace} \\
-\unicode{0009}{character tabulation} \\
-\unicode{0009}{horizontal tabulation} \\
-\unicode{000a}{line feed} \\
-\unicode{000a}{new line} \\
-\unicode{000a}{end of line} \\
-\unicode{000b}{line tabulation} \\
-\unicode{000b}{vertical tabulation} \\
-\unicode{000c}{form feed} \\
-\unicode{000d}{carriage return} \\
-\unicode{000e}{shift out} \\
-\unicode{000e}{locking-shift one} \\
-\unicode{000f}{shift in} \\
-\unicode{000f}{locking-shift zero} \\
-\unicode{0010}{data link escape} \\
-\unicode{0011}{device control one} \\
-\unicode{0012}{device control two} \\
-\unicode{0013}{device control three} \\
-\unicode{0014}{device control four} \\
-\unicode{0015}{negative acknowledge} \\
-\unicode{0016}{synchronous idle} \\
-\unicode{0017}{end of transmission block} \\
-\unicode{0018}{cancel} \\
-\unicode{0019}{end of medium} \\
-\unicode{001a}{substitute} \\
-\unicode{001b}{escape} \\
-\unicode{001c}{information separator four} \\
-\unicode{001c}{file separator} \\
-\unicode{001d}{information separator three} \\
-\unicode{001d}{group separator} \\
-\unicode{001e}{information separator two} \\
-\unicode{001e}{record separator} \\
-\unicode{001f}{information separator one} \\
-\unicode{001f}{unit separator} \\
-\columnbreak
-\unicode{007f}{delete} \\
-\unicode{0082}{break permitted here} \\
-\unicode{0083}{no break here} \\
-\unicode{0084}{index} \\
-\unicode{0085}{next line} \\
-\unicode{0086}{start of selected area} \\
-\unicode{0087}{end of selected area} \\
-\unicode{0088}{character tabulation set} \\
-\unicode{0088}{horizontal tabulation set} \\
-\unicode{0089}{character tabulation with justification} \\
-\unicode{0089}{horizontal tabulation with justification} \\
-\unicode{008a}{line tabulation set} \\
-\unicode{008a}{vertical tabulation set} \\
-\unicode{008b}{partial line forward} \\
-\unicode{008b}{partial line down} \\
-\unicode{008c}{partial line backward} \\
-\unicode{008c}{partial line up} \\
-\unicode{008d}{reverse line feed} \\
-\unicode{008d}{reverse index} \\
-\unicode{008e}{single shift two} \\
-\unicode{008e}{single-shift-2} \\
-\unicode{008f}{single shift three} \\
-\unicode{008f}{single-shift-3} \\
-\unicode{0090}{device control string} \\
-\unicode{0091}{private use one} \\
-\unicode{0091}{private use-1} \\
-\unicode{0092}{private use two} \\
-\unicode{0092}{private use-2} \\
-\unicode{0093}{set transmit state} \\
-\unicode{0094}{cancel character} \\
-\unicode{0095}{message waiting} \\
-\unicode{0096}{start of guarded area} \\
-\unicode{0096}{start of protected area} \\
-\unicode{0097}{end of guarded area} \\
-\unicode{0097}{end of protected area} \\
-\unicode{0098}{start of string} \\
-\unicode{009a}{single character introducer} \\
-\unicode{009b}{control sequence introducer} \\
-\unicode{009c}{string terminator} \\
-\unicode{009d}{operating system command} \\
-\unicode{009e}{privacy message} \\
-\unicode{009f}{application program command} \\
-\end{multicolfloattable}
-
 \pnum
 If a \grammarterm{universal-character-name} outside
 the \grammarterm{c-char-sequence}, \grammarterm{s-char-sequence}, or
@@ -493,10 +395,6 @@
 The \defnadj{basic literal}{character set} consists of
 all characters of the basic character set,
 plus the control characters specified in \tref{lex.charset.literal}.
-\begin{note}
-The alias \uname{bell} for \ucode{0007} shown in ISO 10646
-is ambiguous with \unicode{1f514}{bell}.
-\end{note}
 
 \begin{floattable}{Additional control characters in the basic literal character set}{lex.charset.literal}{ll}
 \topline
@@ -546,9 +444,10 @@
 \indextext{UTF-16}%
 \indextext{UTF-32}%
 For a UTF-8, UTF-16, or UTF-32 literal,
-the UCS scalar value
+the Unicode scalar value
 corresponding to each character of the translation character set
-is encoded as specified in ISO/IEC 10646 for the respective UCS encoding form.
+is encoded as specified in the Unicode Standard
+for the respective Unicode encoding form.
 \indextext{character set|)}
 
 \rSec1[lex.pptoken]{Preprocessing tokens}
@@ -889,14 +788,14 @@
 \begin{bnf}
 \nontermdef{identifier-start}\br
     nondigit\br
-    \textnormal{an element of the translation character set of class XID_Start}
+    \textnormal{an element of the translation character set with the Unicode property XID_Start}
 \end{bnf}
 
 \begin{bnf}
 \nontermdef{identifier-continue}\br
     digit\br
     nondigit\br
-    \textnormal{an element of the translation character set of class XID_Continue}
+    \textnormal{an element of the translation character set with the Unicode property XID_Continue}
 \end{bnf}
 
 \begin{bnf}
@@ -915,8 +814,9 @@
 \pnum
 \indextext{name!length of}%
 \indextext{name}%
-The character classes XID_Start and XID_Continue
-are Derived Core Properties as described by \UAX{44}.
+\begin{note}
+The character properties XID_Start and XID_Continue are Derived Core Properties
+as described by \UAX{44} of the Unicode Standard.
 \begin{footnote}
 On systems in which linkers cannot accept extended
 characters, an encoding of the \grammarterm{universal-character-name} can be used in
@@ -927,9 +827,10 @@
 place a translation limit on significant characters for external
 identifiers.
 \end{footnote}
+\end{note}
 The program is ill-formed
 if an \grammarterm{identifier} does not conform to
-Normalization Form C as specified in ISO/IEC 10646.
+Normalization Form C as specified in the Unicode Standard.
 \begin{note}
 Identifiers are case-sensitive.
 \end{note}
@@ -2102,7 +2003,7 @@
 \impldef{code unit sequence for non-representable \grammarterm{string-literal}}
 code unit sequence is encoded.
 \begin{note}
-No character lacks representation in any of the UCS encoding forms.
+No character lacks representation in any Unicode encoding form.
 \end{note}
 When encoding a stateful character encoding,
 implementations should encode the first such sequence
diff --git a/source/preprocessor.tex b/source/preprocessor.tex
index 5fe3d2d381..b2ac324acd 100644
--- a/source/preprocessor.tex
+++ b/source/preprocessor.tex
@@ -1895,13 +1895,11 @@
 \item
 \indextext{__stdc_iso_10646__@\mname{STDC_ISO_10646}}%
 \mname{STDC_ISO_10646}\\
-An integer literal of the form \tcode{yyyymmL} (for example,
-\tcode{199712L}).
-If this symbol is defined, then every character in the Unicode required set, when
-stored in an object of type \keyword{wchar_t}, has the same value as the code point
-of that character. The \defn{Unicode required set} consists of all
-the characters that are defined by ISO/IEC 10646, along with
-all amendments and technical corrigenda as of the specified year and month.
+An integer literal of the form \tcode{yyyymmL}
+(for example, \tcode{199712L}).
+Whether \mname{STDC_ISO_10646} is predefined and,
+if so, what its value is,
+are \impldef{presence and value of \mname{STDC_ISO_10646}}.
 
 \item
 \indextext{__stdcpp_threads__@\mname{STDCPP_THREADS}}%
diff --git a/source/utilities.tex b/source/utilities.tex
index 85f4f73ba6..8f064fd063 100644
--- a/source/utilities.tex
+++ b/source/utilities.tex
@@ -14976,7 +14976,7 @@
 a string is assumed to be in
 a locale-independent,
 \impldef{encoding assumption for \tcode{format} width computation} encoding.
-Implementations should use a Unicode encoding
+Implementations should use either UTF-8, UTF-16, or UTF-32,
 on platforms capable of displaying Unicode text in a terminal.
 \begin{note}
 This is the case for Windows
@@ -14990,7 +14990,7 @@
 \end{note}
 
 \pnum
-For a string in a Unicode encoding,
+For a string in UTF-8, UTF-16, or UTF-32,
 implementations should estimate the width of a string
 as the sum of estimated widths of
 the first code points in its extended grapheme clusters.
@@ -15015,7 +15015,8 @@
 The estimated width of other code points is 1.
 
 \pnum
-For a string in a non-Unicode encoding, the width of a string is unspecified.
+For a string in neither UTF-8, UTF-16, nor UTF-32,
+the width of a string is unspecified.
 
 \pnum
 % FIXME: What if it's an arg-id?
@@ -15027,7 +15028,7 @@
 For string types, this field provides an upper bound
 for the estimated width of the prefix of
 the input string that is copied into the output.
-For a string in a Unicode encoding,
+For a string in UTF-8, UTF-16, or UTF-32,
 the formatter copies to the output
 the longest prefix of whole extended grapheme clusters
 whose estimated width is no greater than the precision.
@@ -15930,15 +15931,15 @@
 
 \begin{itemize}
 \item
-\placeholder{CE} is a Unicode encoding and
+\placeholder{CE} is UTF-8, UTF-16, or UTF-32 and
 \placeholder{C} corresponds to either
-a UCS scalar value whose Unicode property \tcode{General_Category}
+a Unicode scalar value whose Unicode property \tcode{General_Category}
 has a value in the groups \tcode{Separator} (\tcode{Z}) or \tcode{Other} (\tcode{C}) or to
-a UCS scalar value which has the Unicode property \tcode{Grapheme_Extend=Yes},
-as described by table 12 of \UAX{44}, or
+a Unicode scalar value with the Unicode property \tcode{Grapheme_Extend=Yes},
+as described by \UAX{44} of the Unicode Standard, or
 
 \item
-\placeholder{CE} is not a Unicode encoding and
+\placeholder{CE} is neither UTF-8, UTF-16, nor UTF-32 and
 \placeholder{C} is one of an implementation-defined set
 of separator or non-printable characters
 \end{itemize}