From 5e5c3cd312bfe943c3508571d2e32db4b50444ae Mon Sep 17 00:00:00 2001 From: Lee Byron Date: Tue, 13 Apr 2021 02:28:14 -0700 Subject: [PATCH 1/3] RFC: Allow full unicode range This spec text implements #687 (full context and details there) and also introduces a new escape sequence. Three distinct changes: 1. Change SourceCharacter to allow points above 0xFFFF, now to 0x10FFFF. 2. Allow surrogate pairs within StringValue. This handles illegal pairs with a parse error. 3. Introduce new syntax for full range code point EscapedUnicode. This syntax (`\u{1F37A}`) has been adopted by many other languages and I propose GraphQL adopt it as well. (As a bonus, this removes the last instance of a regex in the lexer grammar!) --- spec/Appendix B -- Grammar Summary.md | 13 ++++++++-- spec/Section 2 -- Language.md | 37 +++++++++++++++++++++++---- 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/spec/Appendix B -- Grammar Summary.md b/spec/Appendix B -- Grammar Summary.md index c40662504..75ad6f4c3 100644 --- a/spec/Appendix B -- Grammar Summary.md +++ b/spec/Appendix B -- Grammar Summary.md @@ -7,7 +7,7 @@ SourceCharacter :: - "U+0009" - "U+000A" - "U+000D" -- "U+0020–U+FFFF" +- "U+0020–U+10FFFF" ## Ignored Tokens @@ -113,7 +113,16 @@ StringCharacter :: - `\u` EscapedUnicode - `\` EscapedCharacter -EscapedUnicode :: /[0-9A-Fa-f]{4}/ +EscapedUnicode :: + +- HexDigit HexDigit HexDigit HexDigit +- `{` HexDigit+ `}` "but only if <= 0x10FFFF" + +HexDigit :: one of + +- `0` `1` `2` `3` `4` `5` `6` `7` `8` `9` +- `A` `B` `C` `D` `E` `F` +- `a` `b` `c` `d` `e` `f` EscapedCharacter :: one of `"` `\` `/` `b` `f` `n` `r` `t` diff --git a/spec/Section 2 -- Language.md b/spec/Section 2 -- Language.md index c89aaf05e..f71b02219 100644 --- a/spec/Section 2 -- Language.md +++ b/spec/Section 2 -- Language.md @@ -50,7 +50,7 @@ SourceCharacter :: - "U+0009" - "U+000A" - "U+000D" -- "U+0020–U+FFFF" +- "U+0020–U+10FFFF" GraphQL documents are expressed as a sequence of 
[Unicode](https://unicode.org/standard/standard.html) code points (informally @@ -812,7 +812,16 @@ StringCharacter :: - `\u` EscapedUnicode - `\` EscapedCharacter -EscapedUnicode :: /[0-9A-Fa-f]{4}/ +EscapedUnicode :: + +- HexDigit HexDigit HexDigit HexDigit +- `{` HexDigit+ `}` "but only if <= 0x10FFFF" + +HexDigit :: one of + +- `0` `1` `2` `3` `4` `5` `6` `7` `8` `9` +- `A` `B` `C` `D` `E` `F` +- `a` `b` `c` `d` `e` `f` EscapedCharacter :: one of `"` `\` `/` `b` `f` `n` `r` `t` @@ -897,7 +906,24 @@ StringValue :: `""` StringValue :: `"` StringCharacter+ `"` -- Return the sequence of all {StringCharacter} code points. +- Let {string} be the sequence of all {StringCharacter} code points. +- For each {codePoint} at {index} in {string}: + - If {codePoint} is >= 0xD800 and <= 0xDBFF (a + [_High Surrogate_](https://unicodebook.readthedocs.io/unicode_encodings.html#utf-16-surrogate-pairs)): + - Let {lowPoint} be the code point at {index} + {1} in {string}. + - Assert {lowPoint} is >= 0xDC00 and <= 0xDFFF (a + [_Low Surrogate_](https://unicodebook.readthedocs.io/unicode_encodings.html#utf-16-surrogate-pairs)). + - Let {decodedPoint} = ({codePoint} - 0xD800) × 0x400 + ({lowPoint} - + 0xDC00) + 0x10000. + - Within {string}, replace {codePoint} and {lowPoint} with {decodedPoint}. + - Otherwise, assert {codePoint} is not >= 0xDC00 and <= 0xDFFF (a + [_Low Surrogate_](https://unicodebook.readthedocs.io/unicode_encodings.html#utf-16-surrogate-pairs)). +- Return {string}. + +Note: {StringValue} should avoid encoding code points as surrogate pairs. While +services must interpret them accordingly, a braced escape (for example +`"\u{1F4A9}"`) is a clearer way to encode code points outside of the +[Basic Multilingual Plane](https://unicodebook.readthedocs.io/unicode.html#bmp). 
StringCharacter :: SourceCharacter but not `"` or `\` or LineTerminator @@ -905,8 +931,9 @@ StringCharacter :: SourceCharacter but not `"` or `\` or LineTerminator StringCharacter :: `\u` EscapedUnicode -- Let {value} be the 16-bit hexadecimal value represented by the sequence of - hexadecimal digits within {EscapedUnicode}. +- Let {value} be the 21-bit hexadecimal value represented by the sequence of + {HexDigit} within {EscapedUnicode}. +- Assert {value} <= 0x10FFFF. - Return the code point {value}. StringCharacter :: `\` EscapedCharacter From cfce61e64eb658178f8b47b53f5f529e332e3f2a Mon Sep 17 00:00:00 2001 From: Lee Byron Date: Tue, 18 May 2021 14:03:15 -0700 Subject: [PATCH 2/3] Revised RFC after feedback Co-authored-by: Andreas Marek --- build.sh | 4 +- package.json | 2 +- spec/Appendix B -- Grammar Summary.md | 9 +- spec/Section 2 -- Language.md | 167 +++++++++++++++++--------- spec/metadata.json | 15 +++ 5 files changed, 130 insertions(+), 67 deletions(-) create mode 100644 spec/metadata.json diff --git a/build.sh b/build.sh index d888fcf0d..09fb26317 100755 --- a/build.sh +++ b/build.sh @@ -7,13 +7,13 @@ GITTAG=$(git tag --points-at HEAD) # Build the specification draft document echo "Building spec draft" mkdir -p public/draft -spec-md --githubSource "https://github.com/graphql/graphql-spec/blame/main/" spec/GraphQL.md > public/draft/index.html +spec-md --metadata spec/metadata.json --githubSource "https://github.com/graphql/graphql-spec/blame/main/" spec/GraphQL.md > public/draft/index.html # If this is a tagged commit, also build the release document if [ -n "$GITTAG" ]; then echo "Building spec release $GITTAG" mkdir -p "public/$GITTAG" - spec-md --githubSource "https://github.com/graphql/graphql-spec/blame/$GITTAG/" spec/GraphQL.md > "public/$GITTAG/index.html" + spec-md --metadata spec/metadata.json --githubSource "https://github.com/graphql/graphql-spec/blame/$GITTAG/" spec/GraphQL.md > "public/$GITTAG/index.html" fi # Create the index file diff 
--git a/package.json b/package.json index 45f0b1983..00e6426c0 100644 --- a/package.json +++ b/package.json @@ -14,7 +14,7 @@ }, "scripts": { "test": "npm run test:build && npm run test:spellcheck", - "test:build": "spec-md spec/GraphQL.md > /dev/null", + "test:build": "spec-md --metadata spec/metadata.json spec/GraphQL.md > /dev/null", "test:spellcheck": "cspell 'spec/**/*.md' README.md", "format": "prettier --write '**/*.{md,yml,yaml,json}'", "format:check": "prettier --check '**/*.{md,yml,yaml,json}'", diff --git a/spec/Appendix B -- Grammar Summary.md b/spec/Appendix B -- Grammar Summary.md index 75ad6f4c3..2291ee35f 100644 --- a/spec/Appendix B -- Grammar Summary.md +++ b/spec/Appendix B -- Grammar Summary.md @@ -2,12 +2,7 @@ ## Source Text -SourceCharacter :: - -- "U+0009" -- "U+000A" -- "U+000D" -- "U+0020–U+10FFFF" +SourceCharacter :: "Any Unicode scalar value" ## Ignored Tokens @@ -115,8 +110,8 @@ StringCharacter :: EscapedUnicode :: +- `{` HexDigit+ `}` - HexDigit HexDigit HexDigit HexDigit -- `{` HexDigit+ `}` "but only if <= 0x10FFFF" HexDigit :: one of diff --git a/spec/Section 2 -- Language.md b/spec/Section 2 -- Language.md index f71b02219..9f26d0139 100644 --- a/spec/Section 2 -- Language.md +++ b/spec/Section 2 -- Language.md @@ -45,32 +45,22 @@ match, however some lookahead restrictions include additional constraints. ## Source Text -SourceCharacter :: +SourceCharacter :: "Any Unicode scalar value" -- "U+0009" -- "U+000A" -- "U+000D" -- "U+0020–U+10FFFF" +GraphQL documents are interpreted from a source text, which is a sequence of +{SourceCharacter}, each {SourceCharacter} being a _Unicode scalar value_ which +may be any Unicode code point from U+0000 to U+D7FF or U+E000 to U+10FFFF +(informally referred to as _"characters"_ through most of this specification). 
-GraphQL documents are expressed as a sequence of -[Unicode](https://unicode.org/standard/standard.html) code points (informally -referred to as _"characters"_ through most of this specification). However, with -few exceptions, most of GraphQL is expressed only in the original non-control -ASCII range so as to be as widely compatible with as many existing tools, -languages, and serialization formats as possible and avoid display issues in -text editors and source control. +A GraphQL document may be expressed only in the ASCII range to be as widely +compatible with as many existing tools, languages, and serialization formats as +possible and avoid display issues in text editors and source control. Non-ASCII +Unicode scalar values may appear within {StringValue} and {Comment}. -Note: Non-ASCII Unicode characters may appear freely within {StringValue} and -{Comment} portions of GraphQL. - -### Unicode - -UnicodeBOM :: "Byte Order Mark (U+FEFF)" - -The "Byte Order Mark" is a special Unicode character which may appear at the -beginning of a file containing Unicode which programs may use to determine the -fact that the text stream is Unicode, what endianness the text stream is in, and -which of several Unicode encodings to interpret. +Note: An implementation which uses _UTF-16_ to represent GraphQL documents in +memory (for example, JavaScript or Java) may encounter a _surrogate pair_. This +encodes a _supplementary code point_ and is a single valid source character, +however an unpaired _surrogate code point_ is not a valid source character. ### White Space @@ -175,6 +165,17 @@ significant way, for example a {StringValue} may contain white space characters. No {Ignored} may appear _within_ a {Token}, for example no white space characters are permitted between the characters defining a {FloatValue}. 
+**Byte order mark** + +UnicodeBOM :: "Byte Order Mark (U+FEFF)" + +The _Byte Order Mark_ is a special Unicode code point which may appear at the +beginning of a file which programs may use to determine the fact that the text +stream is Unicode, and what specific encoding has been used. + +As files are often concatenated, a _Byte Order Mark_ may appear anywhere within +a GraphQL document and is {Ignored}. + ### Punctuators Punctuator :: one of ! $ & ( ) ... : = @ [ ] { | } @@ -814,8 +815,8 @@ StringCharacter :: EscapedUnicode :: +- `{` HexDigit+ `}` - HexDigit HexDigit HexDigit HexDigit -- `{` HexDigit+ `}` "but only if <= 0x10FFFF" HexDigit :: one of @@ -830,19 +831,58 @@ BlockStringCharacter :: - SourceCharacter but not `"""` or `\"""` - `\"""` -Strings are sequences of characters wrapped in quotation marks (U+0022). (ex. -{`"Hello World"`}). White space and other otherwise-ignored characters are -significant within a string value. +{StringValue} is a sequence of characters wrapped in quotation marks (U+0022). +(ex. {`"Hello World"`}). White space and other characters ignored in other parts +of a GraphQL document are significant within a string value. + +A {StringValue} is evaluated to a Unicode text value, a sequence of Unicode +scalar values, by interpreting all escape sequences using the static semantics +defined below. The empty string {`""`} must not be followed by another {`"`} otherwise it would be interpreted as the beginning of a block string. As an example, the source {`""""""`} can only be interpreted as a single empty block string and not three empty strings. -Non-ASCII Unicode characters are allowed within single-quoted strings. Since -{SourceCharacter} must not contain some ASCII control characters, escape -sequences must be used to represent these characters. The {`\`}, {`"`} -characters also must be escaped. All other escape sequences are optional. 
+**Escape Sequences** + +In a single-quoted {StringValue}, any Unicode scalar value may be expressed +using an escape sequence. GraphQL strings allow both C-style escape sequences +(for example `\n`) and two forms of Unicode escape sequences: one with a +fixed-width of 4 hexadecimal digits (for example `\u000A`) and one with a +variable-width most useful for representing a _supplementary character_ such as +an Emoji (for example `\u{1F4A9}`). + +The hexadecimal number encoded by a Unicode escape sequence must describe a +Unicode scalar value, otherwise parsing should stop with an early error. For +example both sources `"\uDEAD"` and `"\u{110000}"` should not be considered +valid {StringValue}. + +Escape sequences are only meaningful within a single-quoted string. Within a +block string, they are simply that sequence of characters (for example +`"""\n"""` represents the Unicode text [U+005C, U+006E]). Within a comment an +escape sequence is not a significant sequence of characters. They may not appear +elsewhere in a GraphQL document. + +Since {StringCharacter} must not contain some characters, escape sequences must +be used to represent these characters. All other escape sequences are optional +and unescaped non-ASCII Unicode characters are allowed within strings. If using +GraphQL within a system which only supports ASCII, then escape sequences may be +used to represent all Unicode characters outside of the ASCII range. + +For legacy reasons, a _supplementary character_ may be escaped by two +fixed-width unicode escape sequences forming a _surrogate pair_. For example the +input `"\uD83D\uDCA9"` is a valid {StringValue} which represents the same +Unicode text as `"\u{1F4A9}"`. While this legacy form is allowed, it should be +avoided as a variable-width unicode escape sequence is a clearer way to encode +such code points. 
+ +When producing a {StringValue}, implementations should use escape sequences to +represent non-printable control characters (U+0000 to U+001F and U+007F to +U+009F). Other escape sequences are not necessary, however an implementation may +use escape sequences to represent any other range of code points. If an +implementation chooses to escape a _supplementary character_, it should not use +a fixed-width surrogate pair unicode escape sequence. **Block Strings** @@ -898,7 +938,13 @@ Note: If non-printable ASCII characters are needed in a string value, a standard quoted string with appropriate escape sequences must be used instead of a block string. -**Semantics** +**Static Semantics** + +A {StringValue} describes a Unicode text value, a sequence of *Unicode scalar +value*s. These semantics describe how to apply the {StringValue} grammar to a +source text to evaluate a Unicode text. Errors encountered during this +evaluation are considered a failure to apply the {StringValue} grammar to a +source and result in a parsing error. StringValue :: `""` @@ -906,36 +952,43 @@ StringValue :: `""` StringValue :: `"` StringCharacter+ `"` -- Let {string} be the sequence of all {StringCharacter} code points. -- For each {codePoint} at {index} in {string}: - - If {codePoint} is >= 0xD800 and <= 0xDBFF (a - [_High Surrogate_](https://unicodebook.readthedocs.io/unicode_encodings.html#utf-16-surrogate-pairs)): - - Let {lowPoint} be the code point at {index} + {1} in {string}. - - Assert {lowPoint} is >= 0xDC00 and <= 0xDFFF (a - [_Low Surrogate_](https://unicodebook.readthedocs.io/unicode_encodings.html#utf-16-surrogate-pairs)). - - Let {decodedPoint} = ({codePoint} - 0xD800) × 0x400 + ({lowPoint} - - 0xDC00) + 0x10000. - - Within {string}, replace {codePoint} and {lowPoint} with {decodedPoint}. - - Otherwise, assert {codePoint} is not >= 0xDC00 and <= 0xDFFF (a - [_Low Surrogate_](https://unicodebook.readthedocs.io/unicode_encodings.html#utf-16-surrogate-pairs)). 
-- Return {string}. - -Note: {StringValue} should avoid encoding code points as surrogate pairs. While -services must interpret them accordingly, a braced escape (for example -`"\u{1F4A9}"`) is a clearer way to encode code points outside of the -[Basic Multilingual Plane](https://unicodebook.readthedocs.io/unicode.html#bmp). +- Return the concatenated sequence of _Unicode scalar value_ by evaluating all + {StringCharacter}. StringCharacter :: SourceCharacter but not `"` or `\` or LineTerminator -- Return the code point {SourceCharacter}. +- Return the _Unicode scalar value_ {SourceCharacter}. StringCharacter :: `\u` EscapedUnicode -- Let {value} be the 21-bit hexadecimal value represented by the sequence of - {HexDigit} within {EscapedUnicode}. -- Assert {value} <= 0x10FFFF. +- Let {value} be the hexadecimal value represented by the sequence of {HexDigit} + within {EscapedUnicode}. +- Assert {value} is within the _Unicode scalar value_ range (>= 0x0000 and <= + 0xD7FF or >= 0xE000 and <= 0x10FFFF). - Return the code point {value}. +StringCharacter :: `\u` HexDigit HexDigit HexDigit HexDigit `\u` HexDigit +HexDigit HexDigit HexDigit + +- Let {leadingValue} be the hexadecimal value represented by the first sequence + of {HexDigit}. +- Let {trailingValue} be the hexadecimal value represented by the second + sequence of {HexDigit}. +- If {leadingValue} is >= 0xD800 and <= 0xDBFF (a _Leading Surrogate_): + - Assert {trailingValue} is >= 0xDC00 and <= 0xDFFF (a _Trailing Surrogate_). + - Return ({leadingValue} - 0xD800) × 0x400 + ({trailingValue} - 0xDC00) + + 0x10000. +- Otherwise: + - Assert {leadingValue} is within the _Unicode scalar value_ range. + - Assert {trailingValue} is within the _Unicode scalar value_ range. + - Return the sequence of the code point {leadingValue} followed by the code + point {trailingValue}. 
+ +Note: If both escape sequences encode a _Unicode scalar value_, then this +semantic is identical to applying the prior semantic on each fixed-width escape +sequence. A variable-width escape sequence must only encode a _Unicode scalar +value_. + StringCharacter :: `\` EscapedCharacter - Return the code point represented by {EscapedCharacter} according to the table @@ -954,13 +1007,13 @@ StringCharacter :: `\` EscapedCharacter StringValue :: `"""` BlockStringCharacter\* `"""` -- Let {rawValue} be the Unicode character sequence of all {BlockStringCharacter} - Unicode character values (which may be an empty sequence). +- Let {rawValue} be the concatenated sequence of _Unicode scalar value_ by + evaluating all {BlockStringCharacter} (which may be an empty sequence). - Return the result of {BlockStringValue(rawValue)}. BlockStringCharacter :: SourceCharacter but not `"""` or `\"""` -- Return the character value of {SourceCharacter}. +- Return the _Unicode scalar value_ {SourceCharacter}. BlockStringCharacter :: `\"""` diff --git a/spec/metadata.json b/spec/metadata.json new file mode 100644 index 000000000..553d56e06 --- /dev/null +++ b/spec/metadata.json @@ -0,0 +1,15 @@ +{ + "biblio": { + "https://www.unicode.org/glossary": { + "byte-order-mark": "#byte_order_mark", + "leading-surrogate": "#leading_surrogate", + "trailing-surrogate": "#trailing_surrogate", + "supplementary-character": "#supplementary_character", + "supplementary-code-point": "#supplementary_code_point", + "surrogate-code-point": "#surrogate_code_point", + "surrogate-pair": "#surrogate_pair", + "unicode-scalar-value": "#unicode_scalar_value", + "utf-16": "#UTF_16" + } + } +} From 68713052d9af68756df196b549e86fb47052a802 Mon Sep 17 00:00:00 2001 From: Lee Byron Date: Thu, 2 Jun 2022 15:27:29 -0700 Subject: [PATCH 3/3] Editorial --- spec/Section 2 -- Language.md | 105 +++++++++++++++++----------------- 1 file changed, 52 insertions(+), 53 deletions(-) diff --git a/spec/Section 2 -- Language.md 
b/spec/Section 2 -- Language.md index 9f26d0139..a687d11aa 100644 --- a/spec/Section 2 -- Language.md +++ b/spec/Section 2 -- Language.md @@ -59,7 +59,7 @@ Unicode scalar values may appear within {StringValue} and {Comment}. Note: An implementation which uses _UTF-16_ to represent GraphQL documents in memory (for example, JavaScript or Java) may encounter a _surrogate pair_. This -encodes a _supplementary code point_ and is a single valid source character, +encodes one _supplementary code point_ and is a single valid source character, however an unpaired _surrogate code point_ is not a valid source character. ### White Space @@ -105,10 +105,9 @@ CommentChar :: SourceCharacter but not LineTerminator GraphQL source documents may contain single-line comments, starting with the {`#`} marker. -A comment can contain any Unicode code point in {SourceCharacter} except -{LineTerminator} so a comment always consists of all code points starting with -the {`#`} character up to but not including the {LineTerminator} (or end of the -source). +A comment may contain any {SourceCharacter} except {LineTerminator} so a comment +always consists of all {SourceCharacter} starting with the {`#`} character up to +but not including the {LineTerminator} (or end of the source). Comments are {Ignored} like white space and may appear after any token, or before a {LineTerminator}, and have no significance to the semantic meaning of a @@ -171,10 +170,9 @@ UnicodeBOM :: "Byte Order Mark (U+FEFF)" The _Byte Order Mark_ is a special Unicode code point which may appear at the beginning of a file which programs may use to determine the fact that the text -stream is Unicode, and what specific encoding has been used. - -As files are often concatenated, a _Byte Order Mark_ may appear anywhere within -a GraphQL document and is {Ignored}. +stream is Unicode, and what specific encoding has been used. 
As files are often +concatenated, a _Byte Order Mark_ may appear before or after any lexical token +and is {Ignored}. ### Punctuators @@ -831,13 +829,10 @@ BlockStringCharacter :: - SourceCharacter but not `"""` or `\"""` - `\"""` -{StringValue} is a sequence of characters wrapped in quotation marks (U+0022). -(ex. {`"Hello World"`}). White space and other characters ignored in other parts -of a GraphQL document are significant within a string value. - -A {StringValue} is evaluated to a Unicode text value, a sequence of Unicode -scalar values, by interpreting all escape sequences using the static semantics -defined below. +A {StringValue} is evaluated to a _Unicode text_ value, a sequence of _Unicode +scalar value_, by interpreting all escape sequences using the static semantics +defined below. White space and other characters ignored between lexical tokens +are significant within a string value. The empty string {`""`} must not be followed by another {`"`} otherwise it would be interpreted as the beginning of a block string. As an example, the source @@ -846,7 +841,7 @@ empty strings. **Escape Sequences** -In a single-quoted {StringValue}, any Unicode scalar value may be expressed +In a single-quoted {StringValue}, any _Unicode scalar value_ may be expressed using an escape sequence. GraphQL strings allow both C-style escape sequences (for example `\n`) and two forms of Unicode escape sequences: one with a fixed-width of 4 hexadecimal digits (for example `\u000A`) and one with a @@ -854,35 +849,37 @@ variable-width most useful for representing a _supplementary character_ such as an Emoji (for example `\u{1F4A9}`). The hexadecimal number encoded by a Unicode escape sequence must describe a -Unicode scalar value, otherwise parsing should stop with an early error. For -example both sources `"\uDEAD"` and `"\u{110000}"` should not be considered -valid {StringValue}. +_Unicode scalar value_, otherwise it must result in a parse error. 
For example both +sources `"\uDEAD"` and `"\u{110000}"` should not be considered valid +{StringValue}. Escape sequences are only meaningful within a single-quoted string. Within a block string, they are simply that sequence of characters (for example -`"""\n"""` represents the Unicode text [U+005C, U+006E]). Within a comment an +`"""\n"""` represents the _Unicode text_ [U+005C, U+006E]). Within a comment an escape sequence is not a significant sequence of characters. They may not appear elsewhere in a GraphQL document. -Since {StringCharacter} must not contain some characters, escape sequences must -be used to represent these characters. All other escape sequences are optional -and unescaped non-ASCII Unicode characters are allowed within strings. If using -GraphQL within a system which only supports ASCII, then escape sequences may be -used to represent all Unicode characters outside of the ASCII range. +Since {StringCharacter} must not contain some code points directly (for example, +a {LineTerminator}), escape sequences must be used to represent them. All other +escape sequences are optional and unescaped non-ASCII Unicode characters are +allowed within strings. If using GraphQL within a system which only supports +ASCII, then escape sequences may be used to represent all Unicode characters +outside of the ASCII range. For legacy reasons, a _supplementary character_ may be escaped by two fixed-width unicode escape sequences forming a _surrogate pair_. For example the input `"\uD83D\uDCA9"` is a valid {StringValue} which represents the same -Unicode text as `"\u{1F4A9}"`. While this legacy form is allowed, it should be +_Unicode text_ as `"\u{1F4A9}"`. While this legacy form is allowed, it should be avoided as a variable-width unicode escape sequence is a clearer way to encode such code points. When producing a {StringValue}, implementations should use escape sequences to represent non-printable control characters (U+0000 to U+001F and U+007F to U+009F). 
Other escape sequences are not necessary, however an implementation may -use escape sequences to represent any other range of code points. If an -implementation chooses to escape a _supplementary character_, it should not use -a fixed-width surrogate pair unicode escape sequence. +use escape sequences to represent any other range of code points (for example, +when producing ASCII-only output). If an implementation chooses to escape a +_supplementary character_, it should only use a variable-width unicode escape +sequence. **Block Strings** @@ -940,11 +937,13 @@ string. **Static Semantics** -A {StringValue} describes a Unicode text value, a sequence of *Unicode scalar -value*s. These semantics describe how to apply the {StringValue} grammar to a -source text to evaluate a Unicode text. Errors encountered during this -evaluation are considered a failure to apply the {StringValue} grammar to a -source and result in a parsing error. +:: A {StringValue} describes a _Unicode text_ value, which is a sequence of +_Unicode scalar value_. + +These semantics describe how to apply the {StringValue} grammar to a source text +to evaluate a _Unicode text_. Errors encountered during this evaluation are +considered a failure to apply the {StringValue} grammar to a source and must +result in a parsing error. StringValue :: `""` @@ -952,7 +951,7 @@ StringValue :: `""` StringValue :: `"` StringCharacter+ `"` -- Return the concatenated sequence of _Unicode scalar value_ by evaluating all +- Return the _Unicode text_ by concatenating the evaluation of all {StringCharacter}. StringCharacter :: SourceCharacter but not `"` or `\` or LineTerminator @@ -965,7 +964,7 @@ StringCharacter :: `\u` EscapedUnicode within {EscapedUnicode}. - Assert {value} is within the _Unicode scalar value_ range (>= 0x0000 and <= 0xD7FF or >= 0xE000 and <= 0x10FFFF). -- Return the code point {value}. +- Return the _Unicode scalar value_ {value}. 
StringCharacter :: `\u` HexDigit HexDigit HexDigit HexDigit `\u` HexDigit HexDigit HexDigit HexDigit @@ -981,8 +980,8 @@ HexDigit HexDigit HexDigit - Otherwise: - Assert {leadingValue} is within the _Unicode scalar value_ range. - Assert {trailingValue} is within the _Unicode scalar value_ range. - - Return the sequence of the code point {leadingValue} followed by the code - point {trailingValue}. + - Return the sequence of the _Unicode scalar value_ {leadingValue} followed by + the _Unicode scalar value_ {trailingValue}. Note: If both escape sequences encode a _Unicode scalar value_, then this semantic is identical to applying the prior semantic on each fixed-width escape @@ -991,24 +990,24 @@ value_. StringCharacter :: `\` EscapedCharacter -- Return the code point represented by {EscapedCharacter} according to the table - below. +- Return the _Unicode scalar value_ represented by {EscapedCharacter} according + to the table below. -| Escaped Character | Code Point | Character Name | -| ----------------- | ---------- | ---------------------------- | -| {`"`} | U+0022 | double quote | -| {`\`} | U+005C | reverse solidus (back slash) | -| {`/`} | U+002F | solidus (forward slash) | -| {`b`} | U+0008 | backspace | -| {`f`} | U+000C | form feed | -| {`n`} | U+000A | line feed (new line) | -| {`r`} | U+000D | carriage return | -| {`t`} | U+0009 | horizontal tab | +| Escaped Character | Scalar Value | Character Name | +| ----------------- | ------------ | ---------------------------- | +| {`"`} | U+0022 | double quote | +| {`\`} | U+005C | reverse solidus (back slash) | +| {`/`} | U+002F | solidus (forward slash) | +| {`b`} | U+0008 | backspace | +| {`f`} | U+000C | form feed | +| {`n`} | U+000A | line feed (new line) | +| {`r`} | U+000D | carriage return | +| {`t`} | U+0009 | horizontal tab | StringValue :: `"""` BlockStringCharacter\* `"""` -- Let {rawValue} be the concatenated sequence of _Unicode scalar value_ by - evaluating all {BlockStringCharacter} (which may be 
an empty sequence). +- Let {rawValue} be the _Unicode text_ by concatenating the evaluation of all + {BlockStringCharacter} (which may be an empty sequence). - Return the result of {BlockStringValue(rawValue)}. BlockStringCharacter :: SourceCharacter but not `"""` or `\"""`