diff --git a/build.sh b/build.sh index d888fcf0d..09fb26317 100755 --- a/build.sh +++ b/build.sh @@ -7,13 +7,13 @@ GITTAG=$(git tag --points-at HEAD) # Build the specification draft document echo "Building spec draft" mkdir -p public/draft -spec-md --githubSource "https://github.com/graphql/graphql-spec/blame/main/" spec/GraphQL.md > public/draft/index.html +spec-md --metadata spec/metadata.json --githubSource "https://github.com/graphql/graphql-spec/blame/main/" spec/GraphQL.md > public/draft/index.html # If this is a tagged commit, also build the release document if [ -n "$GITTAG" ]; then echo "Building spec release $GITTAG" mkdir -p "public/$GITTAG" - spec-md --githubSource "https://github.com/graphql/graphql-spec/blame/$GITTAG/" spec/GraphQL.md > "public/$GITTAG/index.html" + spec-md --metadata spec/metadata.json --githubSource "https://github.com/graphql/graphql-spec/blame/$GITTAG/" spec/GraphQL.md > "public/$GITTAG/index.html" fi # Create the index file diff --git a/package.json b/package.json index 45f0b1983..00e6426c0 100644 --- a/package.json +++ b/package.json @@ -14,7 +14,7 @@ }, "scripts": { "test": "npm run test:build && npm run test:spellcheck", - "test:build": "spec-md spec/GraphQL.md > /dev/null", + "test:build": "spec-md --metadata spec/metadata.json spec/GraphQL.md > /dev/null", "test:spellcheck": "cspell 'spec/**/*.md' README.md", "format": "prettier --write '**/*.{md,yml,yaml,json}'", "format:check": "prettier --check '**/*.{md,yml,yaml,json}'", diff --git a/spec/Appendix B -- Grammar Summary.md b/spec/Appendix B -- Grammar Summary.md index c40662504..2291ee35f 100644 --- a/spec/Appendix B -- Grammar Summary.md +++ b/spec/Appendix B -- Grammar Summary.md @@ -2,12 +2,7 @@ ## Source Text -SourceCharacter :: - -- "U+0009" -- "U+000A" -- "U+000D" -- "U+0020–U+FFFF" +SourceCharacter :: "Any Unicode scalar value" ## Ignored Tokens @@ -113,7 +108,16 @@ StringCharacter :: - `\u` EscapedUnicode - `\` EscapedCharacter -EscapedUnicode :: /[0-9A-Fa-f]{4}/ 
+EscapedUnicode :: + +- `{` HexDigit+ `}` +- HexDigit HexDigit HexDigit HexDigit + +HexDigit :: one of + +- `0` `1` `2` `3` `4` `5` `6` `7` `8` `9` +- `A` `B` `C` `D` `E` `F` +- `a` `b` `c` `d` `e` `f` EscapedCharacter :: one of `"` `\` `/` `b` `f` `n` `r` `t` diff --git a/spec/Section 2 -- Language.md b/spec/Section 2 -- Language.md index c89aaf05e..a687d11aa 100644 --- a/spec/Section 2 -- Language.md +++ b/spec/Section 2 -- Language.md @@ -45,32 +45,22 @@ match, however some lookahead restrictions include additional constraints. ## Source Text -SourceCharacter :: +SourceCharacter :: "Any Unicode scalar value" -- "U+0009" -- "U+000A" -- "U+000D" -- "U+0020–U+FFFF" +GraphQL documents are interpreted from a source text, which is a sequence of +{SourceCharacter}, each {SourceCharacter} being a _Unicode scalar value_ which +may be any Unicode code point from U+0000 to U+D7FF or U+E000 to U+10FFFF +(informally referred to as _"characters"_ through most of this specification). -GraphQL documents are expressed as a sequence of -[Unicode](https://unicode.org/standard/standard.html) code points (informally -referred to as _"characters"_ through most of this specification). However, with -few exceptions, most of GraphQL is expressed only in the original non-control -ASCII range so as to be as widely compatible with as many existing tools, -languages, and serialization formats as possible and avoid display issues in -text editors and source control. +A GraphQL document may be expressed only in the ASCII range to be as widely +compatible with as many existing tools, languages, and serialization formats as +possible and avoid display issues in text editors and source control. Non-ASCII +Unicode scalar values may appear within {StringValue} and {Comment}. -Note: Non-ASCII Unicode characters may appear freely within {StringValue} and -{Comment} portions of GraphQL. 
- -### Unicode - -UnicodeBOM :: "Byte Order Mark (U+FEFF)" - -The "Byte Order Mark" is a special Unicode character which may appear at the -beginning of a file containing Unicode which programs may use to determine the -fact that the text stream is Unicode, what endianness the text stream is in, and -which of several Unicode encodings to interpret. +Note: An implementation which uses _UTF-16_ to represent GraphQL documents in +memory (for example, JavaScript or Java) may encounter a _surrogate pair_. This +encodes one _supplementary code point_ and is a single valid source character, +however an unpaired _surrogate code point_ is not a valid source character. ### White Space @@ -115,10 +105,9 @@ CommentChar :: SourceCharacter but not LineTerminator GraphQL source documents may contain single-line comments, starting with the {`#`} marker. -A comment can contain any Unicode code point in {SourceCharacter} except -{LineTerminator} so a comment always consists of all code points starting with -the {`#`} character up to but not including the {LineTerminator} (or end of the -source). +A comment may contain any {SourceCharacter} except {LineTerminator} so a comment +always consists of all {SourceCharacter} starting with the {`#`} character up to +but not including the {LineTerminator} (or end of the source). Comments are {Ignored} like white space and may appear after any token, or before a {LineTerminator}, and have no significance to the semantic meaning of a @@ -175,6 +164,16 @@ significant way, for example a {StringValue} may contain white space characters. No {Ignored} may appear _within_ a {Token}, for example no white space characters are permitted between the characters defining a {FloatValue}. 
+**Byte order mark** + +UnicodeBOM :: "Byte Order Mark (U+FEFF)" + +The _Byte Order Mark_ is a special Unicode code point which may appear at the +beginning of a file which programs may use to determine the fact that the text +stream is Unicode, and what specific encoding has been used. As files are often +concatenated, a _Byte Order Mark_ may appear before or after any lexical token +and is {Ignored}. + ### Punctuators Punctuator :: one of ! $ & ( ) ... : = @ [ ] { | } @@ -812,7 +811,16 @@ StringCharacter :: - `\u` EscapedUnicode - `\` EscapedCharacter -EscapedUnicode :: /[0-9A-Fa-f]{4}/ +EscapedUnicode :: + +- `{` HexDigit+ `}` +- HexDigit HexDigit HexDigit HexDigit + +HexDigit :: one of + +- `0` `1` `2` `3` `4` `5` `6` `7` `8` `9` +- `A` `B` `C` `D` `E` `F` +- `a` `b` `c` `d` `e` `f` EscapedCharacter :: one of `"` `\` `/` `b` `f` `n` `r` `t` @@ -821,19 +829,57 @@ BlockStringCharacter :: - SourceCharacter but not `"""` or `\"""` - `\"""` -Strings are sequences of characters wrapped in quotation marks (U+0022). (ex. -{`"Hello World"`}). White space and other otherwise-ignored characters are -significant within a string value. +A {StringValue} is evaluated to a _Unicode text_ value, a sequence of _Unicode +scalar value_, by interpreting all escape sequences using the static semantics +defined below. White space and other characters ignored between lexical tokens +are significant within a string value. The empty string {`""`} must not be followed by another {`"`} otherwise it would be interpreted as the beginning of a block string. As an example, the source {`""""""`} can only be interpreted as a single empty block string and not three empty strings. -Non-ASCII Unicode characters are allowed within single-quoted strings. Since -{SourceCharacter} must not contain some ASCII control characters, escape -sequences must be used to represent these characters. The {`\`}, {`"`} -characters also must be escaped. All other escape sequences are optional. 
+**Escape Sequences** + +In a single-quoted {StringValue}, any _Unicode scalar value_ may be expressed +using an escape sequence. GraphQL strings allow both C-style escape sequences +(for example `\n`) and two forms of Unicode escape sequences: one with a +fixed-width of 4 hexadecimal digits (for example `\u000A`) and one with a +variable-width most useful for representing a _supplementary character_ such as +an Emoji (for example `\u{1F4A9}`). + +The hexadecimal number encoded by a Unicode escape sequence must describe a +_Unicode scalar value_, otherwise a parse error must result. For example both +sources `"\uDEAD"` and `"\u{110000}"` should not be considered valid +{StringValue}. + +Escape sequences are only meaningful within a single-quoted string. Within a +block string, they are simply that sequence of characters (for example +`"""\n"""` represents the _Unicode text_ [U+005C, U+006E]). Within a comment an +escape sequence is not a significant sequence of characters. They may not appear +elsewhere in a GraphQL document. + +Since {StringCharacter} must not contain some code points directly (for example, +a {LineTerminator}), escape sequences must be used to represent them. All other +escape sequences are optional and unescaped non-ASCII Unicode characters are +allowed within strings. If using GraphQL within a system which only supports +ASCII, then escape sequences may be used to represent all Unicode characters +outside of the ASCII range. + +For legacy reasons, a _supplementary character_ may be escaped by two +fixed-width unicode escape sequences forming a _surrogate pair_. For example the +input `"\uD83D\uDCA9"` is a valid {StringValue} which represents the same +_Unicode text_ as `"\u{1F4A9}"`. While this legacy form is allowed, it should be +avoided as a variable-width unicode escape sequence is a clearer way to encode +such code points. 
+When producing a {StringValue}, implementations should use escape sequences to +represent non-printable control characters (U+0000 to U+001F and U+007F to +U+009F). Other escape sequences are not necessary, however an implementation may +use escape sequences to represent any other range of code points (for example, +when producing ASCII-only output). If an implementation chooses to escape a +_supplementary character_, it should only use a variable-width unicode escape +sequence. **Block Strings** @@ -889,7 +935,15 @@ Note: If non-printable ASCII characters are needed in a string value, a standard quoted string with appropriate escape sequences must be used instead of a block string. -**Semantics** +**Static Semantics** + +:: A {StringValue} describes a _Unicode text_ value, which is a sequence of +_Unicode scalar value_. + +These semantics describe how to apply the {StringValue} grammar to a source text +to evaluate a _Unicode text_. Errors encountered during this evaluation are +considered a failure to apply the {StringValue} grammar to a source and must +result in a parsing error. StringValue :: `""` @@ -897,43 +951,68 @@ StringValue :: `""` StringValue :: `"` StringCharacter+ `"` -- Return the sequence of all {StringCharacter} code points. +- Return the _Unicode text_ by concatenating the evaluation of all + {StringCharacter}. StringCharacter :: SourceCharacter but not `"` or `\` or LineTerminator -- Return the code point {SourceCharacter}. +- Return the _Unicode scalar value_ {SourceCharacter}. StringCharacter :: `\u` EscapedUnicode -- Let {value} be the 16-bit hexadecimal value represented by the sequence of - hexadecimal digits within {EscapedUnicode}. -- Return the code point {value}. +- Let {value} be the hexadecimal value represented by the sequence of {HexDigit} + within {EscapedUnicode}. +- Assert {value} is within the _Unicode scalar value_ range (>= 0x0000 and <= + 0xD7FF or >= 0xE000 and <= 0x10FFFF). +- Return the _Unicode scalar value_ {value}. 
+ +StringCharacter :: `\u` HexDigit HexDigit HexDigit HexDigit `\u` HexDigit +HexDigit HexDigit HexDigit + +- Let {leadingValue} be the hexadecimal value represented by the first sequence + of {HexDigit}. +- Let {trailingValue} be the hexadecimal value represented by the second + sequence of {HexDigit}. +- If {leadingValue} is >= 0xD800 and <= 0xDBFF (a _Leading Surrogate_): + - Assert {trailingValue} is >= 0xDC00 and <= 0xDFFF (a _Trailing Surrogate_). + - Return ({leadingValue} - 0xD800) × 0x400 + ({trailingValue} - 0xDC00) + + 0x10000. +- Otherwise: + - Assert {leadingValue} is within the _Unicode scalar value_ range. + - Assert {trailingValue} is within the _Unicode scalar value_ range. + - Return the sequence of the _Unicode scalar value_ {leadingValue} followed by + the _Unicode scalar value_ {trailingValue}. + +Note: If both escape sequences encode a _Unicode scalar value_, then this +semantic is identical to applying the prior semantic on each fixed-width escape +sequence. A variable-width escape sequence must only encode a _Unicode scalar +value_. StringCharacter :: `\` EscapedCharacter -- Return the code point represented by {EscapedCharacter} according to the table - below. +- Return the _Unicode scalar value_ represented by {EscapedCharacter} according + to the table below. 
-| Escaped Character | Code Point | Character Name | -| ----------------- | ---------- | ---------------------------- | -| {`"`} | U+0022 | double quote | -| {`\`} | U+005C | reverse solidus (back slash) | -| {`/`} | U+002F | solidus (forward slash) | -| {`b`} | U+0008 | backspace | -| {`f`} | U+000C | form feed | -| {`n`} | U+000A | line feed (new line) | -| {`r`} | U+000D | carriage return | -| {`t`} | U+0009 | horizontal tab | +| Escaped Character | Scalar Value | Character Name | +| ----------------- | ------------ | ---------------------------- | +| {`"`} | U+0022 | double quote | +| {`\`} | U+005C | reverse solidus (back slash) | +| {`/`} | U+002F | solidus (forward slash) | +| {`b`} | U+0008 | backspace | +| {`f`} | U+000C | form feed | +| {`n`} | U+000A | line feed (new line) | +| {`r`} | U+000D | carriage return | +| {`t`} | U+0009 | horizontal tab | StringValue :: `"""` BlockStringCharacter\* `"""` -- Let {rawValue} be the Unicode character sequence of all {BlockStringCharacter} - Unicode character values (which may be an empty sequence). +- Let {rawValue} be the _Unicode text_ by concatenating the evaluation of all + {BlockStringCharacter} (which may be an empty sequence). - Return the result of {BlockStringValue(rawValue)}. BlockStringCharacter :: SourceCharacter but not `"""` or `\"""` -- Return the character value of {SourceCharacter}. +- Return the _Unicode scalar value_ {SourceCharacter}. 
BlockStringCharacter :: `\"""` diff --git a/spec/metadata.json b/spec/metadata.json new file mode 100644 index 000000000..553d56e06 --- /dev/null +++ b/spec/metadata.json @@ -0,0 +1,15 @@ +{ + "biblio": { + "https://www.unicode.org/glossary": { + "byte-order-mark": "#byte_order_mark", + "leading-surrogate": "#leading_surrogate", + "trailing-surrogate": "#trailing_surrogate", + "supplementary-character": "#supplementary_character", + "supplementary-code-point": "#supplementary_code_point", + "surrogate-code-point": "#surrogate_code_point", + "surrogate-pair": "#surrogate_pair", + "unicode-scalar-value": "#unicode_scalar_value", + "utf-16": "#UTF_16" + } + } +}