From 666ebe4bd4cd6543a947afa9dbd24bf0066651eb Mon Sep 17 00:00:00 2001 From: Andreas Marek Date: Sun, 9 Feb 2020 04:35:53 +1100 Subject: [PATCH 1/6] add full unicode support --- src/language/__tests__/lexer-test.js | 33 +++++++++++ src/language/lexer.js | 88 +++++++++++++++++++++++----- 2 files changed, 105 insertions(+), 16 deletions(-) diff --git a/src/language/__tests__/lexer-test.js b/src/language/__tests__/lexer-test.js index 2d6e6b11fc..0fa3053ef1 100644 --- a/src/language/__tests__/lexer-test.js +++ b/src/language/__tests__/lexer-test.js @@ -268,6 +268,20 @@ describe('Lexer', () => { end: 34, value: 'unicode \u1234\u5678\u90AB\uCDEF', }); + + expect(lexOne('"string with unicode code point outside BMP 😀"')).to.contain({ + kind: TokenKind.STRING, + start: 0, + end: 47, + value: 'string with unicode code point outside BMP 😀', + }); + + expect(lexOne('"string with unicode code point outside BMP escaped \\uD83D\\uDE00"')).to.contain({ + kind: TokenKind.STRING, + start: 0, + end: 65, + value: 'string with unicode code point outside BMP escaped 😀', + }); }); it('lex reports useful string errors', () => { @@ -353,6 +367,17 @@ describe('Lexer', () => { message: 'Syntax Error: Invalid character escape sequence: \\uXXXF.', locations: [{ line: 1, column: 7 }], }); + + expectSyntaxError('"bad \\uDEAD esc"').to.deep.equal({ + message: 'Syntax Error: Invalid surrogate pair escape sequence: \\uDEAD.', + locations: [{ line: 1, column: 7 }], + }); + + expectSyntaxError('"bad \\uD83D\\uDBFF esc"').to.deep.equal({ + message: 'Syntax Error: Invalid surrogate pair escape sequence: \\uD83D\\uDBFF.', + locations: [{ line: 1, column: 7 }], + }); + }); it('lexes block strings', () => { @@ -412,6 +437,14 @@ describe('Lexer', () => { value: 'unescaped \\n\\r\\b\\t\\f\\u1234', }); + expect(lexOne('"""unescaped unicode outside BMP 😀"""')).to.contain({ + kind: TokenKind.BLOCK_STRING, + start: 0, + end: 38, + value: 'unescaped unicode outside BMP 😀', + }); + + expect(lexOne('"""slashes \\\\ \\/"""')).to.contain({ kind: TokenKind.BLOCK_STRING, start: 0, diff --git a/src/language/lexer.js b/src/language/lexer.js index b6ab501308..5c969fc9fa 100644 --- a/src/language/lexer.js +++ b/src/language/lexer.js @@ -511,22 +511,9 @@ function readString(source, start, line, col, prev): Token { break; case 117: { // uXXXX - const charCode = uniCharCode( - body.charCodeAt(position + 1), - body.charCodeAt(position + 2), - body.charCodeAt(position + 3), - body.charCodeAt(position + 4), - ); - if (charCode < 0) { - const invalidSequence = body.slice(position + 1, position + 5); - throw syntaxError( - source, - position, - `Invalid character escape sequence: \\u${invalidSequence}.`, - ); - } - value += String.fromCharCode(charCode); - position += 4; + const convertedEscape = convertUnicodeEscape(source, body, position); + value += convertedEscape.value; + position += convertedEscape.positionIncrease; break; } default: @@ -546,6 +533,75 @@ function readString(source, start, line, col, prev): Token { throw syntaxError(source, position, 'Unterminated string.'); } +function convertUnicodeEscape(source, body, position) { + const charCode = uniCharCode( + body.charCodeAt(position + 1), + body.charCodeAt(position + 2), + body.charCodeAt(position + 3), + body.charCodeAt(position + 4), + ); + if (charCode < 0) { + const invalidSequence = body.slice(position + 1, position + 5); + throw syntaxError( + source, + position, + `Invalid character escape sequence: \\u${invalidSequence}.`, + ); + } + + let value; + let positionIncrease; + // String.fromCharCode doesn't fail for invalid surrogate pairs, therefore + // it is manually verified here + if (isTrailingSurrogate(charCode)) { + const invalidSequence = body.slice(position + 1, position + 5); + throw syntaxError( + source, + position, + `Invalid surrogate pair escape sequence: \\u${invalidSequence}.`, + ); + } + if (isLeadingSurrogate(charCode)) { + if (body.charCodeAt(position + 5) !== 92 || + body.charCodeAt(position + 6) !== 117) { + const invalidSequence = body.slice(position + 1, position + 7); + throw syntaxError( + source, + position, + `Invalid surrogate pair escape sequence: \\u${invalidSequence}.`, + ); + } + const trailingSurrogate = uniCharCode( + body.charCodeAt(position + 7), + body.charCodeAt(position + 8), + body.charCodeAt(position + 9), + body.charCodeAt(position + 10), + ); + if (!isTrailingSurrogate(trailingSurrogate)) { + const invalidSequence = body.slice(position + 1, position + 11); + throw syntaxError( + source, + position, + `Invalid surrogate pair escape sequence: \\u${invalidSequence}.`, + ); + } + value = String.fromCharCode(charCode, trailingSurrogate); + positionIncrease = 10; + } else { + value = String.fromCharCode(charCode); + positionIncrease = 4; + } + return { value, positionIncrease }; +} + +function isLeadingSurrogate(charCode) { + return 0xD800 <= charCode && charCode <= 0xDBFF; +} + +function isTrailingSurrogate(charCode) { + return 0xDC00 <= charCode && charCode <= 0xDFFF; +} + /** * Reads a block string token from the source file. * From 2ca116526e0894f3c23d761f2af87d631fad259d Mon Sep 17 00:00:00 2001 From: Andreas Marek Date: Sun, 9 Feb 2020 04:45:14 +1100 Subject: [PATCH 2/6] apply prettier --- src/language/__tests__/lexer-test.js | 15 ++++++++++----- src/language/lexer.js | 10 ++++++---- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/language/__tests__/lexer-test.js b/src/language/__tests__/lexer-test.js index 0fa3053ef1..819d5a0a45 100644 --- a/src/language/__tests__/lexer-test.js +++ b/src/language/__tests__/lexer-test.js @@ -269,14 +269,20 @@ describe('Lexer', () => { value: 'unicode \u1234\u5678\u90AB\uCDEF', }); - expect(lexOne('"string with unicode code point outside BMP 😀"')).to.contain({ + expect( + lexOne('"string with unicode code point outside BMP 😀"'), + ).to.contain({ kind: TokenKind.STRING, start: 0, end: 47, value: 'string with unicode code point outside BMP 😀', }); - expect(lexOne('"string with unicode code point outside BMP escaped \\uD83D\\uDE00"')).to.contain({ + expect( + lexOne( + '"string with unicode code point outside BMP escaped \\uD83D\\uDE00"', + ), + ).to.contain({ kind: TokenKind.STRING, start: 0, end: 65, @@ -374,10 +380,10 @@ describe('Lexer', () => { }); expectSyntaxError('"bad \\uD83D\\uDBFF esc"').to.deep.equal({ - message: 'Syntax Error: Invalid surrogate pair escape sequence: \\uD83D\\uDBFF.', + message: + 'Syntax Error: Invalid surrogate pair escape sequence: \\uD83D\\uDBFF.', locations: [{ line: 1, column: 7 }], }); - }); it('lexes block strings', () => { @@ -444,7 +450,6 @@ describe('Lexer', () => { value: 'unescaped unicode outside BMP 😀', }); - expect(lexOne('"""slashes \\\\ \\/"""')).to.contain({ kind: TokenKind.BLOCK_STRING, start: 0, diff --git a/src/language/lexer.js b/src/language/lexer.js index 5c969fc9fa..191696f6ca 100644 --- a/src/language/lexer.js +++ b/src/language/lexer.js @@ -562,8 +562,10 @@ function convertUnicodeEscape(source, body, position) { ); } if (isLeadingSurrogate(charCode)) { - if (body.charCodeAt(position + 5) !== 92 || - body.charCodeAt(position + 6) !== 117) { + if ( + body.charCodeAt(position + 5) !== 92 || + body.charCodeAt(position + 6) !== 117 + ) { const invalidSequence = body.slice(position + 1, position + 7); throw syntaxError( source, @@ -595,11 +597,11 @@ function convertUnicodeEscape(source, body, position) { } function isLeadingSurrogate(charCode) { - return 0xD800 <= charCode && charCode <= 0xDBFF; + return 0xd800 <= charCode && charCode <= 0xdbff; } function isTrailingSurrogate(charCode) { - return 0xDC00 <= charCode && charCode <= 0xDFFF; + return 0xdc00 <= charCode && charCode <= 0xdfff; } /** From 386788f3b40752a1166878fbe4a72de47baf9a77 Mon Sep 17 00:00:00 2001 From: Andreas Marek Date: Sun, 9 Feb 2020 11:25:25 +1100 Subject: [PATCH 3/6] fix test --- src/language/lexer.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/language/lexer.js b/src/language/lexer.js index 191696f6ca..96e78b15ce 100644 --- a/src/language/lexer.js +++ b/src/language/lexer.js @@ -597,11 +597,11 @@ function convertUnicodeEscape(source, body, position) { } function isLeadingSurrogate(charCode) { - return 0xd800 <= charCode && charCode <= 0xdbff; + return charCode >= 0xd800 && charCode <= 0xdbff; } function isTrailingSurrogate(charCode) { - return 0xdc00 <= charCode && charCode <= 0xdfff; + return charCode >= 0xdc00 && charCode <= 0xdfff; } /** From b437de925a0729f24e995d5ee5a84ae334ae7dd5 Mon Sep 17 00:00:00 2001 From: Andreas Marek Date: Sun, 9 Feb 2020 11:34:33 +1100 Subject: [PATCH 4/6] more tests --- src/language/__tests__/lexer-test.js | 44 ++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/src/language/__tests__/lexer-test.js b/src/language/__tests__/lexer-test.js index 819d5a0a45..0bcdfa7b76 100644 --- a/src/language/__tests__/lexer-test.js +++ b/src/language/__tests__/lexer-test.js @@ -288,6 +288,50 @@ describe('Lexer', () => { end: 65, value: 'string with unicode code point outside BMP escaped 😀', }); + + expect( + lexOne( + '"string with unicode code point outside BMP escaped \\uD800\\uDC00"', + ), + ).to.contain({ + kind: TokenKind.STRING, + start: 0, + end: 65, + value: 'string with unicode code point outside BMP escaped \uD800\uDC00', + }); + + expect( + lexOne( + '"string with unicode code point outside BMP escaped \\uDBFF\\uDC00"', + ), + ).to.contain({ + kind: TokenKind.STRING, + start: 0, + end: 65, + value: 'string with unicode code point outside BMP escaped \uDBFF\uDC00', + }); + + expect( + lexOne( + '"string with unicode code point outside BMP escaped \\uDBFF\\uDFFF"', + ), + ).to.contain({ + kind: TokenKind.STRING, + start: 0, + end: 65, + value: 'string with unicode code point outside BMP escaped \uDBFF\uDFFF', + }); + + expect( + lexOne( + '"string with unicode code point outside BMP escaped \\uD800\\uDFFF"', + ), + ).to.contain({ + kind: TokenKind.STRING, + start: 0, + end: 65, + value: 'string with unicode code point outside BMP escaped \uD800\uDFFF', + }); }); it('lex reports useful string errors', () => { From f41b3c7755b2af14dd007baea2f0cd1bd9dcc54e Mon Sep 17 00:00:00 2001 From: Andreas Marek Date: Wed, 19 Feb 2020 18:50:26 +1100 Subject: [PATCH 5/6] add test --- src/language/__tests__/lexer-test.js | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/language/__tests__/lexer-test.js b/src/language/__tests__/lexer-test.js index 0bcdfa7b76..f826e8bba6 100644 --- a/src/language/__tests__/lexer-test.js +++ b/src/language/__tests__/lexer-test.js @@ -423,6 +423,12 @@ describe('Lexer', () => { locations: [{ line: 1, column: 7 }], }); + expectSyntaxError('"bad \\uD83D\\noEscape"').to.deep.equal({ + message: + 'Syntax Error: Invalid surrogate pair escape sequence: \\uD83D\\n.', + locations: [{ line: 1, column: 7 }], + }); + expectSyntaxError('"bad \\uD83D\\uDBFF esc"').to.deep.equal({ message: 'Syntax Error: Invalid surrogate pair escape sequence: \\uD83D\\uDBFF.', From 01f055a13f6c9286f5366ea59a2ed25edc69443b Mon Sep 17 00:00:00 2001 From: Lee Byron Date: Fri, 23 Apr 2021 01:09:35 -0700 Subject: [PATCH 6/6] Add full unicode spec change support * Requires surrogate pairs, regardless of if they are escaped * Support braced unicode escapes * Improved error messages with more tests --- src/language/__tests__/lexer-test.js | 133 +++++++++++-- src/language/lexer.js | 275 +++++++++++++++------------ 2 files changed, 271 insertions(+), 137 deletions(-) diff --git a/src/language/__tests__/lexer-test.js b/src/language/__tests__/lexer-test.js index 98c25e8387..28f75eeb81 100644 --- a/src/language/__tests__/lexer-test.js +++ b/src/language/__tests__/lexer-test.js @@ -263,6 +263,31 @@ describe('Lexer', () => { value: 'unicode \u1234\u5678\u90AB\uCDEF', }); + expect(lexOne('"unicode \\u{1234}\\u{5678}\\u{90AB}\\u{CDEF}"')).to.contain( + { + kind: TokenKind.STRING, + start: 0, + end: 42, + value: 'unicode \u1234\u5678\u90AB\uCDEF', + }, + ); + + expect( + lexOne('"string with unicode escape outside BMP \\u{1F600}"'), + ).to.contain({ + kind: TokenKind.STRING, + start: 0, + end: 50, + value: 'string with unicode escape outside BMP 😀', + }); + + expect(lexOne('"unicode \\u{10FFFF}"')).to.contain({ + kind: TokenKind.STRING, + start: 0, + end: 20, + value: 'unicode \u{10FFFF}', + }); + expect( lexOne('"string with unicode code point outside BMP 😀"'), ).to.contain({ @@ -378,55 +403,135 @@ describe('Lexer', () => { }); expectSyntaxError('"bad \\z esc"').to.deep.equal({ - message: 'Syntax Error: Invalid character escape sequence: \\z.', + message: 'Syntax Error: Invalid character escape sequence: "\\z".', locations: [{ line: 1, column: 7 }], }); expectSyntaxError('"bad \\x esc"').to.deep.equal({ - message: 'Syntax Error: Invalid character escape sequence: \\x.', + message: 'Syntax Error: Invalid character escape sequence: "\\x".', locations: [{ line: 1, column: 7 }], }); expectSyntaxError('"bad \\u1 esc"').to.deep.equal({ - message: 'Syntax Error: Invalid character escape sequence: \\u1 es.', + message: 'Syntax Error: Invalid Unicode escape sequence: "\\u1 es".', + locations: [{ line: 1, column: 7 }], + }); + + expectSyntaxError('"bad \\u1"').to.deep.equal({ + message: 'Syntax Error: Invalid Unicode escape sequence: "\\u1".', locations: [{ line: 1, column: 7 }], }); expectSyntaxError('"bad \\u0XX1 esc"').to.deep.equal({ - message: 'Syntax Error: Invalid character escape sequence: \\u0XX1.', + message: 'Syntax Error: Invalid Unicode escape sequence: "\\u0XX1".', locations: [{ line: 1, column: 7 }], }); expectSyntaxError('"bad \\uXXXX esc"').to.deep.equal({ - message: 'Syntax Error: Invalid character escape sequence: \\uXXXX.', + message: 'Syntax Error: Invalid Unicode escape sequence: "\\uXXXX".', locations: [{ line: 1, column: 7 }], }); expectSyntaxError('"bad \\uFXXX esc"').to.deep.equal({ - message: 'Syntax Error: Invalid character escape sequence: \\uFXXX.', + message: 'Syntax Error: Invalid Unicode escape sequence: "\\uFXXX".', locations: [{ line: 1, column: 7 }], }); expectSyntaxError('"bad \\uXXXF esc"').to.deep.equal({ - message: 'Syntax Error: Invalid character escape sequence: \\uXXXF.', + message: 'Syntax Error: Invalid Unicode escape sequence: "\\uXXXF".', locations: [{ line: 1, column: 7 }], }); - expectSyntaxError('"bad \\uDEAD esc"').to.deep.equal({ - message: 'Syntax Error: Invalid surrogate pair escape sequence: \\uDEAD.', + expectSyntaxError('"bad \\u{} esc"').to.deep.equal({ + message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{}".', locations: [{ line: 1, column: 7 }], }); - expectSyntaxError('"bad \\uD83D\\noEscape"').to.deep.equal({ - message: - 'Syntax Error: Invalid surrogate pair escape sequence: \\uD83D\\n.', + expectSyntaxError('"bad \\u{XXXF} esc"').to.deep.equal({ + message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{XXXF}".', + locations: [{ line: 1, column: 7 }], + }); + + expectSyntaxError('"bad \\u{XXXF esc"').to.deep.equal({ + message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{XXXF es".', locations: [{ line: 1, column: 7 }], }); + expectSyntaxError('"bad \\u{X"').to.deep.equal({ + message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{X".', + locations: [{ line: 1, column: 7 }], + }); + + expectSyntaxError('"bad \\u{XXXF e}scape"').to.deep.equal({ + message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{XXXF e}".', + locations: [{ line: 1, column: 7 }], + }); + + expectSyntaxError('"bad \\u{110000} esc"').to.deep.equal({ + message: 'Syntax Error: Undefined Unicode code-point: "\\u{110000}".', + locations: [{ line: 1, column: 7 }], + }); + + expectSyntaxError('"bad \uDEAD esc"').to.deep.equal({ + message: 'Syntax Error: Invalid low surrogate within String: "\\uDEAD".', + locations: [{ line: 1, column: 6 }], + }); + + expectSyntaxError('"bad \\uDEAD esc"').to.deep.equal({ + message: 'Syntax Error: Invalid low surrogate within String: "\\uDEAD".', + locations: [{ line: 1, column: 6 }], + }); + + expectSyntaxError('"bad \\u{DEAD} esc"').to.deep.equal({ + message: 'Syntax Error: Invalid low surrogate within String: "\\uDEAD".', + locations: [{ line: 1, column: 6 }], + }); + + expectSyntaxError('"bad \uD83D esc"').to.deep.equal({ + message: + 'Syntax Error: Invalid high surrogate "\\uD83D" followed by a non-low surrogate " " in String.', + locations: [{ line: 1, column: 6 }], + }); + + expectSyntaxError('"bad \\uD83D esc"').to.deep.equal({ + message: + 'Syntax Error: Invalid high surrogate "\\uD83D" followed by a non-low surrogate " " in String.', + locations: [{ line: 1, column: 6 }], + }); + + expectSyntaxError('"bad \\u{D83D} esc"').to.deep.equal({ + message: + 'Syntax Error: Invalid high surrogate "\\uD83D" followed by a non-low surrogate " " in String.', + locations: [{ line: 1, column: 6 }], + }); + + expectSyntaxError('"bad \uD83D\uDBFF esc"').to.deep.equal({ + message: + 'Syntax Error: Invalid high surrogate "\\uD83D" followed by a non-low surrogate "\\uDBFF" in String.', + locations: [{ line: 1, column: 6 }], + }); + expectSyntaxError('"bad \\uD83D\\uDBFF esc"').to.deep.equal({ message: - 'Syntax Error: Invalid surrogate pair escape sequence: \\uD83D\\uDBFF.', - locations: [{ line: 1, column: 7 }], + 'Syntax Error: Invalid high surrogate "\\uD83D" followed by a non-low surrogate "\\uDBFF" in String.', + locations: [{ line: 1, column: 6 }], + }); + + expectSyntaxError('"bad \uD83D\\uDBFF esc"').to.deep.equal({ + message: + 'Syntax Error: Invalid high surrogate "\\uD83D" followed by a non-low surrogate "\\uDBFF" in String.', + locations: [{ line: 1, column: 6 }], + }); + + expectSyntaxError('"bad \\uD83D\uDBFF esc"').to.deep.equal({ + message: + 'Syntax Error: Invalid high surrogate "\\uD83D" followed by a non-low surrogate "\\uDBFF" in String.', + locations: [{ line: 1, column: 6 }], + }); + + expectSyntaxError('"bad \\uD83D\\escape"').to.deep.equal({ + message: 'Syntax Error: Invalid character escape sequence: "\\e".', + locations: [{ line: 1, column: 13 }], }); }); diff --git a/src/language/lexer.js b/src/language/lexer.js index ec91f2248e..2b09f9008d 100644 --- a/src/language/lexer.js +++ b/src/language/lexer.js @@ -425,7 +425,7 @@ function readDigits(source: Source, start: number, firstCode: number): number { /** * Reads a string token from the source file. * - * "([^"\\\u000A\u000D]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*" + * "([^"\\\u000A\u000D]|(\\(u([0-9a-fA-F]{4}|\{[0-9a-fA-F]{1,6}\})|["\\/bfnrt])))*" */ function readString( source: Source, @@ -439,6 +439,7 @@ function readString( let chunkStart = position; let code = 0; let value = ''; + let isSurrogatePair = false; while ( position < body.length && @@ -470,129 +471,173 @@ function readString( ); } - ++position; + let codeSize; + // Escape Sequence (\) if (code === 92) { - // \ - value += body.slice(chunkStart, position - 1); - code = body.charCodeAt(position); - switch (code) { - case 34: - value += '"'; - break; - case 47: - value += '/'; - break; - case 92: - value += '\\'; - break; - case 98: - value += '\b'; - break; - case 102: - value += '\f'; - break; - case 110: - value += '\n'; - break; - case 114: - value += '\r'; - break; - case 116: - value += '\t'; - break; - case 117: { - // uXXXX - const convertedEscape = convertUnicodeEscape(source, body, position); - value += convertedEscape.value; - position += convertedEscape.positionIncrease; - break; - } - default: - throw syntaxError( - source, - position, - `Invalid character escape sequence: \\${String.fromCharCode( - code, - )}.`, - ); + value += body.slice(chunkStart, position); + const escape = readEscapeSequence(source, position); + code = escape.code; + codeSize = escape.size; + value += escape.value; + chunkStart = position + codeSize; + } else { + codeSize = 1; + } + + // Surrogate Pairs + // The specification semantics call for replacing surrogate pairs with valid + // non-BMP Unicode code points. However since JS strings encode non-BMP code + // points as surrogate pairs anyhow, this simply validates those pairs. + if (code >= 0xd800 && code <= 0xdbff) { + let nextCode = body.charCodeAt(position + codeSize); + if (nextCode === 92) { + nextCode = readEscapeSequence(source, position + codeSize).code; } - ++position; - chunkStart = position; + // A High Surrogate must be followed by a Low Surrogate. + if (nextCode < 0xdc00 || nextCode > 0xdfff) { + throw syntaxError( + source, + position, + `Invalid high surrogate ${printCharCode( + code, + )} followed by a non-low surrogate ${printCharCode( + nextCode, + )} in String.`, + ); + } + isSurrogatePair = true; + } else if (code >= 0xdc00 && code <= 0xdfff) { + // A Low Surrogate must follow a High Surrogate. + if (!isSurrogatePair) { + throw syntaxError( + source, + position, + `Invalid low surrogate within String: ${printCharCode(code)}.`, + ); + } + isSurrogatePair = false; } + + position += codeSize; } throw syntaxError(source, position, 'Unterminated string.'); } -function convertUnicodeEscape(source, body, position) { - const charCode = uniCharCode( - body.charCodeAt(position + 1), - body.charCodeAt(position + 2), - body.charCodeAt(position + 3), - body.charCodeAt(position + 4), - ); - if (charCode < 0) { - const invalidSequence = body.slice(position + 1, position + 5); - throw syntaxError( - source, - position, - `Invalid character escape sequence: \\u${invalidSequence}.`, - ); - } +// The code-point, lexed size, and string value of an escape sequence. +type EscapeSequence = {| code: number, size: number, value: string |}; - let value; - let positionIncrease; - // String.fromCharCode doesn't fail for invalid surrogate pairs, therefore - // it is manually verified here - if (isTrailingSurrogate(charCode)) { - const invalidSequence = body.slice(position + 1, position + 5); - throw syntaxError( - source, - position, - `Invalid surrogate pair escape sequence: \\u${invalidSequence}.`, - ); +/** + * | Escaped Character | Code Point | Character Name | + * | ----------------- | ---------- | ---------------------------- | + * | {`"`} | U+0022 | double quote | + * | {`\`} | U+005C | reverse solidus (back slash) | + * | {`/`} | U+002F | solidus (forward slash) | + * | {`b`} | U+0008 | backspace | + * | {`f`} | U+000C | form feed | + * | {`n`} | U+000A | line feed (new line) | + * | {`r`} | U+000D | carriage return | + * | {`t`} | U+0009 | horizontal tab | + */ +function readEscapeSequence(source: Source, pos: number): EscapeSequence { + const escapedCode = source.body.charCodeAt(pos + 1); + switch (escapedCode) { + case 34: // \" + return { code: 0x0022, size: 2, value: '"' }; + case 47: // \/ + return { code: 0x005c, size: 2, value: '/' }; + case 92: // \\ + return { code: 0x002f, size: 2, value: '\\' }; + case 98: // \b + return { code: 0x0008, size: 2, value: '\b' }; + case 102: // \f + return { code: 0x000c, size: 2, value: '\f' }; + case 110: // \n + return { code: 0x000a, size: 2, value: '\n' }; + case 114: // \r + return { code: 0x000d, size: 2, value: '\r' }; + case 116: // \t + return { code: 0x0009, size: 2, value: '\t' }; + case 117: // \u + return readEscapedUnicode(source, pos); } - if (isLeadingSurrogate(charCode)) { - if ( - body.charCodeAt(position + 5) !== 92 || - body.charCodeAt(position + 6) !== 117 - ) { - const invalidSequence = body.slice(position + 1, position + 7); - throw syntaxError( - source, - position, - `Invalid surrogate pair escape sequence: \\u${invalidSequence}.`, - ); + throw syntaxError( + source, + pos + 1, + `Invalid character escape sequence: "${source.body.slice(pos, pos + 2)}".`, + ); +} + +function readEscapedUnicode(source: Source, pos: number): EscapeSequence { + const body = source.body; + let code = 0; + let size = 2; + // A braced unicode escape "{" + if (body.charCodeAt(pos + 2) === 123) { + size++; + // A braced unicode escape cannot be larger than 10 chars. + while (size < 10) { + const charCode = body.charCodeAt(pos + size++); + // If an end quote, break with an invalid code. + if (charCode === 34) { + size--; + code = -1; + break; + } + // End brace "}" to complete the code. + if (charCode === 125) { + // If the size is only 4, the escape found no hex digits. + if (size === 4) { + code = -1; + } + break; + } else if (size === 10) { + // If this is the 10th char which is not a brace, it's an invalid code. + code = -1; + } else { + // Append this hex digit to the code point. + code = (code << 4) | char2hex(charCode); + } } - const trailingSurrogate = uniCharCode( - body.charCodeAt(position + 7), - body.charCodeAt(position + 8), - body.charCodeAt(position + 9), - body.charCodeAt(position + 10), - ); - if (!isTrailingSurrogate(trailingSurrogate)) { - const invalidSequence = body.slice(position + 1, position + 11); + // Unicode code points must be <= U+10FFFF + if (code > 0x10ffff) { throw syntaxError( source, - position, - `Invalid surrogate pair escape sequence: \\u${invalidSequence}.`, + pos + 1, + `Undefined Unicode code-point: "${body.slice(pos, pos + size)}".`, ); } - value = String.fromCharCode(charCode, trailingSurrogate); - positionIncrease = 10; } else { - value = String.fromCharCode(charCode); - positionIncrease = 4; + // A simple unicode escape is 6 chars. + while (size < 6) { + const charCode = body.charCodeAt(pos + size++); + // If an end quote, break with an invalid code. + if (charCode === 34) { + size--; + code = -1; + break; + } + // Append this hex digit to the code point. + code = (code << 4) | char2hex(charCode); + } } - return { value, positionIncrease }; -} - -function isLeadingSurrogate(charCode) { - return charCode >= 0xd800 && charCode <= 0xdbff; -} - -function isTrailingSurrogate(charCode) { - return charCode >= 0xdc00 && charCode <= 0xdfff; + // A negative code point occurs if char2hex ever encountered a non-hex digit. + if (code < 0) { + throw syntaxError( + source, + pos + 1, + `Invalid Unicode escape sequence: "${body.slice(pos, pos + size)}".`, + ); + } + // JS strings encode astral code points as surrogate pairs. + const value = + code <= 0xffff + ? String.fromCharCode(code) + : String.fromCharCode( + 0xd800 | ((code - 0x10000) >> 10), // High Surrogate + 0xdc00 | ((code - 0x10000) & 0x3ff), // Low Surrogate + ); + return { code, size, value }; } /** @@ -679,22 +724,6 @@ function readBlockString( throw syntaxError(source, position, 'Unterminated string.'); } -/** - * Converts four hexadecimal chars to the integer that the - * string represents. For example, uniCharCode('0','0','0','f') - * will return 15, and uniCharCode('0','0','f','f') returns 255. - * - * Returns a negative number on error, if a char was invalid. - * - * This is implemented by noting that char2hex() returns -1 on error, - * which means the result of ORing the char2hex() will also be negative. - */ -function uniCharCode(a: number, b: number, c: number, d: number): number { - return ( - (char2hex(a) << 12) | (char2hex(b) << 8) | (char2hex(c) << 4) | char2hex(d) - ); -} - /** * Converts a hex character to its integer value. * '0' becomes 0, '9' becomes 9