From d8a4a3ef8801784220e791888ce103137f09631b Mon Sep 17 00:00:00 2001 From: Ivan Goncharov Date: Wed, 2 Jun 2021 15:04:50 +0300 Subject: [PATCH 1/6] Lexer: Always use hexadecimal form for code values. --- src/language/lexer.ts | 266 +++++++++++++++++++++--------------------- 1 file changed, 134 insertions(+), 132 deletions(-) diff --git a/src/language/lexer.ts b/src/language/lexer.ts index c435a02bd8..8d1c0b82bc 100644 --- a/src/language/lexer.ts +++ b/src/language/lexer.ts @@ -129,18 +129,18 @@ function readToken(lexer: Lexer, prev: Token): Token { // SourceCharacter switch (code) { case 0xfeff: // - case 9: // \t - case 32: // - case 44: // , + case 0x0009: // \t + case 0x0020: // + case 0x002c: // , ++pos; continue; - case 10: // \n + case 0x000a: // \n ++pos; ++lexer.line; lexer.lineStart = pos; continue; - case 13: // \r - if (body.charCodeAt(pos + 1) === 10) { + case 0x000d: // \r + if (body.charCodeAt(pos + 1) === 0x000a) { pos += 2; } else { ++pos; @@ -148,115 +148,115 @@ function readToken(lexer: Lexer, prev: Token): Token { ++lexer.line; lexer.lineStart = pos; continue; - case 33: // ! + case 0x0021: // ! return new Token(TokenKind.BANG, pos, pos + 1, line, col, prev); - case 35: // # + case 0x0023: // # return readComment(source, pos, line, col, prev); - case 36: // $ + case 0x0024: // $ return new Token(TokenKind.DOLLAR, pos, pos + 1, line, col, prev); - case 38: // & + case 0x0026: // & return new Token(TokenKind.AMP, pos, pos + 1, line, col, prev); - case 40: // ( + case 0x0028: // ( return new Token(TokenKind.PAREN_L, pos, pos + 1, line, col, prev); - case 41: // ) + case 0x0029: // ) return new Token(TokenKind.PAREN_R, pos, pos + 1, line, col, prev); - case 46: // . + case 0x002e: // . 
if ( - body.charCodeAt(pos + 1) === 46 && - body.charCodeAt(pos + 2) === 46 + body.charCodeAt(pos + 1) === 0x002e && + body.charCodeAt(pos + 2) === 0x002e ) { return new Token(TokenKind.SPREAD, pos, pos + 3, line, col, prev); } break; - case 58: // : + case 0x003a: // : return new Token(TokenKind.COLON, pos, pos + 1, line, col, prev); - case 61: // = + case 0x003d: // = return new Token(TokenKind.EQUALS, pos, pos + 1, line, col, prev); - case 64: // @ + case 0x0040: // @ return new Token(TokenKind.AT, pos, pos + 1, line, col, prev); - case 91: // [ + case 0x005b: // [ return new Token(TokenKind.BRACKET_L, pos, pos + 1, line, col, prev); - case 93: // ] + case 0x005d: // ] return new Token(TokenKind.BRACKET_R, pos, pos + 1, line, col, prev); - case 123: // { + case 0x007b: // { return new Token(TokenKind.BRACE_L, pos, pos + 1, line, col, prev); - case 124: // | + case 0x007c: // | return new Token(TokenKind.PIPE, pos, pos + 1, line, col, prev); - case 125: // } + case 0x007d: // } return new Token(TokenKind.BRACE_R, pos, pos + 1, line, col, prev); - case 34: // " + case 0x0022: // " if ( - body.charCodeAt(pos + 1) === 34 && - body.charCodeAt(pos + 2) === 34 + body.charCodeAt(pos + 1) === 0x0022 && + body.charCodeAt(pos + 2) === 0x0022 ) { return readBlockString(source, pos, line, col, prev, lexer); } return readString(source, pos, line, col, prev); - case 45: // - - case 48: // 0 - case 49: // 1 - case 50: // 2 - case 51: // 3 - case 52: // 4 - case 53: // 5 - case 54: // 6 - case 55: // 7 - case 56: // 8 - case 57: // 9 + case 0x002d: // - + case 0x0030: // 0 + case 0x0031: // 1 + case 0x0032: // 2 + case 0x0033: // 3 + case 0x0034: // 4 + case 0x0035: // 5 + case 0x0036: // 6 + case 0x0037: // 7 + case 0x0038: // 8 + case 0x0039: // 9 return readNumber(source, pos, code, line, col, prev); - case 65: // A - case 66: // B - case 67: // C - case 68: // D - case 69: // E - case 70: // F - case 71: // G - case 72: // H - case 73: // I - case 74: // J - case 75: // K - 
case 76: // L - case 77: // M - case 78: // N - case 79: // O - case 80: // P - case 81: // Q - case 82: // R - case 83: // S - case 84: // T - case 85: // U - case 86: // V - case 87: // W - case 88: // X - case 89: // Y - case 90: // Z - case 95: // _ - case 97: // a - case 98: // b - case 99: // c - case 100: // d - case 101: // e - case 102: // f - case 103: // g - case 104: // h - case 105: // i - case 106: // j - case 107: // k - case 108: // l - case 109: // m - case 110: // n - case 111: // o - case 112: // p - case 113: // q - case 114: // r - case 115: // s - case 116: // t - case 117: // u - case 118: // v - case 119: // w - case 120: // x - case 121: // y - case 122: // z + case 0x0041: // A + case 0x0042: // B + case 0x0043: // C + case 0x0044: // D + case 0x0045: // E + case 0x0046: // F + case 0x0047: // G + case 0x0048: // H + case 0x0049: // I + case 0x004a: // J + case 0x004b: // K + case 0x004c: // L + case 0x004d: // M + case 0x004e: // N + case 0x004f: // O + case 0x0050: // P + case 0x0051: // Q + case 0x0052: // R + case 0x0053: // S + case 0x0054: // T + case 0x0055: // U + case 0x0056: // V + case 0x0057: // W + case 0x0058: // X + case 0x0059: // Y + case 0x005a: // Z + case 0x005f: // _ + case 0x0061: // a + case 0x0062: // b + case 0x0063: // c + case 0x0064: // d + case 0x0065: // e + case 0x0066: // f + case 0x0067: // g + case 0x0068: // h + case 0x0069: // i + case 0x006a: // j + case 0x006b: // k + case 0x006c: // l + case 0x006d: // m + case 0x006e: // n + case 0x006f: // o + case 0x0070: // p + case 0x0071: // q + case 0x0072: // r + case 0x0073: // s + case 0x0074: // t + case 0x0075: // u + case 0x0076: // v + case 0x0077: // w + case 0x0078: // x + case 0x0079: // y + case 0x007a: // z return readName(source, pos, line, col, prev); } @@ -276,7 +276,7 @@ function unexpectedCharacterMessage(code: number): string { return `Cannot contain the invalid character ${printCharCode(code)}.`; } - if (code === 39) { + if (code === 0x0027) 
{ // ' return 'Unexpected single quote character (\'), did you mean to use a double quote (")?'; } @@ -339,15 +339,15 @@ function readNumber( let position = start; let isFloat = false; - if (code === 45) { + if (code === 0x002d) { // - code = body.charCodeAt(++position); } - if (code === 48) { + if (code === 0x0030) { // 0 code = body.charCodeAt(++position); - if (code >= 48 && code <= 57) { + if (code >= 0x0030 && code <= 0x0039) { throw syntaxError( source, position, @@ -359,7 +359,7 @@ function readNumber( code = body.charCodeAt(position); } - if (code === 46) { + if (code === 0x002e) { // . isFloat = true; @@ -368,12 +368,12 @@ function readNumber( code = body.charCodeAt(position); } - if (code === 69 || code === 101) { + if (code === 0x0045 || code === 0x0065) { // E e isFloat = true; code = body.charCodeAt(++position); - if (code === 43 || code === 45) { + if (code === 0x002b || code === 0x002d) { // + - code = body.charCodeAt(++position); } @@ -382,7 +382,7 @@ function readNumber( } // Numbers cannot be followed by . 
or NameStart - if (code === 46 || isNameStart(code)) { + if (code === 0x002e || isNameStart(code)) { throw syntaxError( source, position, @@ -408,11 +408,11 @@ function readDigits(source: Source, start: number, firstCode: number): number { const body = source.body; let position = start; let code = firstCode; - if (code >= 48 && code <= 57) { + if (code >= 0x0030 && code <= 0x0039) { // 0 - 9 do { code = body.charCodeAt(++position); - } while (code >= 48 && code <= 57); // 0 - 9 + } while (code >= 0x0030 && code <= 0x0039); // 0 - 9 return position; } throw syntaxError( @@ -448,7 +448,7 @@ function readString( code !== 0x000d ) { // Closing Quote (") - if (code === 34) { + if (code === 0x0022) { value += body.slice(chunkStart, position); return new Token( TokenKind.STRING, @@ -471,36 +471,36 @@ function readString( } ++position; - if (code === 92) { + if (code === 0x005c) { // \ value += body.slice(chunkStart, position - 1); code = body.charCodeAt(position); switch (code) { - case 34: + case 0x0022: value += '"'; break; - case 47: + case 0x002f: value += '/'; break; - case 92: + case 0x005c: value += '\\'; break; - case 98: + case 0x0062: value += '\b'; break; - case 102: + case 0x0066: value += '\f'; break; - case 110: + case 0x006e: value += '\n'; break; - case 114: + case 0x0072: value += '\r'; break; - case 116: + case 0x0074: value += '\t'; break; - case 117: { + case 0x0075: { // uXXXX const charCode = uniCharCode( body.charCodeAt(position + 1), @@ -559,9 +559,9 @@ function readBlockString( while (position < body.length && !isNaN((code = body.charCodeAt(position)))) { // Closing Triple-Quote (""") if ( - code === 34 && - body.charCodeAt(position + 1) === 34 && - body.charCodeAt(position + 2) === 34 + code === 0x0022 && + body.charCodeAt(position + 1) === 0x0022 && + body.charCodeAt(position + 2) === 0x0022 ) { rawValue += body.slice(chunkStart, position); return new Token( @@ -589,14 +589,14 @@ function readBlockString( ); } - if (code === 10) { + if (code === 
0x000a) { // new line ++position; ++lexer.line; lexer.lineStart = position; - } else if (code === 13) { + } else if (code === 0x000d) { // carriage return - if (body.charCodeAt(position + 1) === 10) { + if (body.charCodeAt(position + 1) === 0x000a) { position += 2; } else { ++position; @@ -605,10 +605,10 @@ function readBlockString( lexer.lineStart = position; } else if ( // Escape Triple-Quote (\""") - code === 92 && - body.charCodeAt(position + 1) === 34 && - body.charCodeAt(position + 2) === 34 && - body.charCodeAt(position + 3) === 34 + code === 0x005c && + body.charCodeAt(position + 1) === 0x0022 && + body.charCodeAt(position + 2) === 0x0022 && + body.charCodeAt(position + 3) === 0x0022 ) { rawValue += body.slice(chunkStart, position) + '"""'; position += 4; @@ -646,12 +646,12 @@ function uniCharCode(a: number, b: number, c: number, d: number): number { * Returns -1 on error. */ function char2hex(a: number): number { - return a >= 48 && a <= 57 - ? a - 48 // 0-9 - : a >= 65 && a <= 70 - ? a - 55 // A-F - : a >= 97 && a <= 102 - ? a - 87 // a-f + return a >= 0x0030 && a <= 0x0039 + ? a - 0x0030 // 0-9 + : a >= 0x0041 && a <= 0x0046 + ? a - 0x0037 // A-F + : a >= 0x0061 && a <= 0x0066 + ? 
a - 0x0057 // a-f : -1; } @@ -674,10 +674,10 @@ function readName( while ( position !== bodyLength && !isNaN((code = body.charCodeAt(position))) && - (code === 95 || // _ - (code >= 48 && code <= 57) || // 0-9 - (code >= 65 && code <= 90) || // A-Z - (code >= 97 && code <= 122)) // a-z + (code === 0x005f || // _ + (code >= 0x0030 && code <= 0x0039) || // 0-9 + (code >= 0x0041 && code <= 0x005a) || // A-Z + (code >= 0x0061 && code <= 0x007a)) // a-z ) { ++position; } @@ -695,6 +695,8 @@ function readName( // _ A-Z a-z function isNameStart(code: number): boolean { return ( - code === 95 || (code >= 65 && code <= 90) || (code >= 97 && code <= 122) + code === 0x005f || + (code >= 0x0041 && code <= 0x005a) || + (code >= 0x0061 && code <= 0x007a) ); } From c1eb4c20694c6eaae6f11f727b8c73fc92b7d43a Mon Sep 17 00:00:00 2001 From: Ivan Goncharov Date: Wed, 2 Jun 2021 15:39:58 +0300 Subject: [PATCH 2/6] Lexer: set prev inside `lookahead` method --- src/language/ast.ts | 3 +- src/language/lexer.ts | 76 +++++++++++++++++++------------------------ 2 files changed, 35 insertions(+), 44 deletions(-) diff --git a/src/language/ast.ts b/src/language/ast.ts index 77cdf06de5..62ddf24c6b 100644 --- a/src/language/ast.ts +++ b/src/language/ast.ts @@ -96,7 +96,6 @@ export class Token { end: number, line: number, column: number, - prev: Token | null, value?: string, ) { this.kind = kind; @@ -105,7 +104,7 @@ export class Token { this.line = line; this.column = column; this.value = value as string; - this.prev = prev; + this.prev = null; this.next = null; } diff --git a/src/language/lexer.ts b/src/language/lexer.ts index 8d1c0b82bc..7ae7114db9 100644 --- a/src/language/lexer.ts +++ b/src/language/lexer.ts @@ -38,7 +38,7 @@ export class Lexer { lineStart: number; constructor(source: Source) { - const startOfFileToken = new Token(TokenKind.SOF, 0, 0, 0, 0, null); + const startOfFileToken = new Token(TokenKind.SOF, 0, 0, 0, 0); this.source = source; this.lastToken = startOfFileToken; @@ -64,8 
+64,17 @@ export class Lexer { let token = this.token; if (token.kind !== TokenKind.EOF) { do { - // @ts-expect-error next is only mutable during parsing, so we cast to allow this. - token = token.next ?? (token.next = readToken(this, token)); + if (token.next) { + token = token.next; + } else { + // Read the next token and form a link in the token linked-list. + const nextToken = readNextToken(this, token); + // @ts-expect-error next is only mutable during parsing. + token.next = nextToken; + // @ts-expect-error prev is only mutable during parsing. + nextToken.prev = token; + token = nextToken; + } } while (token.kind === TokenKind.COMMENT); } return token; @@ -114,7 +123,7 @@ function printCharCode(code: number): string { * punctuators immediately or calls the appropriate helper function for more * complicated tokens. */ -function readToken(lexer: Lexer, prev: Token): Token { +function readNextToken(lexer: Lexer, prev: Token): Token { const source = lexer.source; const body = source.body; const bodyLength = body.length; @@ -149,49 +158,49 @@ function readToken(lexer: Lexer, prev: Token): Token { lexer.lineStart = pos; continue; case 0x0021: // ! 
- return new Token(TokenKind.BANG, pos, pos + 1, line, col, prev); + return new Token(TokenKind.BANG, pos, pos + 1, line, col); case 0x0023: // # - return readComment(source, pos, line, col, prev); + return readComment(source, pos, line, col); case 0x0024: // $ - return new Token(TokenKind.DOLLAR, pos, pos + 1, line, col, prev); + return new Token(TokenKind.DOLLAR, pos, pos + 1, line, col); case 0x0026: // & - return new Token(TokenKind.AMP, pos, pos + 1, line, col, prev); + return new Token(TokenKind.AMP, pos, pos + 1, line, col); case 0x0028: // ( - return new Token(TokenKind.PAREN_L, pos, pos + 1, line, col, prev); + return new Token(TokenKind.PAREN_L, pos, pos + 1, line, col); case 0x0029: // ) - return new Token(TokenKind.PAREN_R, pos, pos + 1, line, col, prev); + return new Token(TokenKind.PAREN_R, pos, pos + 1, line, col); case 0x002e: // . if ( body.charCodeAt(pos + 1) === 0x002e && body.charCodeAt(pos + 2) === 0x002e ) { - return new Token(TokenKind.SPREAD, pos, pos + 3, line, col, prev); + return new Token(TokenKind.SPREAD, pos, pos + 3, line, col); } break; case 0x003a: // : - return new Token(TokenKind.COLON, pos, pos + 1, line, col, prev); + return new Token(TokenKind.COLON, pos, pos + 1, line, col); case 0x003d: // = - return new Token(TokenKind.EQUALS, pos, pos + 1, line, col, prev); + return new Token(TokenKind.EQUALS, pos, pos + 1, line, col); case 0x0040: // @ - return new Token(TokenKind.AT, pos, pos + 1, line, col, prev); + return new Token(TokenKind.AT, pos, pos + 1, line, col); case 0x005b: // [ - return new Token(TokenKind.BRACKET_L, pos, pos + 1, line, col, prev); + return new Token(TokenKind.BRACKET_L, pos, pos + 1, line, col); case 0x005d: // ] - return new Token(TokenKind.BRACKET_R, pos, pos + 1, line, col, prev); + return new Token(TokenKind.BRACKET_R, pos, pos + 1, line, col); case 0x007b: // { - return new Token(TokenKind.BRACE_L, pos, pos + 1, line, col, prev); + return new Token(TokenKind.BRACE_L, pos, pos + 1, line, col); case 
0x007c: // | - return new Token(TokenKind.PIPE, pos, pos + 1, line, col, prev); + return new Token(TokenKind.PIPE, pos, pos + 1, line, col); case 0x007d: // } - return new Token(TokenKind.BRACE_R, pos, pos + 1, line, col, prev); + return new Token(TokenKind.BRACE_R, pos, pos + 1, line, col); case 0x0022: // " if ( body.charCodeAt(pos + 1) === 0x0022 && body.charCodeAt(pos + 2) === 0x0022 ) { - return readBlockString(source, pos, line, col, prev, lexer); + return readBlockString(source, pos, line, col, lexer); } - return readString(source, pos, line, col, prev); + return readString(source, pos, line, col); case 0x002d: // - case 0x0030: // 0 case 0x0031: // 1 @@ -203,7 +212,7 @@ function readToken(lexer: Lexer, prev: Token): Token { case 0x0037: // 7 case 0x0038: // 8 case 0x0039: // 9 - return readNumber(source, pos, code, line, col, prev); + return readNumber(source, pos, code, line, col); case 0x0041: // A case 0x0042: // B case 0x0043: // C @@ -257,7 +266,7 @@ function readToken(lexer: Lexer, prev: Token): Token { case 0x0078: // x case 0x0079: // y case 0x007a: // z - return readName(source, pos, line, col, prev); + return readName(source, pos, line, col); } throw syntaxError(source, pos, unexpectedCharacterMessage(code)); @@ -265,7 +274,7 @@ function readToken(lexer: Lexer, prev: Token): Token { const line = lexer.line; const col = 1 + pos - lexer.lineStart; - return new Token(TokenKind.EOF, bodyLength, bodyLength, line, col, prev); + return new Token(TokenKind.EOF, bodyLength, bodyLength, line, col); } /** @@ -294,7 +303,6 @@ function readComment( start: number, line: number, col: number, - prev: Token | null, ): Token { const body = source.body; let code; @@ -314,7 +322,6 @@ function readComment( position, line, col, - prev, body.slice(start + 1, position), ); } @@ -332,7 +339,6 @@ function readNumber( firstCode: number, line: number, col: number, - prev: Token | null, ): Token { const body = source.body; let code = firstCode; @@ -396,7 +402,6 @@ function 
readNumber( position, line, col, - prev, body.slice(start, position), ); } @@ -432,7 +437,6 @@ function readString( start: number, line: number, col: number, - prev: Token | null, ): Token { const body = source.body; let position = start + 1; @@ -450,15 +454,7 @@ function readString( // Closing Quote (") if (code === 0x0022) { value += body.slice(chunkStart, position); - return new Token( - TokenKind.STRING, - start, - position + 1, - line, - col, - prev, - value, - ); + return new Token(TokenKind.STRING, start, position + 1, line, col, value); } // SourceCharacter @@ -547,7 +543,6 @@ function readBlockString( start: number, line: number, col: number, - prev: Token | null, lexer: Lexer, ): Token { const body = source.body; @@ -570,7 +565,6 @@ function readBlockString( position + 3, line, col, - prev, dedentBlockStringValue(rawValue), ); } @@ -665,7 +659,6 @@ function readName( start: number, line: number, col: number, - prev: Token | null, ): Token { const body = source.body; const bodyLength = body.length; @@ -687,7 +680,6 @@ function readName( position, line, col, - prev, body.slice(start, position), ); } From d652392ff5e81e96abd58ab4d5d0ef904ea7a42b Mon Sep 17 00:00:00 2001 From: Ivan Goncharov Date: Wed, 2 Jun 2021 15:46:21 +0300 Subject: [PATCH 3/6] parser: switch `expectOptionalToken` to return boolean --- src/language/parser.ts | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/language/parser.ts b/src/language/parser.ts index 660fc906c1..f2807b5c1f 100644 --- a/src/language/parser.ts +++ b/src/language/parser.ts @@ -570,8 +570,8 @@ export class Parser { case TokenKind.DOLLAR: if (isConst) { this.expectToken(TokenKind.DOLLAR); - const varName = this.expectOptionalToken(TokenKind.NAME)?.value; - if (varName != null) { + if (this._lexer.token.kind === TokenKind.NAME) { + const varName = this._lexer.token.value; throw syntaxError( this._lexer.source, token.start, @@ -1395,23 +1395,23 @@ export class Parser { } /** - * If the 
next token is of the given kind, return that token after advancing the lexer. - * Otherwise, do not change the parser state and return undefined. + * If the next token is of the given kind, return "true" after advancing the lexer. + * Otherwise, do not change the parser state and return "false". */ - expectOptionalToken(kind: TokenKindEnum): Maybe { + expectOptionalToken(kind: TokenKindEnum): boolean { const token = this._lexer.token; if (token.kind === kind) { this._lexer.advance(); - return token; + return true; } - return undefined; + return false; } /** * If the next token is a given keyword, advance the lexer. * Otherwise, do not change the parser state and throw an error. */ - expectKeyword(value: string) { + expectKeyword(value: string): void { const token = this._lexer.token; if (token.kind === TokenKind.NAME && token.value === value) { this._lexer.advance(); From 4f3a10eae283e5fcf7ca02064236a1c21e3639da Mon Sep 17 00:00:00 2001 From: Ivan Goncharov Date: Wed, 2 Jun 2021 17:07:07 +0300 Subject: [PATCH 4/6] Lexer: create tokens through 'createToken' --- src/language/lexer.ts | 196 +++++++++++++++++++----------------------- 1 file changed, 87 insertions(+), 109 deletions(-) diff --git a/src/language/lexer.ts b/src/language/lexer.ts index 7ae7114db9..31c5a47a08 100644 --- a/src/language/lexer.ts +++ b/src/language/lexer.ts @@ -68,7 +68,7 @@ export class Lexer { token = token.next; } else { // Read the next token and form a link in the token linked-list. - const nextToken = readNextToken(this, token); + const nextToken = readNextToken(this, token.end); // @ts-expect-error next is only mutable during parsing. token.next = nextToken; // @ts-expect-error prev is only mutable during parsing. @@ -116,6 +116,21 @@ function printCharCode(code: number): string { ); } +/** + * Create a token with line and column location information. 
+ */ +function createToken( + lexer: Lexer, + kind: TokenKindEnum, + start: number, + end: number, + value?: string, +): Token { + const line = lexer.line; + const col = 1 + start - lexer.lineStart; + return new Token(kind, start, end, line, col, value); +} + /** * Gets the next token from the source starting at the given position. * @@ -123,84 +138,80 @@ function printCharCode(code: number): string { * punctuators immediately or calls the appropriate helper function for more * complicated tokens. */ -function readNextToken(lexer: Lexer, prev: Token): Token { - const source = lexer.source; - const body = source.body; +function readNextToken(lexer: Lexer, start: number): Token { + const body = lexer.source.body; const bodyLength = body.length; - let pos = prev.end; - while (pos < bodyLength) { - const code = body.charCodeAt(pos); - - const line = lexer.line; - const col = 1 + pos - lexer.lineStart; + let position = start; + while (position < bodyLength) { + const code = body.charCodeAt(position); // SourceCharacter switch (code) { case 0xfeff: // - case 0x0009: // \t + case 0x0009: // \t case 0x0020: // case 0x002c: // , - ++pos; + ++position; continue; case 0x000a: // \n - ++pos; + ++position; ++lexer.line; - lexer.lineStart = pos; + lexer.lineStart = position; continue; case 0x000d: // \r - if (body.charCodeAt(pos + 1) === 0x000a) { - pos += 2; + if (body.charCodeAt(position + 1) === 0x000a) { + position += 2; } else { - ++pos; + ++position; } ++lexer.line; - lexer.lineStart = pos; + lexer.lineStart = position; continue; case 0x0021: // ! 
- return new Token(TokenKind.BANG, pos, pos + 1, line, col); + return createToken(lexer, TokenKind.BANG, position, position + 1); case 0x0023: // # - return readComment(source, pos, line, col); + return readComment(lexer, position); case 0x0024: // $ - return new Token(TokenKind.DOLLAR, pos, pos + 1, line, col); + return createToken(lexer, TokenKind.DOLLAR, position, position + 1); case 0x0026: // & - return new Token(TokenKind.AMP, pos, pos + 1, line, col); + return createToken(lexer, TokenKind.AMP, position, position + 1); case 0x0028: // ( - return new Token(TokenKind.PAREN_L, pos, pos + 1, line, col); + return createToken(lexer, TokenKind.PAREN_L, position, position + 1); case 0x0029: // ) - return new Token(TokenKind.PAREN_R, pos, pos + 1, line, col); + return createToken(lexer, TokenKind.PAREN_R, position, position + 1); case 0x002e: // . if ( - body.charCodeAt(pos + 1) === 0x002e && - body.charCodeAt(pos + 2) === 0x002e + body.charCodeAt(position + 1) === 0x002e && + body.charCodeAt(position + 2) === 0x002e ) { - return new Token(TokenKind.SPREAD, pos, pos + 3, line, col); + return createToken(lexer, TokenKind.SPREAD, position, position + 3); } break; case 0x003a: // : - return new Token(TokenKind.COLON, pos, pos + 1, line, col); + return createToken(lexer, TokenKind.COLON, position, position + 1); case 0x003d: // = - return new Token(TokenKind.EQUALS, pos, pos + 1, line, col); + return createToken(lexer, TokenKind.EQUALS, position, position + 1); case 0x0040: // @ - return new Token(TokenKind.AT, pos, pos + 1, line, col); + return createToken(lexer, TokenKind.AT, position, position + 1); case 0x005b: // [ - return new Token(TokenKind.BRACKET_L, pos, pos + 1, line, col); + return createToken(lexer, TokenKind.BRACKET_L, position, position + 1); case 0x005d: // ] - return new Token(TokenKind.BRACKET_R, pos, pos + 1, line, col); + return createToken(lexer, TokenKind.BRACKET_R, position, position + 1); case 0x007b: // { - return new Token(TokenKind.BRACE_L, pos, 
pos + 1, line, col); + return createToken(lexer, TokenKind.BRACE_L, position, position + 1); case 0x007c: // | - return new Token(TokenKind.PIPE, pos, pos + 1, line, col); + return createToken(lexer, TokenKind.PIPE, position, position + 1); case 0x007d: // } - return new Token(TokenKind.BRACE_R, pos, pos + 1, line, col); + return createToken(lexer, TokenKind.BRACE_R, position, position + 1); case 0x0022: // " if ( - body.charCodeAt(pos + 1) === 0x0022 && - body.charCodeAt(pos + 2) === 0x0022 + body.charCodeAt(position + 1) === 0x0022 && + body.charCodeAt(position + 2) === 0x0022 ) { - return readBlockString(source, pos, line, col, lexer); + return readBlockString(lexer, position); } - return readString(source, pos, line, col); + return readString(lexer, position); case 0x002d: // - case 0x0030: // 0 case 0x0031: // 1 @@ -212,7 +223,7 @@ function readNextToken(lexer: Lexer, prev: Token): Token { case 0x0037: // 7 case 0x0038: // 8 case 0x0039: // 9 - return readNumber(source, pos, code, line, col); + return readNumber(lexer, position, code); case 0x0041: // A case 0x0042: // B case 0x0043: // C @@ -266,15 +277,13 @@ function readNextToken(lexer: Lexer, prev: Token): Token { case 0x0078: // x case 0x0079: // y case 0x007a: // z - return readName(source, pos, line, col); + return readName(lexer, position); } - throw syntaxError(source, pos, unexpectedCharacterMessage(code)); + throw syntaxError(lexer.source, position, unexpectedCharacterMessage(code)); } - const line = lexer.line; - const col = 1 + pos - lexer.lineStart; - return new Token(TokenKind.EOF, bodyLength, bodyLength, line, col); + return createToken(lexer, TokenKind.EOF, bodyLength, bodyLength); } /** @@ -298,13 +307,8 @@ function unexpectedCharacterMessage(code: number): string { * * #[\u0009\u0020-\uFFFF]* */ -function readComment( - source: Source, - start: number, - line: number, - col: number, -): Token { - const body = source.body; +function readComment(lexer: Lexer, start: number): Token { + const 
body = lexer.source.body; let code; let position = start; @@ -316,12 +320,11 @@ function readComment( (code > 0x001f || code === 0x0009) ); - return new Token( + return createToken( + lexer, TokenKind.COMMENT, start, position, - line, - col, body.slice(start + 1, position), ); } @@ -333,14 +336,8 @@ function readComment( * Int: -?(0|[1-9][0-9]*) * Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)? */ -function readNumber( - source: Source, - start: number, - firstCode: number, - line: number, - col: number, -): Token { - const body = source.body; +function readNumber(lexer: Lexer, start: number, firstCode: number): Token { + const body = lexer.source.body; let code = firstCode; let position = start; let isFloat = false; @@ -355,13 +352,13 @@ function readNumber( code = body.charCodeAt(++position); if (code >= 0x0030 && code <= 0x0039) { throw syntaxError( - source, + lexer.source, position, `Invalid number, unexpected digit after 0: ${printCharCode(code)}.`, ); } } else { - position = readDigits(source, position, code); + position = readDigits(lexer, position, code); code = body.charCodeAt(position); } @@ -370,7 +367,7 @@ function readNumber( isFloat = true; code = body.charCodeAt(++position); - position = readDigits(source, position, code); + position = readDigits(lexer, position, code); code = body.charCodeAt(position); } @@ -383,25 +380,24 @@ function readNumber( // + - code = body.charCodeAt(++position); } - position = readDigits(source, position, code); + position = readDigits(lexer, position, code); code = body.charCodeAt(position); } // Numbers cannot be followed by . or NameStart if (code === 0x002e || isNameStart(code)) { throw syntaxError( - source, + lexer.source, position, `Invalid number, expected digit but got: ${printCharCode(code)}.`, ); } - return new Token( + return createToken( + lexer, isFloat ? 
TokenKind.FLOAT : TokenKind.INT, start, position, - line, - col, body.slice(start, position), ); } @@ -409,8 +405,8 @@ function readNumber( /** * Returns the new position in the source after reading digits. */ -function readDigits(source: Source, start: number, firstCode: number): number { - const body = source.body; +function readDigits(lexer: Lexer, start: number, firstCode: number): number { + const body = lexer.source.body; let position = start; let code = firstCode; if (code >= 0x0030 && code <= 0x0039) { @@ -421,7 +417,7 @@ function readDigits(source: Source, start: number, firstCode: number): number { return position; } throw syntaxError( - source, + lexer.source, position, `Invalid number, expected digit but got: ${printCharCode(code)}.`, ); @@ -432,13 +428,8 @@ function readDigits(source: Source, start: number, firstCode: number): number { * * "([^"\\\u000A\u000D]|(\\(u[0-9a-fA-F]{4}|["\\/bfnrt])))*" */ -function readString( - source: Source, - start: number, - line: number, - col: number, -): Token { - const body = source.body; +function readString(lexer: Lexer, start: number): Token { + const body = lexer.source.body; let position = start + 1; let chunkStart = position; let code = 0; @@ -454,13 +445,13 @@ function readString( // Closing Quote (") if (code === 0x0022) { value += body.slice(chunkStart, position); - return new Token(TokenKind.STRING, start, position + 1, line, col, value); + return createToken(lexer, TokenKind.STRING, start, position + 1, value); } // SourceCharacter if (code < 0x0020 && code !== 0x0009) { throw syntaxError( - source, + lexer.source, position, `Invalid character within String: ${printCharCode(code)}.`, ); @@ -507,7 +498,7 @@ function readString( if (charCode < 0) { const invalidSequence = body.slice(position + 1, position + 5); throw syntaxError( - source, + lexer.source, position, `Invalid character escape sequence: \\u${invalidSequence}.`, ); @@ -518,7 +509,7 @@ function readString( } default: throw syntaxError( - source, 
+ lexer.source, position, `Invalid character escape sequence: \\${String.fromCharCode( code, @@ -530,7 +521,7 @@ function readString( } } - throw syntaxError(source, position, 'Unterminated string.'); + throw syntaxError(lexer.source, position, 'Unterminated string.'); } /** @@ -538,14 +529,8 @@ function readString( * * """("?"?(\\"""|\\(?!=""")|[^"\\]))*""" */ -function readBlockString( - source: Source, - start: number, - line: number, - col: number, - lexer: Lexer, -): Token { - const body = source.body; +function readBlockString(lexer: Lexer, start: number): Token { + const body = lexer.source.body; let position = start + 3; let chunkStart = position; let code = 0; @@ -559,12 +544,11 @@ function readBlockString( body.charCodeAt(position + 2) === 0x0022 ) { rawValue += body.slice(chunkStart, position); - return new Token( + return createToken( + lexer, TokenKind.BLOCK_STRING, start, position + 3, - line, - col, dedentBlockStringValue(rawValue), ); } @@ -577,7 +561,7 @@ function readBlockString( code !== 0x000d ) { throw syntaxError( - source, + lexer.source, position, `Invalid character within String: ${printCharCode(code)}.`, ); @@ -612,7 +596,7 @@ function readBlockString( } } - throw syntaxError(source, position, 'Unterminated string.'); + throw syntaxError(lexer.source, position, 'Unterminated string.'); } /** @@ -654,13 +638,8 @@ function char2hex(a: number): number { * * [_A-Za-z][_0-9A-Za-z]* */ -function readName( - source: Source, - start: number, - line: number, - col: number, -): Token { - const body = source.body; +function readName(lexer: Lexer, start: number): Token { + const body = lexer.source.body; const bodyLength = body.length; let position = start + 1; let code = 0; @@ -674,12 +653,11 @@ function readName( ) { ++position; } - return new Token( + return createToken( + lexer, TokenKind.NAME, start, position, - line, - col, body.slice(start, position), ); } From f00c78b680cbba59ca00f90a21bc9adc9dadeb95 Mon Sep 17 00:00:00 2001 From: Ivan 
Goncharov Date: Wed, 2 Jun 2021 17:41:52 +0300 Subject: [PATCH 5/6] Lexer: replace some of switch cases with if --- src/language/lexer.ts | 83 ++++++++----------------------------------- 1 file changed, 14 insertions(+), 69 deletions(-) diff --git a/src/language/lexer.ts b/src/language/lexer.ts index 31c5a47a08..284102d402 100644 --- a/src/language/lexer.ts +++ b/src/language/lexer.ts @@ -212,72 +212,17 @@ function readNextToken(lexer: Lexer, start: number): Token { return readBlockString(lexer, position); } return readString(lexer, position); - case 0x002d: // - - case 0x0030: // 0 - case 0x0031: // 1 - case 0x0032: // 2 - case 0x0033: // 3 - case 0x0034: // 4 - case 0x0035: // 5 - case 0x0036: // 6 - case 0x0037: // 7 - case 0x0038: // 8 - case 0x0039: // 9 - return readNumber(lexer, position, code); - case 0x0041: // A - case 0x0042: // B - case 0x0043: // C - case 0x0044: // D - case 0x0045: // E - case 0x0046: // F - case 0x0047: // G - case 0x0048: // H - case 0x0049: // I - case 0x004a: // J - case 0x004b: // K - case 0x004c: // L - case 0x004d: // M - case 0x004e: // N - case 0x004f: // O - case 0x0050: // P - case 0x0051: // Q - case 0x0052: // R - case 0x0053: // S - case 0x0054: // T - case 0x0055: // U - case 0x0056: // V - case 0x0057: // W - case 0x0058: // X - case 0x0059: // Y - case 0x005a: // Z - case 0x005f: // _ - case 0x0061: // a - case 0x0062: // b - case 0x0063: // c - case 0x0064: // d - case 0x0065: // e - case 0x0066: // f - case 0x0067: // g - case 0x0068: // h - case 0x0069: // i - case 0x006a: // j - case 0x006b: // k - case 0x006c: // l - case 0x006d: // m - case 0x006e: // n - case 0x006f: // o - case 0x0070: // p - case 0x0071: // q - case 0x0072: // r - case 0x0073: // s - case 0x0074: // t - case 0x0075: // u - case 0x0076: // v - case 0x0077: // w - case 0x0078: // x - case 0x0079: // y - case 0x007a: // z - return readName(lexer, position); + } + + // IntValue | FloatValue + // 0-9 | - + if ((code >= 0x0030 && code <= 0x0039) || 
code === 0x002d) { + return readNumber(lexer, position, code); + } + + // Name + if (isNameStart(code)) { + return readName(lexer, position); } throw syntaxError(lexer.source, position, unexpectedCharacterMessage(code)); @@ -662,11 +607,11 @@ function readName(lexer: Lexer, start: number): Token { ); } -// _ A-Z a-z +// a-z | A-Z | _ function isNameStart(code: number): boolean { return ( - code === 0x005f || + (code >= 0x0061 && code <= 0x007a) || (code >= 0x0041 && code <= 0x005a) || - (code >= 0x0061 && code <= 0x007a) + code === 0x005f ); } From 3a3e61c95cd0de4045dc7ce3d5d25fa9f5b18bae Mon Sep 17 00:00:00 2001 From: Ivan Goncharov Date: Wed, 2 Jun 2021 18:59:30 +0300 Subject: [PATCH 6/6] lexer: extract isDigit predicate --- src/language/lexer.ts | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/src/language/lexer.ts b/src/language/lexer.ts index 284102d402..44d69d33d2 100644 --- a/src/language/lexer.ts +++ b/src/language/lexer.ts @@ -216,7 +216,7 @@ function readNextToken(lexer: Lexer, start: number): Token { // IntValue | FloatValue // 0-9 | - - if ((code >= 0x0030 && code <= 0x0039) || code === 0x002d) { + if (isDigit(code) || code === 0x002d) { return readNumber(lexer, position, code); } @@ -295,7 +295,7 @@ function readNumber(lexer: Lexer, start: number, firstCode: number): Token { if (code === 0x0030) { // 0 code = body.charCodeAt(++position); - if (code >= 0x0030 && code <= 0x0039) { + if (isDigit(code)) { throw syntaxError( lexer.source, position, @@ -351,21 +351,26 @@ function readNumber(lexer: Lexer, start: number, firstCode: number): Token { * Returns the new position in the source after reading digits. 
*/ function readDigits(lexer: Lexer, start: number, firstCode: number): number { + if (!isDigit(firstCode)) { + throw syntaxError( + lexer.source, + start, + `Invalid number, expected digit but got: ${printCharCode(firstCode)}.`, + ); + } + const body = lexer.source.body; let position = start; let code = firstCode; - if (code >= 0x0030 && code <= 0x0039) { - // 0 - 9 - do { - code = body.charCodeAt(++position); - } while (code >= 0x0030 && code <= 0x0039); // 0 - 9 - return position; - } - throw syntaxError( - lexer.source, - position, - `Invalid number, expected digit but got: ${printCharCode(code)}.`, - ); + do { + code = body.charCodeAt(++position); + } while (isDigit(code)); + return position; +} + +// 0 - 9 +function isDigit(code: number): boolean { + return code >= 0x0030 && code <= 0x0039; } /** @@ -569,7 +574,7 @@ function uniCharCode(a: number, b: number, c: number, d: number): number { * Returns -1 on error. */ function char2hex(a: number): number { - return a >= 0x0030 && a <= 0x0039 + return isDigit(a) ? a - 0x0030 // 0-9 : a >= 0x0041 && a <= 0x0046 ? a - 0x0037 // A-F @@ -592,7 +597,7 @@ function readName(lexer: Lexer, start: number): Token { position !== bodyLength && !isNaN((code = body.charCodeAt(position))) && (code === 0x005f || // _ - (code >= 0x0030 && code <= 0x0039) || // 0-9 + isDigit(code) || (code >= 0x0041 && code <= 0x005a) || // A-Z (code >= 0x0061 && code <= 0x007a)) // a-z ) {