From a448c4828855d2f331d884950cd58db606874bc4 Mon Sep 17 00:00:00 2001 From: mamazu Date: Mon, 22 Nov 2021 22:24:08 +0100 Subject: [PATCH 1/4] Fixing the string tokenization --- src/PHPCR/Util/QOM/Sql2Scanner.php | 97 ++++++++----------- .../Util/QOM/Sql2ToQomQueryConverter.php | 45 ++------- .../PHPCR/Tests/Util/QOM/Sql2ScannerTest.php | 62 +++++++++--- 3 files changed, 97 insertions(+), 107 deletions(-) diff --git a/src/PHPCR/Util/QOM/Sql2Scanner.php b/src/PHPCR/Util/QOM/Sql2Scanner.php index fd5c563f..708d3d38 100644 --- a/src/PHPCR/Util/QOM/Sql2Scanner.php +++ b/src/PHPCR/Util/QOM/Sql2Scanner.php @@ -26,13 +26,6 @@ class Sql2Scanner */ protected $tokens; - /** - * Delimiters between tokens. - * - * @var array - */ - protected $delimiters; - /** * Parsing position in the SQL string. * @@ -68,16 +61,6 @@ public function lookupNextToken($offset = 0) return ''; } - /** - * Get the delimiter that separated the two previous tokens. - * - * @return string - */ - public function getPreviousDelimiter() - { - return isset($this->delimiters[$this->curpos - 1]) ? $this->delimiters[$this->curpos - 1] : ' '; - } - /** * Get the next token and remove it from the queue. * Return an empty string when there are no more tokens. @@ -116,12 +99,12 @@ public function expectToken($token, $case_insensitive = true) * Expect the next tokens to be the one given in the array of tokens and * throws an exception if it's not the case. * - * @see expectToken - * * @param array $tokens * @param bool $case_insensitive * * @throws InvalidQueryException + * + * @see expectToken */ public function expectTokens($tokens, $case_insensitive = true) { @@ -151,7 +134,7 @@ public function tokenIs($token, $value, $case_insensitive = true) } /** - * Scan a SQL2 string a extract the tokens. + * Scan a SQL2 string and extract the tokens. * * @param string $sql2 * @@ -160,49 +143,45 @@ public function tokenIs($token, $value, $case_insensitive = true) protected function scan($sql2) { $tokens = []; - $token = strtok($sql2, " \n\t"); - while ($token !== false) { - $this->tokenize($tokens, $token); - $token = strtok(" \n\t"); - } - - $regexpTokens = []; - foreach ($tokens as $token) { - $regexpTokens[] = preg_quote($token, '/'); - } - - $regexp = '/^'.implode('([ \t\n]*)', $regexpTokens).'$/'; - preg_match($regexp, $sql2, $this->delimiters); - $this->delimiters[0] = ''; - - return $tokens; - } - - /** - * Tokenize a string returned by strtok to split the string at '.', ',', '(', '=' - * and ')' characters. - * - * @param array $tokens - * @param string $token - */ - protected function tokenize(&$tokens, $token) - { - $buffer = ''; - for ($i = 0; $i < strlen($token); $i++) { - $char = trim(substr($token, $i, 1)); - if (in_array($char, ['.', ',', '(', ')', '='])) { - if ($buffer !== '') { - $tokens[] = $buffer; - $buffer = ''; + $currentToken = ''; + $tokenEndChars = ['.', ',', '(', ')', '=']; + $isString = false; + foreach (\str_split($sql2) as $character) { + if (!$isString && in_array($character, [' ', "\t", "\n"], true)) { + if ($currentToken !== '') { + $tokens[] = $currentToken; } - $tokens[] = $char; - } else { - $buffer .= $char; + $currentToken = ''; + continue; } + if (!$isString && in_array($character, $tokenEndChars, true)) { + if ($currentToken !== '') { + $tokens[] = $currentToken; + } + $tokens[] = $character; + $currentToken = ''; + continue; + } + $currentToken .= $character; + if (in_array($character, ['"', "'"], true)) { + if ($isString) { + // reached the end of the string + $isString = false; + $tokens[] = $currentToken; + $currentToken = ''; + } else { + $isString = true; + } + } + } + if ($currentToken !== '') { + $tokens[] = $currentToken; } - if ($buffer !== '') { - $tokens[] = $buffer; + if ($isString) { + throw new InvalidQueryException("Syntax error: unterminated quoted string $currentToken in '$sql2'"); } + + return $tokens; } } diff --git a/src/PHPCR/Util/QOM/Sql2ToQomQueryConverter.php b/src/PHPCR/Util/QOM/Sql2ToQomQueryConverter.php index d3a7a46e..a859ebd2 100644 --- a/src/PHPCR/Util/QOM/Sql2ToQomQueryConverter.php +++ b/src/PHPCR/Util/QOM/Sql2ToQomQueryConverter.php @@ -756,27 +756,13 @@ protected function parseCastLiteral($token) $this->scanner->expectToken('('); $token = $this->scanner->fetchNextToken(); - $quoteString = false; - if (substr($token, 0, 1) === '\'') { - $quoteString = "'"; - } elseif (substr($token, 0, 1) === '"') { - $quoteString = '"'; - } + $quoteString = in_array($token[0], ['\'', '"'], true); if ($quoteString) { - while (substr($token, -1) !== $quoteString) { - $nextToken = $this->scanner->fetchNextToken(); - if ('' === $nextToken) { - break; - } - $token .= $nextToken; - } - - if (substr($token, -1) !== $quoteString) { - throw new InvalidQueryException("Syntax error: unterminated quoted string '$token' in '{$this->sql2}'"); - } + $quotesUsed = $token[0]; $token = substr($token, 1, -1); - $token = str_replace('\\'.$quoteString, $quoteString, $token); + // Un-escaping quotes + $token = str_replace('\\'.$quotesUsed, $quotesUsed, $token); } $this->scanner->expectToken('AS'); @@ -813,28 +799,13 @@ protected function parseLiteralValue() return $this->parseCastLiteral($token); } - $quoteString = false; - if (substr($token, 0, 1) === '\'') { - $quoteString = "'"; - } elseif (substr($token, 0, 1) === '"') { - $quoteString = '"'; - } + $quoteString = in_array($token[0], ['"', "'"], true); if ($quoteString) { - while (substr($token, -1) !== $quoteString) { - $nextToken = $this->scanner->fetchNextToken(); - if ('' === $nextToken) { - break; - } - $token .= $this->scanner->getPreviousDelimiter(); - $token .= $nextToken; - } - - if (substr($token, -1) !== $quoteString) { - throw new InvalidQueryException("Syntax error: unterminated quoted string $token in '{$this->sql2}'"); - } + $quotesUsed = $token[0]; $token = substr($token, 1, -1); - $token = str_replace('\\'.$quoteString, $quoteString, $token); + // Unescape quotes + $token = str_replace('\\'.$quotesUsed, $quotesUsed, $token); $token = str_replace("''", "'", $token); if (preg_match('/^\d{4}-\d{2}-\d{2}( \d{2}:\d{2}:\d+)?$/', $token)) { if (preg_match('/^\d{4}-\d{2}-\d{2}$/', $token)) { diff --git a/tests/PHPCR/Tests/Util/QOM/Sql2ScannerTest.php b/tests/PHPCR/Tests/Util/QOM/Sql2ScannerTest.php index d3bda9cc..7a9c7653 100644 --- a/tests/PHPCR/Tests/Util/QOM/Sql2ScannerTest.php +++ b/tests/PHPCR/Tests/Util/QOM/Sql2ScannerTest.php @@ -2,6 +2,7 @@ namespace PHPCR\Tests\Util\QOM; +use PHPCR\Query\InvalidQueryException; use PHPCR\Util\QOM\Sql2Scanner; use PHPUnit\Framework\TestCase; @@ -24,24 +25,63 @@ public function testToken() while ($token = $scanner->fetchNextToken()) { $this->assertEquals(array_shift($expected), $token); } + $this->assertCount(0, $expected); } - public function testDelimiter() + public function testStringTokenization() { - $scanner = new Sql2Scanner('SELECT page.* FROM [nt:unstructured] AS page'); + $scanner = new Sql2Scanner('SELECT page.* FROM [nt:unstructured] AS page WHERE name ="Hello world"'); + $expected = [ + 'SELECT', + 'page', + '.', + '*', + 'FROM', + '[nt:unstructured]', + 'AS', + 'page', + 'WHERE', + 'name', + '=', + '"Hello world"', + ]; + + while ($token = $scanner->fetchNextToken()) { + $this->assertEquals(array_shift($expected), $token); + } + $this->assertCount(0, $expected); + } + + public function testStringTokenizationWithNewLines() + { + $scanner = new Sql2Scanner(<<<'SQL' +SELECT page.* +FROM [nt:unstructured] AS page WHERE name ="Hello world" +SQL); $expected = [ - '', - ' ', - '', - '', - ' ', - ' ', - ' ', - ' ', + 'SELECT', + 'page', + '.', + '*', + 'FROM', + '[nt:unstructured]', + 'AS', + 'page', + 'WHERE', + 'name', + '=', + '"Hello world"', ]; while ($token = $scanner->fetchNextToken()) { - $this->assertEquals(array_shift($expected), $scanner->getPreviousDelimiter()); + $this->assertEquals(array_shift($expected), $token); } + $this->assertCount(0, $expected); + } + + public function testThrowingErrorOnUnclosedString() + { + $this->expectException(InvalidQueryException::class); + new Sql2Scanner('SELECT page.* FROM [nt:unstructured] AS page WHERE name ="Hello '); } } From 4fd702f3f519b8e6168b85f26a5c239359eb7fa4 Mon Sep 17 00:00:00 2001 From: mamazu Date: Tue, 23 Nov 2021 14:40:08 +0100 Subject: [PATCH 2/4] Adding more test cases and fixed bug in the parser Now the parser can understand escaped characters in the string --- src/PHPCR/Util/QOM/Sql2Scanner.php | 22 ++++++++------ .../PHPCR/Tests/Util/QOM/Sql2ScannerTest.php | 29 +++++++++++++++---- 2 files changed, 36 insertions(+), 15 deletions(-) diff --git a/src/PHPCR/Util/QOM/Sql2Scanner.php b/src/PHPCR/Util/QOM/Sql2Scanner.php index 708d3d38..fc65916b 100644 --- a/src/PHPCR/Util/QOM/Sql2Scanner.php +++ b/src/PHPCR/Util/QOM/Sql2Scanner.php @@ -145,16 +145,18 @@ protected function scan($sql2) $tokens = []; $currentToken = ''; $tokenEndChars = ['.', ',', '(', ')', '=']; - $isString = false; + + $stringStartCharacter = false; + $isEscaped = false; foreach (\str_split($sql2) as $character) { - if (!$isString && in_array($character, [' ', "\t", "\n"], true)) { + if (!$stringStartCharacter && in_array($character, [' ', "\t", "\n"], true)) { if ($currentToken !== '') { $tokens[] = $currentToken; } $currentToken = ''; continue; } - if (!$isString && in_array($character, $tokenEndChars, true)) { + if (!$stringStartCharacter && in_array($character, $tokenEndChars, true)) { if ($currentToken !== '') { $tokens[] = $currentToken; } @@ -163,22 +165,24 @@ protected function scan($sql2) continue; } $currentToken .= $character; - if (in_array($character, ['"', "'"], true)) { - if ($isString) { + if (!$isEscaped && in_array($character, ['"', "'"], true)) { + if ($character === $stringStartCharacter) { // reached the end of the string - $isString = false; + $stringStartCharacter = false; $tokens[] = $currentToken; $currentToken = ''; - } else { - $isString = true; + } elseif (!$stringStartCharacter) { + // If there is no start character already we have found the beginning of a new string + $stringStartCharacter = $character; } } + $isEscaped = $character === '\\'; } if ($currentToken !== '') { $tokens[] = $currentToken; } - if ($isString) { + if ($stringStartCharacter) { throw new InvalidQueryException("Syntax error: unterminated quoted string $currentToken in '$sql2'"); } diff --git a/tests/PHPCR/Tests/Util/QOM/Sql2ScannerTest.php b/tests/PHPCR/Tests/Util/QOM/Sql2ScannerTest.php index 7a9c7653..4c06ffe8 100644 --- a/tests/PHPCR/Tests/Util/QOM/Sql2ScannerTest.php +++ b/tests/PHPCR/Tests/Util/QOM/Sql2ScannerTest.php @@ -28,6 +28,9 @@ public function testToken() $this->assertCount(0, $expected); } + /** + * @dataProvider dataTestStringTokenization + */ public function testStringTokenization() { $scanner = new Sql2Scanner('SELECT page.* FROM [nt:unstructured] AS page WHERE name ="Hello world"'); @@ -52,11 +55,24 @@ public function testStringTokenization() $this->assertCount(0, $expected); } - public function testStringTokenizationWithNewLines() + public function dataTestStringTokenization() { - $scanner = new Sql2Scanner(<<<'SQL' + $multilineQuery = <<<'SQL' SELECT page.* -FROM [nt:unstructured] AS page WHERE name ="Hello world" +FROM [nt:unstructured] AS page +WHERE name ="Hello world" +SQL; + + return [ + 'single line query' => ['SELECT page.* FROM [nt:unstructured] AS page WHERE name ="Hello world"'], + 'multi line query' => [$multilineQuery], + ]; + } + + public function testEscapingStrings() + { + $scanner = new Sql2Scanner(<<fetchNextToken()) { $this->assertEquals(array_shift($expected), $token); } - $this->assertCount(0, $expected); } public function testThrowingErrorOnUnclosedString() From 0653b8d60060ef7ccb369650f10b7415aea3fd3e Mon Sep 17 00:00:00 2001 From: mamazu Date: Wed, 24 Nov 2021 11:35:31 +0100 Subject: [PATCH 3/4] Fixing nowdoc syntax --- tests/PHPCR/Tests/Util/QOM/Sql2ScannerTest.php | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/PHPCR/Tests/Util/QOM/Sql2ScannerTest.php b/tests/PHPCR/Tests/Util/QOM/Sql2ScannerTest.php index 4c06ffe8..94d1c2a6 100644 --- a/tests/PHPCR/Tests/Util/QOM/Sql2ScannerTest.php +++ b/tests/PHPCR/Tests/Util/QOM/Sql2ScannerTest.php @@ -57,7 +57,7 @@ public function testStringTokenization() public function dataTestStringTokenization() { - $multilineQuery = <<<'SQL' + $multilineQuery = << Date: Thu, 25 Nov 2021 19:30:09 +0100 Subject: [PATCH 4/4] Adding the weird string logic from SQL to the tests --- src/PHPCR/Util/QOM/Sql2Scanner.php | 25 ++++++- .../PHPCR/Tests/Util/QOM/Sql2ScannerTest.php | 68 +++++++++++++++---- 2 files changed, 80 insertions(+), 13 deletions(-) diff --git a/src/PHPCR/Util/QOM/Sql2Scanner.php b/src/PHPCR/Util/QOM/Sql2Scanner.php index fc65916b..8639f22b 100644 --- a/src/PHPCR/Util/QOM/Sql2Scanner.php +++ b/src/PHPCR/Util/QOM/Sql2Scanner.php @@ -148,7 +148,8 @@ protected function scan($sql2) $stringStartCharacter = false; $isEscaped = false; - foreach (\str_split($sql2) as $character) { + $escapedQuotesCount = 0; + foreach (\str_split($sql2) as $index => $character) { if (!$stringStartCharacter && in_array($character, [' ', "\t", "\n"], true)) { if ($currentToken !== '') { $tokens[] = $currentToken; @@ -165,7 +166,20 @@ protected function scan($sql2) continue; } $currentToken .= $character; + if (!$isEscaped && in_array($character, ['"', "'"], true)) { + // Checking if the previous or next value is a ' to handle the weird SQL strings + // This will not check if the amount of quotes is even + $nextCharacter = $this->getCharacterAtIndex($sql2, $index + 1); + if ($character === "'" && $nextCharacter === "'") { + $isEscaped = true; + $escapedQuotesCount++; + continue; + } + // If the escaped quotes are not paired up. eg. "I'''m cool" would be a parsing error + if ($escapedQuotesCount % 2 == 1 && $stringStartCharacter !== "'") { + throw new InvalidQueryException("Syntax error: Number of single quotes to be even: $currentToken"); + } if ($character === $stringStartCharacter) { // reached the end of the string $stringStartCharacter = false; @@ -188,4 +202,13 @@ protected function scan($sql2) return $tokens; } + + private function getCharacterAtIndex($string, $index) + { + if ($index < strlen($string)) { + return $string[$index]; + } + + return ''; + } } diff --git a/tests/PHPCR/Tests/Util/QOM/Sql2ScannerTest.php b/tests/PHPCR/Tests/Util/QOM/Sql2ScannerTest.php index 94d1c2a6..07da5dd2 100644 --- a/tests/PHPCR/Tests/Util/QOM/Sql2ScannerTest.php +++ b/tests/PHPCR/Tests/Util/QOM/Sql2ScannerTest.php @@ -22,10 +22,7 @@ public function testToken() 'page', ]; - while ($token = $scanner->fetchNextToken()) { - $this->assertEquals(array_shift($expected), $token); - } - $this->assertCount(0, $expected); + $this->expectTokensFromScanner($scanner, $expected); } /** @@ -49,15 +46,12 @@ public function testStringTokenization() '"Hello world"', ]; - while ($token = $scanner->fetchNextToken()) { - $this->assertEquals(array_shift($expected), $token); - } - $this->assertCount(0, $expected); + $this->expectTokensFromScanner($scanner, $expected); } public function dataTestStringTokenization() { - $multilineQuery = <<fetchNextToken()) { - $this->assertEquals(array_shift($expected), $token); - } + $this->expectTokensFromScanner($scanner, $expected); + } + + public function testSQLEscapedStrings() + { + $sql = "WHERE page.name = 'Hello, it''s me.'"; + + $scanner = new Sql2Scanner($sql); + $expected = [ + 'WHERE', + 'page', + '.', + 'name', + '=', + "'Hello, it''s me.'", + ]; + + $this->expectTokensFromScanner($scanner, $expected); + } + + public function testSQLEscapedStrings2() + { + $sql = "WHERE page.name = 'Hello, it''' AND"; + + $scanner = new Sql2Scanner($sql); + $expected = [ + 'WHERE', + 'page', + '.', + 'name', + '=', + "'Hello, it'''", + 'AND', + ]; + + $this->expectTokensFromScanner($scanner, $expected); } public function testThrowingErrorOnUnclosedString() @@ -102,4 +129,21 @@ public function testThrowingErrorOnUnclosedString() $this->expectException(InvalidQueryException::class); new Sql2Scanner('SELECT page.* FROM [nt:unstructured] AS page WHERE name ="Hello '); } + + /** + * Function to assert that the tokens the scanner finds match the expected output + * and the entire expected output is consumed. + * + * @param Sql2Scanner $scanner + * @param array $expected + */ + private function expectTokensFromScanner(Sql2Scanner $scanner, array $expected) + { + $actualTokens = []; + while ($token = $scanner->fetchNextToken()) { + $actualTokens[] = $token; + } + + $this->assertEquals($expected, $actualTokens); + } }