Skip to content

Fixing the string tokenization #199

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Nov 29, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 60 additions & 54 deletions src/PHPCR/Util/QOM/Sql2Scanner.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,6 @@ class Sql2Scanner
*/
protected $tokens;

/**
* Delimiters between tokens.
*
* @var array
*/
protected $delimiters;

/**
* Parsing position in the SQL string.
*
Expand Down Expand Up @@ -68,16 +61,6 @@ public function lookupNextToken($offset = 0)
return '';
}

/**
* Get the delimiter that separated the two previous tokens.
*
* @return string
*/
public function getPreviousDelimiter()
{
return isset($this->delimiters[$this->curpos - 1]) ? $this->delimiters[$this->curpos - 1] : ' ';
}

/**
* Get the next token and remove it from the queue.
* Return an empty string when there are no more tokens.
Expand Down Expand Up @@ -116,12 +99,12 @@ public function expectToken($token, $case_insensitive = true)
* Expect the next tokens to be the one given in the array of tokens and
* throws an exception if it's not the case.
*
* @see expectToken
*
* @param array $tokens
* @param bool $case_insensitive
*
* @throws InvalidQueryException
*
* @see expectToken
*/
public function expectTokens($tokens, $case_insensitive = true)
{
Expand Down Expand Up @@ -151,7 +134,7 @@ public function tokenIs($token, $value, $case_insensitive = true)
}

/**
* Scan a SQL2 string a extract the tokens.
* Scan a SQL2 string and extract the tokens.
*
* @param string $sql2
*
Expand All @@ -160,49 +143,72 @@ public function tokenIs($token, $value, $case_insensitive = true)
protected function scan($sql2)
{
$tokens = [];
$token = strtok($sql2, " \n\t");
while ($token !== false) {
$this->tokenize($tokens, $token);
$token = strtok(" \n\t");
$currentToken = '';
$tokenEndChars = ['.', ',', '(', ')', '='];

$stringStartCharacter = false;
$isEscaped = false;
$escapedQuotesCount = 0;
foreach (\str_split($sql2) as $index => $character) {
if (!$stringStartCharacter && in_array($character, [' ', "\t", "\n"], true)) {
if ($currentToken !== '') {
$tokens[] = $currentToken;
}
$currentToken = '';
continue;
}
if (!$stringStartCharacter && in_array($character, $tokenEndChars, true)) {
if ($currentToken !== '') {
$tokens[] = $currentToken;
}
$tokens[] = $character;
$currentToken = '';
continue;
}
$currentToken .= $character;

if (!$isEscaped && in_array($character, ['"', "'"], true)) {
// Checking if the previous or next value is a ' to handle the weird SQL strings
// This will not check if the amount of quotes is even
$nextCharacter = $this->getCharacterAtIndex($sql2, $index + 1);
if ($character === "'" && $nextCharacter === "'") {
$isEscaped = true;
$escapedQuotesCount++;
continue;
}
// If the escaped quotes are not paired up. eg. "I'''m cool" would be a parsing error
if ($escapedQuotesCount % 2 == 1 && $stringStartCharacter !== "'") {
throw new InvalidQueryException("Syntax error: Number of single quotes to be even: $currentToken");
}
if ($character === $stringStartCharacter) {
// reached the end of the string
$stringStartCharacter = false;
$tokens[] = $currentToken;
$currentToken = '';
} elseif (!$stringStartCharacter) {
// If there is no start character already we have found the beginning of a new string
$stringStartCharacter = $character;
}
}
$isEscaped = $character === '\\';
}

$regexpTokens = [];
foreach ($tokens as $token) {
$regexpTokens[] = preg_quote($token, '/');
if ($currentToken !== '') {
$tokens[] = $currentToken;
}

$regexp = '/^'.implode('([ \t\n]*)', $regexpTokens).'$/';
preg_match($regexp, $sql2, $this->delimiters);
$this->delimiters[0] = '';
if ($stringStartCharacter) {
throw new InvalidQueryException("Syntax error: unterminated quoted string $currentToken in '$sql2'");
}

return $tokens;
}

/**
* Tokenize a string returned by strtok to split the string at '.', ',', '(', '='
* and ')' characters.
*
* @param array $tokens
* @param string $token
*/
protected function tokenize(&$tokens, $token)
private function getCharacterAtIndex($string, $index)
{
$buffer = '';
for ($i = 0; $i < strlen($token); $i++) {
$char = trim(substr($token, $i, 1));
if (in_array($char, ['.', ',', '(', ')', '='])) {
if ($buffer !== '') {
$tokens[] = $buffer;
$buffer = '';
}
$tokens[] = $char;
} else {
$buffer .= $char;
}
if ($index < strlen($string)) {
return $string[$index];
}

if ($buffer !== '') {
$tokens[] = $buffer;
}
return '';
}
}
45 changes: 8 additions & 37 deletions src/PHPCR/Util/QOM/Sql2ToQomQueryConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -756,27 +756,13 @@ protected function parseCastLiteral($token)
$this->scanner->expectToken('(');
$token = $this->scanner->fetchNextToken();

$quoteString = false;
if (substr($token, 0, 1) === '\'') {
$quoteString = "'";
} elseif (substr($token, 0, 1) === '"') {
$quoteString = '"';
}
$quoteString = in_array($token[0], ['\'', '"'], true);

if ($quoteString) {
while (substr($token, -1) !== $quoteString) {
$nextToken = $this->scanner->fetchNextToken();
if ('' === $nextToken) {
break;
}
$token .= $nextToken;
}

if (substr($token, -1) !== $quoteString) {
throw new InvalidQueryException("Syntax error: unterminated quoted string '$token' in '{$this->sql2}'");
}
$quotesUsed = $token[0];
$token = substr($token, 1, -1);
$token = str_replace('\\'.$quoteString, $quoteString, $token);
// Un-escaping quotes
$token = str_replace('\\'.$quotesUsed, $quotesUsed, $token);
}

$this->scanner->expectToken('AS');
Expand Down Expand Up @@ -813,28 +799,13 @@ protected function parseLiteralValue()
return $this->parseCastLiteral($token);
}

$quoteString = false;
if (substr($token, 0, 1) === '\'') {
$quoteString = "'";
} elseif (substr($token, 0, 1) === '"') {
$quoteString = '"';
}
$quoteString = in_array($token[0], ['"', "'"], true);

if ($quoteString) {
while (substr($token, -1) !== $quoteString) {
$nextToken = $this->scanner->fetchNextToken();
if ('' === $nextToken) {
break;
}
$token .= $this->scanner->getPreviousDelimiter();
$token .= $nextToken;
}

if (substr($token, -1) !== $quoteString) {
throw new InvalidQueryException("Syntax error: unterminated quoted string $token in '{$this->sql2}'");
}
$quotesUsed = $token[0];
$token = substr($token, 1, -1);
$token = str_replace('\\'.$quoteString, $quoteString, $token);
// Unescape quotes
$token = str_replace('\\'.$quotesUsed, $quotesUsed, $token);
$token = str_replace("''", "'", $token);
if (preg_match('/^\d{4}-\d{2}-\d{2}( \d{2}:\d{2}:\d+)?$/', $token)) {
if (preg_match('/^\d{4}-\d{2}-\d{2}$/', $token)) {
Expand Down
130 changes: 116 additions & 14 deletions tests/PHPCR/Tests/Util/QOM/Sql2ScannerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

namespace PHPCR\Tests\Util\QOM;

use PHPCR\Query\InvalidQueryException;
use PHPCR\Util\QOM\Sql2Scanner;
use PHPUnit\Framework\TestCase;

Expand All @@ -21,27 +22,128 @@ public function testToken()
'page',
];

while ($token = $scanner->fetchNextToken()) {
$this->assertEquals(array_shift($expected), $token);
}
$this->expectTokensFromScanner($scanner, $expected);
}

public function testDelimiter()
/**
* @dataProvider dataTestStringTokenization
*/
public function testStringTokenization()
{
$scanner = new Sql2Scanner('SELECT page.* FROM [nt:unstructured] AS page');
$scanner = new Sql2Scanner('SELECT page.* FROM [nt:unstructured] AS page WHERE name ="Hello world"');
$expected = [
'',
' ',
'',
'',
' ',
' ',
' ',
' ',
'SELECT',
'page',
'.',
'*',
'FROM',
'[nt:unstructured]',
'AS',
'page',
'WHERE',
'name',
'=',
'"Hello world"',
];

$this->expectTokensFromScanner($scanner, $expected);
}

public function dataTestStringTokenization()
{
$multilineQuery = <<<'SQL'
SELECT page.*
FROM [nt:unstructured] AS page
WHERE name ="Hello world"
SQL;

return [
'single line query' => ['SELECT page.* FROM [nt:unstructured] AS page WHERE name ="Hello world"'],
'multi line query' => [$multilineQuery],
];
}

public function testEscapingStrings()
{
$sql = <<<SQL
SELECT page.* FROM [nt:unstructured] AS page WHERE page.quotes = "\"'"
SQL;
$scanner = new Sql2Scanner($sql);
$expected = [
'SELECT',
'page',
'.',
'*',
'FROM',
'[nt:unstructured]',
'AS',
'page',
'WHERE',
'page',
'.',
'quotes',
'=',
'"\"\'"',
];

$this->expectTokensFromScanner($scanner, $expected);
}

public function testSQLEscapedStrings()
{
$sql = "WHERE page.name = 'Hello, it''s me.'";

$scanner = new Sql2Scanner($sql);
$expected = [
'WHERE',
'page',
'.',
'name',
'=',
"'Hello, it''s me.'",
];

$this->expectTokensFromScanner($scanner, $expected);
}

public function testSQLEscapedStrings2()
{
$sql = "WHERE page.name = 'Hello, it''' AND";

$scanner = new Sql2Scanner($sql);
$expected = [
'WHERE',
'page',
'.',
'name',
'=',
"'Hello, it'''",
'AND',
];

$this->expectTokensFromScanner($scanner, $expected);
}

public function testThrowingErrorOnUnclosedString()
{
$this->expectException(InvalidQueryException::class);
new Sql2Scanner('SELECT page.* FROM [nt:unstructured] AS page WHERE name ="Hello ');
}

/**
* Function to assert that the tokens the scanner finds match the expected output
* and the entire expected output is consumed.
*
* @param Sql2Scanner $scanner
* @param array<string> $expected
*/
private function expectTokensFromScanner(Sql2Scanner $scanner, array $expected)
{
$actualTokens = [];
while ($token = $scanner->fetchNextToken()) {
$this->assertEquals(array_shift($expected), $scanner->getPreviousDelimiter());
$actualTokens[] = $token;
}

$this->assertEquals($expected, $actualTokens);
}
}