From 209064d6e110634489f7ce387cfe9000cf1b0299 Mon Sep 17 00:00:00 2001 From: Hans Ott Date: Tue, 21 Jan 2025 18:12:18 +0100 Subject: [PATCH 1/6] Only support escape literals for Postgres, Redshift and generic dialect --- src/tokenizer.rs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 39ca84c9f..a7104d436 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -43,7 +43,7 @@ use sqlparser_derive::{Visit, VisitMut}; use crate::dialect::Dialect; use crate::dialect::{ BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect, - SnowflakeDialect, + RedshiftSqlDialect, SnowflakeDialect, }; use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX}; use crate::{ast::DollarQuotedString, dialect::HiveDialect}; @@ -982,7 +982,8 @@ impl<'a> Tokenizer<'a> { } } // PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard. - x @ 'e' | x @ 'E' => { + x @ 'e' | x @ 'E' if dialect_of!(self is PostgreSqlDialect | RedshiftSqlDialect | GenericDialect) => + { let starting_loc = chars.location(); chars.next(); // consume, to check the next char match chars.peek() { @@ -3543,4 +3544,18 @@ mod tests { ]; compare(expected, tokens); } + + #[test] + fn test_mysql_escape_literal() { + let dialect = MySqlDialect {}; + let sql = "select e'\\u'"; + let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); + let expected = vec![ + Token::make_keyword("select"), + Token::Whitespace(Whitespace::Space), + Token::make_word("e", None), + Token::SingleQuotedString("u".to_string()), + ]; + compare(expected, tokens); + } } From db937fa9bfcfab470873d5b85acc166f97208704 Mon Sep 17 00:00:00 2001 From: Hans Ott Date: Thu, 23 Jan 2025 16:09:46 +0100 Subject: [PATCH 2/6] Add supports_string_escape_constant() to dialect --- src/dialect/generic.rs | 4 +++ src/dialect/mod.rs | 7 +++++ src/dialect/postgresql.rs | 4 +++ src/dialect/redshift.rs | 4 +++ src/test_utils.rs 
| 14 +++++++++- src/tokenizer.rs | 56 ++++++++++++++++++++++++++++++--------- 6 files changed, 75 insertions(+), 14 deletions(-) diff --git a/src/dialect/generic.rs b/src/dialect/generic.rs index d696861b5..4021b5753 100644 --- a/src/dialect/generic.rs +++ b/src/dialect/generic.rs @@ -139,4 +139,8 @@ impl Dialect for GenericDialect { fn supports_user_host_grantee(&self) -> bool { true } + + fn supports_string_escape_constant(&self) -> bool { + true + } } diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs index 64dbc4b1b..fc697ddad 100644 --- a/src/dialect/mod.rs +++ b/src/dialect/mod.rs @@ -820,6 +820,13 @@ pub trait Dialect: Debug + Any { fn supports_set_stmt_without_operator(&self) -> bool { false } + + /// Returns true if this dialect supports the E'...' syntax for string literals + /// + /// Postgres: <https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-ESCAPE> + fn supports_string_escape_constant(&self) -> bool { + false + } } /// This represents the operators for which precedence must be defined diff --git a/src/dialect/postgresql.rs b/src/dialect/postgresql.rs index 170b0a7c9..d4f2a032e 100644 --- a/src/dialect/postgresql.rs +++ b/src/dialect/postgresql.rs @@ -245,6 +245,10 @@ impl Dialect for PostgreSqlDialect { fn supports_nested_comments(&self) -> bool { true } + + fn supports_string_escape_constant(&self) -> bool { + true + } } pub fn parse_create(parser: &mut Parser) -> Option<Result<Statement, ParserError>> { diff --git a/src/dialect/redshift.rs b/src/dialect/redshift.rs index 55405ba53..a4522bbf8 100644 --- a/src/dialect/redshift.rs +++ b/src/dialect/redshift.rs @@ -109,4 +109,8 @@ impl Dialect for RedshiftSqlDialect { fn supports_partiql(&self) -> bool { true } + + fn supports_string_escape_constant(&self) -> bool { + true + } } diff --git a/src/test_utils.rs b/src/test_utils.rs index 914be7d9f..51e4fd748 100644 --- a/src/test_utils.rs +++ b/src/test_utils.rs @@ -33,7 +33,7 @@ use core::fmt::Debug; use crate::dialect::*; use crate::parser::{Parser, ParserError}; -use crate::tokenizer::Tokenizer; +use crate::tokenizer::{Token, Tokenizer}; 
use crate::{ast::*, parser::ParserOptions}; #[cfg(test)] @@ -237,6 +237,18 @@ impl TestedDialects { pub fn verified_expr(&self, sql: &str) -> Expr { self.expr_parses_to(sql, sql) } + + /// Check that the tokenizer returns the expected tokens for the given SQL. + pub fn tokenizes_to(&self, sql: &str, expected: Vec<Token>) { + self.dialects.iter().for_each(|dialect| { + let mut tokenizer = Tokenizer::new(&**dialect, sql); + if let Some(options) = &self.options { + tokenizer = tokenizer.with_unescape(options.unescape); + } + let tokens = tokenizer.tokenize().unwrap(); + assert_eq!(expected, tokens); + }); + } } /// Returns all available dialects. diff --git a/src/tokenizer.rs b/src/tokenizer.rs index a7104d436..555db681d 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -982,8 +982,7 @@ impl<'a> Tokenizer<'a> { } } // PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard. - x @ 'e' | x @ 'E' if dialect_of!(self is PostgreSqlDialect | RedshiftSqlDialect | GenericDialect) => - { + x @ 'e' | x @ 'E' if self.dialect.supports_string_escape_constant() => { let starting_loc = chars.location(); chars.next(); // consume, to check the next char match chars.peek() { @@ -2156,6 +2155,7 @@ mod tests { use crate::dialect::{ BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect, SQLiteDialect, }; + use crate::test_utils::all_dialects_where; use core::fmt::Debug; #[test] @@ -3546,16 +3546,46 @@ mod tests { } #[test] - fn test_mysql_escape_literal() { - let dialect = MySqlDialect {}; - let sql = "select e'\\u'"; - let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap(); - let expected = vec![ - Token::make_keyword("select"), - Token::Whitespace(Whitespace::Space), - Token::make_word("e", None), - Token::SingleQuotedString("u".to_string()), - ]; - compare(expected, tokens); + fn test_string_escape_constant_not_supported() { + all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to( + "select 
e'...'", + vec![ + Token::make_keyword("select"), + Token::Whitespace(Whitespace::Space), + Token::make_word("e", None), + Token::SingleQuotedString("...".to_string()), + ], + ); + + all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to( + "select E'...'", + vec![ + Token::make_keyword("select"), + Token::Whitespace(Whitespace::Space), + Token::make_word("E", None), + Token::SingleQuotedString("...".to_string()), + ], + ); + } + + #[test] + fn test_string_escape_constant_supported() { + all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to( + "select e'...'", + vec![ + Token::make_keyword("select"), + Token::Whitespace(Whitespace::Space), + Token::EscapedStringLiteral("...".to_string()), + ], + ); + + all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to( + "select E'...'", + vec![ + Token::make_keyword("select"), + Token::Whitespace(Whitespace::Space), + Token::EscapedStringLiteral("...".to_string()), + ], + ); } } From 97dd5e2f7a7ae2e2f5ac453bcdd9fef326c98ae8 Mon Sep 17 00:00:00 2001 From: Hans Ott Date: Thu, 23 Jan 2025 16:10:48 +0100 Subject: [PATCH 3/6] Remove SnowflakeDialect --- src/tokenizer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 555db681d..bd14e837d 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -43,7 +43,7 @@ use sqlparser_derive::{Visit, VisitMut}; use crate::dialect::Dialect; use crate::dialect::{ BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect, - RedshiftSqlDialect, SnowflakeDialect, + SnowflakeDialect, }; use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX}; use crate::{ast::DollarQuotedString, dialect::HiveDialect}; From 881817b6c133dd9bc3c0b061fb8325994ef24247 Mon Sep 17 00:00:00 2001 From: Hans Ott Date: Thu, 23 Jan 2025 16:53:17 +0100 Subject: [PATCH 4/6] Improve test --- src/test_utils.rs | 6 +++++- src/tokenizer.rs | 40 
++++++++++++++++++++++++++++------------ 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/src/test_utils.rs b/src/test_utils.rs index 51e4fd748..8e371b9e5 100644 --- a/src/test_utils.rs +++ b/src/test_utils.rs @@ -240,13 +240,17 @@ impl TestedDialects { /// Check that the tokenizer returns the expected tokens for the given SQL. pub fn tokenizes_to(&self, sql: &str, expected: Vec<Token>) { + if self.dialects.len() == 0 { + panic!("No dialects to test"); + } + self.dialects.iter().for_each(|dialect| { let mut tokenizer = Tokenizer::new(&**dialect, sql); if let Some(options) = &self.options { tokenizer = tokenizer.with_unescape(options.unescape); } let tokens = tokenizer.tokenize().unwrap(); - assert_eq!(expected, tokens); + assert_eq!(expected, tokens, "Tokenized differently for {:?}", dialect); }); } } diff --git a/src/tokenizer.rs b/src/tokenizer.rs index bd14e837d..9d89ba4ab 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -3547,44 +3547,60 @@ mod tests { #[test] fn test_string_escape_constant_not_supported() { - all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to( - "select e'...'", + all_dialects_where(|dialect| { + !dialect.supports_string_escape_constant() + && !dialect.supports_string_literal_backslash_escape() + }) + .tokenizes_to( + "select e'\\n'", vec![ Token::make_keyword("select"), Token::Whitespace(Whitespace::Space), Token::make_word("e", None), - Token::SingleQuotedString("...".to_string()), + Token::SingleQuotedString("\\n".to_string()), ], ); - all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to( - "select E'...'", + all_dialects_where(|dialect| { + !dialect.supports_string_escape_constant() + && !dialect.supports_string_literal_backslash_escape() + }) + .tokenizes_to( + "select E'\\n'", vec![ Token::make_keyword("select"), Token::Whitespace(Whitespace::Space), Token::make_word("E", None), - Token::SingleQuotedString("...".to_string()), + 
Token::SingleQuotedString("\\n".to_string()), ], ); } #[test] fn test_string_escape_constant_supported() { - all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to( - "select e'...'", + all_dialects_where(|dialect| { + dialect.supports_string_escape_constant() + && !dialect.supports_string_literal_backslash_escape() + }) + .tokenizes_to( + "select e'\\''", vec![ Token::make_keyword("select"), Token::Whitespace(Whitespace::Space), - Token::EscapedStringLiteral("...".to_string()), + Token::EscapedStringLiteral("'".to_string()), ], ); - all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to( - "select E'...'", + all_dialects_where(|dialect| { + dialect.supports_string_escape_constant() + && !dialect.supports_string_literal_backslash_escape() + }) + .tokenizes_to( + "select E'\\''", vec![ Token::make_keyword("select"), Token::Whitespace(Whitespace::Space), - Token::EscapedStringLiteral("...".to_string()), + Token::EscapedStringLiteral("'".to_string()), ], ); } From 2588a2e05cd19c293eb91e9b07e0dae81c17fd5a Mon Sep 17 00:00:00 2001 From: Hans Ott Date: Thu, 23 Jan 2025 17:57:41 +0100 Subject: [PATCH 5/6] Simplify test --- src/tokenizer.rs | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 9d89ba4ab..d0de8c90d 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -3547,42 +3547,30 @@ mod tests { #[test] fn test_string_escape_constant_not_supported() { - all_dialects_where(|dialect| { - !dialect.supports_string_escape_constant() - && !dialect.supports_string_literal_backslash_escape() - }) - .tokenizes_to( - "select e'\\n'", + all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to( + "select e'...'", vec![ Token::make_keyword("select"), Token::Whitespace(Whitespace::Space), Token::make_word("e", None), - Token::SingleQuotedString("\\n".to_string()), + 
Token::SingleQuotedString("...".to_string()), ], ); - all_dialects_where(|dialect| { - !dialect.supports_string_escape_constant() - && !dialect.supports_string_literal_backslash_escape() - }) - .tokenizes_to( - "select E'\\n'", + all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to( + "select E'...'", vec![ Token::make_keyword("select"), Token::Whitespace(Whitespace::Space), Token::make_word("E", None), - Token::SingleQuotedString("\\n".to_string()), + Token::SingleQuotedString("...".to_string()), ], ); } #[test] fn test_string_escape_constant_supported() { - all_dialects_where(|dialect| { - dialect.supports_string_escape_constant() - && !dialect.supports_string_literal_backslash_escape() - }) - .tokenizes_to( + all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to( "select e'\\''", vec![ Token::make_keyword("select"), @@ -3591,11 +3579,7 @@ mod tests { ], ); - all_dialects_where(|dialect| { - dialect.supports_string_escape_constant() - && !dialect.supports_string_literal_backslash_escape() - }) - .tokenizes_to( + all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to( "select E'\\''", vec![ Token::make_keyword("select"), From 09e4b1541c533df4d988a414ae96e88468e0f1be Mon Sep 17 00:00:00 2001 From: Hans Ott Date: Thu, 23 Jan 2025 21:52:04 +0100 Subject: [PATCH 6/6] Use more expressive is_empty() --- src/test_utils.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test_utils.rs b/src/test_utils.rs index 8e371b9e5..1c322f654 100644 --- a/src/test_utils.rs +++ b/src/test_utils.rs @@ -240,7 +240,7 @@ impl TestedDialects { /// Check that the tokenizer returns the expected tokens for the given SQL. pub fn tokenizes_to(&self, sql: &str, expected: Vec<Token>) { - if self.dialects.len() == 0 { + if self.dialects.is_empty() { panic!("No dialects to test"); }