@@ -94,6 +94,8 @@ pub enum Token {
9494 NationalStringLiteral ( String ) ,
9595 /// "escaped" string literal, which are an extension to the SQL standard: i.e: e'first \n second' or E 'first \n second'
9696 EscapedStringLiteral ( String ) ,
97+ /// Unicode string literal: i.e: U&'first \000A second'
98+ UnicodeStringLiteral ( String ) ,
9799 /// Hexadecimal string literal: i.e.: X'deadbeef'
98100 HexStringLiteral ( String ) ,
99101 /// Comma
@@ -251,6 +253,7 @@ impl fmt::Display for Token {
251253 Token :: DollarQuotedString ( ref s) => write ! ( f, "{s}" ) ,
252254 Token :: NationalStringLiteral ( ref s) => write ! ( f, "N'{s}'" ) ,
253255 Token :: EscapedStringLiteral ( ref s) => write ! ( f, "E'{s}'" ) ,
256+ Token :: UnicodeStringLiteral ( ref s) => write ! ( f, "U&'{s}'" ) ,
254257 Token :: HexStringLiteral ( ref s) => write ! ( f, "X'{s}'" ) ,
255258 Token :: SingleQuotedByteStringLiteral ( ref s) => write ! ( f, "B'{s}'" ) ,
256259 Token :: TripleSingleQuotedByteStringLiteral ( ref s) => write ! ( f, "B'''{s}'''" ) ,
@@ -794,6 +797,23 @@ impl<'a> Tokenizer<'a> {
794797 }
795798 }
796799 }
800+ // Unicode string literals like U&'first \000A second' are supported in some dialects, including PostgreSQL
801+ x @ 'u' | x @ 'U' if self . dialect . supports_unicode_string_literal ( ) => {
802+ chars. next ( ) ; // consume, to check the next char
803+ if chars. peek ( ) == Some ( & '&' ) {
804+ // we cannot advance the iterator here, as we need to consume the '&' later if the 'u' was an identifier
805+ let mut chars_clone = chars. peekable . clone ( ) ;
806+ chars_clone. next ( ) ; // consume the '&' in the clone
807+ if chars_clone. peek ( ) == Some ( & '\'' ) {
808+ chars. next ( ) ; // consume the '&' in the original iterator
809+ let s = unescape_unicode_single_quoted_string ( chars) ?;
810+ return Ok ( Some ( Token :: UnicodeStringLiteral ( s) ) ) ;
811+ }
812+ }
813+ // regular identifier starting with a "U" or "u"
814+ let s = self . tokenize_word ( x, chars) ;
815+ Ok ( Some ( Token :: make_word ( & s, None ) ) )
816+ }
797817 // The spec only allows an uppercase 'X' to introduce a hex
798818 // string, but PostgreSQL, at least, allows a lowercase 'x' too.
799819 x @ 'x' | x @ 'X' => {
@@ -1797,6 +1817,64 @@ impl<'a: 'b, 'b> Unescape<'a, 'b> {
17971817 }
17981818}
17991819
1820+ fn unescape_unicode_single_quoted_string ( chars : & mut State < ' _ > ) -> Result < String , TokenizerError > {
1821+ let mut unescaped = String :: new ( ) ;
1822+ chars. next ( ) ; // consume the opening quote
1823+ while let Some ( c) = chars. next ( ) {
1824+ match c {
1825+ '\'' => {
1826+ if chars. peek ( ) == Some ( & '\'' ) {
1827+ chars. next ( ) ;
1828+ unescaped. push ( '\'' ) ;
1829+ } else {
1830+ return Ok ( unescaped) ;
1831+ }
1832+ }
1833+ '\\' => match chars. peek ( ) {
1834+ Some ( '\\' ) => {
1835+ chars. next ( ) ;
1836+ unescaped. push ( '\\' ) ;
1837+ }
1838+ Some ( '+' ) => {
1839+ chars. next ( ) ;
1840+ unescaped. push ( take_char_from_hex_digits ( chars, 6 ) ?) ;
1841+ }
1842+ _ => unescaped. push ( take_char_from_hex_digits ( chars, 4 ) ?) ,
1843+ } ,
1844+ _ => {
1845+ unescaped. push ( c) ;
1846+ }
1847+ }
1848+ }
1849+ Err ( TokenizerError {
1850+ message : "Unterminated unicode encoded string literal" . to_string ( ) ,
1851+ location : chars. location ( ) ,
1852+ } )
1853+ }
1854+
1855+ fn take_char_from_hex_digits (
1856+ chars : & mut State < ' _ > ,
1857+ max_digits : usize ,
1858+ ) -> Result < char , TokenizerError > {
1859+ let mut result = 0u32 ;
1860+ for _ in 0 ..max_digits {
1861+ let next_char = chars. next ( ) . ok_or_else ( || TokenizerError {
1862+ message : "Unexpected EOF while parsing hex digit in escaped unicode string."
1863+ . to_string ( ) ,
1864+ location : chars. location ( ) ,
1865+ } ) ?;
1866+ let digit = next_char. to_digit ( 16 ) . ok_or_else ( || TokenizerError {
1867+ message : format ! ( "Invalid hex digit in escaped unicode string: {}" , next_char) ,
1868+ location : chars. location ( ) ,
1869+ } ) ?;
1870+ result = result * 16 + digit;
1871+ }
1872+ char:: from_u32 ( result) . ok_or_else ( || TokenizerError {
1873+ message : format ! ( "Invalid unicode character: {:x}" , result) ,
1874+ location : chars. location ( ) ,
1875+ } )
1876+ }
1877+
18001878#[ cfg( test) ]
18011879mod tests {
18021880 use super :: * ;
0 commit comments