@@ -5,7 +5,6 @@ package parsing
55import core .Names ._ , core .Contexts ._ , core .Decorators ._ , util .Spans ._
66import core .StdNames ._ , core .Comments ._
77import util .SourceFile
8- import java .lang .Character .isDigit
98import util .Chars ._
109import util .{SourcePosition , CharBuffer }
1110import util .Spans .Span
@@ -706,6 +705,44 @@ object Scanners {
706705 recur(lastOffset, false )
707706 }
708707
708+ import Character .{isHighSurrogate , isLowSurrogate , isUnicodeIdentifierPart , isUnicodeIdentifierStart , isValidCodePoint , toCodePoint }
709+
/** Tries to consume a supplementary character whose first code unit is `high`.
 *
 *  When `high` is not a high surrogate, nothing is consumed and the result is false.
 *  Otherwise `nextChar()` is called and the new `ch` is examined as the candidate
 *  low surrogate:
 *   - if it is a low surrogate it is consumed as well; when the combined code point is
 *     valid and satisfies `test`, both units are appended with `putChar` and the result
 *     is true, otherwise an "illegal character" error is reported and the result is false;
 *   - if it is not a low surrogate and `strict` is false, the lone high surrogate is
 *     appended and the result is true;
 *   - if it is not a low surrogate and `strict` is true, a "missing low surrogate" error
 *     is reported and the result is false.
 *
 *  @param high   candidate high surrogate (the visible callers pass the current `ch`)
 *  @param test   predicate applied to the combined code point
 *  @param strict whether a high surrogate without a following low surrogate is an error
 *  @return true when one or two chars were appended with `putChar`
 */
private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean =
  if !isHighSurrogate(high) then false
  else
    nextChar()
    val low = ch
    if isLowSurrogate(low) then
      nextChar()
      val cp = toCodePoint(high, low)
      val accepted = isValidCodePoint(cp) && test(cp)
      if accepted then
        putChar(high)
        putChar(low)
      else
        error(f"illegal character '\u${high.toInt}%04x\u${low.toInt}%04x'")
        // error(f"illegal character '\\u$high%04x\\u$low%04x'")
      accepted
    else if strict then
      error(f"illegal character '\u${high.toInt}%04x' missing low surrogate")
      // error(f"illegal character '\\u$high%04x' missing low surrogate")
      false
    else
      // Lenient mode: keep the orphan high surrogate as-is.
      putChar(high)
      true
/** Checks whether `ch` together with the char returned by `lookaheadChar()` forms a
 *  surrogate pair whose code point is valid and satisfies `f`.
 *
 *  This method itself calls neither `nextChar` nor `putChar`; it only inspects
 *  `ch` and the lookahead char.
 *
 *  @param ch candidate high surrogate
 *  @param f  predicate applied to the combined code point
 */
private def atSupplementary(ch: Char, f: Int => Boolean): Boolean =
  if !isHighSurrogate(ch) then false
  else
    val following = lookaheadChar()
    if !isLowSurrogate(following) then false
    else
      val cp = toCodePoint(ch, following)
      isValidCodePoint(cp) && f(cp)
745+
709746 /** read next token, filling TokenData fields of Scanner.
710747 */
711748 protected final def fetchToken (): Unit = {
@@ -832,11 +869,12 @@ object Scanners {
832869 else ch match {
833870 case '{' | '[' | ' ' | '\t ' if lookaheadChar() != '\' ' =>
834871 token = QUOTE
835- case _ if ! isAtEnd && ( ch != SU && ch != CR && ch != LF || isUnicodeEscape) =>
872+ case _ if ! isAtEnd && ch != SU && ch != CR && ch != LF =>
836873 val isEmptyCharLit = (ch == '\' ' )
837874 getLitChar()
838875 if ch == '\' ' then
839876 if isEmptyCharLit then error(" empty character literal (use '\\ '' for single quote)" )
877+ else if litBuf.length != 1 then error(" illegal codepoint in Char constant: " + litBuf.toString.map(c => f " \u ${c.toInt}%04x " ).mkString(" '" , " " , " '" )) // FIXME format
840878 else finishCharLit()
841879 else if isEmptyCharLit then error(" empty character literal" )
842880 else error(" unclosed character literal" )
@@ -879,9 +917,11 @@ object Scanners {
879917 def fetchOther () =
880918 if (ch == '\u21D2 ' ) {
881919 nextChar(); token = ARROW
920+ report.deprecationWarning(" The unicode arrow `⇒` is deprecated, use `=>` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code." , sourcePos(offset))
882921 }
883922 else if (ch == '\u2190 ' ) {
884923 nextChar(); token = LARROW
924+ report.deprecationWarning(" The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code." , sourcePos(offset))
885925 }
886926 else if (Character .isUnicodeIdentifierStart(ch)) {
887927 putChar(ch)
@@ -893,9 +933,12 @@ object Scanners {
893933 nextChar()
894934 getOperatorRest()
895935 }
936+ else if isSupplementary(ch, isUnicodeIdentifierStart) then
937+ getIdentRest()
896938 else {
897- // FIXME: Dotty deviation: f"" interpolator is not supported (#1814)
898- error(" illegal character '\\ u%04x'" .format(ch : Int ))
939+ // FIXME: Dotty deviation: f"" interpolator doesn't handle char or escaped backslash
940+ // error(f"illegal character '\\u$ch%04x'")
941+ error(f " illegal character ' \u ${ch.toInt}%04x' " )
899942 nextChar()
900943 }
901944 fetchOther()
@@ -1034,11 +1077,12 @@ object Scanners {
10341077 case SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true!
10351078 finishNamed()
10361079 case _ =>
1037- if ( Character . isUnicodeIdentifierPart(ch)) {
1080+ if isUnicodeIdentifierPart(ch) then
10381081 putChar(ch)
10391082 nextChar()
10401083 getIdentRest()
1041- }
1084+ else if isSupplementary(ch, isUnicodeIdentifierPart) then
1085+ getIdentRest()
10421086 else
10431087 finishNamed()
10441088 }
@@ -1121,7 +1165,7 @@ object Scanners {
11211165 }
11221166
11231167 // for interpolated strings
1124- @ annotation. tailrec private def getStringPart (multiLine : Boolean ): Unit =
1168+ @ tailrec private def getStringPart (multiLine : Boolean ): Unit =
11251169 if (ch == '"' )
11261170 if (multiLine) {
11271171 nextRawChar()
@@ -1146,6 +1190,28 @@ object Scanners {
11461190 getStringPart(multiLine)
11471191 }
11481192 else if (ch == '$' ) {
/** Scans the identifier that follows `$` in an interpolated string.
 *
 *  First closes the current string part (`setStrVal()`, `token = STRINGPART`) and
 *  sets both offsets of `next` to `charOffset - 1`. Then it consumes the identifier
 *  start (two chars when `hasSupplement` is true, i.e. a surrogate pair), followed
 *  by every identifier-part char or surrogate pair, and hands the result to
 *  `finishNamed(target = next)`.
 */
def getInterpolatedIdentRest(hasSupplement: Boolean): Unit =
  // Consume identifier-part chars until neither a plain char nor a surrogate pair matches.
  def consumeRest(): Unit =
    var scanning = true
    while scanning do
      if ch != SU && isUnicodeIdentifierPart(ch) then
        putChar(ch)
        nextRawChar()
      else if atSupplementary(ch, isUnicodeIdentifierPart) then
        // high surrogate, then low surrogate
        putChar(ch)
        nextRawChar()
        putChar(ch)
        nextRawChar()
      else
        scanning = false
    finishNamed(target = next)
  end consumeRest

  setStrVal()
  token = STRINGPART
  val identStart = charOffset - 1
  next.lastOffset = identStart
  next.offset = identStart
  putChar(ch)
  nextRawChar()
  if hasSupplement then
    putChar(ch)
    nextRawChar()
  consumeRest()
end getInterpolatedIdentRest
1214+
11491215 nextRawChar()
11501216 if (ch == '$' || ch == '"' ) {
11511217 putChar(ch)
@@ -1156,18 +1222,10 @@ object Scanners {
11561222 setStrVal()
11571223 token = STRINGPART
11581224 }
1159- else if (Character .isUnicodeIdentifierStart(ch) || ch == '_' ) {
1160- setStrVal()
1161- token = STRINGPART
1162- next.lastOffset = charOffset - 1
1163- next.offset = charOffset - 1
1164- while
1165- putChar(ch)
1166- nextRawChar()
1167- ch != SU && Character .isUnicodeIdentifierPart(ch)
1168- do ()
1169- finishNamed(target = next)
1170- }
1225+ else if isUnicodeIdentifierStart(ch) || ch == '_' then
1226+ getInterpolatedIdentRest(hasSupplement = false )
1227+ else if atSupplementary(ch, isUnicodeIdentifierStart) then
1228+ getInterpolatedIdentRest(hasSupplement = true )
11711229 else
11721230 error(" invalid string interpolation: `$$`, `$\" `, `$`ident or `$`BlockExpr expected" )
11731231 }
@@ -1213,76 +1271,76 @@ object Scanners {
12131271 false
12141272 }
12151273
/** Appends the current literal character (via `putChar`) and advances past it.
 *
 *  A backslash starts an escape sequence, which is delegated to `charEscape()`.
 *  Otherwise a surrogate pair is consumed as a unit by `isSupplementary` in
 *  non-strict mode, so an orphan high surrogate is accepted rather than reported.
 *  Any other character is copied as-is.
 */
protected def getLitChar(): Unit =
  if ch != '\\' then
    // isSupplementary consumes and buffers the char(s) itself when it returns true.
    val handled = isSupplementary(ch, _ => true, strict = false)
    if !handled then
      putChar(ch)
      nextChar()
  else
    nextChar()
    charEscape()
1282- protected def invalidEscape (): Unit = {
/** Interprets the escape whose selector char is the current `ch`
 *  (the caller has already stepped past the backslash).
 *
 *  Single-char escapes append the denoted char; `u`/`U` and octal digits are
 *  delegated to `uEscape()` / `octalEscape()`; anything else goes to
 *  `invalidEscape()`. Afterwards `nextChar()` is called unless the delegate
 *  returned false.
 */
private def charEscape(): Unit =
  // Append `c` and signal that the selector char still has to be skipped.
  def simple(c: Char): Boolean =
    putChar(c)
    true
  val advance = ch match
    case 'b'  => simple('\b')
    case 't'  => simple('\t')
    case 'n'  => simple('\n')
    case 'f'  => simple('\f')
    case 'r'  => simple('\r')
    case '\"' => simple('\"')
    case '\'' => simple('\'')
    case '\\' => simple('\\')
    case 'u' | 'U' => uEscape()
    case d if '0' <= d && d <= '7' => octalEscape()
    case _ =>
      invalidEscape()
      true
  if advance then nextChar()
end charEscape
1303+
/** Reads a unicode escape body: skips every leading `u`/`U`, then expects four
 *  hex digits and appends the resulting char.
 *
 *  On the first non-hex char an error is reported at `charOffset - 1`, that char
 *  is appended instead, and scanning of the escape stops without consuming it.
 *
 *  @return always false
 */
private def uEscape(): Boolean =
  while ch == 'u' || ch == 'U' do nextChar()
  var codeUnit = 0
  var remaining = 4
  var malformed = false
  while remaining > 0 && !malformed do
    val digit = digit2int(ch, 16)
    if digit < 0 then
      error("invalid character in unicode escape sequence", charOffset - 1)
      putChar(ch)
      malformed = true
    else
      // Most significant nibble first: shifts are 12, 8, 4, 0.
      codeUnit += digit << ((remaining - 1) * 4)
      nextChar()
      remaining -= 1
  if !malformed then putChar(codeUnit.toChar)
  false
end uEscape
1322+
/** Consumes an octal escape starting at the current digit: one digit, optionally a
 *  second, and a third only when the first digit is at most '3'.
 *
 *  Octal escapes are rejected: an error suggesting an alternative spelling is
 *  reported at the offset computed on entry (`charOffset - 2`), but the decoded
 *  char is still appended.
 *
 *  @return always false
 */
private def octalEscape(): Boolean =
  val start = charOffset - 2
  val first: Char = ch
  def atOctalDigit: Boolean = '0' <= ch && ch <= '7'
  var value: Int = digit2int(ch, 8)
  nextChar()
  if atOctalDigit then
    value = value * 8 + digit2int(ch, 8)
    nextChar()
    if first <= '3' && atOctalDigit then
      value = value * 8 + digit2int(ch, 8)
      nextChar()
  // val alt = if (oct == LF) "\\n" else f"\\u$oct%04x"
  val alt = if value == LF then raw"\n" else f"\u$value%04x"
  error(s"octal escape literals are unsupported: use $alt instead", start)
  putChar(value.toChar)
  false
end octalEscape
1340+
/** Reports an invalid escape at `charOffset - 1` and appends the current char anyway. */
protected def invalidEscape(): Unit =
  val badCharOffset = charOffset - 1
  error("invalid escape character", badCharOffset)
  putChar(ch)
end invalidEscape
12861344
12871345 private def getLitChars (delimiter : Char ) =
12881346 while (ch != delimiter && ! isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape))
@@ -1365,25 +1423,22 @@ object Scanners {
13651423 setStrVal()
13661424 }
13671425
/** Completes a character literal: steps past the current char (the closing quote at
 *  the visible call sites), sets `token` to CHARLIT and records the value with `setStrVal()`.
 */
private def finishCharLit(): Unit =
  nextChar()
  token = CHARLIT
  setStrVal()
end finishCharLit
13731430
/** Appends the current char and advances; if a closing `'` follows, finishes a
 *  character literal. Otherwise the token becomes `op`, `strVal` is taken from
 *  `name` (null when `name` is null) and the literal buffer is cleared.
 *
 *  @param op token to use when no closing quote follows; evaluated only in that case
 */
def charLitOr(op: => Token): Unit =
  putChar(ch)
  nextChar()
  ch match
    case '\'' => finishCharLit()
    case _ =>
      token = op
      strVal = if name != null then name.toString else null
      litBuf.clear()
end charLitOr
13871442
13881443 override def toString : String =
13891444 showTokenDetailed(token) + {
0 commit comments