@@ -5,7 +5,6 @@ package parsing
55import core .Names ._ , core .Contexts ._ , core .Decorators ._ , util .Spans ._
66import core .StdNames ._ , core .Comments ._
77import util .SourceFile
8- import java .lang .Character .isDigit
98import util .Chars ._
109import util .{SourcePosition , CharBuffer }
1110import util .Spans .Span
@@ -705,6 +704,44 @@ object Scanners {
705704 recur(lastOffset, false )
706705 }
707706
707+ import Character .{isHighSurrogate , isLowSurrogate , isUnicodeIdentifierPart , isUnicodeIdentifierStart , isValidCodePoint , toCodePoint }
708+
709+ // Reads a supplementary character whose high surrogate is `high` (callers pass the current ch).
710+ // true means chars were put to the buffer: both surrogates when the pair's codepoint passes `test`,
711+ // or only `high` when no low surrogate follows and `strict` is false (getLitChar passes strict = false).
// false is returned without calling nextChar() when `high` is not a high surrogate. In every other
// case nextChar() has been called at least once by the time this returns, and a rejected pair or
// (when strict) a missing low surrogate is reported through error(...).
712+ private def isSupplementary (high : Char , test : Int => Boolean , strict : Boolean = true ): Boolean =
713+ isHighSurrogate(high) && {
714+ var res = false
715+ nextChar() // step to the char after `high`; it is read from ch on the next line
716+ val low = ch
717+ if isLowSurrogate(low) then
718+ nextChar() // the low surrogate is stepped over whether or not the codepoint is accepted
719+ val codepoint = toCodePoint(high, low)
720+ if isValidCodePoint(codepoint) && test(codepoint) then
721+ putChar(high)
722+ putChar(low)
723+ res = true
724+ else
725+ error(f " illegal character ' \u ${high.toInt}%04x \u ${low.toInt}%04x' " )
726+ // error(f"illegal character '\\u$high%04x\\u$low%04x'")
727+ else if ! strict then
728+ putChar(high) // lenient mode: keep the orphan high surrogate; `low` is left as the current char
729+ res = true
730+ else
731+ error(f " illegal character ' \u ${high.toInt}%04x' missing low surrogate " )
732+ // error(f"illegal character '\\u$high%04x' missing low surrogate")
733+ res
734+ }
// Tests whether `ch` is a high surrogate, the char returned by lookaheadChar() is a low surrogate,
// and the codepoint formed by the pair is valid and satisfies `f`.
// Unlike isSupplementary, this calls neither nextChar() nor putChar(), and it reports no error.
// Note that the parameter `ch` shadows the scanner's current-char field of the same name.
735+ private def atSupplementary (ch : Char , f : Int => Boolean ): Boolean =
736+ isHighSurrogate(ch) && {
737+ val hi = ch
738+ val lo = lookaheadChar() // presumably the char following the current one, without advancing -- confirm
739+ isLowSurrogate(lo) && {
740+ val codepoint = toCodePoint(hi, lo)
741+ isValidCodePoint(codepoint) && f(codepoint)
742+ }
743+ }
744+
708745 /** read next token, filling TokenData fields of Scanner.
709746 */
710747 protected final def fetchToken (): Unit = {
@@ -831,11 +868,12 @@ object Scanners {
831868 else ch match {
832869 case '{' | '[' | ' ' | '\t ' if lookaheadChar() != '\' ' =>
833870 token = QUOTE
834- case _ if ! isAtEnd && ( ch != SU && ch != CR && ch != LF || isUnicodeEscape) =>
871+ case _ if ! isAtEnd && ch != SU && ch != CR && ch != LF =>
835872 val isEmptyCharLit = (ch == '\' ' )
836873 getLitChar()
837874 if ch == '\' ' then
838875 if isEmptyCharLit then error(" empty character literal (use '\\ '' for single quote)" )
876+ else if litBuf.length != 1 then error(" illegal codepoint in Char constant: " + litBuf.toString.map(c => f " \u ${c.toInt}%04x " ).mkString(" '" , " " , " '" )) // FIXME format
839877 else finishCharLit()
840878 else if isEmptyCharLit then error(" empty character literal" )
841879 else error(" unclosed character literal" )
@@ -878,9 +916,11 @@ object Scanners {
878916 def fetchOther () =
879917 if (ch == '\u21D2 ' ) {
880918 nextChar(); token = ARROW
919+ report.deprecationWarning(" The unicode arrow `⇒` is deprecated, use `=>` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code." , sourcePos(offset))
881920 }
882921 else if (ch == '\u2190 ' ) {
883922 nextChar(); token = LARROW
923+ report.deprecationWarning(" The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code." , sourcePos(offset))
884924 }
885925 else if (Character .isUnicodeIdentifierStart(ch)) {
886926 putChar(ch)
@@ -892,9 +932,12 @@ object Scanners {
892932 nextChar()
893933 getOperatorRest()
894934 }
935+ else if isSupplementary(ch, isUnicodeIdentifierStart) then
936+ getIdentRest()
895937 else {
896- // FIXME: Dotty deviation: f"" interpolator is not supported (#1814)
897- error(" illegal character '\\ u%04x'" .format(ch : Int ))
938+ // FIXME: Dotty deviation: f"" interpolator doesn't handle char or escaped backslash
939+ // error(f"illegal character '\\u$ch%04x'")
940+ error(f " illegal character ' \u ${ch.toInt}%04x' " )
898941 nextChar()
899942 }
900943 fetchOther()
@@ -1033,11 +1076,12 @@ object Scanners {
10331076 case SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true!
10341077 finishNamed()
10351078 case _ =>
1036- if ( Character . isUnicodeIdentifierPart(ch)) {
1079+ if isUnicodeIdentifierPart(ch) then
10371080 putChar(ch)
10381081 nextChar()
10391082 getIdentRest()
1040- }
1083+ else if isSupplementary(ch, isUnicodeIdentifierPart) then
1084+ getIdentRest()
10411085 else
10421086 finishNamed()
10431087 }
@@ -1120,7 +1164,7 @@ object Scanners {
11201164 }
11211165
11221166 // for interpolated strings
1123- @ annotation. tailrec private def getStringPart (multiLine : Boolean ): Unit =
1167+ @ tailrec private def getStringPart (multiLine : Boolean ): Unit =
11241168 if (ch == '"' )
11251169 if (multiLine) {
11261170 nextRawChar()
@@ -1145,6 +1189,28 @@ object Scanners {
11451189 getStringPart(multiLine)
11461190 }
11471191 else if (ch == '$' ) {
// Scans the identifier of a `$ident` splice. The current token becomes STRINGPART and the
// identifier is finished as an IDENTIFIER token into `next`.
// hasSupplement: true when the identifier starts with a surrogate pair (the caller checked this
// with atSupplementary), so two chars are buffered for the start instead of one.
1192+ def getInterpolatedIdentRest (hasSupplement : Boolean ): Unit =
// Buffers identifier-part chars (a single char, or a surrogate pair as two chars) until ch is
// neither, then finishes the identifier token.
1193+ @ tailrec def loopRest (): Unit =
1194+ if ch != SU && isUnicodeIdentifierPart(ch) then
1195+ putChar(ch) ; nextRawChar()
1196+ loopRest()
1197+ else if atSupplementary(ch, isUnicodeIdentifierPart) then
1198+ putChar(ch) ; nextRawChar() // high surrogate
1199+ putChar(ch) ; nextRawChar() // low surrogate
1200+ loopRest()
1201+ else
1202+ finishNamedToken(IDENTIFIER , target = next)
1203+ end loopRest
1204+ setStrVal() // presumably stores the string part buffered so far before identifier chars are buffered -- confirm
1205+ token = STRINGPART
1206+ next.lastOffset = charOffset - 1
1207+ next.offset = charOffset - 1
1208+ putChar(ch) ; nextRawChar() // first char of the identifier (the high surrogate when hasSupplement)
1209+ if hasSupplement then
1210+ putChar(ch) ; nextRawChar() // low surrogate of the identifier start
1211+ loopRest()
1212+ end getInterpolatedIdentRest
1213+
11481214 nextRawChar()
11491215 if (ch == '$' || ch == '"' ) {
11501216 putChar(ch)
@@ -1155,18 +1221,10 @@ object Scanners {
11551221 setStrVal()
11561222 token = STRINGPART
11571223 }
1158- else if (Character .isUnicodeIdentifierStart(ch) || ch == '_' ) {
1159- setStrVal()
1160- token = STRINGPART
1161- next.lastOffset = charOffset - 1
1162- next.offset = charOffset - 1
1163- while
1164- putChar(ch)
1165- nextRawChar()
1166- ch != SU && Character .isUnicodeIdentifierPart(ch)
1167- do ()
1168- finishNamedToken(IDENTIFIER , target = next)
1169- }
1224+ else if isUnicodeIdentifierStart(ch) || ch == '_' then
1225+ getInterpolatedIdentRest(hasSupplement = false )
1226+ else if atSupplementary(ch, isUnicodeIdentifierStart) then
1227+ getInterpolatedIdentRest(hasSupplement = true )
11701228 else
11711229 error(" invalid string interpolation: `$$`, `$\" `, `$`ident or `$`BlockExpr expected" )
11721230 }
@@ -1212,76 +1270,76 @@ object Scanners {
12121270 false
12131271 }
12141272
1215- /** copy current character into litBuf, interpreting any escape sequences,
1216- * and advance to next character.
1273+ /** Copy current character into litBuf, interpreting any escape sequences,
1274+ * and advance to next character. Surrogate pairs are consumed (see check
1275+ * at fetchSingleQuote), but an orphan surrogate is allowed.
12171276 */
12181277 protected def getLitChar (): Unit =
1219- def invalidUnicodeEscape () = {
1220- error(" invalid character in unicode escape sequence" , charOffset - 1 )
1221- putChar(ch)
1222- }
1223- def putUnicode (): Unit = {
1224- while ch == 'u' || ch == 'U' do nextChar()
1225- var i = 0
1226- var cp = 0
1227- while (i < 4 ) {
1228- val shift = (3 - i) * 4
1229- val d = digit2int(ch, 16 )
1230- if (d < 0 ) {
1231- return invalidUnicodeEscape()
1232- }
1233- cp += (d << shift)
1234- nextChar()
1235- i += 1
1236- }
1237- putChar(cp.asInstanceOf [Char ])
1238- }
1239- if (ch == '\\ ' ) {
1278+ if ch == '\\ ' then
12401279 nextChar()
1241- if ('0' <= ch && ch <= '7' ) {
1242- val start = charOffset - 2
1243- val leadch : Char = ch
1244- var oct : Int = digit2int(ch, 8 )
1245- nextChar()
1246- if ('0' <= ch && ch <= '7' ) {
1247- oct = oct * 8 + digit2int(ch, 8 )
1248- nextChar()
1249- if (leadch <= '3' && '0' <= ch && ch <= '7' ) {
1250- oct = oct * 8 + digit2int(ch, 8 )
1251- nextChar()
1252- }
1253- }
1254- val alt = if oct == LF then raw " \n " else f " \u $oct%04x "
1255- error(s " octal escape literals are unsupported: use $alt instead " , start)
1256- putChar(oct.toChar)
1257- }
1258- else if (ch == 'u' || ch == 'U' ) {
1259- putUnicode()
1260- }
1261- else {
1262- ch match {
1263- case 'b' => putChar('\b ' )
1264- case 't' => putChar('\t ' )
1265- case 'n' => putChar('\n ' )
1266- case 'f' => putChar('\f ' )
1267- case 'r' => putChar('\r ' )
1268- case '\" ' => putChar('\" ' )
1269- case '\' ' => putChar('\' ' )
1270- case '\\ ' => putChar('\\ ' )
1271- case _ => invalidEscape()
1272- }
1273- nextChar()
1274- }
1275- }
1276- else {
1280+ charEscape()
1281+ else if ! isSupplementary(ch, _ => true , strict = false ) then
12771282 putChar(ch)
12781283 nextChar()
1279- }
12801284
1281- protected def invalidEscape (): Unit = {
// Interprets the escape selected by ch; getLitChar calls this right after stepping over the backslash.
// Single-letter escapes put the corresponding char and are then stepped over by the final nextChar().
// uEscape() and octalEscape() do their own advancing and return false, which suppresses that nextChar().
1285+ private def charEscape (): Unit =
1286+ var bump = true // whether the escape char still has to be stepped over at the end
1287+ ch match
1288+ case 'b' => putChar('\b ' )
1289+ case 't' => putChar('\t ' )
1290+ case 'n' => putChar('\n ' )
1291+ case 'f' => putChar('\f ' )
1292+ case 'r' => putChar('\r ' )
1293+ case '\" ' => putChar('\" ' )
1294+ case '\' ' => putChar('\' ' )
1295+ case '\\ ' => putChar('\\ ' )
1296+ case 'u' |
1297+ 'U' => bump = uEscape()
1298+ case x if '0' <= x && x <= '7' => bump = octalEscape()
1299+ case _ => invalidEscape() // bump stays true, so the unrecognised escape char is stepped over below
1300+ if bump then nextChar()
1301+ end charEscape
1302+
// Reads a unicode escape: any run of 'u'/'U' followed by exactly four hex digits, most significant
// first, and puts the resulting char.
// If a digit is not hexadecimal, an error is reported at charOffset - 1 and the offending char is put
// but not stepped over.
// Returns false on every path, telling charEscape not to call nextChar() again.
1303+ private def uEscape (): Boolean =
1304+ while ch == 'u' || ch == 'U' do nextChar()
1305+ var i = 0
1306+ var cp = 0
1307+ while i < 4 do
1308+ val digit = digit2int(ch, 16 )
1309+ if digit < 0 then
1310+ error(" invalid character in unicode escape sequence" , charOffset - 1 )
1311+ putChar(ch)
1312+ return false
1313+ val shift = (3 - i) * 4 // 12, 8, 4, 0: the first digit lands in the top nibble
1314+ cp += digit << shift
1315+ nextChar()
1316+ i += 1
1317+ end while
1318+ putChar(cp.asInstanceOf [Char ])
1319+ false
1320+ end uEscape
1321+
// Reads an octal escape of one to three digits; a third digit is taken only when the leading digit
// is at most '3', which keeps the value at or below octal 377.
// An error is always reported, because octal escapes are unsupported; the message suggests an
// alternative spelling. The decoded char is still put.
// Returns false, telling charEscape not to call nextChar() again, since every digit was already
// stepped over here.
1322+ private def octalEscape (): Boolean =
1323+ val start = charOffset - 2 // presumably the offset of the backslash, used as the error position -- confirm
1324+ val leadch : Char = ch
1325+ var oct : Int = digit2int(ch, 8 )
1326+ nextChar()
1327+ if '0' <= ch && ch <= '7' then
1328+ oct = oct * 8 + digit2int(ch, 8 )
1329+ nextChar()
1330+ if leadch <= '3' && '0' <= ch && ch <= '7' then
1331+ oct = oct * 8 + digit2int(ch, 8 )
1332+ nextChar()
1333+ // val alt = if (oct == LF) "\\n" else f"\\u$oct%04x"
1334+ val alt = if oct == LF then raw " \n " else f " \u $oct%04x "
1335+ error(s " octal escape literals are unsupported: use $alt instead " , start)
1336+ putChar(oct.toChar)
1337+ false
1338+ end octalEscape
1339+
// Reports an invalid escape character at charOffset - 1 and still puts the current char.
// It does not call nextChar(); charEscape does that after this returns.
1339+ protected def invalidEscape (): Unit =
12821341 error(" invalid escape character" , charOffset - 1 )
12831342 putChar(ch)
1284- }
12851343
12861344 private def getLitChars (delimiter : Char ) =
12871345 while (ch != delimiter && ! isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape))
@@ -1364,25 +1422,22 @@ object Scanners {
13641422 setStrVal()
13651423 }
13661424
1367- private def finishCharLit (): Unit = {
1425+ private def finishCharLit (): Unit =
13681426 nextChar()
13691427 token = CHARLIT
13701428 setStrVal()
1371- }
13721429
13731430 /** Parse character literal if current character is followed by \',
13741431 * or follow with given op and return a symbol literal token
13751432 */
1376- def charLitOr (op : => Token ): Unit = {
1433+ def charLitOr (op : => Token ): Unit =
13771434 putChar(ch)
13781435 nextChar()
1379- if ( ch == '\' ' ) finishCharLit()
1380- else {
1436+ if ch == '\' ' then finishCharLit()
1437+ else
13811438 token = op
13821439 strVal = if (name != null ) name.toString else null
13831440 litBuf.clear()
1384- }
1385- }
13861441
13871442 override def toString : String =
13881443 showTokenDetailed(token) + {
0 commit comments