@@ -696,6 +696,45 @@ object Scanners {
696696 recur(lastOffset, false )
697697 }
698698
699+ import Character .{isHighSurrogate , isLowSurrogate , isUnicodeIdentifierPart , isUnicodeIdentifierStart , isValidCodePoint , toCodePoint }
700+
701+ // f"\\u$c%04x" or f"${"\\"}u$c%04x"
702+ private def toUnicode (c : Char ): String = { val s = c.toInt.toHexString; " \\ u" + " 0" * (4 - s.length) + s }
703+
704+ // given char (ch) is high surrogate followed by low, codepoint passes predicate.
705+ // true means supplementary chars were put to buffer.
706+ // strict to require low surrogate (if not in string literal).
707+ private def isSupplementary (high : Char , test : Int => Boolean , strict : Boolean = true ): Boolean =
708+ isHighSurrogate(high) && {
709+ var res = false
710+ nextChar()
711+ val low = ch
712+ if isLowSurrogate(low) then
713+ nextChar()
714+ val codepoint = toCodePoint(high, low)
715+ if isValidCodePoint(codepoint) && test(codepoint) then
716+ putChar(high)
717+ putChar(low)
718+ res = true
719+ else
720+ error(s " illegal character ' ${toUnicode(high)}${toUnicode(low)}' " )
721+ else if ! strict then
722+ putChar(high)
723+ res = true
724+ else
725+ error(s " illegal character ' ${toUnicode(high)}' missing low surrogate " )
726+ res
727+ }
728+ private def atSupplementary (ch : Char , f : Int => Boolean ): Boolean =
729+ isHighSurrogate(ch) && {
730+ val hi = ch
731+ val lo = lookaheadChar()
732+ isLowSurrogate(lo) && {
733+ val codepoint = toCodePoint(hi, lo)
734+ isValidCodePoint(codepoint) && f(codepoint)
735+ }
736+ }
737+
699738 /** read next token, filling TokenData fields of Scanner.
700739 */
701740 protected final def fetchToken (): Unit = {
@@ -822,11 +861,12 @@ object Scanners {
822861 else ch match {
823862 case '{' | '[' | ' ' | '\t ' if lookaheadChar() != '\' ' =>
824863 token = QUOTE
825- case _ if ! isAtEnd && ( ch != SU && ch != CR && ch != LF || isUnicodeEscape) =>
864+ case _ if ! isAtEnd && ch != SU && ch != CR && ch != LF =>
826865 val isEmptyCharLit = (ch == '\' ' )
827866 getLitChar()
828867 if ch == '\' ' then
829868 if isEmptyCharLit then error(" empty character literal (use '\\ '' for single quote)" )
869+ else if litBuf.length != 1 then error(" illegal codepoint in Char constant: " + litBuf.toString.map(toUnicode).mkString(" '" , " " , " '" ))
830870 else finishCharLit()
831871 else if isEmptyCharLit then error(" empty character literal" )
832872 else error(" unclosed character literal" )
@@ -869,9 +909,11 @@ object Scanners {
869909 def fetchOther () =
870910 if (ch == '\u21D2 ' ) {
871911 nextChar(); token = ARROW
912+ report.deprecationWarning(" The unicode arrow `⇒` is deprecated, use `=>` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code." , sourcePos(offset))
872913 }
873914 else if (ch == '\u2190 ' ) {
874915 nextChar(); token = LARROW
916+ report.deprecationWarning(" The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code." , sourcePos(offset))
875917 }
876918 else if (Character .isUnicodeIdentifierStart(ch)) {
877919 putChar(ch)
@@ -883,9 +925,10 @@ object Scanners {
883925 nextChar()
884926 getOperatorRest()
885927 }
928+ else if isSupplementary(ch, isUnicodeIdentifierStart) then
929+ getIdentRest()
886930 else {
887- // FIXME: Dotty deviation: f"" interpolator is not supported (#1814)
888- error(" illegal character '\\ u%04x'" .format(ch : Int ))
931+ error(s " illegal character ' ${toUnicode(ch)}' " )
889932 nextChar()
890933 }
891934 fetchOther()
@@ -1024,11 +1067,12 @@ object Scanners {
10241067 case SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true!
10251068 finishNamed()
10261069 case _ =>
1027- if ( Character . isUnicodeIdentifierPart(ch)) {
1070+ if isUnicodeIdentifierPart(ch) then
10281071 putChar(ch)
10291072 nextChar()
10301073 getIdentRest()
1031- }
1074+ else if isSupplementary(ch, isUnicodeIdentifierPart) then
1075+ getIdentRest()
10321076 else
10331077 finishNamed()
10341078 }
@@ -1111,7 +1155,7 @@ object Scanners {
11111155 }
11121156
11131157 // for interpolated strings
1114- @ annotation. tailrec private def getStringPart (multiLine : Boolean ): Unit =
1158+ @ tailrec private def getStringPart (multiLine : Boolean ): Unit =
11151159 if (ch == '"' )
11161160 if (multiLine) {
11171161 nextRawChar()
@@ -1136,6 +1180,28 @@ object Scanners {
11361180 getStringPart(multiLine)
11371181 }
11381182 else if (ch == '$' ) {
1183+ def getInterpolatedIdentRest (hasSupplement : Boolean ): Unit =
1184+ @ tailrec def loopRest (): Unit =
1185+ if ch != SU && isUnicodeIdentifierPart(ch) then
1186+ putChar(ch) ; nextRawChar()
1187+ loopRest()
1188+ else if atSupplementary(ch, isUnicodeIdentifierPart) then
1189+ putChar(ch) ; nextRawChar()
1190+ putChar(ch) ; nextRawChar()
1191+ loopRest()
1192+ else
1193+ finishNamedToken(IDENTIFIER , target = next)
1194+ end loopRest
1195+ setStrVal()
1196+ token = STRINGPART
1197+ next.lastOffset = charOffset - 1
1198+ next.offset = charOffset - 1
1199+ putChar(ch) ; nextRawChar()
1200+ if hasSupplement then
1201+ putChar(ch) ; nextRawChar()
1202+ loopRest()
1203+ end getInterpolatedIdentRest
1204+
11391205 nextRawChar()
11401206 if (ch == '$' || ch == '"' ) {
11411207 putChar(ch)
@@ -1146,18 +1212,10 @@ object Scanners {
11461212 setStrVal()
11471213 token = STRINGPART
11481214 }
1149- else if (Character .isUnicodeIdentifierStart(ch) || ch == '_' ) {
1150- setStrVal()
1151- token = STRINGPART
1152- next.lastOffset = charOffset - 1
1153- next.offset = charOffset - 1
1154- while
1155- putChar(ch)
1156- nextRawChar()
1157- ch != SU && Character .isUnicodeIdentifierPart(ch)
1158- do ()
1159- finishNamedToken(IDENTIFIER , target = next)
1160- }
1215+ else if isUnicodeIdentifierStart(ch) || ch == '_' then
1216+ getInterpolatedIdentRest(hasSupplement = false )
1217+ else if atSupplementary(ch, isUnicodeIdentifierStart) then
1218+ getInterpolatedIdentRest(hasSupplement = true )
11611219 else
11621220 error(" invalid string interpolation: `$$`, `$\" `, `$`ident or `$`BlockExpr expected" , off = charOffset - 2 )
11631221 putChar('$' )
@@ -1205,76 +1263,73 @@ object Scanners {
12051263 false
12061264 }
12071265
1208- /** copy current character into litBuf, interpreting any escape sequences,
1209- * and advance to next character.
1266+ /** Copy current character into litBuf, interpreting any escape sequences,
1267+ * and advance to next character. Surrogate pairs are consumed (see check
1268+ * at fetchSingleQuote), but orphan surrogate is allowed.
12101269 */
12111270 protected def getLitChar (): Unit =
1212- def invalidUnicodeEscape () = {
1213- error(" invalid character in unicode escape sequence" , charOffset - 1 )
1214- putChar(ch)
1215- }
1216- def putUnicode (): Unit = {
1217- while ch == 'u' || ch == 'U' do nextChar()
1218- var i = 0
1219- var cp = 0
1220- while (i < 4 ) {
1221- val shift = (3 - i) * 4
1222- val d = digit2int(ch, 16 )
1223- if (d < 0 ) {
1224- return invalidUnicodeEscape()
1225- }
1226- cp += (d << shift)
1227- nextChar()
1228- i += 1
1229- }
1230- putChar(cp.asInstanceOf [Char ])
1231- }
1232- if (ch == '\\ ' ) {
1271+ if ch == '\\ ' then
12331272 nextChar()
1234- if ('0' <= ch && ch <= '7' ) {
1235- val start = charOffset - 2
1236- val leadch : Char = ch
1237- var oct : Int = digit2int(ch, 8 )
1238- nextChar()
1239- if ('0' <= ch && ch <= '7' ) {
1240- oct = oct * 8 + digit2int(ch, 8 )
1241- nextChar()
1242- if (leadch <= '3' && '0' <= ch && ch <= '7' ) {
1243- oct = oct * 8 + digit2int(ch, 8 )
1244- nextChar()
1245- }
1246- }
1247- val alt = if oct == LF then raw " \n " else f " ${" \\ " }u $oct%04x "
1248- error(s " octal escape literals are unsupported: use $alt instead " , start)
1249- putChar(oct.toChar)
1250- }
1251- else if (ch == 'u' || ch == 'U' ) {
1252- putUnicode()
1253- }
1254- else {
1255- ch match {
1256- case 'b' => putChar('\b ' )
1257- case 't' => putChar('\t ' )
1258- case 'n' => putChar('\n ' )
1259- case 'f' => putChar('\f ' )
1260- case 'r' => putChar('\r ' )
1261- case '\" ' => putChar('\" ' )
1262- case '\' ' => putChar('\' ' )
1263- case '\\ ' => putChar('\\ ' )
1264- case _ => invalidEscape()
1265- }
1266- nextChar()
1267- }
1268- }
1269- else {
1273+ charEscape()
1274+ else if ! isSupplementary(ch, _ => true , strict = false ) then
12701275 putChar(ch)
12711276 nextChar()
1272- }
12731277
1274- protected def invalidEscape (): Unit = {
1278+ private def charEscape (): Unit =
1279+ var bump = true
1280+ ch match
1281+ case 'b' => putChar('\b ' )
1282+ case 't' => putChar('\t ' )
1283+ case 'n' => putChar('\n ' )
1284+ case 'f' => putChar('\f ' )
1285+ case 'r' => putChar('\r ' )
1286+ case '\" ' => putChar('\" ' )
1287+ case '\' ' => putChar('\' ' )
1288+ case '\\ ' => putChar('\\ ' )
1289+ case 'u' |
1290+ 'U' => uEscape(); bump = false
1291+ case x if '0' <= x && x <= '7' => octalEscape(); bump = false
1292+ case _ => invalidEscape()
1293+ if bump then nextChar()
1294+ end charEscape
1295+
1296+ private def uEscape (): Unit =
1297+ while ch == 'u' || ch == 'U' do nextChar()
1298+ var i = 0
1299+ var cp = 0
1300+ while i < 4 do
1301+ val digit = digit2int(ch, 16 )
1302+ if digit < 0 then
1303+ error(" invalid character in unicode escape sequence" , charOffset - 1 )
1304+ putChar(ch)
1305+ return
1306+ val shift = (3 - i) * 4
1307+ cp += digit << shift
1308+ nextChar()
1309+ i += 1
1310+ end while
1311+ putChar(cp.asInstanceOf [Char ])
1312+ end uEscape
1313+
1314+ private def octalEscape (): Unit =
1315+ val start = charOffset - 2
1316+ val leadch : Char = ch
1317+ var oct : Int = digit2int(ch, 8 )
1318+ nextChar()
1319+ if '0' <= ch && ch <= '7' then
1320+ oct = oct * 8 + digit2int(ch, 8 )
1321+ nextChar()
1322+ if leadch <= '3' && '0' <= ch && ch <= '7' then
1323+ oct = oct * 8 + digit2int(ch, 8 )
1324+ nextChar()
1325+ val alt = if oct == LF then raw " \n " else toUnicode(oct.toChar)
1326+ error(s " octal escape literals are unsupported: use $alt instead " , start)
1327+ putChar(oct.toChar)
1328+ end octalEscape
1329+
1330+ protected def invalidEscape (): Unit =
12751331 error(" invalid escape character" , charOffset - 1 )
12761332 putChar(ch)
1277- }
12781333
12791334 private def getLitChars (delimiter : Char ) =
12801335 while (ch != delimiter && ! isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape))
0 commit comments