@@ -65,10 +65,14 @@ fileprivate extension Compiler.ByteCodeGen {
6565 emitDot ( )
6666
6767 case let . char( c) :
68- try emitCharacter ( c)
68+ emitCharacter ( c)
6969
7070 case let . scalar( s) :
71- try emitScalar ( s)
71+ if options. semanticLevel == . graphemeCluster {
72+ emitCharacter ( Character ( s) )
73+ } else {
74+ emitMatchScalar ( s)
75+ }
7276
7377 case let . assertion( kind) :
7478 try emitAssertion ( kind)
@@ -94,6 +98,34 @@ fileprivate extension Compiler.ByteCodeGen {
9498 }
9599 }
96100
/// Emits the match instructions for a quoted literal `s`.
///
/// - In any semantic level other than `.graphemeCluster`, every Unicode
///   scalar of every character in `s` is emitted through `emitMatchScalar`.
/// - In `.graphemeCluster` mode with optimizations enabled and an all-ASCII,
///   non-empty literal, one scalar-match instruction is emitted per scalar,
///   and `boundaryCheck` is `true` only for the final scalar of the literal.
/// - Otherwise each `Character` of `s` is emitted through `emitCharacter`.
///
/// An empty literal emits no instructions on any path.
///
/// - Parameter s: The literal text to match.
mutating func emitQuotedLiteral(_ s: String) {
  guard options.semanticLevel == .graphemeCluster else {
    for char in s {
      for scalar in char.unicodeScalars {
        emitMatchScalar(scalar)
      }
    }
    return
  }

  // Fast path for eliding boundary checks for an all ascii quoted literal.
  //
  // `allSatisfy` is vacuously true for an empty string, so the last index is
  // bound optionally instead of force-unwrapped: an empty literal skips the
  // fast path and falls through to the (no-op) per-character loop below
  // rather than trapping.
  if optimizationsEnabled && s.allSatisfy(\.isASCII),
     let lastIdx = s.unicodeScalars.indices.last {
    for idx in s.unicodeScalars.indices {
      // Only the final scalar of the literal carries the boundary check.
      let boundaryCheck = idx == lastIdx
      let scalar = s.unicodeScalars[idx]
      if options.isCaseInsensitive && scalar.properties.isCased {
        builder.buildMatchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck)
      } else {
        builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck)
      }
    }
    return
  }

  for c in s { emitCharacter(c) }
}
128+
97129 mutating func emitBackreference(
98130 _ ref: AST . Reference
99131 ) throws {
@@ -257,41 +289,47 @@ fileprivate extension Compiler.ByteCodeGen {
257289 }
258290 }
259291
260- mutating func emitScalar( _ s: UnicodeScalar ) throws {
261- // TODO: Native instruction buildMatchScalar(s)
262- if options. isCaseInsensitive {
263- // TODO: e.g. buildCaseInsensitiveMatchScalar(s)
264- builder. buildConsume ( by: consumeScalar {
265- $0. properties. lowercaseMapping == s. properties. lowercaseMapping
266- } )
292+ mutating func emitMatchScalar( _ s: UnicodeScalar ) {
293+ assert ( options. semanticLevel == . unicodeScalar)
294+ if options. isCaseInsensitive && s. properties. isCased {
295+ builder. buildMatchScalarCaseInsensitive ( s, boundaryCheck: false )
267296 } else {
268- builder. buildConsume ( by: consumeScalar {
269- $0 == s
270- } )
297+ builder. buildMatchScalar ( s, boundaryCheck: false )
271298 }
272299 }
273300
274- mutating func emitCharacter( _ c: Character ) throws {
275- // Unicode scalar matches the specific scalars that comprise a character
301+ mutating func emitCharacter( _ c: Character ) {
302+ // Unicode scalar mode matches the specific scalars that comprise a character
276303 if options. semanticLevel == . unicodeScalar {
277304 for scalar in c. unicodeScalars {
278- try emitScalar ( scalar)
305+ emitMatchScalar ( scalar)
279306 }
280307 return
281308 }
282309
283310 if options. isCaseInsensitive && c. isCased {
284- // TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true)
285- builder. buildConsume { input, bounds in
286- let inputChar = input [ bounds. lowerBound] . lowercased ( )
287- let matchChar = c. lowercased ( )
288- return inputChar == matchChar
289- ? input. index ( after: bounds. lowerBound)
290- : nil
311+ if optimizationsEnabled && c. isASCII {
312+ // c.isCased ensures that c is not CR-LF,
313+ // so we know that c is a single scalar
314+ assert ( c. unicodeScalars. count == 1 )
315+ builder. buildMatchScalarCaseInsensitive (
316+ c. unicodeScalars. last!,
317+ boundaryCheck: true )
318+ } else {
319+ builder. buildMatch ( c, isCaseInsensitive: true )
291320 }
292- } else {
293- builder. buildMatch ( c)
321+ return
294322 }
323+
324+ if optimizationsEnabled && c. isASCII {
325+ let lastIdx = c. unicodeScalars. indices. last!
326+ for idx in c. unicodeScalars. indices {
327+ builder. buildMatchScalar ( c. unicodeScalars [ idx] , boundaryCheck: idx == lastIdx)
328+ }
329+ return
330+ }
331+
332+ builder. buildMatch ( c, isCaseInsensitive: false )
295333 }
296334
297335 mutating func emitAny( ) {
@@ -741,11 +779,12 @@ fileprivate extension Compiler.ByteCodeGen {
741779 _ ccc: DSLTree . CustomCharacterClass
742780 ) throws {
743781 if let asciiBitset = ccc. asAsciiBitset ( options) ,
744- options. semanticLevel == . graphemeCluster,
745782 optimizationsEnabled {
746- // future work: add a bit to .matchBitset to consume either a character
747- // or a scalar so we can have this optimization in scalar mode
748- builder. buildMatchAsciiBitset ( asciiBitset)
783+ if options. semanticLevel == . unicodeScalar {
784+ builder. buildScalarMatchAsciiBitset ( asciiBitset)
785+ } else {
786+ builder. buildMatchAsciiBitset ( asciiBitset)
787+ }
749788 } else {
750789 let consumer = try ccc. generateConsumer ( options)
751790 builder. buildConsume ( by: consumer)
@@ -822,45 +861,7 @@ fileprivate extension Compiler.ByteCodeGen {
822861 try emitAtom ( a)
823862
824863 case let . quotedLiteral( s) :
825- if options. semanticLevel == . graphemeCluster {
826- if options. isCaseInsensitive {
827- // TODO: buildCaseInsensitiveMatchSequence(c) or alternative
828- builder. buildConsume { input, bounds in
829- var iterator = s. makeIterator ( )
830- var currentIndex = bounds. lowerBound
831- while let ch = iterator. next ( ) {
832- guard currentIndex < bounds. upperBound,
833- ch. lowercased ( ) == input [ currentIndex] . lowercased ( )
834- else { return nil }
835- input. formIndex ( after: & currentIndex)
836- }
837- return currentIndex
838- }
839- } else {
840- builder. buildMatchSequence ( s)
841- }
842- } else {
843- builder. buildConsume {
844- [ caseInsensitive = options. isCaseInsensitive] input, bounds in
845- // TODO: Case folding
846- var iterator = s. unicodeScalars. makeIterator ( )
847- var currentIndex = bounds. lowerBound
848- while let scalar = iterator. next ( ) {
849- guard currentIndex < bounds. upperBound else { return nil }
850- if caseInsensitive {
851- if scalar. properties. lowercaseMapping != input. unicodeScalars [ currentIndex] . properties. lowercaseMapping {
852- return nil
853- }
854- } else {
855- if scalar != input. unicodeScalars [ currentIndex] {
856- return nil
857- }
858- }
859- input. unicodeScalars. formIndex ( after: & currentIndex)
860- }
861- return currentIndex
862- }
863- }
864+ emitQuotedLiteral ( s)
864865
865866 case let . convertedRegexLiteral( n, _) :
866867 return try emitNode ( n)
0 commit comments