@@ -15,9 +15,10 @@ extension Processor {
1515 isStrictASCII: Bool ,
1616 isScalarSemantics: Bool
1717 ) -> Bool {
18- guard let next = input. matchBuiltinCC (
18+ guard currentPosition < end , let next = input. matchBuiltinCC (
1919 cc,
2020 at: currentPosition,
21+ limitedBy: end,
2122 isInverted: isInverted,
2223 isStrictASCII: isStrictASCII,
2324 isScalarSemantics: isScalarSemantics
@@ -102,56 +103,96 @@ extension Processor {
102103
103104 case . wordBoundary:
104105 if payload. usesSimpleUnicodeBoundaries {
105- // TODO: How should we handle bounds?
106106 return atSimpleBoundary ( payload. usesASCIIWord, payload. semanticLevel)
107107 } else {
108- return input. isOnWordBoundary ( at: currentPosition, using: & wordIndexCache, & wordIndexMaxIndex)
108+ return input. isOnWordBoundary ( at: currentPosition, in : searchBounds , using: & wordIndexCache, & wordIndexMaxIndex)
109109 }
110110
111111 case . notWordBoundary:
112112 if payload. usesSimpleUnicodeBoundaries {
113- // TODO: How should we handle bounds?
114113 return !atSimpleBoundary( payload. usesASCIIWord, payload. semanticLevel)
115114 } else {
116- return !input. isOnWordBoundary ( at: currentPosition, using: & wordIndexCache, & wordIndexMaxIndex)
115+ return !input. isOnWordBoundary ( at: currentPosition, in : searchBounds , using: & wordIndexCache, & wordIndexMaxIndex)
117116 }
118117 }
119118 }
120119}
121120
122121// MARK: Matching `.`
123122extension String {
124- // TODO: Should the below have a `limitedBy` parameter?
123+ /// Returns the character at `pos`, bounded by `end`, as well as the upper
124+ /// boundary of the returned character.
125+ ///
126+ /// This function handles loading a character from a string while respecting
127+ /// an end boundary, even if that end boundary is sub-character or sub-scalar.
128+ ///
129+ /// - If `pos` is at or past `end`, this function returns `nil`.
130+ /// - If `end` is between `pos` and the next grapheme cluster boundary (i.e.,
131+ /// `end` is before `self.index(after: pos)`, then the returned character
132+ /// is smaller than the one that would be produced by `self[pos]` and the
133+ /// returned index is at the end of that character.
134+ /// - If `end` is between `pos` and the next grapheme cluster boundary, and
135+ /// is not on a Unicode scalar boundary, the partial scalar is dropped. This
136+ /// can result in a `nil` return or a character that includes only part of
137+ /// the `self[pos]` character.
138+ ///
139+ /// - Parameters:
140+ /// - pos: The position to load a character from.
141+ /// - end: The limit for the character at `pos`.
142+ /// - Returns: The character at `pos`, bounded by `end`, if it exists, along
143+ /// with the upper bound of that character. The upper bound is always
144+ /// scalar-aligned.
145+ func characterAndEnd( at pos: String . Index , limitedBy end: String . Index ) -> ( Character , String . Index ) ? {
146+ // FIXME: Sink into the stdlib to avoid multiple boundary calculations
147+ guard pos < end else { return nil }
148+ let next = index ( after: pos)
149+ if next <= end {
150+ return ( self [ pos] , next)
151+ }
125152
153+ // `end` must be a sub-character position that is between `pos` and the
154+ // next grapheme boundary. This is okay if `end` is on a Unicode scalar
155+ // boundary, but if it's in the middle of a scalar's code units, there
156+ // may not be a character to return at all after rounding down. Use
157+ // `Substring`'s rounding to determine what we can return.
158+ let substr = self [ pos..< end]
159+ return substr. isEmpty
160+ ? nil
161+ : ( substr. first!, substr. endIndex)
162+ }
163+
126164 func matchAnyNonNewline(
127165 at currentPosition: String . Index ,
166+ limitedBy end: String . Index ,
128167 isScalarSemantics: Bool
129168 ) -> String . Index ? {
130- guard currentPosition < endIndex else {
131- return nil
132- }
169+ guard currentPosition < end else { return nil }
133170 if case . definite( let result) = _quickMatchAnyNonNewline (
134171 at: currentPosition,
172+ limitedBy: end,
135173 isScalarSemantics: isScalarSemantics
136174 ) {
137175 assert ( result == _thoroughMatchAnyNonNewline (
138176 at: currentPosition,
177+ limitedBy: end,
139178 isScalarSemantics: isScalarSemantics) )
140179 return result
141180 }
142181 return _thoroughMatchAnyNonNewline (
143182 at: currentPosition,
183+ limitedBy: end,
144184 isScalarSemantics: isScalarSemantics)
145185 }
146186
147187 @inline ( __always)
148188 private func _quickMatchAnyNonNewline(
149189 at currentPosition: String . Index ,
190+ limitedBy end: String . Index ,
150191 isScalarSemantics: Bool
151192 ) -> QuickResult < String . Index ? > {
152- assert ( currentPosition < endIndex )
193+ assert ( currentPosition < end )
153194 guard let ( asciiValue, next, isCRLF) = _quickASCIICharacter (
154- at: currentPosition
195+ at: currentPosition, limitedBy : end
155196 ) else {
156197 return . unknown
157198 }
@@ -167,46 +208,47 @@ extension String {
167208 @inline ( never)
168209 private func _thoroughMatchAnyNonNewline(
169210 at currentPosition: String . Index ,
211+ limitedBy end: String . Index ,
170212 isScalarSemantics: Bool
171213 ) -> String . Index ? {
172- assert ( currentPosition < endIndex)
173214 if isScalarSemantics {
215+ guard currentPosition < end else { return nil }
174216 let scalar = unicodeScalars [ currentPosition]
175217 guard !scalar. isNewline else { return nil }
176218 return unicodeScalars. index ( after: currentPosition)
177219 }
178220
179- let char = self [ currentPosition]
180- guard !char. isNewline else { return nil }
181- return index ( after: currentPosition)
221+ guard let ( char, next) = characterAndEnd ( at: currentPosition, limitedBy: end) ,
222+ !char. isNewline
223+ else { return nil }
224+ return next
182225 }
183226}
184227
185228// MARK: - Built-in character class matching
186229extension String {
187- // TODO: Should the below have a `limitedBy` parameter?
188-
189230 // Mentioned in ProgrammersManual.md, update docs if redesigned
190231 func matchBuiltinCC(
191232 _ cc: _CharacterClassModel . Representation ,
192233 at currentPosition: String . Index ,
234+ limitedBy end: String . Index ,
193235 isInverted: Bool ,
194236 isStrictASCII: Bool ,
195237 isScalarSemantics: Bool
196238 ) -> String . Index ? {
197- guard currentPosition < endIndex else {
198- return nil
199- }
239+ guard currentPosition < end else { return nil }
200240 if case . definite( let result) = _quickMatchBuiltinCC (
201241 cc,
202242 at: currentPosition,
243+ limitedBy: end,
203244 isInverted: isInverted,
204245 isStrictASCII: isStrictASCII,
205246 isScalarSemantics: isScalarSemantics
206247 ) {
207248 assert ( result == _thoroughMatchBuiltinCC (
208249 cc,
209250 at: currentPosition,
251+ limitedBy: end,
210252 isInverted: isInverted,
211253 isStrictASCII: isStrictASCII,
212254 isScalarSemantics: isScalarSemantics) )
@@ -215,6 +257,7 @@ extension String {
215257 return _thoroughMatchBuiltinCC (
216258 cc,
217259 at: currentPosition,
260+ limitedBy: end,
218261 isInverted: isInverted,
219262 isStrictASCII: isStrictASCII,
220263 isScalarSemantics: isScalarSemantics)
@@ -225,13 +268,17 @@ extension String {
225268 private func _quickMatchBuiltinCC(
226269 _ cc: _CharacterClassModel . Representation ,
227270 at currentPosition: String . Index ,
271+ limitedBy end: String . Index ,
228272 isInverted: Bool ,
229273 isStrictASCII: Bool ,
230274 isScalarSemantics: Bool
231275 ) -> QuickResult < String . Index ? > {
232- assert ( currentPosition < endIndex )
276+ assert ( currentPosition < end )
233277 guard let ( next, result) = _quickMatch (
234- cc, at: currentPosition, isScalarSemantics: isScalarSemantics
278+ cc,
279+ at: currentPosition,
280+ limitedBy: end,
281+ isScalarSemantics: isScalarSemantics
235282 ) else {
236283 return . unknown
237284 }
@@ -243,27 +290,25 @@ extension String {
243290 private func _thoroughMatchBuiltinCC(
244291 _ cc: _CharacterClassModel . Representation ,
245292 at currentPosition: String . Index ,
293+ limitedBy end: String . Index ,
246294 isInverted: Bool ,
247295 isStrictASCII: Bool ,
248296 isScalarSemantics: Bool
249297 ) -> String . Index ? {
250- assert ( currentPosition < endIndex)
251- let char = self [ currentPosition]
298+ // TODO: Branch here on scalar semantics
299+ // Don't want to pay character cost if unnecessary
300+ guard var ( char, next) =
301+ characterAndEnd ( at: currentPosition, limitedBy: end)
302+ else { return nil }
252303 let scalar = unicodeScalars [ currentPosition]
253304
254305 let asciiCheck = !isStrictASCII
255306 || ( scalar. isASCII && isScalarSemantics)
256307 || char. isASCII
257308
258309 var matched : Bool
259- var next : String . Index
260- switch ( isScalarSemantics, cc) {
261- case ( _, . anyGrapheme) :
262- next = index ( after: currentPosition)
263- case ( true , _) :
310+ if isScalarSemantics && cc != . anyGrapheme {
264311 next = unicodeScalars. index ( after: currentPosition)
265- case ( false , _) :
266- next = index ( after: currentPosition)
267312 }
268313
269314 switch cc {
@@ -291,7 +336,7 @@ extension String {
291336 if isScalarSemantics {
292337 matched = scalar. isNewline && asciiCheck
293338 if matched && scalar == " \r "
294- && next != endIndex && unicodeScalars [ next] == " \n " {
339+ && next < end && unicodeScalars [ next] == " \n " {
295340 // Match a full CR-LF sequence even in scalar semantics
296341 unicodeScalars. formIndex ( after: & next)
297342 }
0 commit comments