22//
33// This source file is part of the Swift.org open source project
44//
5- // Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
5+ // Copyright (c) 2014 - 2023 Apple Inc. and the Swift project authors
66// Licensed under Apache License v2.0 with Runtime Library Exception
77//
88// See https://swift.org/LICENSE.txt for license information
@@ -436,6 +436,117 @@ internal struct _GraphemeBreakingState {
436436 var shouldBreakRI = false
437437}
438438
extension Unicode {
  /// An incremental recognizer for character (i.e., extended grapheme cluster)
  /// boundaries in an arbitrary stream of Unicode scalars.
  ///
  /// Feed scalars one at a time to `hasBreak(before:)`; each call reports
  /// whether a grapheme break precedes the scalar that was just fed.
  ///
  /// The boundaries reported by this state machine are guaranteed to agree
  /// with the way `String` divides its contents into `Character` values.
  @available(SwiftStdlib 5.8, *)
  public // SPI(Foundation) FIXME: We need API for this
  struct _CharacterRecognizer {
    /// The scalar most recently fed to the recognizer (NUL before any input).
    internal var _previous: Unicode.Scalar

    /// Breaking context accumulated since the last reported boundary.
    internal var _state: _GraphemeBreakingState

    /// A cheap, context-free boundary test for a pair of adjacent scalars.
    ///
    /// The answer is non-nil only when it can be decided without knowing
    /// anything about the scalars that precede `scalar1`. Callers may use
    /// this as a fast (but incomplete) check before starting a full state
    /// machine session.
    ///
    /// - Returns: `false` for a CR-LF pair, `true` when the pairwise check
    ///   reports a break, and `nil` when the pair alone is inconclusive.
    @_effects(releasenone)
    public static func quickBreak(
      between scalar1: Unicode.Scalar,
      and scalar2: Unicode.Scalar
    ) -> Bool? {
      // GB3: a carriage return followed by a line feed is never split.
      guard scalar1.value != 0xD || scalar2.value != 0xA else {
        return false
      }
      // A positive pairwise answer is definitive; a negative one may still be
      // overturned by earlier context, so it is reported as "unknown".
      return _hasGraphemeBreakBetween(scalar1, scalar2) ? true : nil
    }

    /// Creates a recognizer positioned at the _start of text_ (sot).
    ///
    /// The new state machine reports a grapheme break before the first
    /// scalar it is given.
    public init() {
      // NUL stands in for "nothing seen yet" so that the empty case needs no
      // special handling. NUL is a control character, so rule GB5 forces an
      // unconditional break before the first real scalar, emulating GB1.
      _previous = Unicode.Scalar(0 as UInt8)
      _state = _GraphemeBreakingState()
    }

    /// Consumes the next scalar and reports whether it begins a new extended
    /// grapheme cluster.
    ///
    /// The first call on a freshly initialized recognizer always reports a
    /// break.
    ///
    /// No information is carried across character boundaries: whenever this
    /// method returns true, `self` afterwards is equivalent to a newly
    /// initialized recognizer that has been fed the same scalar.
    @_effects(releasenone)
    public mutating func hasBreak(
      before next: Unicode.Scalar
    ) -> Bool {
      let startsNewCharacter = _state.shouldBreak(between: _previous, and: next)
      _previous = next
      if startsNewCharacter {
        // Context belonging to the finished cluster must not leak into the
        // next one.
        _state = _GraphemeBreakingState()
      }
      return startsNewCharacter
    }

    /// Decodes scalars from the given UTF-8 buffer and feeds them to the
    /// recognizer, stopping right after the scalar that follows the first
    /// grapheme break.
    ///
    /// When this function returns, the recognizer reflects every scalar up to
    /// and including the returned one, so further breaks can be found by
    /// feeding it the data that follows.
    ///
    /// - Parameter buffer: A buffer containing valid UTF-8 data, starting and
    ///   ending on Unicode scalar boundaries.
    ///
    /// - Parameter start: A valid index into `buffer`, addressing the first
    ///   code unit of a UTF-8 scalar in the buffer, or the end.
    ///
    /// - Returns: The index range of the scalar that follows the first
    ///   grapheme break in the buffer, or `nil` if the buffer contains no
    ///   grapheme breaks.
    ///
    /// - Warning: This function does not validate that the buffer contains
    ///   valid UTF-8 data; its behavior is undefined if given invalid input.
    @_effects(releasenone)
    public mutating func _firstBreak(
      inUncheckedUnsafeUTF8Buffer buffer: UnsafeBufferPointer<UInt8>,
      startingAt start: Int = 0
    ) -> Range<Int>? {
      var offset = start
      while offset < buffer.endIndex {
        let (scalar, width) = _decodeScalar(buffer, startingAt: offset)
        let scalarEnd = offset &+ width
        if hasBreak(before: scalar) {
          return Range(_uncheckedBounds: (offset, scalarEnd))
        }
        offset = scalarEnd
      }
      return nil
    }
  }
}
549+
439550extension _StringGuts {
440551 // Returns the stride of the grapheme cluster starting at offset `index`,
441552 // assuming it is on a grapheme cluster boundary.
@@ -459,7 +570,7 @@ extension _StringGuts {
459570
460571 while true {
461572 guard let ( scalar2, nextIndex) = nextScalar ( index) else { break }
462- if shouldBreak ( between: scalar, and: scalar2, at : index , with : & state ) {
573+ if state . shouldBreak ( between: scalar, and: scalar2) {
463574 break
464575 }
465576 index = nextIndex
@@ -505,7 +616,7 @@ extension _StringGuts {
505616 }
506617}
507618
508- extension _StringGuts {
619+ extension _GraphemeBreakingState {
509620 // Return true if there is an extended grapheme cluster boundary between two
510621 // scalars, based on state information previously collected about preceding
511622 // scalars.
@@ -517,11 +628,9 @@ extension _StringGuts {
517628 //
518629 // This is based on the Unicode Annex #29 for [Grapheme Cluster Boundary
519630 // Rules](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules).
520- internal func shouldBreak(
631+ internal mutating func shouldBreak(
521632 between scalar1: Unicode . Scalar ,
522- and scalar2: Unicode . Scalar ,
523- at index: Int ,
524- with state: inout _GraphemeBreakingState
633+ and scalar2: Unicode . Scalar
525634 ) -> Bool {
526635 // GB3
527636 if scalar1. value == 0xD , scalar2. value == 0xA {
@@ -545,8 +654,8 @@ extension _StringGuts {
545654 var enterIndicSequence = false
546655
547656 defer {
548- state . isInEmojiSequence = enterEmojiSequence
549- state . isInIndicSequence = enterIndicSequence
657+ self . isInEmojiSequence = enterEmojiSequence
658+ self . isInIndicSequence = enterIndicSequence
550659 }
551660
552661 switch ( x, y) {
@@ -591,14 +700,14 @@ extension _StringGuts {
591700 // continue the grapheme cluster by combining more scalars later. If we're
592701 // not currently in an emoji sequence, but our lhs scalar is a pictograph,
593702 // then that's a signal that it's the start of an emoji sequence.
594- if state . isInEmojiSequence || x == . extendedPictographic {
703+ if self . isInEmojiSequence || x == . extendedPictographic {
595704 enterEmojiSequence = true
596705 }
597706
598707 // If we're currently in an indic sequence (or if our lhs is a linking
599708 // consonant), then this check and everything underneath ensures that
600709 // we continue being in one and may check if this extend is a Virama.
601- if state . isInIndicSequence || scalar1. _isLinkingConsonant {
710+ if self . isInIndicSequence || scalar1. _isLinkingConsonant {
602711 if y == . extend {
603712 let extendNormData = Unicode . _NormData ( scalar2, fastUpperbound: 0x300 )
604713
@@ -611,7 +720,7 @@ extension _StringGuts {
611720 enterIndicSequence = true
612721
613722 if scalar2. _isVirama {
614- state . hasSeenVirama = true
723+ self . hasSeenVirama = true
615724 }
616725 }
617726
@@ -627,32 +736,34 @@ extension _StringGuts {
627736
628737 // GB11
629738 case ( . zwj, . extendedPictographic) :
630- return !state . isInEmojiSequence
739+ return !self . isInEmojiSequence
631740
632741 // GB12 & GB13
633742 case ( . regionalIndicator, . regionalIndicator) :
634743 defer {
635- state . shouldBreakRI. toggle ( )
744+ self . shouldBreakRI. toggle ( )
636745 }
637746
638- return state . shouldBreakRI
747+ return self . shouldBreakRI
639748
640749 // GB999
641750 default :
642751 // GB9c
643752 if
644- state . isInIndicSequence,
645- state . hasSeenVirama,
753+ self . isInIndicSequence,
754+ self . hasSeenVirama,
646755 scalar2. _isLinkingConsonant
647756 {
648- state . hasSeenVirama = false
757+ self . hasSeenVirama = false
649758 return false
650759 }
651760
652761 return true
653762 }
654763 }
764+ }
655765
766+ extension _StringGuts {
656767 // Return true if there is an extended grapheme cluster boundary between two
657768 // scalars, with no previous knowledge about preceding scalars.
658769 //
0 commit comments