From cb48c6d0d71964ddec3e8b18dcad1e1d23ff94a1 Mon Sep 17 00:00:00 2001
From: Stephen Canon
Date: Thu, 26 Jun 2025 14:13:16 -0400
Subject: [PATCH 1/2] Optimization pass over String and UTF8Span's allASCII helper

This ranges between parity (for very small strings) and 5x faster (for
32-63B strings) in benchmarking on an M1 MBP. For largish strings it
delivers a roughly 2x speedup; increasing the block size further nets a
small additional win in microbenchmarks, but I do not expect that to
translate to real-world usage, due to the code-size impact and the fact
that most strings are smallish.

There's some opportunity for further work here; in particular, if people
start building Swift for a baseline of AVX2 or AVX512, we should have
paths for that (and we should also implement them if/when we get better
multiversioning dispatch machinery in the language). Span adoption would
be interesting. It's likely we should have a dedicated "small core"
implementation that uses only aligned accesses. Still, this is a
significant improvement as-is, and we should land it.
---
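Notes: the shape of the algorithm, minus the stdlib-internal Builtins, can
be expressed with public API only. The sketch below mirrors the structure
of the patch (byte-by-byte under one word, word-by-word under one block,
block-by-block above that, with a possibly overlapping final load at each
level), but it is an illustration rather than the stdlib code: the name
isAllASCII is made up here, and SIMD16<UInt8>.max() stands in for the
umaxv/pmovmskb reductions.

  // Portable sketch of the patch's strategy (illustrative, not the stdlib
  // implementation): scan at the widest granularity the count allows, and
  // finish each level with a possibly overlapping final load.
  func isAllASCII(_ bytes: UnsafeRawBufferPointer) -> Bool {
    let n = bytes.count
    let wordSize = MemoryLayout<UInt>.size
    let blockSize = MemoryLayout<SIMD16<UInt8>>.size
    let mask = UInt(truncatingIfNeeded: 0x8080_8080_8080_8080 as UInt64)
    guard n >= wordSize else {
      // Too small for even one word; check each byte.
      return bytes.allSatisfy { $0 < 0x80 }
    }
    guard n >= blockSize else {
      // Word-by-word, re-checking up to wordSize-1 bytes in the final
      // overlapping word rather than running a sub-word cleanup loop.
      var i = 0
      while i < n - wordSize {
        guard bytes.loadUnaligned(fromByteOffset: i, as: UInt.self) & mask == 0 else {
          return false
        }
        i += wordSize
      }
      return bytes.loadUnaligned(fromByteOffset: n - wordSize, as: UInt.self) & mask == 0
    }
    // Block-by-block, with the same overlapping-tail trick; .max() stands
    // in for the vector reductions the patch uses.
    var i = 0
    while i < n - blockSize {
      guard bytes.loadUnaligned(fromByteOffset: i, as: SIMD16<UInt8>.self).max() < 0x80 else {
        return false
      }
      i += blockSize
    }
    return bytes.loadUnaligned(fromByteOffset: n - blockSize, as: SIMD16<UInt8>.self).max() < 0x80
  }

Called on a buffer of UTF-8 bytes, e.g.:

  let ascii = Array("hello".utf8).withUnsafeBytes(isAllASCII)     // true
  let notAscii = Array("héllo".utf8).withUnsafeBytes(isAllASCII)  // false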
 stdlib/public/core/StringCreate.swift | 156 ++++++++++++++++++--------
 1 file changed, 111 insertions(+), 45 deletions(-)

diff --git a/stdlib/public/core/StringCreate.swift b/stdlib/public/core/StringCreate.swift
index 2bdbfb484106c..9fec206404ca1 100644
--- a/stdlib/public/core/StringCreate.swift
+++ b/stdlib/public/core/StringCreate.swift
@@ -13,60 +13,126 @@
 //===----------------------------------------------------------------------===//
 
 internal func _allASCII(_ input: UnsafeBufferPointer<UInt8>) -> Bool {
-  if input.isEmpty { return true }
-
-  // NOTE: Avoiding for-in syntax to avoid bounds checks
-  //
-  // TODO(String performance): SIMD-ize
-  //
-  let count = input.count
-  var ptr = unsafe UnsafeRawPointer(input.baseAddress._unsafelyUnwrappedUnchecked)
-
-  let asciiMask64 = 0x8080_8080_8080_8080 as UInt64
-  let asciiMask32 = UInt32(truncatingIfNeeded: asciiMask64)
-  let asciiMask16 = UInt16(truncatingIfNeeded: asciiMask64)
-  let asciiMask8 = UInt8(truncatingIfNeeded: asciiMask64)
+  //--------------- Implementation building blocks ---------------------------//
+#if arch(arm64_32)
+  typealias Word = UInt64
+#else
+  typealias Word = UInt
+#endif
+  let mask = Word(truncatingIfNeeded: 0x80808080_80808080 as UInt64)
 
-  let end128 = unsafe ptr + count & ~(MemoryLayout<(UInt64, UInt64)>.stride &- 1)
-  let end64 = unsafe ptr + count & ~(MemoryLayout<UInt64>.stride &- 1)
-  let end32 = unsafe ptr + count & ~(MemoryLayout<UInt32>.stride &- 1)
-  let end16 = unsafe ptr + count & ~(MemoryLayout<UInt16>.stride &- 1)
-  let end = unsafe ptr + count
-
-
-  while unsafe ptr < end128 {
-    let pair = unsafe ptr.loadUnaligned(as: (UInt64, UInt64).self)
-    let result = (pair.0 | pair.1) & asciiMask64
-    guard result == 0 else { return false }
-    unsafe ptr = unsafe ptr + MemoryLayout<(UInt64, UInt64)>.stride
+#if arch(i386) || arch(x86_64)
+  // TODO: Should consider AVX2 / AVX512 / AVX10 path here
+  typealias Block = (SIMD16<Int8>, SIMD16<Int8>)
+  @_transparent func pmovmskb(_ vec: SIMD16<Int8>) -> UInt16 {
+    UInt16(Builtin.bitcast_Vec16xInt1_Int16(
+      Builtin.cmp_slt_Vec16xInt8(vec._storage._value, Builtin.zeroInitializer())
+    ))
+  }
+#elseif arch(arm64) || arch(arm64_32)
+  typealias Block = (SIMD16<UInt8>, SIMD16<UInt8>)
+  @_transparent func umaxv(_ vec: SIMD16<UInt8>) -> UInt8 {
+    UInt8(Builtin.int_vector_reduce_umax_Vec16xInt8(vec._storage._value))
   }
+#else
+  typealias Block = (Word, Word, Word, Word)
+#endif
 
-  // If we had enough bytes for two iterations of this, we would have hit
-  // the loop above, so we only need to do this once
-  if unsafe ptr < end64 {
-    let value = unsafe ptr.loadUnaligned(as: UInt64.self)
-    guard value & asciiMask64 == 0 else { return false }
-    unsafe ptr = unsafe ptr + MemoryLayout<UInt64>.stride
+  @_transparent
+  func allASCII(wordAt pointer: UnsafePointer<UInt8>) -> Bool {
+    let word = unsafe UnsafeRawPointer(pointer).loadUnaligned(as: Word.self)
+    return word & mask == 0
   }
 
-  if unsafe ptr < end32 {
-    let value = unsafe ptr.loadUnaligned(as: UInt32.self)
-    guard value & asciiMask32 == 0 else { return false }
-    unsafe ptr = unsafe ptr + MemoryLayout<UInt32>.stride
+  @_transparent
+  func allASCII(blockAt pointer: UnsafePointer<UInt8>) -> Bool {
+    let block = unsafe UnsafeRawPointer(pointer).loadUnaligned(as: Block.self)
+#if arch(i386) || arch(x86_64)
+    return pmovmskb(block.0 | block.1) == 0
+#elseif arch(arm64) || arch(arm64_32)
+    return umaxv(block.0 | block.1) < 0x80
+#else
+    return (block.0 | block.1 | block.2 | block.3) & mask == 0
+#endif
   }
 
+  //----------------------- Implementation proper ----------------------------//
+  guard input.count >= MemoryLayout<Word>.size else {
+    // They gave us a region of memory
+    // whose size is as modest as it can be.
+    // We'll check every byte
+    // for the bit of most height
+    // and return if we happen on any
+    //
+    // I'm sorry, I'm sorry, I'm trying to delete it. (This chunk of code, not
+    // the limerick. I would wager that--at least for Strings--we could
+    // unconditionally load 16B here,¹ because of the small string encoding,
+    // and check them all at once, which would be much more efficient. That
+    // probably has to happen by lifting this check into the SmallString
+    // initializer directly, though.)
+    //
+    // ¹ well, most of the time, which makes it a rather conditional
+    //   "unconditionally".
+    return unsafe input.allSatisfy { $0 < 0x80 }
+  }
+
+  // input.count is non-zero, so we can unconditionally unwrap baseAddress.
+  let base = unsafe input.baseAddress._unsafelyUnwrappedUnchecked
+  let n = input.count
+  var i = 0
 
-  if unsafe ptr < end16 {
-    let value = unsafe ptr.loadUnaligned(as: UInt16.self)
-    guard value & asciiMask16 == 0 else { return false }
-    unsafe ptr = unsafe ptr + MemoryLayout<UInt16>.stride
+  guard n >= MemoryLayout<Block>.size else {
+    // The size isn't yet to a block
+    // word-by-word we are forced to walk.
+    // So as to not leave a gap
+    // the last word may lap
+    // the word that we already chalked.
+    //
+    //    0      k      2k     3k     ?k    n-k    n-1
+    //    |      |      |      |      |      |      |
+    //    +------+------+------+      +------+      |
+    //    | word | word | word |  ... | word |      |
+    //    +------+------+------+      +------+      v
+    //                                    +------+
+    //  possibly overlapping final word > | word |
+    //                                    +------+
+    //
+    // This means that we check any bytes in the overlap region twice, but
+    // that's much preferable to using smaller accesses to avoid rechecking,
+    // because checking the entire last word is about as expensive as checking
+    // just one byte would be, and on average there's more than one byte
+    // remaining.
+    //
+    // Note that we don't bother trying to align any of these accesses, because
+    // there is minimal benefit to doing so on "modern" OoO cores, which can
+    // handle cacheline-crossing loads at full speed. If the string happens to
+    // be aligned, they'll be aligned; if not, they won't be. It will likely
+    // make sense to add a path that does align everything for more limited
+    // embedded CPUs, though.
+    let k = MemoryLayout<Word>.size
+    let last = n &- k
+    while i < last {
+      guard unsafe allASCII(wordAt: base + i) else { return false }
+      i &+= k
+    }
+    return unsafe allASCII(wordAt: base + last)
   }
-
-  if unsafe ptr < end {
-    let value = unsafe ptr.loadUnaligned(fromByteOffset: 0, as: UInt8.self)
-    guard value & asciiMask8 == 0 else { return false }
+
+  // Check block-by-block, with a possibly overlapping last block to avoid
+  // sub-block cleanup. We should be able to avoid manual index arithmetic
+  // and write this loop and the one above as something like the following:
+  //
+  //   return stride(from: 0, to: last, by: k).allSatisfy {
+  //     allASCII(blockAt: base + $0)
+  //   } && allASCII(blockAt: base + last)
+  //
+  // but LLVM leaves one unnecessary conditional operation in the loop
+  // when we do that, so we write them out as while loops instead for now.
+  let k = MemoryLayout<Block>.size
+  let last = n &- k
+  while i < last {
+    guard unsafe allASCII(blockAt: base + i) else { return false }
+    i &+= k
   }
-  unsafe _internalInvariant(ptr == end || ptr + 1 == end)
-  return true
+  return unsafe allASCII(blockAt: base + last)
 }
 
 extension String {
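The delicate part of the change above is the overlapping final load at each
granularity: every count in [word, block) must take the word path, every
count at or above a block must take the block path, and the tail bytes must
still be covered at the boundary sizes. A throwaway harness along these
lines (written against the portable isAllASCII sketch above, not against
the stdlib-internal _allASCII) exercises exactly those edges:

  // Sanity check: the fast path must agree with the naive byte-by-byte
  // definition, especially one byte to either side of the word and block
  // sizes, where the overlapping-tail logic kicks in.
  let boundarySizes = [0, 1, 7, 8, 9, 15, 16, 17, 31, 32, 33]
  for n in boundarySizes {
    for _ in 0..<1000 {
      let bytes = (0..<n).map { _ in UInt8.random(in: 0...255) }
      let expected = bytes.allSatisfy { $0 < 0x80 }
      let actual = bytes.withUnsafeBytes(isAllASCII)
      precondition(actual == expected, "mismatch at count \(n): \(bytes)")
    }
  }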
From 878bd01ba93c2b8afae415831bbb600ad3a85c1c Mon Sep 17 00:00:00 2001
From: Stephen Canon
Date: Fri, 27 Jun 2025 10:35:04 -0400
Subject: [PATCH 2/2] Protect SIMD code in allASCII with
 SWIFT_STDLIB_ENABLE_VECTOR_TYPES

---
 stdlib/public/core/StringCreate.swift | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/stdlib/public/core/StringCreate.swift b/stdlib/public/core/StringCreate.swift
index 9fec206404ca1..ea1827f7bc503 100644
--- a/stdlib/public/core/StringCreate.swift
+++ b/stdlib/public/core/StringCreate.swift
@@ -20,8 +20,8 @@ internal func _allASCII(_ input: UnsafeBufferPointer<UInt8>) -> Bool {
   typealias Word = UInt
 #endif
   let mask = Word(truncatingIfNeeded: 0x80808080_80808080 as UInt64)
-
-#if arch(i386) || arch(x86_64)
+
+#if (arch(i386) || arch(x86_64)) && SWIFT_STDLIB_ENABLE_VECTOR_TYPES
   // TODO: Should consider AVX2 / AVX512 / AVX10 path here
   typealias Block = (SIMD16<Int8>, SIMD16<Int8>)
   @_transparent func pmovmskb(_ vec: SIMD16<Int8>) -> UInt16 {
@@ -29,7 +29,7 @@ internal func _allASCII(_ input: UnsafeBufferPointer<UInt8>) -> Bool {
       Builtin.cmp_slt_Vec16xInt8(vec._storage._value, Builtin.zeroInitializer())
     ))
   }
-#elseif arch(arm64) || arch(arm64_32)
+#elseif (arch(arm64) || arch(arm64_32)) && SWIFT_STDLIB_ENABLE_VECTOR_TYPES
   typealias Block = (SIMD16<UInt8>, SIMD16<UInt8>)
   @_transparent func umaxv(_ vec: SIMD16<UInt8>) -> UInt8 {
     UInt8(Builtin.int_vector_reduce_umax_Vec16xInt8(vec._storage._value))
@@ -47,9 +47,9 @@ internal func _allASCII(_ input: UnsafeBufferPointer<UInt8>) -> Bool {
   @_transparent
   func allASCII(blockAt pointer: UnsafePointer<UInt8>) -> Bool {
     let block = unsafe UnsafeRawPointer(pointer).loadUnaligned(as: Block.self)
-#if arch(i386) || arch(x86_64)
+#if (arch(i386) || arch(x86_64)) && SWIFT_STDLIB_ENABLE_VECTOR_TYPES
     return pmovmskb(block.0 | block.1) == 0
-#elseif arch(arm64) || arch(arm64_32)
+#elseif (arch(arm64) || arch(arm64_32)) && SWIFT_STDLIB_ENABLE_VECTOR_TYPES
     return umaxv(block.0 | block.1) < 0x80
 #else
     return (block.0 | block.1 | block.2 | block.3) & mask == 0
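With SWIFT_STDLIB_ENABLE_VECTOR_TYPES disabled, every configuration now
falls through to the (Word, Word, Word, Word) block, whose test is plain
SWAR: OR the four words together and check for any set high bit. A
standalone sketch of that reduction (illustrative names, not the stdlib's):

  // Scalar (SWAR) block test, as used by the fallback path: any byte with
  // value >= 0x80 in any of the four words leaves a bit set under the
  // repeated-0x80 mask, so one AND and one comparison cover 4 words at once.
  typealias Word = UInt
  let asciiMask = Word(truncatingIfNeeded: 0x8080_8080_8080_8080 as UInt64)

  func blockIsASCII(_ block: (Word, Word, Word, Word)) -> Bool {
    (block.0 | block.1 | block.2 | block.3) & asciiMask == 0
  }

  // A block of "abcd"-style bytes passes; a single 0xC3 byte (the lead
  // byte of "é" in UTF-8) is caught by the mask:
  precondition(blockIsASCII((0x6162_6364, 0x65, 0x66, 0x67)))
  precondition(!blockIsASCII((0x6162_6364, 0xC3, 0x66, 0x67)))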