From cb48c6d0d71964ddec3e8b18dcad1e1d23ff94a1 Mon Sep 17 00:00:00 2001
From: Stephen Canon
Date: Thu, 26 Jun 2025 14:13:16 -0400
Subject: [PATCH 1/2] Optimization pass over String and UTF8Span's allASCII helper

This ranges between parity (for very small strings) and 5x faster (for
32-63B strings) in benchmarking on an M1 MBP. For largish strings it
delivers a roughly 2x speedup; increasing the block size further nets a
small additional win in microbenchmarks, but I do not expect that to
translate to real-world usage, due to the code-size impact and the fact
that most strings are smallish.

There's some opportunity for further work here; in particular, if people
start building Swift for a baseline of AVX2 or AVX512, we should have
paths for that (and we should also implement them if/when we get better
multiversioning dispatch machinery in the language). Span adoption would
be interesting. It's likely we should have a dedicated "small core"
implementation that uses only aligned accesses. Still, this is a
significant improvement as-is, and we should land it.
---
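Notes: the shape of the algorithm, minus the stdlib-internal Builtins, can
be expressed with public API only. The sketch below mirrors the structure
of the patch (byte-by-byte under one word, word-by-word under one block,
block-by-block above that, with a possibly overlapping final load at each
level), but it is an illustration rather than the stdlib code: the name
isAllASCII is made up here, and SIMD16<UInt8>.max() stands in for the
umaxv/pmovmskb reductions.

  // Portable sketch of the patch's strategy (illustrative, not the stdlib
  // implementation): scan at the widest granularity the count allows, and
  // finish each level with a possibly overlapping final load.
  func isAllASCII(_ bytes: UnsafeRawBufferPointer) -> Bool {
    let n = bytes.count
    let wordSize = MemoryLayout<UInt>.size
    let blockSize = MemoryLayout<SIMD16<UInt8>>.size
    let mask = UInt(truncatingIfNeeded: 0x8080_8080_8080_8080 as UInt64)
    guard n >= wordSize else {
      // Too small for even one word; check each byte.
      return bytes.allSatisfy { $0 < 0x80 }
    }
    guard n >= blockSize else {
      // Word-by-word, re-checking up to wordSize-1 bytes in the final
      // overlapping word rather than running a sub-word cleanup loop.
      var i = 0
      while i < n - wordSize {
        guard bytes.loadUnaligned(fromByteOffset: i, as: UInt.self) & mask == 0 else {
          return false
        }
        i += wordSize
      }
      return bytes.loadUnaligned(fromByteOffset: n - wordSize, as: UInt.self) & mask == 0
    }
    // Block-by-block, with the same overlapping-tail trick; .max() stands
    // in for the vector reductions the patch uses.
    var i = 0
    while i < n - blockSize {
      guard bytes.loadUnaligned(fromByteOffset: i, as: SIMD16<UInt8>.self).max() < 0x80 else {
        return false
      }
      i += blockSize
    }
    return bytes.loadUnaligned(fromByteOffset: n - blockSize, as: SIMD16<UInt8>.self).max() < 0x80
  }

Called on a buffer of UTF-8 bytes, e.g.:

  let ascii = Array("hello".utf8).withUnsafeBytes(isAllASCII)     // true
  let notAscii = Array("héllo".utf8).withUnsafeBytes(isAllASCII)  // false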
 stdlib/public/core/StringCreate.swift | 156 ++++++++++++++++++--------
 1 file changed, 111 insertions(+), 45 deletions(-)

diff --git a/stdlib/public/core/StringCreate.swift b/stdlib/public/core/StringCreate.swift
index 2bdbfb484106c..9fec206404ca1 100644
--- a/stdlib/public/core/StringCreate.swift
+++ b/stdlib/public/core/StringCreate.swift
@@ -13,60 +13,126 @@
 //===----------------------------------------------------------------------===//
 
 internal func _allASCII(_ input: UnsafeBufferPointer<UInt8>) -> Bool {
-  if input.isEmpty { return true }
-
-  // NOTE: Avoiding for-in syntax to avoid bounds checks
-  //
-  // TODO(String performance): SIMD-ize
-  //
-  let count = input.count
-  var ptr = unsafe UnsafeRawPointer(input.baseAddress._unsafelyUnwrappedUnchecked)
-
-  let asciiMask64 = 0x8080_8080_8080_8080 as UInt64
-  let asciiMask32 = UInt32(truncatingIfNeeded: asciiMask64)
-  let asciiMask16 = UInt16(truncatingIfNeeded: asciiMask64)
-  let asciiMask8 = UInt8(truncatingIfNeeded: asciiMask64)
+  //--------------- Implementation building blocks ---------------------------//
+#if arch(arm64_32)
+  typealias Word = UInt64
+#else
+  typealias Word = UInt
+#endif
+  let mask = Word(truncatingIfNeeded: 0x80808080_80808080 as UInt64)
 
-  let end128 = unsafe ptr + count & ~(MemoryLayout<(UInt64, UInt64)>.stride &- 1)
-  let end64 = unsafe ptr + count & ~(MemoryLayout<UInt64>.stride &- 1)
-  let end32 = unsafe ptr + count & ~(MemoryLayout<UInt32>.stride &- 1)
-  let end16 = unsafe ptr + count & ~(MemoryLayout<UInt16>.stride &- 1)
-  let end = unsafe ptr + count
-
-
-  while unsafe ptr < end128 {
-    let pair = unsafe ptr.loadUnaligned(as: (UInt64, UInt64).self)
-    let result = (pair.0 | pair.1) & asciiMask64
-    guard result == 0 else { return false }
-    unsafe ptr = unsafe ptr + MemoryLayout<(UInt64, UInt64)>.stride
+#if arch(i386) || arch(x86_64)
+  // TODO: Should consider AVX2 / AVX512 / AVX10 path here
+  typealias Block = (SIMD16<Int8>, SIMD16<Int8>)
+  @_transparent func pmovmskb(_ vec: SIMD16<Int8>) -> UInt16 {
+    UInt16(Builtin.bitcast_Vec16xInt1_Int16(
+      Builtin.cmp_slt_Vec16xInt8(vec._storage._value, Builtin.zeroInitializer())
+    ))
+  }
+#elseif arch(arm64) || arch(arm64_32)
+  typealias Block = (SIMD16<UInt8>, SIMD16<UInt8>)
+  @_transparent func umaxv(_ vec: SIMD16<UInt8>) -> UInt8 {
+    UInt8(Builtin.int_vector_reduce_umax_Vec16xInt8(vec._storage._value))
   }
+#else
+  typealias Block = (Word, Word, Word, Word)
+#endif
 
-  // If we had enough bytes for two iterations of this, we would have hit
-  // the loop above, so we only need to do this once
-  if unsafe ptr < end64 {
-    let value = unsafe ptr.loadUnaligned(as: UInt64.self)
-    guard value & asciiMask64 == 0 else { return false }
-    unsafe ptr = unsafe ptr + MemoryLayout<UInt64>.stride
+  @_transparent
+  func allASCII(wordAt pointer: UnsafePointer<UInt8>) -> Bool {
+    let word = unsafe UnsafeRawPointer(pointer).loadUnaligned(as: Word.self)
+    return word & mask == 0
   }
 
-  if unsafe ptr < end32 {
-    let value = unsafe ptr.loadUnaligned(as: UInt32.self)
-    guard value & asciiMask32 == 0 else { return false }
-    unsafe ptr = unsafe ptr + MemoryLayout<UInt32>.stride
+  @_transparent
+  func allASCII(blockAt pointer: UnsafePointer<UInt8>) -> Bool {
+    let block = unsafe UnsafeRawPointer(pointer).loadUnaligned(as: Block.self)
+#if arch(i386) || arch(x86_64)
+    return pmovmskb(block.0 | block.1) == 0
+#elseif arch(arm64) || arch(arm64_32)
+    return umaxv(block.0 | block.1) < 0x80
+#else
+    return (block.0 | block.1 | block.2 | block.3) & mask == 0
+#endif
   }
 
+  //----------------------- Implementation proper ----------------------------//
+  guard input.count >= MemoryLayout<Word>.size else {
+    // They gave us a region of memory
+    // whose size is as modest as it can be.
+    // We'll check every byte
+    // for the bit of most height
+    // and return if we happen on any
+    //
+    // I'm sorry, I'm sorry, I'm trying to delete it. (This chunk of code, not
+    // the limerick. I would wager that--at least for Strings--we could
+    // unconditionally load 16B here,¹ because of the small string encoding,
+    // and check them all at once, which would be much more efficient. That
+    // probably has to happen by lifting this check into the SmallString
+    // initializer directly, though.)
+    //
+    // ¹ well, most of the time, which makes it a rather conditional
+    //   "unconditionally".
+    return unsafe input.allSatisfy { $0 < 0x80 }
+  }
+
+  // input.count is non-zero, so we can unconditionally unwrap baseAddress.
+  let base = unsafe input.baseAddress._unsafelyUnwrappedUnchecked
+  let n = input.count
+  var i = 0
 
-  if unsafe ptr < end16 {
-    let value = unsafe ptr.loadUnaligned(as: UInt16.self)
-    guard value & asciiMask16 == 0 else { return false }
-    unsafe ptr = unsafe ptr + MemoryLayout<UInt16>.stride
+  guard n >= MemoryLayout<Block>.size else {
+    // The size isn't yet to a block
+    // word-by-word we are forced to walk.
+    // So as to not leave a gap
+    // the last word may lap
+    // the word that we already chalked.
+    //
+    //    0      k      2k     3k     ?k    n-k    n-1
+    //    |      |      |      |      |      |      |
+    //    +------+------+------+      +------+      |
+    //    | word | word | word |  ... | word |      |
+    //    +------+------+------+      +------+      v
+    //                                    +------+
+    //  possibly overlapping final word > | word |
+    //                                    +------+
+    //
+    // This means that we check any bytes in the overlap region twice, but
+    // that's much preferable to using smaller accesses to avoid rechecking,
+    // because checking the entire last word is about as expensive as checking
+    // just one byte would be, and on average there's more than one byte
+    // remaining.
+    //
+    // Note that we don't bother trying to align any of these accesses, because
+    // there is minimal benefit to doing so on "modern" OoO cores, which can
+    // handle cacheline-crossing loads at full speed. If the string happens to
+    // be aligned, they'll be aligned; if not, they won't be. It will likely
+    // make sense to add a path that does align everything for more limited
+    // embedded CPUs, though.
+    let k = MemoryLayout<Word>.size
+    let last = n &- k
+    while i < last {
+      guard unsafe allASCII(wordAt: base + i) else { return false }
+      i &+= k
+    }
+    return unsafe allASCII(wordAt: base + last)
   }
-
-  if unsafe ptr < end {
-    let value = unsafe ptr.loadUnaligned(fromByteOffset: 0, as: UInt8.self)
-    guard value & asciiMask8 == 0 else { return false }
+
+  // Check block-by-block, with a possibly overlapping last block to avoid
+  // sub-block cleanup. We should be able to avoid manual index arithmetic
+  // and write this loop and the one above as something like the following:
+  //
+  //   return stride(from: 0, to: last, by: k).allSatisfy {
+  //     allASCII(blockAt: base + $0)
+  //   } && allASCII(blockAt: base + last)
+  //
+  // but LLVM leaves one unnecessary conditional operation in the loop
+  // when we do that, so we write them out as while loops instead for now.
+  let k = MemoryLayout<Block>.size
+  let last = n &- k
+  while i < last {
+    guard unsafe allASCII(blockAt: base + i) else { return false }
+    i &+= k
   }
-  unsafe _internalInvariant(ptr == end || ptr + 1 == end)
-  return true
+  return unsafe allASCII(blockAt: base + last)
 }
 
 extension String {
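The delicate part of the change above is the overlapping final load at each
granularity: every count in [word, block) must take the word path, every
count at or above a block must take the block path, and the tail bytes must
still be covered at the boundary sizes. A throwaway harness along these
lines (written against the portable isAllASCII sketch above, not against
the stdlib-internal _allASCII) exercises exactly those edges:

  // Sanity check: the fast path must agree with the naive byte-by-byte
  // definition, especially one byte to either side of the word and block
  // sizes, where the overlapping-tail logic kicks in.
  let boundarySizes = [0, 1, 7, 8, 9, 15, 16, 17, 31, 32, 33]
  for n in boundarySizes {
    for _ in 0..<1000 {
      let bytes = (0..<n).map { _ in UInt8.random(in: 0...255) }
      let expected = bytes.allSatisfy { $0 < 0x80 }
      let actual = bytes.withUnsafeBytes(isAllASCII)
      precondition(actual == expected, "mismatch at count \(n): \(bytes)")
    }
  }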
From 878bd01ba93c2b8afae415831bbb600ad3a85c1c Mon Sep 17 00:00:00 2001
From: Stephen Canon
Date: Fri, 27 Jun 2025 10:35:04 -0400
Subject: [PATCH 2/2] Protect SIMD code in allASCII with
 SWIFT_STDLIB_ENABLE_VECTOR_TYPES

---
 stdlib/public/core/StringCreate.swift | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/stdlib/public/core/StringCreate.swift b/stdlib/public/core/StringCreate.swift
index 9fec206404ca1..ea1827f7bc503 100644
--- a/stdlib/public/core/StringCreate.swift
+++ b/stdlib/public/core/StringCreate.swift
@@ -20,8 +20,8 @@ internal func _allASCII(_ input: UnsafeBufferPointer<UInt8>) -> Bool {
   typealias Word = UInt
 #endif
   let mask = Word(truncatingIfNeeded: 0x80808080_80808080 as UInt64)
-
-#if arch(i386) || arch(x86_64)
+
+#if (arch(i386) || arch(x86_64)) && SWIFT_STDLIB_ENABLE_VECTOR_TYPES
   // TODO: Should consider AVX2 / AVX512 / AVX10 path here
   typealias Block = (SIMD16<Int8>, SIMD16<Int8>)
   @_transparent func pmovmskb(_ vec: SIMD16<Int8>) -> UInt16 {
@@ -29,7 +29,7 @@ internal func _allASCII(_ input: UnsafeBufferPointer<UInt8>) -> Bool {
       Builtin.cmp_slt_Vec16xInt8(vec._storage._value, Builtin.zeroInitializer())
     ))
   }
-#elseif arch(arm64) || arch(arm64_32)
+#elseif (arch(arm64) || arch(arm64_32)) && SWIFT_STDLIB_ENABLE_VECTOR_TYPES
   typealias Block = (SIMD16<UInt8>, SIMD16<UInt8>)
   @_transparent func umaxv(_ vec: SIMD16<UInt8>) -> UInt8 {
     UInt8(Builtin.int_vector_reduce_umax_Vec16xInt8(vec._storage._value))
@@ -47,9 +47,9 @@ internal func _allASCII(_ input: UnsafeBufferPointer<UInt8>) -> Bool {
   @_transparent
   func allASCII(blockAt pointer: UnsafePointer<UInt8>) -> Bool {
     let block = unsafe UnsafeRawPointer(pointer).loadUnaligned(as: Block.self)
-#if arch(i386) || arch(x86_64)
+#if (arch(i386) || arch(x86_64)) && SWIFT_STDLIB_ENABLE_VECTOR_TYPES
     return pmovmskb(block.0 | block.1) == 0
-#elseif arch(arm64) || arch(arm64_32)
+#elseif (arch(arm64) || arch(arm64_32)) && SWIFT_STDLIB_ENABLE_VECTOR_TYPES
     return umaxv(block.0 | block.1) < 0x80
 #else
     return (block.0 | block.1 | block.2 | block.3) & mask == 0
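With SWIFT_STDLIB_ENABLE_VECTOR_TYPES disabled, every configuration now
falls through to the (Word, Word, Word, Word) block, whose test is plain
SWAR: OR the four words together and check for any set high bit. A
standalone sketch of that reduction (illustrative names, not the stdlib's):

  // Scalar (SWAR) block test, as used by the fallback path: any byte with
  // value >= 0x80 in any of the four words leaves a bit set under the
  // repeated-0x80 mask, so one AND and one comparison cover 4 words at once.
  typealias Word = UInt
  let asciiMask = Word(truncatingIfNeeded: 0x8080_8080_8080_8080 as UInt64)

  func blockIsASCII(_ block: (Word, Word, Word, Word)) -> Bool {
    (block.0 | block.1 | block.2 | block.3) & asciiMask == 0
  }

  // A block of "abcd"-style bytes passes; a single 0xC3 byte (the lead
  // byte of "é" in UTF-8) is caught by the mask:
  precondition(blockIsASCII((0x6162_6364, 0x65, 0x66, 0x67)))
  precondition(!blockIsASCII((0x6162_6364, 0xC3, 0x66, 0x67)))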