From bd5189c25a5a02871e63c4c5266ab3e62ed62ba8 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Thu, 15 Jun 2017 13:19:13 -0700 Subject: [PATCH 1/2] [String] Grapheme fast paths for punctuation: 5-8x speedup. Many strings use non-sub-300 punctuation characters (e.g. unicode hyphen, CJK quotes, etc). This can cause switching between fast and slow paths for grapheme breaking. Add in fast-paths for general punctuation characters and CJK punctuation and symbol characters. This results in about a 5-8x speedup for heavily (unicode) punctuated Latiny and CJKy workloads. --- benchmark/single-source/StringWalk.swift | 191 +++++++++++++++++-- benchmark/single-source/StringWalk.swift.gyb | 17 +- benchmark/utils/main.swift | 16 ++ stdlib/public/core/StringCharacterView.swift | 8 + 4 files changed, 207 insertions(+), 25 deletions(-) diff --git a/benchmark/single-source/StringWalk.swift b/benchmark/single-source/StringWalk.swift index e55dad7e799a..1998b23de81c 100644 --- a/benchmark/single-source/StringWalk.swift +++ b/benchmark/single-source/StringWalk.swift @@ -16,15 +16,11 @@ // scripts/generate_harness/generate_harness.py to regenerate this file. //////////////////////////////////////////////////////////////////////////////// - -// Test String subscript performance. // -// Subscript has a slow path that initializes a global variable: -// Swift._cocoaStringSubscript.addressor. Global optimization would -// normally hoist the initializer outside the inner loop (over -// unicodeScalars), forcing the initializer to be called on each -// lap. However, no that the cocoa code is properly marked "slowPath", -// no hoisting should occur. +// Test String iteration performance over a variety of workloads, languages, +// and symbols. +// + import TestsUtils var count: Int = 0 @@ -70,6 +66,8 @@ let japanese = "今回のアップデートでSwiftに大幅な改良が施さ let chinese = "Swift 是面向 Apple 平台的编程语言,功能强大且直观易用,而本次更新对其进行了全面优化。" let korean = "이번 업데이트에서는 강력하면서도 직관적인 Apple 플랫폼용 프로그래밍 언어인 Swift를 완벽히 개선하였습니다." let russian = "в чащах юга жил-был цитрус? да, но фальшивый экземпляр" +let punctuated = "\u{201c}Hello\u{2010}world\u{2026}\u{201d}" +let punctuatedJapanese = "\u{300c}\u{300e}今日は\u{3001}世界\u{3002}\u{300f}\u{300d}" // A workload that's mostly Latin characters, with occasional emoji // interspersed. Common for tweets. @@ -91,7 +89,6 @@ let unicodeScalarsMultiplier = baseMultiplier let charactersMultiplier = baseMultiplier / 5 - @inline(never) public func run_StringWalk_ascii_unicodeScalars(_ N: Int) { for _ in 1...unicodeScalarsMultiplier*N { @@ -177,7 +174,6 @@ public func run_CharIndexing_ascii_unicodeScalars_Backwards(_ N: Int) { - @inline(never) public func run_StringWalk_utf16_unicodeScalars(_ N: Int) { for _ in 1...unicodeScalarsMultiplier*N { @@ -263,7 +259,6 @@ public func run_CharIndexing_utf16_unicodeScalars_Backwards(_ N: Int) { - @inline(never) public func run_StringWalk_tweet_unicodeScalars(_ N: Int) { for _ in 1...unicodeScalarsMultiplier*N { @@ -349,7 +344,6 @@ public func run_CharIndexing_tweet_unicodeScalars_Backwards(_ N: Int) { - @inline(never) public func run_StringWalk_japanese_unicodeScalars(_ N: Int) { for _ in 1...unicodeScalarsMultiplier*N { @@ -435,7 +429,6 @@ public func run_CharIndexing_japanese_unicodeScalars_Backwards(_ N: Int) { - @inline(never) public func run_StringWalk_chinese_unicodeScalars(_ N: Int) { for _ in 1...unicodeScalarsMultiplier*N { @@ -521,7 +514,6 @@ public func run_CharIndexing_chinese_unicodeScalars_Backwards(_ N: Int) { - @inline(never) public func run_StringWalk_korean_unicodeScalars(_ N: Int) { for _ in 1...unicodeScalarsMultiplier*N { @@ -607,7 +599,6 @@ public func run_CharIndexing_korean_unicodeScalars_Backwards(_ N: Int) { - @inline(never) public func run_StringWalk_russian_unicodeScalars(_ N: Int) { for _ in 1...unicodeScalarsMultiplier*N { @@ -692,3 +683,173 @@ public func run_CharIndexing_russian_unicodeScalars_Backwards(_ N: Int) { + +@inline(never) +public func run_StringWalk_punctuated_unicodeScalars(_ N: Int) { + for _ in 1...unicodeScalarsMultiplier*N { + count_unicodeScalars(punctuated.unicodeScalars) + } +} + +@inline(never) +public func run_StringWalk_punctuated_unicodeScalars_Backwards(_ N: Int) { + for _ in 1...unicodeScalarsMultiplier*N { + count_unicodeScalars_rev(punctuated.unicodeScalars.reversed()) + } +} + + + + +@inline(never) +public func run_StringWalk_punctuated_characters(_ N: Int) { + for _ in 1...charactersMultiplier*N { + count_characters(punctuated.characters) + } +} + +@inline(never) +public func run_StringWalk_punctuated_characters_Backwards(_ N: Int) { + for _ in 1...charactersMultiplier*N { + count_characters_rev(punctuated.characters.reversed()) + } +} + + + + +let punctuatedCharacters = Array(punctuated) + +@inline(never) +public func run_CharIteration_punctuated_unicodeScalars(_ N: Int) { + for _ in 1...unicodeScalarsMultiplier*N { + for c in punctuatedCharacters { + for u in c.unicodeScalars { + count |= Int(u.value) + } + } + } +} + +@inline(never) +public func run_CharIteration_punctuated_unicodeScalars_Backwards(_ N: Int) { + for _ in 1...unicodeScalarsMultiplier*N { + for c in punctuatedCharacters { + for u in c.unicodeScalars.reversed() { + count |= Int(u.value) + } + } + } +} + +@inline(never) +public func run_CharIndexing_punctuated_unicodeScalars(_ N: Int) { + for _ in 1...unicodeScalarsMultiplier*N { + for c in punctuatedCharacters { + let s = c.unicodeScalars + for i in s.indices { + count |= Int(s[i].value) + } + } + } +} + +@inline(never) +public func run_CharIndexing_punctuated_unicodeScalars_Backwards(_ N: Int) { + for _ in 1...unicodeScalarsMultiplier*N { + for c in punctuatedCharacters { + let s = c.unicodeScalars + for i in s.indices.reversed() { + count |= Int(s[i].value) + } + } + } +} + + + + +@inline(never) +public func run_StringWalk_punctuatedJapanese_unicodeScalars(_ N: Int) { + for _ in 1...unicodeScalarsMultiplier*N { + count_unicodeScalars(punctuatedJapanese.unicodeScalars) + } +} + +@inline(never) +public func run_StringWalk_punctuatedJapanese_unicodeScalars_Backwards(_ N: Int) { + for _ in 1...unicodeScalarsMultiplier*N { + count_unicodeScalars_rev(punctuatedJapanese.unicodeScalars.reversed()) + } +} + + + + +@inline(never) +public func run_StringWalk_punctuatedJapanese_characters(_ N: Int) { + for _ in 1...charactersMultiplier*N { + count_characters(punctuatedJapanese.characters) + } +} + +@inline(never) +public func run_StringWalk_punctuatedJapanese_characters_Backwards(_ N: Int) { + for _ in 1...charactersMultiplier*N { + count_characters_rev(punctuatedJapanese.characters.reversed()) + } +} + + + + +let punctuatedJapaneseCharacters = Array(punctuatedJapanese) + +@inline(never) +public func run_CharIteration_punctuatedJapanese_unicodeScalars(_ N: Int) { + for _ in 1...unicodeScalarsMultiplier*N { + for c in punctuatedJapaneseCharacters { + for u in c.unicodeScalars { + count |= Int(u.value) + } + } + } +} + +@inline(never) +public func run_CharIteration_punctuatedJapanese_unicodeScalars_Backwards(_ N: Int) { + for _ in 1...unicodeScalarsMultiplier*N { + for c in punctuatedJapaneseCharacters { + for u in c.unicodeScalars.reversed() { + count |= Int(u.value) + } + } + } +} + +@inline(never) +public func run_CharIndexing_punctuatedJapanese_unicodeScalars(_ N: Int) { + for _ in 1...unicodeScalarsMultiplier*N { + for c in punctuatedJapaneseCharacters { + let s = c.unicodeScalars + for i in s.indices { + count |= Int(s[i].value) + } + } + } +} + +@inline(never) +public func run_CharIndexing_punctuatedJapanese_unicodeScalars_Backwards(_ N: Int) { + for _ in 1...unicodeScalarsMultiplier*N { + for c in punctuatedJapaneseCharacters { + let s = c.unicodeScalars + for i in s.indices.reversed() { + count |= Int(s[i].value) + } + } + } +} + + + diff --git a/benchmark/single-source/StringWalk.swift.gyb b/benchmark/single-source/StringWalk.swift.gyb index a3e19130fcc8..d2094273d430 100644 --- a/benchmark/single-source/StringWalk.swift.gyb +++ b/benchmark/single-source/StringWalk.swift.gyb @@ -17,15 +17,11 @@ // scripts/generate_harness/generate_harness.py to regenerate this file. //////////////////////////////////////////////////////////////////////////////// - -// Test String subscript performance. // -// Subscript has a slow path that initializes a global variable: -// Swift._cocoaStringSubscript.addressor. Global optimization would -// normally hoist the initializer outside the inner loop (over -// unicodeScalars), forcing the initializer to be called on each -// lap. However, no that the cocoa code is properly marked "slowPath", -// no hoisting should occur. +// Test String iteration performance over a variety of workloads, languages, +// and symbols. +// + import TestsUtils var count: Int = 0 @@ -71,6 +67,8 @@ let japanese = "今回のアップデートでSwiftに大幅な改良が施さ let chinese = "Swift 是面向 Apple 平台的编程语言,功能强大且直观易用,而本次更新对其进行了全面优化。" let korean = "이번 업데이트에서는 강력하면서도 직관적인 Apple 플랫폼용 프로그래밍 언어인 Swift를 완벽히 개선하였습니다." let russian = "в чащах юга жил-был цитрус? да, но фальшивый экземпляр" +let punctuated = "\u{201c}Hello\u{2010}world\u{2026}\u{201d}" +let punctuatedJapanese = "\u{300c}\u{300e}今日は\u{3001}世界\u{3002}\u{300f}\u{300d}" // A workload that's mostly Latin characters, with occasional emoji // interspersed. Common for tweets. @@ -91,8 +89,7 @@ let baseMultiplier = 10_000 let unicodeScalarsMultiplier = baseMultiplier let charactersMultiplier = baseMultiplier / 5 -% for Name in ["ascii", "utf16", "tweet", "japanese", "chinese", "korean", "russian"]: - +% for Name in ["ascii", "utf16", "tweet", "japanese", "chinese", "korean", "russian", "punctuated", "punctuatedJapanese"]: % for Kind in ["unicodeScalars", "characters"]: @inline(never) diff --git a/benchmark/utils/main.swift b/benchmark/utils/main.swift index 18a0ad6dad86..6a4b6fe25877 100644 --- a/benchmark/utils/main.swift +++ b/benchmark/utils/main.swift @@ -178,6 +178,10 @@ addTo(&precommitTests, "CharIndexing_japanese_unicodeScalars", run_CharIndexing_ addTo(&precommitTests, "CharIndexing_japanese_unicodeScalars_Backwards", run_CharIndexing_japanese_unicodeScalars_Backwards) addTo(&precommitTests, "CharIndexing_korean_unicodeScalars", run_CharIndexing_korean_unicodeScalars) addTo(&precommitTests, "CharIndexing_korean_unicodeScalars_Backwards", run_CharIndexing_korean_unicodeScalars_Backwards) +addTo(&precommitTests, "CharIndexing_punctuatedJapanese_unicodeScalars", run_CharIndexing_punctuatedJapanese_unicodeScalars) +addTo(&precommitTests, "CharIndexing_punctuatedJapanese_unicodeScalars_Backwards", run_CharIndexing_punctuatedJapanese_unicodeScalars_Backwards) +addTo(&precommitTests, "CharIndexing_punctuated_unicodeScalars", run_CharIndexing_punctuated_unicodeScalars) +addTo(&precommitTests, "CharIndexing_punctuated_unicodeScalars_Backwards", run_CharIndexing_punctuated_unicodeScalars_Backwards) addTo(&precommitTests, "CharIndexing_russian_unicodeScalars", run_CharIndexing_russian_unicodeScalars) addTo(&precommitTests, "CharIndexing_russian_unicodeScalars_Backwards", run_CharIndexing_russian_unicodeScalars_Backwards) addTo(&precommitTests, "CharIndexing_tweet_unicodeScalars", run_CharIndexing_tweet_unicodeScalars) @@ -192,6 +196,10 @@ addTo(&precommitTests, "CharIteration_japanese_unicodeScalars", run_CharIteratio addTo(&precommitTests, "CharIteration_japanese_unicodeScalars_Backwards", run_CharIteration_japanese_unicodeScalars_Backwards) addTo(&precommitTests, "CharIteration_korean_unicodeScalars", run_CharIteration_korean_unicodeScalars) addTo(&precommitTests, "CharIteration_korean_unicodeScalars_Backwards", run_CharIteration_korean_unicodeScalars_Backwards) +addTo(&precommitTests, "CharIteration_punctuatedJapanese_unicodeScalars", run_CharIteration_punctuatedJapanese_unicodeScalars) +addTo(&precommitTests, "CharIteration_punctuatedJapanese_unicodeScalars_Backwards", run_CharIteration_punctuatedJapanese_unicodeScalars_Backwards) +addTo(&precommitTests, "CharIteration_punctuated_unicodeScalars", run_CharIteration_punctuated_unicodeScalars) +addTo(&precommitTests, "CharIteration_punctuated_unicodeScalars_Backwards", run_CharIteration_punctuated_unicodeScalars_Backwards) addTo(&precommitTests, "CharIteration_russian_unicodeScalars", run_CharIteration_russian_unicodeScalars) addTo(&precommitTests, "CharIteration_russian_unicodeScalars_Backwards", run_CharIteration_russian_unicodeScalars_Backwards) addTo(&precommitTests, "CharIteration_tweet_unicodeScalars", run_CharIteration_tweet_unicodeScalars) @@ -563,6 +571,14 @@ addTo(&stringTests, "StringWalk_korean_characters", run_StringWalk_korean_charac addTo(&stringTests, "StringWalk_korean_characters_Backwards", run_StringWalk_korean_characters_Backwards) addTo(&stringTests, "StringWalk_korean_unicodeScalars", run_StringWalk_korean_unicodeScalars) addTo(&stringTests, "StringWalk_korean_unicodeScalars_Backwards", run_StringWalk_korean_unicodeScalars_Backwards) +addTo(&stringTests, "StringWalk_punctuatedJapanese_characters", run_StringWalk_punctuatedJapanese_characters) +addTo(&stringTests, "StringWalk_punctuatedJapanese_characters_Backwards", run_StringWalk_punctuatedJapanese_characters_Backwards) +addTo(&stringTests, "StringWalk_punctuatedJapanese_unicodeScalars", run_StringWalk_punctuatedJapanese_unicodeScalars) +addTo(&stringTests, "StringWalk_punctuatedJapanese_unicodeScalars_Backwards", run_StringWalk_punctuatedJapanese_unicodeScalars_Backwards) +addTo(&stringTests, "StringWalk_punctuated_characters", run_StringWalk_punctuated_characters) +addTo(&stringTests, "StringWalk_punctuated_characters_Backwards", run_StringWalk_punctuated_characters_Backwards) +addTo(&stringTests, "StringWalk_punctuated_unicodeScalars", run_StringWalk_punctuated_unicodeScalars) +addTo(&stringTests, "StringWalk_punctuated_unicodeScalars_Backwards", run_StringWalk_punctuated_unicodeScalars_Backwards) addTo(&stringTests, "StringWalk_russian_characters", run_StringWalk_russian_characters) addTo(&stringTests, "StringWalk_russian_characters_Backwards", run_StringWalk_russian_characters_Backwards) addTo(&stringTests, "StringWalk_russian_unicodeScalars", run_StringWalk_russian_unicodeScalars) diff --git a/stdlib/public/core/StringCharacterView.swift b/stdlib/public/core/StringCharacterView.swift index 104b4e84c752..ce6ff668e3b9 100644 --- a/stdlib/public/core/StringCharacterView.swift +++ b/stdlib/public/core/StringCharacterView.swift @@ -344,6 +344,14 @@ extension String.CharacterView : BidirectionalCollection { // 0xAC00–0xD7AF case 0xac00...0xd7af: return true + // Common general use punctuation, excluding extenders: + // 0x2010-0x2029 + case 0x2010...0x2029: return true + + // CJK punctuation characters, excluding extenders: + // 0x3000-0x3029 + case 0x3000...0x3029: return true + default: return false } } From 4c0ba61e5338c50914b21c5223bbd090ae16b999 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Tue, 27 Jun 2017 20:37:16 -0700 Subject: [PATCH 2/2] [gardening] Remove done TODO comments --- stdlib/public/core/StringCharacterView.swift | 3 --- 1 file changed, 3 deletions(-) diff --git a/stdlib/public/core/StringCharacterView.swift b/stdlib/public/core/StringCharacterView.swift index ce6ff668e3b9..debf0ba1d8d6 100644 --- a/stdlib/public/core/StringCharacterView.swift +++ b/stdlib/public/core/StringCharacterView.swift @@ -315,7 +315,6 @@ extension String.CharacterView : BidirectionalCollection { // others: // 0x3400-0xA4CF case 0x3400...0xa4cf: return true - // TODO: CJK punctuation // Repeat sub-300 check, this is beneficial for common cases of Latin // characters embedded within non-Latin script (e.g. newlines, spaces, @@ -324,8 +323,6 @@ extension String.CharacterView : BidirectionalCollection { // NOTE: CR-LF special case has already been checked. case 0x0000...0x02ff: return true - // TODO: general punctuation - // Non-combining kana: // 0x3041-0x3096 // 0x30A1-0x30FA