From fa299b573aaa639a245763631aeee751d03ef7a9 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 6 May 2022 10:39:16 -0500 Subject: [PATCH 1/2] Use full case folding outside of character classes This should get us part way to proper full case folding support. To handle case folding that results in multiple characters, we'll need to switch to more of a sequence-based comparison than just matching a single character. --- Sources/_StringProcessing/ByteCodeGen.swift | 8 ++--- .../_StringProcessing/ConsumerInterface.swift | 2 +- .../Unicode/CaseConversion.swift | 18 ++++++++++ .../_CharacterClassModel.swift | 2 +- Tests/RegexTests/UTS18Tests.swift | 33 ++++++++++++------- 5 files changed, 46 insertions(+), 17 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 2131d1eb5..f9549039d 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -180,9 +180,7 @@ extension Compiler.ByteCodeGen { if options.isCaseInsensitive && c.isCased { // TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true) builder.buildConsume { input, bounds in - let inputChar = input[bounds.lowerBound].lowercased() - let matchChar = c.lowercased() - return inputChar == matchChar + return input[bounds.lowerBound].caseFoldedEquals(c) ? input.index(after: bounds.lowerBound) : nil } @@ -637,11 +635,13 @@ extension Compiler.ByteCodeGen { if options.isCaseInsensitive { // TODO: buildCaseInsensitiveMatchSequence(c) or alternative builder.buildConsume { input, bounds in + // FIXME: This needs to iterate over the case-folded strings, not + // iterate and then case-fold as we go. 
var iterator = s.makeIterator() var currentIndex = bounds.lowerBound while let ch = iterator.next() { guard currentIndex < bounds.upperBound, - ch.lowercased() == input[currentIndex].lowercased() + ch.caseFoldedEquals(input[currentIndex]) else { return nil } input.formIndex(after: &currentIndex) } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index d27b89314..15eacaa8f 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -67,7 +67,7 @@ extension DSLTree.Atom { return { input, bounds in let low = bounds.lowerBound if isCaseInsensitive && c.isCased { - return input[low].lowercased() == c.lowercased() + return input[low].caseFoldedEquals(c) ? input.index(after: low) : nil } else { diff --git a/Sources/_StringProcessing/Unicode/CaseConversion.swift b/Sources/_StringProcessing/Unicode/CaseConversion.swift index cfa68c425..38c25b74e 100644 --- a/Sources/_StringProcessing/Unicode/CaseConversion.swift +++ b/Sources/_StringProcessing/Unicode/CaseConversion.swift @@ -9,6 +9,24 @@ // //===----------------------------------------------------------------------===// +@_spi(_Unicode) import Swift // TODO +extension Character { + /// Whether this character and `c` are equal when case folded. + func caseFoldedEquals(_ c: Character) -> Bool { + guard #available(SwiftStdlib 5.7, *) else { fatalError() } + let foldedSelf = unicodeScalars.map(\.properties._caseFolded).joined() + let foldedOther = c.unicodeScalars.map(\.properties._caseFolded).joined() + return foldedSelf == foldedOther + } +} + +extension UnicodeScalar { + /// Whether this Unicode scalar and `s` are equal when case folded.
+ func caseFoldedEquals(_ s: UnicodeScalar) -> Bool { + guard #available(SwiftStdlib 5.7, *) else { fatalError() } + return properties._caseFolded == s.properties._caseFolded + } +} diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index fc3fd5741..bc644aa13 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -98,7 +98,7 @@ public struct _CharacterClassModel: Hashable { switch self { case .character(let c): if options.isCaseInsensitive { - return c.lowercased() == character.lowercased() + return c.caseFoldedEquals(character) } else { return c == character } diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index 71f459a1b..02840753d 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -230,20 +230,31 @@ extension UTS18Tests { expectFirstMatch("Dåb", regex(#"Dåb"#).ignoresCase(), "Dåb") expectFirstMatch("dÅB", regex(#"Dåb"#).ignoresCase(), "dÅB") expectFirstMatch("D\u{212B}B", regex(#"Dåb"#).ignoresCase(), "D\u{212B}B") - } + + let sigmas = "σΣς" + expectFirstMatch(sigmas, regex(#"^σ+$"#).ignoresCase(), sigmas[...]) + expectFirstMatch(sigmas, regex(#"^Σ+$"#).ignoresCase(), sigmas[...]) + expectFirstMatch(sigmas, regex(#"^ς+$"#).ignoresCase(), sigmas[...]) - func testSimpleLooseMatches_XFail() { - XCTExpectFailure("Need case folding support") { - let sigmas = "σΣς" - expectFirstMatch(sigmas, regex(#"σ+"#).ignoresCase(), sigmas[...]) - expectFirstMatch(sigmas, regex(#"Σ+"#).ignoresCase(), sigmas[...]) - expectFirstMatch(sigmas, regex(#"ς+"#).ignoresCase(), sigmas[...]) - - // TODO: Test German sharp S - // TODO: Test char classes, e.g. 
[\p{Block=Phonetic_Extensions} [A-E]] + // Custom character classes + for regexCh in sigmas { + for inputCh in sigmas { + expectFirstMatch(String(inputCh), regex("[\(regexCh)]").ignoresCase(), String(inputCh)[...]) + if regexCh != inputCh { + XCTAssertFalse(String(inputCh).contains(regex("[\(regexCh)]"))) + } + } } + + expectFirstMatch("Strauß", regex("ß").ignoresCase(), "ß") + XCTExpectFailure { + expectFirstMatch("Strauss", regex("ß").ignoresCase(), "ss") + } + + // TODO: Test char classes, e.g. [\p{Block=Phonetic_Extensions} [A-E]] + // TODO: Document when full case folding applies } - + // RL1.6 Line Boundaries // // To meet this requirement, if an implementation provides for line-boundary From ace7480f5a53153e631b2487f496103c27e1830e Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Mon, 9 May 2022 11:23:27 -0500 Subject: [PATCH 2/2] Lazily compare case folded strings --- Sources/_StringProcessing/Unicode/CaseConversion.swift | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Sources/_StringProcessing/Unicode/CaseConversion.swift b/Sources/_StringProcessing/Unicode/CaseConversion.swift index 38c25b74e..c3706ddfe 100644 --- a/Sources/_StringProcessing/Unicode/CaseConversion.swift +++ b/Sources/_StringProcessing/Unicode/CaseConversion.swift @@ -17,9 +17,9 @@ extension Character { /// Whether this character and `c` are equal when case folded. func caseFoldedEquals(_ c: Character) -> Bool { guard #available(SwiftStdlib 5.7, *) else { fatalError() } - let foldedSelf = unicodeScalars.map(\.properties._caseFolded).joined() - let foldedOther = c.unicodeScalars.map(\.properties._caseFolded).joined() - return foldedSelf == foldedOther + let foldedSelf = unicodeScalars.lazy.map(\.properties._caseFolded).joined() + let foldedOther = c.unicodeScalars.lazy.map(\.properties._caseFolded).joined() + return foldedSelf.elementsEqual(foldedOther) } }