diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index d30cab209..961641a59 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -198,9 +198,7 @@ extension Compiler.ByteCodeGen { if options.isCaseInsensitive && c.isCased { // TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true) builder.buildConsume { input, bounds in - let inputChar = input[bounds.lowerBound].lowercased() - let matchChar = c.lowercased() - return inputChar == matchChar + return input[bounds.lowerBound].caseFoldedEquals(c) ? input.index(after: bounds.lowerBound) : nil } @@ -655,11 +653,13 @@ extension Compiler.ByteCodeGen { if options.isCaseInsensitive { // TODO: buildCaseInsensitiveMatchSequence(c) or alternative builder.buildConsume { input, bounds in + // FIXME: This needs to iterate over the case-folded strings, not + // iterate and then case-fold as we go. var iterator = s.makeIterator() var currentIndex = bounds.lowerBound while let ch = iterator.next() { guard currentIndex < bounds.upperBound, - ch.lowercased() == input[currentIndex].lowercased() + ch.caseFoldedEquals(input[currentIndex]) else { return nil } input.formIndex(after: &currentIndex) } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 90e573824..b3a85f254 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -67,7 +67,7 @@ extension DSLTree.Atom { return { input, bounds in let low = bounds.lowerBound if isCaseInsensitive && c.isCased { - return input[low].lowercased() == c.lowercased() + return input[low].caseFoldedEquals(c) ? 
input.index(after: low) : nil } else { diff --git a/Sources/_StringProcessing/Unicode/CaseConversion.swift b/Sources/_StringProcessing/Unicode/CaseConversion.swift index cfa68c425..c3706ddfe 100644 --- a/Sources/_StringProcessing/Unicode/CaseConversion.swift +++ b/Sources/_StringProcessing/Unicode/CaseConversion.swift @@ -9,6 +9,24 @@ // //===----------------------------------------------------------------------===// +@_spi(_Unicode) import Swift // TODO +extension Character { + /// Whether this character and `c` are equal when case folded. + func caseFoldedEquals(_ c: Character) -> Bool { + guard #available(SwiftStdlib 5.7, *) else { fatalError() } + let foldedSelf = unicodeScalars.lazy.map(\.properties._caseFolded).joined() + let foldedOther = c.unicodeScalars.lazy.map(\.properties._caseFolded).joined() + return foldedSelf.elementsEqual(foldedOther) + } +} + +extension UnicodeScalar { + /// Whether this Unicode scalar and `s` are equal when case folded. + func caseFoldedEquals(_ s: UnicodeScalar) -> Bool { + guard #available(SwiftStdlib 5.7, *) else { fatalError() } + return properties._caseFolded == s.properties._caseFolded + } +} diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index 85dd1ca37..c11554507 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -98,7 +98,7 @@ public struct _CharacterClassModel: Hashable { switch self { case .character(let c): if options.isCaseInsensitive { - return c.lowercased() == character.lowercased() + return c.caseFoldedEquals(character) } else { return c == character } diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index d13b47b8d..304187a9e 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -238,20 +238,31 @@ extension UTS18Tests { expectFirstMatch("Dåb", regex(#"Dåb"#).ignoresCase(), "Dåb") 
expectFirstMatch("dÅB", regex(#"Dåb"#).ignoresCase(), "dÅB") expectFirstMatch("D\u{212B}B", regex(#"Dåb"#).ignoresCase(), "D\u{212B}B") - } + + let sigmas = "σΣς" + expectFirstMatch(sigmas, regex(#"^σ+$"#).ignoresCase(), sigmas[...]) + expectFirstMatch(sigmas, regex(#"^Σ+$"#).ignoresCase(), sigmas[...]) + expectFirstMatch(sigmas, regex(#"^ς+$"#).ignoresCase(), sigmas[...]) - func testSimpleLooseMatches_XFail() { - XCTExpectFailure("Need case folding support") { - let sigmas = "σΣς" - expectFirstMatch(sigmas, regex(#"σ+"#).ignoresCase(), sigmas[...]) - expectFirstMatch(sigmas, regex(#"Σ+"#).ignoresCase(), sigmas[...]) - expectFirstMatch(sigmas, regex(#"ς+"#).ignoresCase(), sigmas[...]) - - // TODO: Test German sharp S - // TODO: Test char classes, e.g. [\p{Block=Phonetic_Extensions} [A-E]] + // Custom character classes + for regexCh in sigmas { + for inputCh in sigmas { + expectFirstMatch(String(inputCh), regex("[\(regexCh)]").ignoresCase(), String(inputCh)[...]) + if regexCh != inputCh { + XCTAssertFalse(String(inputCh).contains(regex("[\(regexCh)]"))) + } + } } + + expectFirstMatch("Strauß", regex("ß").ignoresCase(), "ß") + XCTExpectFailure { + expectFirstMatch("Strauss", regex("ß").ignoresCase(), "ss") + } + + // TODO: Test char classes, e.g. [\p{Block=Phonetic_Extensions} [A-E]] + // TODO: Document when full case folding applies } - + // RL1.6 Line Boundaries // // To meet this requirement, if an implementation provides for line-boundary