From e748aea642c490cdb0b5e1b0e46001c786a7b2b7 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Mon, 2 May 2022 15:05:10 -0500 Subject: [PATCH 01/24] Add NegativeLookahead and Anchor comments (#372) --- Sources/RegexBuilder/Anchor.swift | 97 +++++++++++++++++++-- Tests/RegexBuilderTests/RegexDSLTests.swift | 4 +- 2 files changed, 90 insertions(+), 11 deletions(-) diff --git a/Sources/RegexBuilder/Anchor.swift b/Sources/RegexBuilder/Anchor.swift index e8cd4ac54..ae66310af 100644 --- a/Sources/RegexBuilder/Anchor.swift +++ b/Sources/RegexBuilder/Anchor.swift @@ -12,6 +12,12 @@ @_implementationOnly import _RegexParser @_spi(RegexBuilder) import _StringProcessing +/// A regex component that matches a specific condition at a particular position +/// in an input string. +/// +/// You can use anchors to guarantee that a match only occurs at certain points +/// in an input string, such as at the beginning of the string or at the end of +/// a line. @available(SwiftStdlib 5.7, *) public struct Anchor { internal enum Kind { @@ -53,14 +59,24 @@ extension Anchor: RegexComponent { @available(SwiftStdlib 5.7, *) extension Anchor { + /// An anchor that matches at the start of the input string. + /// + /// This anchor is equivalent to `\A` in regex syntax. public static var startOfSubject: Anchor { Anchor(kind: .startOfSubject) } - + + /// An anchor that matches at the end of the input string or at the end of + /// the line immediately before the the end of the string. + /// + /// This anchor is equivalent to `\Z` in regex syntax. public static var endOfSubjectBeforeNewline: Anchor { Anchor(kind: .endOfSubjectBeforeNewline) } - + + /// An anchor that matches at the end of the input string. + /// + /// This anchor is equivalent to `\z` in regex syntax. public static var endOfSubject: Anchor { Anchor(kind: .endOfSubject) } @@ -70,26 +86,53 @@ extension Anchor { // Anchor(kind: resetStartOfMatch) // } + /// An anchor that matches at the first position of a match in the input + /// string. public static var firstMatchingPositionInSubject: Anchor { Anchor(kind: .firstMatchingPositionInSubject) } + /// An anchor that matches at a grapheme cluster boundary. + /// + /// This anchor is equivalent to `\y` in regex syntax. public static var textSegmentBoundary: Anchor { Anchor(kind: .textSegmentBoundary) } + /// An anchor that matches at the start of a line, including the start of + /// the input string. + /// + /// This anchor is equivalent to `^` in regex syntax when the `m` option + /// has been enabled or `anchorsMatchLineEndings(true)` has been called. public static var startOfLine: Anchor { Anchor(kind: .startOfLine) } + /// An anchor that matches at the end of a line, including at the end of + /// the input string. + /// + /// This anchor is equivalent to `$` in regex syntax when the `m` option + /// has been enabled or `anchorsMatchLineEndings(true)` has been called. public static var endOfLine: Anchor { Anchor(kind: .endOfLine) } + /// An anchor that matches at a word boundary. + /// + /// Word boundaries are identified using the Unicode default word boundary + /// algorithm by default. To specify a different word boundary algorithm, + /// see the `RegexComponent.wordBoundaryKind(_:)` method. + /// + /// This anchor is equivalent to `\b` in regex syntax. public static var wordBoundary: Anchor { Anchor(kind: .wordBoundary) } + /// The inverse of this anchor, which matches at every position that this + /// anchor does not. 
+ /// + /// For the `wordBoundary` and `textSegmentBoundary` anchors, the inverted + /// version corresponds to `\B` and `\Y`, respectively. public var inverted: Anchor { var result = self result.isInverted.toggle() @@ -97,6 +140,13 @@ extension Anchor { } } +/// A regex component that allows a match to continue only if its contents +/// match at the given location. +/// +/// A lookahead is a zero-length assertion that its included regex matches at +/// a particular position. Lookaheads do not advance the overall matching +/// position in the input string — once a lookahead succeeds, matching continues +/// in the regex from the same position. @available(SwiftStdlib 5.7, *) public struct Lookahead: _BuiltinRegexComponent { public var regex: Regex @@ -105,19 +155,48 @@ public struct Lookahead: _BuiltinRegexComponent { self.regex = regex } + /// Creates a lookahead from the given regex component. public init( - _ component: R, - negative: Bool = false + _ component: R ) where R.RegexOutput == Output { - self.init(node: .nonCapturingGroup( - negative ? .negativeLookahead : .lookahead, component.regex.root)) + self.init(node: .nonCapturingGroup(.lookahead, component.regex.root)) } + + /// Creates a lookahead from the regex generated by the given builder closure. + public init( + @RegexComponentBuilder _ component: () -> R + ) where R.RegexOutput == Output { + self.init(node: .nonCapturingGroup(.lookahead, component().regex.root)) + } +} +/// A regex component that allows a match to continue only if its contents +/// do not match at the given location. +/// +/// A negative lookahead is a zero-length assertion that its included regex +/// does not match at a particular position. Lookaheads do not advance the +/// overall matching position in the input string — once a lookahead succeeds, +/// matching continues in the regex from the same position. +@available(SwiftStdlib 5.7, *) +public struct NegativeLookahead: _BuiltinRegexComponent { + public var regex: Regex + + init(_ regex: Regex) { + self.regex = regex + } + + /// Creates a negative lookahead from the given regex component. + public init( + _ component: R + ) where R.RegexOutput == Output { + self.init(node: .nonCapturingGroup(.negativeLookahead, component.regex.root)) + } + + /// Creates a negative lookahead from the regex generated by the given builder + /// closure. public init( - negative: Bool = false, @RegexComponentBuilder _ component: () -> R ) where R.RegexOutput == Output { - self.init(node: .nonCapturingGroup( - negative ? 
.negativeLookahead : .lookahead, component().regex.root)) + self.init(node: .nonCapturingGroup(.negativeLookahead, component().regex.root)) } } diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 4e08ea103..be2b054a5 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -115,7 +115,7 @@ class RegexDSLTests: XCTestCase { { let disallowedChars = CharacterClass.hexDigit .symmetricDifference("a"..."z") - Lookahead(disallowedChars, negative: true) // No: 0-9 + g-z + NegativeLookahead(disallowedChars) // No: 0-9 + g-z OneOrMore(("b"..."g").union("d"..."n")) // b-n @@ -487,7 +487,7 @@ class RegexDSLTests: XCTestCase { { OneOrMore("a") Lookahead(CharacterClass.digit) - Lookahead("2", negative: true) + NegativeLookahead { "2" } CharacterClass.word } } From 13342eb03bf0d44dfa8608f9a4fad7176970bf43 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 3 May 2022 13:49:10 +0100 Subject: [PATCH 02/24] Add matching support for `\p{Lc}` This is defined in UAX#44 as being equivalent to `Lu | Ll | Lt`. --- .gitignore | 3 +++ Sources/_StringProcessing/ConsumerInterface.swift | 5 +++-- Tests/RegexTests/MatchTests.swift | 6 ++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index a7e7e4d09..ff85b9fa3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ .DS_Store +# The current toolchain is dumping files in the package root, rude +*.emit-module.* + # Xcode # # gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 356b7cc4b..a44c2c876 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -691,8 +691,9 @@ extension Unicode.ExtendedGeneralCategory { ]) case .casedLetter: - throw Unsupported( - "TODO: cased letter? not the property?") + return consumeScalarGCs([ + .uppercaseLetter, .lowercaseLetter, .titlecaseLetter + ]) case .control: return consumeScalarGC(.control) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 345e80e22..769538b74 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -693,6 +693,12 @@ extension RegexTests { firstMatchTest(#"\p{gc=L}"#, input: "123abcXYZ", match: "a") firstMatchTest(#"\p{Lu}"#, input: "123abcXYZ", match: "X") + // U+0374 GREEK NUMERAL SIGN (Lm) + // U+00AA FEMININE ORDINAL INDICATOR (Lo) + firstMatchTest(#"\p{L}"#, input: "\u{0374}\u{00AA}123abcXYZ", match: "\u{0374}") + firstMatchTest(#"\p{Lc}"#, input: "\u{0374}\u{00AA}123abcXYZ", match: "a") + firstMatchTest(#"\p{Lc}"#, input: "\u{0374}\u{00AA}123XYZ", match: "X") + firstMatchTest( #"\P{Cc}"#, input: "\n\n\nXYZ", match: "X") firstMatchTest( From 925f51bc863aac2ddaa640a01c28843dd48ad5fc Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 3 May 2022 13:49:11 +0100 Subject: [PATCH 03/24] Add parser support for `\p{L&}` This is a PCRE spelling for a cased letter. 
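As a rough usage sketch (not part of the change itself), `\p{L&}` should behave exactly
like `\p{Lc}` once parsed, accepting the Lu, Ll, and Lt general categories and rejecting
Lm and Lo. The snippet below reuses the sample scalars from the tests added in the
previous patch and assumes the Swift 5.7 throwing `Regex(_:)` initializer and the
`contains(_:)` overload that takes a regex:

    // Sketch only; exact behavior depends on the matching support added earlier.
    let casedLetter = try! Regex(#"\p{L&}"#)
    "X".contains(casedLetter)         // true:  Lu
    "x".contains(casedLetter)         // true:  Ll
    "\u{0374}".contains(casedLetter)  // false: Lm (GREEK NUMERAL SIGN)
    "\u{00AA}".contains(casedLetter)  // false: Lo (FEMININE ORDINAL INDICATOR)
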
--- .../Regex/Parse/CharacterPropertyClassification.swift | 6 +++--- Tests/RegexTests/MatchTests.swift | 2 ++ Tests/RegexTests/ParseTests.swift | 3 +++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift index 911312121..5cc920063 100644 --- a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift @@ -32,8 +32,8 @@ extension Source { static private func classifyGeneralCategory( _ str: String ) -> Unicode.ExtendedGeneralCategory? { - // This uses the aliases defined in - // https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt. + // This uses the aliases defined in https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt. + // Additionally, uses the `L& = Lc` alias defined by PCRE. withNormalizedForms(str) { str in switch str { case "c", "other": return .other @@ -43,7 +43,7 @@ extension Source { case "co", "privateuse": return .privateUse case "cs", "surrogate": return .surrogate case "l", "letter": return .letter - case "lc", "casedletter": return .casedLetter + case "lc", "l&", "casedletter": return .casedLetter case "ll", "lowercaseletter": return .lowercaseLetter case "lm", "modifierletter": return .modifierLetter case "lo", "otherletter": return .otherLetter diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 769538b74..2c6b858cc 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -698,6 +698,8 @@ extension RegexTests { firstMatchTest(#"\p{L}"#, input: "\u{0374}\u{00AA}123abcXYZ", match: "\u{0374}") firstMatchTest(#"\p{Lc}"#, input: "\u{0374}\u{00AA}123abcXYZ", match: "a") firstMatchTest(#"\p{Lc}"#, input: "\u{0374}\u{00AA}123XYZ", match: "X") + firstMatchTest(#"\p{L&}"#, input: "\u{0374}\u{00AA}123abcXYZ", match: "a") + firstMatchTest(#"\p{L&}"#, input: "\u{0374}\u{00AA}123XYZ", match: "X") firstMatchTest( #"\P{Cc}"#, input: "\n\n\nXYZ", match: "X") diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index aeefe6477..f0013b158 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1156,6 +1156,9 @@ extension RegexTests { #"\p{C}+"#, oneOrMore(of: prop(.generalCategory(.other)))) + // L& defined by PCRE. + parseTest(#"\p{L&}"#, prop(.generalCategory(.casedLetter))) + // UAX44-LM3 means all of the below are equivalent. 
let lowercaseLetter = prop(.generalCategory(.lowercaseLetter)) parseTest(#"\p{ll}"#, lowercaseLetter) From c44efeb345d1ead6b55f1415950299e36f276be2 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Tue, 3 May 2022 11:36:21 -0600 Subject: [PATCH 04/24] Update ProposalOverview.md --- Documentation/Evolution/ProposalOverview.md | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/Evolution/ProposalOverview.md b/Documentation/Evolution/ProposalOverview.md index 7656526a6..5f526f963 100644 --- a/Documentation/Evolution/ProposalOverview.md +++ b/Documentation/Evolution/ProposalOverview.md @@ -3,6 +3,7 @@ ## Regex Type and Overview +- [Second review](https://forums.swift.org/t/se-0350-second-review-regex-type-and-overview/56886) - [Proposal](https://github.com/apple/swift-evolution/blob/main/proposals/0350-regex-type-overview.md), [Thread](https://forums.swift.org/t/se-0350-regex-type-and-overview/56530) - [Pitch thread](https://forums.swift.org/t/pitch-regex-type-and-overview/56029) From 980185592604e3aebb81ad83caf198e1e66fd1d9 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Tue, 3 May 2022 12:55:18 -0600 Subject: [PATCH 05/24] Add tests for AnyRegexOutput (#371) * Add tests for AnyRegexOutput Fix a few missing areas of functionality while we're at it --- .../Regex/AnyRegexOutput.swift | 22 ++- Tests/RegexBuilderTests/RegexDSLTests.swift | 37 ----- Tests/RegexTests/AnyRegexOutputTests.swift | 157 ++++++++++++++++++ 3 files changed, 173 insertions(+), 43 deletions(-) create mode 100644 Tests/RegexTests/AnyRegexOutputTests.swift diff --git a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift index 23222da00..00fc2e952 100644 --- a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift +++ b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift @@ -62,6 +62,7 @@ public struct AnyRegexOutput { /// The depth of `Optioals`s wrapping the underlying value. For example, /// `Substring` has optional depth `0`, and `Int??` has optional depth `2`. let optionalDepth: Int + /// The bounds of the output element. let bounds: Range? } @@ -90,7 +91,7 @@ extension AnyRegexOutput { /// - Parameter type: The expected output type. /// - Returns: The output, if the underlying value can be converted to the /// output type; otherwise `nil`. - public func `as`(_ type: Output.Type) -> Output? { + public func `as`(_ type: Output.Type = Output.self) -> Output? { let elements = _elements.map { StructuredCapture( optionalCount: $0.optionalDepth, @@ -206,23 +207,30 @@ extension Regex.Match where Output == AnyRegexOutput { /// - Parameter type: The expected output type. /// - Returns: A match generic over the output type, if the underlying values /// can be converted to the output type; otherwise, `nil`. - public func `as`(_ type: Output.Type) -> Regex.Match? { + public func `as`( + _ type: Output.Type = Output.self + ) -> Regex.Match? { fatalError("FIXME: Not implemented") } } @available(SwiftStdlib 5.7, *) -extension Regex where Output == AnyRegexOutput { +extension Regex { /// Returns whether a named-capture with `name` exists public func contains(captureNamed name: String) -> Bool { - fatalError("FIXME: not implemented") + program.tree.root._captureList.captures.contains(where: { + $0.name == name + }) } +} +@available(SwiftStdlib 5.7, *) +extension Regex where Output == AnyRegexOutput { /// Creates a type-erased regex from an existing regex. 
/// /// Use this initializer to fit a regex with strongly typed captures into the /// use site of a dynamic regex, i.e. one that was created from a string. - public init(_ match: Regex) { + public init(_ regex: Regex) { fatalError("FIXME: Not implemented") } @@ -231,7 +239,9 @@ extension Regex where Output == AnyRegexOutput { /// - Parameter type: The expected output type. /// - Returns: A regex generic over the output type if the underlying types can be converted. /// Returns `nil` otherwise. - public func `as`(_ type: Output.Type) -> Regex? { + public func `as`( + _ type: Output.Type = Output.self + ) -> Regex? { fatalError("FIXME: Not implemented") } } diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index be2b054a5..5673aa348 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -742,43 +742,6 @@ class RegexDSLTests: XCTestCase { } } - func testDynamicCaptures() throws { - do { - let regex = try Regex("aabcc.") - let line = "aabccd" - let match = try XCTUnwrap(line.wholeMatch(of: regex)) - XCTAssertEqual(match.0, line[...]) - let output = match.output - XCTAssertEqual(output[0].substring, line[...]) - } - do { - let regex = try Regex( - #""" - (?[0-9A-F]+)(?:\.\.(?[0-9A-F]+))?\s+;\s+(?\w+).* - """#) - let line = """ - A6F0..A6F1 ; Extend # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM \ - COMBINING MARK TUKWENTIS - """ - let match = try XCTUnwrap(line.wholeMatch(of: regex)) - XCTAssertEqual(match.0, line[...]) - let output = match.output - XCTAssertEqual(output[0].substring, line[...]) - XCTAssertTrue(output[1].substring == "A6F0") - XCTAssertTrue(output["lower"]?.substring == "A6F0") - XCTAssertTrue(output[2].substring == "A6F1") - XCTAssertTrue(output["upper"]?.substring == "A6F1") - XCTAssertTrue(output[3].substring == "Extend") - XCTAssertTrue(output["desc"]?.substring == "Extend") - let typedOutput = try XCTUnwrap(output.as( - (Substring, lower: Substring, upper: Substring?, Substring).self)) - XCTAssertEqual(typedOutput.0, line[...]) - XCTAssertTrue(typedOutput.lower == "A6F0") - XCTAssertTrue(typedOutput.upper == "A6F1") - XCTAssertTrue(typedOutput.3 == "Extend") - } - } - func testBackreference() throws { try _testDSLCaptures( ("abc#41#42abcabcabc", ("abc#41#42abcabcabc", "abc", 42, "abc", nil)), diff --git a/Tests/RegexTests/AnyRegexOutputTests.swift b/Tests/RegexTests/AnyRegexOutputTests.swift new file mode 100644 index 000000000..8d91c0ec8 --- /dev/null +++ b/Tests/RegexTests/AnyRegexOutputTests.swift @@ -0,0 +1,157 @@ + +import _StringProcessing +import XCTest + +// Test that our existential capture and concrete captures are +// the same +private func checkSame( + _ aro: AnyRegexOutput, + _ concrete: (Substring, fieldA: Substring, fieldB: Substring) +) { + XCTAssertEqual(aro[0].substring, concrete.0) + + XCTAssertEqual(aro["fieldA"]!.substring, concrete.1) + XCTAssertEqual(aro["fieldA"]!.substring, concrete.fieldA) + + XCTAssertEqual(aro[1].substring, concrete.1) + + XCTAssertEqual(aro["fieldB"]!.substring, concrete.2) + XCTAssertEqual(aro["fieldB"]!.substring, concrete.fieldB) + + XCTAssertEqual(aro[2].substring, concrete.2) + +} +private func checkSame( + _ aro: Regex.Match, + _ concrete: Regex<(Substring, fieldA: Substring, fieldB: Substring)>.Match +) { + checkSame(aro.output, concrete.output) + + XCTAssertEqual(aro.0, concrete.0) + XCTAssertEqual(aro[0].substring, concrete.0) + + XCTAssertEqual(aro["fieldA"]!.substring, concrete.1) + 
XCTAssertEqual(aro["fieldA"]!.substring, concrete.fieldA) + XCTAssertEqual(aro[1].substring, concrete.1) + + XCTAssertEqual(aro["fieldB"]!.substring, concrete.2) + XCTAssertEqual(aro["fieldB"]!.substring, concrete.fieldB) + XCTAssertEqual(aro[2].substring, concrete.2) +} +private func checkSame( + _ aro: Regex, + _ concrete: Regex<(Substring, fieldA: Substring, fieldB: Substring)> +) { + XCTAssertEqual( + aro.contains(captureNamed: "fieldA"), + concrete.contains(captureNamed: "fieldA")) + XCTAssertEqual( + aro.contains(captureNamed: "fieldB"), + concrete.contains(captureNamed: "fieldB")) + XCTAssertEqual( + aro.contains(captureNamed: "notAField"), + concrete.contains(captureNamed: "notAField")) +} + +extension RegexTests { + func testAnyRegexOutput() { + let regex = try! Regex(#""" + (?x) + (? [^,]*) + , + (? [^,]*) + """#) + + let match = "abc,def".wholeMatch(of: regex)! + XCTAssertEqual(match.0, "abc,def") + XCTAssertEqual(match[0].substring, "abc,def") + + XCTAssertEqual(match["fieldA"]!.substring, "abc") + XCTAssertEqual(match.output["fieldA"]!.substring, "abc") + XCTAssertEqual(match[1].substring, "abc") + + XCTAssertEqual(match["fieldB"]!.substring, "def") + XCTAssertEqual(match.output["fieldB"]!.substring, "def") + XCTAssertEqual(match[2].substring, "def") + + XCTAssertNil(match["notACapture"]) + XCTAssertNil(match.output["notACapture"]) + XCTAssertEqual(match.count, 3) + + XCTAssert(regex.contains(captureNamed: "fieldA")) + XCTAssert(regex.contains(captureNamed: "fieldB")) + XCTAssertFalse(regex.contains(captureNamed: "notAField")) + + // MARK: Check equivalence with concrete + + let regexConcrete: + Regex<(Substring, fieldA: Substring, fieldB: Substring)> + = try! Regex(#""" + (?x) + (? [^,]*) + , + (? [^,]*) + """#) + checkSame(regex, regexConcrete) + + let matchConcrete = "abc,def".wholeMatch(of: regexConcrete)! + checkSame(match, matchConcrete) + + let output = match.output + let concreteOutput = matchConcrete.output + checkSame(output, concreteOutput) + + // TODO: ARO init from concrete match tuple + + let concreteOutputCasted = output.as( + (Substring, fieldA: Substring, fieldB: Substring).self + )! + checkSame(output, concreteOutputCasted) + + var concreteOutputCopy = concreteOutput + concreteOutputCopy = output.as()! 
+ checkSame(output, concreteOutputCopy) + + // TODO: Regex.Match: init from tuple match and as to tuple match + + // TODO: Regex: init from tuple regex and as cast to tuple regex + + } + + func testDynamicCaptures() throws { + do { + let regex = try Regex("aabcc.") + let line = "aabccd" + let match = try XCTUnwrap(line.wholeMatch(of: regex)) + XCTAssertEqual(match.0, line[...]) + let output = match.output + XCTAssertEqual(output[0].substring, line[...]) + } + do { + let regex = try Regex( + #""" + (?[0-9A-F]+)(?:\.\.(?[0-9A-F]+))?\s+;\s+(?\w+).* + """#) + let line = """ + A6F0..A6F1 ; Extend # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM \ + COMBINING MARK TUKWENTIS + """ + let match = try XCTUnwrap(line.wholeMatch(of: regex)) + XCTAssertEqual(match.0, line[...]) + let output = match.output + XCTAssertEqual(output[0].substring, line[...]) + XCTAssertTrue(output[1].substring == "A6F0") + XCTAssertTrue(output["lower"]?.substring == "A6F0") + XCTAssertTrue(output[2].substring == "A6F1") + XCTAssertTrue(output["upper"]?.substring == "A6F1") + XCTAssertTrue(output[3].substring == "Extend") + XCTAssertTrue(output["desc"]?.substring == "Extend") + let typedOutput = try XCTUnwrap(output.as( + (Substring, lower: Substring, upper: Substring?, Substring).self)) + XCTAssertEqual(typedOutput.0, line[...]) + XCTAssertTrue(typedOutput.lower == "A6F0") + XCTAssertTrue(typedOutput.upper == "A6F1") + XCTAssertTrue(typedOutput.3 == "Extend") + } + } +} From 0e5cfa8dfa30524258525d9bf4931a8e1fca1c31 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 4 May 2022 11:47:47 +0100 Subject: [PATCH 06/24] Rename noAutoCapture -> namedCapturesOnly --- Sources/_RegexParser/Regex/AST/MatchingOptions.swift | 2 +- Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift | 2 +- Sources/_StringProcessing/MatchingOptions.swift | 6 +++--- Tests/RegexTests/ParseTests.swift | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/MatchingOptions.swift b/Sources/_RegexParser/Regex/AST/MatchingOptions.swift index e779c39fb..d3dbc1666 100644 --- a/Sources/_RegexParser/Regex/AST/MatchingOptions.swift +++ b/Sources/_RegexParser/Regex/AST/MatchingOptions.swift @@ -17,7 +17,7 @@ extension AST { case caseInsensitive // i case allowDuplicateGroupNames // J case multiline // m - case noAutoCapture // n + case namedCapturesOnly // n case singleLine // s case reluctantByDefault // U case extended // x diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 9633b607e..92270c92a 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -616,7 +616,7 @@ extension Source { case "i": return advanceAndReturn(.caseInsensitive) case "J": return advanceAndReturn(.allowDuplicateGroupNames) case "m": return advanceAndReturn(.multiline) - case "n": return advanceAndReturn(.noAutoCapture) + case "n": return advanceAndReturn(.namedCapturesOnly) case "s": return advanceAndReturn(.singleLine) case "U": return advanceAndReturn(.reluctantByDefault) case "x": diff --git a/Sources/_StringProcessing/MatchingOptions.swift b/Sources/_StringProcessing/MatchingOptions.swift index 665715a60..34a6e6f9a 100644 --- a/Sources/_StringProcessing/MatchingOptions.swift +++ b/Sources/_StringProcessing/MatchingOptions.swift @@ -135,7 +135,7 @@ extension MatchingOptions { case caseInsensitive case allowDuplicateGroupNames case multiline - case noAutoCapture + case namedCapturesOnly case 
singleLine case reluctantByDefault @@ -174,8 +174,8 @@ extension MatchingOptions { self = .allowDuplicateGroupNames case .multiline: self = .multiline - case .noAutoCapture: - self = .noAutoCapture + case .namedCapturesOnly: + self = .namedCapturesOnly case .singleLine: self = .singleLine case .reluctantByDefault: diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index f0013b158..96cc6bdf9 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -906,7 +906,7 @@ extension RegexTests { )) let allOptions: [AST.MatchingOption.Kind] = [ - .caseInsensitive, .allowDuplicateGroupNames, .multiline, .noAutoCapture, + .caseInsensitive, .allowDuplicateGroupNames, .multiline, .namedCapturesOnly, .singleLine, .reluctantByDefault, .extraExtended, .extended, .unicodeWordBoundaries, .asciiOnlyDigit, .asciiOnlyPOSIXProps, .asciiOnlySpace, .asciiOnlyWord, .textSegmentGraphemeMode, From 2a4b3a60c78ae8e0daeb1da7463222d620dac220 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 4 May 2022 11:47:48 +0100 Subject: [PATCH 07/24] Implement the `(?n)` option This switches `(...)` groups to being non-capturing, with only named groups capturing. --- .../Regex/Parse/LexicalAnalysis.swift | 4 +++ Sources/_RegexParser/Regex/Parse/Parse.swift | 33 ++++++++++++------- .../Regex/Parse/SyntaxOptions.swift | 3 ++ Tests/RegexTests/ParseTests.swift | 7 ++++ 4 files changed, 36 insertions(+), 11 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 92270c92a..c2cce67e8 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -914,6 +914,10 @@ extension Source { } // TODO: (name:) + // If (?n) is set, a bare (...) group is non-capturing. + if context.syntax.contains(.namedCapturesOnly) { + return .nonCapture + } return .capture } } diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index ec6e1c26c..54e46948a 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -287,23 +287,34 @@ extension Parser { private mutating func applySyntaxOptions( of opts: AST.MatchingOptionSequence ) { - // We skip this for multi-line, as extended syntax is always enabled there. - if context.syntax.contains(.multilineExtendedSyntax) { return } + func mapOption(_ option: SyntaxOptions, + _ pred: (AST.MatchingOption) -> Bool) { + if opts.resetsCurrentOptions { + context.syntax.remove(option) + } + if opts.adding.contains(where: pred) { + context.syntax.insert(option) + } + if opts.removing.contains(where: pred) { + context.syntax.remove(option) + } + } + func mapOption(_ option: SyntaxOptions, _ kind: AST.MatchingOption.Kind) { + mapOption(option, { $0.kind == kind }) + } + + // (?n) + mapOption(.namedCapturesOnly, .namedCapturesOnly) - // Check if we're introducing or removing extended syntax. + // (?x), (?xx) + // We skip this for multi-line, as extended syntax is always enabled there. // TODO: PCRE differentiates between (?x) and (?xx) where only the latter // handles non-semantic whitespace in a custom character class. Other // engines such as Oniguruma, Java, and ICU do this under (?x). Therefore, // treat (?x) and (?xx) as the same option here. If we ever get a strict // PCRE mode, we will need to change this to handle that. 
- if opts.resetsCurrentOptions { - context.syntax.remove(.extendedSyntax) - } - if opts.adding.contains(where: \.isAnyExtended) { - context.syntax.insert(.extendedSyntax) - } - if opts.removing.contains(where: \.isAnyExtended) { - context.syntax.remove(.extendedSyntax) + if !context.syntax.contains(.multilineExtendedSyntax) { + mapOption(.extendedSyntax, \.isAnyExtended) } } diff --git a/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift b/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift index 0a6270f1b..dbfe5f2d6 100644 --- a/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift +++ b/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift @@ -63,6 +63,9 @@ public struct SyntaxOptions: OptionSet { return [Self(1 << 6), .extendedSyntax] } + /// `(?n)` + public static var namedCapturesOnly: Self { Self(1 << 7) } + /* /// `*` == `[[:digit:]]*` == `\d*` diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 96cc6bdf9..831f904c6 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -973,6 +973,13 @@ extension RegexTests { "d" )), captures: [.cap]) + parseTest("(?n)(?^:())(?)()", concat( + changeMatchingOptions(matchingOptions(adding: .namedCapturesOnly)), + changeMatchingOptions(unsetMatchingOptions(), capture(empty())), + namedCapture("x", empty()), + nonCapture(empty()) + ), captures: [.cap, .named("x")]) + // MARK: References // \1 ... \9 are always backreferences. From 6d833aa25f321f3c351551cfa952499a86784540 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 5 May 2022 15:43:38 -0500 Subject: [PATCH 08/24] Improve Unicode/UTS18 and semantic level support (#268) * Add tests for UTS18 level support (incomplete) * Implement canonical equivalence tests * Fix canonical equivalence at different levels * Test named chars x semantic level * Enable loose matching on \N{...} scalar names * Make Unicode property classes work with semantics --- Package.swift | 2 +- Sources/_StringProcessing/ByteCodeGen.swift | 56 +- .../_StringProcessing/ConsumerInterface.swift | 372 ++++++----- .../Regex/ASTConversion.swift | 12 +- .../Unicode/CharacterProps.swift | 6 + .../_CharacterClassModel.swift | 7 +- Tests/RegexTests/MatchTests.swift | 86 ++- Tests/RegexTests/UTS18Tests.swift | 589 ++++++++++++++++++ 8 files changed, 933 insertions(+), 197 deletions(-) create mode 100644 Tests/RegexTests/UTS18Tests.swift diff --git a/Package.swift b/Package.swift index f8162e762..8303fc5cb 100644 --- a/Package.swift +++ b/Package.swift @@ -67,7 +67,7 @@ let package = Package( name: "RegexTests", dependencies: ["_StringProcessing"], swiftSettings: [ - .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]) + .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]), ]), .testTarget( name: "RegexBuilderTests", diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 21fcfa703..2131d1eb5 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -168,7 +168,15 @@ extension Compiler.ByteCodeGen { } mutating func emitCharacter(_ c: Character) throws { - // FIXME: Does semantic level matter? 
+ // Unicode scalar matches the specific scalars that comprise a character + if options.semanticLevel == .unicodeScalar { + print("emitting '\(c)' as a sequence of \(c.unicodeScalars.count) scalars") + for scalar in c.unicodeScalars { + try emitScalar(scalar) + } + return + } + if options.isCaseInsensitive && c.isCased { // TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true) builder.buildConsume { input, bounds in @@ -625,22 +633,44 @@ extension Compiler.ByteCodeGen { try emitAtom(a) case let .quotedLiteral(s): - // TODO: Should this incorporate options? - if options.isCaseInsensitive { - // TODO: buildCaseInsensitiveMatchSequence(c) or alternative - builder.buildConsume { input, bounds in - var iterator = s.makeIterator() + if options.semanticLevel == .graphemeCluster { + if options.isCaseInsensitive { + // TODO: buildCaseInsensitiveMatchSequence(c) or alternative + builder.buildConsume { input, bounds in + var iterator = s.makeIterator() + var currentIndex = bounds.lowerBound + while let ch = iterator.next() { + guard currentIndex < bounds.upperBound, + ch.lowercased() == input[currentIndex].lowercased() + else { return nil } + input.formIndex(after: ¤tIndex) + } + return currentIndex + } + } else { + builder.buildMatchSequence(s) + } + } else { + builder.buildConsume { + [caseInsensitive = options.isCaseInsensitive] input, bounds in + // TODO: Case folding + var iterator = s.unicodeScalars.makeIterator() var currentIndex = bounds.lowerBound - while let ch = iterator.next() { - guard currentIndex < bounds.upperBound, - ch.lowercased() == input[currentIndex].lowercased() - else { return nil } - input.formIndex(after: ¤tIndex) + while let scalar = iterator.next() { + guard currentIndex < bounds.upperBound else { return nil } + if caseInsensitive { + if scalar.properties.lowercaseMapping != input.unicodeScalars[currentIndex].properties.lowercaseMapping { + return nil + } + } else { + if scalar != input.unicodeScalars[currentIndex] { + return nil + } + } + input.unicodeScalars.formIndex(after: ¤tIndex) } return currentIndex } - } else { - builder.buildMatchSequence(s) } case let .regexLiteral(l): diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index a44c2c876..d27b89314 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -111,6 +111,38 @@ extension DSLTree.Atom { } } +extension String { + /// Compares this string to `other` using the loose matching rule UAX44-LM2, + /// which ignores case, whitespace, underscores, and nearly all medial + /// hyphens. 
+ /// + /// FIXME: Only ignore medial hyphens + /// FIXME: Special case for U+1180 HANGUL JUNGSEONG O-E + /// See https://www.unicode.org/reports/tr44/#Matching_Rules + fileprivate func isEqualByUAX44LM2(to other: String) -> Bool { + var index = startIndex + var otherIndex = other.startIndex + + while index < endIndex && otherIndex < other.endIndex { + if self[index].isWhitespace || self[index] == "-" || self[index] == "_" { + formIndex(after: &index) + continue + } + if other[otherIndex].isWhitespace || other[otherIndex] == "-" || other[otherIndex] == "_" { + other.formIndex(after: &otherIndex) + continue + } + + if self[index] != other[otherIndex] && self[index].lowercased() != other[otherIndex].lowercased() { + return false + } + + formIndex(after: &index) + other.formIndex(after: &otherIndex) + } + return index == endIndex && otherIndex == other.endIndex + } +} // TODO: This is basically an AST interpreter, which would // be good or interesting to build regardless, and serves @@ -131,6 +163,13 @@ extension AST.Atom { } } + var singleScalar: UnicodeScalar? { + switch kind { + case .scalar(let s): return s + default: return nil + } + } + func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction? { @@ -167,10 +206,12 @@ extension AST.Atom { return try p.generateConsumer(opts) case let .namedCharacter(name): - return consumeScalarProp { - // TODO: alias? casing? - $0.name == name || $0.nameAlias == name - } + return consumeScalar(propertyScalarPredicate { + // FIXME: name aliases not covered by $0.nameAlias are missed + // e.g. U+FEFF is also 'FORM FEED', 'BYTE ORDER MARK', and 'BOM' + $0.name?.isEqualByUAX44LM2(to: name) == true + || $0.nameAlias?.isEqualByUAX44LM2(to: name) == true + }) case .any: assertionFailure( @@ -312,8 +353,9 @@ extension DSLTree.CustomCharacterClass { } } if isInverted { - // FIXME: semantic level - return input.index(after: bounds.lowerBound) + return opts.semanticLevel == .graphemeCluster + ? 
input.index(after: bounds.lowerBound) + : input.unicodeScalars.index(after: bounds.lowerBound) } return nil } @@ -321,38 +363,26 @@ extension DSLTree.CustomCharacterClass { } // NOTE: Conveniences, though not most performant -private func consumeScalarScript( - _ s: Unicode.Script -) -> MEProgram.ConsumeFunction { - consumeScalar { - Unicode.Script($0) == s - } +typealias ScalarPredicate = (UnicodeScalar) -> Bool + +private func scriptScalarPredicate(_ s: Unicode.Script) -> ScalarPredicate { + { Unicode.Script($0) == s } } -private func consumeScalarScriptExtension( - _ s: Unicode.Script -) -> MEProgram.ConsumeFunction { - consumeScalar { - let extensions = Unicode.Script.extensions(for: $0) - return extensions.contains(s) - } +private func scriptExtensionScalarPredicate(_ s: Unicode.Script) -> ScalarPredicate { + { Unicode.Script.extensions(for: $0).contains(s) } } -private func consumeScalarGC( - _ gc: Unicode.GeneralCategory -) -> MEProgram.ConsumeFunction { - consumeScalar { gc == $0.properties.generalCategory } +private func categoryScalarPredicate(_ gc: Unicode.GeneralCategory) -> ScalarPredicate { + { gc == $0.properties.generalCategory } } -private func consumeScalarGCs( - _ gcs: [Unicode.GeneralCategory] -) -> MEProgram.ConsumeFunction { - consumeScalar { gcs.contains($0.properties.generalCategory) } +private func categoriesScalarPredicate(_ gcs: [Unicode.GeneralCategory]) -> ScalarPredicate { + { gcs.contains($0.properties.generalCategory) } } -private func consumeScalarProp( - _ p: @escaping (Unicode.Scalar.Properties) -> Bool -) -> MEProgram.ConsumeFunction { - consumeScalar { p($0.properties) } +private func propertyScalarPredicate(_ p: @escaping (Unicode.Scalar.Properties) -> Bool) -> ScalarPredicate { + { p($0.properties) } } + func consumeScalar( - _ p: @escaping (Unicode.Scalar) -> Bool + _ p: @escaping ScalarPredicate ) -> MEProgram.ConsumeFunction { { input, bounds in // TODO: bounds check? @@ -364,6 +394,37 @@ func consumeScalar( return nil } } +func consumeCharacterWithLeadingScalar( + _ p: @escaping ScalarPredicate +) -> MEProgram.ConsumeFunction { + { input, bounds in + let curIdx = bounds.lowerBound + if p(input[curIdx].unicodeScalars.first!) { + return input.index(after: curIdx) + } + return nil + } +} +func consumeCharacterWithSingleScalar( + _ p: @escaping ScalarPredicate +) -> MEProgram.ConsumeFunction { + { input, bounds in + let curIdx = bounds.lowerBound + + if input[curIdx].hasExactlyOneScalar && p(input[curIdx].unicodeScalars.first!) { + return input.index(after: curIdx) + } + return nil + } +} + +func consumeFunction( + for opts: MatchingOptions +) -> (@escaping ScalarPredicate) -> MEProgram.ConsumeFunction { + opts.semanticLevel == .graphemeCluster + ? consumeCharacterWithLeadingScalar + : consumeScalar +} extension AST.Atom.CharacterProperty { func generateConsumer( @@ -375,16 +436,15 @@ extension AST.Atom.CharacterProperty { ) -> MEProgram.ConsumeFunction { return { input, bounds in if p(input, bounds) != nil { return nil } - // TODO: semantic level + // TODO: bounds check - return input.unicodeScalars.index( - after: bounds.lowerBound) + return opts.semanticLevel == .graphemeCluster + ? input.index(after: bounds.lowerBound) + : input.unicodeScalars.index(after: bounds.lowerBound) } } - // FIXME: Below is largely scalar based, for convenience, - // but we want a comprehensive treatment to semantic mode - // switching. 
+ let consume = consumeFunction(for: opts) let preInversion: MEProgram.ConsumeFunction = try { switch kind { @@ -395,11 +455,16 @@ extension AST.Atom.CharacterProperty { return input.index(after: bounds.lowerBound) } case .assigned: - return consumeScalar { + return consume { $0.properties.generalCategory != .unassigned } case .ascii: - return consumeScalar(\.isASCII) + // Note: ASCII must look at the whole character, not just the first + // scalar. That is, "e\u{301}" is not an ASCII character, even though + // the first scalar is. + return opts.semanticLevel == .graphemeCluster + ? consumeCharacterWithSingleScalar(\.isASCII) + : consumeScalar(\.isASCII) case .generalCategory(let p): return try p.generateConsumer(opts) @@ -410,10 +475,10 @@ extension AST.Atom.CharacterProperty { return value ? cons : invert(cons) case .script(let s): - return consumeScalarScript(s) + return consume(scriptScalarPredicate(s)) case .scriptExtension(let s): - return consumeScalarScriptExtension(s) + return consume(scriptExtensionScalarPredicate(s)) case .posix(let p): return p.generateConsumer(opts) @@ -436,49 +501,48 @@ extension Unicode.BinaryProperty { func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction { + let consume = consumeFunction(for: opts) + switch self { - case .asciiHexDigit: - return consumeScalarProp { + return consume(propertyScalarPredicate { $0.isHexDigit && $0.isASCIIHexDigit - } + }) case .alphabetic: - return consumeScalarProp(\.isAlphabetic) + return consume(propertyScalarPredicate(\.isAlphabetic)) case .bidiControl: break - - - case .bidiMirrored: - return consumeScalarProp(\.isBidiMirrored) + case .bidiMirrored: + return consume(propertyScalarPredicate(\.isBidiMirrored)) case .cased: - return consumeScalarProp(\.isCased) + return consume(propertyScalarPredicate(\.isCased)) case .compositionExclusion: break case .caseIgnorable: - return consumeScalarProp(\.isCaseIgnorable) + return consume(propertyScalarPredicate(\.isCaseIgnorable)) case .changesWhenCasefolded: - return consumeScalarProp(\.changesWhenCaseFolded) + return consume(propertyScalarPredicate(\.changesWhenCaseFolded)) case .changesWhenCasemapped: - return consumeScalarProp(\.changesWhenCaseMapped) + return consume(propertyScalarPredicate(\.changesWhenCaseMapped)) case .changesWhenNFKCCasefolded: - return consumeScalarProp(\.changesWhenNFKCCaseFolded) + return consume(propertyScalarPredicate(\.changesWhenNFKCCaseFolded)) case .changesWhenLowercased: - return consumeScalarProp(\.changesWhenLowercased) + return consume(propertyScalarPredicate(\.changesWhenLowercased)) case .changesWhenTitlecased: - return consumeScalarProp(\.changesWhenTitlecased) + return consume(propertyScalarPredicate(\.changesWhenTitlecased)) case .changesWhenUppercased: - return consumeScalarProp(\.changesWhenUppercased) + return consume(propertyScalarPredicate(\.changesWhenUppercased)) case .dash: - return consumeScalarProp(\.isDash) + return consume(propertyScalarPredicate(\.isDash)) case .deprecated: - return consumeScalarProp(\.isDeprecated) + return consume(propertyScalarPredicate(\.isDeprecated)) case .defaultIgnorableCodePoint: - return consumeScalarProp(\.isDefaultIgnorableCodePoint) + return consume(propertyScalarPredicate(\.isDefaultIgnorableCodePoint)) case .diacratic: // spelling? 
- return consumeScalarProp(\.isDiacritic) + return consume(propertyScalarPredicate(\.isDiacritic)) case .emojiModifierBase: if #available(macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1, *) { - return consumeScalarProp(\.isEmojiModifierBase) + return consume(propertyScalarPredicate(\.isEmojiModifierBase)) } else { throw Unsupported( "isEmojiModifierBase on old OSes") @@ -487,59 +551,59 @@ extension Unicode.BinaryProperty { break case .emojiModifier: if #available(macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1, *) { - return consumeScalarProp(\.isEmojiModifier) + return consume(propertyScalarPredicate(\.isEmojiModifier)) } else { throw Unsupported("isEmojiModifier on old OSes") } case .emoji: if #available(macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1, *) { - return consumeScalarProp(\.isEmoji) + return consume(propertyScalarPredicate(\.isEmoji)) } else { throw Unsupported("isEmoji on old OSes") } case .emojiPresentation: if #available(macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1, *) { - return consumeScalarProp(\.isEmojiPresentation) + return consume(propertyScalarPredicate(\.isEmojiPresentation)) } else { throw Unsupported( "isEmojiPresentation on old OSes") } case .extender: - return consumeScalarProp(\.isExtender) + return consume(propertyScalarPredicate(\.isExtender)) case .extendedPictographic: break // NOTE: Stdlib has this data internally case .fullCompositionExclusion: - return consumeScalarProp(\.isFullCompositionExclusion) + return consume(propertyScalarPredicate(\.isFullCompositionExclusion)) case .graphemeBase: - return consumeScalarProp(\.isGraphemeBase) + return consume(propertyScalarPredicate(\.isGraphemeBase)) case .graphemeExtended: - return consumeScalarProp(\.isGraphemeExtend) + return consume(propertyScalarPredicate(\.isGraphemeExtend)) case .graphemeLink: break case .hexDigit: - return consumeScalarProp(\.isHexDigit) + return consume(propertyScalarPredicate(\.isHexDigit)) case .hyphen: break case .idContinue: - return consumeScalarProp(\.isIDContinue) + return consume(propertyScalarPredicate(\.isIDContinue)) case .ideographic: - return consumeScalarProp(\.isIdeographic) + return consume(propertyScalarPredicate(\.isIdeographic)) case .idStart: - return consumeScalarProp(\.isIDStart) + return consume(propertyScalarPredicate(\.isIDStart)) case .idsBinaryOperator: - return consumeScalarProp(\.isIDSBinaryOperator) + return consume(propertyScalarPredicate(\.isIDSBinaryOperator)) case .idsTrinaryOperator: - return consumeScalarProp(\.isIDSTrinaryOperator) + return consume(propertyScalarPredicate(\.isIDSTrinaryOperator)) case .joinControl: - return consumeScalarProp(\.isJoinControl) + return consume(propertyScalarPredicate(\.isJoinControl)) case .logicalOrderException: - return consumeScalarProp(\.isLogicalOrderException) + return consume(propertyScalarPredicate(\.isLogicalOrderException)) case .lowercase: - return consumeScalarProp(\.isLowercase) + return consume(propertyScalarPredicate(\.isLowercase)) case .math: - return consumeScalarProp(\.isMath) + return consume(propertyScalarPredicate(\.isMath)) case .noncharacterCodePoint: - return consumeScalarProp(\.isNoncharacterCodePoint) + return consume(propertyScalarPredicate(\.isNoncharacterCodePoint)) case .otherAlphabetic: break case .otherDefaultIgnorableCodePoint: @@ -557,37 +621,37 @@ extension Unicode.BinaryProperty { case .otherUppercase: break case .patternSyntax: - return consumeScalarProp(\.isPatternSyntax) + return consume(propertyScalarPredicate(\.isPatternSyntax)) case .patternWhitespace: - return 
consumeScalarProp(\.isPatternWhitespace) + return consume(propertyScalarPredicate(\.isPatternWhitespace)) case .prependedConcatenationMark: break case .quotationMark: - return consumeScalarProp(\.isQuotationMark) + return consume(propertyScalarPredicate(\.isQuotationMark)) case .radical: - return consumeScalarProp(\.isRadical) + return consume(propertyScalarPredicate(\.isRadical)) case .regionalIndicator: - return consumeScalar { s in + return consume { s in (0x1F1E6...0x1F1FF).contains(s.value) } case .softDotted: - return consumeScalarProp(\.isSoftDotted) + return consume(propertyScalarPredicate(\.isSoftDotted)) case .sentenceTerminal: - return consumeScalarProp(\.isSentenceTerminal) + return consume(propertyScalarPredicate(\.isSentenceTerminal)) case .terminalPunctuation: - return consumeScalarProp(\.isTerminalPunctuation) + return consume(propertyScalarPredicate(\.isTerminalPunctuation)) case .unifiedIdiograph: // spelling? - return consumeScalarProp(\.isUnifiedIdeograph) + return consume(propertyScalarPredicate(\.isUnifiedIdeograph)) case .uppercase: - return consumeScalarProp(\.isUppercase) + return consume(propertyScalarPredicate(\.isUppercase)) case .variationSelector: - return consumeScalarProp(\.isVariationSelector) + return consume(propertyScalarPredicate(\.isVariationSelector)) case .whitespace: - return consumeScalarProp(\.isWhitespace) + return consume(propertyScalarPredicate(\.isWhitespace)) case .xidContinue: - return consumeScalarProp(\.isXIDContinue) + return consume(propertyScalarPredicate(\.isXIDContinue)) case .xidStart: - return consumeScalarProp(\.isXIDStart) + return consume(propertyScalarPredicate(\.isXIDStart)) case .expandsOnNFC, .expandsOnNFD, .expandsOnNFKD, .expandsOnNFKC: throw Unsupported("Unicode-deprecated: \(self)") @@ -602,42 +666,44 @@ extension Unicode.POSIXProperty { func generateConsumer( _ opts: MatchingOptions ) -> MEProgram.ConsumeFunction { - // FIXME: semantic levels, modes, etc + let consume = consumeFunction(for: opts) + + // FIXME: modes, etc switch self { case .alnum: - return consumeScalarProp { + return consume(propertyScalarPredicate { $0.isAlphabetic || $0.numericType != nil - } + }) case .blank: - return consumeScalar { s in + return consume { s in s.properties.generalCategory == .spaceSeparator || s == "\t" } case .graph: - return consumeScalarProp { p in + return consume(propertyScalarPredicate { p in !( p.isWhitespace || p.generalCategory == .control || p.generalCategory == .surrogate || p.generalCategory == .unassigned ) - } + }) case .print: - return consumeScalarProp { p in + return consume(propertyScalarPredicate { p in // FIXME: better def p.generalCategory != .control - } + }) case .word: - return consumeScalarProp { p in + return consume(propertyScalarPredicate { p in // FIXME: better def p.isAlphabetic || p.numericType != nil || p.isJoinControl || p.isDash// marks and connectors... 
- } + }) case .xdigit: - return consumeScalarProp(\.isHexDigit) // or number + return consume(propertyScalarPredicate(\.isHexDigit)) // or number } } @@ -648,113 +714,115 @@ extension Unicode.ExtendedGeneralCategory { func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction { + let consume = consumeFunction(for: opts) + switch self { case .letter: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .uppercaseLetter, .lowercaseLetter, .titlecaseLetter, .modifierLetter, .otherLetter - ]) + ])) case .mark: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .nonspacingMark, .spacingMark, .enclosingMark - ]) + ])) case .number: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .decimalNumber, .letterNumber, .otherNumber - ]) + ])) case .symbol: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .mathSymbol, .currencySymbol, .modifierSymbol, .otherSymbol - ]) + ])) case .punctuation: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .connectorPunctuation, .dashPunctuation, .openPunctuation, .closePunctuation, .initialPunctuation, .finalPunctuation, .otherPunctuation - ]) + ])) case .separator: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .spaceSeparator, .lineSeparator, .paragraphSeparator - ]) + ])) case .other: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .control, .format, .surrogate, .privateUse, .unassigned - ]) + ])) case .casedLetter: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .uppercaseLetter, .lowercaseLetter, .titlecaseLetter - ]) + ])) case .control: - return consumeScalarGC(.control) + return consume(categoryScalarPredicate(.control)) case .format: - return consumeScalarGC(.format) + return consume(categoryScalarPredicate(.format)) case .unassigned: - return consumeScalarGC(.unassigned) + return consume(categoryScalarPredicate(.unassigned)) case .privateUse: - return consumeScalarGC(.privateUse) + return consume(categoryScalarPredicate(.privateUse)) case .surrogate: - return consumeScalarGC(.surrogate) + return consume(categoryScalarPredicate(.surrogate)) case .lowercaseLetter: - return consumeScalarGC(.lowercaseLetter) + return consume(categoryScalarPredicate(.lowercaseLetter)) case .modifierLetter: - return consumeScalarGC(.modifierLetter) + return consume(categoryScalarPredicate(.modifierLetter)) case .otherLetter: - return consumeScalarGC(.otherLetter) + return consume(categoryScalarPredicate(.otherLetter)) case .titlecaseLetter: - return consumeScalarGC(.titlecaseLetter) + return consume(categoryScalarPredicate(.titlecaseLetter)) case .uppercaseLetter: - return consumeScalarGC(.uppercaseLetter) + return consume(categoryScalarPredicate(.uppercaseLetter)) case .spacingMark: - return consumeScalarGC(.spacingMark) + return consume(categoryScalarPredicate(.spacingMark)) case .enclosingMark: - return consumeScalarGC(.enclosingMark) + return consume(categoryScalarPredicate(.enclosingMark)) case .nonspacingMark: - return consumeScalarGC(.nonspacingMark) + return consume(categoryScalarPredicate(.nonspacingMark)) case .decimalNumber: - return consumeScalarGC(.decimalNumber) + return consume(categoryScalarPredicate(.decimalNumber)) case .letterNumber: - return consumeScalarGC(.letterNumber) + return consume(categoryScalarPredicate(.letterNumber)) case .otherNumber: - return consumeScalarGC(.otherNumber) + return consume(categoryScalarPredicate(.otherNumber)) case 
.connectorPunctuation: - return consumeScalarGC(.connectorPunctuation) + return consume(categoryScalarPredicate(.connectorPunctuation)) case .dashPunctuation: - return consumeScalarGC(.dashPunctuation) + return consume(categoryScalarPredicate(.dashPunctuation)) case .closePunctuation: - return consumeScalarGC(.closePunctuation) + return consume(categoryScalarPredicate(.closePunctuation)) case .finalPunctuation: - return consumeScalarGC(.finalPunctuation) + return consume(categoryScalarPredicate(.finalPunctuation)) case .initialPunctuation: - return consumeScalarGC(.initialPunctuation) + return consume(categoryScalarPredicate(.initialPunctuation)) case .otherPunctuation: - return consumeScalarGC(.otherPunctuation) + return consume(categoryScalarPredicate(.otherPunctuation)) case .openPunctuation: - return consumeScalarGC(.openPunctuation) + return consume(categoryScalarPredicate(.openPunctuation)) case .currencySymbol: - return consumeScalarGC(.currencySymbol) + return consume(categoryScalarPredicate(.currencySymbol)) case .modifierSymbol: - return consumeScalarGC(.modifierSymbol) + return consume(categoryScalarPredicate(.modifierSymbol)) case .mathSymbol: - return consumeScalarGC(.mathSymbol) + return consume(categoryScalarPredicate(.mathSymbol)) case .otherSymbol: - return consumeScalarGC(.otherSymbol) + return consume(categoryScalarPredicate(.otherSymbol)) case .lineSeparator: - return consumeScalarGC(.lineSeparator) + return consume(categoryScalarPredicate(.lineSeparator)) case .paragraphSeparator: - return consumeScalarGC(.paragraphSeparator) + return consume(categoryScalarPredicate(.paragraphSeparator)) case .spaceSeparator: - return consumeScalarGC(.spaceSeparator) + return consume(categoryScalarPredicate(.spaceSeparator)) } } } diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index ef98a7b8f..47433dc42 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -65,13 +65,17 @@ extension AST.Node { // TODO: For printing, nice to coalesce // scalars literals too. We likely need a different // approach even before we have a better IR. - guard let char = atom?.singleCharacter else { + if let char = atom?.singleCharacter { + result.append(char) + } else if let scalar = atom?.singleScalar { + result.append(Character(scalar)) + } else { break } - result.append(char) + astChildren.formIndex(after: &idx) } - return result.count <= 1 ? nil : (idx, result) + return result.isEmpty ? nil : (idx, result) } // No need to nest single children concatenations @@ -207,7 +211,7 @@ extension AST.Atom { switch self.kind { case let .char(c): return .char(c) - case let .scalar(s): return .scalar(s) + case let .scalar(s): return .char(Character(s)) case .any: return .any case let .backreference(r): return .backreference(.init(ast: r)) case let .changeMatchingOptions(seq): return .changeMatchingOptions(.init(ast: seq)) diff --git a/Sources/_StringProcessing/Unicode/CharacterProps.swift b/Sources/_StringProcessing/Unicode/CharacterProps.swift index cfa68c425..80f6819a6 100644 --- a/Sources/_StringProcessing/Unicode/CharacterProps.swift +++ b/Sources/_StringProcessing/Unicode/CharacterProps.swift @@ -12,3 +12,9 @@ // TODO +extension Character { + /// Whether this character is made up of exactly one Unicode scalar value. 
+ var hasExactlyOneScalar: Bool { + unicodeScalars.index(after: unicodeScalars.startIndex) == unicodeScalars.endIndex + } +} diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index 4d0c12c1f..fc3fd5741 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -194,11 +194,14 @@ public struct _CharacterClassModel: Hashable { return matched ? next : nil case .unicodeScalar: let c = str.unicodeScalars[i] + var nextIndex = str.unicodeScalars.index(after: i) var matched: Bool switch cc { case .any: matched = true case .anyScalar: matched = true - case .anyGrapheme: fatalError("Not matched in this mode") + case .anyGrapheme: + matched = true + nextIndex = str.index(after: i) case .digit: matched = c.properties.numericType != nil && (c.isASCII || !options.usesASCIIDigits) case .hexDigit: @@ -215,7 +218,7 @@ public struct _CharacterClassModel: Hashable { if isInverted { matched.toggle() } - return matched ? str.unicodeScalars.index(after: i) : nil + return matched ? nextIndex : nil } } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 2c6b858cc..83b73fe35 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -169,6 +169,8 @@ func firstMatchTest( XCTAssertEqual(found, match, file: file, line: line) } } catch { + // FIXME: This allows non-matches to succeed even when xfail'd + // When xfail == true, this should report failure for match == nil if !xfail && match != nil { XCTFail("\(error)", file: file, line: line) } @@ -182,7 +184,9 @@ func firstMatchTests( syntax: SyntaxOptions = .traditional, enableTracing: Bool = false, dumpAST: Bool = false, - xfail: Bool = false + xfail: Bool = false, + file: StaticString = #filePath, + line: UInt = #line ) { for (input, match) in tests { firstMatchTest( @@ -192,7 +196,9 @@ func firstMatchTests( syntax: syntax, enableTracing: enableTracing, dumpAST: dumpAST, - xfail: xfail) + xfail: xfail, + file: file, + line: line) } } @@ -400,7 +406,8 @@ extension RegexTests { "a++a", ("babc", nil), ("baaabc", nil), - ("bb", nil)) + ("bb", nil), + xfail: true) firstMatchTests( "a+?a", ("babc", nil), @@ -462,15 +469,11 @@ extension RegexTests { "a{2,4}+a", ("babc", nil), ("baabc", nil), - ("baaabc", nil), ("baaaaabc", "aaaaa"), ("baaaaaaaabc", "aaaaa"), ("bb", nil)) firstMatchTests( "a{,4}+a", - ("babc", nil), - ("baabc", nil), - ("baaabc", nil), ("baaaaabc", "aaaaa"), ("baaaaaaaabc", "aaaaa"), ("bb", nil)) @@ -478,11 +481,44 @@ extension RegexTests { "a{2,}+a", ("babc", nil), ("baabc", nil), + ("bb", nil)) + + // XFAIL'd versions of the above + firstMatchTests( + "a{2,4}+a", + ("baaabc", nil), + xfail: true) + firstMatchTests( + "a{,4}+a", + ("babc", nil), + ("baabc", nil), + ("baaabc", nil), + xfail: true) + firstMatchTests( + "a{2,}+a", ("baaabc", nil), ("baaaaabc", nil), ("baaaaaaaabc", nil), - ("bb", nil)) + xfail: true) + // XFAIL'd possessive tests + firstMatchTests( + "a?+a", + ("a", nil), + xfail: true) + firstMatchTests( + "(a|a)?+a", + ("a", nil), + xfail: true) + firstMatchTests( + "(a|a){2,4}+a", + ("a", nil), + ("aa", nil)) + firstMatchTests( + "(a|a){2,4}+a", + ("aaa", nil), + ("aaaa", nil), + xfail: true) firstMatchTests( "(?:a{2,4}?b)+", @@ -946,15 +982,19 @@ extension RegexTests { // TODO: Oniguruma \y and \Y firstMatchTests( - #"\u{65}"#, // Scalar 'e' is present in both: - ("Cafe\u{301}", "e"), // composed and - ("Sol Cafe", "e")) // 
standalone + #"\u{65}"#, // Scalar 'e' is present in both + ("Cafe\u{301}", nil), // but scalar mode requires boundary at end of match + xfail: true) + firstMatchTests( + #"\u{65}"#, // Scalar 'e' is present in both + ("Sol Cafe", "e")) // standalone is okay + firstMatchTests( #"\u{65}\y"#, // Grapheme boundary assertion ("Cafe\u{301}", nil), ("Sol Cafe", "e")) firstMatchTests( - #"\u{65}\Y"#, // Grapheme non-boundary assertion + #"(?u)\u{65}\Y"#, // Grapheme non-boundary assertion ("Cafe\u{301}", "e"), ("Sol Cafe", nil)) } @@ -1361,11 +1401,11 @@ extension RegexTests { // as a character. firstMatchTest(#"\u{65}\u{301}$"#, input: eDecomposed, match: eDecomposed) - // FIXME: Decomposed character in regex literal doesn't match an equivalent character - firstMatchTest(#"\u{65}\u{301}$"#, input: eComposed, match: eComposed, - xfail: true) + firstMatchTest(#"\u{65}\u{301}$"#, input: eComposed, match: eComposed) - firstMatchTest(#"\u{65}"#, input: eDecomposed, match: "e") + // FIXME: Implicit \y at end of match + firstMatchTest(#"\u{65}"#, input: eDecomposed, match: nil, + xfail: true) firstMatchTest(#"\u{65}$"#, input: eDecomposed, match: nil) // FIXME: \y is unsupported firstMatchTest(#"\u{65}\y"#, input: eDecomposed, match: nil, @@ -1389,12 +1429,10 @@ extension RegexTests { (eComposed, true), (eDecomposed, true)) - // FIXME: Decomposed character in regex literal doesn't match an equivalent character matchTest( #"e\u{301}$"#, (eComposed, true), - (eDecomposed, true), - xfail: true) + (eDecomposed, true)) matchTest( #"e$"#, @@ -1415,9 +1453,7 @@ extension RegexTests { (eDecomposed, true)) // \p{Letter} firstMatchTest(#"\p{Letter}$"#, input: eComposed, match: eComposed) - // FIXME: \p{Letter} doesn't match a decomposed character - firstMatchTest(#"\p{Letter}$"#, input: eDecomposed, match: eDecomposed, - xfail: true) + firstMatchTest(#"\p{Letter}$"#, input: eDecomposed, match: eDecomposed) // \d firstMatchTest(#"\d"#, input: "5", match: "5") @@ -1480,7 +1516,8 @@ extension RegexTests { firstMatchTest(#"\u{1F1F0}\u{1F1F7}"#, input: flag, match: flag) // First Unicode scalar followed by CCC of regional indicators - firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag) + firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag, + xfail: true) // FIXME: CCC of Regional Indicator doesn't match with both parts of a flag character // A CCC of regional indicators x 2 @@ -1521,8 +1558,7 @@ extension RegexTests { // FIXME: \O is unsupported firstMatchTest(#"(?u)\O\u{301}"#, input: eDecomposed, match: eDecomposed) - firstMatchTest(#"(?u)e\O"#, input: eDecomposed, match: eDecomposed, - xfail: true) + firstMatchTest(#"(?u)e\O"#, input: eDecomposed, match: eDecomposed) firstMatchTest(#"\O"#, input: eComposed, match: eComposed) firstMatchTest(#"\O"#, input: eDecomposed, match: nil, xfail: true) diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift new file mode 100644 index 000000000..71f459a1b --- /dev/null +++ b/Tests/RegexTests/UTS18Tests.swift @@ -0,0 +1,589 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021-2022 Apple Inc. 
and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+//
+//===----------------------------------------------------------------------===//
+
+// This test suite includes tests that verify the behavior of `Regex` as it
+// relates to Unicode Technical Standard #18: Unicode Regular Expressions.
+//
+// Please note: Quotations of UTS18 in this file mostly use 'Character' to mean
+// Unicode code point, and 'String' to mean 'sequence of code points' — they
+// are not the Swift meanings of those terms.
+//
+// See https://unicode.org/reports/tr18/ for more.
+
+import XCTest
+@testable // for internal `matches(of:)`
+import _StringProcessing
+
+class UTS18Tests: XCTestCase {
+  var input: String {
+    "ABCdefghîøu\u{308}\u{FFF0} -–—[]123"
+    // 01234567890 1 234567890
+    // 0 10 20
+  }
+}
+
+fileprivate func regex(_ pattern: String) -> Regex<Substring> {
+  try! Regex(pattern, as: Substring.self)
+}
+
+fileprivate extension String {
+  subscript<R: RangeExpression>(pos bounds: R) -> Substring
+  where R.Bound == Int
+  {
+    let bounds = bounds.relative(to: 0..<count)
+    return dropFirst(bounds.lowerBound).prefix(bounds.count)
+  }
+}
+
+fileprivate func expectFirstMatch<Output: Equatable>(
+  _ input: String,
+  _ r: Regex<Output>,
+  _ output: Output,
+  file: StaticString = #file,
+  line: UInt = #line)
+{
+  XCTAssertEqual(input.firstMatch(of: r)?.output, output, file: file, line: line)
+}
+
+#if os(Linux)
+func XCTExpectFailure(_ message: String? = nil, body: () -> Void) {}
+#endif
+
+// MARK: - Basic Unicode Support: Level 1
+
+// C1. An implementation claiming conformance to Level 1 of this specification
+// shall meet the requirements described in the following sections:
+extension UTS18Tests {
+  // RL1.1 Hex Notation
+  //
+  // To meet this requirement, an implementation shall supply a mechanism for
+  // specifying any Unicode code point (from U+0000 to U+10FFFF), using the
+  // hexadecimal code point representation.
+  func testHexNotation() {
+    expectFirstMatch("ab", regex(#"\u{61}\u{62}"#), "ab")
+    expectFirstMatch("𝄞", regex(#"\u{1D11E}"#), "𝄞")
+  }
+
+  // 1.1.1 Hex Notation and Normalization
+  //
+  // TODO: Does this section make a recommendation?
+
+  // RL1.2 Properties
+  // To meet this requirement, an implementation shall provide at least a
+  // minimal list of properties, consisting of the following:
+  // - General_Category
+  // - Script and Script_Extensions
+  // - Alphabetic
+  // - Uppercase
+  // - Lowercase
+  // - White_Space
+  // - Noncharacter_Code_Point
+  // - Default_Ignorable_Code_Point
+  // - ANY, ASCII, ASSIGNED
+  // The values for these properties must follow the Unicode definitions, and
+  // include the property and property value aliases from the UCD. Matching of
+  // Binary, Enumerated, Catalog, and Name values must follow the Matching
+  // Rules from [UAX44] with one exception: implementations are not required
+  // to ignore an initial prefix string of "is" in property values.
+ func testProperties() { + // General_Category + expectFirstMatch(input, regex(#"\p{Lu}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{lu}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{uppercase letter}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{Uppercase Letter}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{Uppercase_Letter}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{uppercaseletter}+"#), input[pos: ..<3]) + + expectFirstMatch(input, regex(#"\p{P}+"#), "-–—[]") + expectFirstMatch(input, regex(#"\p{Pd}+"#), "-–—") + + expectFirstMatch(input, regex(#"\p{Any}+"#), input[...]) + expectFirstMatch(input, regex(#"\p{Assigned}+"#), input[pos: ..<11]) + expectFirstMatch(input, regex(#"\p{ASCII}+"#), input[pos: ..<8]) + + // Script and Script_Extensions + // U+3042 あ HIRAGANA LETTER A Hira {Hira} + XCTAssertTrue("\u{3042}".contains(regex(#"\p{Hira}"#))) + XCTAssertTrue("\u{3042}".contains(regex(#"\p{sc=Hira}"#))) + XCTAssertTrue("\u{3042}".contains(regex(#"\p{scx=Hira}"#))) + // U+30FC ー KATAKANA-HIRAGANA PROLONGED SOUND MARK Zyyy = Common {Hira, Kana} + XCTAssertTrue("\u{30FC}".contains(regex(#"\p{Hira}"#))) // Implicit = Script_Extensions + XCTAssertTrue("\u{30FC}".contains(regex(#"\p{Kana}"#))) + XCTAssertTrue("\u{30FC}".contains(regex(#"\p{sc=Zyyy}"#))) // Explicit = Script + XCTAssertTrue("\u{30FC}".contains(regex(#"\p{scx=Hira}"#))) + XCTAssertTrue("\u{30FC}".contains(regex(#"\p{scx=Kana}"#))) + XCTAssertFalse("\u{30FC}".contains(regex(#"\p{sc=Hira}"#))) + XCTAssertFalse("\u{30FC}".contains(regex(#"\p{sc=Kana}"#))) + + // Uppercase, etc + expectFirstMatch(input, regex(#"\p{Uppercase}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{isUppercase}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{Uppercase=true}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{is Uppercase}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{is uppercase = true}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{lowercase}+"#), input[pos: 3..<11]) + expectFirstMatch(input, regex(#"\p{whitespace}+"#), input[pos: 12..<13]) + + // Block vs Writing System + let greekScalar = "Θ" // U+0398 + let greekExtendedScalar = "ἀ" // U+1F00 + XCTAssertTrue(greekScalar.contains(regex(#"\p{Greek}"#))) + XCTAssertTrue(greekExtendedScalar.contains(regex(#"\p{Greek}"#))) + } + + func testProperties_XFail() { + XCTExpectFailure("Need to support 'age' and 'block' properties") { + // XCTAssertFalse("z".contains(#/\p{age=3.1}/#)) + XCTFail(#"\(#/\p{age=3.1}/#)"#) + // XCTAssertTrue("\u{1F00}".contains(#/\p{Block=Greek}/#)) + XCTFail(#"\(#/\p{Block=Greek}/#)"#) + } + } + + // RL1.2a Compatibility Properties + // To meet this requirement, an implementation shall provide the properties + // listed in Annex C: Compatibility Properties, with the property values as + // listed there. Such an implementation shall document whether it is using + // the Standard Recommendation or POSIX-compatible properties. 
+ func testCompatibilityProperties() throws { + // FIXME: These tests seem insufficient + expectFirstMatch(input, regex(#"[[:alpha:]]+"#), input[pos: ..<11]) + expectFirstMatch(input, regex(#"[[:upper:]]+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"[[:lower:]]+"#), input[pos: 3..<11]) + expectFirstMatch(input, regex(#"[[:punct:]]+"#), input[pos: 13..<18]) + expectFirstMatch(input, regex(#"[[:digit:]]+"#), input[pos: 18..<21]) + expectFirstMatch(input, regex(#"[[:xdigit:]]+"#), input[pos: ..<6]) + expectFirstMatch(input, regex(#"[[:alnum:]]+"#), input[pos: ..<11]) + expectFirstMatch(input, regex(#"[[:space:]]+"#), input[pos: 12..<13]) + // TODO: blank + // TODO: cntrl + expectFirstMatch(input, regex(#"[[:graph:]]+"#), input[pos: ..<11]) + expectFirstMatch(input, regex(#"[[:print:]]+"#), input[...]) + expectFirstMatch(input, regex(#"[[:word:]]+"#), input[pos: ..<11]) + } + + //RL1.3 Subtraction and Intersection + // + // To meet this requirement, an implementation shall supply mechanisms for + // union, intersection and set-difference of sets of characters within + // regular expression character class expressions. + func testSubtractionAndIntersection() throws { + // Non-ASCII letters + expectFirstMatch(input, regex(#"[\p{Letter}--\p{ASCII}]+"#), input[pos: 8..<11]) + // Digits that aren't 1 or 2 + expectFirstMatch(input, regex(#"[\p{digit}--[12]]+"#), input[pos: 20..<21]) + + // ASCII-only letters + expectFirstMatch(input, regex(#"[\p{Letter}&&\p{ASCII}]+"#), input[pos: ..<8]) + // Digits that are 2 or 3 + expectFirstMatch(input, regex(#"[\p{digit}&&[23]]+"#), input[pos: 19..<21]) + + // Non-ASCII lowercase + non-lowercase ASCII + expectFirstMatch(input, regex(#"[\p{lowercase}~~\p{ascii}]+"#), input[pos: ..<3]) + XCTAssertTrue("123%&^ABC".contains(regex(#"^[\p{lowercase}~~\p{ascii}]+$"#))) + } + + func testSubtractionAndIntersectionPrecedence() { + expectFirstMatch("ABC123-", regex(#"[[:alnum:]]*-"#), "ABC123-") + expectFirstMatch("ABC123-", regex(#"[[:alnum:]--\p{Uppercase}]*-"#), "123-") + // Union binds more closely than difference + expectFirstMatch("ABC123-", regex(#"[[:alnum:]--\p{Uppercase}[:digit:]]*-"#), "-") + // TODO: Test for intersection precedence + } + + // RL1.4 Simple Word Boundaries + // To meet this requirement, an implementation shall extend the word boundary + // mechanism so that: + // - The class of includes all the Alphabetic values from the + // Unicode character database, from UnicodeData.txt, plus the decimals + // (General_Category=Decimal_Number, or equivalently Numeric_Type=Decimal), + // and the U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER + // (Join_Control=True). See also Annex C: Compatibility Properties. + // - Nonspacing marks are never divided from their base characters, and + // otherwise ignored in locating boundaries. + func testSimpleWordBoundaries() { + let simpleWordRegex = regex(#".+?\b"#).wordBoundaryKind(.unicodeLevel1) + expectFirstMatch(input, simpleWordRegex, input[pos: ..<11]) + expectFirstMatch("don't", simpleWordRegex, "don") + expectFirstMatch("Cafe\u{301}", simpleWordRegex, "Café") + } + + // RL1.5 Simple Loose Matches + // + // To meet this requirement, if an implementation provides for case- + // insensitive matching, then it shall provide at least the simple, default + // Unicode case-insensitive matching, and specify which properties are closed + // and which are not. 
+ // + // To meet this requirement, if an implementation provides for case + // conversions, then it shall provide at least the simple, default Unicode + // case folding. + func testSimpleLooseMatches() { + expectFirstMatch("Dåb", regex(#"Dåb"#).ignoresCase(), "Dåb") + expectFirstMatch("dÅB", regex(#"Dåb"#).ignoresCase(), "dÅB") + expectFirstMatch("D\u{212B}B", regex(#"Dåb"#).ignoresCase(), "D\u{212B}B") + } + + func testSimpleLooseMatches_XFail() { + XCTExpectFailure("Need case folding support") { + let sigmas = "σΣς" + expectFirstMatch(sigmas, regex(#"σ+"#).ignoresCase(), sigmas[...]) + expectFirstMatch(sigmas, regex(#"Σ+"#).ignoresCase(), sigmas[...]) + expectFirstMatch(sigmas, regex(#"ς+"#).ignoresCase(), sigmas[...]) + + // TODO: Test German sharp S + // TODO: Test char classes, e.g. [\p{Block=Phonetic_Extensions} [A-E]] + } + } + + // RL1.6 Line Boundaries + // + // To meet this requirement, if an implementation provides for line-boundary + // testing, it shall recognize not only CRLF, LF, CR, but also NEL (U+0085), + // PARAGRAPH SEPARATOR (U+2029) and LINE SEPARATOR (U+2028). + func testLineBoundaries() { + let lineInput = """ + 01 + 02\r\ + 03\n\ + 04\u{a}\ + 05\u{b}\ + 06\u{c}\ + 07\u{d}\ + 08\u{d}\u{a}\ + 09\u{85}\ + 10\u{2028}\ + 11\u{2029}\ + + """ + // Check the input counts + var lines = lineInput.matches(of: regex(#"\d{2}"#)) + XCTAssertEqual(lines.count, 11) + // Test \R - newline sequence + lines = lineInput.matches(of: regex(#"\d{2}\R"#)) + XCTAssertEqual(lines.count, 11) + // Test anchors as line boundaries + lines = lineInput.matches(of: regex(#"^\d{2}$"#).anchorsMatchLineEndings()) + XCTAssertEqual(lines.count, 11) + // Test that dot does not match line endings + lines = lineInput.matches(of: regex(#".+"#)) + XCTAssertEqual(lines.count, 11) + + // Does not contain an empty line + XCTAssertFalse(lineInput.contains(regex(#"^$"#))) + // Does contain an empty line (between \n and \r, which are reversed here) + let empty = "\n\r" + XCTAssertTrue(empty.contains(regex(#"^$"#).anchorsMatchLineEndings())) + } + + // RL1.7 Supplementary Code Points + // + // To meet this requirement, an implementation shall handle the full range of + // Unicode code points, including values from U+FFFF to U+10FFFF. In + // particular, where UTF-16 is used, a sequence consisting of a leading + // surrogate followed by a trailing surrogate shall be handled as a single + // code point in matching. + func testSupplementaryCodePoints() { + XCTAssertTrue("👍".contains(regex(#"\u{1F44D}"#))) + XCTAssertTrue("👍".contains(regex(#"[\u{1F440}-\u{1F44F}]"#))) + XCTAssertTrue("👍👎".contains(regex(#"^[\u{1F440}-\u{1F44F}]+$"#))) + } +} + +// MARK: - Extended Unicode Support: Level 2 + +// C2. An implementation claiming conformance to Level 2 of this specification +// shall satisfy C1, and meet the requirements described in the following +// sections: +extension UTS18Tests { + // RL2.1 Canonical Equivalents + // + // Specific recommendation? 
+ func testCanonicalEquivalents() { + let equivalents = [ + "\u{006f}\u{031b}\u{0323}", // o + horn + dot_below + "\u{006f}\u{0323}\u{031b}", // o + dot_below + horn + "\u{01a1}\u{0323}", // o-horn + dot_below + "\u{1ecd}\u{031b}", // o-dot_below + horn + "\u{1ee3}", // o-horn-dot_below + ] + + let regexes = [ + regex(#"\u{006f}\u{031b}\u{0323}"#), // o + horn + dot_below + regex(#"\u{006f}\u{0323}\u{031b}"#), // o + dot_below + horn + regex(#"\u{01a1}\u{0323}"#), // o-horn + dot_below + regex(#"\u{1ecd}\u{031b}"#), // o-dot_below + horn + regex(#"\u{1ee3}"#), // o-horn-dot_below + ] + + // Default: Grapheme cluster semantics + for (regexNum, regex) in regexes.enumerated() { + for (equivNum, equiv) in equivalents.enumerated() { + XCTAssertTrue( + equiv.contains(regex), + "Grapheme cluster semantics: Regex \(regexNum) didn't match with string \(equivNum)") + } + } + + // Unicode scalar semantics + for (regexNum, regex) in regexes.enumerated() { + for (equivNum, equiv) in equivalents.enumerated() { + let regex = regex.matchingSemantics(.unicodeScalar) + if regexNum == equivNum { + XCTAssertTrue( + equiv.contains(regex), + "Unicode scalar semantics: Regex \(regexNum) didn't match with string \(equivNum)") + } else { + XCTAssertFalse( + equiv.contains(regex), + "Unicode scalar semantics: Regex \(regexNum) incorrectly matched with string \(equivNum)") + } + } + } + } + + // RL2.2 Extended Grapheme Clusters and Character Classes with Strings + // + // To meet this requirement, an implementation shall provide a mechanism for + // matching against an arbitrary extended grapheme cluster, Character Classes + // with Strings, and extended grapheme cluster boundaries. + func testExtendedGraphemeClusters() { + XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef.$"#))) + XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef\X$"#))) + XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef\X$"#).matchingSemantics(.unicodeScalar))) + XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef.+\y"#).matchingSemantics(.unicodeScalar))) + } + + func testCharacterClassesWithStrings() { + let regex = regex(#"[a-z🧐🇧🇪🇧🇫🇧🇬]"#) + XCTAssertTrue("🧐".contains(regex)) + XCTAssertTrue("🇧🇫".contains(regex)) + } + + // RL2.3 Default Word Boundaries + // + // To meet this requirement, an implementation shall provide a mechanism for + // matching Unicode default word boundaries. + func testDefaultWordBoundaries() { + XCTExpectFailure { XCTFail("Implement tests") } + } + + // RL2.4 Default Case Conversion + // + // To meet this requirement, if an implementation provides for case + // conversions, then it shall provide at least the full, default Unicode case + // folding. + func testDefaultCaseConversion() { + XCTExpectFailure { XCTFail("Implement tests") } + } + + // RL2.5 Name Properties + // + // To meet this requirement, an implementation shall support individually + // named characters. 
+ func testNameProperty_XFail() { + XCTExpectFailure("Need \\p{name=...} support") { + XCTFail(#"\(#/\p{name=BOM}/#)"#) + // Name property + // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=ZERO WIDTH NO-BREAK SPACE}/#)) + // Name property and Matching Rules + // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=zerowidthno breakspace}/#)) + // Name_Alias property + // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=BYTE ORDER MARK}/#)) + // Name_Alias property (again) + // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=BOM}/#)) + + // Computed name + // XCTAssertTrue("강".contains(#/\p{name=HANGUL SYLLABLE GANG}/#)) + + // Control character + // XCTAssertTrue("\u{7}".contains(#/\p{name=BEL}/#)) + // Graphic symbol + // XCTAssertTrue("\u{1F514}".contains(#/\p{name=BELL}/#)) + } + } + + func testIndividuallyNamedCharacters() { + XCTAssertTrue("\u{263A}".contains(regex(#"\N{WHITE SMILING FACE}"#))) + XCTAssertTrue("\u{3B1}".contains(regex(#"\N{GREEK SMALL LETTER ALPHA}"#))) + XCTAssertTrue("\u{10450}".contains(regex(#"\N{SHAVIAN LETTER PEEP}"#))) + + XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{ZERO WIDTH NO-BREAK SPACE}"#))) + XCTAssertTrue("강".contains(regex(#"\N{HANGUL SYLLABLE GANG}"#))) + XCTAssertTrue("\u{1F514}".contains(regex(#"\N{BELL}"#))) + XCTAssertTrue("🐯".contains(regex(#"\N{TIGER FACE}"#))) + XCTAssertFalse("🐯".contains(regex(#"\N{TIEGR FACE}"#))) + + // Loose matching + XCTAssertTrue("\u{263A}".contains(regex(#"\N{whitesmilingface}"#))) + XCTAssertTrue("\u{263A}".contains(regex(#"\N{wHiTe_sMiLiNg_fAcE}"#))) + XCTAssertTrue("\u{263A}".contains(regex(#"\N{White Smiling-Face}"#))) + XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{zerowidthno breakspace}"#))) + + // Matching semantic level + XCTAssertFalse("👩‍👩‍👧‍👦".contains(regex(#".\N{ZERO WIDTH JOINER}"#))) + XCTAssertTrue("👩‍👩‍👧‍👦".contains(regex(#"(?u).\N{ZERO WIDTH JOINER}"#))) + } + + func testIndividuallyNamedCharacters_XFail() { + XCTExpectFailure("Need to support named chars in custom character classes") { + XCTFail("\(regex(#"[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+"#))") + // XCTAssertTrue("^\u{3B1}\u{3B2}$".contains(#/[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+/#)) + } + + XCTExpectFailure("Other named char failures -- investigate") { + XCTAssertTrue("\u{C}".contains(regex(#"\N{FORM FEED}"#))) + XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{BYTE ORDER MARK}"#))) + XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{BOM}"#))) + XCTAssertTrue("\u{7}".contains(regex(#"\N{BEL}"#))) + } + + XCTExpectFailure("Need to recognize invalid names at compile time") { + XCTFail("This should be a compilation error, not a match failure:") + XCTAssertFalse("abc".contains(regex(#"\N{NOT AN ACTUAL CHARACTER NAME}"#))) + } + } + + // RL2.6 Wildcards in Property Values + // + // To meet this requirement, an implementation shall support wildcards in + // Unicode property values. + func testWildcardsInPropertyValues() { + XCTExpectFailure { XCTFail("Implement tests") } + } + + // RL2.7 Full Properties + // + // To meet this requirement, an implementation shall support all of the + // properties listed below that are in the supported version of the Unicode + // Standard (or Unicode Technical Standard, respectively), with values that + // match the Unicode definitions for that version. 
+ func testFullProperties() { + // MARK: General + // Name (Name_Alias) + // Block + // Age + // General_Category + // Script (Script_Extensions) + // White_Space + // Alphabetic + // Hangul_Syllable_Type + // Noncharacter_Code_Point + // Default_Ignorable_Code_Point + // Deprecated + // Logical_Order_Exception + // Variation_Selector + + // MARK: Numeric + // Numeric_Value + // Numeric_Type + // Hex_Digit + // ASCII_Hex_Digit + + // MARK: Identifiers + // ID_Continue + // ID_Start + // XID_Continue + // XID_Start + // Pattern_Syntax + // Pattern_White_Space + // Identifier_Status + // Identifier_Type + + // MARK: CJK + // Ideographic + // Unified_Ideograph + // Radical + // IDS_Binary_Operator + // IDS_Trinary_Operator + // Equivalent_Unified_Ideograph + XCTExpectFailure { + XCTFail(#"Unsupported: \(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)"#) + // XCTAssertTrue("⼚⺁厂".contains(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)) + } + + // MARK: Case + // Uppercase + // Lowercase + // Simple_Lowercase_Mapping + // Simple_Titlecase_Mapping + // Simple_Uppercase_Mapping + // Simple_Case_Folding + // Soft_Dotted + // Cased + // Case_Ignorable + // Changes_When_Lowercased + // Changes_When_Uppercased + XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Uppercased}"#))) + XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Uppercased=true}"#))) + XCTAssertFalse("A".contains(regex(#"\p{Changes_When_Uppercased}"#))) + // Changes_When_Titlecased + // Changes_When_Casefolded + // Changes_When_Casemapped + + // MARK: Normalization + // Canonical_Combining_Class + // Decomposition_Type + // NFC_Quick_Check + // NFKC_Quick_Check + // NFD_Quick_Check + // NFKD_Quick_Check + // NFKC_Casefold + // Changes_When_NFKC_Casefolded + + // MARK: Emoji + // Emoji + // Emoji_Presentation + // Emoji_Modifier + // Emoji_Modifier_Base + // Emoji_Component + // Extended_Pictographic + // Basic_Emoji* + // Emoji_Keycap_Sequence* + // RGI_Emoji_Modifier_Sequence* + // RGI_Emoji_Flag_Sequence* + // RGI_Emoji_Tag_Sequence* + // RGI_Emoji_ZWJ_Sequence* + // RGI_Emoji* + + // MARK: Shaping and Rendering + // Join_Control + // Joining_Group + // Joining_Type + // Vertical_Orientation + // Line_Break + // Grapheme_Cluster_Break + // Sentence_Break + // Word_Break + // East_Asian_Width + // Prepended_Concatenation_Mark + + // MARK: Bidirectional + // Bidi_Class + // Bidi_Control + // Bidi_Mirrored + // Bidi_Mirroring_Glyph + // Bidi_Paired_Bracket + // Bidi_Paired_Bracket_Type + + // MARK: Miscellaneous + // Math + // Quotation_Mark + // Dash + // Sentence_Terminal + // Terminal_Punctuation + // Diacritic + // Extender + // Grapheme_Base + // Grapheme_Extend + // Regional_Indicator + } +} From 09a385bf4ffcc3d0d26e54a592ea87f9cc50c948 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 6 May 2022 06:56:52 -0500 Subject: [PATCH 09/24] Support Unicode scalar names in `\p{name=...}` (#382) --- Sources/_RegexParser/Regex/AST/Atom.swift | 3 ++ .../CharacterPropertyClassification.swift | 2 + .../_StringProcessing/ConsumerInterface.swift | 23 ++++++++--- Tests/RegexTests/ParseTests.swift | 7 ++++ Tests/RegexTests/UTS18Tests.swift | 41 +++++++++++-------- 5 files changed, 54 insertions(+), 22 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index e17ce68bb..d6062115a 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -396,6 +396,9 @@ extension AST.Atom.CharacterProperty { case script(Unicode.Script) case 
scriptExtension(Unicode.Script) + /// Character name in the form `\p{name=...}` + case named(String) + case posix(Unicode.POSIXProperty) /// Some special properties implemented by PCRE and Oniguruma. diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift index 5cc920063..ee9195ff3 100644 --- a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift @@ -428,6 +428,8 @@ extension Source { if let cat = classifyGeneralCategory(value) { return .generalCategory(cat) } + case "name", "na": + return .named(value) default: break } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index d27b89314..637b1a37a 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -144,6 +144,19 @@ extension String { } } +func consumeName(_ name: String, opts: MatchingOptions) -> MEProgram.ConsumeFunction { + let consume = opts.semanticLevel == .graphemeCluster + ? consumeCharacterWithSingleScalar + : consumeScalar + + return consume(propertyScalarPredicate { + // FIXME: name aliases not covered by $0.nameAlias are missed + // e.g. U+FEFF has both 'BYTE ORDER MARK' and 'BOM' as aliases + $0.name?.isEqualByUAX44LM2(to: name) == true + || $0.nameAlias?.isEqualByUAX44LM2(to: name) == true + }) +} + // TODO: This is basically an AST interpreter, which would // be good or interesting to build regardless, and serves // as a compiler fall-back path @@ -206,12 +219,7 @@ extension AST.Atom { return try p.generateConsumer(opts) case let .namedCharacter(name): - return consumeScalar(propertyScalarPredicate { - // FIXME: name aliases not covered by $0.nameAlias are missed - // e.g. U+FEFF is also 'FORM FEED', 'BYTE ORDER MARK', and 'BOM' - $0.name?.isEqualByUAX44LM2(to: name) == true - || $0.nameAlias?.isEqualByUAX44LM2(to: name) == true - }) + return consumeName(name, opts: opts) case .any: assertionFailure( @@ -479,6 +487,9 @@ extension AST.Atom.CharacterProperty { case .scriptExtension(let s): return consume(scriptExtensionScalarPredicate(s)) + + case .named(let n): + return consumeName(n, opts: opts) case .posix(let p): return p.generateConsumer(opts) diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 831f904c6..0ef021442 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1219,6 +1219,13 @@ extension RegexTests { parseTest(#"\p{word}"#, prop(.posix(.word))) parseTest(#"\p{xdigit}"#, prop(.posix(.xdigit))) + parseTest(#"\p{name=A}"#, prop(.named("A"))) + parseTest(#"\p{Name=B}"#, prop(.named("B"))) + parseTest(#"\p{isName=C}"#, prop(.named("C"))) + parseTest(#"\p{na=D}"#, prop(.named("D"))) + parseTest(#"\p{NA=E}"#, prop(.named("E"))) + parseTest(#"\p{na=isI}"#, prop(.named("isI"))) + // MARK: Conditionals parseTest(#"(?(1))"#, conditional( diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index 71f459a1b..eff9f9b4e 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -389,25 +389,34 @@ extension UTS18Tests { // // To meet this requirement, an implementation shall support individually // named characters. 
- func testNameProperty_XFail() { - XCTExpectFailure("Need \\p{name=...} support") { - XCTFail(#"\(#/\p{name=BOM}/#)"#) - // Name property - // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=ZERO WIDTH NO-BREAK SPACE}/#)) - // Name property and Matching Rules - // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=zerowidthno breakspace}/#)) + func testNameProperty() throws { + // Name property + XCTAssertTrue("\u{FEFF}".contains(regex(#"\p{name=ZERO WIDTH NO-BREAK SPACE}"#))) + // Name property and Matching Rules + XCTAssertTrue("\u{FEFF}".contains(regex(#"\p{name=zerowidthno breakspace}"#))) + + // Computed name + XCTAssertTrue("강".contains(regex(#"\p{name=HANGUL SYLLABLE GANG}"#))) + + // Graphic symbol + XCTAssertTrue("\u{1F514}".contains(regex(#"\p{name=BELL}"#))) + + // Name match failures + XCTAssertFalse("\u{FEFF}".contains(regex(#"\p{name=ZERO WIDTH NO-BRAKE SPACE}"#))) + XCTAssertFalse("\u{FEFF}".contains(regex(#"\p{name=ZERO WIDTH NO-BREAK SPACE ZZZZ}"#))) + XCTAssertFalse("\u{FEFF}".contains(regex(#"\p{name=ZERO WIDTH NO-BREAK}"#))) + XCTAssertFalse("\u{FEFF}".contains(regex(#"\p{name=z}"#))) + } + + func testNameProperty_XFail() throws { + XCTExpectFailure("Need more expansive name alias matching") { // Name_Alias property - // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=BYTE ORDER MARK}/#)) + XCTAssertTrue("\u{FEFF}".contains(regex(#"\p{name=BYTE ORDER MARK}"#))) // Name_Alias property (again) - // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=BOM}/#)) - - // Computed name - // XCTAssertTrue("강".contains(#/\p{name=HANGUL SYLLABLE GANG}/#)) - + XCTAssertTrue("\u{FEFF}".contains(regex(#"\p{name=BOM}"#))) + // Control character - // XCTAssertTrue("\u{7}".contains(#/\p{name=BEL}/#)) - // Graphic symbol - // XCTAssertTrue("\u{1F514}".contains(#/\p{name=BELL}/#)) + XCTAssertTrue("\u{7}".contains(regex(#"\p{name=BEL}"#))) } } From 39c0ed535b9d4ed99aa88b3e20b1a7c61e0d8bc1 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 6 May 2022 15:52:55 -0500 Subject: [PATCH 10/24] Modify DSL test to test for uncaptured backreference (#355) --- Tests/RegexBuilderTests/RegexDSLTests.swift | 48 +++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 5673aa348..c0c6491ac 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -852,6 +852,54 @@ class RegexDSLTests: XCTestCase { } } } + + // Post-hoc captured reference w/ attempted match before capture + // #"(?:\w\1|(\w):)+"# + // + // This tests that the reference `a` simply fails to match instead of + // erroring when encountered before a match is captured into `a`. 
The + // matching process here goes like this: + // - the first time through, the first alternation is taken + // - `.word` matches on "a" + // - the `a` backreference fails on ":", because `a` hasn't matched yet + // - backtrack to the beginning of the input + // - now the second alternation is taken + // - `.word` matches on "a" and is captured as `a` + // - the literal ":" matches + // - proceeding from the position of the first "b" in the first alternation + // - `.word` matches on "b" + // - the `a` backreference now contains "a", and matches on "a" + // - proceeding from the position of the first "c" in the first alternation + // - `.word` matches on "c" + // - the `a` backreference still contains "a", and matches on "a" + // - proceeding from the position of the first "o" in the first alternation + // - `.word` matches on "o" + // - the `a` backreference still contains "a", so it fails on ":" + // - now the second alternation is taken + // - `.word` matches on "o" and is captured as `a` + // - the literal ":" matches + // - continuing as above from the second "b"... + try _testDSLCaptures( + ("a:bacao:boco", ("a:bacao:boco", "o")), + matchType: (Substring, Substring?).self, + == + ) { + // NOTE: "expression too complex to type check" when inferring the generic + // parameter. + OneOrMore { + let a = Reference(Substring.self) + ChoiceOf<(Substring, Substring?)> { + Regex { + .word + a + } + Regex { + Capture(.word, as: a) + ":" + } + } + } + } } func testSemanticVersionExample() { From 9740416c5e724c2b9d412e19d3cb84c55114f452 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 9 May 2022 17:15:42 +0100 Subject: [PATCH 11/24] Introduce ASTStage parameter to `parse` This allows specifying whether or not to perform semantic checks on the AST. Some clients, e.g syntax coloring, only care about the syntactic structure. But other clients want errors to be emitted for e.g unsupported constructs. --- .../PatternConverter/PatternConverter.swift | 2 +- .../Regex/Parse/CompilerInterface.swift | 2 +- Sources/_RegexParser/Regex/Parse/Parse.swift | 18 +++++++++++++++--- Sources/_StringProcessing/Compiler.swift | 2 +- .../Regex/AnyRegexOutput.swift | 4 ++-- Sources/_StringProcessing/Regex/Core.swift | 4 ++-- Tests/RegexTests/CaptureTests.swift | 2 +- Tests/RegexTests/DiagnosticTests.swift | 6 +++--- Tests/RegexTests/ParseTests.swift | 10 +++++----- 9 files changed, 31 insertions(+), 19 deletions(-) diff --git a/Sources/PatternConverter/PatternConverter.swift b/Sources/PatternConverter/PatternConverter.swift index a10698526..497d54506 100644 --- a/Sources/PatternConverter/PatternConverter.swift +++ b/Sources/PatternConverter/PatternConverter.swift @@ -50,7 +50,7 @@ struct PatternConverter: ParsableCommand { print("Converting '\(delim)\(regex)\(delim)'") let ast = try _RegexParser.parse( - regex, + regex, .semantic, experimentalSyntax ? 
.experimental : .traditional) // Show rendered source ranges diff --git a/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift b/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift index 0856361d8..4ae518dcd 100644 --- a/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift +++ b/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift @@ -96,7 +96,7 @@ public func swiftCompilerParseRegexLiteral( _ input: String, captureBufferOut: UnsafeMutableRawBufferPointer ) throws -> (regexToEmit: String, version: Int) { do { - let ast = try parseWithDelimiters(input) + let ast = try parseWithDelimiters(input, .semantic) // Serialize the capture structure for later type inference. assert(captureBufferOut.count >= input.utf8.count) ast.captureStructure.encode(to: captureBufferOut) diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 54e46948a..168adf4a2 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -558,8 +558,19 @@ extension Parser { } } +public enum ASTStage { + /// The regex is parsed, and a syntactically valid AST is returned. Otherwise + /// an error is thrown. This is useful for e.g syntax coloring. + case syntactic + + /// The regex is parsed, and a syntactically and semantically valid AST is + /// returned. Otherwise an error is thrown. A semantically valid AST has been + /// checked for e.g unsupported constructs and invalid backreferences. + case semantic +} + public func parse( - _ regex: S, _ syntax: SyntaxOptions + _ regex: S, _ stage: ASTStage, _ syntax: SyntaxOptions ) throws -> AST where S.SubSequence == Substring { let source = Source(String(regex)) @@ -591,11 +602,12 @@ fileprivate func defaultSyntaxOptions( /// Parses a given regex string with delimiters, inferring the syntax options /// from the delimiters used. public func parseWithDelimiters( - _ regex: S + _ regex: S, _ stage: ASTStage ) throws -> AST where S.SubSequence == Substring { let (contents, delim) = droppingRegexDelimiters(String(regex)) do { - return try parse(contents, defaultSyntaxOptions(delim, contents: contents)) + let syntax = defaultSyntaxOptions(delim, contents: contents) + return try parse(contents, stage, syntax) } catch let error as LocatedErrorProtocol { // Convert the range in 'contents' to the range in 'regex'. let delimCount = delim.opening.count diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 47faa23ed..1c20761c8 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -38,7 +38,7 @@ class Compiler { func _compileRegex( _ regex: String, _ syntax: SyntaxOptions = .traditional ) throws -> Executor { - let ast = try parse(regex, syntax) + let ast = try parse(regex, .semantic, syntax) let program = try Compiler(ast: ast).emit() return Executor(program: program) } diff --git a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift index 00fc2e952..6dd8e17b6 100644 --- a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift +++ b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift @@ -17,7 +17,7 @@ extension Regex where Output == AnyRegexOutput { /// /// - Parameter pattern: The regular expression. 
public init(_ pattern: String) throws { - self.init(ast: try parse(pattern, .traditional)) + self.init(ast: try parse(pattern, .semantic, .traditional)) } } @@ -31,7 +31,7 @@ extension Regex { _ pattern: String, as: Output.Type = Output.self ) throws { - self.init(ast: try parse(pattern, .traditional)) + self.init(ast: try parse(pattern, .semantic, .traditional)) } } diff --git a/Sources/_StringProcessing/Regex/Core.swift b/Sources/_StringProcessing/Regex/Core.swift index 1f9a35dad..29d2267b2 100644 --- a/Sources/_StringProcessing/Regex/Core.swift +++ b/Sources/_StringProcessing/Regex/Core.swift @@ -44,7 +44,7 @@ public struct Regex: RegexComponent { // Compiler interface. Do not change independently. @usableFromInline init(_regexString pattern: String) { - self.init(ast: try! parse(pattern, .traditional)) + self.init(ast: try! parse(pattern, .semantic, .traditional)) } // Compiler interface. Do not change independently. @@ -53,7 +53,7 @@ public struct Regex: RegexComponent { assert(version == currentRegexLiteralFormatVersion) // The version argument is passed by the compiler using the value defined // in libswiftParseRegexLiteral. - self.init(ast: try! parseWithDelimiters(pattern)) + self.init(ast: try! parseWithDelimiters(pattern, .semantic)) } public var regex: Regex { diff --git a/Tests/RegexTests/CaptureTests.swift b/Tests/RegexTests/CaptureTests.swift index b48e1f0a5..ad78cd5b5 100644 --- a/Tests/RegexTests/CaptureTests.swift +++ b/Tests/RegexTests/CaptureTests.swift @@ -150,7 +150,7 @@ func captureTest( file: StaticString = #file, line: UInt = #line ) { - let ast = try! parse(regex, .traditional) + let ast = try! parse(regex, .semantic, .traditional) let capList = ast.root._captureList guard capList == expected else { XCTFail(""" diff --git a/Tests/RegexTests/DiagnosticTests.swift b/Tests/RegexTests/DiagnosticTests.swift index 428020b80..0100a3a86 100644 --- a/Tests/RegexTests/DiagnosticTests.swift +++ b/Tests/RegexTests/DiagnosticTests.swift @@ -20,7 +20,7 @@ extension RegexTests { XCTAssert(SourceLocation.fake.isFake) XCTAssert(group(.capture, "a").location.isFake) - let ast = try! parse("(a)", .traditional).root + let ast = try! parse("(a)", .semantic, .traditional).root XCTAssert(ast.location.isReal) } @@ -31,7 +31,7 @@ extension RegexTests { // // Input should be a concatenation or alternation func flatTest(_ str: String, _ expected: [String]) { - guard let ast = try? parse(str, .traditional).root else { + guard let ast = try? parse(str, .semantic, .traditional).root else { XCTFail("Fail to parse: \(str)") return } @@ -54,7 +54,7 @@ extension RegexTests { func renderTest(_ str: String, _ expected: [String]) { let lines = try! parse( - str, .traditional + str, .semantic, .traditional )._render(in: str) func fail() { XCTFail(""" diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 0ef021442..eed96becc 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -170,8 +170,8 @@ func parseNotEqualTest( syntax: SyntaxOptions = .traditional, file: StaticString = #file, line: UInt = #line ) { - let lhsAST = try! parse(lhs, syntax) - let rhsAST = try! parse(rhs, syntax) + let lhsAST = try! parse(lhs, .syntactic, syntax) + let rhsAST = try! parse(rhs, .syntactic, syntax) if lhsAST == rhsAST || lhsAST._dump() == rhsAST._dump() { XCTFail(""" AST: \(lhsAST._dump()) @@ -187,7 +187,7 @@ func rangeTest( at locFn: (AST.Node) -> SourceLocation = \.location, file: StaticString = #file, line: UInt = #line ) { - let ast = try! 
parse(input, syntax).root + let ast = try! parse(input, .syntactic, syntax).root let range = input.offsets(of: locFn(ast).range) let expected = expectedRange(input) @@ -207,7 +207,7 @@ func diagnosticTest( file: StaticString = #file, line: UInt = #line ) { do { - let ast = try parse(input, syntax) + let ast = try parse(input, .semantic, syntax) XCTFail(""" Passed \(ast) @@ -236,7 +236,7 @@ func diagnosticWithDelimitersTest( input, ignoreTrailing: ignoreTrailing, file: file, line: line) do { - let orig = try parseWithDelimiters(literal) + let orig = try parseWithDelimiters(literal, .semantic) let ast = orig.root XCTFail(""" From 4b31736f109fe4815696e00880b228947f66fea8 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 9 May 2022 17:15:43 +0100 Subject: [PATCH 12/24] Implement semantic diagnostics Start emitting errors for unsupported constructs, and other semantic errors such as duplicate group names. Once we start emitting bytecode for regex at compile time, these errors could potentially be subsumed into the bytecode generator. But for now, implement them as a separate pass. --- .../Regex/Parse/Diagnostics.swift | 26 +- Sources/_RegexParser/Regex/Parse/Parse.swift | 14 +- Sources/_RegexParser/Regex/Parse/Sema.swift | 384 ++++++++++++ .../_StringProcessing/ConsumerInterface.swift | 5 +- .../_CharacterClassModel.swift | 6 +- Tests/RegexTests/CaptureTests.swift | 4 +- Tests/RegexTests/MatchTests.swift | 6 +- Tests/RegexTests/ParseTests.swift | 589 +++++++++++------- Tests/RegexTests/UTS18Tests.swift | 2 +- 9 files changed, 798 insertions(+), 238 deletions(-) create mode 100644 Sources/_RegexParser/Regex/Parse/Sema.swift diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index c3d74c30b..7a8dfe771 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -15,6 +15,8 @@ enum ParseError: Error, Hashable { // TODO: I wonder if it makes sense to store the string. // This can make equality weird. 
+ // MARK: Syntactic Errors + case numberOverflow(String) case expectedNumDigits(String, Int) case expectedNumber(String, kind: RadixKind) @@ -55,7 +57,6 @@ enum ParseError: Error, Hashable { case cannotRemoveMatchingOptionsAfterCaret case expectedCustomCharacterClassMembers - case invalidCharacterClassRangeOperand case emptyProperty case unknownProperty(key: String?, value: String) @@ -73,6 +74,15 @@ enum ParseError: Error, Hashable { case cannotRemoveExtendedSyntaxInMultilineMode case expectedCalloutArgument + + // MARK: Semantic Errors + + case unsupported(String) + case deprecatedUnicode(String) + case invalidReference(Int) + case duplicateNamedCapture(String) + case invalidCharacterClassRangeOperand + case invalidQuantifierRange(Int, Int) } extension IdentifierKind { @@ -88,6 +98,7 @@ extension IdentifierKind { extension ParseError: CustomStringConvertible { var description: String { switch self { + // MARK: Syntactic Errors case let .numberOverflow(s): return "number overflow: \(s)" case let .expectedNumDigits(s, i): @@ -167,6 +178,19 @@ extension ParseError: CustomStringConvertible { return "extended syntax may not be disabled in multi-line mode" case .expectedCalloutArgument: return "expected argument to callout" + + // MARK: Semantic Errors + + case let .unsupported(kind): + return "\(kind) is not currently supported" + case let .deprecatedUnicode(kind): + return "\(kind) is a deprecated Unicode property, and is not supported" + case let .invalidReference(i): + return "no capture numbered \(i)" + case let .duplicateNamedCapture(str): + return "group named '\(str)' already exists" + case let .invalidQuantifierRange(lhs, rhs): + return "range lower bound '\(lhs)' must be less than or equal to upper bound '\(rhs)'" } } } diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 168adf4a2..2d33e4d7e 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -543,11 +543,6 @@ extension Parser { // Range between atoms. if let (dashLoc, rhs) = try source.lexCustomCharClassRangeEnd(context: context) { - guard atom.isValidCharacterClassRangeBound && - rhs.isValidCharacterClassRangeBound else { - throw ParseError.invalidCharacterClassRangeOperand - } - // TODO: Validate lower <= upper? members.append(.range(.init(atom, dashLoc, rhs))) continue } @@ -575,7 +570,14 @@ public func parse( { let source = Source(String(regex)) var parser = Parser(source, syntax: syntax) - return try parser.parse() + let ast = try parser.parse() + switch stage { + case .syntactic: + break + case .semantic: + try validate(ast) + } + return ast } /// Retrieve the default set of syntax options that a delimiter and literal diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift new file mode 100644 index 000000000..32859812c --- /dev/null +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -0,0 +1,384 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +/// Validate a regex AST for semantic validity. 
Once bytecode is emitted at +/// compile time, this could potentially be subsumed by the bytecode generator. +fileprivate struct RegexValidator { + let ast: AST + let captures: CaptureList + + init(_ ast: AST) { + self.ast = ast + self.captures = ast.captureList + } + + func error(_ kind: ParseError, at loc: SourceLocation) -> Error { + Source.LocatedError(kind, loc) + } +} + +extension String { + fileprivate var quoted: String { "'\(self)'" } +} + +extension RegexValidator { + func validate() throws { + for opt in ast.globalOptions?.options ?? [] { + try validateGlobalMatchingOption(opt) + } + try validateNode(ast.root) + } + + func validateGlobalMatchingOption(_ opt: AST.GlobalMatchingOption) throws { + switch opt.kind { + case .limitDepth, .limitHeap, .limitMatch, .notEmpty, .notEmptyAtStart, + .noAutoPossess, .noDotStarAnchor, .noJIT, .noStartOpt, .utfMode, + .unicodeProperties: + // These are PCRE specific, and not something we're likely to ever + // support. + throw error(.unsupported("global matching option"), at: opt.location) + + case .newlineMatching: + // We have implemented the correct behavior for multi-line literals, but + // these should also affect '.' and '\N' matching, which we haven't + // implemented. + throw error(.unsupported("newline matching mode"), at: opt.location) + + case .newlineSequenceMatching: + // We haven't yet implemented the '\R' matching specifics of these. + throw error( + .unsupported("newline sequence matching mode"), at: opt.location) + } + } + + func validateReference(_ ref: AST.Reference) throws { + switch ref.kind { + case .absolute(let i): + guard i <= captures.captures.count else { + throw error(.invalidReference(i), at: ref.innerLoc) + } + case .relative: + throw error(.unsupported("relative capture reference"), at: ref.innerLoc) + case .named: + // TODO: This could be implemented by querying the capture list for an + // index. + throw error(.unsupported("named capture reference"), at: ref.innerLoc) + } + if let recLevel = ref.recursionLevel { + throw error(.unsupported("recursion level"), at: recLevel.location) + } + } + + func validateMatchingOption(_ opt: AST.MatchingOption) throws { + let loc = opt.location + switch opt.kind { + case .allowDuplicateGroupNames: + // Not currently supported as we need to figure out what to do with + // the capture type. 
+ throw error(.unsupported("duplicate group naming"), at: loc) + + case .unicodeWordBoundaries: + throw error(.unsupported("unicode word boundary mode"), at: loc) + + case .textSegmentWordMode, .textSegmentGraphemeMode: + throw error(.unsupported("text segment mode"), at: loc) + + case .byteSemantics: + throw error(.unsupported("byte semantic mode"), at: loc) + + case .caseInsensitive, .possessiveByDefault, .reluctantByDefault, + .unicodeScalarSemantics, .graphemeClusterSemantics, + .singleLine, .multiline, .namedCapturesOnly, .extended, .extraExtended, + .asciiOnlyDigit, .asciiOnlyWord, .asciiOnlySpace, .asciiOnlyPOSIXProps: + break + } + } + + func validateMatchingOptions(_ opts: AST.MatchingOptionSequence) throws { + for opt in opts.adding { + try validateMatchingOption(opt) + } + for opt in opts.removing { + try validateMatchingOption(opt) + } + } + + func validateBinaryProperty( + _ prop: Unicode.BinaryProperty, at loc: SourceLocation + ) throws { + switch prop { + case .asciiHexDigit, .alphabetic, .bidiMirrored, .cased, .caseIgnorable, + .changesWhenCasefolded, .changesWhenCasemapped, + .changesWhenNFKCCasefolded, .changesWhenLowercased, + .changesWhenTitlecased, .changesWhenUppercased, .dash, .deprecated, + .defaultIgnorableCodePoint, .diacratic, .extender, + .fullCompositionExclusion, .graphemeBase, .graphemeExtended, .hexDigit, + .idContinue, .ideographic, .idStart, .idsBinaryOperator, + .idsTrinaryOperator, .joinControl, .logicalOrderException, .lowercase, + .math, .noncharacterCodePoint, .patternSyntax, .patternWhitespace, + .quotationMark, .radical, .regionalIndicator, .softDotted, + .sentenceTerminal, .terminalPunctuation, .unifiedIdiograph, .uppercase, + .variationSelector, .whitespace, .xidContinue, .xidStart: + break + + case .emojiModifierBase, .emojiModifier, .emoji, .emojiPresentation: + // These are available on macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1. + // TODO: We should ideally check deployment target for such conditionally + // available properties. + break + + case .expandsOnNFC, .expandsOnNFD, .expandsOnNFKD, .expandsOnNFKC: + throw error(.deprecatedUnicode(prop.rawValue.quoted), at: loc) + + case .bidiControl, .compositionExclusion, .emojiComponent, + .extendedPictographic, .graphemeLink, .hyphen, .otherAlphabetic, + .otherDefaultIgnorableCodePoint, .otherGraphemeExtended, + .otherIDContinue, .otherIDStart, .otherLowercase, .otherMath, + .otherUppercase, .prependedConcatenationMark: + throw error(.unsupported(prop.rawValue.quoted), at: loc) + } + } + + func validateCharacterProperty( + _ prop: AST.Atom.CharacterProperty, at loc: SourceLocation + ) throws { + // TODO: We could re-add the .other case to diagnose unknown properties + // here instead of in the parser. + // TODO: Should we store an 'inner location' for the contents of `\p{...}`? + switch prop.kind { + case .binary(let b, _): + try validateBinaryProperty(b, at: loc) + case .any, .assigned, .ascii, .generalCategory, .posix, .named, .script, + .scriptExtension: + break + case .pcreSpecial: + throw error(.unsupported("PCRE property"), at: loc) + case .onigurumaSpecial: + throw error(.unsupported("Unicode block property"), at: loc) + } + } + + func validateEscaped( + _ esc: AST.Atom.EscapedBuiltin, at loc: SourceLocation + ) throws { + switch esc { + case .resetStartOfMatch, .singleDataUnit, .horizontalWhitespace, + .notHorizontalWhitespace, .verticalTab, .notVerticalTab, + // '\N' needs to be emitted using 'emitAny'. 
+ .notNewline: + throw error(.unsupported("'\\\(esc.character)'"), at: loc) + + // Character classes. + case .decimalDigit, .notDecimalDigit, .whitespace, .notWhitespace, + .wordCharacter, .notWordCharacter, .graphemeCluster, .trueAnychar: + // TODO: What about scalar matching mode for .graphemeCluster? We + // currently crash at runtime. + break + + case .newlineSequence: + break + + // Assertions. + case .wordBoundary, .notWordBoundary, .startOfSubject, + .endOfSubjectBeforeNewline, .endOfSubject, .textSegment, + .notTextSegment, .firstMatchingPositionInSubject: + break + + // Literal escapes. + case .alarm, .backspace, .escape, .formfeed, .newline, .carriageReturn, + .tab: + break + } + } + + func validateAtom(_ atom: AST.Atom) throws { + switch atom.kind { + case .escaped(let esc): + try validateEscaped(esc, at: atom.location) + + case .keyboardControl, .keyboardMeta, .keyboardMetaControl: + // We need to implement the scalar computations for these. + throw error(.unsupported("control sequence"), at: atom.location) + + case .property(let p): + try validateCharacterProperty(p, at: atom.location) + + case .backreference(let r): + try validateReference(r) + + case .subpattern: + throw error(.unsupported("subpattern"), at: atom.location) + + case .callout: + // These are PCRE and Oniguruma specific, supporting them is future work. + throw error(.unsupported("callout"), at: atom.location) + + case .backtrackingDirective: + // These are PCRE-specific, and are unlikely to be fully supported. + throw error(.unsupported("backtracking directive"), at: atom.location) + + case .changeMatchingOptions(let opts): + try validateMatchingOptions(opts) + + case .namedCharacter: + // TODO: We should error on unknown Unicode scalar names. + break + + case .char, .scalar, .startOfLine, .endOfLine, .any: + break + } + } + + func validateCustomCharacterClass(_ c: AST.CustomCharacterClass) throws { + for member in c.members { + try validateCharacterClassMember(member) + } + } + + func validateCharacterClassRange( + _ range: AST.CustomCharacterClass.Range + ) throws { + let lhs = range.lhs + let rhs = range.rhs + + try validateAtom(lhs) + try validateAtom(rhs) + + guard lhs.isValidCharacterClassRangeBound else { + throw error(.invalidCharacterClassRangeOperand, at: lhs.location) + } + guard rhs.isValidCharacterClassRangeBound else { + throw error(.invalidCharacterClassRangeOperand, at: rhs.location) + } + + guard lhs.literalCharacterValue != nil else { + throw error( + .unsupported("character class range operand"), at: lhs.location) + } + + guard rhs.literalCharacterValue != nil else { + throw error( + .unsupported("character class range operand"), at: rhs.location) + } + + // TODO: Validate lhs <= rhs? That may require knowledge of case + // insensitivity though. + } + + func validateCharacterClassMember( + _ member: AST.CustomCharacterClass.Member + ) throws { + switch member { + case .custom(let c): + try validateCustomCharacterClass(c) + + case .range(let r): + try validateCharacterClassRange(r) + + case .atom(let a): + try validateAtom(a) + + case .setOperation(let lhs, _, let rhs): + for lh in lhs { try validateCharacterClassMember(lh) } + for rh in rhs { try validateCharacterClassMember(rh) } + + case .quote, .trivia: + break + } + } + + func validateGroup(_ group: AST.Group) throws { + let kind = group.kind + switch kind.value { + case .capture, .namedCapture, .nonCapture, .lookahead, .negativeLookahead: + break + + case .balancedCapture: + // These are .NET specific, and kinda niche. 
+ throw error(.unsupported("balanced capture"), at: kind.location) + + case .nonCaptureReset: + // We need to figure out how these interact with typed captures. + throw error(.unsupported("branch reset group"), at: kind.location) + + case .atomicNonCapturing: + throw error(.unsupported("atomic group"), at: kind.location) + + case .nonAtomicLookahead: + throw error(.unsupported("non-atomic lookahead"), at: kind.location) + + case .lookbehind, .negativeLookbehind, .nonAtomicLookbehind: + throw error(.unsupported("lookbehind"), at: kind.location) + + case .scriptRun, .atomicScriptRun: + throw error(.unsupported("script run"), at: kind.location) + + case .changeMatchingOptions(let opts): + try validateMatchingOptions(opts) + } + try validateNode(group.child) + } + + func validateQuantification(_ quant: AST.Quantification) throws { + try validateNode(quant.child) + switch quant.amount.value { + case .range(let lhs, let rhs): + guard lhs.value <= rhs.value else { + throw error( + .invalidQuantifierRange(lhs.value, rhs.value), at: quant.location) + } + case .zeroOrMore, .oneOrMore, .zeroOrOne, .exactly, .nOrMore, .upToN: + break + } + } + + func validateNode(_ node: AST.Node) throws { + switch node { + case .alternation(let a): + for branch in a.children { + try validateNode(branch) + } + case .concatenation(let c): + for child in c.children { + try validateNode(child) + } + + case .group(let g): + try validateGroup(g) + + case .conditional(let c): + // Note even once we get runtime support for this, we need to change the + // parsing to incorporate what is specified in the syntax proposal. + throw error(.unsupported("conditional"), at: c.location) + + case .quantification(let q): + try validateQuantification(q) + + case .atom(let a): + try validateAtom(a) + + case .customCharacterClass(let c): + try validateCustomCharacterClass(c) + + case .absentFunction(let a): + // These are Oniguruma specific. + throw error(.unsupported("absent function"), at: a.location) + + case .quote, .trivia, .empty: + break + } + } +} + +/// Check a regex AST for semantic validity. +public func validate(_ ast: AST) throws { + try RegexValidator(ast).validate() +} diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 637b1a37a..9c0c3522c 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -513,7 +513,10 @@ extension Unicode.BinaryProperty { _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction { let consume = consumeFunction(for: opts) - + + // Note if you implement support for any of the below, you need to adjust + // the switch in Sema.swift to not have it be diagnosed as unsupported + // (potentially guarded on deployment version). switch self { case .asciiHexDigit: return consume(propertyScalarPredicate { diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index fc3fd5741..27a24cf46 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -454,9 +454,13 @@ extension AST.Atom.EscapedBuiltin { case .notHorizontalWhitespace: return .horizontalWhitespace.inverted - case .notNewline: return .newlineSequence.inverted case .newlineSequence: return .newlineSequence + // FIXME: This is more like '.' than inverted '\R', as it is affected + // by e.g (*CR). We should therefore really be emitting it through + // emitAny(). 
For now we treat it as semantically invalid. + case .notNewline: return .newlineSequence.inverted + case .whitespace: return .whitespace case .notWhitespace: return .whitespace.inverted diff --git a/Tests/RegexTests/CaptureTests.swift b/Tests/RegexTests/CaptureTests.swift index ad78cd5b5..45be547db 100644 --- a/Tests/RegexTests/CaptureTests.swift +++ b/Tests/RegexTests/CaptureTests.swift @@ -38,8 +38,8 @@ extension CaptureList.Capture { return Self(optionalDepth: 6) } - static func named(_ name: String) -> Self { - return Self(name: name, optionalDepth: 0) + static func named(_ name: String, opt: Int = 0) -> Self { + return Self(name: name, optionalDepth: opt) } } extension CaptureList { diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 83b73fe35..3b7def90b 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -717,7 +717,7 @@ extension RegexTests { firstMatchTest( #"\N{ASTERISK}+"#, input: "123***xyz", match: "***") firstMatchTest( - #"\N {2}"#, input: "123 xyz", match: "3 ") + #"\N {2}"#, input: "123 xyz", match: "3 ", xfail: true) firstMatchTest(#"\N{U+2C}"#, input: "123,xyz", match: ",") firstMatchTest(#"\N{U+1F4BF}"#, input: "123💿xyz", match: "💿") @@ -1014,7 +1014,7 @@ extension RegexTests { firstMatchTest( #"a(?:b)c"#, input: "123abcxyz", match: "abc") firstMatchTest( - "(?|(a)|(b)|(c))", input: "123abcxyz", match: "a") + "(?|(a)|(b)|(c))", input: "123abcxyz", match: "a", xfail: true) firstMatchTest( #"(?:a|.b)c"#, input: "123abcacxyz", match: "abc") @@ -1130,6 +1130,8 @@ extension RegexTests { firstMatchTest(#"(.)(.)\g-02"#, input: "abac", match: "aba", xfail: true) firstMatchTest(#"(?.)(.)\k"#, input: "abac", match: "aba", xfail: true) firstMatchTest(#"\g'+2'(.)(.)"#, input: "abac", match: "aba", xfail: true) + + firstMatchTest(#"\1(.)"#, input: "112", match: nil) } func testMatchExamples() { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index eed96becc..0ff96fa0b 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -33,30 +33,56 @@ extension AST.CustomCharacterClass.Member: ExpressibleByExtendedGraphemeClusterL } } +enum SemanticErrorKind { + case unsupported, invalid +} class RegexTests: XCTestCase {} func parseTest( _ input: String, _ expectedAST: AST.Node, + throwsError errorKind: SemanticErrorKind? = nil, syntax: SyntaxOptions = .traditional, captures expectedCaptures: CaptureList = [], file: StaticString = #file, line: UInt = #line ) { parseTest( - input, .init(expectedAST, globalOptions: nil), syntax: syntax, - captures: expectedCaptures, file: file, line: line + input, .init(expectedAST, globalOptions: nil), throwsError: errorKind, + syntax: syntax, captures: expectedCaptures, file: file, line: line ) } func parseTest( _ input: String, _ expectedAST: AST, + throwsError errorKind: SemanticErrorKind? = nil, syntax: SyntaxOptions = .traditional, captures expectedCaptures: CaptureList = [], file: StaticString = #file, line: UInt = #line ) { - let ast = try! parse(input, syntax) + let ast: AST + do { + ast = try parse(input, errorKind != nil ? 
.syntactic : .semantic, syntax) + } catch { + XCTFail("unexpected error: \(error)", file: file, line: line) + return + } + if let errorKind = errorKind { + do { + _ = try parse(input, .semantic, syntax) + XCTFail("expected semantically invalid AST", file: file, line: line) + } catch let e as Source.LocatedError { + switch e.error { + case .unsupported: + XCTAssertEqual(errorKind, .unsupported, "\(e)", file: file, line: line) + default: + XCTAssertEqual(errorKind, .invalid, "\(e)", file: file, line: line) + } + } catch { + XCTFail("Error without source location: \(error)", file: file, line: line) + } + } guard ast == expectedAST || ast._dump() == expectedAST._dump() // EQ workaround else { @@ -143,15 +169,37 @@ func delimiterLexingTest( /// true, there may be additional characters that follow the literal that are /// not considered part of it. func parseWithDelimitersTest( - _ input: String, _ expecting: AST.Node, ignoreTrailing: Bool = false, - file: StaticString = #file, line: UInt = #line + _ input: String, _ expecting: AST.Node, + throwsError errorKind: SemanticErrorKind? = nil, + ignoreTrailing: Bool = false, file: StaticString = #file, line: UInt = #line ) { // First try lexing. let literal = delimiterLexingTest( input, ignoreTrailing: ignoreTrailing, file: file, line: line) - let orig = try! parseWithDelimiters(literal) - let ast = orig.root + let ast: AST.Node + do { + ast = try parseWithDelimiters( + literal, errorKind != nil ? .syntactic : .semantic).root + } catch { + XCTFail("unexpected error: \(error)", file: file, line: line) + return + } + if let errorKind = errorKind { + do { + _ = try parseWithDelimiters(input, .semantic) + XCTFail("expected semantically invalid AST", file: file, line: line) + } catch let e as Source.LocatedError { + switch e.error { + case .unsupported: + XCTAssertEqual(errorKind, .unsupported, "\(e)", file: file, line: line) + default: + XCTAssertEqual(errorKind, .invalid, "\(e)", file: file, line: line) + } + } catch { + XCTFail("Error without source location: \(error)", file: file, line: line) + } + } guard ast == expecting || ast._dump() == expecting._dump() // EQ workaround else { @@ -437,6 +485,12 @@ extension RegexTests { parseTest(#"abc\d"#, concat("a", "b", "c", escaped(.decimalDigit))) + // FIXME: '\N' should be emitted through 'emitAny', not through the + // _CharacterClassModel model. + parseTest(#"\N"#, escaped(.notNewline), throwsError: .unsupported) + + parseTest(#"\R"#, escaped(.newlineSequence)) + parseTest( "[-|$^:?+*())(*-+-]", charClass( @@ -595,10 +649,12 @@ extension RegexTests { range_m(.keyboardControl("A"), .keyboardControl("B")), range_m(.keyboardMetaControl("A"), .keyboardMetaControl("B")), range_m(.keyboardMeta("A"), .keyboardMeta("B")) - )) + ), throwsError: .unsupported) - parseTest(#"[\N{DOLLAR SIGN}-\N{APOSTROPHE}]"#, charClass( - range_m(.namedCharacter("DOLLAR SIGN"), .namedCharacter("APOSTROPHE")))) + parseTest( + #"[\N{DOLLAR SIGN}-\N{APOSTROPHE}]"#, charClass( + range_m(.namedCharacter("DOLLAR SIGN"), .namedCharacter("APOSTROPHE"))), + throwsError: .unsupported) // MARK: Operators @@ -691,13 +747,13 @@ extension RegexTests { parseTest(#"\\#u{3000}"#, "\u{3000}") // Control and meta controls. 
- parseTest(#"\c "#, atom(.keyboardControl(" "))) - parseTest(#"\c!"#, atom(.keyboardControl("!"))) - parseTest(#"\c~"#, atom(.keyboardControl("~"))) - parseTest(#"\C--"#, atom(.keyboardControl("-"))) - parseTest(#"\M-\C-a"#, atom(.keyboardMetaControl("a"))) - parseTest(#"\M-\C--"#, atom(.keyboardMetaControl("-"))) - parseTest(#"\M-a"#, atom(.keyboardMeta("a"))) + parseTest(#"\c "#, atom(.keyboardControl(" ")), throwsError: .unsupported) + parseTest(#"\c!"#, atom(.keyboardControl("!")), throwsError: .unsupported) + parseTest(#"\c~"#, atom(.keyboardControl("~")), throwsError: .unsupported) + parseTest(#"\C--"#, atom(.keyboardControl("-")), throwsError: .unsupported) + parseTest(#"\M-\C-a"#, atom(.keyboardMetaControl("a")), throwsError: .unsupported) + parseTest(#"\M-\C--"#, atom(.keyboardMetaControl("-")), throwsError: .unsupported) + parseTest(#"\M-a"#, atom(.keyboardMeta("a")), throwsError: .unsupported) // MARK: Comments @@ -734,6 +790,9 @@ extension RegexTests { parseTest( #"a{0,0}"#, quantRange(0...0, of: "a")) + parseTest( + #"a{1,1}"#, + quantRange(1...1, of: "a")) // Make sure ranges get treated as literal if invalid. parseTest("{", "{") @@ -786,11 +845,42 @@ extension RegexTests { // Balanced captures parseTest(#"(?)"#, balancedCapture(name: "a", priorName: "c", empty()), - captures: [.named("a")]) + throwsError: .unsupported, captures: [.named("a")]) parseTest(#"(?<-c>)"#, balancedCapture(name: nil, priorName: "c", empty()), - captures: [.cap]) + throwsError: .unsupported, captures: [.cap]) parseTest(#"(?'a-b'c)"#, balancedCapture(name: "a", priorName: "b", "c"), - captures: [.named("a")]) + throwsError: .unsupported, captures: [.named("a")]) + + // Capture resets. + // FIXME: The captures in each branch should be unified. For now, we don't + // treat any capture reset as semantically valid. + parseTest( + "(?|(a)|(b))", + nonCaptureReset(alt(capture("a"), capture("b"))), + throwsError: .unsupported, captures: [.opt, .opt] + ) + parseTest( + "(?|(?a)|(b))", + nonCaptureReset(alt(namedCapture("x", "a"), capture("b"))), + throwsError: .unsupported, captures: [.named("x", opt: 1), .opt] + ) + parseTest( + "(?|(a)|(?b))", + nonCaptureReset(alt(capture("a"), namedCapture("x", "b"))), + throwsError: .unsupported, captures: [.opt, .named("x", opt: 1)] + ) + parseTest( + "(?|(?a)|(?b))", + nonCaptureReset(alt(namedCapture("x", "a"), namedCapture("x", "b"))), + throwsError: .unsupported, captures: [.named("x", opt: 1), .named("x", opt: 1)] + ) + + // TODO: Reject mismatched names? 
+ parseTest( + "(?|(?a)|(?b))", + nonCaptureReset(alt(namedCapture("x", "a"), namedCapture("y", "b"))), + throwsError: .unsupported, captures: [.named("x", opt: 1), .named("y", opt: 1)] + ) // Other groups parseTest( @@ -798,13 +888,13 @@ extension RegexTests { concat("a", nonCapture("b"), "c")) parseTest( #"a(?|b)c"#, - concat("a", nonCaptureReset("b"), "c")) + concat("a", nonCaptureReset("b"), "c"), throwsError: .unsupported) parseTest( #"a(?>b)c"#, - concat("a", atomicNonCapturing("b"), "c")) + concat("a", atomicNonCapturing("b"), "c"), throwsError: .unsupported) parseTest( "a(*atomic:b)c", - concat("a", atomicNonCapturing("b"), "c")) + concat("a", atomicNonCapturing("b"), "c"), throwsError: .unsupported) parseTest("a(?=b)c", concat("a", lookahead("b"), "c")) parseTest("a(*pla:b)c", concat("a", lookahead("b"), "c")) @@ -815,31 +905,42 @@ extension RegexTests { parseTest("a(*negative_lookahead:b)c", concat("a", negativeLookahead("b"), "c")) - parseTest("a(?<=b)c", concat("a", lookbehind("b"), "c")) - parseTest("a(*plb:b)c", concat("a", lookbehind("b"), "c")) - parseTest("a(*positive_lookbehind:b)c", concat("a", lookbehind("b"), "c")) - - parseTest("a(?"#, backreference(.relative(4))) - parseTest(#"\k<2>"#, backreference(.absolute(2))) - parseTest(#"\k'-3'"#, backreference(.relative(-3))) - parseTest(#"\k'1'"#, backreference(.absolute(1))) - - parseTest(#"\k{a0}"#, backreference(.named("a0"))) - parseTest(#"\k"#, backreference(.named("bc"))) - parseTest(#"\g{abc}"#, backreference(.named("abc"))) - parseTest(#"(?P=abc)"#, backreference(.named("abc"))) + parseTest(#"\113"#, backreference(.absolute(113)), throwsError: .invalid) + parseTest(#"\377"#, backreference(.absolute(377)), throwsError: .invalid) + parseTest(#"\81"#, backreference(.absolute(81)), throwsError: .invalid) + + parseTest(#"\g1"#, backreference(.absolute(1)), throwsError: .invalid) + parseTest(#"\g001"#, backreference(.absolute(1)), throwsError: .invalid) + parseTest(#"\g52"#, backreference(.absolute(52)), throwsError: .invalid) + parseTest(#"\g-01"#, backreference(.relative(-1)), throwsError: .unsupported) + parseTest(#"\g+30"#, backreference(.relative(30)), throwsError: .unsupported) + + parseTest(#"\g{1}"#, backreference(.absolute(1)), throwsError: .invalid) + parseTest(#"\g{001}"#, backreference(.absolute(1)), throwsError: .invalid) + parseTest(#"\g{52}"#, backreference(.absolute(52)), throwsError: .invalid) + parseTest(#"\g{-01}"#, backreference(.relative(-1)), throwsError: .unsupported) + parseTest(#"\g{+30}"#, backreference(.relative(30)), throwsError: .unsupported) + parseTest(#"\k<+4>"#, backreference(.relative(4)), throwsError: .unsupported) + parseTest(#"\k<2>"#, backreference(.absolute(2)), throwsError: .invalid) + parseTest(#"\k'-3'"#, backreference(.relative(-3)), throwsError: .unsupported) + parseTest(#"\k'1'"#, backreference(.absolute(1)), throwsError: .invalid) + + parseTest(#"\k{a0}"#, backreference(.named("a0")), throwsError: .unsupported) + parseTest(#"\k"#, backreference(.named("bc")), throwsError: .unsupported) + parseTest(#"\g{abc}"#, backreference(.named("abc")), throwsError: .unsupported) + parseTest(#"(?P=abc)"#, backreference(.named("abc")), throwsError: .unsupported) // Oniguruma recursion levels. 
- parseTest(#"\k"#, backreference(.named("bc"), recursionLevel: 0)) - parseTest(#"\k"#, backreference(.named("a"), recursionLevel: 0)) - parseTest(#"\k<1+1>"#, backreference(.absolute(1), recursionLevel: 1)) - parseTest(#"\k<3-8>"#, backreference(.absolute(3), recursionLevel: -8)) - parseTest(#"\k'-3-8'"#, backreference(.relative(-3), recursionLevel: -8)) - parseTest(#"\k'bc-8'"#, backreference(.named("bc"), recursionLevel: -8)) - parseTest(#"\k'+3-8'"#, backreference(.relative(3), recursionLevel: -8)) - parseTest(#"\k'+3+8'"#, backreference(.relative(3), recursionLevel: 8)) - - parseTest(#"(?R)"#, subpattern(.recurseWholePattern)) - parseTest(#"(?0)"#, subpattern(.recurseWholePattern)) - parseTest(#"(?1)"#, subpattern(.absolute(1))) - parseTest(#"(?+12)"#, subpattern(.relative(12))) - parseTest(#"(?-2)"#, subpattern(.relative(-2))) - parseTest(#"(?&hello)"#, subpattern(.named("hello"))) - parseTest(#"(?P>P)"#, subpattern(.named("P"))) + parseTest(#"\k"#, backreference(.named("bc"), recursionLevel: 0), throwsError: .unsupported) + parseTest(#"\k"#, backreference(.named("a"), recursionLevel: 0), throwsError: .unsupported) + parseTest(#"\k<1+1>"#, backreference(.absolute(1), recursionLevel: 1), throwsError: .invalid) + parseTest(#"\k<3-8>"#, backreference(.absolute(3), recursionLevel: -8), throwsError: .invalid) + parseTest(#"\k'-3-8'"#, backreference(.relative(-3), recursionLevel: -8), throwsError: .unsupported) + parseTest(#"\k'bc-8'"#, backreference(.named("bc"), recursionLevel: -8), throwsError: .unsupported) + parseTest(#"\k'+3-8'"#, backreference(.relative(3), recursionLevel: -8), throwsError: .unsupported) + parseTest(#"\k'+3+8'"#, backreference(.relative(3), recursionLevel: 8), throwsError: .unsupported) + + parseTest(#"(?R)"#, subpattern(.recurseWholePattern), throwsError: .unsupported) + parseTest(#"(?0)"#, subpattern(.recurseWholePattern), throwsError: .unsupported) + parseTest(#"(?1)"#, subpattern(.absolute(1)), throwsError: .unsupported) + parseTest(#"(?+12)"#, subpattern(.relative(12)), throwsError: .unsupported) + parseTest(#"(?-2)"#, subpattern(.relative(-2)), throwsError: .unsupported) + parseTest(#"(?&hello)"#, subpattern(.named("hello")), throwsError: .unsupported) + parseTest(#"(?P>P)"#, subpattern(.named("P")), throwsError: .unsupported) parseTest(#"[(?R)]"#, charClass("(", "?", "R", ")")) parseTest(#"[(?&a)]"#, charClass("(", "?", "&", "a", ")")) parseTest(#"[(?1)]"#, charClass("(", "?", "1", ")")) - parseTest(#"\g<1>"#, subpattern(.absolute(1))) - parseTest(#"\g<001>"#, subpattern(.absolute(1))) - parseTest(#"\g'52'"#, subpattern(.absolute(52))) - parseTest(#"\g'-01'"#, subpattern(.relative(-1))) - parseTest(#"\g'+30'"#, subpattern(.relative(30))) - parseTest(#"\g'abc'"#, subpattern(.named("abc"))) + parseTest(#"\g<1>"#, subpattern(.absolute(1)), throwsError: .unsupported) + parseTest(#"\g<001>"#, subpattern(.absolute(1)), throwsError: .unsupported) + parseTest(#"\g'52'"#, subpattern(.absolute(52)), throwsError: .unsupported) + parseTest(#"\g'-01'"#, subpattern(.relative(-1)), throwsError: .unsupported) + parseTest(#"\g'+30'"#, subpattern(.relative(30)), throwsError: .unsupported) + parseTest(#"\g'abc'"#, subpattern(.named("abc")), throwsError: .unsupported) // Backreferences are not valid in custom character classes. parseTest(#"[\8]"#, charClass("8")) parseTest(#"[\9]"#, charClass("9")) + // These are valid references. 
+ parseTest(#"()\1"#, concat( + capture(empty()), backreference(.absolute(1)) + ), captures: [.cap]) + parseTest(#"\1()"#, concat( + backreference(.absolute(1)), capture(empty()) + ), captures: [.cap]) + parseTest(#"()()\2"#, concat( + capture(empty()), capture(empty()), backreference(.absolute(2)) + ), captures: [.cap, .cap]) + parseTest(#"()\2()"#, concat( + capture(empty()), backreference(.absolute(2)), capture(empty()) + ), captures: [.cap, .cap]) + // MARK: Character names. parseTest(#"\N{abc}"#, atom(.namedCharacter("abc"))) @@ -1137,7 +1252,7 @@ extension RegexTests { parseTest(#"\N{abc}+"#, oneOrMore(of: atom(.namedCharacter("abc")))) parseTest( #"\N {2}"#, - concat(atom(.escaped(.notNewline)), exactly(2, of: " ")) + concat(atom(.escaped(.notNewline)), exactly(2, of: " ")), throwsError: .unsupported ) parseTest(#"\N{AA}"#, atom(.namedCharacter("AA"))) @@ -1203,13 +1318,13 @@ extension RegexTests { parseTest(#"\p{isAlphabetic}"#, prop(.binary(.alphabetic))) parseTest(#"\p{isAlpha=isFalse}"#, prop(.binary(.alphabetic, value: false))) - parseTest(#"\p{In_Runic}"#, prop(.onigurumaSpecial(.inRunic))) + parseTest(#"\p{In_Runic}"#, prop(.onigurumaSpecial(.inRunic)), throwsError: .unsupported) - parseTest(#"\p{Xan}"#, prop(.pcreSpecial(.alphanumeric))) - parseTest(#"\p{Xps}"#, prop(.pcreSpecial(.posixSpace))) - parseTest(#"\p{Xsp}"#, prop(.pcreSpecial(.perlSpace))) - parseTest(#"\p{Xuc}"#, prop(.pcreSpecial(.universallyNamed))) - parseTest(#"\p{Xwd}"#, prop(.pcreSpecial(.perlWord))) + parseTest(#"\p{Xan}"#, prop(.pcreSpecial(.alphanumeric)), throwsError: .unsupported) + parseTest(#"\p{Xps}"#, prop(.pcreSpecial(.posixSpace)), throwsError: .unsupported) + parseTest(#"\p{Xsp}"#, prop(.pcreSpecial(.perlSpace)), throwsError: .unsupported) + parseTest(#"\p{Xuc}"#, prop(.pcreSpecial(.universallyNamed)), throwsError: .unsupported) + parseTest(#"\p{Xwd}"#, prop(.pcreSpecial(.perlWord)), throwsError: .unsupported) parseTest(#"\p{alnum}"#, prop(.posix(.alnum))) parseTest(#"\p{is_alnum}"#, prop(.posix(.alnum))) @@ -1229,45 +1344,45 @@ extension RegexTests { // MARK: Conditionals parseTest(#"(?(1))"#, conditional( - .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty())) + .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(1)|)"#, conditional( - .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty())) + .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(1)a)"#, conditional( - .groupMatched(ref(1)), trueBranch: "a", falseBranch: empty())) + .groupMatched(ref(1)), trueBranch: "a", falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(1)a|)"#, conditional( - .groupMatched(ref(1)), trueBranch: "a", falseBranch: empty())) + .groupMatched(ref(1)), trueBranch: "a", falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(1)|b)"#, conditional( - .groupMatched(ref(1)), trueBranch: empty(), falseBranch: "b")) + .groupMatched(ref(1)), trueBranch: empty(), falseBranch: "b"), throwsError: .unsupported) parseTest(#"(?(1)a|b)"#, conditional( - .groupMatched(ref(1)), trueBranch: "a", falseBranch: "b")) + .groupMatched(ref(1)), trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) parseTest(#"(?(1)(a|b|c)|d)"#, conditional( .groupMatched(ref(1)), trueBranch: capture(alt("a", "b", "c")), falseBranch: "d" - ), captures: [.opt]) + ), throwsError: .unsupported, captures: [.opt]) parseTest(#"(?(+3))"#, conditional( - .groupMatched(ref(plus: 3)), trueBranch: 
empty(), falseBranch: empty())) + .groupMatched(ref(plus: 3)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(-21))"#, conditional( - .groupMatched(ref(minus: 21)), trueBranch: empty(), falseBranch: empty())) + .groupMatched(ref(minus: 21)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) // Oniguruma recursion levels. parseTest(#"(?(1+1))"#, conditional( .groupMatched(ref(1, recursionLevel: 1)), - trueBranch: empty(), falseBranch: empty()) + trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported ) parseTest(#"(?(-1+1))"#, conditional( .groupMatched(ref(minus: 1, recursionLevel: 1)), - trueBranch: empty(), falseBranch: empty()) + trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported ) parseTest(#"(?(1-3))"#, conditional( .groupMatched(ref(1, recursionLevel: -3)), - trueBranch: empty(), falseBranch: empty()) + trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported ) parseTest(#"(?(+1-3))"#, conditional( .groupMatched(ref(plus: 1, recursionLevel: -3)), - trueBranch: empty(), falseBranch: empty()) + trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported ) parseTest( #"(?)(?(a+5))"#, @@ -1275,7 +1390,7 @@ extension RegexTests { .groupMatched(ref("a", recursionLevel: 5)), trueBranch: empty(), falseBranch: empty() )), - captures: [.named("a")] + throwsError: .unsupported, captures: [.named("a")] ) parseTest( #"(?)(?(a1-5))"#, @@ -1283,50 +1398,50 @@ extension RegexTests { .groupMatched(ref("a1", recursionLevel: -5)), trueBranch: empty(), falseBranch: empty() )), - captures: [.named("a1")] + throwsError: .unsupported, captures: [.named("a1")] ) parseTest(#"(?(1))?"#, zeroOrOne(of: conditional( - .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty()))) + .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty())), throwsError: .unsupported) parseTest(#"(?(R)a|b)"#, conditional( - .recursionCheck, trueBranch: "a", falseBranch: "b")) + .recursionCheck, trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) parseTest(#"(?(R1))"#, conditional( - .groupRecursionCheck(ref(1)), trueBranch: empty(), falseBranch: empty())) + .groupRecursionCheck(ref(1)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(R&abc)a|b)"#, conditional( - .groupRecursionCheck(ref("abc")), trueBranch: "a", falseBranch: "b")) + .groupRecursionCheck(ref("abc")), trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) parseTest(#"(?()a|b)"#, conditional( - .groupMatched(ref("abc")), trueBranch: "a", falseBranch: "b")) + .groupMatched(ref("abc")), trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) parseTest(#"(?('abc')a|b)"#, conditional( - .groupMatched(ref("abc")), trueBranch: "a", falseBranch: "b")) + .groupMatched(ref("abc")), trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) parseTest(#"(?(abc)a|b)"#, conditional( groupCondition(.capture, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - ), captures: [.cap]) + ), throwsError: .unsupported, captures: [.cap]) parseTest(#"(?(?:abc)a|b)"#, conditional( groupCondition(.nonCapture, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - )) + ), throwsError: .unsupported) parseTest(#"(?(?=abc)a|b)"#, conditional( groupCondition(.lookahead, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - )) + ), throwsError: .unsupported) parseTest(#"(?(?!abc)a|b)"#, conditional( groupCondition(.negativeLookahead, concat("a", "b", "c")), trueBranch: "a", 
falseBranch: "b" - )) + ), throwsError: .unsupported) parseTest(#"(?(?<=abc)a|b)"#, conditional( groupCondition(.lookbehind, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - )) + ), throwsError: .unsupported) parseTest(#"(?(?y)(?(xxx)a|b)"#, concat( namedCapture("xxx", "y"), conditional(.groupMatched(ref("xxx")), trueBranch: "a", falseBranch: "b") - ), captures: [.named("xxx")]) + ), throwsError: .unsupported, captures: [.named("xxx")]) parseTest(#"(?(1)(?(2)(?(3)))|a)"#, conditional( .groupMatched(ref(1)), @@ -1356,115 +1471,119 @@ extension RegexTests { trueBranch: empty(), falseBranch: empty()), falseBranch: empty()), - falseBranch: "a")) + falseBranch: "a"), throwsError: .unsupported) parseTest(#"(?(DEFINE))"#, conditional( - .defineGroup, trueBranch: empty(), falseBranch: empty())) + .defineGroup, trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) parseTest(#"(?(VERSION>=3.1))"#, conditional( pcreVersionCheck(.greaterThanOrEqual, 3, 1), - trueBranch: empty(), falseBranch: empty()) + trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported ) parseTest(#"(?(VERSION=0.1))"#, conditional( pcreVersionCheck(.equal, 0, 1), - trueBranch: empty(), falseBranch: empty()) + trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported ) // MARK: Callouts // PCRE callouts - parseTest(#"(?C)"#, pcreCallout(.number(0))) - parseTest(#"(?C0)"#, pcreCallout(.number(0))) - parseTest(#"(?C20)"#, pcreCallout(.number(20))) - parseTest("(?C{abc})", pcreCallout(.string("abc"))) + parseTest(#"(?C)"#, pcreCallout(.number(0)), throwsError: .unsupported) + parseTest(#"(?C0)"#, pcreCallout(.number(0)), throwsError: .unsupported) + parseTest(#"(?C20)"#, pcreCallout(.number(20)), throwsError: .unsupported) + parseTest("(?C{abc})", pcreCallout(.string("abc")), throwsError: .unsupported) for delim in ["`", "'", "\"", "^", "%", "#", "$"] { - parseTest("(?C\(delim)hello\(delim))", pcreCallout(.string("hello"))) + parseTest("(?C\(delim)hello\(delim))", pcreCallout(.string("hello")), + throwsError: .unsupported) } // Oniguruma named callouts - parseTest("(*X)", onigurumaNamedCallout("X")) - parseTest("(*foo[t])", onigurumaNamedCallout("foo", tag: "t")) - parseTest("(*foo[a0]{b})", onigurumaNamedCallout("foo", tag: "a0", args: "b")) - parseTest("(*foo{b})", onigurumaNamedCallout("foo", args: "b")) - parseTest("(*foo[a]{a,b,c})", onigurumaNamedCallout("foo", tag: "a", args: "a", "b", "c")) - parseTest("(*foo{a,b,c})", onigurumaNamedCallout("foo", args: "a", "b", "c")) - parseTest("(*foo{%%$,!!,>>})", onigurumaNamedCallout("foo", args: "%%$", "!!", ">>")) - parseTest("(*foo{a, b, c})", onigurumaNamedCallout("foo", args: "a", " b", " c")) + parseTest("(*X)", onigurumaNamedCallout("X"), throwsError: .unsupported) + parseTest("(*foo[t])", onigurumaNamedCallout("foo", tag: "t"), throwsError: .unsupported) + parseTest("(*foo[a0]{b})", onigurumaNamedCallout("foo", tag: "a0", args: "b"), throwsError: .unsupported) + parseTest("(*foo{b})", onigurumaNamedCallout("foo", args: "b"), throwsError: .unsupported) + parseTest("(*foo[a]{a,b,c})", onigurumaNamedCallout("foo", tag: "a", args: "a", "b", "c"), throwsError: .unsupported) + parseTest("(*foo{a,b,c})", onigurumaNamedCallout("foo", args: "a", "b", "c"), throwsError: .unsupported) + parseTest("(*foo{%%$,!!,>>})", onigurumaNamedCallout("foo", args: "%%$", "!!", ">>"), throwsError: .unsupported) + parseTest("(*foo{a, b, c})", onigurumaNamedCallout("foo", args: "a", " b", " c"), throwsError: .unsupported) // Oniguruma 'of contents' 
callouts - parseTest("(?{x})", onigurumaCalloutOfContents("x")) - parseTest("(?{{{x}}y}}})", onigurumaCalloutOfContents("x}}y")) - parseTest("(?{{{x}}})", onigurumaCalloutOfContents("x")) - parseTest("(?{x}[tag])", onigurumaCalloutOfContents("x", tag: "tag")) - parseTest("(?{x}[tag]<)", onigurumaCalloutOfContents("x", tag: "tag", direction: .inRetraction)) - parseTest("(?{x}X)", onigurumaCalloutOfContents("x", direction: .both)) - parseTest("(?{x}>)", onigurumaCalloutOfContents("x")) - parseTest("(?{\\x})", onigurumaCalloutOfContents("\\x")) - parseTest("(?{\\})", onigurumaCalloutOfContents("\\")) + parseTest("(?{x})", onigurumaCalloutOfContents("x"), throwsError: .unsupported) + parseTest("(?{{{x}}y}}})", onigurumaCalloutOfContents("x}}y"), throwsError: .unsupported) + parseTest("(?{{{x}}})", onigurumaCalloutOfContents("x"), throwsError: .unsupported) + parseTest("(?{x}[tag])", onigurumaCalloutOfContents("x", tag: "tag"), throwsError: .unsupported) + parseTest("(?{x}[tag]<)", onigurumaCalloutOfContents("x", tag: "tag", direction: .inRetraction), throwsError: .unsupported) + parseTest("(?{x}X)", onigurumaCalloutOfContents("x", direction: .both), throwsError: .unsupported) + parseTest("(?{x}>)", onigurumaCalloutOfContents("x"), throwsError: .unsupported) + parseTest("(?{\\x})", onigurumaCalloutOfContents("\\x"), throwsError: .unsupported) + parseTest("(?{\\})", onigurumaCalloutOfContents("\\"), throwsError: .unsupported) // MARK: Backtracking directives - parseTest("(*ACCEPT)?", zeroOrOne(of: backtrackingDirective(.accept))) + parseTest("(*ACCEPT)?", zeroOrOne(of: backtrackingDirective(.accept)), throwsError: .unsupported) parseTest( "(*ACCEPT:a)??", - zeroOrOne(.reluctant, of: backtrackingDirective(.accept, name: "a")) + zeroOrOne(.reluctant, of: backtrackingDirective(.accept, name: "a")), + throwsError: .unsupported ) - parseTest("(*:a)", backtrackingDirective(.mark, name: "a")) - parseTest("(*MARK:a)", backtrackingDirective(.mark, name: "a")) - parseTest("(*F)", backtrackingDirective(.fail)) - parseTest("(*COMMIT)", backtrackingDirective(.commit)) - parseTest("(*SKIP)", backtrackingDirective(.skip)) - parseTest("(*SKIP:SKIP)", backtrackingDirective(.skip, name: "SKIP")) - parseTest("(*PRUNE)", backtrackingDirective(.prune)) - parseTest("(*THEN)", backtrackingDirective(.then)) + parseTest("(*:a)", backtrackingDirective(.mark, name: "a"), throwsError: .unsupported) + parseTest("(*MARK:a)", backtrackingDirective(.mark, name: "a"), throwsError: .unsupported) + parseTest("(*F)", backtrackingDirective(.fail), throwsError: .unsupported) + parseTest("(*COMMIT)", backtrackingDirective(.commit), throwsError: .unsupported) + parseTest("(*SKIP)", backtrackingDirective(.skip), throwsError: .unsupported) + parseTest("(*SKIP:SKIP)", backtrackingDirective(.skip, name: "SKIP"), throwsError: .unsupported) + parseTest("(*PRUNE)", backtrackingDirective(.prune), throwsError: .unsupported) + parseTest("(*THEN)", backtrackingDirective(.then), throwsError: .unsupported) // MARK: Oniguruma absent functions - parseTest("(?~)", absentRepeater(empty())) - parseTest("(?~abc)", absentRepeater(concat("a", "b", "c"))) - parseTest("(?~a+)", absentRepeater(oneOrMore(of: "a"))) - parseTest("(?~~)", absentRepeater("~")) - parseTest("(?~a|b|c)", absentRepeater(alt("a", "b", "c"))) - parseTest("(?~(a))", absentRepeater(capture("a")), captures: []) - parseTest("(?~)*", zeroOrMore(of: absentRepeater(empty()))) - - parseTest("(?~|abc)", absentStopper(concat("a", "b", "c"))) - parseTest("(?~|a+)", absentStopper(oneOrMore(of: 
"a"))) - parseTest("(?~|~)", absentStopper("~")) - parseTest("(?~|(a))", absentStopper(capture("a")), captures: []) - parseTest("(?~|a){2}", exactly(2, of: absentStopper("a"))) - - parseTest("(?~|a|b)", absentExpression("a", "b")) - parseTest("(?~|~|~)", absentExpression("~", "~")) + parseTest("(?~)", absentRepeater(empty()), throwsError: .unsupported) + parseTest("(?~abc)", absentRepeater(concat("a", "b", "c")), throwsError: .unsupported) + parseTest("(?~a+)", absentRepeater(oneOrMore(of: "a")), throwsError: .unsupported) + parseTest("(?~~)", absentRepeater("~"), throwsError: .unsupported) + parseTest("(?~a|b|c)", absentRepeater(alt("a", "b", "c")), throwsError: .unsupported) + parseTest("(?~(a))", absentRepeater(capture("a")), throwsError: .unsupported, captures: []) + parseTest("(?~)*", zeroOrMore(of: absentRepeater(empty())), throwsError: .unsupported) + + parseTest("(?~|abc)", absentStopper(concat("a", "b", "c")), throwsError: .unsupported) + parseTest("(?~|a+)", absentStopper(oneOrMore(of: "a")), throwsError: .unsupported) + parseTest("(?~|~)", absentStopper("~"), throwsError: .unsupported) + parseTest("(?~|(a))", absentStopper(capture("a")), throwsError: .unsupported, captures: []) + parseTest("(?~|a){2}", exactly(2, of: absentStopper("a")), throwsError: .unsupported) + + parseTest("(?~|a|b)", absentExpression("a", "b"), throwsError: .unsupported) + parseTest("(?~|~|~)", absentExpression("~", "~"), throwsError: .unsupported) parseTest("(?~|(a)|(?:b))", absentExpression(capture("a"), nonCapture("b")), - captures: []) + throwsError: .unsupported, captures: []) parseTest("(?~|(a)|(?:(b)|c))", absentExpression( capture("a"), nonCapture(alt(capture("b"), "c")) - ), captures: [.opt]) - parseTest("(?~|a|b)?", zeroOrOne(of: absentExpression("a", "b"))) + ), throwsError: .unsupported, captures: [.opt]) + parseTest("(?~|a|b)?", zeroOrOne(of: absentExpression("a", "b")), throwsError: .unsupported) - parseTest("(?~|)", absentRangeClear()) + parseTest("(?~|)", absentRangeClear(), throwsError: .unsupported) // TODO: It's not really clear what this means, but Oniguruma parses it... // Maybe we should diagnose it? - parseTest("(?~|)+", oneOrMore(of: absentRangeClear())) + parseTest("(?~|)+", oneOrMore(of: absentRangeClear()), throwsError: .unsupported) // MARK: Global matching options parseTest("(*CR)(*UTF)(*LIMIT_DEPTH=3)", ast( empty(), opts: .newlineMatching(.carriageReturnOnly), .utfMode, .limitDepth(.init(faking: 3)) - )) + ), throwsError: .unsupported) parseTest( - "(*BSR_UNICODE)3", ast("3", opts: .newlineSequenceMatching(.anyUnicode))) + "(*BSR_UNICODE)3", ast("3", opts: .newlineSequenceMatching(.anyUnicode)), + throwsError: .unsupported) parseTest( "(*BSR_ANYCRLF)", ast( - empty(), opts: .newlineSequenceMatching(.anyCarriageReturnOrLinefeed))) + empty(), opts: .newlineSequenceMatching(.anyCarriageReturnOrLinefeed)), + throwsError: .unsupported) // TODO: Diagnose on multiple line matching modes? 
parseTest( @@ -1472,7 +1591,7 @@ extension RegexTests { ast(empty(), opts: [ .carriageReturnOnly, .linefeedOnly, .carriageAndLinefeedOnly, .anyCarriageReturnOrLinefeed, .anyUnicode, .nulCharacter - ].map { .newlineMatching($0) })) + ].map { .newlineMatching($0) }), throwsError: .unsupported) parseTest( """ @@ -1485,7 +1604,7 @@ extension RegexTests { .limitMatch(.init(faking: 2)), .notEmpty, .notEmptyAtStart, .noAutoPossess, .noDotStarAnchor, .noJIT, .noStartOpt, .utfMode, .unicodeProperties - ) + ), throwsError: .unsupported ) parseTest("[(*CR)]", charClass("(", "*", "C", "R", ")")) @@ -1699,7 +1818,7 @@ extension RegexTests { # h """, ast(empty(), opts: .newlineMatching(.carriageReturnOnly)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1710,7 +1829,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.carriageReturnOnly)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1721,7 +1840,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.linefeedOnly)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1732,7 +1851,7 @@ extension RegexTests { # h """, ast(empty(), opts: .newlineMatching(.carriageAndLinefeedOnly)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1743,7 +1862,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.carriageAndLinefeedOnly)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1754,7 +1873,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1765,7 +1884,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1776,7 +1895,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1787,7 +1906,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyUnicode)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1808,7 +1927,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyUnicode)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1819,7 +1938,7 @@ extension RegexTests { # h """, ast(concat("e", "f"), opts: .newlineMatching(.nulCharacter)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1830,7 +1949,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.nulCharacter)), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) parseTest( """ @@ -1844,7 +1963,7 @@ extension RegexTests { opts: .newlineMatching(.carriageReturnOnly), .newlineMatching(.nulCharacter) ), - syntax: .extendedSyntax + throwsError: .unsupported, syntax: .extendedSyntax ) // MARK: Parse with delimiters @@ -1947,30 +2066,37 @@ extension 
RegexTests { #"re'(?'a_bcA0'\')'"#, namedCapture("a_bcA0", "'")) parseWithDelimitersTest( #"re'(?'a_bcA0-c1A'x*)'"#, - balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x"))) + balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x")), + throwsError: .unsupported) parseWithDelimitersTest( #"rx' (?'a_bcA0' a b)'"#, concat(namedCapture("a_bcA0", concat("a", "b")))) parseWithDelimitersTest( #"re'(?('a_bcA0')x|y)'"#, conditional( - .groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y")) + .groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y"), + throwsError: .unsupported + ) parseWithDelimitersTest( #"re'(?('+20')\')'"#, conditional( - .groupMatched(ref(plus: 20)), trueBranch: "'", falseBranch: empty())) - + .groupMatched(ref(plus: 20)), trueBranch: "'", falseBranch: empty()), + throwsError: .unsupported + ) parseWithDelimitersTest( - #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A")))) + #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))), throwsError: .unsupported) parseWithDelimitersTest( - #"re'\k'+2-1''"#, backreference(.relative(2), recursionLevel: -1)) + #"re'\k'+2-1''"#, backreference(.relative(2), recursionLevel: -1), + throwsError: .unsupported + ) parseWithDelimitersTest( - #"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A")))) + #"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A"))), throwsError: .unsupported) parseWithDelimitersTest( - #"re'\g'-1'\''"#, concat(subpattern(.relative(-1)), "'")) + #"re'\g'-1'\''"#, concat(subpattern(.relative(-1)), "'"), throwsError: .unsupported) parseWithDelimitersTest( - #"re'(?C'a*b\c 🔥_ ;')'"#, pcreCallout(.string(#"a*b\c 🔥_ ;"#))) + #"re'(?C'a*b\c 🔥_ ;')'"#, pcreCallout(.string(#"a*b\c 🔥_ ;"#)), + throwsError: .unsupported) // Fine, because we don't end up skipping. 
delimiterLexingTest(#"re'(?'"#) @@ -2314,6 +2440,8 @@ extension RegexTests { diagnosticTest("[[::]]", .emptyProperty) diagnosticTest("[[:=:]]", .emptyProperty) + diagnosticTest(#"|([\d-c])?"#, .invalidCharacterClassRangeOperand) + // MARK: Bad escapes diagnosticTest("\\", .expectedEscape) @@ -2419,6 +2547,7 @@ extension RegexTests { diagnosticTest("*?", .quantifierRequiresOperand("*?")) diagnosticTest("{5}", .quantifierRequiresOperand("{5}")) diagnosticTest("{1,3}", .quantifierRequiresOperand("{1,3}")) + diagnosticTest("a{3,2}", .invalidQuantifierRange(3, 2)) // MARK: Unicode scalars @@ -2458,6 +2587,16 @@ extension RegexTests { diagnosticTest(#"\k"#, .expectedNumber("", kind: .decimal)) diagnosticTest(#"\k<1+>"#, .expectedNumber("", kind: .decimal)) + diagnosticTest(#"()\k<1+1>"#, .unsupported("recursion level")) + diagnosticTest(#"()\k<1-1>"#, .unsupported("recursion level")) + + diagnosticTest(#"\k<0>"#, .cannotReferToWholePattern) + diagnosticTest(#"\1"#, .invalidReference(1)) + diagnosticTest(#"(?:)\1"#, .invalidReference(1)) + diagnosticTest(#"()\2"#, .invalidReference(2)) + diagnosticTest(#"\2()"#, .invalidReference(2)) + diagnosticTest(#"(?:)()\2"#, .invalidReference(2)) + diagnosticTest(#"(?:)(?:)\2"#, .invalidReference(2)) // MARK: Conditionals @@ -2560,5 +2699,7 @@ extension RegexTests { func testCompilerInterfaceDiagnostics() { compilerInterfaceDiagnosticMessageTest( "#/[x*/#", "cannot parse regular expression: expected ']'") + compilerInterfaceDiagnosticMessageTest( + "/a{3,2}/", "cannot parse regular expression: range lower bound '3' must be less than or equal to upper bound '2'") } } diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index eff9f9b4e..145087ee7 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -444,7 +444,7 @@ extension UTS18Tests { func testIndividuallyNamedCharacters_XFail() { XCTExpectFailure("Need to support named chars in custom character classes") { - XCTFail("\(regex(#"[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+"#))") + XCTFail(#"[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+"#) // XCTAssertTrue("^\u{3B1}\u{3B2}$".contains(#/[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+/#)) } From 466b375a82627c51a02d1d1c30ef15f9b0aeaf34 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 9 May 2022 17:15:44 +0100 Subject: [PATCH 13/24] Validate capture lists Begin storing source location on capture lists, and start erroring on duplicate named captures. --- .../Regex/Parse/CaptureList.swift | 15 +++++++---- Sources/_RegexParser/Regex/Parse/Sema.swift | 12 +++++++++ Sources/_StringProcessing/Regex/DSLTree.swift | 2 +- Tests/RegexTests/CaptureTests.swift | 26 ++++++++++++------- Tests/RegexTests/ParseTests.swift | 10 +++++-- 5 files changed, 48 insertions(+), 17 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/CaptureList.swift b/Sources/_RegexParser/Regex/Parse/CaptureList.swift index d112b2010..0287e7337 100644 --- a/Sources/_RegexParser/Regex/Parse/CaptureList.swift +++ b/Sources/_RegexParser/Regex/Parse/CaptureList.swift @@ -26,15 +26,18 @@ extension CaptureList { public var name: String? public var type: Any.Type? public var optionalDepth: Int + public var location: SourceLocation public init( name: String? = nil, type: Any.Type? 
= nil, - optionalDepth: Int + optionalDepth: Int, + _ location: SourceLocation ) { self.name = name self.type = type self.optionalDepth = optionalDepth + self.location = location } } } @@ -61,13 +64,14 @@ extension AST.Node { case let .group(g): switch g.kind.value { case .capture: - list.append(.init(optionalDepth: nesting)) + list.append(.init(optionalDepth: nesting, g.location)) case .namedCapture(let name): - list.append(.init(name: name.value, optionalDepth: nesting)) + list.append(.init(name: name.value, optionalDepth: nesting, g.location)) case .balancedCapture(let b): - list.append(.init(name: b.name?.value, optionalDepth: nesting)) + list.append(.init(name: b.name?.value, optionalDepth: nesting, + g.location)) default: break } @@ -124,7 +128,8 @@ extension CaptureList.Capture: Equatable { public static func == (lhs: Self, rhs: Self) -> Bool { lhs.name == rhs.name && lhs.optionalDepth == rhs.optionalDepth && - lhs.type == rhs.type + lhs.type == rhs.type && + lhs.location == rhs.location } } extension CaptureList: Equatable {} diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index 32859812c..9d3f037f7 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -34,6 +34,7 @@ extension RegexValidator { for opt in ast.globalOptions?.options ?? [] { try validateGlobalMatchingOption(opt) } + try validateCaptures() try validateNode(ast.root) } @@ -59,6 +60,17 @@ extension RegexValidator { } } + func validateCaptures() throws { + // TODO: Should this be validated when creating the capture list? + var usedNames = Set() + for capture in captures.captures { + guard let name = capture.name else { continue } + guard usedNames.insert(name).inserted else { + throw error(.duplicateNamedCapture(name), at: capture.location) + } + } + } + func validateReference(_ ref: AST.Reference) throws { switch ref.kind { case .absolute(let i): diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index b279c08e4..ff057f2ee 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -472,7 +472,7 @@ extension DSLTree.Node { list.append(.init( name: name, type: child.valueCaptureType?.base, - optionalDepth: nesting)) + optionalDepth: nesting, .fake)) child._addCaptures(to: &list, optionalNesting: nesting) case let .nonCapturingGroup(kind, child): diff --git a/Tests/RegexTests/CaptureTests.swift b/Tests/RegexTests/CaptureTests.swift index 45be547db..9efbf2f76 100644 --- a/Tests/RegexTests/CaptureTests.swift +++ b/Tests/RegexTests/CaptureTests.swift @@ -16,36 +16,44 @@ import XCTest extension CaptureList.Capture { static var cap: Self { - return Self(optionalDepth: 0) + return Self(optionalDepth: 0, .fake) } static var opt: Self { - return Self(optionalDepth: 1) + return Self(optionalDepth: 1, .fake) } static var opt_opt: Self { - return Self(optionalDepth: 2) + return Self(optionalDepth: 2, .fake) } static var opt_opt_opt: Self { - return Self(optionalDepth: 3) + return Self(optionalDepth: 3, .fake) } static var opt_opt_opt_opt: Self { - return Self(optionalDepth: 4) + return Self(optionalDepth: 4, .fake) } static var opt_opt_opt_opt_opt: Self { - return Self(optionalDepth: 5) + return Self(optionalDepth: 5, .fake) } static var opt_opt_opt_opt_opt_opt: Self { - return Self(optionalDepth: 6) + return Self(optionalDepth: 6, .fake) } static func named(_ name: String, opt: Int = 0) -> Self { - return Self(name: 
name, optionalDepth: opt) + return Self(name: name, optionalDepth: opt, .fake) } } extension CaptureList { static func caps(count: Int) -> Self { Self(Array(repeating: .cap, count: count)) } + + var withoutLocs: Self { + var copy = self + for idx in copy.captures.indices { + copy.captures[idx].location = .fake + } + return copy + } } extension StructuredCapture { @@ -151,7 +159,7 @@ func captureTest( line: UInt = #line ) { let ast = try! parse(regex, .semantic, .traditional) - let capList = ast.root._captureList + let capList = ast.root._captureList.withoutLocs guard capList == expected else { XCTFail(""" Expected: diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 0ff96fa0b..b6decf437 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -94,7 +94,7 @@ func parseTest( file: file, line: line) return } - let captures = ast.captureList + let captures = ast.captureList.withoutLocs guard captures == expectedCaptures else { XCTFail(""" @@ -872,7 +872,7 @@ extension RegexTests { parseTest( "(?|(?a)|(?b))", nonCaptureReset(alt(namedCapture("x", "a"), namedCapture("x", "b"))), - throwsError: .unsupported, captures: [.named("x", opt: 1), .named("x", opt: 1)] + throwsError: .invalid, captures: [.named("x", opt: 1), .named("x", opt: 1)] ) // TODO: Reject mismatched names? @@ -2539,6 +2539,12 @@ extension RegexTests { diagnosticTest("(?x)(? : )", .unknownGroupKind("? ")) + diagnosticTest("(?)(?)", .duplicateNamedCapture("x")) + diagnosticTest("(?)|(?)", .duplicateNamedCapture("x")) + diagnosticTest("((?))(?)", .duplicateNamedCapture("x")) + diagnosticTest("(|(?))(?)", .duplicateNamedCapture("x")) + diagnosticTest("(?)(?)(?)", .duplicateNamedCapture("x")) + // MARK: Quantifiers diagnosticTest("*", .quantifierRequiresOperand("*")) From c95e8621dc9bfd3aadde0867ed7646b9335ec9a1 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 9 May 2022 20:45:06 +0100 Subject: [PATCH 14/24] Address review feedback - Make `\h` and `\H` supported for now - Check character class ranges - Diagnose unquantifiable escape sequences --- Sources/_RegexParser/Regex/AST/Atom.swift | 19 +++++++++++ .../Regex/Parse/Diagnostics.swift | 9 ++++-- Sources/_RegexParser/Regex/Parse/Parse.swift | 3 -- Sources/_RegexParser/Regex/Parse/Sema.swift | 21 ++++++------ Tests/RegexTests/ParseTests.swift | 32 +++++++++++++++---- 5 files changed, 62 insertions(+), 22 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index d6062115a..9b0f1cb2e 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -668,6 +668,23 @@ extension AST.Atom.EscapedBuiltin { return nil } } + + public var isQuantifiable: Bool { + switch self { + case .alarm, .escape, .formfeed, .newline, .carriageReturn, .tab, + .singleDataUnit, .decimalDigit, .notDecimalDigit, .horizontalWhitespace, + .notHorizontalWhitespace, .notNewline, .newlineSequence, .whitespace, + .notWhitespace, .verticalTab, .notVerticalTab, .wordCharacter, + .notWordCharacter, .backspace, .graphemeCluster, .trueAnychar: + return true + + case .wordBoundary, .notWordBoundary, .startOfSubject, + .endOfSubjectBeforeNewline, .endOfSubject, + .firstMatchingPositionInSubject, .resetStartOfMatch, .textSegment, + .notTextSegment: + return false + } + } } extension AST.Atom { @@ -749,6 +766,8 @@ extension AST.Atom { case .changeMatchingOptions: return false // TODO: Are callouts quantifiable? 
+ case .escaped(let esc): + return esc.isQuantifiable default: return true } diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index 7a8dfe771..0054ae6b6 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -45,7 +45,6 @@ enum ParseError: Error, Hashable { case cannotReferToWholePattern - case notQuantifiable case quantifierRequiresOperand(String) case backtrackingDirectiveMustHaveName(String) @@ -83,6 +82,8 @@ enum ParseError: Error, Hashable { case duplicateNamedCapture(String) case invalidCharacterClassRangeOperand case invalidQuantifierRange(Int, Int) + case invalidCharacterRange(from: Character, to: Character) + case notQuantifiable } extension IdentifierKind { @@ -125,8 +126,6 @@ extension ParseError: CustomStringConvertible { return "invalid escape sequence '\\\(c)'" case .cannotReferToWholePattern: return "cannot refer to whole pattern here" - case .notQuantifiable: - return "expression is not quantifiable" case .quantifierRequiresOperand(let q): return "quantifier '\(q)' must appear after expression" case .backtrackingDirectiveMustHaveName(let b): @@ -191,6 +190,10 @@ extension ParseError: CustomStringConvertible { return "group named '\(str)' already exists" case let .invalidQuantifierRange(lhs, rhs): return "range lower bound '\(lhs)' must be less than or equal to upper bound '\(rhs)'" + case let .invalidCharacterRange(from: lhs, to: rhs): + return "character '\(lhs)' must compare less than or equal to '\(rhs)'" + case .notQuantifiable: + return "expression is not quantifiable" } } } diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 2d33e4d7e..112f32358 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -227,9 +227,6 @@ extension Parser { if let (amt, kind, trivia) = try source.lexQuantifier(context: context) { let location = loc(_start) - guard operand.isQuantifiable else { - throw Source.LocatedError(ParseError.notQuantifiable, location) - } result.append(.quantification( .init(amt, kind, operand, location, trivia: trivia))) } else { diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index 9d3f037f7..f9f2b996a 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -182,17 +182,15 @@ extension RegexValidator { _ esc: AST.Atom.EscapedBuiltin, at loc: SourceLocation ) throws { switch esc { - case .resetStartOfMatch, .singleDataUnit, .horizontalWhitespace, - .notHorizontalWhitespace, .verticalTab, .notVerticalTab, + case .resetStartOfMatch, .singleDataUnit, .verticalTab, .notVerticalTab, // '\N' needs to be emitted using 'emitAny'. .notNewline: throw error(.unsupported("'\\\(esc.character)'"), at: loc) // Character classes. case .decimalDigit, .notDecimalDigit, .whitespace, .notWhitespace, - .wordCharacter, .notWordCharacter, .graphemeCluster, .trueAnychar: - // TODO: What about scalar matching mode for .graphemeCluster? We - // currently crash at runtime. 
+ .wordCharacter, .notWordCharacter, .graphemeCluster, .trueAnychar, + .horizontalWhitespace, .notHorizontalWhitespace: break case .newlineSequence: @@ -271,18 +269,20 @@ extension RegexValidator { throw error(.invalidCharacterClassRangeOperand, at: rhs.location) } - guard lhs.literalCharacterValue != nil else { + guard let lhsChar = lhs.literalCharacterValue else { throw error( .unsupported("character class range operand"), at: lhs.location) } - guard rhs.literalCharacterValue != nil else { + guard let rhsChar = rhs.literalCharacterValue else { throw error( .unsupported("character class range operand"), at: rhs.location) } - // TODO: Validate lhs <= rhs? That may require knowledge of case - // insensitivity though. + guard lhsChar <= rhsChar else { + throw error( + .invalidCharacterRange(from: lhsChar, to: rhsChar), at: range.dashLoc) + } } func validateCharacterClassMember( @@ -341,6 +341,9 @@ extension RegexValidator { func validateQuantification(_ quant: AST.Quantification) throws { try validateNode(quant.child) + guard quant.child.isQuantifiable else { + throw error(.notQuantifiable, at: quant.child.location) + } switch quant.amount.value { case .range(let lhs, let rhs): guard lhs.value <= rhs.value else { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index b6decf437..9dfcff99e 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -503,6 +503,8 @@ extension RegexTests { parseTest("[-a-]", charClass("-", "a", "-")) parseTest("[a-z]", charClass(range_m("a", "z"))) + parseTest("[a-a]", charClass(range_m("a", "a"))) + parseTest("[B-a]", charClass(range_m("B", "a"))) // FIXME: AST builder helpers for custom char class types parseTest("[a-d--a-c]", charClass( @@ -2442,6 +2444,11 @@ extension RegexTests { diagnosticTest(#"|([\d-c])?"#, .invalidCharacterClassRangeOperand) + diagnosticTest(#"[_-A]"#, .invalidCharacterRange(from: "_", to: "A")) + diagnosticTest(#"(?i)[_-A]"#, .invalidCharacterRange(from: "_", to: "A")) + diagnosticTest(#"[c-b]"#, .invalidCharacterRange(from: "c", to: "b")) + diagnosticTest(#"[\u{66}-\u{65}]"#, .invalidCharacterRange(from: "\u{66}", to: "\u{65}")) + // MARK: Bad escapes diagnosticTest("\\", .expectedEscape) @@ -2555,6 +2562,17 @@ extension RegexTests { diagnosticTest("{1,3}", .quantifierRequiresOperand("{1,3}")) diagnosticTest("a{3,2}", .invalidQuantifierRange(3, 2)) + // These are not quantifiable. 
+ diagnosticTest(#"\b?"#, .notQuantifiable) + diagnosticTest(#"\B*"#, .notQuantifiable) + diagnosticTest(#"\A+"#, .notQuantifiable) + diagnosticTest(#"\Z??"#, .notQuantifiable) + diagnosticTest(#"\G*?"#, .notQuantifiable) + diagnosticTest(#"\z+?"#, .notQuantifiable) + diagnosticTest(#"\K{1}"#, .unsupported(#"'\K'"#)) + diagnosticTest(#"\y{2,5}"#, .notQuantifiable) + diagnosticTest(#"\Y{3,}"#, .notQuantifiable) + // MARK: Unicode scalars diagnosticTest(#"\u{G}"#, .expectedNumber("G", kind: .hex)) @@ -2641,13 +2659,13 @@ extension RegexTests { diagnosticTest("(*MARK)", .backtrackingDirectiveMustHaveName("MARK")) diagnosticTest("(*:)", .expectedNonEmptyContents) - diagnosticTest("(*MARK:a)?", .notQuantifiable) - diagnosticTest("(*FAIL)+", .notQuantifiable) - diagnosticTest("(*COMMIT:b)*", .notQuantifiable) - diagnosticTest("(*PRUNE:a)??", .notQuantifiable) - diagnosticTest("(*SKIP:a)*?", .notQuantifiable) - diagnosticTest("(*F)+?", .notQuantifiable) - diagnosticTest("(*:a){2}", .notQuantifiable) + diagnosticTest("(*MARK:a)?", .unsupported("backtracking directive")) + diagnosticTest("(*FAIL)+", .unsupported("backtracking directive")) + diagnosticTest("(*COMMIT:b)*", .unsupported("backtracking directive")) + diagnosticTest("(*PRUNE:a)??", .unsupported("backtracking directive")) + diagnosticTest("(*SKIP:a)*?", .unsupported("backtracking directive")) + diagnosticTest("(*F)+?", .unsupported("backtracking directive")) + diagnosticTest("(*:a){2}", .unsupported("backtracking directive")) // MARK: Oniguruma absent functions From c16e389b384510cb7fad84ecd19484a539920cba Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Mon, 9 May 2022 18:14:59 -0500 Subject: [PATCH 15/24] Implement \R, \v, \h for character/scalar modes (#384) Implement \R, \v, \h for character/scalar modes and audit assertions and anchors for semantic level. --- Sources/_RegexParser/Regex/Parse/Sema.swift | 5 +-- Sources/_StringProcessing/ByteCodeGen.swift | 32 +++++++++++++++---- .../Unicode/ScalarProps.swift | 16 ++++++++++ .../_CharacterClassModel.swift | 28 ++++++++++------ Tests/RegexTests/UTS18Tests.swift | 30 ++++++++++++++--- 5 files changed, 88 insertions(+), 23 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index f9f2b996a..263902a8e 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -182,7 +182,7 @@ extension RegexValidator { _ esc: AST.Atom.EscapedBuiltin, at loc: SourceLocation ) throws { switch esc { - case .resetStartOfMatch, .singleDataUnit, .verticalTab, .notVerticalTab, + case .resetStartOfMatch, .singleDataUnit, // '\N' needs to be emitted using 'emitAny'. .notNewline: throw error(.unsupported("'\\\(esc.character)'"), at: loc) @@ -190,7 +190,8 @@ extension RegexValidator { // Character classes. 
case .decimalDigit, .notDecimalDigit, .whitespace, .notWhitespace, .wordCharacter, .notWordCharacter, .graphemeCluster, .trueAnychar, - .horizontalWhitespace, .notHorizontalWhitespace: + .horizontalWhitespace, .notHorizontalWhitespace, + .verticalTab, .notVerticalTab: break case .newlineSequence: diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 2131d1eb5..d30cab209 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -80,10 +80,16 @@ extension Compiler.ByteCodeGen { } case .endOfSubjectBeforeNewline: - builder.buildAssert { (input, pos, bounds) in + builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in if pos == input.endIndex { return true } - return input.index(after: pos) == input.endIndex - && input[pos].isNewline + switch semanticLevel { + case .graphemeCluster: + return input.index(after: pos) == input.endIndex + && input[pos].isNewline + case .unicodeScalar: + return input.unicodeScalars.index(after: pos) == input.endIndex + && input.unicodeScalars[pos].isNewline + } } case .endOfSubject: @@ -115,8 +121,14 @@ extension Compiler.ByteCodeGen { case .startOfLine: if options.anchorsMatchNewlines { - builder.buildAssert { (input, pos, bounds) in - pos == input.startIndex || input[input.index(before: pos)].isNewline + builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in + if pos == input.startIndex { return true } + switch semanticLevel { + case .graphemeCluster: + return input[input.index(before: pos)].isNewline + case .unicodeScalar: + return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline + } } } else { builder.buildAssert { (input, pos, bounds) in @@ -126,8 +138,14 @@ extension Compiler.ByteCodeGen { case .endOfLine: if options.anchorsMatchNewlines { - builder.buildAssert { (input, pos, bounds) in - pos == input.endIndex || input[pos].isNewline + builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in + if pos == input.endIndex { return true } + switch semanticLevel { + case .graphemeCluster: + return input[pos].isNewline + case .unicodeScalar: + return input.unicodeScalars[pos].isNewline + } } } else { builder.buildAssert { (input, pos, bounds) in diff --git a/Sources/_StringProcessing/Unicode/ScalarProps.swift b/Sources/_StringProcessing/Unicode/ScalarProps.swift index 52a870357..0894fa572 100644 --- a/Sources/_StringProcessing/Unicode/ScalarProps.swift +++ b/Sources/_StringProcessing/Unicode/ScalarProps.swift @@ -46,3 +46,19 @@ extension Unicode.Script { return result } } + +extension UnicodeScalar { + var isHorizontalWhitespace: Bool { + value == 0x09 || properties.generalCategory == .spaceSeparator + } + + var isNewline: Bool { + switch value { + case 0x000A...0x000D /* LF ... 
CR */: return true + case 0x0085 /* NEXT LINE (NEL) */: return true + case 0x2028 /* LINE SEPARATOR */: return true + case 0x2029 /* PARAGRAPH SEPARATOR */: return true + default: return false + } + } +} diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index 27a24cf46..85dd1ca37 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -178,15 +178,18 @@ public struct _CharacterClassModel: Hashable { matched = c.isNumber && (c.isASCII || !options.usesASCIIDigits) case .hexDigit: matched = c.isHexDigit && (c.isASCII || !options.usesASCIIDigits) - case .horizontalWhitespace: fatalError("Not implemented") - case .newlineSequence: - matched = c.isNewline && (c.isASCII || !options.usesASCIISpaces) - case .verticalWhitespace: fatalError("Not implemented") + case .horizontalWhitespace: + matched = c.unicodeScalars.first?.isHorizontalWhitespace == true + && (c.isASCII || !options.usesASCIISpaces) + case .newlineSequence, .verticalWhitespace: + matched = c.unicodeScalars.first?.isNewline == true + && (c.isASCII || !options.usesASCIISpaces) case .whitespace: matched = c.isWhitespace && (c.isASCII || !options.usesASCIISpaces) case .word: matched = c.isWordCharacter && (c.isASCII || !options.usesASCIIWord) - case .custom(let set): matched = set.any { $0.matches(c, with: options) } + case .custom(let set): + matched = set.any { $0.matches(c, with: options) } } if isInverted { matched.toggle() @@ -206,14 +209,21 @@ public struct _CharacterClassModel: Hashable { matched = c.properties.numericType != nil && (c.isASCII || !options.usesASCIIDigits) case .hexDigit: matched = Character(c).isHexDigit && (c.isASCII || !options.usesASCIIDigits) - case .horizontalWhitespace: fatalError("Not implemented") - case .newlineSequence: fatalError("Not implemented") - case .verticalWhitespace: fatalError("Not implemented") + case .horizontalWhitespace: + matched = c.isHorizontalWhitespace && (c.isASCII || !options.usesASCIISpaces) + case .verticalWhitespace: + matched = c.isNewline && (c.isASCII || !options.usesASCIISpaces) + case .newlineSequence: + matched = c.isNewline && (c.isASCII || !options.usesASCIISpaces) + if c == "\r" && nextIndex != str.endIndex && str.unicodeScalars[nextIndex] == "\n" { + str.unicodeScalars.formIndex(after: &nextIndex) + } case .whitespace: matched = c.properties.isWhitespace && (c.isASCII || !options.usesASCIISpaces) case .word: matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !options.usesASCIIWord) - case .custom: fatalError("Not supported") + case .custom(let set): + matched = set.any { $0.matches(Character(c), with: options) } } if isInverted { matched.toggle() diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index 145087ee7..d13b47b8d 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -22,6 +22,14 @@ import XCTest @testable // for internal `matches(of:)` import _StringProcessing +extension UnicodeScalar { + var value4Digits: String { + let valueString = String(value, radix: 16, uppercase: true) + if valueString.count >= 4 { return valueString } + return String(repeating: "0", count: 4 - valueString.count) + valueString + } +} + class UTS18Tests: XCTestCase { var input: String { "ABCdefghîøu\u{308}\u{FFF0} -–—[]123" @@ -262,21 +270,33 @@ extension UTS18Tests { 09\u{85}\ 10\u{2028}\ 11\u{2029}\ - + 12 """ // Check the input counts var lines = 
lineInput.matches(of: regex(#"\d{2}"#)) - XCTAssertEqual(lines.count, 11) + XCTAssertEqual(lines.count, 12) // Test \R - newline sequence - lines = lineInput.matches(of: regex(#"\d{2}\R"#)) + lines = lineInput.matches(of: regex(#"\d{2}\R^"#).anchorsMatchLineEndings()) + XCTAssertEqual(lines.count, 11) + // Test \v - vertical space + lines = lineInput.matches(of: regex(#"\d{2}\v^"#).anchorsMatchLineEndings()) XCTAssertEqual(lines.count, 11) // Test anchors as line boundaries lines = lineInput.matches(of: regex(#"^\d{2}$"#).anchorsMatchLineEndings()) - XCTAssertEqual(lines.count, 11) + XCTAssertEqual(lines.count, 12) // Test that dot does not match line endings lines = lineInput.matches(of: regex(#".+"#)) - XCTAssertEqual(lines.count, 11) + XCTAssertEqual(lines.count, 12) + // Unicode scalar semantics - \R still matches all, including \r\n sequence + lines = lineInput.matches( + of: regex(#"\d{2}\R(?=\d)"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings()) + XCTAssertEqual(lines.count, 11) + // Unicode scalar semantics - \v matches all except for \r\n sequence + lines = lineInput.matches( + of: regex(#"\d{2}\v(?=\d)"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings()) + XCTAssertEqual(lines.count, 10) + // Does not contain an empty line XCTAssertFalse(lineInput.contains(regex(#"^$"#))) // Does contain an empty line (between \n and \r, which are reversed here) From c13980f751ed19d0b878854117d2176543842a0e Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Mon, 9 May 2022 18:16:37 -0500 Subject: [PATCH 16/24] De-deprecate MatchingOptions.matchLevel (#390) Removing this deprecation warning, as it's just generating noise. We may still eventually want to remove _CharacterClassModel.matchLevel along with other refactoring in the future. --- Sources/_StringProcessing/MatchingOptions.swift | 1 - 1 file changed, 1 deletion(-) diff --git a/Sources/_StringProcessing/MatchingOptions.swift b/Sources/_StringProcessing/MatchingOptions.swift index 34a6e6f9a..f5c554bdc 100644 --- a/Sources/_StringProcessing/MatchingOptions.swift +++ b/Sources/_StringProcessing/MatchingOptions.swift @@ -117,7 +117,6 @@ extension MatchingOptions { // Deprecated CharacterClass.MatchLevel API extension MatchingOptions { - @available(*, deprecated) var matchLevel: _CharacterClassModel.MatchLevel { switch semanticLevel { case .graphemeCluster: From 61965c35426d8de5f6a1a3c71a02724afea00fff Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 10 May 2022 11:31:30 +0100 Subject: [PATCH 17/24] Restrict character property fuzzy matching to "pattern whitespace" I wasn't aware of this Unicode property when initially implementing this. It's a more restricted set of whitespace that Unicode reccommends for parsing patterns. It's the same set of whitespace used for extended syntax. UAX44-LM3 itself doesn't appear to specify the exact set of whitespace to match against, but this is no more restrictive than the engines I'm aware of. 
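For illustration, a minimal standalone sketch of the UAX44-LM3-style loose matching described above. It is illustrative only: the parser uses an internal `isPatternWhitespace` helper, so the explicit Pattern_White_Space set below (U+0009..U+000D, U+0020, U+0085, U+200E, U+200F, U+2028, U+2029) is spelled out here as an assumption rather than the library API, and the real implementation also strips an "is" prefix.

    // Sketch only: UAX44-LM3-style normalization of a property name.
    let patternWhitespace: Set<Character> = [
      "\u{09}", "\u{0A}", "\u{0B}", "\u{0C}", "\u{0D}", "\u{20}",
      "\u{85}", "\u{200E}", "\u{200F}", "\u{2028}", "\u{2029}"
    ]

    // Drop pattern whitespace, '_' and '-', then lowercase.
    func looselyNormalized(_ name: String) -> String {
      name.filter { !patternWhitespace.contains($0) && $0 != "_" && $0 != "-" }
        .lowercased()
    }

    print(looselyNormalized("General_Category"))  // generalcategory
    print(looselyNormalized("general category"))  // generalcategory
    print(looselyNormalized("L\u{A0}l"))          // NBSP is kept, so lookup fails

A non-breaking space is not pattern whitespace, which is why the diagnostic test added below expects `\p{L\u{A0}l}` to be reported as an unknown property.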
--- .../Parse/CharacterPropertyClassification.swift | 2 +- Tests/RegexTests/ParseTests.swift | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift index ee9195ff3..c0ece78ff 100644 --- a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift @@ -18,7 +18,7 @@ extension Source { // This follows the rules provided by UAX44-LM3, including trying to drop an // "is" prefix, which isn't required by UTS#18 RL1.2, but is nice for // consistency with other engines and the Unicode.Scalar.Properties names. - let str = str.filter { !$0.isWhitespace && $0 != "_" && $0 != "-" } + let str = str.filter { !$0.isPatternWhitespace && $0 != "_" && $0 != "-" } .lowercased() if let m = match(str) { return m diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 9dfcff99e..1d4fb948d 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2061,6 +2061,16 @@ extension RegexTests { """, changeMatchingOptions(matchingOptions(adding: .extended)) ) + parseWithDelimitersTest(#""" + #/ + \p{ + gc + = + digit + } + /# + """#, prop(.generalCategory(.decimalNumber))) + // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter // if it's clear that it's part of the regex syntax. @@ -2486,6 +2496,10 @@ extension RegexTests { diagnosticTest(#"\p{aaa\p{b}}"#, .unknownProperty(key: nil, value: "aaa")) diagnosticTest(#"[[:{:]]"#, .unknownProperty(key: nil, value: "{")) + // We only filter pattern whitespace, which doesn't include things like + // non-breaking spaces. + diagnosticTest(#"\p{L\#u{A0}l}"#, .unknownProperty(key: nil, value: "L\u{A0}l")) + // MARK: Matching options diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions) From 05e610ab931f341f63cbc591a72c7e7f2b93b009 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 10 May 2022 11:31:32 +0100 Subject: [PATCH 18/24] Improve the wording of a diagnostic --- .../_RegexParser/Regex/Parse/Diagnostics.swift | 16 ++++++++++------ Tests/RegexTests/ParseTests.swift | 2 ++ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index 0054ae6b6..d87fba918 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -105,13 +105,17 @@ extension ParseError: CustomStringConvertible { case let .expectedNumDigits(s, i): return "expected \(i) digits in '\(s)'" case let .expectedNumber(s, kind: kind): - let radix: String - if kind == .decimal { - radix = "" - } else { - radix = " of radix \(kind.radix)" + let number: String + switch kind { + case .octal: + number = "octal number" + case .decimal: + number = "number" + case .hex: + number = "hexadecimal number" } - return "expected a numbers in '\(s)'\(radix)" + let suffix = s.isEmpty ? 
"" : " in '\(s)'" + return "expected \(number)\(suffix)" case let .expected(s): return "expected '\(s)'" case .unexpectedEndOfInput: diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 1d4fb948d..f7fa2b341 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2739,5 +2739,7 @@ extension RegexTests { "#/[x*/#", "cannot parse regular expression: expected ']'") compilerInterfaceDiagnosticMessageTest( "/a{3,2}/", "cannot parse regular expression: range lower bound '3' must be less than or equal to upper bound '2'") + compilerInterfaceDiagnosticMessageTest( + #"#/\u{}/#"#, "cannot parse regular expression: expected hexadecimal number") } } From 775201552d1246ed0698262260222480757537a2 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 10 May 2022 11:31:32 +0100 Subject: [PATCH 19/24] Introduce AST.Atom.Scalar This allows us to store the source location of the inner scalar value. --- Sources/_RegexParser/Regex/AST/Atom.swift | 18 ++++- .../Regex/Parse/LexicalAnalysis.swift | 69 ++++++++++++------- .../_StringProcessing/ConsumerInterface.swift | 4 +- .../_StringProcessing/PrintAsPattern.swift | 4 +- .../Regex/ASTConversion.swift | 2 +- .../Utility/ASTBuilder.swift | 4 +- Tests/RegexTests/ParseTests.swift | 6 ++ 7 files changed, 73 insertions(+), 34 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index 9b0f1cb2e..0ef8537a6 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -29,7 +29,7 @@ extension AST { /// A Unicode scalar value written as a literal /// /// \u{...}, \0dd, \x{...}, ... - case scalar(Unicode.Scalar) + case scalar(Scalar) /// A Unicode property, category, or script, including those written using /// POSIX syntax. @@ -106,6 +106,18 @@ extension AST.Atom { } } +extension AST.Atom { + public struct Scalar: Hashable { + public var value: UnicodeScalar + public var location: SourceLocation + + public init(_ value: UnicodeScalar, _ location: SourceLocation) { + self.value = value + self.location = location + } + } +} + extension AST.Atom { // TODO: We might scrap this and break out a few categories so @@ -697,7 +709,7 @@ extension AST.Atom { case .char(let c): return c case .scalar(let s): - return Character(s) + return Character(s.value) case .escaped(let c): return c.scalarValue.map(Character.init) @@ -742,7 +754,7 @@ extension AST.Atom { case .char(let c): return String(c) case .scalar(let s): - return "\\u{\(String(s.value, radix: 16, uppercase: true))}" + return "\\u{\(String(s.value.value, radix: 16, uppercase: true))}" case .keyboardControl(let x): return "\\C-\(x)" diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index c2cce67e8..9a48f4f1a 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -157,6 +157,19 @@ extension Source { return .init(start ..< currentPosition) } + /// Attempt to eat a given prefix that satisfies a given predicate, with the + /// source location recorded. + mutating func tryEatLocatedPrefix( + maxLength: Int? = nil, + _ f: (Char) -> Bool + ) -> Located? 
{ + let result = recordLoc { src in + src.tryEatPrefix(maxLength: maxLength, f) + } + guard let result = result else { return nil } + return result.map(\.string) + } + /// Throws an expected ASCII character error if not matched mutating func expectASCII() throws -> Located { try recordLoc { src in @@ -217,13 +230,13 @@ extension Source { /// return the scalar value, or throw an error if the string is malformed or /// would overflow the scalar. private static func validateUnicodeScalar( - _ str: String, _ kind: RadixKind - ) throws -> Unicode.Scalar { - let num = try validateNumber(str, UInt32.self, kind) + _ str: Source.Located, _ kind: RadixKind + ) throws -> AST.Atom.Scalar { + let num = try validateNumber(str.value, UInt32.self, kind) guard let scalar = Unicode.Scalar(num) else { throw ParseError.misc("Invalid scalar value U+\(num.hexStr)") } - return scalar + return .init(scalar, str.location) } /// Try to eat a number of a particular type and radix off the front. @@ -266,14 +279,15 @@ extension Source { /// Eat a scalar value from hexadecimal notation off the front private mutating func expectUnicodeScalar( numDigits: Int - ) throws -> Located { - try recordLoc { src in + ) throws -> AST.Atom.Scalar { + let str = try recordLoc { src -> String in let str = src.eat(upToCount: numDigits).string guard str.count == numDigits else { throw ParseError.expectedNumDigits(str, numDigits) } - return try Source.validateUnicodeScalar(str, .hex) + return str } + return try Source.validateUnicodeScalar(str, .hex) } /// Eat a scalar off the front, starting from after the @@ -289,49 +303,57 @@ extension Source { /// mutating func expectUnicodeScalar( escapedCharacter base: Character - ) throws -> Located { + ) throws -> AST.Atom.Kind { try recordLoc { src in + + func nullScalar() -> AST.Atom.Kind { + let pos = src.currentPosition + return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos))) + } + // TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set. switch base { // Hex numbers. case "u" where src.tryEat("{"), "x" where src.tryEat("{"): - let str = try src.lexUntil(eating: "}").value - return try Source.validateUnicodeScalar(str, .hex) + let str = try src.lexUntil(eating: "}") + return .scalar(try Source.validateUnicodeScalar(str, .hex)) case "x": // \x expects *up to* 2 digits. - guard let digits = src.tryEatPrefix(maxLength: 2, \.isHexDigit) else { + guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit) + else { // In PCRE, \x without any valid hex digits is \u{0}. // TODO: This doesn't appear to be followed by ICU or Oniguruma, so // could be changed to throw an error if we had a parsing mode for // them. - return Unicode.Scalar(0) + return nullScalar() } - return try Source.validateUnicodeScalar(digits.string, .hex) + return .scalar(try Source.validateUnicodeScalar(digits, .hex)) case "u": - return try src.expectUnicodeScalar(numDigits: 4).value + return .scalar(try src.expectUnicodeScalar(numDigits: 4)) case "U": - return try src.expectUnicodeScalar(numDigits: 8).value + return .scalar(try src.expectUnicodeScalar(numDigits: 8)) // Octal numbers. case "o" where src.tryEat("{"): - let str = try src.lexUntil(eating: "}").value - return try Source.validateUnicodeScalar(str, .octal) + let str = try src.lexUntil(eating: "}") + return .scalar(try Source.validateUnicodeScalar(str, .octal)) case "0": // We can read *up to* 3 more octal digits. // FIXME: PCRE can only read up to 2 octal digits, if we get a strict // PCRE mode, we should limit it here. 
- guard let digits = src.tryEatPrefix(maxLength: 3, \.isOctalDigit) else { - return Unicode.Scalar(0) + guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit) + else { + return nullScalar() } - return try Source.validateUnicodeScalar(digits.string, .octal) + return .scalar(try Source.validateUnicodeScalar(digits, .octal)) default: fatalError("Unexpected scalar start") } - } + }.value } /// Try to consume a quantifier @@ -1153,7 +1175,7 @@ extension Source { // We should either have a unicode scalar. if src.tryEat(sequence: "U+") { - let str = try src.lexUntil(eating: "}").value + let str = try src.lexUntil(eating: "}") return .scalar(try Source.validateUnicodeScalar(str, .hex)) } @@ -1581,8 +1603,7 @@ extension Source { switch char { // Hexadecimal and octal unicode scalars. case "u", "x", "U", "o", "0": - return try .scalar( - src.expectUnicodeScalar(escapedCharacter: char).value) + return try src.expectUnicodeScalar(escapedCharacter: char) default: break } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 9c0c3522c..a292d7518 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -178,7 +178,7 @@ extension AST.Atom { var singleScalar: UnicodeScalar? { switch kind { - case .scalar(let s): return s + case .scalar(let s): return s.value default: return nil } } @@ -200,7 +200,7 @@ extension AST.Atom { case let .scalar(s): assertionFailure( "Should have been handled by tree conversion") - return consumeScalar { $0 == s } + return consumeScalar { $0 == s.value } case let .char(c): assertionFailure( diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 1b5c2a4c5..285a3fdbb 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -676,7 +676,7 @@ extension AST.Atom { return String(c) case let .scalar(s): - let hex = String(s.value, radix: 16, uppercase: true) + let hex = String(s.value.value, radix: 16, uppercase: true) return "\\u{\(hex)}" case let .property(p): @@ -773,7 +773,7 @@ extension AST.Atom { return String(c) case let .scalar(s): - let hex = String(s.value, radix: 16, uppercase: true) + let hex = String(s.value.value, radix: 16, uppercase: true) return "\\u{\(hex)}" case let .property(p): diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 47433dc42..5c4f88f40 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -211,7 +211,7 @@ extension AST.Atom { switch self.kind { case let .char(c): return .char(c) - case let .scalar(s): return .char(Character(s)) + case let .scalar(s): return .char(Character(s.value)) case .any: return .any case let .backreference(r): return .backreference(.init(ast: r)) case let .changeMatchingOptions(seq): return .changeMatchingOptions(.init(ast: seq)) diff --git a/Sources/_StringProcessing/Utility/ASTBuilder.swift b/Sources/_StringProcessing/Utility/ASTBuilder.swift index 51d4f8bfc..387eeb43f 100644 --- a/Sources/_StringProcessing/Utility/ASTBuilder.swift +++ b/Sources/_StringProcessing/Utility/ASTBuilder.swift @@ -338,10 +338,10 @@ func escaped( atom(.escaped(e)) } func scalar(_ s: Unicode.Scalar) -> AST.Node { - atom(.scalar(s)) + atom(.scalar(.init(s, .fake))) } func scalar_m(_ s: Unicode.Scalar) -> AST.CustomCharacterClass.Member { - 
atom_m(.scalar(s)) + atom_m(.scalar(.init(s, .fake))) } func backreference(_ r: AST.Reference.Kind, recursionLevel: Int? = nil) -> AST.Node { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index f7fa2b341..0496e77c6 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2272,6 +2272,12 @@ extension RegexTests { $0.as(CustomCC.self)!.members[0].as(CustomCC.Range.self)!.dashLoc }) + // MARK: Unicode scalars + + rangeTest(#"\u{65}"#, range(3 ..< 5), at: { + $0.as(AST.Atom.self)!.as(AST.Atom.Scalar.self)!.location + }) + // MARK: References rangeTest(#"\k"#, range(3 ..< 6), at: { From f436ccad016d2dddb1984b8104c88a9ae9ed21ac Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 10 May 2022 11:31:32 +0100 Subject: [PATCH 20/24] Introduce scalar sequences `\u{AA BB CC}` Allow a whitespace-separated list of scalars within the `\u{...}` syntax. This is syntactic sugar that gets implicitly splatted out, for example `\u{A B C}` becomes `\u{A}\u{B}\u{C}`. --- Sources/_RegexParser/Regex/AST/Atom.swift | 36 ++++++++-- .../Regex/Parse/LexicalAnalysis.swift | 61 +++++++++++++++- Sources/_RegexParser/Regex/Parse/Sema.swift | 17 +++-- .../_RegexParser/Regex/Printing/DumpAST.swift | 3 + .../_StringProcessing/ConsumerInterface.swift | 6 +- .../_StringProcessing/PrintAsPattern.swift | 20 +++--- .../Regex/ASTConversion.swift | 18 +++-- .../Utility/ASTBuilder.swift | 20 +++++- Tests/RegexTests/MatchTests.swift | 21 +++++- Tests/RegexTests/ParseTests.swift | 71 +++++++++++++++++++ 10 files changed, 242 insertions(+), 31 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index 0ef8537a6..ff75260e8 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -31,6 +31,12 @@ extension AST { /// \u{...}, \0dd, \x{...}, ... case scalar(Scalar) + /// A whitespace-separated sequence of Unicode scalar values which are + /// implicitly splatted out. + /// + /// `\u{A B C}` -> `\u{A}\u{B}\u{C}` + case scalarSequence(ScalarSequence) + /// A Unicode property, category, or script, including those written using /// POSIX syntax. /// @@ -84,6 +90,7 @@ extension AST.Atom { switch kind { case .char(let v): return v case .scalar(let v): return v + case .scalarSequence(let v): return v case .property(let v): return v case .escaped(let v): return v case .keyboardControl(let v): return v @@ -116,6 +123,18 @@ extension AST.Atom { self.location = location } } + + public struct ScalarSequence: Hashable { + public var scalars: [Scalar] + public var trivia: [AST.Trivia] + + public init(_ scalars: [Scalar], trivia: [AST.Trivia]) { + precondition(scalars.count > 1, "Expected multiple scalars") + self.scalars = scalars + self.trivia = trivia + } + public var scalarValues: [Unicode.Scalar] { scalars.map(\.value) } + } } extension AST.Atom { @@ -725,8 +744,9 @@ extension AST.Atom { // the AST? Or defer for the matching engine? return nil - case .property, .any, .startOfLine, .endOfLine, .backreference, .subpattern, - .callout, .backtrackingDirective, .changeMatchingOptions: + case .scalarSequence, .property, .any, .startOfLine, .endOfLine, + .backreference, .subpattern, .callout, .backtrackingDirective, + .changeMatchingOptions: return nil } } @@ -748,13 +768,21 @@ extension AST.Atom { /// A string literal representation of the atom, if possible. /// /// Individual characters are returned as-is, and Unicode scalars are - /// presented using "\u{nnnn}" syntax. 
+ /// presented using "\u{nn nn ...}" syntax. public var literalStringValue: String? { + func scalarLiteral(_ u: [UnicodeScalar]) -> String { + let digits = u.map { String($0.value, radix: 16, uppercase: true) } + .joined(separator: " ") + return "\\u{\(digits)}" + } switch kind { case .char(let c): return String(c) case .scalar(let s): - return "\\u{\(String(s.value.value, radix: 16, uppercase: true))}" + return scalarLiteral([s.value]) + + case .scalarSequence(let s): + return scalarLiteral(s.scalarValues) case .keyboardControl(let x): return "\\C-\(x)" diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 9a48f4f1a..24c19b758 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -290,10 +290,54 @@ extension Source { return try Source.validateUnicodeScalar(str, .hex) } + /// Try to lex a seqence of hex digit unicode scalars. + /// + /// UniScalarSequence -> Whitespace? UniScalarSequencElt+ + /// UniScalarSequencElt -> HexDigit{1...} Whitespace? + /// + mutating func expectUnicodeScalarSequence( + eating ending: Character + ) throws -> AST.Atom.Kind { + try recordLoc { src in + var scalars = [AST.Atom.Scalar]() + var trivia = [AST.Trivia]() + + // Eat up any leading whitespace. + if let t = src.lexWhitespace() { trivia.append(t) } + + while true { + let str = src.lexUntil { src in + // Hit the ending, stop lexing. + if src.isEmpty || src.peek() == ending { + return true + } + // Eat up trailing whitespace, and stop lexing to record the scalar. + if let t = src.lexWhitespace() { + trivia.append(t) + return true + } + // Not the ending or trivia, must be a digit of the scalar. + return false + } + guard !str.value.isEmpty else { break } + scalars.append(try Source.validateUnicodeScalar(str, .hex)) + } + guard !scalars.isEmpty else { + throw ParseError.expectedNumber("", kind: .hex) + } + try src.expect(ending) + + if scalars.count == 1 { + return .scalar(scalars[0]) + } + return .scalarSequence(.init(scalars, trivia: trivia)) + }.value + } + /// Eat a scalar off the front, starting from after the /// backslash and base character (e.g. `\u` or `\x`). /// - /// UniScalar -> 'u{' HexDigit{1...} '}' + /// UniScalar -> 'u{' UniScalarSequence '}' /// | 'u' HexDigit{4} /// | 'x{' HexDigit{1...} '}' /// | 'x' HexDigit{0...2} @@ -314,7 +358,10 @@ extension Source { // TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set. switch base { // Hex numbers. - case "u" where src.tryEat("{"), "x" where src.tryEat("{"): + case "u" where src.tryEat("{"): + return try src.expectUnicodeScalarSequence(eating: "}") + + case "x" where src.tryEat("{"): let str = try src.lexUntil(eating: "}") return .scalar(try Source.validateUnicodeScalar(str, .hex)) @@ -598,6 +645,16 @@ extension Source { // inside a custom character class (and only treats whitespace as // non-semantic there for the extra-extended `(?xx)` mode). If we get a // strict-PCRE mode, we'll need to add a case for that. + return lexWhitespace() + } + + /// Try to consume whitespace as trivia + /// + /// Whitespace -> WhitespaceChar+ + /// + /// Unlike `lexNonSemanticWhitespace`, this will always attempt to lex + /// whitespace. + mutating func lexWhitespace() -> AST.Trivia? { let trivia: Located? 
= recordLoc { src in src.tryEatPrefix(\.isPatternWhitespace)?.string } diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index 263902a8e..9d5ae4576 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -210,7 +210,7 @@ extension RegexValidator { } } - func validateAtom(_ atom: AST.Atom) throws { + func validateAtom(_ atom: AST.Atom, inCustomCharacterClass: Bool) throws { switch atom.kind { case .escaped(let esc): try validateEscaped(esc, at: atom.location) @@ -243,6 +243,13 @@ extension RegexValidator { // TODO: We should error on unknown Unicode scalar names. break + case .scalarSequence: + // Not currently supported in a custom character class. + if inCustomCharacterClass { + throw error(.unsupported("scalar sequence in custom character class"), + at: atom.location) + } + case .char, .scalar, .startOfLine, .endOfLine, .any: break } @@ -260,8 +267,8 @@ extension RegexValidator { let lhs = range.lhs let rhs = range.rhs - try validateAtom(lhs) - try validateAtom(rhs) + try validateAtom(lhs, inCustomCharacterClass: true) + try validateAtom(rhs, inCustomCharacterClass: true) guard lhs.isValidCharacterClassRangeBound else { throw error(.invalidCharacterClassRangeOperand, at: lhs.location) @@ -297,7 +304,7 @@ extension RegexValidator { try validateCharacterClassRange(r) case .atom(let a): - try validateAtom(a) + try validateAtom(a, inCustomCharacterClass: true) case .setOperation(let lhs, _, let rhs): for lh in lhs { try validateCharacterClassMember(lh) } @@ -379,7 +386,7 @@ extension RegexValidator { try validateQuantification(q) case .atom(let a): - try validateAtom(a) + try validateAtom(a, inCustomCharacterClass: false) case .customCharacterClass(let c): try validateCustomCharacterClass(c) diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index a9cf6b424..b8937d518 100644 --- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ -138,6 +138,9 @@ extension AST.Atom { switch kind { case .escaped(let c): return "\\\(c.character)" + case .scalarSequence(let s): + return s.scalars.map(\.value.halfWidthCornerQuoted).joined() + case .namedCharacter(let charName): return "\\N{\(charName)}" diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index a292d7518..48f353e52 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -230,9 +230,9 @@ extension AST.Atom { // handled in emitAssertion return nil - case .escaped, .keyboardControl, .keyboardMeta, .keyboardMetaControl, - .backreference, .subpattern, .callout, .backtrackingDirective, - .changeMatchingOptions: + case .scalarSequence, .escaped, .keyboardControl, .keyboardMeta, + .keyboardMetaControl, .backreference, .subpattern, .callout, + .backtrackingDirective, .changeMatchingOptions: // FIXME: implement return nil } diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 285a3fdbb..601447968 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -671,13 +671,19 @@ extension AST.Atom { } var _dslBase: String { + func scalarLiteral(_ s: UnicodeScalar) -> String { + let hex = String(s.value, radix: 16, uppercase: true) + return "\\u{\(hex)}" + } switch kind { case let .char(c): 
return String(c) case let .scalar(s): - let hex = String(s.value.value, radix: 16, uppercase: true) - return "\\u{\(hex)}" + return scalarLiteral(s.value) + + case let .scalarSequence(seq): + return seq.scalarValues.map(scalarLiteral).joined() case let .property(p): return p._dslBase @@ -769,13 +775,9 @@ extension AST.Atom { var _regexBase: String { switch kind { - case let .char(c): - return String(c) - - case let .scalar(s): - let hex = String(s.value.value, radix: 16, uppercase: true) - return "\\u{\(hex)}" - + case .char, .scalar, .scalarSequence: + return literalStringValue! + case let .property(p): return p._regexBase diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 5c4f88f40..e675a5659 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -60,15 +60,17 @@ extension AST.Node { var result = "" var idx = idx while idx < astChildren.endIndex { - let atom: AST.Atom? = astChildren[idx].as() + guard let atom: AST.Atom = astChildren[idx].as() else { break } // TODO: For printing, nice to coalesce // scalars literals too. We likely need a different // approach even before we have a better IR. - if let char = atom?.singleCharacter { + if let char = atom.singleCharacter { result.append(char) - } else if let scalar = atom?.singleScalar { + } else if let scalar = atom.singleScalar { result.append(Character(scalar)) + } else if case .scalarSequence(let seq) = atom.kind { + result += seq.scalarValues.map(Character.init) } else { break } @@ -136,7 +138,15 @@ extension AST.Node { return .trivia(v.contents) case let .atom(v): - return .atom(v.dslTreeAtom) + switch v.kind { + case .scalarSequence(let seq): + // Scalar sequences are splatted into concatenated scalars, which + // becomes a quoted literal. Sequences nested in concatenations have + // already been coalesced, this just handles the lone atom case. + return .quotedLiteral(String(seq.scalarValues.map(Character.init))) + default: + return .atom(v.dslTreeAtom) + } case let .customCharacterClass(ccc): return .customCharacterClass(ccc.dslTreeClass) diff --git a/Sources/_StringProcessing/Utility/ASTBuilder.swift b/Sources/_StringProcessing/Utility/ASTBuilder.swift index 387eeb43f..78477e2b5 100644 --- a/Sources/_StringProcessing/Utility/ASTBuilder.swift +++ b/Sources/_StringProcessing/Utility/ASTBuilder.swift @@ -338,10 +338,26 @@ func escaped( atom(.escaped(e)) } func scalar(_ s: Unicode.Scalar) -> AST.Node { - atom(.scalar(.init(s, .fake))) + .atom(scalar_a(s)) +} +func scalar_a(_ s: Unicode.Scalar) -> AST.Atom { + atom_a(.scalar(.init(s, .fake))) } func scalar_m(_ s: Unicode.Scalar) -> AST.CustomCharacterClass.Member { - atom_m(.scalar(.init(s, .fake))) + .atom(scalar_a(s)) +} + +func scalarSeq(_ s: Unicode.Scalar...) -> AST.Node { + .atom(scalarSeq_a(s)) +} +func scalarSeq_a(_ s: Unicode.Scalar...) -> AST.Atom { + scalarSeq_a(s) +} +func scalarSeq_a(_ s: [Unicode.Scalar]) -> AST.Atom { + atom_a(.scalarSequence(.init(s.map { .init($0, .fake) }, trivia: []))) +} +func scalarSeq_m(_ s: Unicode.Scalar...) -> AST.CustomCharacterClass.Member { + .atom(scalarSeq_a(s)) } func backreference(_ r: AST.Reference.Kind, recursionLevel: Int? 
= nil) -> AST.Node { diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 3b7def90b..36056e85a 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -285,7 +285,20 @@ extension RegexTests { firstMatchTest(#"\0707"#, input: "12387\u{1C7}xyz", match: "\u{1C7}") // code point sequence - firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc", xfail: true) + firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc") + firstMatchTest(#"3\u{ 61 62 63 }"#, input: "123abcxyz", match: "3abc") + firstMatchTest(#"\u{61 62}\u{63}"#, input: "123abcxyz", match: "abc") + firstMatchTest(#"\u{61}\u{62 63}"#, input: "123abcxyz", match: "abc") + firstMatchTest(#"9|\u{61 62 63}"#, input: "123abcxyz", match: "abc") + firstMatchTest(#"(?:\u{61 62 63})"#, input: "123abcxyz", match: "abc") + firstMatchTest(#"23\u{61 62 63}xy"#, input: "123abcxyz", match: "23abcxy") + + // o + horn + dot_below + firstMatchTest( + #"\u{006f 031b 0323}"#, + input: "\u{006f}\u{031b}\u{0323}", + match: "\u{006f}\u{031b}\u{0323}" + ) // Escape sequences that represent scalar values. firstMatchTest(#"\a[\b]\e\f\n\r\t"#, @@ -1405,6 +1418,9 @@ extension RegexTests { firstMatchTest(#"\u{65}\u{301}$"#, input: eDecomposed, match: eDecomposed) firstMatchTest(#"\u{65}\u{301}$"#, input: eComposed, match: eComposed) + firstMatchTest(#"\u{65 301}$"#, input: eDecomposed, match: eDecomposed) + firstMatchTest(#"\u{65 301}$"#, input: eComposed, match: eComposed) + // FIXME: Implicit \y at end of match firstMatchTest(#"\u{65}"#, input: eDecomposed, match: nil, xfail: true) @@ -1516,7 +1532,8 @@ extension RegexTests { firstMatchTest(#"🇰🇷"#, input: flag, match: flag) firstMatchTest(#"[🇰🇷]"#, input: flag, match: flag) firstMatchTest(#"\u{1F1F0}\u{1F1F7}"#, input: flag, match: flag) - + firstMatchTest(#"\u{1F1F0 1F1F7}"#, input: flag, match: flag) + // First Unicode scalar followed by CCC of regional indicators firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag, xfail: true) diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 0496e77c6..219393893 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -481,6 +481,22 @@ extension RegexTests { parseTest(#"\x5X"#, concat(scalar("\u{5}"), "X")) parseTest(#"\x12ab"#, concat(scalar("\u{12}"), "a", "b")) + parseTest(#"\u{ a }"#, scalar("\u{A}")) + parseTest(#"\u{ a }\u{ B }"#, concat(scalar("\u{A}"), scalar("\u{B}"))) + + // MARK: Scalar sequences + + parseTest(#"\u{A bC}"#, scalarSeq("\u{A}", "\u{BC}")) + parseTest(#"\u{ A bC }"#, scalarSeq("\u{A}", "\u{BC}")) + parseTest(#"\u{A bC }"#, scalarSeq("\u{A}", "\u{BC}")) + parseTest(#"\u{ A bC}"#, scalarSeq("\u{A}", "\u{BC}")) + parseTest(#"\u{ A b C }"#, scalarSeq("\u{A}", "\u{B}", "\u{C}")) + + parseTest( + #"\u{3b1 3b3 3b5 3b9}"#, + scalarSeq("\u{3b1}", "\u{3b3}", "\u{3b5}", "\u{3b9}") + ) + // MARK: Character classes parseTest(#"abc\d"#, concat("a", "b", "c", escaped(.decimalDigit))) @@ -658,6 +674,28 @@ extension RegexTests { range_m(.namedCharacter("DOLLAR SIGN"), .namedCharacter("APOSTROPHE"))), throwsError: .unsupported) + parseTest( + #"[\u{AA}-\u{BB}]"#, + charClass(range_m(scalar_a("\u{AA}"), scalar_a("\u{BB}"))) + ) + + // Not currently supported, we need to figure out what their semantics are. 
+ parseTest( + #"[\u{AA BB}-\u{CC}]"#, + charClass(range_m(scalarSeq_a("\u{AA}", "\u{BB}"), scalar_a("\u{CC}"))), + throwsError: .unsupported + ) + parseTest( + #"[\u{CC}-\u{AA BB}]"#, + charClass(range_m(scalar_a("\u{CC}"), scalarSeq_a("\u{AA}", "\u{BB}"))), + throwsError: .unsupported + ) + parseTest( + #"[\u{a b c}]"#, + charClass(scalarSeq_m("\u{A}", "\u{B}", "\u{C}")), + throwsError: .unsupported + ) + // MARK: Operators parseTest( @@ -2071,6 +2109,16 @@ extension RegexTests { /# """#, prop(.generalCategory(.decimalNumber))) + parseWithDelimitersTest(#""" + #/ + \u{ + aB + B + c + } + /# + """#, scalarSeq("\u{AB}", "\u{B}", "\u{C}")) + // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter // if it's clear that it's part of the regex syntax. @@ -2145,6 +2193,12 @@ extension RegexTests { parseNotEqualTest(#"[\p{Any}]"#, #"[[:Any:]]"#) + parseNotEqualTest(#"\u{A}"#, #"\u{B}"#) + parseNotEqualTest(#"\u{A B}"#, #"\u{B A}"#) + parseNotEqualTest(#"\u{AB}"#, #"\u{A B}"#) + parseNotEqualTest(#"[\u{AA BB}-\u{CC}]"#, #"[\u{AA DD}-\u{CC}]"#) + parseNotEqualTest(#"[\u{AA BB}-\u{DD}]"#, #"[\u{AA BB}-\u{CC}]"#) + parseNotEqualTest(#"[abc[:space:]\d]+"#, #"[abc[:upper:]\d]+"#) @@ -2491,6 +2545,7 @@ extension RegexTests { diagnosticTest(#"\e\#u{301}"#, .invalidEscape("e\u{301}")) diagnosticTest(#"\\#u{E9}"#, .invalidEscape("é")) diagnosticTest(#"\˂"#, .invalidEscape("˂")) + diagnosticTest(#"\d\#u{301}"#, .invalidEscape("d\u{301}")) // MARK: Character properties @@ -2597,6 +2652,22 @@ extension RegexTests { diagnosticTest(#"\u{G}"#, .expectedNumber("G", kind: .hex)) + diagnosticTest(#"\u{"#, .expectedNumber("", kind: .hex)) + diagnosticTest(#"\u{ "#, .expectedNumber("", kind: .hex)) + diagnosticTest(#"\u{}"#, .expectedNumber("", kind: .hex)) + diagnosticTest(#"\u{ }"#, .expectedNumber("", kind: .hex)) + diagnosticTest(#"\u{ }"#, .expectedNumber("", kind: .hex)) + diagnosticTest(#"\u{ G}"#, .expectedNumber("G", kind: .hex)) + diagnosticTest(#"\u{G }"#, .expectedNumber("G", kind: .hex)) + diagnosticTest(#"\u{ G }"#, .expectedNumber("G", kind: .hex)) + diagnosticTest(#"\u{ GH }"#, .expectedNumber("GH", kind: .hex)) + diagnosticTest(#"\u{ G H }"#, .expectedNumber("G", kind: .hex)) + diagnosticTest(#"\u{ ABC G }"#, .expectedNumber("G", kind: .hex)) + diagnosticTest(#"\u{ FFFFFFFFF A }"#, .numberOverflow("FFFFFFFFF")) + + diagnosticTest(#"[\d--\u{a b}]"#, .unsupported("scalar sequence in custom character class")) + diagnosticTest(#"[\d--[\u{a b}]]"#, .unsupported("scalar sequence in custom character class")) + // MARK: Matching options diagnosticTest(#"(?^-"#, .cannotRemoveMatchingOptionsAfterCaret) From 05971641c077a9e59e4292019847b2a118453d0f Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 10 May 2022 11:31:33 +0100 Subject: [PATCH 21/24] Fix invalid indexing `curIdx` is an index of `astChildren`, not `children`. 
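For illustration, a minimal self-contained sketch of the indexing rule this fixes, using hypothetical stand-in values rather than the converter's real types: an index must be advanced by the same collection it was derived from.

    let source = "ab\u{301}c"         // stands in for `astChildren`
    var converted: [Character] = []   // stands in for `children`
    var idx = source.startIndex
    while idx < source.endIndex {
      converted.append(source[idx])
      source.formIndex(after: &idx)   // advance via the collection that owns `idx`
      // Advancing `idx` through `converted` instead would not type-check here,
      // and when two collections share an index type it can silently walk the
      // wrong collection.
    }
    print(converted)                  // ["a", "b́", "c"]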
--- Sources/_StringProcessing/Regex/ASTConversion.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index e675a5659..79a515033 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -102,7 +102,7 @@ extension AST.Node { curIdx = nextIdx } else { children.append(astChildren[curIdx].dslTreeNode) - children.formIndex(after: &curIdx) + astChildren.formIndex(after: &curIdx) } } return .concatenation(children) From 0872d1693b876d058c77666e17d7d108dc381012 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 10 May 2022 11:31:33 +0100 Subject: [PATCH 22/24] Fix source location tracking in `lexUntil` The `predicate` may independently advance the location before bailing, and we don't want that to affect the recorded location of the result. We probably ought to replace `lexUntil` with a better API. --- .../_RegexParser/Regex/Parse/LexicalAnalysis.swift | 13 +++++++++++-- Tests/RegexTests/ParseTests.swift | 8 ++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 24c19b758..e8783dc86 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -503,13 +503,22 @@ extension Source { private mutating func lexUntil( _ predicate: (inout Source) throws -> Bool ) rethrows -> Located { + // We track locations outside of recordLoc, as the predicate may advance the + // input when we hit the end, and we don't want that to affect the location + // of what was lexed in the `result`. We still want the recordLoc call to + // attach locations to any thrown errors though. + // TODO: We should find a better way of doing this, `lexUntil` seems full + // of footguns. + let start = currentPosition + var end = currentPosition + var result = "" try recordLoc { src in - var result = "" while try !predicate(&src) { result.append(src.eat()) + end = src.currentPosition } - return result } + return .init(result, start ..< end) } private mutating func lexUntil(eating end: String) throws -> Located { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 219393893..8163c1359 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2332,6 +2332,14 @@ extension RegexTests { $0.as(AST.Atom.self)!.as(AST.Atom.Scalar.self)!.location }) + rangeTest(#"\u{ 65 58 }"#, range(5 ..< 7), at: { + $0.as(AST.Atom.self)!.as(AST.Atom.ScalarSequence.self)!.scalars[0].location + }) + + rangeTest(#"\u{ 65 58 }"#, range(8 ..< 10), at: { + $0.as(AST.Atom.self)!.as(AST.Atom.ScalarSequence.self)!.scalars[1].location + }) + // MARK: References rangeTest(#"\k"#, range(3 ..< 6), at: { From b209e4fd39096d41294adfd296e4258ff8a3f183 Mon Sep 17 00:00:00 2001 From: Richard Wei Date: Tue, 10 May 2022 07:45:00 -0700 Subject: [PATCH 23/24] Tidy up build flags and fix implicit import circular dependency (#392) - Explicitly ask the compiler not to implicitly import _StringProessing. This is to avoid a circular dependency when `-enable-experimental-string-processing` is enabled by default. - Unify the build flags for modules that are built in the compiler repo into a `stdlibSettings` value. - Disable implicit _Concurrency import as well since it is how it's built in the compiler repo. 
This helps us catch errors before we integrate with the compiler repo. - Remove `-enable-experimental-pairwise-build-block` since SE-0348 has been implemented and enabled. - Update the minimum toolchain requirement to 2022-04-20. --- Package.swift | 25 ++++++++++--------------- README.md | 2 +- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/Package.swift b/Package.swift index 8303fc5cb..f9eb95e8e 100644 --- a/Package.swift +++ b/Package.swift @@ -10,6 +10,13 @@ let availabilityDefinition = PackageDescription.SwiftSetting.unsafeFlags([ #"SwiftStdlib 5.7:macOS 9999, iOS 9999, watchOS 9999, tvOS 9999"#, ]) +let stdlibSettings: [PackageDescription.SwiftSetting] = [ + .unsafeFlags(["-enable-library-evolution"]), + .unsafeFlags(["-Xfrontend", "-disable-implicit-concurrency-module-import"]), + .unsafeFlags(["-Xfrontend", "-disable-implicit-string-processing-module-import"]), + availabilityDefinition +] + let package = Package( name: "swift-experimental-string-processing", products: [ @@ -36,10 +43,7 @@ let package = Package( .target( name: "_RegexParser", dependencies: [], - swiftSettings: [ - .unsafeFlags(["-enable-library-evolution"]), - availabilityDefinition - ]), + swiftSettings: stdlibSettings), .testTarget( name: "MatchingEngineTests", dependencies: [ @@ -51,18 +55,11 @@ let package = Package( .target( name: "_StringProcessing", dependencies: ["_RegexParser", "_CUnicode"], - swiftSettings: [ - .unsafeFlags(["-enable-library-evolution"]), - availabilityDefinition - ]), + swiftSettings: stdlibSettings), .target( name: "RegexBuilder", dependencies: ["_StringProcessing", "_RegexParser"], - swiftSettings: [ - .unsafeFlags(["-enable-library-evolution"]), - .unsafeFlags(["-Xfrontend", "-enable-experimental-pairwise-build-block"]), - availabilityDefinition - ]), + swiftSettings: stdlibSettings), .testTarget( name: "RegexTests", dependencies: ["_StringProcessing"], @@ -73,7 +70,6 @@ let package = Package( name: "RegexBuilderTests", dependencies: ["_StringProcessing", "RegexBuilder"], swiftSettings: [ - .unsafeFlags(["-Xfrontend", "-enable-experimental-pairwise-build-block"]), .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]) ]), .testTarget( @@ -102,7 +98,6 @@ let package = Package( name: "Exercises", dependencies: ["_RegexParser", "_StringProcessing", "RegexBuilder"], swiftSettings: [ - .unsafeFlags(["-Xfrontend", "-enable-experimental-pairwise-build-block"]), .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]) ]), .testTarget( diff --git a/README.md b/README.md index 42586ad2b..67c708a75 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ See [Declarative String Processing Overview][decl-string] ## Requirements -- [Swift Trunk Development Snapshot](https://www.swift.org/download/#snapshots) DEVELOPMENT-SNAPSHOT-2022-03-09 or later. +- [Swift Trunk Development Snapshot](https://www.swift.org/download/#snapshots) DEVELOPMENT-SNAPSHOT-2022-04-20 or later. 
## Trying it out From f779459aeae786374159b858b1db4035f18af642 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Tue, 10 May 2022 12:55:03 -0500 Subject: [PATCH 24/24] Catch more unquantifiable elements (#391) This adds start/end anchors ^ and $, groups that form zero-width assertions, and option-changing groups without content `(?i)...` --- Sources/_RegexParser/Regex/AST/AST.swift | 4 +++- Sources/_RegexParser/Regex/AST/Atom.swift | 2 ++ Sources/_RegexParser/Regex/AST/Group.swift | 15 +++++++++++++++ Tests/RegexTests/ParseTests.swift | 4 ++++ 4 files changed, 24 insertions(+), 1 deletion(-) diff --git a/Sources/_RegexParser/Regex/AST/AST.swift b/Sources/_RegexParser/Regex/AST/AST.swift index a7dcd2015..be1548b72 100644 --- a/Sources/_RegexParser/Regex/AST/AST.swift +++ b/Sources/_RegexParser/Regex/AST/AST.swift @@ -125,7 +125,9 @@ extension AST.Node { switch self { case .atom(let a): return a.isQuantifiable - case .group, .conditional, .customCharacterClass, .absentFunction: + case .group(let g): + return g.isQuantifiable + case .conditional, .customCharacterClass, .absentFunction: return true case .alternation, .concatenation, .quantification, .quote, .trivia, .empty: diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index ff75260e8..19e2fb498 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -808,6 +808,8 @@ extension AST.Atom { // TODO: Are callouts quantifiable? case .escaped(let esc): return esc.isQuantifiable + case .startOfLine, .endOfLine: + return false default: return true } diff --git a/Sources/_RegexParser/Regex/AST/Group.swift b/Sources/_RegexParser/Regex/AST/Group.swift index 8ecaadeda..6fd46abe7 100644 --- a/Sources/_RegexParser/Regex/AST/Group.swift +++ b/Sources/_RegexParser/Regex/AST/Group.swift @@ -136,3 +136,18 @@ extension AST.Group { } } } + +extension AST.Group { + var isQuantifiable: Bool { + switch kind.value { + case .capture, .namedCapture, .balancedCapture, .nonCapture, + .nonCaptureReset, .atomicNonCapturing, .scriptRun, .atomicScriptRun, + .changeMatchingOptions: + return true + + case .lookahead, .negativeLookahead, .nonAtomicLookahead, + .lookbehind, .negativeLookbehind, .nonAtomicLookbehind: + return false + } + } +} diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 8163c1359..ed930b0fe 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2652,6 +2652,10 @@ extension RegexTests { diagnosticTest(#"\Z??"#, .notQuantifiable) diagnosticTest(#"\G*?"#, .notQuantifiable) diagnosticTest(#"\z+?"#, .notQuantifiable) + diagnosticTest(#"^*"#, .notQuantifiable) + diagnosticTest(#"$?"#, .notQuantifiable) + diagnosticTest(#"(?=a)+"#, .notQuantifiable) + diagnosticTest(#"(?i)*"#, .notQuantifiable) diagnosticTest(#"\K{1}"#, .unsupported(#"'\K'"#)) diagnosticTest(#"\y{2,5}"#, .notQuantifiable) diagnosticTest(#"\Y{3,}"#, .notQuantifiable)
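For illustration, a hedged sketch of the user-visible effect of this validation, assuming a toolchain that includes these changes (Swift 5.7 run-time regex compilation; the exact diagnostic wording may differ):

    // Zero-width constructs now reject quantifiers; ordinary groups still
    // accept them.
    do {
      _ = try Regex(#"(?=a)+"#)       // lookahead: not quantifiable
    } catch {
      print("rejected:", error)
    }

    do {
      let ok = try Regex(#"(?:a)+"#)  // non-capturing group: quantifiable
      print("abc".contains(ok))       // true
    } catch {
      print("unexpected:", error)
    }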