From 61965c35426d8de5f6a1a3c71a02724afea00fff Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 10 May 2022 11:31:30 +0100 Subject: [PATCH 1/6] Restrict character property fuzzy matching to "pattern whitespace" I wasn't aware of this Unicode property when initially implementing this. It's a more restricted set of whitespace that Unicode recommends for parsing patterns. It's the same set of whitespace used for extended syntax. UAX44-LM3 itself doesn't appear to specify the exact set of whitespace to match against, but this is no more restrictive than the engines I'm aware of. --- .../Parse/CharacterPropertyClassification.swift | 2 +- Tests/RegexTests/ParseTests.swift | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift index ee9195ff3..c0ece78ff 100644 --- a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift @@ -18,7 +18,7 @@ extension Source { // This follows the rules provided by UAX44-LM3, including trying to drop an // "is" prefix, which isn't required by UTS#18 RL1.2, but is nice for // consistency with other engines and the Unicode.Scalar.Properties names. 
- let str = str.filter { !$0.isWhitespace && $0 != "_" && $0 != "-" } + let str = str.filter { !$0.isPatternWhitespace && $0 != "_" && $0 != "-" } .lowercased() if let m = match(str) { return m diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 9dfcff99e..1d4fb948d 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2061,6 +2061,16 @@ extension RegexTests { """, changeMatchingOptions(matchingOptions(adding: .extended)) ) + parseWithDelimitersTest(#""" + #/ + \p{ + gc + = + digit + } + /# + """#, prop(.generalCategory(.decimalNumber))) + // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter // if it's clear that it's part of the regex syntax. @@ -2486,6 +2496,10 @@ extension RegexTests { diagnosticTest(#"\p{aaa\p{b}}"#, .unknownProperty(key: nil, value: "aaa")) diagnosticTest(#"[[:{:]]"#, .unknownProperty(key: nil, value: "{")) + // We only filter pattern whitespace, which doesn't include things like + // non-breaking spaces. 
+ diagnosticTest(#"\p{L\#u{A0}l}"#, .unknownProperty(key: nil, value: "L\u{A0}l")) + // MARK: Matching options diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions) From 05e610ab931f341f63cbc591a72c7e7f2b93b009 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 10 May 2022 11:31:32 +0100 Subject: [PATCH 2/6] Improve the wording of a diagnostic --- .../_RegexParser/Regex/Parse/Diagnostics.swift | 16 ++++++++++------ Tests/RegexTests/ParseTests.swift | 2 ++ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index 0054ae6b6..d87fba918 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -105,13 +105,17 @@ extension ParseError: CustomStringConvertible { case let .expectedNumDigits(s, i): return "expected \(i) digits in '\(s)'" case let .expectedNumber(s, kind: kind): - let radix: String - if kind == .decimal { - radix = "" - } else { - radix = " of radix \(kind.radix)" + let number: String + switch kind { + case .octal: + number = "octal number" + case .decimal: + number = "number" + case .hex: + number = "hexadecimal number" } - return "expected a numbers in '\(s)'\(radix)" + let suffix = s.isEmpty ? 
"" : " in '\(s)'" + return "expected \(number)\(suffix)" case let .expected(s): return "expected '\(s)'" case .unexpectedEndOfInput: diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 1d4fb948d..f7fa2b341 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2739,5 +2739,7 @@ extension RegexTests { "#/[x*/#", "cannot parse regular expression: expected ']'") compilerInterfaceDiagnosticMessageTest( "/a{3,2}/", "cannot parse regular expression: range lower bound '3' must be less than or equal to upper bound '2'") + compilerInterfaceDiagnosticMessageTest( + #"#/\u{}/#"#, "cannot parse regular expression: expected hexadecimal number") } } From 775201552d1246ed0698262260222480757537a2 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 10 May 2022 11:31:32 +0100 Subject: [PATCH 3/6] Introduce AST.Atom.Scalar This allows us to store the source location of the inner scalar value. --- Sources/_RegexParser/Regex/AST/Atom.swift | 18 ++++- .../Regex/Parse/LexicalAnalysis.swift | 69 ++++++++++++------- .../_StringProcessing/ConsumerInterface.swift | 4 +- .../_StringProcessing/PrintAsPattern.swift | 4 +- .../Regex/ASTConversion.swift | 2 +- .../Utility/ASTBuilder.swift | 4 +- Tests/RegexTests/ParseTests.swift | 6 ++ 7 files changed, 73 insertions(+), 34 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index 9b0f1cb2e..0ef8537a6 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -29,7 +29,7 @@ extension AST { /// A Unicode scalar value written as a literal /// /// \u{...}, \0dd, \x{...}, ... - case scalar(Unicode.Scalar) + case scalar(Scalar) /// A Unicode property, category, or script, including those written using /// POSIX syntax. 
@@ -106,6 +106,18 @@ extension AST.Atom { } } +extension AST.Atom { + public struct Scalar: Hashable { + public var value: UnicodeScalar + public var location: SourceLocation + + public init(_ value: UnicodeScalar, _ location: SourceLocation) { + self.value = value + self.location = location + } + } +} + extension AST.Atom { // TODO: We might scrap this and break out a few categories so @@ -697,7 +709,7 @@ extension AST.Atom { case .char(let c): return c case .scalar(let s): - return Character(s) + return Character(s.value) case .escaped(let c): return c.scalarValue.map(Character.init) @@ -742,7 +754,7 @@ extension AST.Atom { case .char(let c): return String(c) case .scalar(let s): - return "\\u{\(String(s.value, radix: 16, uppercase: true))}" + return "\\u{\(String(s.value.value, radix: 16, uppercase: true))}" case .keyboardControl(let x): return "\\C-\(x)" diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index c2cce67e8..9a48f4f1a 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -157,6 +157,19 @@ extension Source { return .init(start ..< currentPosition) } + /// Attempt to eat a given prefix that satisfies a given predicate, with the + /// source location recorded. + mutating func tryEatLocatedPrefix( + maxLength: Int? = nil, + _ f: (Char) -> Bool + ) -> Located? { + let result = recordLoc { src in + src.tryEatPrefix(maxLength: maxLength, f) + } + guard let result = result else { return nil } + return result.map(\.string) + } + /// Throws an expected ASCII character error if not matched mutating func expectASCII() throws -> Located { try recordLoc { src in @@ -217,13 +230,13 @@ extension Source { /// return the scalar value, or throw an error if the string is malformed or /// would overflow the scalar. 
private static func validateUnicodeScalar( - _ str: String, _ kind: RadixKind - ) throws -> Unicode.Scalar { - let num = try validateNumber(str, UInt32.self, kind) + _ str: Source.Located, _ kind: RadixKind + ) throws -> AST.Atom.Scalar { + let num = try validateNumber(str.value, UInt32.self, kind) guard let scalar = Unicode.Scalar(num) else { throw ParseError.misc("Invalid scalar value U+\(num.hexStr)") } - return scalar + return .init(scalar, str.location) } /// Try to eat a number of a particular type and radix off the front. @@ -266,14 +279,15 @@ extension Source { /// Eat a scalar value from hexadecimal notation off the front private mutating func expectUnicodeScalar( numDigits: Int - ) throws -> Located { - try recordLoc { src in + ) throws -> AST.Atom.Scalar { + let str = try recordLoc { src -> String in let str = src.eat(upToCount: numDigits).string guard str.count == numDigits else { throw ParseError.expectedNumDigits(str, numDigits) } - return try Source.validateUnicodeScalar(str, .hex) + return str } + return try Source.validateUnicodeScalar(str, .hex) } /// Eat a scalar off the front, starting from after the @@ -289,49 +303,57 @@ extension Source { /// mutating func expectUnicodeScalar( escapedCharacter base: Character - ) throws -> Located { + ) throws -> AST.Atom.Kind { try recordLoc { src in + + func nullScalar() -> AST.Atom.Kind { + let pos = src.currentPosition + return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos))) + } + // TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set. switch base { // Hex numbers. case "u" where src.tryEat("{"), "x" where src.tryEat("{"): - let str = try src.lexUntil(eating: "}").value - return try Source.validateUnicodeScalar(str, .hex) + let str = try src.lexUntil(eating: "}") + return .scalar(try Source.validateUnicodeScalar(str, .hex)) case "x": // \x expects *up to* 2 digits. 
- guard let digits = src.tryEatPrefix(maxLength: 2, \.isHexDigit) else { + guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit) + else { // In PCRE, \x without any valid hex digits is \u{0}. // TODO: This doesn't appear to be followed by ICU or Oniguruma, so // could be changed to throw an error if we had a parsing mode for // them. - return Unicode.Scalar(0) + return nullScalar() } - return try Source.validateUnicodeScalar(digits.string, .hex) + return .scalar(try Source.validateUnicodeScalar(digits, .hex)) case "u": - return try src.expectUnicodeScalar(numDigits: 4).value + return .scalar(try src.expectUnicodeScalar(numDigits: 4)) case "U": - return try src.expectUnicodeScalar(numDigits: 8).value + return .scalar(try src.expectUnicodeScalar(numDigits: 8)) // Octal numbers. case "o" where src.tryEat("{"): - let str = try src.lexUntil(eating: "}").value - return try Source.validateUnicodeScalar(str, .octal) + let str = try src.lexUntil(eating: "}") + return .scalar(try Source.validateUnicodeScalar(str, .octal)) case "0": // We can read *up to* 3 more octal digits. // FIXME: PCRE can only read up to 2 octal digits, if we get a strict // PCRE mode, we should limit it here. - guard let digits = src.tryEatPrefix(maxLength: 3, \.isOctalDigit) else { - return Unicode.Scalar(0) + guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit) + else { + return nullScalar() } - return try Source.validateUnicodeScalar(digits.string, .octal) + return .scalar(try Source.validateUnicodeScalar(digits, .octal)) default: fatalError("Unexpected scalar start") } - } + }.value } /// Try to consume a quantifier @@ -1153,7 +1175,7 @@ extension Source { // We should either have a unicode scalar. 
if src.tryEat(sequence: "U+") { - let str = try src.lexUntil(eating: "}").value + let str = try src.lexUntil(eating: "}") return .scalar(try Source.validateUnicodeScalar(str, .hex)) } @@ -1581,8 +1603,7 @@ extension Source { switch char { // Hexadecimal and octal unicode scalars. case "u", "x", "U", "o", "0": - return try .scalar( - src.expectUnicodeScalar(escapedCharacter: char).value) + return try src.expectUnicodeScalar(escapedCharacter: char) default: break } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 9c0c3522c..a292d7518 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -178,7 +178,7 @@ extension AST.Atom { var singleScalar: UnicodeScalar? { switch kind { - case .scalar(let s): return s + case .scalar(let s): return s.value default: return nil } } @@ -200,7 +200,7 @@ extension AST.Atom { case let .scalar(s): assertionFailure( "Should have been handled by tree conversion") - return consumeScalar { $0 == s } + return consumeScalar { $0 == s.value } case let .char(c): assertionFailure( diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 1b5c2a4c5..285a3fdbb 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -676,7 +676,7 @@ extension AST.Atom { return String(c) case let .scalar(s): - let hex = String(s.value, radix: 16, uppercase: true) + let hex = String(s.value.value, radix: 16, uppercase: true) return "\\u{\(hex)}" case let .property(p): @@ -773,7 +773,7 @@ extension AST.Atom { return String(c) case let .scalar(s): - let hex = String(s.value, radix: 16, uppercase: true) + let hex = String(s.value.value, radix: 16, uppercase: true) return "\\u{\(hex)}" case let .property(p): diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift 
b/Sources/_StringProcessing/Regex/ASTConversion.swift index 47433dc42..5c4f88f40 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -211,7 +211,7 @@ extension AST.Atom { switch self.kind { case let .char(c): return .char(c) - case let .scalar(s): return .char(Character(s)) + case let .scalar(s): return .char(Character(s.value)) case .any: return .any case let .backreference(r): return .backreference(.init(ast: r)) case let .changeMatchingOptions(seq): return .changeMatchingOptions(.init(ast: seq)) diff --git a/Sources/_StringProcessing/Utility/ASTBuilder.swift b/Sources/_StringProcessing/Utility/ASTBuilder.swift index 51d4f8bfc..387eeb43f 100644 --- a/Sources/_StringProcessing/Utility/ASTBuilder.swift +++ b/Sources/_StringProcessing/Utility/ASTBuilder.swift @@ -338,10 +338,10 @@ func escaped( atom(.escaped(e)) } func scalar(_ s: Unicode.Scalar) -> AST.Node { - atom(.scalar(s)) + atom(.scalar(.init(s, .fake))) } func scalar_m(_ s: Unicode.Scalar) -> AST.CustomCharacterClass.Member { - atom_m(.scalar(s)) + atom_m(.scalar(.init(s, .fake))) } func backreference(_ r: AST.Reference.Kind, recursionLevel: Int? = nil) -> AST.Node { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index f7fa2b341..0496e77c6 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2272,6 +2272,12 @@ extension RegexTests { $0.as(CustomCC.self)!.members[0].as(CustomCC.Range.self)!.dashLoc }) + // MARK: Unicode scalars + + rangeTest(#"\u{65}"#, range(3 ..< 5), at: { + $0.as(AST.Atom.self)!.as(AST.Atom.Scalar.self)!.location + }) + // MARK: References rangeTest(#"\k"#, range(3 ..< 6), at: { From f436ccad016d2dddb1984b8104c88a9ae9ed21ac Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 10 May 2022 11:31:32 +0100 Subject: [PATCH 4/6] Introduce scalar sequences `\u{AA BB CC}` Allow a whitespace-separated list of scalars within the `\u{...}` syntax. 
This is syntactic sugar that gets implicitly splatted out, for example `\u{A B C}` becomes `\u{A}\u{B}\u{C}`. --- Sources/_RegexParser/Regex/AST/Atom.swift | 36 ++++++++-- .../Regex/Parse/LexicalAnalysis.swift | 61 +++++++++++++++- Sources/_RegexParser/Regex/Parse/Sema.swift | 17 +++-- .../_RegexParser/Regex/Printing/DumpAST.swift | 3 + .../_StringProcessing/ConsumerInterface.swift | 6 +- .../_StringProcessing/PrintAsPattern.swift | 20 +++--- .../Regex/ASTConversion.swift | 18 +++-- .../Utility/ASTBuilder.swift | 20 +++++- Tests/RegexTests/MatchTests.swift | 21 +++++- Tests/RegexTests/ParseTests.swift | 71 +++++++++++++++++++ 10 files changed, 242 insertions(+), 31 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index 0ef8537a6..ff75260e8 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -31,6 +31,12 @@ extension AST { /// \u{...}, \0dd, \x{...}, ... case scalar(Scalar) + /// A whitespace-separated sequence of Unicode scalar values which are + /// implicitly splatted out. + /// + /// `\u{A B C}` -> `\u{A}\u{B}\u{C}` + case scalarSequence(ScalarSequence) + /// A Unicode property, category, or script, including those written using /// POSIX syntax. 
/// @@ -84,6 +90,7 @@ extension AST.Atom { switch kind { case .char(let v): return v case .scalar(let v): return v + case .scalarSequence(let v): return v case .property(let v): return v case .escaped(let v): return v case .keyboardControl(let v): return v @@ -116,6 +123,18 @@ extension AST.Atom { self.location = location } } + + public struct ScalarSequence: Hashable { + public var scalars: [Scalar] + public var trivia: [AST.Trivia] + + public init(_ scalars: [Scalar], trivia: [AST.Trivia]) { + precondition(scalars.count > 1, "Expected multiple scalars") + self.scalars = scalars + self.trivia = trivia + } + public var scalarValues: [Unicode.Scalar] { scalars.map(\.value) } + } } extension AST.Atom { @@ -725,8 +744,9 @@ extension AST.Atom { // the AST? Or defer for the matching engine? return nil - case .property, .any, .startOfLine, .endOfLine, .backreference, .subpattern, - .callout, .backtrackingDirective, .changeMatchingOptions: + case .scalarSequence, .property, .any, .startOfLine, .endOfLine, + .backreference, .subpattern, .callout, .backtrackingDirective, + .changeMatchingOptions: return nil } } @@ -748,13 +768,21 @@ extension AST.Atom { /// A string literal representation of the atom, if possible. /// /// Individual characters are returned as-is, and Unicode scalars are - /// presented using "\u{nnnn}" syntax. + /// presented using "\u{nn nn ...}" syntax. public var literalStringValue: String? 
{ + func scalarLiteral(_ u: [UnicodeScalar]) -> String { + let digits = u.map { String($0.value, radix: 16, uppercase: true) } + .joined(separator: " ") + return "\\u{\(digits)}" + } switch kind { case .char(let c): return String(c) case .scalar(let s): - return "\\u{\(String(s.value.value, radix: 16, uppercase: true))}" + return scalarLiteral([s.value]) + + case .scalarSequence(let s): + return scalarLiteral(s.scalarValues) case .keyboardControl(let x): return "\\C-\(x)" diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 9a48f4f1a..24c19b758 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -290,10 +290,54 @@ extension Source { return try Source.validateUnicodeScalar(str, .hex) } + /// Try to lex a sequence of hex digit unicode scalars. + /// + /// UniScalarSequence -> Whitespace? UniScalarSequencElt+ + /// UniScalarSequencElt -> HexDigit{1...} Whitespace? + /// + mutating func expectUnicodeScalarSequence( + eating ending: Character + ) throws -> AST.Atom.Kind { + try recordLoc { src in + var scalars = [AST.Atom.Scalar]() + var trivia = [AST.Trivia]() + + // Eat up any leading whitespace. + if let t = src.lexWhitespace() { trivia.append(t) } + + while true { + let str = src.lexUntil { src in + // Hit the ending, stop lexing. + if src.isEmpty || src.peek() == ending { + return true + } + // Eat up trailing whitespace, and stop lexing to record the scalar. + if let t = src.lexWhitespace() { + trivia.append(t) + return true + } + // Not the ending or trivia, must be a digit of the scalar. 
+ return false + } + guard !str.value.isEmpty else { break } + scalars.append(try Source.validateUnicodeScalar(str, .hex)) + } + guard !scalars.isEmpty else { + throw ParseError.expectedNumber("", kind: .hex) + } + try src.expect(ending) + + if scalars.count == 1 { + return .scalar(scalars[0]) + } + return .scalarSequence(.init(scalars, trivia: trivia)) + }.value + } + /// Eat a scalar off the front, starting from after the /// backslash and base character (e.g. `\u` or `\x`). /// - /// UniScalar -> 'u{' HexDigit{1...} '}' + /// UniScalar -> 'u{' UniScalarSequence '}' /// | 'u' HexDigit{4} /// | 'x{' HexDigit{1...} '}' /// | 'x' HexDigit{0...2} @@ -314,7 +358,10 @@ extension Source { // TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set. switch base { // Hex numbers. - case "u" where src.tryEat("{"), "x" where src.tryEat("{"): + case "u" where src.tryEat("{"): + return try src.expectUnicodeScalarSequence(eating: "}") + + case "x" where src.tryEat("{"): let str = try src.lexUntil(eating: "}") return .scalar(try Source.validateUnicodeScalar(str, .hex)) @@ -598,6 +645,16 @@ extension Source { // inside a custom character class (and only treats whitespace as // non-semantic there for the extra-extended `(?xx)` mode). If we get a // strict-PCRE mode, we'll need to add a case for that. + return lexWhitespace() + } + + /// Try to consume whitespace as trivia + /// + /// Whitespace -> WhitespaceChar+ + /// + /// Unlike `lexNonSemanticWhitespace`, this will always attempt to lex + /// whitespace. + mutating func lexWhitespace() -> AST.Trivia? { let trivia: Located? 
= recordLoc { src in src.tryEatPrefix(\.isPatternWhitespace)?.string } diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index 263902a8e..9d5ae4576 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -210,7 +210,7 @@ extension RegexValidator { } } - func validateAtom(_ atom: AST.Atom) throws { + func validateAtom(_ atom: AST.Atom, inCustomCharacterClass: Bool) throws { switch atom.kind { case .escaped(let esc): try validateEscaped(esc, at: atom.location) @@ -243,6 +243,13 @@ extension RegexValidator { // TODO: We should error on unknown Unicode scalar names. break + case .scalarSequence: + // Not currently supported in a custom character class. + if inCustomCharacterClass { + throw error(.unsupported("scalar sequence in custom character class"), + at: atom.location) + } + case .char, .scalar, .startOfLine, .endOfLine, .any: break } @@ -260,8 +267,8 @@ extension RegexValidator { let lhs = range.lhs let rhs = range.rhs - try validateAtom(lhs) - try validateAtom(rhs) + try validateAtom(lhs, inCustomCharacterClass: true) + try validateAtom(rhs, inCustomCharacterClass: true) guard lhs.isValidCharacterClassRangeBound else { throw error(.invalidCharacterClassRangeOperand, at: lhs.location) @@ -297,7 +304,7 @@ extension RegexValidator { try validateCharacterClassRange(r) case .atom(let a): - try validateAtom(a) + try validateAtom(a, inCustomCharacterClass: true) case .setOperation(let lhs, _, let rhs): for lh in lhs { try validateCharacterClassMember(lh) } @@ -379,7 +386,7 @@ extension RegexValidator { try validateQuantification(q) case .atom(let a): - try validateAtom(a) + try validateAtom(a, inCustomCharacterClass: false) case .customCharacterClass(let c): try validateCustomCharacterClass(c) diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index a9cf6b424..b8937d518 100644 --- 
a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ -138,6 +138,9 @@ extension AST.Atom { switch kind { case .escaped(let c): return "\\\(c.character)" + case .scalarSequence(let s): + return s.scalars.map(\.value.halfWidthCornerQuoted).joined() + case .namedCharacter(let charName): return "\\N{\(charName)}" diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index a292d7518..48f353e52 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -230,9 +230,9 @@ extension AST.Atom { // handled in emitAssertion return nil - case .escaped, .keyboardControl, .keyboardMeta, .keyboardMetaControl, - .backreference, .subpattern, .callout, .backtrackingDirective, - .changeMatchingOptions: + case .scalarSequence, .escaped, .keyboardControl, .keyboardMeta, + .keyboardMetaControl, .backreference, .subpattern, .callout, + .backtrackingDirective, .changeMatchingOptions: // FIXME: implement return nil } diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 285a3fdbb..601447968 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -671,13 +671,19 @@ extension AST.Atom { } var _dslBase: String { + func scalarLiteral(_ s: UnicodeScalar) -> String { + let hex = String(s.value, radix: 16, uppercase: true) + return "\\u{\(hex)}" + } switch kind { case let .char(c): return String(c) case let .scalar(s): - let hex = String(s.value.value, radix: 16, uppercase: true) - return "\\u{\(hex)}" + return scalarLiteral(s.value) + + case let .scalarSequence(seq): + return seq.scalarValues.map(scalarLiteral).joined() case let .property(p): return p._dslBase @@ -769,13 +775,9 @@ extension AST.Atom { var _regexBase: String { switch kind { - case let .char(c): - return String(c) - - case let 
.scalar(s): - let hex = String(s.value.value, radix: 16, uppercase: true) - return "\\u{\(hex)}" - + case .char, .scalar, .scalarSequence: + return literalStringValue! + case let .property(p): return p._regexBase diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 5c4f88f40..e675a5659 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -60,15 +60,17 @@ extension AST.Node { var result = "" var idx = idx while idx < astChildren.endIndex { - let atom: AST.Atom? = astChildren[idx].as() + guard let atom: AST.Atom = astChildren[idx].as() else { break } // TODO: For printing, nice to coalesce // scalars literals too. We likely need a different // approach even before we have a better IR. - if let char = atom?.singleCharacter { + if let char = atom.singleCharacter { result.append(char) - } else if let scalar = atom?.singleScalar { + } else if let scalar = atom.singleScalar { result.append(Character(scalar)) + } else if case .scalarSequence(let seq) = atom.kind { + result += seq.scalarValues.map(Character.init) } else { break } @@ -136,7 +138,15 @@ extension AST.Node { return .trivia(v.contents) case let .atom(v): - return .atom(v.dslTreeAtom) + switch v.kind { + case .scalarSequence(let seq): + // Scalar sequences are splatted into concatenated scalars, which + // becomes a quoted literal. Sequences nested in concatenations have + // already been coalesced, this just handles the lone atom case. 
+ return .quotedLiteral(String(seq.scalarValues.map(Character.init))) + default: + return .atom(v.dslTreeAtom) + } case let .customCharacterClass(ccc): return .customCharacterClass(ccc.dslTreeClass) diff --git a/Sources/_StringProcessing/Utility/ASTBuilder.swift b/Sources/_StringProcessing/Utility/ASTBuilder.swift index 387eeb43f..78477e2b5 100644 --- a/Sources/_StringProcessing/Utility/ASTBuilder.swift +++ b/Sources/_StringProcessing/Utility/ASTBuilder.swift @@ -338,10 +338,26 @@ func escaped( atom(.escaped(e)) } func scalar(_ s: Unicode.Scalar) -> AST.Node { - atom(.scalar(.init(s, .fake))) + .atom(scalar_a(s)) +} +func scalar_a(_ s: Unicode.Scalar) -> AST.Atom { + atom_a(.scalar(.init(s, .fake))) } func scalar_m(_ s: Unicode.Scalar) -> AST.CustomCharacterClass.Member { - atom_m(.scalar(.init(s, .fake))) + .atom(scalar_a(s)) +} + +func scalarSeq(_ s: Unicode.Scalar...) -> AST.Node { + .atom(scalarSeq_a(s)) +} +func scalarSeq_a(_ s: Unicode.Scalar...) -> AST.Atom { + scalarSeq_a(s) +} +func scalarSeq_a(_ s: [Unicode.Scalar]) -> AST.Atom { + atom_a(.scalarSequence(.init(s.map { .init($0, .fake) }, trivia: []))) +} +func scalarSeq_m(_ s: Unicode.Scalar...) -> AST.CustomCharacterClass.Member { + .atom(scalarSeq_a(s)) } func backreference(_ r: AST.Reference.Kind, recursionLevel: Int? 
= nil) -> AST.Node { diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 3b7def90b..36056e85a 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -285,7 +285,20 @@ extension RegexTests { firstMatchTest(#"\0707"#, input: "12387\u{1C7}xyz", match: "\u{1C7}") // code point sequence - firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc", xfail: true) + firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc") + firstMatchTest(#"3\u{ 61 62 63 }"#, input: "123abcxyz", match: "3abc") + firstMatchTest(#"\u{61 62}\u{63}"#, input: "123abcxyz", match: "abc") + firstMatchTest(#"\u{61}\u{62 63}"#, input: "123abcxyz", match: "abc") + firstMatchTest(#"9|\u{61 62 63}"#, input: "123abcxyz", match: "abc") + firstMatchTest(#"(?:\u{61 62 63})"#, input: "123abcxyz", match: "abc") + firstMatchTest(#"23\u{61 62 63}xy"#, input: "123abcxyz", match: "23abcxy") + + // o + horn + dot_below + firstMatchTest( + #"\u{006f 031b 0323}"#, + input: "\u{006f}\u{031b}\u{0323}", + match: "\u{006f}\u{031b}\u{0323}" + ) // Escape sequences that represent scalar values. 
firstMatchTest(#"\a[\b]\e\f\n\r\t"#, @@ -1405,6 +1418,9 @@ extension RegexTests { firstMatchTest(#"\u{65}\u{301}$"#, input: eDecomposed, match: eDecomposed) firstMatchTest(#"\u{65}\u{301}$"#, input: eComposed, match: eComposed) + firstMatchTest(#"\u{65 301}$"#, input: eDecomposed, match: eDecomposed) + firstMatchTest(#"\u{65 301}$"#, input: eComposed, match: eComposed) + // FIXME: Implicit \y at end of match firstMatchTest(#"\u{65}"#, input: eDecomposed, match: nil, xfail: true) @@ -1516,7 +1532,8 @@ extension RegexTests { firstMatchTest(#"🇰🇷"#, input: flag, match: flag) firstMatchTest(#"[🇰🇷]"#, input: flag, match: flag) firstMatchTest(#"\u{1F1F0}\u{1F1F7}"#, input: flag, match: flag) - + firstMatchTest(#"\u{1F1F0 1F1F7}"#, input: flag, match: flag) + // First Unicode scalar followed by CCC of regional indicators firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag, xfail: true) diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 0496e77c6..219393893 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -481,6 +481,22 @@ extension RegexTests { parseTest(#"\x5X"#, concat(scalar("\u{5}"), "X")) parseTest(#"\x12ab"#, concat(scalar("\u{12}"), "a", "b")) + parseTest(#"\u{ a }"#, scalar("\u{A}")) + parseTest(#"\u{ a }\u{ B }"#, concat(scalar("\u{A}"), scalar("\u{B}"))) + + // MARK: Scalar sequences + + parseTest(#"\u{A bC}"#, scalarSeq("\u{A}", "\u{BC}")) + parseTest(#"\u{ A bC }"#, scalarSeq("\u{A}", "\u{BC}")) + parseTest(#"\u{A bC }"#, scalarSeq("\u{A}", "\u{BC}")) + parseTest(#"\u{ A bC}"#, scalarSeq("\u{A}", "\u{BC}")) + parseTest(#"\u{ A b C }"#, scalarSeq("\u{A}", "\u{B}", "\u{C}")) + + parseTest( + #"\u{3b1 3b3 3b5 3b9}"#, + scalarSeq("\u{3b1}", "\u{3b3}", "\u{3b5}", "\u{3b9}") + ) + // MARK: Character classes parseTest(#"abc\d"#, concat("a", "b", "c", escaped(.decimalDigit))) @@ -658,6 +674,28 @@ extension RegexTests { range_m(.namedCharacter("DOLLAR SIGN"), 
.namedCharacter("APOSTROPHE"))), throwsError: .unsupported) + parseTest( + #"[\u{AA}-\u{BB}]"#, + charClass(range_m(scalar_a("\u{AA}"), scalar_a("\u{BB}"))) + ) + + // Not currently supported, we need to figure out what their semantics are. + parseTest( + #"[\u{AA BB}-\u{CC}]"#, + charClass(range_m(scalarSeq_a("\u{AA}", "\u{BB}"), scalar_a("\u{CC}"))), + throwsError: .unsupported + ) + parseTest( + #"[\u{CC}-\u{AA BB}]"#, + charClass(range_m(scalar_a("\u{CC}"), scalarSeq_a("\u{AA}", "\u{BB}"))), + throwsError: .unsupported + ) + parseTest( + #"[\u{a b c}]"#, + charClass(scalarSeq_m("\u{A}", "\u{B}", "\u{C}")), + throwsError: .unsupported + ) + // MARK: Operators parseTest( @@ -2071,6 +2109,16 @@ extension RegexTests { /# """#, prop(.generalCategory(.decimalNumber))) + parseWithDelimitersTest(#""" + #/ + \u{ + aB + B + c + } + /# + """#, scalarSeq("\u{AB}", "\u{B}", "\u{C}")) + // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter // if it's clear that it's part of the regex syntax. 
@@ -2145,6 +2193,12 @@ extension RegexTests { parseNotEqualTest(#"[\p{Any}]"#, #"[[:Any:]]"#) + parseNotEqualTest(#"\u{A}"#, #"\u{B}"#) + parseNotEqualTest(#"\u{A B}"#, #"\u{B A}"#) + parseNotEqualTest(#"\u{AB}"#, #"\u{A B}"#) + parseNotEqualTest(#"[\u{AA BB}-\u{CC}]"#, #"[\u{AA DD}-\u{CC}]"#) + parseNotEqualTest(#"[\u{AA BB}-\u{DD}]"#, #"[\u{AA BB}-\u{CC}]"#) + parseNotEqualTest(#"[abc[:space:]\d]+"#, #"[abc[:upper:]\d]+"#) @@ -2491,6 +2545,7 @@ extension RegexTests { diagnosticTest(#"\e\#u{301}"#, .invalidEscape("e\u{301}")) diagnosticTest(#"\\#u{E9}"#, .invalidEscape("é")) diagnosticTest(#"\˂"#, .invalidEscape("˂")) + diagnosticTest(#"\d\#u{301}"#, .invalidEscape("d\u{301}")) // MARK: Character properties @@ -2597,6 +2652,22 @@ extension RegexTests { diagnosticTest(#"\u{G}"#, .expectedNumber("G", kind: .hex)) + diagnosticTest(#"\u{"#, .expectedNumber("", kind: .hex)) + diagnosticTest(#"\u{ "#, .expectedNumber("", kind: .hex)) + diagnosticTest(#"\u{}"#, .expectedNumber("", kind: .hex)) + diagnosticTest(#"\u{ }"#, .expectedNumber("", kind: .hex)) + diagnosticTest(#"\u{ }"#, .expectedNumber("", kind: .hex)) + diagnosticTest(#"\u{ G}"#, .expectedNumber("G", kind: .hex)) + diagnosticTest(#"\u{G }"#, .expectedNumber("G", kind: .hex)) + diagnosticTest(#"\u{ G }"#, .expectedNumber("G", kind: .hex)) + diagnosticTest(#"\u{ GH }"#, .expectedNumber("GH", kind: .hex)) + diagnosticTest(#"\u{ G H }"#, .expectedNumber("G", kind: .hex)) + diagnosticTest(#"\u{ ABC G }"#, .expectedNumber("G", kind: .hex)) + diagnosticTest(#"\u{ FFFFFFFFF A }"#, .numberOverflow("FFFFFFFFF")) + + diagnosticTest(#"[\d--\u{a b}]"#, .unsupported("scalar sequence in custom character class")) + diagnosticTest(#"[\d--[\u{a b}]]"#, .unsupported("scalar sequence in custom character class")) + // MARK: Matching options diagnosticTest(#"(?^-"#, .cannotRemoveMatchingOptionsAfterCaret) From 05971641c077a9e59e4292019847b2a118453d0f Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 10 May 2022 11:31:33 
+0100 Subject: [PATCH 5/6] Fix invalid indexing `curIdx` is an index of `astChildren`, not `children`. --- Sources/_StringProcessing/Regex/ASTConversion.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index e675a5659..79a515033 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -102,7 +102,7 @@ extension AST.Node { curIdx = nextIdx } else { children.append(astChildren[curIdx].dslTreeNode) - children.formIndex(after: &curIdx) + astChildren.formIndex(after: &curIdx) } } return .concatenation(children) From 0872d1693b876d058c77666e17d7d108dc381012 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 10 May 2022 11:31:33 +0100 Subject: [PATCH 6/6] Fix source location tracking in `lexUntil` The `predicate` may independently advance the location before bailing, and we don't want that to affect the recorded location of the result. We probably ought to replace `lexUntil` with a better API. --- .../_RegexParser/Regex/Parse/LexicalAnalysis.swift | 13 +++++++++++-- Tests/RegexTests/ParseTests.swift | 8 ++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 24c19b758..e8783dc86 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -503,13 +503,22 @@ extension Source { private mutating func lexUntil( _ predicate: (inout Source) throws -> Bool ) rethrows -> Located { + // We track locations outside of recordLoc, as the predicate may advance the + // input when we hit the end, and we don't want that to affect the location + // of what was lexed in the `result`. We still want the recordLoc call to + // attach locations to any thrown errors though. 
+ // TODO: We should find a better way of doing this, `lexUntil` seems full + // of footguns. + let start = currentPosition + var end = currentPosition + var result = "" try recordLoc { src in - var result = "" while try !predicate(&src) { result.append(src.eat()) + end = src.currentPosition } - return result } + return .init(result, start ..< end) } private mutating func lexUntil(eating end: String) throws -> Located { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 219393893..8163c1359 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2332,6 +2332,14 @@ extension RegexTests { $0.as(AST.Atom.self)!.as(AST.Atom.Scalar.self)!.location }) + rangeTest(#"\u{ 65 58 }"#, range(5 ..< 7), at: { + $0.as(AST.Atom.self)!.as(AST.Atom.ScalarSequence.self)!.scalars[0].location + }) + + rangeTest(#"\u{ 65 58 }"#, range(8 ..< 10), at: { + $0.as(AST.Atom.self)!.as(AST.Atom.ScalarSequence.self)!.scalars[1].location + }) + // MARK: References rangeTest(#"\k"#, range(3 ..< 6), at: {