diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index e17ce68bb..d6062115a 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -396,6 +396,9 @@ extension AST.Atom.CharacterProperty { case script(Unicode.Script) case scriptExtension(Unicode.Script) + /// A character matched by name, written as `\p{name=...}` or the short form `\p{na=...}` + case named(String) + case posix(Unicode.POSIXProperty) /// Some special properties implemented by PCRE and Oniguruma. diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift index 5cc920063..ee9195ff3 100644 --- a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift @@ -428,6 +428,8 @@ extension Source { if let cat = classifyGeneralCategory(value) { return .generalCategory(cat) } + case "name", "na": + return .named(value) default: break } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index d27b89314..637b1a37a 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -144,6 +144,19 @@ extension String { } } +func consumeName(_ name: String, opts: MatchingOptions) -> MEProgram.ConsumeFunction { + let consume = opts.semanticLevel == .graphemeCluster + ? consumeCharacterWithSingleScalar + : consumeScalar + + return consume(propertyScalarPredicate { + // FIXME: only the alias reported by $0.nameAlias is compared, so any other name aliases are missed + // e.g. 
U+FEFF has both 'BYTE ORDER MARK' and 'BOM' as aliases + $0.name?.isEqualByUAX44LM2(to: name) == true + || $0.nameAlias?.isEqualByUAX44LM2(to: name) == true + }) +} + // TODO: This is basically an AST interpreter, which would // be good or interesting to build regardless, and serves // as a compiler fall-back path @@ -206,12 +219,7 @@ extension AST.Atom { return try p.generateConsumer(opts) case let .namedCharacter(name): - return consumeScalar(propertyScalarPredicate { - // FIXME: name aliases not covered by $0.nameAlias are missed - // e.g. U+FEFF is also 'FORM FEED', 'BYTE ORDER MARK', and 'BOM' - $0.name?.isEqualByUAX44LM2(to: name) == true - || $0.nameAlias?.isEqualByUAX44LM2(to: name) == true - }) + return consumeName(name, opts: opts) case .any: assertionFailure( @@ -479,6 +487,9 @@ extension AST.Atom.CharacterProperty { case .scriptExtension(let s): return consume(scriptExtensionScalarPredicate(s)) + + case .named(let n): + return consumeName(n, opts: opts) case .posix(let p): return p.generateConsumer(opts) diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 831f904c6..0ef021442 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1219,6 +1219,13 @@ extension RegexTests { parseTest(#"\p{word}"#, prop(.posix(.word))) parseTest(#"\p{xdigit}"#, prop(.posix(.xdigit))) + parseTest(#"\p{name=A}"#, prop(.named("A"))) + parseTest(#"\p{Name=B}"#, prop(.named("B"))) + parseTest(#"\p{isName=C}"#, prop(.named("C"))) + parseTest(#"\p{na=D}"#, prop(.named("D"))) + parseTest(#"\p{NA=E}"#, prop(.named("E"))) + parseTest(#"\p{na=isI}"#, prop(.named("isI"))) + // MARK: Conditionals parseTest(#"(?(1))"#, conditional( diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index 71f459a1b..eff9f9b4e 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -389,25 +389,34 @@ extension UTS18Tests { // // To meet this requirement, an 
implementation shall support individually // named characters. - func testNameProperty_XFail() { - XCTExpectFailure("Need \\p{name=...} support") { - XCTFail(#"\(#/\p{name=BOM}/#)"#) - // Name property - // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=ZERO WIDTH NO-BREAK SPACE}/#)) - // Name property and Matching Rules - // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=zerowidthno breakspace}/#)) + func testNameProperty() throws { + // Name property: the formal name, spelled exactly + XCTAssertTrue("\u{FEFF}".contains(regex(#"\p{name=ZERO WIDTH NO-BREAK SPACE}"#))) + // Name property and Matching Rules: case, spacing, and hyphenation differ from the formal name + XCTAssertTrue("\u{FEFF}".contains(regex(#"\p{name=zerowidthno breakspace}"#))) + + // Computed name (Hangul syllable) + XCTAssertTrue("강".contains(regex(#"\p{name=HANGUL SYLLABLE GANG}"#))) + + // Graphic symbol: U+1F514 is named BELL (the control character U+0007 is looked up as BEL in testNameProperty_XFail) + XCTAssertTrue("\u{1F514}".contains(regex(#"\p{name=BELL}"#))) + + // Name match failures: misspelled, extended, truncated, and single-letter names must not match + XCTAssertFalse("\u{FEFF}".contains(regex(#"\p{name=ZERO WIDTH NO-BRAKE SPACE}"#))) + XCTAssertFalse("\u{FEFF}".contains(regex(#"\p{name=ZERO WIDTH NO-BREAK SPACE ZZZZ}"#))) + XCTAssertFalse("\u{FEFF}".contains(regex(#"\p{name=ZERO WIDTH NO-BREAK}"#))) + XCTAssertFalse("\u{FEFF}".contains(regex(#"\p{name=z}"#))) + } + + func testNameProperty_XFail() throws { + XCTExpectFailure("Need more expansive name alias matching") { // Name_Alias property - // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=BYTE ORDER MARK}/#)) + XCTAssertTrue("\u{FEFF}".contains(regex(#"\p{name=BYTE ORDER MARK}"#))) // Name_Alias property (again) - // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=BOM}/#)) - - // Computed name - // XCTAssertTrue("강".contains(#/\p{name=HANGUL SYLLABLE GANG}/#)) - + XCTAssertTrue("\u{FEFF}".contains(regex(#"\p{name=BOM}"#))) + // Control character - // XCTAssertTrue("\u{7}".contains(#/\p{name=BEL}/#)) - // Graphic symbol - // XCTAssertTrue("\u{1F514}".contains(#/\p{name=BELL}/#)) + XCTAssertTrue("\u{7}".contains(regex(#"\p{name=BEL}"#))) } }