Skip to content
Merged
6 changes: 4 additions & 2 deletions Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -75,15 +75,17 @@ let package = Package(
name: "RegexBuilder",
dependencies: ["_StringProcessing", "_RegexParser"],
swiftSettings: publicStdlibSettings),
.target(name: "TestSupport",
swiftSettings: [availabilityDefinition]),
.testTarget(
name: "RegexTests",
dependencies: ["_StringProcessing"],
dependencies: ["_StringProcessing", "TestSupport"],
swiftSettings: [
.unsafeFlags(["-Xfrontend", "-disable-availability-checking"]),
]),
.testTarget(
name: "RegexBuilderTests",
dependencies: ["_StringProcessing", "RegexBuilder"],
dependencies: ["_StringProcessing", "RegexBuilder", "TestSupport"],
swiftSettings: [
.unsafeFlags(["-Xfrontend", "-disable-availability-checking"])
]),
Expand Down
33 changes: 33 additions & 0 deletions Sources/TestSupport/TestSupport.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2022 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
//
//===----------------------------------------------------------------------===//

import XCTest

// We need to split this out of the test files, as it needs to be compiled
// *without* `-disable-availability-checking` to ensure the #available check is
// not compiled into a no-op.

#if os(Linux)
/// Stand-in for `XCTExpectFailure` that is only compiled on Linux.
///
/// The function does nothing: `message` is ignored and `body` is never
/// invoked, so nothing inside `body` is recorded.
/// NOTE(review): presumably needed because XCTest on Linux does not provide
/// `XCTExpectFailure` — confirm.
///
/// - Parameters:
///   - message: Unused.
///   - body: Unused; it is not called.
public func XCTExpectFailure(
  _ message: String? = nil,
  body: () throws -> Void
) rethrows {
  // Intentionally empty.
}
#endif

/// Reports whether the standard library is new enough for the guarded tests.
///
/// When `SwiftStdlib 5.7` is available this returns `true`. Otherwise an
/// "Unsupported stdlib" failure is raised inside `XCTExpectFailure`, attributed
/// to the caller's `file` and `line`, and `false` is returned.
///
/// - Parameters:
///   - file: The file to attribute the failure to. Defaults to the call site.
///   - line: The line to attribute the failure to. Defaults to the call site.
/// - Returns: `true` if the tests may proceed, `false` otherwise.
public func ensureNewStdlib(
  file: StaticString = #file, line: UInt = #line
) -> Bool {
  if #available(SwiftStdlib 5.7, *) {
    return true
  }
  XCTExpectFailure {
    XCTFail("Unsupported stdlib", file: file, line: line)
  }
  return false
}
6 changes: 4 additions & 2 deletions Sources/_RegexParser/Regex/AST/Atom.swift
Original file line number Diff line number Diff line change
Expand Up @@ -755,8 +755,10 @@ extension AST.Atom {
/// Whether this atom is valid as the operand of a custom character class
/// range.
public var isValidCharacterClassRangeBound: Bool {
// If we have a literal character value for this, it can be used as a bound.
if literalCharacterValue != nil { return true }
if let c = literalCharacterValue {
// We only match character range bounds that are single scalar NFC.
return c.hasExactlyOneScalar && c.isNFC
}
switch kind {
// \cx, \C-x, \M-x, \M-\C-x, \N{...}
case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter:
Expand Down
46 changes: 24 additions & 22 deletions Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
Original file line number Diff line number Diff line change
Expand Up @@ -480,35 +480,37 @@ extension Parser {
///
mutating func lexQuantifier(
) -> (Located<Quant.Amount>, Located<Quant.Kind>, [AST.Trivia])? {
  // The whole lex runs inside `tryEating`, including the leading whitespace.
  // NOTE(review): `tryEating` presumably rewinds the parser when the closure
  // returns nil, so whitespace consumed before a missing quantifier is not
  // lost — confirm against its definition.
  tryEating { p in
    var trivia: [AST.Trivia] = []

    if let t = p.lexNonSemanticWhitespace() { trivia.append(t) }

    // The amount is one of `*`, `+`, `?`, or a braced range.
    let amt: Located<Quant.Amount>? = p.recordLoc { p in
      if p.tryEat("*") { return .zeroOrMore }
      if p.tryEat("+") { return .oneOrMore }
      if p.tryEat("?") { return .zeroOrOne }

      return p.tryEating { p in
        guard p.tryEat("{"),
              let range = p.lexRange(trivia: &trivia),
              p.tryEat("}")
        else { return nil }
        return range.value
      }
    }
    guard let amt = amt else { return nil }

    // PCRE allows non-semantic whitespace here in extended syntax mode.
    if let t = p.lexNonSemanticWhitespace() { trivia.append(t) }

    // An optional trailing `?` or `+` selects the kind; otherwise eager.
    let kind: Located<Quant.Kind> = p.recordLoc { p in
      if p.tryEat("?") { return .reluctant }
      if p.tryEat("+") { return .possessive }
      return .eager
    }

    return (amt, kind, trivia)
  }
}

/// Try to consume a range, returning `nil` if unsuccessful.
Expand Down
15 changes: 15 additions & 0 deletions Sources/_RegexParser/Utility/Misc.swift
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,21 @@ extension Substring {
var string: String { String(self) }
}

extension Character {
  /// `true` when this character consists of a single Unicode scalar value.
  public var hasExactlyOneScalar: Bool {
    // A `Character` always holds at least one scalar, so there is exactly one
    // precisely when nothing is left after skipping the first.
    unicodeScalars.dropFirst().isEmpty
  }

  /// `true` when this character is in NFC form.
  internal var isNFC: Bool {
    // ASCII characters are reported as NFC without building a string.
    guard !isASCII else { return true }

    // Otherwise the character is NFC when its UTF-8 bytes equal the string's
    // `_nfcCodeUnits`.
    let text = String(self)
    return text.utf8.elementsEqual(text._nfcCodeUnits)
  }
}

extension CustomStringConvertible {
@_alwaysEmitIntoClient
public var halfWidthCornerQuoted: String {
Expand Down
165 changes: 162 additions & 3 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -775,9 +775,131 @@ fileprivate extension Compiler.ByteCodeGen {
builder.label(exit)
}

/// Merges adjacent literal members of a custom character class, which is
/// required in order to produce correct grapheme matching behavior.
///
/// Nested character classes, including both operands of the set operations,
/// are coalesced first. The resulting member list is then passed through
/// `coalescing(with:into:_:)` with an `Accumulator` that gathers atoms with a
/// literal character value, quoted literals, and ranges whose bounds both have
/// a literal character value.
///
/// - Parameter members: The members of a custom character class.
/// - Returns: The members with coalescable runs rewritten as quoted literals
///   and ranges.
func coalescingCustomCharacterClassMembers(
  _ members: [DSLTree.CustomCharacterClass.Member]
) -> [DSLTree.CustomCharacterClass.Member] {
  struct Accumulator {
    /// The text of every range operand gathered so far. For `[ab-cde-fg]`
    /// this ends up as `["ab", "cde", "fg"]`, from which the ranges are
    /// rebuilt in `finish()`.
    private var rangeOperands: [String] = [""]

    /// The operand currently being extended, i.e. the last entry.
    private var current: String {
      _read { yield rangeOperands[rangeOperands.endIndex - 1] }
      _modify { yield &rangeOperands[rangeOperands.endIndex - 1] }
    }

    /// Attempts to fold `member` into the accumulated operands.
    ///
    /// - Returns: `true` if the member was absorbed, `false` otherwise.
    mutating func tryAccumulate(
      _ member: DSLTree.CustomCharacterClass.Member
    ) -> Bool {
      switch member {
      case .trivia:
        // Trivia is only swallowed once something has been accumulated.
        return !current.isEmpty
      case .quotedLiteral(let literal):
        current.append(contentsOf: literal)
        return true
      case .atom(let atom):
        if let char = atom.literalCharacterValue {
          current.append(char)
          return true
        }
        return false
      case .range(let lower, let upper):
        if let lowerChar = lower.literalCharacterValue,
           let upperChar = upper.literalCharacterValue {
          // The lower bound ends the current operand; the upper bound starts
          // the next one.
          current.append(lowerChar)
          rangeOperands.append(String(upperChar))
          return true
        }
        return false
      default:
        return false
      }
    }

    /// Converts the accumulated operands back into character class members.
    func finish() -> [DSLTree.CustomCharacterClass.Member] {
      // A single operand means no range was accumulated, so the result is a
      // plain quoted literal.
      guard rangeOperands.count != 1 else {
        return [.quotedLiteral(current)]
      }
      var result: [DSLTree.CustomCharacterClass.Member] = []

      // N operands yield N - 1 ranges, one per neighbouring pair.
      let operandPairs = zip(rangeOperands, rangeOperands.dropFirst())
      for (position, (lower, upper)) in operandPairs.enumerated() {
        // The first operand only loses its last character (the range's lower
        // bound). Every later operand is also the upper bound of the previous
        // range, so it loses its first character as well.
        let literalPart = position == 0
          ? lower.dropLast()
          : lower.dropFirst().dropLast()
        if !literalPart.isEmpty {
          result.append(.quotedLiteral(String(literalPart)))
        }
        result.append(.range(.char(lower.last!), .char(upper.first!)))
      }

      // What remains is the quoted portion of the final operand.
      let tail = rangeOperands.last!.dropFirst()
      if !tail.isEmpty {
        result.append(.quotedLiteral(String(tail)))
      }
      return result
    }
  }

  // Recursively coalesce any child character classes before this level.
  let recursed = members.map { member -> DSLTree.CustomCharacterClass.Member in
    switch member {
    case .atom, .range, .quotedLiteral, .trivia:
      return member
    case .custom(let nested):
      return .custom(coalescingCustomCharacterClass(nested))
    case let .intersection(lhs, rhs):
      return .intersection(
        coalescingCustomCharacterClass(lhs),
        coalescingCustomCharacterClass(rhs))
    case let .subtraction(lhs, rhs):
      return .subtraction(
        coalescingCustomCharacterClass(lhs),
        coalescingCustomCharacterClass(rhs))
    case let .symmetricDifference(lhs, rhs):
      return .symmetricDifference(
        coalescingCustomCharacterClass(lhs),
        coalescingCustomCharacterClass(rhs))
    }
  }

  return recursed
    .coalescing(with: Accumulator(), into: { $0.finish() }) { accum, member in
      accum.tryAccumulate(member)
    }
}

/// Returns `ccc` with its members coalesced when matching with grapheme
/// cluster semantics, and `ccc` unchanged otherwise.
func coalescingCustomCharacterClass(
  _ ccc: DSLTree.CustomCharacterClass
) -> DSLTree.CustomCharacterClass {
  if options.semanticLevel == .graphemeCluster {
    let coalesced = coalescingCustomCharacterClassMembers(ccc.members)
    return DSLTree.CustomCharacterClass(
      members: coalesced, isInverted: ccc.isInverted)
  }

  // Scalar semantic mode must not merge scalars into graphemes, so that e.g.
  // `[e\u{301}-\u{302}]` remains a range between U+301 and U+302.
  return ccc
}

mutating func emitCustomCharacterClass(
_ ccc: DSLTree.CustomCharacterClass
) throws {
// Before emitting a custom character class in grapheme semantic mode, we
// need to coalesce together any adjacent characters and scalars, over which
// we can perform grapheme breaking. This includes e.g range bounds for
// `[e\u{301}-\u{302}]`.
let ccc = coalescingCustomCharacterClass(ccc)
if let asciiBitset = ccc.asAsciiBitset(options),
optimizationsEnabled {
if options.semanticLevel == .unicodeScalar {
Expand All @@ -791,6 +913,45 @@ fileprivate extension Compiler.ByteCodeGen {
}
}

/// Emits the components of a concatenation.
///
/// Nested concatenations are flattened first, then adjacent characters and
/// scalars are merged into quoted literals of their contents, over which
/// grapheme breaking can be performed. Each resulting component is emitted in
/// order.
///
/// - Parameter children: The child nodes of the concatenation.
/// - Throws: Whatever `emitConcatenationComponent` throws.
mutating func emitConcatenation(_ children: [DSLTree.Node]) throws {
  /// Expands nested concatenations and converted regex literals into a flat
  /// list of nodes; any other node is returned as a single-element list.
  func flattened(_ node: DSLTree.Node) -> [DSLTree.Node] {
    if case .concatenation(let nested) = node {
      return nested.flatMap(flattened)
    }
    if case .convertedRegexLiteral(let inner, _) = node {
      return flattened(inner)
    }
    return [node]
  }

  let components = children
    .flatMap(flattened)
    .coalescing(with: "", into: DSLTree.Node.quotedLiteral) { literal, node in
      switch node {
      case .trivia:
        // Trivia is only swallowed once something has been coalesced.
        return !literal.isEmpty
      case .quotedLiteral(let quoted):
        literal.append(contentsOf: quoted)
        return true
      case .atom(let atom):
        if let char = atom.literalCharacterValue {
          literal.append(char)
          return true
        }
        return false
      default:
        return false
      }
    }

  for component in components {
    try emitConcatenationComponent(component)
  }
}

@discardableResult
mutating func emitNode(_ node: DSLTree.Node) throws -> ValueRegister? {
switch node {
Expand All @@ -799,9 +960,7 @@ fileprivate extension Compiler.ByteCodeGen {
try emitAlternation(children)

case let .concatenation(children):
for child in children {
try emitConcatenationComponent(child)
}
try emitConcatenation(children)

case let .capture(name, refId, child, transform):
options.beginScope()
Expand Down
30 changes: 27 additions & 3 deletions Sources/_StringProcessing/Compiler.swift
Original file line number Diff line number Diff line change
Expand Up @@ -42,19 +42,43 @@ class Compiler {
}
}

/// Wraps an `Any.Type` so that it can be compared and hashed.
///
/// Equality uses metatype comparison and hashing uses the metatype's
/// `ObjectIdentifier`.
struct AnyHashableType: CustomStringConvertible, Hashable {
  /// The wrapped metatype.
  var ty: Any.Type

  init(_ ty: Any.Type) { self.ty = ty }

  /// The textual form of the wrapped type.
  var description: String {
    String(describing: ty)
  }

  static func == (lhs: Self, rhs: Self) -> Bool {
    ObjectIdentifier(lhs.ty) == ObjectIdentifier(rhs.ty)
  }

  func hash(into hasher: inout Hasher) {
    ObjectIdentifier(ty).hash(into: &hasher)
  }
}

/// An error produced when compiling a regular expression.
enum RegexCompilationError: Error, Hashable, CustomStringConvertible {
  // TODO: Source location?

  /// A reference was used before it captured any match.
  case uncapturedReference

  /// A regex was cast to `Regex<incorrect>` where `Regex<correct>` was
  /// expected. The types are wrapped in `AnyHashableType`, which is `Hashable`.
  case incorrectOutputType(incorrect: AnyHashableType, correct: AnyHashableType)

  /// The given character was used as a character class range bound.
  case invalidCharacterClassRangeOperand(Character)

  /// Convenience for building `.incorrectOutputType` directly from metatypes.
  static func incorrectOutputType(
    incorrect: Any.Type, correct: Any.Type
  ) -> Self {
    .incorrectOutputType(incorrect: .init(incorrect), correct: .init(correct))
  }

  var description: String {
    switch self {
    case .uncapturedReference:
      return "Found a reference used before it captured any match."
    case .incorrectOutputType(let incorrect, let correct):
      return "Cast to incorrect type 'Regex<\(incorrect)>', expected 'Regex<\(correct)>'"
    case .invalidCharacterClassRangeOperand(let c):
      return "'\(c)' is an invalid bound for character class range"
    }
  }
}
Expand Down
Loading