From d9d02c12bbec476afd75f380b6eb9b6c29575a37 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 12 May 2022 12:48:38 +0100 Subject: [PATCH] Ban `]` as literal first character of custom character class PCRE, Oniguruma, and ICU allow `]` to appear as the first member of a custom character class, and treat it as literal, due to empty character classes being invalid. However this behavior isn't particularly intuitive, and makes lexing heuristics harder to implement properly. Instead, reject such character classes as being empty, and require escaping if `]` is meant as the first character. --- Sources/_RegexParser/Regex/Parse/Parse.swift | 10 ------- Tests/RegexTests/MatchTests.swift | 1 - Tests/RegexTests/ParseTests.swift | 31 +++++++------------- 3 files changed, 10 insertions(+), 32 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 112f32358..013bb1361 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -465,16 +465,6 @@ extension Parser { var members: Array = [] try parseCCCMembers(into: &members) - // If we didn't parse any semantic members, we can eat a ']' character, as - // PCRE, Oniguruma, and ICU forbid empty character classes, and assume an - // initial ']' is literal. - if members.none(\.isSemantic) { - if let loc = source.tryEatWithLoc("]") { - members.append(.atom(.init(.char("]"), loc))) - try parseCCCMembers(into: &members) - } - } - // If we have a binary set operator, parse it and the next members. Note // that this means we left associate for a chain of operators. // TODO: We may want to diagnose and require users to disambiguate, at least diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 36056e85a..89a645375 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1282,7 +1282,6 @@ extension RegexTests { firstMatchTest(#"(?xx)[ \t]+"#, input: " \t \t", match: "\t") firstMatchTest("(?xx)[ a && ab ]+", input: " aaba ", match: "aa") - firstMatchTest("(?xx)[ ] a ]+", input: " a]]a ] ", match: "a]]a") } func testASCIIClasses() { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index ed930b0fe..62eb2f6bd 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -528,23 +528,7 @@ extension RegexTests { )) parseTest("[-]", charClass("-")) - - // Empty character classes are forbidden, therefore these are character - // classes containing literal ']'. - parseTest("[]]", charClass("]")) - parseTest("[]a]", charClass("]", "a")) - parseTest("(?x)[ ]]", concat( - changeMatchingOptions(matchingOptions(adding: .extended)), - charClass("]") - )) - parseTest("(?x)[ ] ]", concat( - changeMatchingOptions(matchingOptions(adding: .extended)), - charClass("]") - )) - parseTest("(?x)[ ] a ]", concat( - changeMatchingOptions(matchingOptions(adding: .extended)), - charClass("]", "a") - )) + parseTest(#"[\]]"#, charClass("]")) // These are metacharacters in certain contexts, but normal characters // otherwise. @@ -2497,10 +2481,15 @@ extension RegexTests { diagnosticTest("[a", .expected("]")) - // The first ']' of a custom character class is literal, so these are - // missing the closing bracket. - diagnosticTest("[]", .expected("]")) - diagnosticTest("(?x)[ ]", .expected("]")) + // Character classes may not be empty. + diagnosticTest("[]", .expectedCustomCharacterClassMembers) + diagnosticTest("[]]", .expectedCustomCharacterClassMembers) + diagnosticTest("[]a]", .expectedCustomCharacterClassMembers) + diagnosticTest("(?x)[ ]", .expectedCustomCharacterClassMembers) + diagnosticTest("(?x)[ ] ]", .expectedCustomCharacterClassMembers) + diagnosticTest("(?x)[ ] a ]", .expectedCustomCharacterClassMembers) + diagnosticTest("(?xx)[ ] a ]+", .expectedCustomCharacterClassMembers) + diagnosticTest("(?x)[ ]]", .expectedCustomCharacterClassMembers) diagnosticTest("[&&]", .expectedCustomCharacterClassMembers) diagnosticTest("[a&&]", .expectedCustomCharacterClassMembers)