Skip to content

Commit 4a91584

Browse files
committed
[GR-32916] Handle comments in extended-mode Regexps when counting capture groups and others.
PullRequest: graal/9454
2 parents 519c9b3 + b664d3b commit 4a91584

File tree

4 files changed

+105
-23
lines changed

4 files changed

+105
-23
lines changed

regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/RubyTests.java

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,7 @@ public void treatLeadingClosingBracketsInCharClassesAsLiteralCharacters() {
340340
test("\\A[^]]\\z", "", "a", 0, true, 0, 1);
341341
}
342342

343+
@Test
343344
public void ignoreAtomicGroups() {
344345
test("(?>foo)", "", "foo", 0, true, 0, 3);
345346
}
@@ -350,4 +351,31 @@ public void reportBacktracking() {
350351
Assert.assertTrue(compileRegex("(?:foo){64}", "").getMember("isBacktracking").asBoolean());
351352
Assert.assertTrue(compileRegex("(x+)\\1", "").getMember("isBacktracking").asBoolean());
352353
}
354+
355+
@Test
356+
public void lineBreakEscape() {
357+
test("\\R", "", "\r", 0, true, 0, 1);
358+
test("\\R", "", "\n", 0, true, 0, 1);
359+
test("\\R", "", "\r\n", 0, true, 0, 2);
360+
361+
test("\\A\\R\\R\\z", "", "\r\r", 0, true, 0, 2);
362+
test("\\A\\R\\R\\z", "", "\n\n", 0, true, 0, 2);
363+
test("\\A\\R\\R\\z", "", "\r\n", 0, false);
364+
}
365+
366+
@Test
367+
public void github2412() {
368+
// Checkstyle: stop line length
369+
// 1 root capture group and 16 named capture groups
370+
Assert.assertEquals(1 + 16, compileRegex(" % (?<type>%)\n" +
371+
" | % (?<flags>(?-mix:[ #0+-]|(?-mix:(\\d+)\\$))*)\n" +
372+
" (?:\n" +
373+
" (?: (?-mix:(?<width>(?-mix:\\d+|(?-mix:\\*(?-mix:(\\d+)\\$)?))))? (?-mix:\\.(?<precision>(?-mix:\\d+|(?-mix:\\*(?-mix:(\\d+)\\$)?))))? (?-mix:<(?<name>\\w+)>)?\n" +
374+
" | (?-mix:(?<width>(?-mix:\\d+|(?-mix:\\*(?-mix:(\\d+)\\$)?))))? (?-mix:<(?<name>\\w+)>) (?-mix:\\.(?<precision>(?-mix:\\d+|(?-mix:\\*(?-mix:(\\d+)\\$)?))))?\n" +
375+
" | (?-mix:<(?<name>\\w+)>) (?<more_flags>(?-mix:[ #0+-]|(?-mix:(\\d+)\\$))*) (?-mix:(?<width>(?-mix:\\d+|(?-mix:\\*(?-mix:(\\d+)\\$)?))))? (?-mix:\\.(?<precision>(?-mix:\\d+|(?-mix:\\*(?-mix:(\\d+)\\$)?))))?\n" +
376+
" ) (?-mix:(?<type>[bBdiouxXeEfgGaAcps]))\n" +
377+
" | (?-mix:(?<width>(?-mix:\\d+|(?-mix:\\*(?-mix:(\\d+)\\$)?))))? (?-mix:\\.(?<precision>(?-mix:\\d+|(?-mix:\\*(?-mix:(\\d+)\\$)?))))? (?-mix:\\{(?<name>\\w+)\\})\n" +
378+
" )", "x").getMember("groupCount").asInt());
379+
// Checkstyle: resume line length
380+
}
353381
}

regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/RubyFlavor.java

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -65,11 +65,11 @@
6565
* state so that it deletes any characters matched so far and considers the current position as the
6666
* start of the reported match. There is no operator like this in ECMAScript that would allow one to
6767
* tinker with the matcher's state.</li>
68-
* <li>named capture groups with the same name: Ruby admits regular expressions with named capture
69-
* groups that share the same name. These situations can't be handled by replacing those capture
70-
* groups with regular numbered capture groups and then mapping the capture group names to lists of
71-
* capture group indices as we wouldn't know which of the homonymous capture groups was matched last
72-
* and therefore which value should be used.</li>
68+
* <li>backreferences to named capture groups with the same name: Ruby admits regular expressions
69+
* with named capture groups that share the same name. These situations can't be handled by
70+
* replacing those capture groups with regular numbered capture groups and then mapping the capture
71+
* group names to lists of capture group indices as we wouldn't know which of the homonymous capture
72+
* groups was matched last and therefore which value should be used.</li>
7373
* <li>Unicode character properties not supported by ECMAScript and not covered by the POSIX
7474
* character classes: Ruby regular expressions use the syntax \p{...} for Unicode character
7575
* properties. Similar to ECMAScript, they offer access to Unicode Scripts, General Categories and
@@ -86,11 +86,10 @@
8686
* also don't support those backreferences.</li>
8787
* <li>(?>....) atomic groups: This construct allows control over the matcher's backtracking by
8888
* making committed choices which can't be undone. This is not something we can support using
89-
* ECMAScript regexes.</li>
89+
* ECMAScript regexes, however these is an option ({@code IgnoreAtomicGroups}), that lets atomic
90+
* groups be treated like any other groups.</li>
9091
* <li>\X extended grapheme cluster escapes: This is just syntactic sugar for a certain expression
9192
* which uses atomic groups, and it is therefore not supported.</li>
92-
* <li>\R line break escapes: These are also translated by Joni to atomic groups, which we do not
93-
* support.</li>
9493
* <li>possessive quantifiers, e.g. a*+: Possessive quantifiers are quantifiers which consume
9594
* greedily and also do not allow backtracking, so they are another example of the atomic groups
9695
* that we do not support (a*+ is equivalent to (?>a*)).</li>

regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/RubyFlavorProcessor.java

Lines changed: 43 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,12 @@
4545
import java.util.ArrayList;
4646
import java.util.Deque;
4747
import java.util.HashMap;
48+
import java.util.HashSet;
4849
import java.util.LinkedList;
4950
import java.util.List;
5051
import java.util.Map;
5152
import java.util.Optional;
53+
import java.util.Set;
5254
import java.util.function.BiConsumer;
5355
import java.util.function.Predicate;
5456
import java.util.regex.Matcher;
@@ -327,6 +329,12 @@ private enum PosixClassParseResult {
327329
* named capture groups so far.
328330
*/
329331
private Map<String, Integer> namedCaptureGroups;
332+
/**
333+
* A set of capture groups names which occur repeatedly in the expression. Backreferences to
334+
* such capture groups can refer to either of the homonymous capture groups, depending on which
335+
* of them matched most recently. Such backreferences are not supported in TRegex.
336+
*/
337+
private Set<String> ambiguousCaptureGroups;
330338

331339
/**
332340
* The number of capture groups encountered in the input pattern so far, i.e. the (zero-based)
@@ -400,6 +408,7 @@ public RubyFlavorProcessor(RegexSource source) {
400408
this.lookbehindDepth = 0;
401409
this.groupStack = new ArrayDeque<>();
402410
this.namedCaptureGroups = null;
411+
this.ambiguousCaptureGroups = null;
403412
this.groupIndex = 0;
404413
this.lastTerm = TermCategory.None;
405414
this.lastTermOutPosition = -1;
@@ -651,19 +660,22 @@ private void scanForCaptureGroups() {
651660
while (!atEnd()) {
652661
switch (consumeChar()) {
653662
case '\\':
654-
while (match("c") || match("C-") || match("M-")) {
655-
// skip control escape sequences, \\cX, \\C-X or \\M-X, which can be nested
656-
}
657-
// skip escaped char; if it includes a group name, skip that too
658-
int c = consumeChar();
659-
switch (c) {
663+
switch (curChar()) {
660664
case 'k':
661665
case 'g':
662666
// skip contents of group name (which might contain syntax chars)
667+
int c = consumeChar();
663668
if (match("<")) {
664669
parseGroupReference('>', true, true, c == 'k', true);
665670
}
666671
break;
672+
default:
673+
while (match("c") || match("C-") || match("M-")) {
674+
// skip control escape sequences, \\cX, \\C-X or \\M-X, which can be
675+
// nested
676+
}
677+
// skip escaped char
678+
advance();
667679
}
668680
break;
669681
case '[':
@@ -680,14 +692,19 @@ private void scanForCaptureGroups() {
680692
case '(':
681693
if (charClassDepth == 0) {
682694
if (match("?")) {
683-
if (match("<") && curChar() != '=' && curChar() != '!') {
695+
if (match("<")) {
696+
if (curChar() == '=' || curChar() == '!') {
697+
// look-behind
698+
break;
699+
}
684700
String groupName = parseGroupName('>');
685701
if (namedCaptureGroups == null) {
686702
namedCaptureGroups = new HashMap<>();
703+
ambiguousCaptureGroups = new HashSet<>();
687704
numberOfCaptureGroups = 0;
688705
}
689706
if (namedCaptureGroups.containsKey(groupName)) {
690-
bailOut("different capture groups with the same name are not supported");
707+
ambiguousCaptureGroups.add(groupName);
691708
}
692709
numberOfCaptureGroups++;
693710
namedCaptureGroups.put(groupName, numberOfCaptureGroups);
@@ -708,12 +725,14 @@ private void scanForCaptureGroups() {
708725
}
709726
break;
710727
case '#':
711-
if (globalFlags.isExtended()) {
712-
int endOfLine = inPattern.indexOf('\n', position);
713-
if (endOfLine >= 0) {
714-
position = endOfLine + 1;
715-
} else {
716-
position = inPattern.length();
728+
if (charClassDepth == 0) {
729+
if (globalFlags.isExtended()) {
730+
int endOfLine = inPattern.indexOf('\n', position);
731+
if (endOfLine >= 0) {
732+
position = endOfLine + 1;
733+
} else {
734+
position = inPattern.length();
735+
}
717736
}
718737
}
719738
break;
@@ -1339,6 +1358,9 @@ private int parseGroupReference(char terminator, boolean allowNumeric, boolean a
13391358
throw syntaxErrorAt(RbErrorMessages.unknownGroupName(groupName), beginPos);
13401359
}
13411360
} else {
1361+
if (ambiguousCaptureGroups.contains(groupName)) {
1362+
bailOut("backreferences to multiple homonymous named capture groups are not supported");
1363+
}
13421364
groupNumber = namedCaptureGroups.get(groupName);
13431365
}
13441366
}
@@ -1389,7 +1411,13 @@ private boolean isCaptureGroupOpen(int groupNumber) {
13891411
private boolean lineBreak() {
13901412
if (curChar() == 'R') {
13911413
advance();
1392-
bailOut("line break escape not supported");
1414+
// When matching \\x0d, we check that it is not followed by \\x0a to emulate the
1415+
// atomic group in the original Ruby expansion: (?>\x0d\x0a|[\x0a-\x0d\x85\u2028\u2029])
1416+
if (inSource.getEncoding().isUnicode()) {
1417+
emitSnippet("(?:\\x0d\\x0a|\\x0d(?!\\x0a)|[\\x0a-\\x0c\\x85\\u2028\\u2029])");
1418+
} else {
1419+
emitSnippet("(?:\\x0d\\x0a|\\x0d(?!\\x0a)|[\\x0a-\\x0c])");
1420+
}
13931421
return true;
13941422
} else {
13951423
return false;

regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/string/Encodings.java

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,8 @@ public int getMinValue() {
101101

102102
public abstract boolean isFixedCodePointWidth(CodePointSet set);
103103

104+
public abstract boolean isUnicode();
105+
104106
public abstract AbstractStringBuffer createStringBuffer(int capacity);
105107

106108
public abstract DFAStateNode.LoopOptimizationNode extractLoopOptNode(CodePointSet loopCPS);
@@ -145,6 +147,11 @@ public boolean isFixedCodePointWidth(CodePointSet set) {
145147
return true;
146148
}
147149

150+
@Override
151+
public boolean isUnicode() {
152+
return true;
153+
}
154+
148155
@Override
149156
public StringBufferUTF32 createStringBuffer(int capacity) {
150157
return new StringBufferUTF32(capacity);
@@ -220,6 +227,11 @@ public boolean isFixedCodePointWidth(CodePointSet set) {
220227
return !(min < 0x10000 && max > 0x10000);
221228
}
222229

230+
@Override
231+
public boolean isUnicode() {
232+
return true;
233+
}
234+
223235
@Override
224236
public LoopOptimizationNode extractLoopOptNode(CodePointSet cps) {
225237
if (cps.inverseGetMax(this) <= 0xffff) {
@@ -315,6 +327,11 @@ public boolean isFixedCodePointWidth(CodePointSet set) {
315327
return true;
316328
}
317329

330+
@Override
331+
public boolean isUnicode() {
332+
return true;
333+
}
334+
318335
@Override
319336
public StringBufferUTF16 createStringBuffer(int capacity) {
320337
return new StringBufferUTF16(capacity);
@@ -399,6 +416,11 @@ public boolean isFixedCodePointWidth(CodePointSet set) {
399416
return !(min < 0x80 && max >= 0x80 || min < 0x800 && max >= 0x800 || min < 0x10000 && max > 0x10000);
400417
}
401418

419+
@Override
420+
public boolean isUnicode() {
421+
return true;
422+
}
423+
402424
@Override
403425
public StringBufferUTF8 createStringBuffer(int capacity) {
404426
return new StringBufferUTF8(capacity);
@@ -465,6 +487,11 @@ public boolean isFixedCodePointWidth(CodePointSet set) {
465487
return true;
466488
}
467489

490+
@Override
491+
public boolean isUnicode() {
492+
return false;
493+
}
494+
468495
@Override
469496
public StringBufferLATIN1 createStringBuffer(int capacity) {
470497
return new StringBufferLATIN1(capacity);

0 commit comments

Comments
 (0)