[GR-32916] Handle comments in extended-mode Regexps when counting capture groups and others.

jirkamarsik · jirkamarsik · commit 4a9158422d34 · 2021-08-02T08:29:08.000Z
PullRequest: graal/9454
diff --git a/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/RubyTests.java b/regex/src/com.oracle.truffle.regex.test/src/com/oracle/truffle/regex/tregex/test/RubyTests.java
@@ -340,6 +340,7 @@ public void treatLeadingClosingBracketsInCharClassesAsLiteralCharacters() {
         test("\\A[^]]\\z", "", "a", 0, true, 0, 1);
     }
 
+    @Test
     public void ignoreAtomicGroups() {
         test("(?>foo)", "", "foo", 0, true, 0, 3);
     }
@@ -350,4 +351,31 @@ public void reportBacktracking() {
         Assert.assertTrue(compileRegex("(?:foo){64}", "").getMember("isBacktracking").asBoolean());
         Assert.assertTrue(compileRegex("(x+)\\1", "").getMember("isBacktracking").asBoolean());
     }
+
+    @Test
+    public void lineBreakEscape() {
+        test("\\R", "", "\r", 0, true, 0, 1);
+        test("\\R", "", "\n", 0, true, 0, 1);
+        test("\\R", "", "\r\n", 0, true, 0, 2);
+
+        test("\\A\\R\\R\\z", "", "\r\r", 0, true, 0, 2);
+        test("\\A\\R\\R\\z", "", "\n\n", 0, true, 0, 2);
+        test("\\A\\R\\R\\z", "", "\r\n", 0, false);
+    }
+
+    @Test
+    public void github2412() {
+        // Checkstyle: stop line length
+        // 1 root capture group and 16 named capture groups
+        Assert.assertEquals(1 + 16, compileRegex("           % (?<type>%)\n" +
+                        "          | % (?<flags>(?-mix:[ #0+-]|(?-mix:(\\d+)\\$))*)\n" +
+                        "            (?:\n" +
+                        "              (?: (?-mix:(?<width>(?-mix:\\d+|(?-mix:\\*(?-mix:(\\d+)\\$)?))))? (?-mix:\\.(?<precision>(?-mix:\\d+|(?-mix:\\*(?-mix:(\\d+)\\$)?))))? (?-mix:<(?<name>\\w+)>)?\n" +
+                        "                | (?-mix:(?<width>(?-mix:\\d+|(?-mix:\\*(?-mix:(\\d+)\\$)?))))? (?-mix:<(?<name>\\w+)>) (?-mix:\\.(?<precision>(?-mix:\\d+|(?-mix:\\*(?-mix:(\\d+)\\$)?))))?\n" +
+                        "                | (?-mix:<(?<name>\\w+)>) (?<more_flags>(?-mix:[ #0+-]|(?-mix:(\\d+)\\$))*) (?-mix:(?<width>(?-mix:\\d+|(?-mix:\\*(?-mix:(\\d+)\\$)?))))? (?-mix:\\.(?<precision>(?-mix:\\d+|(?-mix:\\*(?-mix:(\\d+)\\$)?))))?\n" +
+                        "              ) (?-mix:(?<type>[bBdiouxXeEfgGaAcps]))\n" +
+                        "              | (?-mix:(?<width>(?-mix:\\d+|(?-mix:\\*(?-mix:(\\d+)\\$)?))))? (?-mix:\\.(?<precision>(?-mix:\\d+|(?-mix:\\*(?-mix:(\\d+)\\$)?))))? (?-mix:\\{(?<name>\\w+)\\})\n" +
+                        "            )", "x").getMember("groupCount").asInt());
+        // Checkstyle: resume line length
+    }
 }
diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/RubyFlavor.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/RubyFlavor.java
@@ -65,11 +65,11 @@
  * state so that it deletes any characters matched so far and considers the current position as the
  * start of the reported match. There is no operator like this in ECMAScript that would allow one to
  * tinker with the matcher's state.</li>
- * <li>named capture groups with the same name: Ruby admits regular expressions with named capture
- * groups that share the same name. These situations can't be handled by replacing those capture
- * groups with regular numbered capture groups and then mapping the capture group names to lists of
- * capture group indices as we wouldn't know which of the homonymous capture groups was matched last
- * and therefore which value should be used.</li>
+ * <li>backreferences to named capture groups with the same name: Ruby admits regular expressions
+ * with named capture groups that share the same name. These situations can't be handled by
+ * replacing those capture groups with regular numbered capture groups and then mapping the capture
+ * group names to lists of capture group indices as we wouldn't know which of the homonymous capture
+ * groups was matched last and therefore which value should be used.</li>
  * <li>Unicode character properties not supported by ECMAScript and not covered by the POSIX
  * character classes: Ruby regular expressions use the syntax \p{...} for Unicode character
  * properties. Similar to ECMAScript, they offer access to Unicode Scripts, General Categories and
@@ -86,11 +86,10 @@
  * also don't support those backreferences.</li>
  * <li>(?>....) atomic groups: This construct allows control over the matcher's backtracking by
  * making committed choices which can't be undone. This is not something we can support using
- * ECMAScript regexes.</li>
+ * ECMAScript regexes, however there is an option ({@code IgnoreAtomicGroups}) that lets atomic
+ * groups be treated like any other groups.</li>
  * <li>\X extended grapheme cluster escapes: This is just syntactic sugar for a certain expression
  * which uses atomic groups, and it is therefore not supported.</li>
- * <li>\R line break escapes: These are also translated by Joni to atomic groups, which we do not
- * support.</li>
  * <li>possessive quantifiers, e.g. a*+: Possessive quantifiers are quantifiers which consume
  * greedily and also do not allow backtracking, so they are another example of the atomic groups
  * that we do not support (a*+ is equivalent to (?>a*)).</li>
diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/RubyFlavorProcessor.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/parser/flavors/RubyFlavorProcessor.java
@@ -45,10 +45,12 @@
 import java.util.ArrayList;
 import java.util.Deque;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.Optional;
+import java.util.Set;
 import java.util.function.BiConsumer;
 import java.util.function.Predicate;
 import java.util.regex.Matcher;
@@ -327,6 +329,12 @@ private enum PosixClassParseResult {
      * named capture groups so far.
      */
     private Map<String, Integer> namedCaptureGroups;
+    /**
+     * A set of capture group names which occur repeatedly in the expression. Backreferences to
+     * such capture groups can refer to either of the homonymous capture groups, depending on which
+     * of them matched most recently. Such backreferences are not supported in TRegex.
+     */
+    private Set<String> ambiguousCaptureGroups;
 
     /**
      * The number of capture groups encountered in the input pattern so far, i.e. the (zero-based)
@@ -400,6 +408,7 @@ public RubyFlavorProcessor(RegexSource source) {
         this.lookbehindDepth = 0;
         this.groupStack = new ArrayDeque<>();
         this.namedCaptureGroups = null;
+        this.ambiguousCaptureGroups = null;
         this.groupIndex = 0;
         this.lastTerm = TermCategory.None;
         this.lastTermOutPosition = -1;
@@ -651,19 +660,22 @@ private void scanForCaptureGroups() {
         while (!atEnd()) {
             switch (consumeChar()) {
                 case '\\':
-                    while (match("c") || match("C-") || match("M-")) {
-                        // skip control escape sequences, \\cX, \\C-X or \\M-X, which can be nested
-                    }
-                    // skip escaped char; if it includes a group name, skip that too
-                    int c = consumeChar();
-                    switch (c) {
+                    switch (curChar()) {
                         case 'k':
                         case 'g':
                             // skip contents of group name (which might contain syntax chars)
+                            int c = consumeChar();
                             if (match("<")) {
                                 parseGroupReference('>', true, true, c == 'k', true);
                             }
                             break;
+                        default:
+                            while (match("c") || match("C-") || match("M-")) {
+                                // skip control escape sequences, \\cX, \\C-X or \\M-X, which can be
+                                // nested
+                            }
+                            // skip escaped char
+                            advance();
                     }
                     break;
                 case '[':
@@ -680,14 +692,19 @@ private void scanForCaptureGroups() {
                 case '(':
                     if (charClassDepth == 0) {
                         if (match("?")) {
-                            if (match("<") && curChar() != '=' && curChar() != '!') {
+                            if (match("<")) {
+                                if (curChar() == '=' || curChar() == '!') {
+                                    // look-behind
+                                    break;
+                                }
                                 String groupName = parseGroupName('>');
                                 if (namedCaptureGroups == null) {
                                     namedCaptureGroups = new HashMap<>();
+                                    ambiguousCaptureGroups = new HashSet<>();
                                     numberOfCaptureGroups = 0;
                                 }
                                 if (namedCaptureGroups.containsKey(groupName)) {
-                                    bailOut("different capture groups with the same name are not supported");
+                                    ambiguousCaptureGroups.add(groupName);
                                 }
                                 numberOfCaptureGroups++;
                                 namedCaptureGroups.put(groupName, numberOfCaptureGroups);
@@ -708,12 +725,14 @@ private void scanForCaptureGroups() {
                     }
                     break;
                 case '#':
-                    if (globalFlags.isExtended()) {
-                        int endOfLine = inPattern.indexOf('\n', position);
-                        if (endOfLine >= 0) {
-                            position = endOfLine + 1;
-                        } else {
-                            position = inPattern.length();
+                    if (charClassDepth == 0) {
+                        if (globalFlags.isExtended()) {
+                            int endOfLine = inPattern.indexOf('\n', position);
+                            if (endOfLine >= 0) {
+                                position = endOfLine + 1;
+                            } else {
+                                position = inPattern.length();
+                            }
                         }
                     }
                     break;
@@ -1339,6 +1358,9 @@ private int parseGroupReference(char terminator, boolean allowNumeric, boolean a
                     throw syntaxErrorAt(RbErrorMessages.unknownGroupName(groupName), beginPos);
                 }
             } else {
+                if (ambiguousCaptureGroups.contains(groupName)) {
+                    bailOut("backreferences to multiple homonymous named capture groups are not supported");
+                }
                 groupNumber = namedCaptureGroups.get(groupName);
             }
         }
@@ -1389,7 +1411,13 @@ private boolean isCaptureGroupOpen(int groupNumber) {
     private boolean lineBreak() {
         if (curChar() == 'R') {
             advance();
-            bailOut("line break escape not supported");
+            // When matching \\x0d, we check that it is not followed by \\x0a to emulate the
+            // atomic group in the original Ruby expansion: (?>\x0d\x0a|[\x0a-\x0d\x85\u2028\u2029])
+            if (inSource.getEncoding().isUnicode()) {
+                emitSnippet("(?:\\x0d\\x0a|\\x0d(?!\\x0a)|[\\x0a-\\x0c\\x85\\u2028\\u2029])");
+            } else {
+                emitSnippet("(?:\\x0d\\x0a|\\x0d(?!\\x0a)|[\\x0a-\\x0c])");
+            }
             return true;
         } else {
             return false;
diff --git a/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/string/Encodings.java b/regex/src/com.oracle.truffle.regex/src/com/oracle/truffle/regex/tregex/string/Encodings.java
@@ -101,6 +101,8 @@ public int getMinValue() {
 
         public abstract boolean isFixedCodePointWidth(CodePointSet set);
 
+        public abstract boolean isUnicode();
+
         public abstract AbstractStringBuffer createStringBuffer(int capacity);
 
         public abstract DFAStateNode.LoopOptimizationNode extractLoopOptNode(CodePointSet loopCPS);
@@ -145,6 +147,11 @@ public boolean isFixedCodePointWidth(CodePointSet set) {
                 return true;
             }
 
+            @Override
+            public boolean isUnicode() {
+                return true;
+            }
+
             @Override
             public StringBufferUTF32 createStringBuffer(int capacity) {
                 return new StringBufferUTF32(capacity);
@@ -220,6 +227,11 @@ public boolean isFixedCodePointWidth(CodePointSet set) {
                 return !(min < 0x10000 && max > 0x10000);
             }
 
+            @Override
+            public boolean isUnicode() {
+                return true;
+            }
+
             @Override
             public LoopOptimizationNode extractLoopOptNode(CodePointSet cps) {
                 if (cps.inverseGetMax(this) <= 0xffff) {
@@ -315,6 +327,11 @@ public boolean isFixedCodePointWidth(CodePointSet set) {
                 return true;
             }
 
+            @Override
+            public boolean isUnicode() {
+                return true;
+            }
+
             @Override
             public StringBufferUTF16 createStringBuffer(int capacity) {
                 return new StringBufferUTF16(capacity);
@@ -399,6 +416,11 @@ public boolean isFixedCodePointWidth(CodePointSet set) {
                 return !(min < 0x80 && max >= 0x80 || min < 0x800 && max >= 0x800 || min < 0x10000 && max > 0x10000);
             }
 
+            @Override
+            public boolean isUnicode() {
+                return true;
+            }
+
             @Override
             public StringBufferUTF8 createStringBuffer(int capacity) {
                 return new StringBufferUTF8(capacity);
@@ -465,6 +487,11 @@ public boolean isFixedCodePointWidth(CodePointSet set) {
                 return true;
             }
 
+            @Override
+            public boolean isUnicode() {
+                return false;
+            }
+
             @Override
             public StringBufferLATIN1 createStringBuffer(int capacity) {
                 return new StringBufferLATIN1(capacity);