Skip to content

Commit 0e379d4

Browse files
miss-islington and animalize authored
bpo-34294: re module, fix wrong capturing groups in rare cases. (GH-11546)
Capturing groups need to be reset between two SRE(match) calls in loops; this fixes wrong capturing groups in rare cases. Also add a missing index entry in re.rst. (cherry picked from commit 4a7f44a) Co-authored-by: animalize <[email protected]>
1 parent a01065a commit 0e379d4

File tree

5 files changed

+49
-1
lines changed

5 files changed

+49
-1
lines changed

Doc/library/re.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,8 @@ The special characters are:
368368
``(?#...)``
369369
A comment; the contents of the parentheses are simply ignored.
370370

371+
.. index:: single: (?=; in regular expressions
372+
371373
``(?=...)``
372374
Matches if ``...`` matches next, but doesn't consume any of the string. This is
373375
called a :dfn:`lookahead assertion`. For example, ``Isaac (?=Asimov)`` will match

Lib/test/test_re.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2031,6 +2031,40 @@ def test_bug_29444(self):
20312031
self.assertEqual(m.group(), b'xyz')
20322032
self.assertEqual(m2.group(), b'')
20332033

2034+
def test_bug_34294(self):
2035+
# Issue 34294: wrong capturing groups
2036+
2037+
# exists since Python 2
2038+
s = "a\tx"
2039+
p = r"\b(?=(\t)|(x))x"
2040+
self.assertEqual(re.search(p, s).groups(), (None, 'x'))
2041+
2042+
# introduced in Python 3.7.0
2043+
s = "ab"
2044+
p = r"(?=(.)(.)?)"
2045+
self.assertEqual(re.findall(p, s),
2046+
[('a', 'b'), ('b', '')])
2047+
self.assertEqual([m.groups() for m in re.finditer(p, s)],
2048+
[('a', 'b'), ('b', None)])
2049+
2050+
# test-cases provided by issue34294, introduced in Python 3.7.0
2051+
p = r"(?=<(?P<tag>\w+)/?>(?:(?P<text>.+?)</(?P=tag)>)?)"
2052+
s = "<test><foo2/></test>"
2053+
self.assertEqual(re.findall(p, s),
2054+
[('test', '<foo2/>'), ('foo2', '')])
2055+
self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
2056+
[{'tag': 'test', 'text': '<foo2/>'},
2057+
{'tag': 'foo2', 'text': None}])
2058+
s = "<test>Hello</test><foo/>"
2059+
self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
2060+
[{'tag': 'test', 'text': 'Hello'},
2061+
{'tag': 'foo', 'text': None}])
2062+
s = "<test>Hello</test><foo/><foo/>"
2063+
self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
2064+
[{'tag': 'test', 'text': 'Hello'},
2065+
{'tag': 'foo', 'text': None},
2066+
{'tag': 'foo', 'text': None}])
2067+
20342068

20352069
class PatternReprTests(unittest.TestCase):
20362070
def check(self, pattern, expected):
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
re module, fix wrong capturing groups in rare cases. :func:`re.search`,
2+
:func:`re.findall`, :func:`re.sub` and other functions that scan through a
3+
string looking for a match should reset capturing groups between two match
4+
attempts. Patch by Ma Lin.

Modules/_sre.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@ _sre_unicode_tolower_impl(PyObject *module, int character)
347347
LOCAL(void)
348348
state_reset(SRE_STATE* state)
349349
{
350-
/* FIXME: dynamic! */
350+
/* state->mark will be set to 0 in SRE_OP_MARK dynamically. */
351351
/*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
352352

353353
state->lastmark = -1;

Modules/sre_lib.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1363,6 +1363,10 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel)
13631363
return ret; /* should never get here */
13641364
}
13651365

1366+
/* need to reset capturing groups between two SRE(match) calls in loops */
1367+
#define RESET_CAPTURE_GROUP() \
1368+
do { state->lastmark = state->lastindex = -1; } while (0)
1369+
13661370
LOCAL(Py_ssize_t)
13671371
SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
13681372
{
@@ -1440,6 +1444,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
14401444
if (status != 0)
14411445
return status;
14421446
++ptr;
1447+
RESET_CAPTURE_GROUP();
14431448
}
14441449
return 0;
14451450
}
@@ -1487,6 +1492,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
14871492
/* close but no cigar -- try again */
14881493
if (++ptr >= end)
14891494
return 0;
1495+
RESET_CAPTURE_GROUP();
14901496
}
14911497
i = overlap[i];
14921498
} while (i != 0);
@@ -1510,6 +1516,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
15101516
if (status != 0)
15111517
break;
15121518
ptr++;
1519+
RESET_CAPTURE_GROUP();
15131520
}
15141521
} else {
15151522
/* general case */
@@ -1520,6 +1527,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
15201527
state->must_advance = 0;
15211528
while (status == 0 && ptr < end) {
15221529
ptr++;
1530+
RESET_CAPTURE_GROUP();
15231531
TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
15241532
state->start = state->ptr = ptr;
15251533
status = SRE(match)(state, pattern, 0);

0 commit comments

Comments
 (0)