[Python-checkins] bpo-34294: re module, fix wrong capturing groups in rare cases. (GH-11546)

Miss Islington (bot) webhook-mailer at python.org
Mon Feb 18 08:48:27 EST 2019


https://github.com/python/cpython/commit/0e379d43acc25277f02262212932d3c589a2031b
commit: 0e379d43acc25277f02262212932d3c589a2031b
branch: 3.7
author: Miss Islington (bot) <31488909+miss-islington at users.noreply.github.com>
committer: GitHub <noreply at github.com>
date: 2019-02-18T05:48:23-08:00
summary:

bpo-34294: re module, fix wrong capturing groups in rare cases. (GH-11546)


Capturing groups need to be reset between successive SRE(match) calls in loops; this fixes wrong capturing groups in rare cases.

Also add a missing index entry in re.rst.
(cherry picked from commit 4a7f44a2ed49ff1e87db062e7177a56c6e4bbdb0)

Co-authored-by: animalize <animalize at users.noreply.github.com>

files:
A Misc/NEWS.d/next/Library/2019-01-14-11-53-10.bpo-34294.3JFdg2.rst
M Doc/library/re.rst
M Lib/test/test_re.py
M Modules/_sre.c
M Modules/sre_lib.h

diff --git a/Doc/library/re.rst b/Doc/library/re.rst
index e6455bb359df..39ba44eba1c1 100644
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -368,6 +368,8 @@ The special characters are:
 ``(?#...)``
    A comment; the contents of the parentheses are simply ignored.
 
+.. index:: single: (?=; in regular expressions
+
 ``(?=...)``
    Matches if ``...`` matches next, but doesn't consume any of the string.  This is
    called a :dfn:`lookahead assertion`.  For example, ``Isaac (?=Asimov)`` will match
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 9fed4bef8809..0b710e3766ab 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -2031,6 +2031,40 @@ def test_bug_29444(self):
         self.assertEqual(m.group(), b'xyz')
         self.assertEqual(m2.group(), b'')
 
+    def test_bug_34294(self):
+        # Issue 34294: wrong capturing groups
+
+        # exists since Python 2
+        s = "a\tx"
+        p = r"\b(?=(\t)|(x))x"
+        self.assertEqual(re.search(p, s).groups(), (None, 'x'))
+
+        # introduced in Python 3.7.0
+        s = "ab"
+        p = r"(?=(.)(.)?)"
+        self.assertEqual(re.findall(p, s),
+                         [('a', 'b'), ('b', '')])
+        self.assertEqual([m.groups() for m in re.finditer(p, s)],
+                         [('a', 'b'), ('b', None)])
+
+        # test-cases provided by issue34294, introduced in Python 3.7.0
+        p = r"(?=<(?P<tag>\w+)/?>(?:(?P<text>.+?)</(?P=tag)>)?)"
+        s = "<test><foo2/></test>"
+        self.assertEqual(re.findall(p, s),
+                         [('test', '<foo2/>'), ('foo2', '')])
+        self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
+                         [{'tag': 'test', 'text': '<foo2/>'},
+                          {'tag': 'foo2', 'text': None}])
+        s = "<test>Hello</test><foo/>"
+        self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
+                         [{'tag': 'test', 'text': 'Hello'},
+                          {'tag': 'foo', 'text': None}])
+        s = "<test>Hello</test><foo/><foo/>"
+        self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
+                         [{'tag': 'test', 'text': 'Hello'},
+                          {'tag': 'foo', 'text': None},
+                          {'tag': 'foo', 'text': None}])
+
 
 class PatternReprTests(unittest.TestCase):
     def check(self, pattern, expected):
diff --git a/Misc/NEWS.d/next/Library/2019-01-14-11-53-10.bpo-34294.3JFdg2.rst b/Misc/NEWS.d/next/Library/2019-01-14-11-53-10.bpo-34294.3JFdg2.rst
new file mode 100644
index 000000000000..e1ae2ea6a337
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2019-01-14-11-53-10.bpo-34294.3JFdg2.rst
@@ -0,0 +1,4 @@
+re module, fix wrong capturing groups in rare cases. :func:`re.search`,
+:func:`re.findall`, :func:`re.sub` and other functions that scan through
+string looking for a match, should reset capturing groups between two match
+attempts. Patch by Ma Lin.
\ No newline at end of file
diff --git a/Modules/_sre.c b/Modules/_sre.c
index d2ea62d55a81..a97ce7790e39 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -347,7 +347,7 @@ _sre_unicode_tolower_impl(PyObject *module, int character)
 LOCAL(void)
 state_reset(SRE_STATE* state)
 {
-    /* FIXME: dynamic! */
+    /* state->mark will be set to 0 in SRE_OP_MARK dynamically. */
     /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
 
     state->lastmark = -1;
diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h
index 44948e21ad95..437ab43f434a 100644
--- a/Modules/sre_lib.h
+++ b/Modules/sre_lib.h
@@ -1363,6 +1363,10 @@ SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int toplevel)
     return ret; /* should never get here */
 }
 
+/* need to reset capturing groups between two SRE(match) callings in loops */
+#define RESET_CAPTURE_GROUP() \
+    do { state->lastmark = state->lastindex = -1; } while (0)
+
 LOCAL(Py_ssize_t)
 SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
 {
@@ -1440,6 +1444,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
             if (status != 0)
                 return status;
             ++ptr;
+            RESET_CAPTURE_GROUP();
         }
         return 0;
     }
@@ -1487,6 +1492,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
                     /* close but no cigar -- try again */
                     if (++ptr >= end)
                         return 0;
+                    RESET_CAPTURE_GROUP();
                 }
                 i = overlap[i];
             } while (i != 0);
@@ -1510,6 +1516,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
             if (status != 0)
                 break;
             ptr++;
+            RESET_CAPTURE_GROUP();
         }
     } else {
         /* general case */
@@ -1520,6 +1527,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
         state->must_advance = 0;
         while (status == 0 && ptr < end) {
             ptr++;
+            RESET_CAPTURE_GROUP();
             TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
             state->start = state->ptr = ptr;
             status = SRE(match)(state, pattern, 0);



More information about the Python-checkins mailing list