[Python-checkins] bpo-42885: Optimize search for regular expressions starting with "\A" or "^" (GH-32021)

serhiy-storchaka webhook-mailer at python.org
Tue Mar 22 11:28:06 EDT 2022


https://github.com/python/cpython/commit/492d4109f4d953c478cb48f17aa32adbb912623b
commit: 492d4109f4d953c478cb48f17aa32adbb912623b
branch: main
author: Serhiy Storchaka <storchaka at gmail.com>
committer: serhiy-storchaka <storchaka at gmail.com>
date: 2022-03-22T17:27:55+02:00
summary:

bpo-42885: Optimize search for regular expressions starting with "\A" or "^" (GH-32021)

Affected functions are re.search(), re.split(), re.findall(), re.finditer()
and re.sub().

files:
A Misc/NEWS.d/next/Library/2022-03-21-08-32-19.bpo-42885.LCnTTp.rst
M Lib/test/test_re.py
M Modules/sre_lib.h

diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index da827ca7c4e92..fd6db6a300d03 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -5,6 +5,7 @@
 import re
 import sre_compile
 import string
+import time
 import unittest
 import warnings
 from re import Scanner
@@ -2038,6 +2039,20 @@ def test_bug_40736(self):
         with self.assertRaisesRegex(TypeError, "got 'type'"):
             re.search("x*", type)
 
+    def test_search_anchor_at_beginning(self):
+        s = 'x'*10**7
+        start = time.perf_counter()
+        for p in r'\Ay', r'^y':
+            self.assertIsNone(re.search(p, s))
+            self.assertEqual(re.split(p, s), [s])
+            self.assertEqual(re.findall(p, s), [])
+            self.assertEqual(list(re.finditer(p, s)), [])
+            self.assertEqual(re.sub(p, '', s), s)
+        t = time.perf_counter() - start
+        # Without optimization it takes 1 second on my computer.
+        # With optimization -- 0.0003 seconds.
+        self.assertLess(t, 0.1)
+
     def test_possessive_quantifiers(self):
         """Test Possessive Quantifiers
         Test quantifiers of the form @+ for some repetition operator @,
diff --git a/Misc/NEWS.d/next/Library/2022-03-21-08-32-19.bpo-42885.LCnTTp.rst b/Misc/NEWS.d/next/Library/2022-03-21-08-32-19.bpo-42885.LCnTTp.rst
new file mode 100644
index 0000000000000..5f9c1a19de221
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-03-21-08-32-19.bpo-42885.LCnTTp.rst
@@ -0,0 +1,3 @@
+Optimize :func:`re.search`, :func:`re.split`, :func:`re.findall`,
+:func:`re.finditer` and :func:`re.sub` for regular expressions starting with
+``\A`` or ``^``.
diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h
index 956fd3fad9164..a82210ff94a0f 100644
--- a/Modules/sre_lib.h
+++ b/Modules/sre_lib.h
@@ -1693,6 +1693,13 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
         state->start = state->ptr = ptr;
         status = SRE(match)(state, pattern, 1);
         state->must_advance = 0;
+        if (status == 0 && pattern[0] == SRE_OP_AT &&
+            (pattern[1] == SRE_AT_BEGINNING ||
+             pattern[1] == SRE_AT_BEGINNING_STRING))
+        {
+            state->start = state->ptr = ptr = end;
+            return 0;
+        }
         while (status == 0 && ptr < end) {
             ptr++;
             RESET_CAPTURE_GROUP();



More information about the Python-checkins mailing list