[Python-checkins] cpython: Issue #24426: Fast searching optimization in regular expressions now works

serhiy.storchaka python-checkins at python.org
Sun Jun 21 13:07:18 CEST 2015


https://hg.python.org/cpython/rev/7e46a503dd16
changeset:   96622:7e46a503dd16
user:        Serhiy Storchaka <storchaka at gmail.com>
date:        Sun Jun 21 14:06:55 2015 +0300
summary:
  Issue #24426: Fast searching optimization in regular expressions now works
for patterns that starts with capturing groups.  Fast searching optimization
now can't be disabled at compile time.

files:
  Lib/sre_compile.py |  117 +++++++++++++++++++-------------
  Misc/NEWS          |    4 +
  Modules/_sre.c     |    3 -
  Modules/sre_lib.h  |   53 +++++++-------
  4 files changed, 99 insertions(+), 78 deletions(-)


diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -409,57 +409,39 @@
             table[i] = idx + 1
     return table
 
-def _compile_info(code, pattern, flags):
-    # internal: compile an info block.  in the current version,
-    # this contains min/max pattern width, and an optional literal
-    # prefix or a character map
-    lo, hi = pattern.getwidth()
-    if hi > MAXCODE:
-        hi = MAXCODE
-    if lo == 0:
-        code.extend([INFO, 4, 0, lo, hi])
-        return
-    # look for a literal prefix
+def _get_literal_prefix(pattern):
+    # look for literal prefix
     prefix = []
     prefixappend = prefix.append
-    prefix_skip = 0
+    prefix_skip = None
+    got_all = True
+    for op, av in pattern.data:
+        if op is LITERAL:
+            prefixappend(av)
+        elif op is SUBPATTERN:
+            prefix1, prefix_skip1, got_all = _get_literal_prefix(av[1])
+            if prefix_skip is None:
+                if av[0] is not None:
+                    prefix_skip = len(prefix)
+                elif prefix_skip1 is not None:
+                    prefix_skip = len(prefix) + prefix_skip1
+            prefix.extend(prefix1)
+            if not got_all:
+                break
+        else:
+            got_all = False
+            break
+    return prefix, prefix_skip, got_all
+
+def _get_charset_prefix(pattern):
     charset = [] # not used
     charsetappend = charset.append
-    if not (flags & SRE_FLAG_IGNORECASE):
-        # look for literal prefix
-        for op, av in pattern.data:
+    if pattern.data:
+        op, av = pattern.data[0]
+        if op is SUBPATTERN and av[1]:
+            op, av = av[1][0]
             if op is LITERAL:
-                if len(prefix) == prefix_skip:
-                    prefix_skip = prefix_skip + 1
-                prefixappend(av)
-            elif op is SUBPATTERN and len(av[1]) == 1:
-                op, av = av[1][0]
-                if op is LITERAL:
-                    prefixappend(av)
-                else:
-                    break
-            else:
-                break
-        # if no prefix, look for charset prefix
-        if not prefix and pattern.data:
-            op, av = pattern.data[0]
-            if op is SUBPATTERN and av[1]:
-                op, av = av[1][0]
-                if op is LITERAL:
-                    charsetappend((op, av))
-                elif op is BRANCH:
-                    c = []
-                    cappend = c.append
-                    for p in av[1]:
-                        if not p:
-                            break
-                        op, av = p[0]
-                        if op is LITERAL:
-                            cappend((op, av))
-                        else:
-                            break
-                    else:
-                        charset = c
+                charsetappend((op, av))
             elif op is BRANCH:
                 c = []
                 cappend = c.append
@@ -473,8 +455,43 @@
                         break
                 else:
                     charset = c
-            elif op is IN:
-                charset = av
+        elif op is BRANCH:
+            c = []
+            cappend = c.append
+            for p in av[1]:
+                if not p:
+                    break
+                op, av = p[0]
+                if op is LITERAL:
+                    cappend((op, av))
+                else:
+                    break
+            else:
+                charset = c
+        elif op is IN:
+            charset = av
+    return charset
+
+def _compile_info(code, pattern, flags):
+    # internal: compile an info block.  in the current version,
+    # this contains min/max pattern width, and an optional literal
+    # prefix or a character map
+    lo, hi = pattern.getwidth()
+    if hi > MAXCODE:
+        hi = MAXCODE
+    if lo == 0:
+        code.extend([INFO, 4, 0, lo, hi])
+        return
+    # look for a literal prefix
+    prefix = []
+    prefix_skip = 0
+    charset = [] # not used
+    if not (flags & SRE_FLAG_IGNORECASE):
+        # look for literal prefix
+        prefix, prefix_skip, got_all = _get_literal_prefix(pattern)
+        # if no prefix, look for charset prefix
+        if not prefix:
+            charset = _get_charset_prefix(pattern)
 ##     if prefix:
 ##         print("*** PREFIX", prefix, prefix_skip)
 ##     if charset:
@@ -487,7 +504,7 @@
     mask = 0
     if prefix:
         mask = SRE_INFO_PREFIX
-        if len(prefix) == prefix_skip == len(pattern.data):
+        if prefix_skip is None and got_all:
             mask = mask | SRE_INFO_LITERAL
     elif charset:
         mask = mask | SRE_INFO_CHARSET
@@ -502,6 +519,8 @@
     # add literal prefix
     if prefix:
         emit(len(prefix)) # length
+        if prefix_skip is None:
+            prefix_skip =  len(prefix)
         emit(prefix_skip) # skip
         code.extend(prefix)
         # generate overlap table
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -13,6 +13,10 @@
 Library
 -------
 
+- Issue #24426: Fast searching optimization in regular expressions now works
+  for patterns that starts with capturing groups.  Fast searching optimization
+  now can't be disabled at compile time.
+
 Documentation
 -------------
 
diff --git a/Modules/_sre.c b/Modules/_sre.c
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -62,9 +62,6 @@
 /* -------------------------------------------------------------------- */
 /* optional features */
 
-/* enables fast searching */
-#define USE_FAST_SEARCH
-
 /* enables copy/deepcopy handling (work in progress) */
 #undef USE_BUILTIN_COPY
 
diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h
--- a/Modules/sre_lib.h
+++ b/Modules/sre_lib.h
@@ -1248,7 +1248,32 @@
            prefix, prefix_len, prefix_skip));
     TRACE(("charset = %p\n", charset));
 
-#if defined(USE_FAST_SEARCH)
+    if (prefix_len == 1) {
+        /* pattern starts with a literal character */
+        SRE_CHAR c = (SRE_CHAR) prefix[0];
+#if SIZEOF_SRE_CHAR < 4
+        if ((SRE_CODE) c != prefix[0])
+            return 0; /* literal can't match: doesn't fit in char width */
+#endif
+        end = (SRE_CHAR *)state->end;
+        while (ptr < end) {
+            while (*ptr != c) {
+                if (++ptr >= end)
+                    return 0;
+            }
+            TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
+            state->start = ptr;
+            state->ptr = ptr + prefix_skip;
+            if (flags & SRE_INFO_LITERAL)
+                return 1; /* we got all of it */
+            status = SRE(match)(state, pattern + 2*prefix_skip, 0);
+            if (status != 0)
+                return status;
+            ++ptr;
+        }
+        return 0;
+    }
+
     if (prefix_len > 1) {
         /* pattern starts with a known prefix.  use the overlap
            table to skip forward as fast as we possibly can */
@@ -1297,32 +1322,8 @@
         }
         return 0;
     }
-#endif
 
-    if (pattern[0] == SRE_OP_LITERAL) {
-        /* pattern starts with a literal character.  this is used
-           for short prefixes, and if fast search is disabled */
-        SRE_CHAR c = (SRE_CHAR) pattern[1];
-#if SIZEOF_SRE_CHAR < 4
-        if ((SRE_CODE) c != pattern[1])
-            return 0; /* literal can't match: doesn't fit in char width */
-#endif
-        end = (SRE_CHAR *)state->end;
-        while (ptr < end) {
-            while (*ptr != c) {
-                if (++ptr >= end)
-                    return 0;
-            }
-            TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
-            state->start = ptr;
-            state->ptr = ++ptr;
-            if (flags & SRE_INFO_LITERAL)
-                return 1; /* we got all of it */
-            status = SRE(match)(state, pattern + 2, 0);
-            if (status != 0)
-                break;
-        }
-    } else if (charset) {
+    if (charset) {
         /* pattern starts with a character from a known set */
         end = (SRE_CHAR *)state->end;
         for (;;) {

-- 
Repository URL: https://hg.python.org/cpython


More information about the Python-checkins mailing list