[pypy-svn] r15747 - pypy/dist/pypy/module/_sre

nik at codespeak.net nik at codespeak.net
Sun Aug 7 17:53:12 CEST 2005


Author: nik
Date: Sun Aug  7 17:53:11 2005
New Revision: 15747

Modified:
   pypy/dist/pypy/module/_sre/app_sre.py
Log:
several optimizations


Modified: pypy/dist/pypy/module/_sre/app_sre.py
==============================================================================
--- pypy/dist/pypy/module/_sre/app_sre.py	(original)
+++ pypy/dist/pypy/module/_sre/app_sre.py	Sun Aug  7 17:53:11 2005
@@ -357,7 +357,15 @@
         self.repeat = None
 
     def match(self, pattern_codes):
-        # XXX INFO optimization missing here
+        # Optimization: Check string length. pattern_codes[3] contains the
+        # minimum length for a string to possibly match.
+        from sre_constants import OPCODES
+        if pattern_codes[0] == OPCODES["info"] and pattern_codes[3]:
+            if self.end - self.string_position < pattern_codes[3]:
+                #_log("reject (got %d chars, need %d)"
+                #         % (self.end - self.string_position, pattern_codes[3]))
+                return False
+        
         dispatcher = _OpcodeDispatcher()
         self.context_stack.append(_MatchContext(self, pattern_codes))
         has_matched = None
@@ -369,10 +377,13 @@
         return has_matched
 
     def search(self, pattern_codes):
-        from sre_constants import OPCODES
+        from sre_constants import OPCODES, SRE_INFO_PREFIX
         if pattern_codes[0] == OPCODES["info"]:
-            pattern_codes = pattern_codes[pattern_codes[1] + 1:]   
-        # XXX USE_FAST_SEARCH optimizations missing here        
+            # optimization info block
+            # <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info>
+            if pattern_codes[2] & SRE_INFO_PREFIX and pattern_codes[5] > 1:
+                return self.fast_search(pattern_codes)
+            pattern_codes = pattern_codes[pattern_codes[1] + 1:]
         # XXX literal and charset optimizations missing here
         string_position = self.start
         while string_position <= self.end:
@@ -383,6 +394,43 @@
             string_position += 1
         return False
 
+    def fast_search(self, pattern_codes):
+        """Skips forward in a string as fast as possible using information from
+        an optimization info block."""
+        from sre_constants import SRE_INFO_LITERAL
+        # pattern starts with a known prefix
+        # <5=length> <6=skip> <7=prefix data> <overlap data>
+        flags = pattern_codes[2]
+        prefix_len = pattern_codes[5]
+        prefix_skip = pattern_codes[6] # don't really know what this is good for
+        prefix = pattern_codes[7:7 + prefix_len]
+        overlap = pattern_codes[7 + prefix_len - 1:pattern_codes[1] + 1]
+        pattern_codes = pattern_codes[pattern_codes[1] + 1:]
+        i = 0
+        string_position = self.string_position
+        while string_position < self.end:
+            while True:
+                if ord(self.string[string_position]) != prefix[i]:
+                    if i == 0:
+                        break
+                    else:
+                        i = overlap[i]
+                else:
+                    i += 1
+                    if i == prefix_len:
+                        # found a potential match
+                        self.start = string_position + 1 - prefix_len
+                        self.string_position = string_position + 1 \
+                                                     - prefix_len + prefix_skip
+                        if flags & SRE_INFO_LITERAL:
+                            return True # matched all of pure literal pattern
+                        if self.match(pattern_codes[2 * prefix_skip:]):
+                            return True
+                        i = overlap[i]
+                    break
+            string_position += 1
+        return False
+
     def set_mark(self, mark_nr, position):
         if mark_nr & 1:
             # This id marks the end of a group.
@@ -683,18 +731,22 @@
         # alternation
         # <BRANCH> <0=skip> code <JUMP> ... <NULL>
         #self._log(ctx, "BRANCH")
+        from sre_constants import OPCODES
         ctx.state.marks_push()
         ctx.skip_code(1)
         current_branch_length = ctx.peek_code(0)
         while current_branch_length:
-            # XXX OP_LITERAL and OP_IN optimizations here
-            ctx.state.string_position = ctx.string_position
-            child_context = ctx.push_new_context(1)
-            yield False
-            if child_context.has_matched:
-                ctx.has_matched = True
-                yield True
-            ctx.state.marks_pop_keep()
+            # The following tries to shortcut branches starting with a
+            # (unmatched) literal. _sre.c also shortcuts charsets here.
+            if not (ctx.peek_code(1) == OPCODES["literal"] and \
+                    (ctx.at_end() or ctx.peek_code(2) != ord(ctx.peek_char()))):
+                ctx.state.string_position = ctx.string_position
+                child_context = ctx.push_new_context(1)
+                yield False
+                if child_context.has_matched:
+                    ctx.has_matched = True
+                    yield True
+                ctx.state.marks_pop_keep()
             ctx.skip_code(current_branch_length)
             current_branch_length = ctx.peek_code(0)
         ctx.state.marks_pop_discard()



More information about the Pypy-commit mailing list