[pypy-commit] pypy unicode-utf8-py3: revert interp_sre.py to py3.5

Mon Jul 9 09:01:04 EDT 2018

Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r94838:4ef833b2310d
Date: 2018-07-09 05:35 -0700
http://bitbucket.org/pypy/pypy/changeset/4ef833b2310d/

Log:	revert interp_sre.py to py3.5

diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -6,14 +6,14 @@
 from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
 from pypy.interpreter.error import OperationError, oefmt
 from rpython.rlib.rarithmetic import intmask
-from rpython.rlib import jit, rutf8
-from rpython.rlib.rstring import StringBuilder
+from rpython.rlib import jit
+from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
 
 # ____________________________________________________________
 #
 # Constants and exposed functions
 
-from rpython.rlib.rsre import rsre_core, rsre_char, rsre_utf8
+from rpython.rlib.rsre import rsre_core, rsre_char
 from rpython.rlib.rsre.rsre_char import CODESIZE, MAXREPEAT, MAXGROUPS, getlower, set_unicode_db
 
 
@@ -34,21 +34,14 @@
 
 
 def slice_w(space, ctx, start, end, w_default):
-    # 'start' and 'end' are byte positions
-    if ctx.ZERO <= start <= end:
+    if 0 <= start <= end:
         if isinstance(ctx, rsre_core.BufMatchContext):
             return space.newbytes(ctx._buffer.getslice(start, end, 1,
                                                         end-start))
         if isinstance(ctx, rsre_core.StrMatchContext):
-            start = ctx._real_pos(start)
-            end = ctx._real_pos(end)
             return space.newbytes(ctx._string[start:end])
-        elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
-            s = ctx._utf8[start:end]
-            lgt = rutf8.get_utf8_length(s)
-            return space.newutf8(s, lgt)
         elif isinstance(ctx, rsre_core.UnicodeMatchContext):
-            return space.newtext(ctx._unicodestr[start:end])
+            return space.newunicode(ctx._unicodestr[start:end])
         else:
             # unreachable
             raise SystemError
@@ -60,7 +53,6 @@
     # Returns a list of RPython-level integers.
     # Unlike the app-level groups() method, groups are numbered from 0
     # and the returned list does not start with the whole match range.
-    # The integers are byte positions, not character indexes (for utf8).
     if num_groups == 0:
         return None
     result = [-1] * (2 * num_groups)
@@ -113,7 +105,7 @@
 
     def repr_w(self):
         space = self.space
-        u = space.utf8_w(space.repr(self.w_pattern)).decode()
+        u = space.realunicode_w(space.repr(self.w_pattern))
         if len(u) > 200:
             u = u[:200]
         flag_items = []
@@ -136,7 +128,7 @@
         else:
             usep = u', '
             uflags = u'|'.join([item.decode('latin-1') for item in flag_items])
-        return space.newtext(u're.compile(%s%s%s)' % (u, usep, uflags))
+        return space.newunicode(u're.compile(%s%s%s)' % (u, usep, uflags))
 
     def fget_groupindex(self, space):
         w_groupindex = self.w_groupindex
@@ -166,7 +158,7 @@
         buf = None
         space = self.space
         if space.isinstance_w(w_string, space.w_unicode):
-            unicodestr = space.utf8_w(w_string).decode('utf8')
+            unicodestr = space.realunicode_w(w_string)
             length = len(unicodestr)
         elif space.isinstance_w(w_string, space.w_bytes):
             string = space.bytes_w(w_string)
@@ -178,7 +170,7 @@
         return (length, unicodestr, string, buf)
 
     def make_ctx(self, w_string, pos=0, endpos=sys.maxint, flags=0):
-        """Make a StrMatchContext, BufMatchContext or a Utf8MatchContext for
+        """Make a StrMatchContext, BufMatchContext or a UnicodeMatchContext for
         searching in the given w_string object."""
         space = self.space
         length, unicodestr, string, buf = self.getstring(w_string)
@@ -211,27 +203,6 @@
                 return rsre_core.BufMatchContext(buf,
                                                  pos, endpos, flags)
 
-    def fresh_copy(self, ctx):
-        if isinstance(ctx, rsre_utf8.Utf8MatchContext):
-            result = rsre_utf8.Utf8MatchContext(
-                ctx._utf8, ctx.match_start, ctx.end, ctx.flags)
-            result.w_unicode_obj = ctx.w_unicode_obj
-        elif isinstance(ctx, rsre_core.StrMatchContext):
-            result = self._make_str_match_context(
-                ctx._string, ctx.match_start, ctx.end)
-        elif isinstance(ctx, rsre_core.BufMatchContext):
-            result = rsre_core.BufMatchContext(
-                ctx._buffer, ctx.match_start, ctx.end, ctx.flags)
-        else:
-            raise AssertionError("bad ctx type")
-        result.match_end = ctx.match_end
-        return result
-
-    def _make_str_match_context(self, str, pos, endpos):
-        # for tests to override
-        return rsre_core.StrMatchContext(str,
-                                         pos, endpos, self.flags)
-
     def getmatch(self, ctx, found):
         if found:
             return W_SRE_Match(self, ctx)
@@ -259,7 +230,7 @@
         space = self.space
         matchlist_w = []
         ctx = self.make_ctx(w_string, pos, endpos)
-        while True:
+        while ctx.match_start <= ctx.end:
             if not searchcontext(space, ctx, self.code):
                 break
             num_groups = self.num_groups
@@ -276,12 +247,8 @@
                     w_item = allgroups_w(space, ctx, fmarks, num_groups,
                                          w_emptystr)
             matchlist_w.append(w_item)
-            reset_at = ctx.match_end
-            if ctx.match_start == ctx.match_end:
-                if reset_at == ctx.end:
-                    break
-                reset_at = ctx.next_indirect(reset_at)
-            ctx.reset(reset_at)
+            no_progress = (ctx.match_start == ctx.match_end)
+            ctx.reset(ctx.match_end + no_progress)
         return space.newlist(matchlist_w)
 
     @unwrap_spec(pos=int, endpos=int)
@@ -306,15 +273,15 @@
         #
         splitlist = []
         n = 0
+        last = 0
         ctx = self.make_ctx(w_string)
-        last = ctx.ZERO
         while not maxsplit or n < maxsplit:
             if not searchcontext(space, ctx, self.code):
                 break
             if ctx.match_start == ctx.match_end:     # zero-width match
                 if ctx.match_start == ctx.end:       # or end of string
                     break
-                ctx.reset(ctx.next_indirect(ctx.match_end))
+                ctx.reset(ctx.match_end + 1)
                 continue
             splitlist.append(slice_w(space, ctx, last, ctx.match_start,
                                      space.w_None))
@@ -343,31 +310,27 @@
 
     def subx(self, w_ptemplate, w_string, count):
         space = self.space
-        # use a (much faster) string builder (possibly utf8) if w_ptemplate and
+        # use a (much faster) string/unicode builder if w_ptemplate and
         # w_string are both string or both unicode objects, and if w_ptemplate
         # is a literal
-        use_builder = '\x00'   # or 'S'tring or 'U'nicode/UTF8
-        filter_as_string = None
+        use_builder = False
+        filter_as_unicode = filter_as_string = None
         if space.is_true(space.callable(w_ptemplate)):
             w_filter = w_ptemplate
             filter_is_callable = True
         else:
-            if space.isinstance_w(w_ptemplate, space.w_unicode):
-                filter_as_string = space.utf8_w(w_ptemplate)
+            length, filter_as_unicode, filter_as_string, buf = (
+                self.getstring(w_ptemplate))
+            if filter_as_unicode is not None:
+                literal = u'\\' not in filter_as_unicode
+                use_builder = (
+                    space.isinstance_w(w_string, space.w_unicode) and literal)
+            else:
+                if buf is not None:
+                    filter_as_string = buf.as_str()
                 literal = '\\' not in filter_as_string
-                if space.isinstance_w(w_string, space.w_unicode) and literal:
-                    use_builder = 'U'
-            else:
-                try:
-                    filter_as_string = space.bytes_w(w_ptemplate)
-                except OperationError as e:
-                    if e.async(space):
-                        raise
-                    literal = False
-                else:
-                    literal = '\\' not in filter_as_string
-                    if space.isinstance_w(w_string, space.w_bytes) and literal:
-                        use_builder = 'S'
+                use_builder = (
+                    space.isinstance_w(w_string, space.w_bytes) and literal)
             if literal:
                 w_filter = w_ptemplate
                 filter_is_callable = False
@@ -384,16 +347,18 @@
         #
         # XXX this is a bit of a mess, but it improves performance a lot
         ctx = self.make_ctx(w_string)
-        sublist_w = strbuilder = None
-        if use_builder != '\x00':
-            assert filter_as_string is not None
-            strbuilder = StringBuilder(ctx.end)
+        sublist_w = strbuilder = unicodebuilder = None
+        if use_builder:
+            if filter_as_unicode is not None:
+                unicodebuilder = UnicodeBuilder(ctx.end)
+            else:
+                assert filter_as_string is not None
+                strbuilder = StringBuilder(ctx.end)
         else:
             sublist_w = []
-        n = 0
-        last_pos = ctx.ZERO
+        n = last_pos = 0
+        pattern = self.code
         while not count or n < count:
-            pattern = self.code
             sub_jitdriver.jit_merge_point(
                 self=self,
                 use_builder=use_builder,
@@ -402,7 +367,9 @@
                 ctx=ctx, pattern=pattern,
                 w_filter=w_filter,
                 strbuilder=strbuilder,
+                unicodebuilder=unicodebuilder,
                 filter_as_string=filter_as_string,
+                filter_as_unicode=filter_as_unicode,
                 count=count,
                 w_string=w_string,
                 n=n, last_pos=last_pos, sublist_w=sublist_w
@@ -413,7 +380,10 @@
             if last_pos < ctx.match_start:
                 _sub_append_slice(
                     ctx, space, use_builder, sublist_w,
-                    strbuilder, last_pos, ctx.match_start)
+                    strbuilder, unicodebuilder, last_pos, ctx.match_start)
+            start = ctx.match_end
+            if start == ctx.match_start:
+                start += 1
             if not (last_pos == ctx.match_start
                              == ctx.match_end and n > 0):
                 # the above ignores empty matches on latest position
@@ -421,48 +391,40 @@
                 if filter_is_callable:
                     w_match = self.getmatch(ctx, True)
                     # make a copy of 'ctx'; see test_sub_matches_stay_valid
-                    ctx = self.fresh_copy(ctx)
+                    ctx = ctx.fresh_copy(start) # match_start/match_end dropped
                     w_piece = space.call_function(w_filter, w_match)
                     if not space.is_w(w_piece, space.w_None):
-                        assert strbuilder is None
-                        assert use_builder == '\x00'
+                        assert strbuilder is None and unicodebuilder is None
+                        assert not use_builder
                         sublist_w.append(w_piece)
                 else:
-                    if use_builder != '\x00':
-                        assert filter_as_string is not None
-                        assert strbuilder is not None
-                        strbuilder.append(filter_as_string)
+                    if use_builder:
+                        if strbuilder is not None:
+                            assert filter_as_string is not None
+                            strbuilder.append(filter_as_string)
+                        else:
+                            assert unicodebuilder is not None
+                            assert filter_as_unicode is not None
+                            unicodebuilder.append(filter_as_unicode)
                     else:
                         sublist_w.append(w_filter)
                 n += 1
             elif last_pos >= ctx.end:
                 break    # empty match at the end: finished
-
-            start = ctx.match_end
-            if start == ctx.match_start:
-                if start == ctx.end:
-                    break
-                start = ctx.next_indirect(start)
             ctx.reset(start)
 
         if last_pos < ctx.end:
             _sub_append_slice(ctx, space, use_builder, sublist_w,
-                              strbuilder, last_pos, ctx.end)
-        if use_builder != '\x00':
-            assert strbuilder is not None
-            result_bytes = strbuilder.build()
-            if use_builder == 'S':
-                assert not isinstance(ctx, rsre_utf8.Utf8MatchContext)
-                return space.newbytes(result_bytes), n
-            elif use_builder == 'U':
-                assert isinstance(ctx, rsre_utf8.Utf8MatchContext)
-                return space.newutf8(result_bytes,
-                                     rutf8.get_utf8_length(result_bytes)), n
+                              strbuilder, unicodebuilder, last_pos, ctx.end)
+        if use_builder:
+            if strbuilder is not None:
+                return space.newbytes(strbuilder.build()), n
             else:
-                raise AssertionError(use_builder)
+                assert unicodebuilder is not None
+                return space.newunicode(unicodebuilder.build()), n
         else:
             if space.isinstance_w(w_string, space.w_unicode):
-                w_emptystr = space.newutf8('', 0)
+                w_emptystr = space.newunicode(u'')
             else:
                 w_emptystr = space.newbytes('')
             w_item = space.call_method(w_emptystr, 'join',
@@ -472,28 +434,26 @@
 sub_jitdriver = jit.JitDriver(
     reds="""count n last_pos
             ctx w_filter
-            strbuilder
+            strbuilder unicodebuilder
             filter_as_string
+            filter_as_unicode
             w_string sublist_w
             self""".split(),
     greens=["filter_is_callable", "use_builder", "filter_type", "pattern"])
 
 
 def _sub_append_slice(ctx, space, use_builder, sublist_w,
-                      strbuilder, start, end):
-    if use_builder != '\x00':
-        assert strbuilder is not None
+                      strbuilder, unicodebuilder, start, end):
+    if use_builder:
         if isinstance(ctx, rsre_core.BufMatchContext):
-            assert use_builder == 'S'
+            assert strbuilder is not None
             return strbuilder.append(ctx._buffer.getslice(start, end, 1, end-start))
         if isinstance(ctx, rsre_core.StrMatchContext):
-            assert use_builder == 'S'
-            start = ctx._real_pos(start)
-            end = ctx._real_pos(end)
+            assert strbuilder is not None
             return strbuilder.append_slice(ctx._string, start, end)
-        elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
-            assert use_builder == 'U'
-            return strbuilder.append_slice(ctx._utf8, start, end)
+        elif isinstance(ctx, rsre_core.UnicodeMatchContext):
+            assert unicodebuilder is not None
+            return unicodebuilder.append_slice(ctx._unicodestr, start, end)
         assert 0, "unreachable"
     else:
         sublist_w.append(slice_w(space, ctx, start, end, space.w_None))
@@ -568,10 +528,10 @@
         ctx = self.ctx
         start, end = ctx.match_start, ctx.match_end
         w_s = slice_w(space, ctx, start, end, space.w_None)
-        u = space.utf8_w(space.repr(w_s)).decode()
+        u = space.realunicode_w(space.repr(w_s))
         if len(u) > 50:
             u = u[:50]
-        return space.newtext(u'<_sre.SRE_Match object; span=(%d, %d), match=%s>' %
+        return space.newunicode(u'<_sre.SRE_Match object; span=(%d, %d), match=%s>' %
                           (start, end, u))
 
     def cannot_copy_w(self):
@@ -629,38 +589,19 @@
     @unwrap_spec(w_groupnum=WrappedDefault(0))
     def start_w(self, w_groupnum):
         start, end = self.do_span(w_groupnum)
-        start = self.bytepos_to_charindex(start)
         return self.space.newint(start)
 
     @unwrap_spec(w_groupnum=WrappedDefault(0))
     def end_w(self, w_groupnum):
         start, end = self.do_span(w_groupnum)
-        end = self.bytepos_to_charindex(end)
         return self.space.newint(end)
 
     @unwrap_spec(w_groupnum=WrappedDefault(0))
     def span_w(self, w_groupnum):
         start, end = self.do_span(w_groupnum)
-        return self.new_charindex_tuple(start, end)
-
-    def new_charindex_tuple(self, start, end):
-        start = self.bytepos_to_charindex(start)
-        end = self.bytepos_to_charindex(end)
         return self.space.newtuple([self.space.newint(start),
                                     self.space.newint(end)])
 
-    def bytepos_to_charindex(self, bytepos):
-        # Transform a 'byte position', as returned by all methods from
-        # rsre_core, back into a 'character index'.  This is for UTF8
-        # handling.
-        ctx = self.ctx
-        if isinstance(ctx, rsre_utf8.Utf8MatchContext):
-            index_storage = ctx.w_unicode_obj._get_index_storage()
-            return rutf8.codepoint_index_at_byte_position(
-                ctx.w_unicode_obj._utf8, index_storage, bytepos)
-        else:
-            return bytepos
-
     def flatten_marks(self):
         if self.flatten_cache is None:
             num_groups = self.srepat.num_groups
@@ -668,8 +609,6 @@
         return self.flatten_cache
 
     def do_span(self, w_arg):
-        # return a pair of integers, which are byte positions, not
-        # character indexes (for utf8)
         space = self.space
         try:
             groupnum = space.int_w(w_arg)
@@ -717,10 +656,10 @@
         return space.w_None
 
     def fget_pos(self, space):
-        return space.newint(self.bytepos_to_charindex(self.ctx.original_pos))
+        return space.newint(self.ctx.original_pos)
 
     def fget_endpos(self, space):
-        return space.newint(self.bytepos_to_charindex(self.ctx.end))
+        return space.newint(self.ctx.end)
 
     def fget_regs(self, space):
         space = self.space
@@ -728,11 +667,11 @@
         num_groups = self.srepat.num_groups
         result_w = [None] * (num_groups + 1)
         ctx = self.ctx
-        result_w[0] = self.new_charindex_tuple(ctx.match_start,
-                                               ctx.match_end)
+        result_w[0] = space.newtuple([space.newint(ctx.match_start),
+                                      space.newint(ctx.match_end)])
         for i in range(num_groups):
-            result_w[i + 1] = self.new_charindex_tuple(fmarks[i*2],
-                                                       fmarks[i*2+1])
+            result_w[i + 1] = space.newtuple([space.newint(fmarks[i*2]),
+                                              space.newint(fmarks[i*2+1])])
         return space.newtuple(result_w)
 
     def fget_string(self, space):
@@ -741,11 +680,8 @@
             return space.newbytes(ctx._buffer.as_str())
         elif isinstance(ctx, rsre_core.StrMatchContext):
             return space.newbytes(ctx._string)
-        elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
-            lgt = rutf8.get_utf8_length(ctx._utf8)
-            return space.newutf8(ctx._utf8, lgt)
         elif isinstance(ctx, rsre_core.UnicodeMatchContext):
-            return space.newtext(ctx._unicodestr)
+            return space.newunicode(ctx._unicodestr)
         else:
             raise SystemError
 
@@ -786,53 +722,38 @@
         self.ctx = ctx
         self.code = code
         # 'self.ctx' is always a fresh context in which no searching
-        # or matching succeeded so far.  It is None when the iterator is
-        # exhausted.
+        # or matching succeeded so far.
 
     def iter_w(self):
         return self
 
     def next_w(self):
-        if self.ctx is None:
+        if self.ctx.match_start > self.ctx.end:
             raise OperationError(self.space.w_StopIteration, self.space.w_None)
         if not searchcontext(self.space, self.ctx, self.code):
             raise OperationError(self.space.w_StopIteration, self.space.w_None)
         return self.getmatch(True)
 
     def match_w(self):
-        if self.ctx is None:
+        if self.ctx.match_start > self.ctx.end:
             return self.space.w_None
         return self.getmatch(matchcontext(self.space, self.ctx, self.code))
 
     def search_w(self):
-        if self.ctx is None:
+        if self.ctx.match_start > self.ctx.end:
             return self.space.w_None
         return self.getmatch(searchcontext(self.space, self.ctx, self.code))
 
     def getmatch(self, found):
-        ctx = self.ctx
-        assert ctx is not None
         if found:
+            ctx = self.ctx
             nextstart = ctx.match_end
-            exhausted = False
-            if ctx.match_start == nextstart:
-                if nextstart == ctx.end:
-                    exhausted = True
-                else:
-                    nextstart = ctx.next_indirect(nextstart)
-            if exhausted:
-                self.ctx = None
-            else:
-                self.ctx = self.srepat.fresh_copy(ctx)
-                self.ctx.match_start = nextstart
+            nextstart += (ctx.match_start == nextstart)
+            self.ctx = ctx.fresh_copy(nextstart)
             match = W_SRE_Match(self.srepat, ctx)
             return match
         else:
-            # obscure corner case
-            if ctx.match_start == ctx.end:
-                self.ctx = None
-            else:
-                ctx.match_start = ctx.next_indirect(ctx.match_start)
+            self.ctx.match_start += 1     # obscure corner case
             return None
 
 W_SRE_Scanner.typedef = TypeDef(