[pypy-commit] pypy unicode-utf8-re: in-progress

Sun Dec 3 09:13:16 EST 2017

Author: Armin Rigo <arigo at tunes.org>
Branch: unicode-utf8-re
Changeset: r93241:87a98889b109
Date: 2017-12-03 15:12 +0100
http://bitbucket.org/pypy/pypy/changeset/87a98889b109/

Log:	in-progress

diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -324,7 +324,10 @@
             ctx.jitdriver_RepeatOne.jit_merge_point(
                 self=self, ptr=ptr, ctx=ctx, nextppos=nextppos)
             result = sre_match(ctx, nextppos, ptr, self.start_marks)
-            ptr = ctx.prev_or_minus1(ptr)
+            try:
+                ptr = ctx.prev(ptr)
+            except EndOfString:
+                ptr = -1
             if result is not None:
                 self.subresult = result
                 self.start_ptr = ptr
@@ -440,12 +443,12 @@
             min = ctx.pat(ppos+1)
             if enum is not None:
                 # matched one more 'item'.  record it and continue.
-                last_match_length = ctx.match_end - ptr
+                last_match_zero_length = (ctx.match_end == ptr)
                 self.pending = Pending(ptr, marks, enum, self.pending)
                 self.num_pending += 1
                 ptr = ctx.match_end
                 marks = ctx.match_marks
-                if last_match_length == 0 and self.num_pending >= min:
+                if last_match_zero_length and self.num_pending >= min:
                     # zero-width protection: after an empty match, if there
                     # are enough matches, don't try to match more.  Instead,
                     # fall through to trying to match 'tail'.
@@ -629,30 +632,30 @@
         elif op == OPCODE_GROUPREF:
             # match backreference
             # <GROUPREF> <groupnum>
-            startptr, length = get_group_ref(marks, ctx.pat(ppos))
-            if length < 0:
+            startptr, length_bytes = get_group_ref(ctx, marks, ctx.pat(ppos))
+            if length_bytes < 0:
                 return     # group was not previously defined
-            if not match_repeated(ctx, ptr, startptr, length):
+            if not match_repeated(ctx, ptr, startptr, length_bytes):
                 return     # no match
-            ptr += length
+            ptr = ctx.go_forward_by_bytes(ptr, length_bytes)
             ppos += 1
 
         elif op == OPCODE_GROUPREF_IGNORE:
             # match backreference
             # <GROUPREF> <groupnum>
-            startptr, length = get_group_ref(marks, ctx.pat(ppos))
-            if length < 0:
+            startptr, length_bytes = get_group_ref(ctx, marks, ctx.pat(ppos))
+            if length_bytes < 0:
                 return     # group was not previously defined
-            if not match_repeated_ignore(ctx, ptr, startptr, length):
+            ptr = match_repeated_ignore(ctx, ptr, startptr, length_bytes)
+            if ptr < ctx.ZERO:
                 return     # no match
-            ptr += length
             ppos += 1
 
         elif op == OPCODE_GROUPREF_EXISTS:
             # conditional match depending on the existence of a group
             # <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ...
-            _, length = get_group_ref(marks, ctx.pat(ppos))
-            if length >= 0:
+            _, length_bytes = get_group_ref(ctx, marks, ctx.pat(ppos))
+            if length_bytes >= 0:
                 ppos += 2                  # jump to 'codeyes'
             else:
                 ppos += ctx.pat(ppos+1)    # jump to 'codeno'
@@ -664,7 +667,7 @@
                                                              ctx.str(ptr)):
                 return
             ppos += ctx.pat(ppos)
-            ptr += 1
+            ptr = ctx.next(ptr)
 
         elif op == OPCODE_IN_IGNORE:
             # match set member (or non_member), ignoring case
@@ -673,7 +676,7 @@
                                                              ctx.lowstr(ptr)):
                 return
             ppos += ctx.pat(ppos)
-            ptr += 1
+            ptr = ctx.next(ptr)
 
         elif op == OPCODE_INFO:
             # optimization info block
@@ -699,7 +702,7 @@
             if ptr >= ctx.end or ctx.lowstr(ptr) != ctx.pat(ppos):
                 return
             ppos += 1
-            ptr += 1
+            ptr = ctx.next(ptr)
 
         elif op == OPCODE_MARK:
             # set mark
@@ -804,32 +807,36 @@
             raise Error("bad pattern code %d" % op)
 
 
-def get_group_ref(marks, groupnum):
+def get_group_ref(ctx, marks, groupnum):
     gid = groupnum * 2
     startptr = find_mark(marks, gid)
-    if startptr < 0:
+    if startptr < ctx.ZERO:
         return 0, -1
     endptr = find_mark(marks, gid + 1)
-    length = endptr - startptr     # < 0 if endptr < startptr (or if endptr=-1)
-    return startptr, length
+    length_bytes = ctx.bytes_difference(endptr, startptr)
+    # length_bytes is < 0 if endptr < startptr (or if endptr=-1)
+    return startptr, length_bytes
 
 @specializectx
-def match_repeated(ctx, ptr, oldptr, length):
-    if ptr + length > ctx.end:
+def match_repeated(ctx, ptr, oldptr, length_bytes):
+    if ctx.bytes_difference(ctx.end, ptr) < length_bytes:
         return False
-    for i in range(length):
-        if ctx.str(ptr + i) != ctx.str(oldptr + i):
+    for i in range(length_bytes):
+        if ctx.get_single_byte(ptr, i) != ctx.get_single_byte(oldptr, i):
             return False
     return True
 
 @specializectx
-def match_repeated_ignore(ctx, ptr, oldptr, length):
-    if ptr + length > ctx.end:
-        return False
-    for i in range(length):
-        if ctx.lowstr(ptr + i) != ctx.lowstr(oldptr + i):
-            return False
-    return True
+def match_repeated_ignore(ctx, ptr, oldptr, length_bytes):
+    oldend = ctx.go_forward_by_bytes(oldptr, length_bytes)
+    while oldptr < oldend:
+        if ptr >= ctx.end:
+            return -1
+        if ctx.lowstr(ptr) != ctx.lowstr(oldptr):
+            return -1
+        ptr = ctx.next(ptr)
+        oldptr = ctx.next(oldptr)
+    return ptr
 
 @specializectx
 def find_repetition_end(ctx, ppos, ptr, maxcount, marks):
@@ -934,7 +941,7 @@
                 ctx.jitdriver_MatchInIgnore.jit_merge_point(ctx=ctx, ptr=ptr,
                                                             end=end, ppos=ppos)
                 if ptr < end and checkerfn(ctx, ptr, ppos):
-                    ptr += 1
+                    ptr = ctx.next(ptr)
                 else:
                     return ptr
     else:
@@ -996,9 +1003,8 @@
         return at_non_boundary(ctx, ptr)
 
     elif atcode == AT_END:
-        remaining_chars = ctx.end - ptr
-        return remaining_chars <= 0 or (
-            remaining_chars == 1 and rsre_char.is_linebreak(ctx.str(ptr)))
+        return (ptr == ctx.end or
+            (ctx.next(ptr) == ctx.end and rsre_char.is_linebreak(ctx.str(ptr))))
 
     elif atcode == AT_END_LINE:
         return ptr == ctx.end or rsre_char.is_linebreak(ctx.str(ptr))
diff --git a/rpython/rlib/rsre/test/support.py b/rpython/rlib/rsre/test/support.py
--- a/rpython/rlib/rsre/test/support.py
+++ b/rpython/rlib/rsre/test/support.py
@@ -14,35 +14,27 @@
     def __repr__(self):
         return '<Position %d>' % (self._p)
     def __cmp__(self, other):
-        if not isinstance(other, (Position, MinusOnePosition)):
-            raise TypeError("cannot compare %r with %r" % (self, other))
-        return cmp(self._p, other._p)
-
-class MinusOnePosition(object):
-    _p = -1
-    def __repr__(self):
-        return '<MinusOnePosition>'
-    def __cmp__(self, other):
-        if not isinstance(other, (Position, MinusOnePosition)):
-            raise TypeError("cannot compare %r with %r" % (self, other))
-        return cmp(self._p, other._p)
+        if isinstance(other, Position):
+            return cmp(self._p, other._p)
+        if type(other) is int and other == -1:
+            return cmp(self._p, -1)
+        raise TypeError("cannot compare %r with %r" % (self, other))
 
 
 class MatchContextForTests(StrMatchContext):
     """Concrete subclass for matching in a plain string, tweaked for tests"""
 
     ZERO = Position(0)
-    MINUS1 = MinusOnePosition()
     EXACT_DISTANCE = False
 
     def next(self, position):
         assert isinstance(position, Position)
         return Position(position._p + 1)
 
-    def prev_or_minus1(self, position):
+    def prev(self, position):
         assert isinstance(position, Position)
         if position._p == 0:
-            return self.MINUS1
+            raise EndOfString
         return Position(position._p - 1)
 
     def next_n(self, position, n, end_position):
@@ -89,6 +81,21 @@
         assert isinstance(position_high, Position)
         return position_high._p - position_low._p + random.randrange(0, 10)
 
+    def bytes_difference(self, position1, position2):
+        assert isinstance(position1, Position)
+        assert isinstance(position2, Position)
+        return position1._p - position2._p
+
+    def get_single_byte(self, base_position, index):
+        assert isinstance(base_position, Position)
+        assert isinstance(index, int)
+        return ord(self._string[base_position._p + index])
+
+    def go_forward_by_bytes(self, base_position, index):
+        assert isinstance(base_position, Position)
+        assert isinstance(index, int)
+        return Position(base_position._p + index)
+
 
 def match(pattern, string, start=0, end=sys.maxint, flags=0, fullmatch=False):
     start, end = _adjust(start, end, len(string))