[pypy-commit] pypy unicode-utf8-re: in-progress
arigo
pypy.commits at gmail.com
Sun Dec 3 09:13:16 EST 2017
Author: Armin Rigo <arigo at tunes.org>
Branch: unicode-utf8-re
Changeset: r93241:87a98889b109
Date: 2017-12-03 15:12 +0100
http://bitbucket.org/pypy/pypy/changeset/87a98889b109/
Log: in-progress
diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -324,7 +324,10 @@
ctx.jitdriver_RepeatOne.jit_merge_point(
self=self, ptr=ptr, ctx=ctx, nextppos=nextppos)
result = sre_match(ctx, nextppos, ptr, self.start_marks)
- ptr = ctx.prev_or_minus1(ptr)
+ try:
+ ptr = ctx.prev(ptr)
+ except EndOfString:
+ ptr = -1
if result is not None:
self.subresult = result
self.start_ptr = ptr
@@ -440,12 +443,12 @@
min = ctx.pat(ppos+1)
if enum is not None:
# matched one more 'item'. record it and continue.
- last_match_length = ctx.match_end - ptr
+ last_match_zero_length = (ctx.match_end == ptr)
self.pending = Pending(ptr, marks, enum, self.pending)
self.num_pending += 1
ptr = ctx.match_end
marks = ctx.match_marks
- if last_match_length == 0 and self.num_pending >= min:
+ if last_match_zero_length and self.num_pending >= min:
# zero-width protection: after an empty match, if there
# are enough matches, don't try to match more. Instead,
# fall through to trying to match 'tail'.
@@ -629,30 +632,30 @@
elif op == OPCODE_GROUPREF:
# match backreference
# <GROUPREF> <groupnum>
- startptr, length = get_group_ref(marks, ctx.pat(ppos))
- if length < 0:
+ startptr, length_bytes = get_group_ref(ctx, marks, ctx.pat(ppos))
+ if length_bytes < 0:
return # group was not previously defined
- if not match_repeated(ctx, ptr, startptr, length):
+ if not match_repeated(ctx, ptr, startptr, length_bytes):
return # no match
- ptr += length
+ ptr = ctx.go_forward_by_bytes(ptr, length_bytes)
ppos += 1
elif op == OPCODE_GROUPREF_IGNORE:
# match backreference
# <GROUPREF> <groupnum>
- startptr, length = get_group_ref(marks, ctx.pat(ppos))
- if length < 0:
+ startptr, length_bytes = get_group_ref(ctx, marks, ctx.pat(ppos))
+ if length_bytes < 0:
return # group was not previously defined
- if not match_repeated_ignore(ctx, ptr, startptr, length):
+ ptr = match_repeated_ignore(ctx, ptr, startptr, length_bytes)
+ if ptr < ctx.ZERO:
return # no match
- ptr += length
ppos += 1
elif op == OPCODE_GROUPREF_EXISTS:
# conditional match depending on the existence of a group
# <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ...
- _, length = get_group_ref(marks, ctx.pat(ppos))
- if length >= 0:
+ _, length_bytes = get_group_ref(ctx, marks, ctx.pat(ppos))
+ if length_bytes >= 0:
ppos += 2 # jump to 'codeyes'
else:
ppos += ctx.pat(ppos+1) # jump to 'codeno'
@@ -664,7 +667,7 @@
ctx.str(ptr)):
return
ppos += ctx.pat(ppos)
- ptr += 1
+ ptr = ctx.next(ptr)
elif op == OPCODE_IN_IGNORE:
# match set member (or non_member), ignoring case
@@ -673,7 +676,7 @@
ctx.lowstr(ptr)):
return
ppos += ctx.pat(ppos)
- ptr += 1
+ ptr = ctx.next(ptr)
elif op == OPCODE_INFO:
# optimization info block
@@ -699,7 +702,7 @@
if ptr >= ctx.end or ctx.lowstr(ptr) != ctx.pat(ppos):
return
ppos += 1
- ptr += 1
+ ptr = ctx.next(ptr)
elif op == OPCODE_MARK:
# set mark
@@ -804,32 +807,36 @@
raise Error("bad pattern code %d" % op)
-def get_group_ref(marks, groupnum):
+def get_group_ref(ctx, marks, groupnum):
gid = groupnum * 2
startptr = find_mark(marks, gid)
- if startptr < 0:
+ if startptr < ctx.ZERO:
return 0, -1
endptr = find_mark(marks, gid + 1)
- length = endptr - startptr # < 0 if endptr < startptr (or if endptr=-1)
- return startptr, length
+ length_bytes = ctx.bytes_difference(endptr, startptr)
+ # < 0 if endptr < startptr (or if endptr=-1)
+ return startptr, length_bytes
@specializectx
-def match_repeated(ctx, ptr, oldptr, length):
- if ptr + length > ctx.end:
+def match_repeated(ctx, ptr, oldptr, length_bytes):
+ if ctx.bytes_difference(ctx.end, ptr) < length_bytes:
return False
- for i in range(length):
- if ctx.str(ptr + i) != ctx.str(oldptr + i):
+ for i in range(length_bytes):
+ if ctx.get_single_byte(ptr, i) != ctx.get_single_byte(oldptr, i):
return False
return True
@specializectx
-def match_repeated_ignore(ctx, ptr, oldptr, length):
- if ptr + length > ctx.end:
- return False
- for i in range(length):
- if ctx.lowstr(ptr + i) != ctx.lowstr(oldptr + i):
- return False
- return True
+def match_repeated_ignore(ctx, ptr, oldptr, length_bytes):
+ oldend = ctx.go_forward_by_bytes(oldptr, length_bytes)
+ while oldptr < oldend:
+ if ptr >= ctx.end:
+ return -1
+ if ctx.lowstr(ptr) != ctx.lowstr(oldptr):
+ return -1
+ ptr = ctx.next(ptr)
+ oldptr = ctx.next(oldptr)
+ return ptr
@specializectx
def find_repetition_end(ctx, ppos, ptr, maxcount, marks):
@@ -934,7 +941,7 @@
ctx.jitdriver_MatchInIgnore.jit_merge_point(ctx=ctx, ptr=ptr,
end=end, ppos=ppos)
if ptr < end and checkerfn(ctx, ptr, ppos):
- ptr += 1
+ ptr = ctx.next(ptr)
else:
return ptr
else:
@@ -996,9 +1003,8 @@
return at_non_boundary(ctx, ptr)
elif atcode == AT_END:
- remaining_chars = ctx.end - ptr
- return remaining_chars <= 0 or (
- remaining_chars == 1 and rsre_char.is_linebreak(ctx.str(ptr)))
+ return (ptr == ctx.end or
+ (ctx.next(ptr) == ctx.end and rsre_char.is_linebreak(ctx.str(ptr))))
elif atcode == AT_END_LINE:
return ptr == ctx.end or rsre_char.is_linebreak(ctx.str(ptr))
diff --git a/rpython/rlib/rsre/test/support.py b/rpython/rlib/rsre/test/support.py
--- a/rpython/rlib/rsre/test/support.py
+++ b/rpython/rlib/rsre/test/support.py
@@ -14,35 +14,27 @@
def __repr__(self):
return '<Position %d>' % (self._p)
def __cmp__(self, other):
- if not isinstance(other, (Position, MinusOnePosition)):
- raise TypeError("cannot compare %r with %r" % (self, other))
- return cmp(self._p, other._p)
-
-class MinusOnePosition(object):
- _p = -1
- def __repr__(self):
- return '<MinusOnePosition>'
- def __cmp__(self, other):
- if not isinstance(other, (Position, MinusOnePosition)):
- raise TypeError("cannot compare %r with %r" % (self, other))
- return cmp(self._p, other._p)
+ if isinstance(other, Position):
+ return cmp(self._p, other._p)
+ if type(other) is int and other == -1:
+ return cmp(self._p, -1)
+ raise TypeError("cannot compare %r with %r" % (self, other))
class MatchContextForTests(StrMatchContext):
"""Concrete subclass for matching in a plain string, tweaked for tests"""
ZERO = Position(0)
- MINUS1 = MinusOnePosition()
EXACT_DISTANCE = False
def next(self, position):
assert isinstance(position, Position)
return Position(position._p + 1)
- def prev_or_minus1(self, position):
+ def prev(self, position):
assert isinstance(position, Position)
if position._p == 0:
- return self.MINUS1
+ raise EndOfString
return Position(position._p - 1)
def next_n(self, position, n, end_position):
@@ -89,6 +81,21 @@
assert isinstance(position_high, Position)
return position_high._p - position_low._p + random.randrange(0, 10)
+ def bytes_difference(self, position1, position2):
+ assert isinstance(position1, Position)
+ assert isinstance(position2, Position)
+ return position1._p - position2._p
+
+ def get_single_byte(self, base_position, index):
+ assert isinstance(base_position, Position)
+ assert isinstance(index, int)
+ return ord(self._string[base_position._p + index])
+
+ def go_forward_by_bytes(self, base_position, index):
+ assert isinstance(base_position, Position)
+ assert isinstance(index, int)
+ return Position(base_position._p + index)
+
def match(pattern, string, start=0, end=sys.maxint, flags=0, fullmatch=False):
start, end = _adjust(start, end, len(string))
More information about the pypy-commit mailing list