[pypy-commit] pypy unicode-utf8-py3: revert interp_sre.py to py3.5
mattip
pypy.commits at gmail.com
Mon Jul 9 09:01:04 EDT 2018
Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r94838:4ef833b2310d
Date: 2018-07-09 05:35 -0700
http://bitbucket.org/pypy/pypy/changeset/4ef833b2310d/
Log: revert interp_sre.py to py3.5
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -6,14 +6,14 @@
from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
from pypy.interpreter.error import OperationError, oefmt
from rpython.rlib.rarithmetic import intmask
-from rpython.rlib import jit, rutf8
-from rpython.rlib.rstring import StringBuilder
+from rpython.rlib import jit
+from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
# ____________________________________________________________
#
# Constants and exposed functions
-from rpython.rlib.rsre import rsre_core, rsre_char, rsre_utf8
+from rpython.rlib.rsre import rsre_core, rsre_char
from rpython.rlib.rsre.rsre_char import CODESIZE, MAXREPEAT, MAXGROUPS, getlower, set_unicode_db
@@ -34,21 +34,14 @@
def slice_w(space, ctx, start, end, w_default):
- # 'start' and 'end' are byte positions
- if ctx.ZERO <= start <= end:
+ if 0 <= start <= end:
if isinstance(ctx, rsre_core.BufMatchContext):
return space.newbytes(ctx._buffer.getslice(start, end, 1,
end-start))
if isinstance(ctx, rsre_core.StrMatchContext):
- start = ctx._real_pos(start)
- end = ctx._real_pos(end)
return space.newbytes(ctx._string[start:end])
- elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
- s = ctx._utf8[start:end]
- lgt = rutf8.get_utf8_length(s)
- return space.newutf8(s, lgt)
elif isinstance(ctx, rsre_core.UnicodeMatchContext):
- return space.newtext(ctx._unicodestr[start:end])
+ return space.newunicode(ctx._unicodestr[start:end])
else:
# unreachable
raise SystemError
@@ -60,7 +53,6 @@
# Returns a list of RPython-level integers.
# Unlike the app-level groups() method, groups are numbered from 0
# and the returned list does not start with the whole match range.
- # The integers are byte positions, not character indexes (for utf8).
if num_groups == 0:
return None
result = [-1] * (2 * num_groups)
@@ -113,7 +105,7 @@
def repr_w(self):
space = self.space
- u = space.utf8_w(space.repr(self.w_pattern)).decode()
+ u = space.realunicode_w(space.repr(self.w_pattern))
if len(u) > 200:
u = u[:200]
flag_items = []
@@ -136,7 +128,7 @@
else:
usep = u', '
uflags = u'|'.join([item.decode('latin-1') for item in flag_items])
- return space.newtext(u're.compile(%s%s%s)' % (u, usep, uflags))
+ return space.newunicode(u're.compile(%s%s%s)' % (u, usep, uflags))
def fget_groupindex(self, space):
w_groupindex = self.w_groupindex
@@ -166,7 +158,7 @@
buf = None
space = self.space
if space.isinstance_w(w_string, space.w_unicode):
- unicodestr = space.utf8_w(w_string).decode('utf8')
+ unicodestr = space.realunicode_w(w_string)
length = len(unicodestr)
elif space.isinstance_w(w_string, space.w_bytes):
string = space.bytes_w(w_string)
@@ -178,7 +170,7 @@
return (length, unicodestr, string, buf)
def make_ctx(self, w_string, pos=0, endpos=sys.maxint, flags=0):
- """Make a StrMatchContext, BufMatchContext or a Utf8MatchContext for
+ """Make a StrMatchContext, BufMatchContext or a UnicodeMatchContext for
searching in the given w_string object."""
space = self.space
length, unicodestr, string, buf = self.getstring(w_string)
@@ -211,27 +203,6 @@
return rsre_core.BufMatchContext(buf,
pos, endpos, flags)
- def fresh_copy(self, ctx):
- if isinstance(ctx, rsre_utf8.Utf8MatchContext):
- result = rsre_utf8.Utf8MatchContext(
- ctx._utf8, ctx.match_start, ctx.end, ctx.flags)
- result.w_unicode_obj = ctx.w_unicode_obj
- elif isinstance(ctx, rsre_core.StrMatchContext):
- result = self._make_str_match_context(
- ctx._string, ctx.match_start, ctx.end)
- elif isinstance(ctx, rsre_core.BufMatchContext):
- result = rsre_core.BufMatchContext(
- ctx._buffer, ctx.match_start, ctx.end, ctx.flags)
- else:
- raise AssertionError("bad ctx type")
- result.match_end = ctx.match_end
- return result
-
- def _make_str_match_context(self, str, pos, endpos):
- # for tests to override
- return rsre_core.StrMatchContext(str,
- pos, endpos, self.flags)
-
def getmatch(self, ctx, found):
if found:
return W_SRE_Match(self, ctx)
@@ -259,7 +230,7 @@
space = self.space
matchlist_w = []
ctx = self.make_ctx(w_string, pos, endpos)
- while True:
+ while ctx.match_start <= ctx.end:
if not searchcontext(space, ctx, self.code):
break
num_groups = self.num_groups
@@ -276,12 +247,8 @@
w_item = allgroups_w(space, ctx, fmarks, num_groups,
w_emptystr)
matchlist_w.append(w_item)
- reset_at = ctx.match_end
- if ctx.match_start == ctx.match_end:
- if reset_at == ctx.end:
- break
- reset_at = ctx.next_indirect(reset_at)
- ctx.reset(reset_at)
+ no_progress = (ctx.match_start == ctx.match_end)
+ ctx.reset(ctx.match_end + no_progress)
return space.newlist(matchlist_w)
@unwrap_spec(pos=int, endpos=int)
@@ -306,15 +273,15 @@
#
splitlist = []
n = 0
+ last = 0
ctx = self.make_ctx(w_string)
- last = ctx.ZERO
while not maxsplit or n < maxsplit:
if not searchcontext(space, ctx, self.code):
break
if ctx.match_start == ctx.match_end: # zero-width match
if ctx.match_start == ctx.end: # or end of string
break
- ctx.reset(ctx.next_indirect(ctx.match_end))
+ ctx.reset(ctx.match_end + 1)
continue
splitlist.append(slice_w(space, ctx, last, ctx.match_start,
space.w_None))
@@ -343,31 +310,27 @@
def subx(self, w_ptemplate, w_string, count):
space = self.space
- # use a (much faster) string builder (possibly utf8) if w_ptemplate and
+ # use a (much faster) string/unicode builder if w_ptemplate and
# w_string are both string or both unicode objects, and if w_ptemplate
# is a literal
- use_builder = '\x00' # or 'S'tring or 'U'nicode/UTF8
- filter_as_string = None
+ use_builder = False
+ filter_as_unicode = filter_as_string = None
if space.is_true(space.callable(w_ptemplate)):
w_filter = w_ptemplate
filter_is_callable = True
else:
- if space.isinstance_w(w_ptemplate, space.w_unicode):
- filter_as_string = space.utf8_w(w_ptemplate)
+ length, filter_as_unicode, filter_as_string, buf = (
+ self.getstring(w_ptemplate))
+ if filter_as_unicode is not None:
+ literal = u'\\' not in filter_as_unicode
+ use_builder = (
+ space.isinstance_w(w_string, space.w_unicode) and literal)
+ else:
+ if buf is not None:
+ filter_as_string = buf.as_str()
literal = '\\' not in filter_as_string
- if space.isinstance_w(w_string, space.w_unicode) and literal:
- use_builder = 'U'
- else:
- try:
- filter_as_string = space.bytes_w(w_ptemplate)
- except OperationError as e:
- if e.async(space):
- raise
- literal = False
- else:
- literal = '\\' not in filter_as_string
- if space.isinstance_w(w_string, space.w_bytes) and literal:
- use_builder = 'S'
+ use_builder = (
+ space.isinstance_w(w_string, space.w_bytes) and literal)
if literal:
w_filter = w_ptemplate
filter_is_callable = False
@@ -384,16 +347,18 @@
#
# XXX this is a bit of a mess, but it improves performance a lot
ctx = self.make_ctx(w_string)
- sublist_w = strbuilder = None
- if use_builder != '\x00':
- assert filter_as_string is not None
- strbuilder = StringBuilder(ctx.end)
+ sublist_w = strbuilder = unicodebuilder = None
+ if use_builder:
+ if filter_as_unicode is not None:
+ unicodebuilder = UnicodeBuilder(ctx.end)
+ else:
+ assert filter_as_string is not None
+ strbuilder = StringBuilder(ctx.end)
else:
sublist_w = []
- n = 0
- last_pos = ctx.ZERO
+ n = last_pos = 0
+ pattern = self.code
while not count or n < count:
- pattern = self.code
sub_jitdriver.jit_merge_point(
self=self,
use_builder=use_builder,
@@ -402,7 +367,9 @@
ctx=ctx, pattern=pattern,
w_filter=w_filter,
strbuilder=strbuilder,
+ unicodebuilder=unicodebuilder,
filter_as_string=filter_as_string,
+ filter_as_unicode=filter_as_unicode,
count=count,
w_string=w_string,
n=n, last_pos=last_pos, sublist_w=sublist_w
@@ -413,7 +380,10 @@
if last_pos < ctx.match_start:
_sub_append_slice(
ctx, space, use_builder, sublist_w,
- strbuilder, last_pos, ctx.match_start)
+ strbuilder, unicodebuilder, last_pos, ctx.match_start)
+ start = ctx.match_end
+ if start == ctx.match_start:
+ start += 1
if not (last_pos == ctx.match_start
== ctx.match_end and n > 0):
# the above ignores empty matches on latest position
@@ -421,48 +391,40 @@
if filter_is_callable:
w_match = self.getmatch(ctx, True)
# make a copy of 'ctx'; see test_sub_matches_stay_valid
- ctx = self.fresh_copy(ctx)
+ ctx = ctx.fresh_copy(start) # match_start/match_end dropped
w_piece = space.call_function(w_filter, w_match)
if not space.is_w(w_piece, space.w_None):
- assert strbuilder is None
- assert use_builder == '\x00'
+ assert strbuilder is None and unicodebuilder is None
+ assert not use_builder
sublist_w.append(w_piece)
else:
- if use_builder != '\x00':
- assert filter_as_string is not None
- assert strbuilder is not None
- strbuilder.append(filter_as_string)
+ if use_builder:
+ if strbuilder is not None:
+ assert filter_as_string is not None
+ strbuilder.append(filter_as_string)
+ else:
+ assert unicodebuilder is not None
+ assert filter_as_unicode is not None
+ unicodebuilder.append(filter_as_unicode)
else:
sublist_w.append(w_filter)
n += 1
elif last_pos >= ctx.end:
break # empty match at the end: finished
-
- start = ctx.match_end
- if start == ctx.match_start:
- if start == ctx.end:
- break
- start = ctx.next_indirect(start)
ctx.reset(start)
if last_pos < ctx.end:
_sub_append_slice(ctx, space, use_builder, sublist_w,
- strbuilder, last_pos, ctx.end)
- if use_builder != '\x00':
- assert strbuilder is not None
- result_bytes = strbuilder.build()
- if use_builder == 'S':
- assert not isinstance(ctx, rsre_utf8.Utf8MatchContext)
- return space.newbytes(result_bytes), n
- elif use_builder == 'U':
- assert isinstance(ctx, rsre_utf8.Utf8MatchContext)
- return space.newutf8(result_bytes,
- rutf8.get_utf8_length(result_bytes)), n
+ strbuilder, unicodebuilder, last_pos, ctx.end)
+ if use_builder:
+ if strbuilder is not None:
+ return space.newbytes(strbuilder.build()), n
else:
- raise AssertionError(use_builder)
+ assert unicodebuilder is not None
+ return space.newunicode(unicodebuilder.build()), n
else:
if space.isinstance_w(w_string, space.w_unicode):
- w_emptystr = space.newutf8('', 0)
+ w_emptystr = space.newunicode(u'')
else:
w_emptystr = space.newbytes('')
w_item = space.call_method(w_emptystr, 'join',
@@ -472,28 +434,26 @@
sub_jitdriver = jit.JitDriver(
reds="""count n last_pos
ctx w_filter
- strbuilder
+ strbuilder unicodebuilder
filter_as_string
+ filter_as_unicode
w_string sublist_w
self""".split(),
greens=["filter_is_callable", "use_builder", "filter_type", "pattern"])
def _sub_append_slice(ctx, space, use_builder, sublist_w,
- strbuilder, start, end):
- if use_builder != '\x00':
- assert strbuilder is not None
+ strbuilder, unicodebuilder, start, end):
+ if use_builder:
if isinstance(ctx, rsre_core.BufMatchContext):
- assert use_builder == 'S'
+ assert strbuilder is not None
return strbuilder.append(ctx._buffer.getslice(start, end, 1, end-start))
if isinstance(ctx, rsre_core.StrMatchContext):
- assert use_builder == 'S'
- start = ctx._real_pos(start)
- end = ctx._real_pos(end)
+ assert strbuilder is not None
return strbuilder.append_slice(ctx._string, start, end)
- elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
- assert use_builder == 'U'
- return strbuilder.append_slice(ctx._utf8, start, end)
+ elif isinstance(ctx, rsre_core.UnicodeMatchContext):
+ assert unicodebuilder is not None
+ return unicodebuilder.append_slice(ctx._unicodestr, start, end)
assert 0, "unreachable"
else:
sublist_w.append(slice_w(space, ctx, start, end, space.w_None))
@@ -568,10 +528,10 @@
ctx = self.ctx
start, end = ctx.match_start, ctx.match_end
w_s = slice_w(space, ctx, start, end, space.w_None)
- u = space.utf8_w(space.repr(w_s)).decode()
+ u = space.realunicode_w(space.repr(w_s))
if len(u) > 50:
u = u[:50]
- return space.newtext(u'<_sre.SRE_Match object; span=(%d, %d), match=%s>' %
+ return space.newunicode(u'<_sre.SRE_Match object; span=(%d, %d), match=%s>' %
(start, end, u))
def cannot_copy_w(self):
@@ -629,38 +589,19 @@
@unwrap_spec(w_groupnum=WrappedDefault(0))
def start_w(self, w_groupnum):
start, end = self.do_span(w_groupnum)
- start = self.bytepos_to_charindex(start)
return self.space.newint(start)
@unwrap_spec(w_groupnum=WrappedDefault(0))
def end_w(self, w_groupnum):
start, end = self.do_span(w_groupnum)
- end = self.bytepos_to_charindex(end)
return self.space.newint(end)
@unwrap_spec(w_groupnum=WrappedDefault(0))
def span_w(self, w_groupnum):
start, end = self.do_span(w_groupnum)
- return self.new_charindex_tuple(start, end)
-
- def new_charindex_tuple(self, start, end):
- start = self.bytepos_to_charindex(start)
- end = self.bytepos_to_charindex(end)
return self.space.newtuple([self.space.newint(start),
self.space.newint(end)])
- def bytepos_to_charindex(self, bytepos):
- # Transform a 'byte position', as returned by all methods from
- # rsre_core, back into a 'character index'. This is for UTF8
- # handling.
- ctx = self.ctx
- if isinstance(ctx, rsre_utf8.Utf8MatchContext):
- index_storage = ctx.w_unicode_obj._get_index_storage()
- return rutf8.codepoint_index_at_byte_position(
- ctx.w_unicode_obj._utf8, index_storage, bytepos)
- else:
- return bytepos
-
def flatten_marks(self):
if self.flatten_cache is None:
num_groups = self.srepat.num_groups
@@ -668,8 +609,6 @@
return self.flatten_cache
def do_span(self, w_arg):
- # return a pair of integers, which are byte positions, not
- # character indexes (for utf8)
space = self.space
try:
groupnum = space.int_w(w_arg)
@@ -717,10 +656,10 @@
return space.w_None
def fget_pos(self, space):
- return space.newint(self.bytepos_to_charindex(self.ctx.original_pos))
+ return space.newint(self.ctx.original_pos)
def fget_endpos(self, space):
- return space.newint(self.bytepos_to_charindex(self.ctx.end))
+ return space.newint(self.ctx.end)
def fget_regs(self, space):
space = self.space
@@ -728,11 +667,11 @@
num_groups = self.srepat.num_groups
result_w = [None] * (num_groups + 1)
ctx = self.ctx
- result_w[0] = self.new_charindex_tuple(ctx.match_start,
- ctx.match_end)
+ result_w[0] = space.newtuple([space.newint(ctx.match_start),
+ space.newint(ctx.match_end)])
for i in range(num_groups):
- result_w[i + 1] = self.new_charindex_tuple(fmarks[i*2],
- fmarks[i*2+1])
+ result_w[i + 1] = space.newtuple([space.newint(fmarks[i*2]),
+ space.newint(fmarks[i*2+1])])
return space.newtuple(result_w)
def fget_string(self, space):
@@ -741,11 +680,8 @@
return space.newbytes(ctx._buffer.as_str())
elif isinstance(ctx, rsre_core.StrMatchContext):
return space.newbytes(ctx._string)
- elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
- lgt = rutf8.get_utf8_length(ctx._utf8)
- return space.newutf8(ctx._utf8, lgt)
elif isinstance(ctx, rsre_core.UnicodeMatchContext):
- return space.newtext(ctx._unicodestr)
+ return space.newunicode(ctx._unicodestr)
else:
raise SystemError
@@ -786,53 +722,38 @@
self.ctx = ctx
self.code = code
# 'self.ctx' is always a fresh context in which no searching
- # or matching succeeded so far. It is None when the iterator is
- # exhausted.
+ # or matching succeeded so far.
def iter_w(self):
return self
def next_w(self):
- if self.ctx is None:
+ if self.ctx.match_start > self.ctx.end:
raise OperationError(self.space.w_StopIteration, self.space.w_None)
if not searchcontext(self.space, self.ctx, self.code):
raise OperationError(self.space.w_StopIteration, self.space.w_None)
return self.getmatch(True)
def match_w(self):
- if self.ctx is None:
+ if self.ctx.match_start > self.ctx.end:
return self.space.w_None
return self.getmatch(matchcontext(self.space, self.ctx, self.code))
def search_w(self):
- if self.ctx is None:
+ if self.ctx.match_start > self.ctx.end:
return self.space.w_None
return self.getmatch(searchcontext(self.space, self.ctx, self.code))
def getmatch(self, found):
- ctx = self.ctx
- assert ctx is not None
if found:
+ ctx = self.ctx
nextstart = ctx.match_end
- exhausted = False
- if ctx.match_start == nextstart:
- if nextstart == ctx.end:
- exhausted = True
- else:
- nextstart = ctx.next_indirect(nextstart)
- if exhausted:
- self.ctx = None
- else:
- self.ctx = self.srepat.fresh_copy(ctx)
- self.ctx.match_start = nextstart
+ nextstart += (ctx.match_start == nextstart)
+ self.ctx = ctx.fresh_copy(nextstart)
match = W_SRE_Match(self.srepat, ctx)
return match
else:
- # obscure corner case
- if ctx.match_start == ctx.end:
- self.ctx = None
- else:
- ctx.match_start = ctx.next_indirect(ctx.match_start)
+ self.ctx.match_start += 1 # obscure corner case
return None
W_SRE_Scanner.typedef = TypeDef(
More information about the pypy-commit
mailing list