[pypy-commit] pypy unicode-utf8-py3: revert 4ef833b2310d
mattip
pypy.commits at gmail.com
Wed Jan 9 13:10:47 EST 2019
Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r95590:082a51c10570
Date: 2019-01-09 08:08 +0200
http://bitbucket.org/pypy/pypy/changeset/082a51c10570/
Log: revert 4ef833b2310d
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -6,15 +6,14 @@
from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
from pypy.interpreter.error import OperationError, oefmt
from rpython.rlib.rarithmetic import intmask
-from rpython.rlib import jit
-from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
-from rpython.rlib.runicode import unicode_encode_utf_8
+from rpython.rlib import jit, rutf8
+from rpython.rlib.rstring import StringBuilder
# ____________________________________________________________
#
# Constants and exposed functions
-from rpython.rlib.rsre import rsre_core, rsre_char
+from rpython.rlib.rsre import rsre_core, rsre_char, rsre_utf8
from rpython.rlib.rsre.rsre_char import CODESIZE, MAXREPEAT, MAXGROUPS, getlower, set_unicode_db
@@ -35,17 +34,21 @@
def slice_w(space, ctx, start, end, w_default):
- if 0 <= start <= end:
+ # 'start' and 'end' are byte positions
+ if ctx.ZERO <= start <= end:
if isinstance(ctx, rsre_core.BufMatchContext):
return space.newbytes(ctx._buffer.getslice(start, end, 1,
end-start))
if isinstance(ctx, rsre_core.StrMatchContext):
+ start = ctx._real_pos(start)
+ end = ctx._real_pos(end)
return space.newbytes(ctx._string[start:end])
+ elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
+ s = ctx._utf8[start:end]
+ lgt = rutf8.get_utf8_length(s)
+ return space.newutf8(s, lgt)
elif isinstance(ctx, rsre_core.UnicodeMatchContext):
- uni = ctx._unicodestr[start:end]
- uni_utf8 = unicode_encode_utf_8(uni, len(uni), 'strict',
- allow_surrogates=True)
- return space.newtext(uni_utf8, len(uni))
+ return space.newtext(ctx._unicodestr[start:end])
else:
# unreachable
raise SystemError
@@ -57,6 +60,7 @@
# Returns a list of RPython-level integers.
# Unlike the app-level groups() method, groups are numbered from 0
# and the returned list does not start with the whole match range.
+ # The integers are byte positions, not character indexes (for utf8).
if num_groups == 0:
return None
result = [-1] * (2 * num_groups)
@@ -109,7 +113,7 @@
def repr_w(self):
space = self.space
- u = space.utf8_w(space.repr(self.w_pattern))
+ u = space.utf8_w(space.repr(self.w_pattern)).decode()
if len(u) > 200:
u = u[:200]
flag_items = []
@@ -127,12 +131,12 @@
if flags != 0:
flag_items.append('0x%x' % flags)
if len(flag_items) == 0:
- usep = ''
- uflags = ''
+ usep = u''
+ uflags = u''
else:
- usep = ', '
- uflags = '|'.join(flag_items)
- return space.newtext('re.compile(%s%s%s)' % (u, usep, uflags))
+ usep = u', '
+ uflags = u'|'.join([item.decode('latin-1') for item in flag_items])
+ return space.newtext(u're.compile(%s%s%s)' % (u, usep, uflags))
def fget_groupindex(self, space):
w_groupindex = self.w_groupindex
@@ -162,7 +166,7 @@
buf = None
space = self.space
if space.isinstance_w(w_string, space.w_unicode):
- unicodestr = space.realunicode_w(w_string)
+ unicodestr = space.utf8_w(w_string).decode('utf8')
length = len(unicodestr)
elif space.isinstance_w(w_string, space.w_bytes):
string = space.bytes_w(w_string)
@@ -174,7 +178,7 @@
return (length, unicodestr, string, buf)
def make_ctx(self, w_string, pos=0, endpos=sys.maxint, flags=0):
- """Make a StrMatchContext, BufMatchContext or a UnicodeMatchContext for
+ """Make a StrMatchContext, BufMatchContext or a Utf8MatchContext for
searching in the given w_string object."""
space = self.space
length, unicodestr, string, buf = self.getstring(w_string)
@@ -207,6 +211,27 @@
return rsre_core.BufMatchContext(buf,
pos, endpos, flags)
+ def fresh_copy(self, ctx):
+ if isinstance(ctx, rsre_utf8.Utf8MatchContext):
+ result = rsre_utf8.Utf8MatchContext(
+ ctx._utf8, ctx.match_start, ctx.end, ctx.flags)
+ result.w_unicode_obj = ctx.w_unicode_obj
+ elif isinstance(ctx, rsre_core.StrMatchContext):
+ result = self._make_str_match_context(
+ ctx._string, ctx.match_start, ctx.end)
+ elif isinstance(ctx, rsre_core.BufMatchContext):
+ result = rsre_core.BufMatchContext(
+ ctx._buffer, ctx.match_start, ctx.end, ctx.flags)
+ else:
+ raise AssertionError("bad ctx type")
+ result.match_end = ctx.match_end
+ return result
+
+ def _make_str_match_context(self, str, pos, endpos):
+ # for tests to override
+ return rsre_core.StrMatchContext(str,
+ pos, endpos, self.flags)
+
def getmatch(self, ctx, found):
if found:
return W_SRE_Match(self, ctx)
@@ -234,7 +259,7 @@
space = self.space
matchlist_w = []
ctx = self.make_ctx(w_string, pos, endpos)
- while ctx.match_start <= ctx.end:
+ while True:
if not searchcontext(space, ctx, self.code):
break
num_groups = self.num_groups
@@ -251,8 +276,12 @@
w_item = allgroups_w(space, ctx, fmarks, num_groups,
w_emptystr)
matchlist_w.append(w_item)
- no_progress = (ctx.match_start == ctx.match_end)
- ctx.reset(ctx.match_end + no_progress)
+ reset_at = ctx.match_end
+ if ctx.match_start == ctx.match_end:
+ if reset_at == ctx.end:
+ break
+ reset_at = ctx.next_indirect(reset_at)
+ ctx.reset(reset_at)
return space.newlist(matchlist_w)
@unwrap_spec(pos=int, endpos=int)
@@ -277,15 +306,15 @@
#
splitlist = []
n = 0
- last = 0
ctx = self.make_ctx(w_string)
+ last = ctx.ZERO
while not maxsplit or n < maxsplit:
if not searchcontext(space, ctx, self.code):
break
if ctx.match_start == ctx.match_end: # zero-width match
if ctx.match_start == ctx.end: # or end of string
break
- ctx.reset(ctx.match_end + 1)
+ ctx.reset(ctx.next_indirect(ctx.match_end))
continue
splitlist.append(slice_w(space, ctx, last, ctx.match_start,
space.w_None))
@@ -314,27 +343,31 @@
def subx(self, w_ptemplate, w_string, count):
space = self.space
- # use a (much faster) string/unicode builder if w_ptemplate and
+ # use a (much faster) string builder (possibly utf8) if w_ptemplate and
# w_string are both string or both unicode objects, and if w_ptemplate
# is a literal
- use_builder = False
- filter_as_unicode = filter_as_string = None
+ use_builder = '\x00' # or 'S'tring or 'U'nicode/UTF8
+ filter_as_string = None
if space.is_true(space.callable(w_ptemplate)):
w_filter = w_ptemplate
filter_is_callable = True
else:
- length, filter_as_unicode, filter_as_string, buf = (
- self.getstring(w_ptemplate))
- if filter_as_unicode is not None:
- literal = u'\\' not in filter_as_unicode
- use_builder = (
- space.isinstance_w(w_string, space.w_unicode) and literal)
+ if space.isinstance_w(w_ptemplate, space.w_unicode):
+ filter_as_string = space.utf8_w(w_ptemplate)
+ literal = '\\' not in filter_as_string
+ if space.isinstance_w(w_string, space.w_unicode) and literal:
+ use_builder = 'U'
else:
- if buf is not None:
- filter_as_string = buf.as_str()
- literal = '\\' not in filter_as_string
- use_builder = (
- space.isinstance_w(w_string, space.w_bytes) and literal)
+ try:
+ filter_as_string = space.bytes_w(w_ptemplate)
+ except OperationError as e:
+ if e.async(space):
+ raise
+ literal = False
+ else:
+ literal = '\\' not in filter_as_string
+ if space.isinstance_w(w_string, space.w_bytes) and literal:
+ use_builder = 'S'
if literal:
w_filter = w_ptemplate
filter_is_callable = False
@@ -351,18 +384,16 @@
#
# XXX this is a bit of a mess, but it improves performance a lot
ctx = self.make_ctx(w_string)
- sublist_w = strbuilder = unicodebuilder = None
- if use_builder:
- if filter_as_unicode is not None:
- unicodebuilder = UnicodeBuilder(ctx.end)
- else:
- assert filter_as_string is not None
- strbuilder = StringBuilder(ctx.end)
+ sublist_w = strbuilder = None
+ if use_builder != '\x00':
+ assert filter_as_string is not None
+ strbuilder = StringBuilder(ctx.end)
else:
sublist_w = []
- n = last_pos = 0
- pattern = self.code
+ n = 0
+ last_pos = ctx.ZERO
while not count or n < count:
+ pattern = self.code
sub_jitdriver.jit_merge_point(
self=self,
use_builder=use_builder,
@@ -371,9 +402,7 @@
ctx=ctx, pattern=pattern,
w_filter=w_filter,
strbuilder=strbuilder,
- unicodebuilder=unicodebuilder,
filter_as_string=filter_as_string,
- filter_as_unicode=filter_as_unicode,
count=count,
w_string=w_string,
n=n, last_pos=last_pos, sublist_w=sublist_w
@@ -384,10 +413,7 @@
if last_pos < ctx.match_start:
_sub_append_slice(
ctx, space, use_builder, sublist_w,
- strbuilder, unicodebuilder, last_pos, ctx.match_start)
- start = ctx.match_end
- if start == ctx.match_start:
- start += 1
+ strbuilder, last_pos, ctx.match_start)
if not (last_pos == ctx.match_start
== ctx.match_end and n > 0):
# the above ignores empty matches on latest position
@@ -395,40 +421,48 @@
if filter_is_callable:
w_match = self.getmatch(ctx, True)
# make a copy of 'ctx'; see test_sub_matches_stay_valid
- ctx = ctx.fresh_copy(start) # match_start/match_end dropped
+ ctx = self.fresh_copy(ctx)
w_piece = space.call_function(w_filter, w_match)
if not space.is_w(w_piece, space.w_None):
- assert strbuilder is None and unicodebuilder is None
- assert not use_builder
+ assert strbuilder is None
+ assert use_builder == '\x00'
sublist_w.append(w_piece)
else:
- if use_builder:
- if strbuilder is not None:
- assert filter_as_string is not None
- strbuilder.append(filter_as_string)
- else:
- assert unicodebuilder is not None
- assert filter_as_unicode is not None
- unicodebuilder.append(filter_as_unicode)
+ if use_builder != '\x00':
+ assert filter_as_string is not None
+ assert strbuilder is not None
+ strbuilder.append(filter_as_string)
else:
sublist_w.append(w_filter)
n += 1
elif last_pos >= ctx.end:
break # empty match at the end: finished
+
+ start = ctx.match_end
+ if start == ctx.match_start:
+ if start == ctx.end:
+ break
+ start = ctx.next_indirect(start)
ctx.reset(start)
if last_pos < ctx.end:
_sub_append_slice(ctx, space, use_builder, sublist_w,
- strbuilder, unicodebuilder, last_pos, ctx.end)
- if use_builder:
- if strbuilder is not None:
- return space.newbytes(strbuilder.build()), n
+ strbuilder, last_pos, ctx.end)
+ if use_builder != '\x00':
+ assert strbuilder is not None
+ result_bytes = strbuilder.build()
+ if use_builder == 'S':
+ assert not isinstance(ctx, rsre_utf8.Utf8MatchContext)
+ return space.newbytes(result_bytes), n
+ elif use_builder == 'U':
+ assert isinstance(ctx, rsre_utf8.Utf8MatchContext)
+ return space.newutf8(result_bytes,
+ rutf8.get_utf8_length(result_bytes)), n
else:
- assert unicodebuilder is not None
- return space.newtext(unicodebuilder.build()), n
+ raise AssertionError(use_builder)
else:
if space.isinstance_w(w_string, space.w_unicode):
- w_emptystr = space.newtext('')
+ w_emptystr = space.newutf8('', 0)
else:
w_emptystr = space.newbytes('')
w_item = space.call_method(w_emptystr, 'join',
@@ -438,26 +472,28 @@
sub_jitdriver = jit.JitDriver(
reds="""count n last_pos
ctx w_filter
- strbuilder unicodebuilder
+ strbuilder
filter_as_string
- filter_as_unicode
w_string sublist_w
self""".split(),
greens=["filter_is_callable", "use_builder", "filter_type", "pattern"])
def _sub_append_slice(ctx, space, use_builder, sublist_w,
- strbuilder, unicodebuilder, start, end):
- if use_builder:
+ strbuilder, start, end):
+ if use_builder != '\x00':
+ assert strbuilder is not None
if isinstance(ctx, rsre_core.BufMatchContext):
- assert strbuilder is not None
+ assert use_builder == 'S'
return strbuilder.append(ctx._buffer.getslice(start, end, 1, end-start))
if isinstance(ctx, rsre_core.StrMatchContext):
- assert strbuilder is not None
+ assert use_builder == 'S'
+ start = ctx._real_pos(start)
+ end = ctx._real_pos(end)
return strbuilder.append_slice(ctx._string, start, end)
- elif isinstance(ctx, rsre_core.UnicodeMatchContext):
- assert unicodebuilder is not None
- return unicodebuilder.append_slice(ctx._unicodestr, start, end)
+ elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
+ assert use_builder == 'U'
+ return strbuilder.append_slice(ctx._utf8, start, end)
assert 0, "unreachable"
else:
sublist_w.append(slice_w(space, ctx, start, end, space.w_None))
@@ -532,10 +568,10 @@
ctx = self.ctx
start, end = ctx.match_start, ctx.match_end
w_s = slice_w(space, ctx, start, end, space.w_None)
- u = space.utf8_w(space.repr(w_s))
+ u = space.utf8_w(space.repr(w_s)).decode()
if len(u) > 50:
u = u[:50]
- return space.newtext('<_sre.SRE_Match object; span=(%d, %d), match=%s>' %
+ return space.newtext(u'<_sre.SRE_Match object; span=(%d, %d), match=%s>' %
(start, end, u))
def cannot_copy_w(self):
@@ -593,19 +629,38 @@
@unwrap_spec(w_groupnum=WrappedDefault(0))
def start_w(self, w_groupnum):
start, end = self.do_span(w_groupnum)
+ start = self.bytepos_to_charindex(start)
return self.space.newint(start)
@unwrap_spec(w_groupnum=WrappedDefault(0))
def end_w(self, w_groupnum):
start, end = self.do_span(w_groupnum)
+ end = self.bytepos_to_charindex(end)
return self.space.newint(end)
@unwrap_spec(w_groupnum=WrappedDefault(0))
def span_w(self, w_groupnum):
start, end = self.do_span(w_groupnum)
+ return self.new_charindex_tuple(start, end)
+
+ def new_charindex_tuple(self, start, end):
+ start = self.bytepos_to_charindex(start)
+ end = self.bytepos_to_charindex(end)
return self.space.newtuple([self.space.newint(start),
self.space.newint(end)])
+ def bytepos_to_charindex(self, bytepos):
+ # Transform a 'byte position', as returned by all methods from
+ # rsre_core, back into a 'character index'. This is for UTF8
+ # handling.
+ ctx = self.ctx
+ if isinstance(ctx, rsre_utf8.Utf8MatchContext):
+ index_storage = ctx.w_unicode_obj._get_index_storage()
+ return rutf8.codepoint_index_at_byte_position(
+ ctx.w_unicode_obj._utf8, index_storage, bytepos)
+ else:
+ return bytepos
+
def flatten_marks(self):
if self.flatten_cache is None:
num_groups = self.srepat.num_groups
@@ -613,6 +668,8 @@
return self.flatten_cache
def do_span(self, w_arg):
+ # return a pair of integers, which are byte positions, not
+ # character indexes (for utf8)
space = self.space
try:
groupnum = space.int_w(w_arg)
@@ -660,10 +717,10 @@
return space.w_None
def fget_pos(self, space):
- return space.newint(self.ctx.original_pos)
+ return space.newint(self.bytepos_to_charindex(self.ctx.original_pos))
def fget_endpos(self, space):
- return space.newint(self.ctx.end)
+ return space.newint(self.bytepos_to_charindex(self.ctx.end))
def fget_regs(self, space):
space = self.space
@@ -671,11 +728,11 @@
num_groups = self.srepat.num_groups
result_w = [None] * (num_groups + 1)
ctx = self.ctx
- result_w[0] = space.newtuple([space.newint(ctx.match_start),
- space.newint(ctx.match_end)])
+ result_w[0] = self.new_charindex_tuple(ctx.match_start,
+ ctx.match_end)
for i in range(num_groups):
- result_w[i + 1] = space.newtuple([space.newint(fmarks[i*2]),
- space.newint(fmarks[i*2+1])])
+ result_w[i + 1] = self.new_charindex_tuple(fmarks[i*2],
+ fmarks[i*2+1])
return space.newtuple(result_w)
def fget_string(self, space):
@@ -684,6 +741,9 @@
return space.newbytes(ctx._buffer.as_str())
elif isinstance(ctx, rsre_core.StrMatchContext):
return space.newbytes(ctx._string)
+ elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
+ lgt = rutf8.get_utf8_length(ctx._utf8)
+ return space.newutf8(ctx._utf8, lgt)
elif isinstance(ctx, rsre_core.UnicodeMatchContext):
return space.newtext(ctx._unicodestr)
else:
@@ -726,38 +786,53 @@
self.ctx = ctx
self.code = code
# 'self.ctx' is always a fresh context in which no searching
- # or matching succeeded so far.
+ # or matching succeeded so far. It is None when the iterator is
+ # exhausted.
def iter_w(self):
return self
def next_w(self):
- if self.ctx.match_start > self.ctx.end:
+ if self.ctx is None:
raise OperationError(self.space.w_StopIteration, self.space.w_None)
if not searchcontext(self.space, self.ctx, self.code):
raise OperationError(self.space.w_StopIteration, self.space.w_None)
return self.getmatch(True)
def match_w(self):
- if self.ctx.match_start > self.ctx.end:
+ if self.ctx is None:
return self.space.w_None
return self.getmatch(matchcontext(self.space, self.ctx, self.code))
def search_w(self):
- if self.ctx.match_start > self.ctx.end:
+ if self.ctx is None:
return self.space.w_None
return self.getmatch(searchcontext(self.space, self.ctx, self.code))
def getmatch(self, found):
+ ctx = self.ctx
+ assert ctx is not None
if found:
- ctx = self.ctx
nextstart = ctx.match_end
- nextstart += (ctx.match_start == nextstart)
- self.ctx = ctx.fresh_copy(nextstart)
+ exhausted = False
+ if ctx.match_start == nextstart:
+ if nextstart == ctx.end:
+ exhausted = True
+ else:
+ nextstart = ctx.next_indirect(nextstart)
+ if exhausted:
+ self.ctx = None
+ else:
+ self.ctx = self.srepat.fresh_copy(ctx)
+ self.ctx.match_start = nextstart
match = W_SRE_Match(self.srepat, ctx)
return match
else:
- self.ctx.match_start += 1 # obscure corner case
+ # obscure corner case
+ if ctx.match_start == ctx.end:
+ self.ctx = None
+ else:
+ ctx.match_start = ctx.next_indirect(ctx.match_start)
return None
W_SRE_Scanner.typedef = TypeDef(
More information about the pypy-commit mailing list