[pypy-commit] pypy unicode-utf8: (fijal, arigo)
arigo
pypy.commits at gmail.com
Thu Aug 24 11:01:38 EDT 2017
Author: Armin Rigo <arigo at tunes.org>
Branch: unicode-utf8
Changeset: r92254:d4bde635e3a9
Date: 2017-08-24 17:00 +0200
http://bitbucket.org/pypy/pypy/changeset/d4bde635e3a9/
Log: (fijal, arigo)
General progress
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -9,7 +9,7 @@
from rpython.rlib.debug import make_sure_not_resized
from rpython.rlib.rarithmetic import base_int, widen, is_valid_int
from rpython.rlib.objectmodel import import_from_mixin, enforceargs, not_rpython
-from rpython.rlib import jit
+from rpython.rlib import jit, rutf8
# Object imports
from pypy.objspace.std.basestringtype import basestring_typedef
@@ -312,11 +312,12 @@
return self.newlist(list_u)
return W_ListObject.newlist_unicode(self, list_u)
- def newlist_from_unicode(self, lst):
+ def newlist_utf8(self, lst):
res_w = []
- for u in lst:
- assert u is not None
- res_w.append(self.newutf8(u, -1))
+ for utf in lst:
+ assert utf is not None
+ assert isinstance(utf, str)
+ res_w.append(self.newutf8(utf, rutf8.check_utf8(utf)))
return self.newlist(res_w)
def newlist_int(self, list_i):
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -315,6 +315,16 @@
assert u'one!two!three!'.replace('x', '@') == u'one!two!three!'
assert u'one!two!three!'.replace(u'x', '@', 2) == u'one!two!three!'
assert u'abc'.replace('', u'-') == u'-a-b-c-'
+ assert u'\u1234'.replace(u'', '-') == u'-\u1234-'
+ assert u'\u0234\u5678'.replace('', u'-') == u'-\u0234-\u5678-'
+ assert u'\u0234\u5678'.replace('', u'-', 0) == u'\u0234\u5678'
+ assert u'\u0234\u5678'.replace('', u'-', 1) == u'-\u0234\u5678'
+ assert u'\u0234\u5678'.replace('', u'-', 2) == u'-\u0234-\u5678'
+ assert u'\u0234\u5678'.replace('', u'-', 3) == u'-\u0234-\u5678-'
+ assert u'\u0234\u5678'.replace('', u'-', 4) == u'-\u0234-\u5678-'
+ assert u'\u0234\u5678'.replace('', u'-', 700) == u'-\u0234-\u5678-'
+ assert u'\u0234\u5678'.replace('', u'-', -1) == u'-\u0234-\u5678-'
+ assert u'\u0234\u5678'.replace('', u'-', -42) == u'-\u0234-\u5678-'
assert u'abc'.replace(u'', u'-', 3) == u'-a-b-c'
assert u'abc'.replace('', '-', 0) == u'abc'
assert u''.replace(u'', '') == u''
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -6,7 +6,7 @@
from rpython.rlib.buffer import StringBuffer
from rpython.rlib.mutbuffer import MutableStringBuffer
from rpython.rlib.rstring import StringBuilder, split, rsplit, UnicodeBuilder,\
- replace
+ replace_count
from rpython.rlib.runicode import make_unicode_escape_function
from rpython.rlib import rutf8, jit
@@ -41,7 +41,7 @@
self._length = length
self._ucs4 = ucs4str
if not we_are_translated():
- assert rutf8.compute_length_utf8(utf8str) == length
+ assert rutf8.check_utf8(utf8str) == length
def __repr__(self):
"""representation for debugging purposes"""
@@ -561,30 +561,30 @@
res = []
value = self._utf8
if space.is_none(w_sep):
- res = split(value, maxsplit=maxsplit, isutf8=1)
- return space.newlist_from_unicode(res)
+ res = split(value, maxsplit=maxsplit, isutf8=True)
+ return space.newlist_utf8(res)
by = self.convert_arg_to_w_unicode(space, w_sep)._utf8
if len(by) == 0:
raise oefmt(space.w_ValueError, "empty separator")
- res = split(value, by, maxsplit, isutf8=1)
+ res = split(value, by, maxsplit, isutf8=True)
- return space.newlist_from_unicode(res)
+ return space.newlist_utf8(res)
@unwrap_spec(maxsplit=int)
def descr_rsplit(self, space, w_sep=None, maxsplit=-1):
res = []
value = self._utf8
if space.is_none(w_sep):
- res = rsplit(value, maxsplit=maxsplit, isutf8=1)
- return space.newlist_from_unicode(res)
+ res = rsplit(value, maxsplit=maxsplit, isutf8=True)
+ return space.newlist_utf8(res)
by = self.convert_arg_to_w_unicode(space, w_sep)._utf8
if len(by) == 0:
raise oefmt(space.w_ValueError, "empty separator")
- res = rsplit(value, by, maxsplit, isutf8=1)
+ res = rsplit(value, by, maxsplit, isutf8=True)
- return space.newlist_from_unicode(res)
+ return space.newlist_utf8(res)
@unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
def descr_center(self, space, width, w_fillchar):
@@ -622,11 +622,13 @@
if count >= 0 and len(input) == 0:
return self._empty()
try:
- res = replace(input, w_sub._utf8, w_by._utf8, count)
+ res, replacements = replace_count(input, w_sub._utf8, w_by._utf8,
+ count, isutf8=True)
except OverflowError:
raise oefmt(space.w_OverflowError, "replace string is too long")
- return W_UnicodeObject(res, -1)
+ newlength = self._length + replacements * (w_by._length - w_sub._length)
+ return W_UnicodeObject(res, newlength)
def descr_mul(self, space, w_times):
try:
diff --git a/rpython/rlib/rstring.py b/rpython/rlib/rstring.py
--- a/rpython/rlib/rstring.py
+++ b/rpython/rlib/rstring.py
@@ -16,42 +16,38 @@
# -------------- public API for string functions -----------------------
@specialize.ll_and_arg(2)
-def _isspace(s, pos, isutf8=0):
+def _isspace(s, pos, isutf8=False):
if isutf8:
from rpython.rlib import rutf8
return rutf8.isspace(s, pos)
+ char = s[pos]
+ if isinstance(char, str):
+ return char.isspace()
else:
- char = s[pos]
- if isinstance(char, str):
- return char.isspace()
- else:
- assert isinstance(char, unicode)
- return unicodedb.isspace(ord(char))
+ assert isinstance(char, unicode)
+ return unicodedb.isspace(ord(char))
@specialize.ll_and_arg(2)
def _incr(s, pos, isutf8):
- from rpython.rlib.rutf8 import next_codepoint_pos
-
if isutf8:
- if pos == -1:
- return 0
+ from rpython.rlib.rutf8 import next_codepoint_pos
+ assert pos >= 0
return next_codepoint_pos(s, pos)
else:
return pos + 1
@specialize.ll_and_arg(2)
def _decr(s, pos, isutf8):
- from rpython.rlib.rutf8 import prev_codepoint_pos
-
if isutf8:
- if pos == 0:
+ from rpython.rlib.rutf8 import prev_codepoint_pos
+ if pos <= 0:
return -1
return prev_codepoint_pos(s, pos)
else:
return pos - 1
@specialize.ll_and_arg(3)
-def split(value, by=None, maxsplit=-1, isutf8=0):
+def split(value, by=None, maxsplit=-1, isutf8=False):
if by is None:
length = len(value)
i = 0
@@ -83,7 +79,11 @@
else:
break
return res
+ else:
+ return _split_by(value, by, maxsplit)
+@specialize.argtype(0)
+def _split_by(value, by, maxsplit):
if isinstance(value, unicode):
assert isinstance(by, unicode)
if isinstance(value, str):
@@ -133,7 +133,7 @@
@specialize.ll_and_arg(3)
-def rsplit(value, by=None, maxsplit=-1, isutf8=0):
+def rsplit(value, by=None, maxsplit=-1, isutf8=False):
if by is None:
res = []
@@ -147,30 +147,34 @@
else:
break # end of string, finished
- # find the start of the word
- # (more precisely, 'j' will be the space character before the word)
+ # find the start of the word as 'j1'
if maxsplit == 0:
- j = -1 # take all the rest of the string
+ j1 = 0 # take all the rest of the string
+ j = -1
else:
- j = _decr(value, i, isutf8)
- while j >= 0 and not _isspace(value, j, isutf8):
- j = _decr(value, j, isutf8)
+ j1 = i
+ while True:
+ j = _decr(value, j1, isutf8)
+ if j < 0 or _isspace(value, j, isutf8):
+ break
+ j1 = j
maxsplit -= 1 # NB. if it's already < 0, it stays < 0
- # the word is value[j+1:i+1]
- j1 = _incr(value, j, isutf8)
+ # the word is value[j1:i+1]
assert j1 >= 0
i1 = _incr(value, i, isutf8)
res.append(value[j1:i1])
- if j < 0:
- break
# continue to look from the character before the space before the word
i = _decr(value, j, isutf8)
res.reverse()
return res
+ else:
+ return _rsplit_by(value, by, maxsplit)
+@specialize.argtype(0)
+def _rsplit_by(value, by, maxsplit):
if isinstance(value, unicode):
assert isinstance(by, unicode)
if isinstance(value, str):
@@ -203,6 +207,11 @@
@specialize.argtype(0, 1)
@jit.elidable
def replace(input, sub, by, maxsplit=-1):
+ return replace_count(input, sub, by, maxsplit)[0]
+
+@specialize.ll_and_arg(4)
+@jit.elidable
+def replace_count(input, sub, by, maxsplit=-1, isutf8=False):
if isinstance(input, str):
Builder = StringBuilder
elif isinstance(input, unicode):
@@ -211,10 +220,10 @@
assert isinstance(input, list)
Builder = ByteListBuilder
if maxsplit == 0:
- return input
+ return input, 0
- if not sub:
+ if not sub and not isutf8:
upper = len(input)
if maxsplit > 0 and maxsplit < upper + 2:
upper = maxsplit - 1
@@ -234,9 +243,16 @@
builder.append(input[i])
builder.append(by)
builder.append_slice(input, upper, len(input))
+ replacements = upper + 1
else:
# First compute the exact result size
- cnt = count(input, sub, 0, len(input))
+ if sub:
+ cnt = count(input, sub, 0, len(input))
+ else:
+ assert isutf8
+ from rpython.rlib import rutf8
+ cnt = rutf8.compute_length_utf8(input) + 1
+
if cnt > maxsplit and maxsplit > 0:
cnt = maxsplit
diff_len = len(by) - len(sub)
@@ -245,23 +261,36 @@
result_size = ovfcheck(result_size + len(input))
except OverflowError:
raise
+ replacements = cnt
builder = Builder(result_size)
start = 0
sublen = len(sub)
- while maxsplit != 0:
- next = find(input, sub, start, len(input))
- if next < 0:
- break
- builder.append_slice(input, start, next)
- builder.append(by)
- start = next + sublen
- maxsplit -= 1 # NB. if it's already < 0, it stays < 0
+ if sublen == 0:
+ assert isutf8
+ from rpython.rlib import rutf8
+ while True:
+ builder.append(by)
+ maxsplit -= 1
+ if start == len(input) or maxsplit == 0:
+ break
+ next = rutf8.next_codepoint_pos(input, start)
+ builder.append_slice(input, start, next)
+ start = next
+ else:
+ while maxsplit != 0:
+ next = find(input, sub, start, len(input))
+ if next < 0:
+ break
+ builder.append_slice(input, start, next)
+ builder.append(by)
+ start = next + sublen
+ maxsplit -= 1 # NB. if it's already < 0, it stays < 0
builder.append_slice(input, start, len(input))
- return builder.build()
+ return builder.build(), replacements
def _normalize_start_end(length, start, end):
if start < 0:
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -203,6 +203,24 @@
return True
return False
+def utf8_in_chars(value, pos, chars):
+ """Equivalent of u'x' in u'xyz', where the left-hand side is
+ a single UTF-8 character extracted from the string 'value' at 'pos'.
+ Only works if both 'value' and 'chars' are correctly-formed UTF-8
+ strings.
+ """
+ end = next_codepoint_pos(value, pos)
+ i = 0
+ while i < len(chars):
+ k = pos
+ while value[k] == chars[i]:
+ k += 1
+ i += 1
+ if k == end:
+ return True
+ i += 1
+ return False
+
def _invalid_cont_byte(ordch):
return ordch>>6 != 0x2 # 0b10
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -79,14 +79,8 @@
else:
assert not rutf8.isspace(unichr(i).encode('utf8'), 0)
-@given(strategies.integers(min_value=0, max_value=sys.maxunicode),
- strategies.characters())
-def test_utf8_in_chars(i, uni):
- if not uni:
- return
- if unichr(i) in uni:
- response = True
- else:
- response = False
- r = unichr(i).encode('utf8') in uni.encode('utf8')
+@given(strategies.characters(), strategies.text())
+def test_utf8_in_chars(ch, txt):
+ response = rutf8.utf8_in_chars(ch.encode('utf8'), 0, txt.encode('utf8'))
+ r = (ch in txt)
assert r == response
More information about the pypy-commit
mailing list