[pypy-commit] pypy unicode-utf8: (fijal, arigo)

Thu Aug 24 11:01:38 EDT 2017

Author: Armin Rigo <arigo at tunes.org>
Branch: unicode-utf8
Changeset: r92254:d4bde635e3a9
Date: 2017-08-24 17:00 +0200
http://bitbucket.org/pypy/pypy/changeset/d4bde635e3a9/

Log:	(fijal, arigo)

	General progress

diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -9,7 +9,7 @@
 from rpython.rlib.debug import make_sure_not_resized
 from rpython.rlib.rarithmetic import base_int, widen, is_valid_int
 from rpython.rlib.objectmodel import import_from_mixin, enforceargs, not_rpython
-from rpython.rlib import jit
+from rpython.rlib import jit, rutf8
 
 # Object imports
 from pypy.objspace.std.basestringtype import basestring_typedef
@@ -312,11 +312,12 @@
         return self.newlist(list_u)
         return W_ListObject.newlist_unicode(self, list_u)
 
-    def newlist_from_unicode(self, lst):
+    def newlist_utf8(self, lst):
         res_w = []
-        for u in lst:
-            assert u is not None
-            res_w.append(self.newutf8(u, -1))
+        for utf in lst:
+            assert utf is not None
+            assert isinstance(utf, str)
+            res_w.append(self.newutf8(utf, rutf8.check_utf8(utf)))
         return self.newlist(res_w)
 
     def newlist_int(self, list_i):
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -315,6 +315,16 @@
         assert u'one!two!three!'.replace('x', '@') == u'one!two!three!'
         assert u'one!two!three!'.replace(u'x', '@', 2) == u'one!two!three!'
         assert u'abc'.replace('', u'-') == u'-a-b-c-'
+        assert u'\u1234'.replace(u'', '-') == u'-\u1234-'
+        assert u'\u0234\u5678'.replace('', u'-') == u'-\u0234-\u5678-'
+        assert u'\u0234\u5678'.replace('', u'-', 0) == u'\u0234\u5678'
+        assert u'\u0234\u5678'.replace('', u'-', 1) == u'-\u0234\u5678'
+        assert u'\u0234\u5678'.replace('', u'-', 2) == u'-\u0234-\u5678'
+        assert u'\u0234\u5678'.replace('', u'-', 3) == u'-\u0234-\u5678-'
+        assert u'\u0234\u5678'.replace('', u'-', 4) == u'-\u0234-\u5678-'
+        assert u'\u0234\u5678'.replace('', u'-', 700) == u'-\u0234-\u5678-'
+        assert u'\u0234\u5678'.replace('', u'-', -1) == u'-\u0234-\u5678-'
+        assert u'\u0234\u5678'.replace('', u'-', -42) == u'-\u0234-\u5678-'
         assert u'abc'.replace(u'', u'-', 3) == u'-a-b-c'
         assert u'abc'.replace('', '-', 0) == u'abc'
         assert u''.replace(u'', '') == u''
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -6,7 +6,7 @@
 from rpython.rlib.buffer import StringBuffer
 from rpython.rlib.mutbuffer import MutableStringBuffer
 from rpython.rlib.rstring import StringBuilder, split, rsplit, UnicodeBuilder,\
-     replace
+     replace_count
 from rpython.rlib.runicode import make_unicode_escape_function
 from rpython.rlib import rutf8, jit
 
@@ -41,7 +41,7 @@
         self._length = length
         self._ucs4 = ucs4str
         if not we_are_translated():
-            assert rutf8.compute_length_utf8(utf8str) == length
+            assert rutf8.check_utf8(utf8str) == length
 
     def __repr__(self):
         """representation for debugging purposes"""
@@ -561,30 +561,30 @@
         res = []
         value = self._utf8
         if space.is_none(w_sep):
-            res = split(value, maxsplit=maxsplit, isutf8=1)
-            return space.newlist_from_unicode(res)
+            res = split(value, maxsplit=maxsplit, isutf8=True)
+            return space.newlist_utf8(res)
 
         by = self.convert_arg_to_w_unicode(space, w_sep)._utf8
         if len(by) == 0:
             raise oefmt(space.w_ValueError, "empty separator")
-        res = split(value, by, maxsplit, isutf8=1)
+        res = split(value, by, maxsplit, isutf8=True)
 
-        return space.newlist_from_unicode(res)
+        return space.newlist_utf8(res)
 
     @unwrap_spec(maxsplit=int)
     def descr_rsplit(self, space, w_sep=None, maxsplit=-1):
         res = []
         value = self._utf8
         if space.is_none(w_sep):
-            res = rsplit(value, maxsplit=maxsplit, isutf8=1)
-            return space.newlist_from_unicode(res)
+            res = rsplit(value, maxsplit=maxsplit, isutf8=True)
+            return space.newlist_utf8(res)
 
         by = self.convert_arg_to_w_unicode(space, w_sep)._utf8
         if len(by) == 0:
             raise oefmt(space.w_ValueError, "empty separator")
-        res = rsplit(value, by, maxsplit, isutf8=1)
+        res = rsplit(value, by, maxsplit, isutf8=True)
 
-        return space.newlist_from_unicode(res)
+        return space.newlist_utf8(res)
 
     @unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
     def descr_center(self, space, width, w_fillchar):
@@ -622,11 +622,13 @@
         if count >= 0 and len(input) == 0:
             return self._empty()
         try:
-            res = replace(input, w_sub._utf8, w_by._utf8, count)
+            res, replacements = replace_count(input, w_sub._utf8, w_by._utf8,
+                                              count, isutf8=True)
         except OverflowError:
             raise oefmt(space.w_OverflowError, "replace string is too long")
 
-        return W_UnicodeObject(res, -1)
+        newlength = self._length + replacements * (w_by._length - w_sub._length)
+        return W_UnicodeObject(res, newlength)
 
     def descr_mul(self, space, w_times):
         try:
diff --git a/rpython/rlib/rstring.py b/rpython/rlib/rstring.py
--- a/rpython/rlib/rstring.py
+++ b/rpython/rlib/rstring.py
@@ -16,42 +16,38 @@
 # -------------- public API for string functions -----------------------
 
 @specialize.ll_and_arg(2)
-def _isspace(s, pos, isutf8=0):
+def _isspace(s, pos, isutf8=False):
     if isutf8:
         from rpython.rlib import rutf8
         return rutf8.isspace(s, pos)
+    char = s[pos]
+    if isinstance(char, str):
+        return char.isspace()
     else:
-        char = s[pos]
-        if isinstance(char, str):
-            return char.isspace()
-        else:
-            assert isinstance(char, unicode)
-            return unicodedb.isspace(ord(char))
+        assert isinstance(char, unicode)
+        return unicodedb.isspace(ord(char))
 
 @specialize.ll_and_arg(2)
 def _incr(s, pos, isutf8):
-    from rpython.rlib.rutf8 import next_codepoint_pos
-
     if isutf8:
-        if pos == -1:
-            return 0
+        from rpython.rlib.rutf8 import next_codepoint_pos
+        assert pos >= 0
         return next_codepoint_pos(s, pos)
     else:
         return pos + 1
 
 @specialize.ll_and_arg(2)
 def _decr(s, pos, isutf8):
-    from rpython.rlib.rutf8 import prev_codepoint_pos
-
     if isutf8:
-        if pos == 0:
+        from rpython.rlib.rutf8 import prev_codepoint_pos
+        if pos <= 0:
             return -1
         return prev_codepoint_pos(s, pos)
     else:
         return pos - 1
 
 @specialize.ll_and_arg(3)
-def split(value, by=None, maxsplit=-1, isutf8=0):
+def split(value, by=None, maxsplit=-1, isutf8=False):
     if by is None:
         length = len(value)
         i = 0
@@ -83,7 +79,11 @@
             else:
                 break
         return res
+    else:
+        return _split_by(value, by, maxsplit)
 
+@specialize.argtype(0)
+def _split_by(value, by, maxsplit):
     if isinstance(value, unicode):
         assert isinstance(by, unicode)
     if isinstance(value, str):
@@ -133,7 +133,7 @@
 
 
 @specialize.ll_and_arg(3)
-def rsplit(value, by=None, maxsplit=-1, isutf8=0):
+def rsplit(value, by=None, maxsplit=-1, isutf8=False):
     if by is None:
         res = []
 
@@ -147,30 +147,34 @@
             else:
                 break  # end of string, finished
 
-            # find the start of the word
-            # (more precisely, 'j' will be the space character before the word)
+            # find the start of the word as 'j1'
             if maxsplit == 0:
-                j = -1   # take all the rest of the string
+                j1 = 0   # take all the rest of the string
+                j = -1
             else:
-                j = _decr(value, i, isutf8)
-                while j >= 0 and not _isspace(value, j, isutf8):
-                    j = _decr(value, j, isutf8)
+                j1 = i
+                while True:
+                    j = _decr(value, j1, isutf8)
+                    if j < 0 or _isspace(value, j, isutf8):
+                        break
+                    j1 = j
                 maxsplit -= 1   # NB. if it's already < 0, it stays < 0
 
-            # the word is value[j+1:i+1]
-            j1 = _incr(value, j, isutf8)
+            # the word is value[j1:i+1]
             assert j1 >= 0
             i1 = _incr(value, i, isutf8)
             res.append(value[j1:i1])
-            if j < 0:
-                break
 
             # continue to look from the character before the space before the word
             i = _decr(value, j, isutf8)
 
         res.reverse()
         return res
+    else:
+        return _rsplit_by(value, by, maxsplit)
 
+@specialize.argtype(0)
+def _rsplit_by(value, by, maxsplit):
     if isinstance(value, unicode):
         assert isinstance(by, unicode)
     if isinstance(value, str):
@@ -203,6 +207,11 @@
 @specialize.argtype(0, 1)
 @jit.elidable
 def replace(input, sub, by, maxsplit=-1):
+    return replace_count(input, sub, by, maxsplit)[0]
+
+@specialize.ll_and_arg(4)
+@jit.elidable
+def replace_count(input, sub, by, maxsplit=-1, isutf8=False):
     if isinstance(input, str):
         Builder = StringBuilder
     elif isinstance(input, unicode):
@@ -211,10 +220,10 @@
         assert isinstance(input, list)
         Builder = ByteListBuilder
     if maxsplit == 0:
-        return input
+        return input, 0
 
 
-    if not sub:
+    if not sub and not isutf8:
         upper = len(input)
         if maxsplit > 0 and maxsplit < upper + 2:
             upper = maxsplit - 1
@@ -234,9 +243,16 @@
             builder.append(input[i])
         builder.append(by)
         builder.append_slice(input, upper, len(input))
+        replacements = upper + 1
     else:
         # First compute the exact result size
-        cnt = count(input, sub, 0, len(input))
+        if sub:
+            cnt = count(input, sub, 0, len(input))
+        else:
+            assert isutf8
+            from rpython.rlib import rutf8
+            cnt = rutf8.compute_length_utf8(input) + 1
+
         if cnt > maxsplit and maxsplit > 0:
             cnt = maxsplit
         diff_len = len(by) - len(sub)
@@ -245,23 +261,36 @@
             result_size = ovfcheck(result_size + len(input))
         except OverflowError:
             raise
+        replacements = cnt
 
         builder = Builder(result_size)
         start = 0
         sublen = len(sub)
 
-        while maxsplit != 0:
-            next = find(input, sub, start, len(input))
-            if next < 0:
-                break
-            builder.append_slice(input, start, next)
-            builder.append(by)
-            start = next + sublen
-            maxsplit -= 1   # NB. if it's already < 0, it stays < 0
+        if sublen == 0:
+            assert isutf8
+            from rpython.rlib import rutf8
+            while True:
+                builder.append(by)
+                maxsplit -= 1
+                if start == len(input) or maxsplit == 0:
+                    break
+                next = rutf8.next_codepoint_pos(input, start)
+                builder.append_slice(input, start, next)
+                start = next
+        else:
+            while maxsplit != 0:
+                next = find(input, sub, start, len(input))
+                if next < 0:
+                    break
+                builder.append_slice(input, start, next)
+                builder.append(by)
+                start = next + sublen
+                maxsplit -= 1   # NB. if it's already < 0, it stays < 0
 
         builder.append_slice(input, start, len(input))
 
-    return builder.build()
+    return builder.build(), replacements
 
 def _normalize_start_end(length, start, end):
     if start < 0:
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -203,6 +203,24 @@
         return True
     return False
 
+def utf8_in_chars(value, pos, chars):
+    """Equivalent of u'x' in u'xyz', where the left-hand side is
+    a single UTF-8 character extracted from the string 'value' at 'pos'.
+    Only works if both 'value' and 'chars' are correctly-formed UTF-8
+    strings.
+    """
+    end = next_codepoint_pos(value, pos)
+    i = 0
+    while i < len(chars):
+        k = pos
+        while value[k] == chars[i]:
+            k += 1
+            i += 1
+            if k == end:
+                return True
+        i += 1
+    return False
+
 
 def _invalid_cont_byte(ordch):
     return ordch>>6 != 0x2    # 0b10
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -79,14 +79,8 @@
         else:
             assert not rutf8.isspace(unichr(i).encode('utf8'), 0)
 
-@given(strategies.integers(min_value=0, max_value=sys.maxunicode),
-       strategies.characters())
-def test_utf8_in_chars(i, uni):
-    if not uni:
-        return
-    if unichr(i) in uni:
-        response = True
-    else:
-        response = False
-    r = unichr(i).encode('utf8') in uni.encode('utf8')
+@given(strategies.characters(), strategies.text())
+def test_utf8_in_chars(ch, txt):
+    response = rutf8.utf8_in_chars(ch.encode('utf8'), 0, txt.encode('utf8'))
+    r = (ch in txt)
     assert r == response