[pypy-commit] pypy unicode-utf8: fix and a workaround

Mon Feb 27 10:23:24 EST 2017

Author: fijal
Branch: unicode-utf8
Changeset: r90404:737c72b15c6d
Date: 2017-02-27 16:22 +0100
http://bitbucket.org/pypy/pypy/changeset/737c72b15c6d/

Log:	fix and a workaround

diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -303,6 +303,13 @@
         return self.newlist(list_u)
         return W_ListObject.newlist_unicode(self, list_u)
 
+    def newlist_from_unicode(self, lst):
+        res_w = []
+        for u in lst:
+            assert u is not None
+            res_w.append(self.newutf8(u, -1))
+        return self.newlist(res_w)
+
     def newlist_int(self, list_i):
         return W_ListObject.newlist_int(self, list_i)
 
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -109,7 +109,7 @@
         if self._len() != 1:
             raise oefmt(space.w_TypeError,
                          "ord() expected a character, but string of length %d "
-                         "found", len(self._value))
+                         "found", self._len())
         return space.newint(rutf8.codepoint_at_pos(self._utf8, 0))
 
     def _new(self, value):
@@ -126,6 +126,9 @@
             self._length = self._compute_length()
         return self._length
 
+    def _compute_length(self):
+        return rutf8.compute_length_utf8(self._utf8)
+
     def _val(self, space):
         return self._utf8.decode('utf8')
 
@@ -156,7 +159,7 @@
     @specialize.argtype(1)
     def _chr(self, char):
         assert len(char) == 1
-        return char[0]
+        return unichr(ord(char[0]))
 
     def _multi_chr(self, unichar):
         return unichar
@@ -513,7 +516,7 @@
     def descr_zfill(self, space, width):
         selfval = self._utf8
         if len(selfval) == 0:
-            return W_UnicodeObject(self._chr('0') * width, width)
+            return W_UnicodeObject('0' * width, width)
         num_zeros = width - self._len()
         if num_zeros <= 0:
             # cannot return self, in case it is a subclass of str
@@ -525,7 +528,7 @@
             start = 1
         else:
             start = 0
-        builder.append_multiple_char(self._chr('0'), num_zeros)
+        builder.append_multiple_char('0', num_zeros)
         builder.append_slice(selfval, start, len(selfval))
         return W_UnicodeObject(builder.build(), width)
 
@@ -536,14 +539,14 @@
         value = self._utf8
         if space.is_none(w_sep):
             res = split(value, maxsplit=maxsplit)
-            return space.newlist([W_UnicodeObject(s, -1) for s in res])
+            return space.newlist_from_unicode(res)
 
         by = self.convert_arg_to_w_unicode(space, w_sep)._utf8
         if len(by) == 0:
             raise oefmt(space.w_ValueError, "empty separator")
         res = split(value, by, maxsplit)
 
-        return space.newlist([W_UnicodeObject(s, -1) for s in res])
+        return space.newlist_from_unicode(res)
 
     @unwrap_spec(maxsplit=int)
     def descr_rsplit(self, space, w_sep=None, maxsplit=-1):
@@ -551,14 +554,14 @@
         value = self._utf8
         if space.is_none(w_sep):
             res = rsplit(value, maxsplit=maxsplit)
-            return space.newlist([W_UnicodeObject(s, -1) for s in res])
+            return space.newlist_from_unicode(res)
 
         by = self.convert_arg_to_w_unicode(space, w_sep)._utf8
         if len(by) == 0:
             raise oefmt(space.w_ValueError, "empty separator")
         res = rsplit(value, by, maxsplit)
 
-        return space.newlist([W_UnicodeObject(s, -1) for s in res])
+        return space.newlist_from_unicode(res)
 
     @unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
     def descr_center(self, space, width, w_fillchar):
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -66,6 +66,14 @@
         return pos + 1
     return pos + ord(runicode._utf8_code_length[chr1 - 0x80])
 
+def compute_length_utf8(s):
+    pos = 0
+    lgt = 0
+    while pos < len(s):
+        pos = next_codepoint_pos(s, pos)
+        lgt += 1
+    return lgt
+
 def codepoint_at_pos(code, pos):
     """ Give a codepoint in code at pos - assumes valid utf8, no checking!
     """
diff --git a/rpython/rtyper/rmodel.py b/rpython/rtyper/rmodel.py
--- a/rpython/rtyper/rmodel.py
+++ b/rpython/rtyper/rmodel.py
@@ -359,6 +359,10 @@
     def ll_str(self, nothing): raise AssertionError("unreachable code")
 impossible_repr = VoidRepr()
 
+class __extend__(pairtype(Repr, VoidRepr)):
+    def convert_from_to((r_from, r_to), v, llops):
+        return inputconst(lltype.Void, None)
+
 class SimplePointerRepr(Repr):
     "Convenience Repr for simple ll pointer types with no operation on them."