[pypy-commit] pypy unicode-utf8: whack at cffi

Tue Nov 21 15:09:40 EST 2017

Author: fijal
Branch: unicode-utf8
Changeset: r93119:c6537b6d453f
Date: 2017-11-21 21:09 +0100
http://bitbucket.org/pypy/pypy/changeset/c6537b6d453f/

Log:	whack at cffi

diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -7,3 +7,5 @@
 * better flag handling in split/splitlines maybe?
 * encode_error_handler has XXX
 * remove assertions from W_UnicodeObject.__init__ if all the builders pass
+* what to do with error handlers that go backwards. There were tests
+  in test_codecs that would check for that
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -1773,6 +1773,13 @@
                         "characters")
         return rstring.assert_str0(result)
 
+    def convert_arg_to_w_unicode(self, w_obj, strict=None):
+        # XXX why does convert_to_w_unicode do something slightly different?
+        from pypy.objspace.std.unicodeobject import W_UnicodeObject
+        assert not hasattr(self, 'is_fake_objspace')
+        return W_UnicodeObject.convert_arg_to_w_unicode(self, w_obj, strict)
+
+
     def realutf8_w(self, w_obj):
         # Like utf8_w(), but only works if w_obj is really of type
         # 'unicode'.  On Python 3 this is the same as utf8_w().
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -35,9 +35,8 @@
     return raise_unicode_exception_encode
 
 def convert_arg_to_w_unicode(space, w_arg, strict=None):
-    from pypy.objspace.std.unicodeobject import W_UnicodeObject
-    assert not hasattr(space, 'is_fake_objspace')
-    return W_UnicodeObject.convert_arg_to_w_unicode(space, w_arg, strict)
+    # thin wrapper around the objspace method; forward 'strict' as before
+    return space.convert_arg_to_w_unicode(w_arg, strict)
 
 # ____________________________________________________________
 
diff --git a/pypy/module/_cffi_backend/ctypearray.py b/pypy/module/_cffi_backend/ctypearray.py
--- a/pypy/module/_cffi_backend/ctypearray.py
+++ b/pypy/module/_cffi_backend/ctypearray.py
@@ -63,11 +63,14 @@
             return (w_value, len(s) + 1)
         elif space.isinstance_w(w_value, space.w_unicode):
             from pypy.module._cffi_backend import wchar_helper
-            u = space.unicode_w(w_value)
-            if self.ctitem.size == 2:
-                length = wchar_helper.unicode_size_as_char16(u)
+            w_u = space.convert_arg_to_w_unicode(w_value)
+            if self.ctitem.size == 4:
+                length = w_u._len()
             else:
-                length = wchar_helper.unicode_size_as_char32(u)
+                # 16-bit items: code points above 0xFFFF take two units
+                # each, so always measure rather than rely on a flag
+                length = wchar_helper.unicode_size_as_char16(
+                    w_u._utf8, w_u._len())
             return (w_value, length + 1)
         else:
             explicitlength = space.getindex_w(w_value, space.w_OverflowError)
diff --git a/pypy/module/_cffi_backend/ctypeprim.py b/pypy/module/_cffi_backend/ctypeprim.py
--- a/pypy/module/_cffi_backend/ctypeprim.py
+++ b/pypy/module/_cffi_backend/ctypeprim.py
@@ -5,7 +5,7 @@
 import sys
 
 from rpython.rlib.rarithmetic import r_uint, r_ulonglong, intmask
-from rpython.rlib import jit
+from rpython.rlib import jit, rutf8
 from rpython.rtyper.lltypesystem import lltype, rffi
 from rpython.rtyper.tool import rfficache
 
@@ -40,14 +40,15 @@
         return ord(s[0])
 
     def cast_unicode(self, w_ob):
+        # Return the ordinal of the single code point held by 'w_ob';
+        # a unicode string of any other length raises TypeError.
         space = self.space
-        s = space.unicode_w(w_ob)
-        try:
-            ordinal = wchar_helper.unicode_to_ordinal(s)
-        except ValueError:
+        w_u = space.convert_arg_to_w_unicode(w_ob)
+        if w_u._len() != 1:
             raise oefmt(space.w_TypeError,
                         "cannot cast unicode string of length %d to ctype '%s'",
-                        len(s), self.name)
+                        w_u._len(), self.name)
+        ordinal = rutf8.codepoint_at_pos(w_u._utf8, 0)
         return intmask(ordinal)
 
     def cast(self, w_ob):
@@ -175,8 +176,10 @@
 
     def convert_to_object(self, cdata):
         if self.is_signed_wchar:
-            unichardata = rffi.cast(rffi.CWCHARP, cdata)
-            return self.space.newunicode(unichardata[0])
+            code = ord(rffi.cast(rffi.CWCHARP, cdata)[0])
+            return self.space.newutf8(
+                rutf8.unichr_as_utf8(code), 1,
+                rutf8.get_flag_from_code(code))
         else:
             value = misc.read_raw_ulong_data(cdata, self.size)   # r_uint
             try:
@@ -185,7 +188,8 @@
                 raise oefmt(self.space.w_ValueError,
                             "char32_t out of range for "
                             "conversion to unicode: %s", hex(e.ordinal))
-            return self.space.newunicode(u)
+            return self.space.newutf8(rutf8.unichr_as_utf8(ord(u)), 1,
+                rutf8.get_flag_from_code(ord(u)))
 
     def string(self, cdataobj, maxlen):
         with cdataobj as ptr:
@@ -196,16 +200,15 @@
         # returns a r_uint.  If self.size == 2, it is smaller than 0x10000
         space = self.space
         if space.isinstance_w(w_ob, space.w_unicode):
-            u = space.unicode_w(w_ob)
-            try:
-                ordinal = wchar_helper.unicode_to_ordinal(u)
-            except ValueError:
-                pass
-            else:
-                if self.size == 2 and ordinal > 0xffff:
-                    raise self._convert_error("single character <= 0xFFFF",
-                                              w_ob)
-                return ordinal
+            w_u = space.convert_arg_to_w_unicode(w_ob)
+            # anything but exactly one code point falls through to the code
+            # after this if/elif chain, like the old ValueError path did
+            if w_u._len() == 1:
+                ordinal = rutf8.codepoint_at_pos(w_u._utf8, 0)
+                if self.size == 2 and ordinal > 0xffff:
+                    raise self._convert_error("single character <= 0xFFFF",
+                                              w_ob)
+                return r_uint(ordinal)
         elif (isinstance(w_ob, cdataobj.W_CData) and
                isinstance(w_ob.ctype, W_CTypePrimitiveUniChar) and
                w_ob.ctype.size == self.size):
diff --git a/pypy/module/_cffi_backend/ctypeptr.py b/pypy/module/_cffi_backend/ctypeptr.py
--- a/pypy/module/_cffi_backend/ctypeptr.py
+++ b/pypy/module/_cffi_backend/ctypeptr.py
@@ -91,11 +91,15 @@
             from pypy.module._cffi_backend import wchar_helper
             if not space.isinstance_w(w_ob, space.w_unicode):
                 raise self._convert_error("unicode or list or tuple", w_ob)
-            s = space.unicode_w(w_ob)
-            if self.ctitem.size == 2:
-                n = wchar_helper.unicode_size_as_char16(s)
+            w_u = space.convert_arg_to_w_unicode(w_ob)
+            if self.ctitem.size == 4:
+                n = w_u._len()
             else:
-                n = wchar_helper.unicode_size_as_char32(s)
+                # 16-bit items: code points above 0xFFFF take two units
+                # each, so always measure rather than rely on a flag.
+                # (the item size, not self.size, selects the encoding)
+                n = wchar_helper.unicode_size_as_char16(
+                    w_u._utf8, w_u._len())
             if self.length >= 0 and n > self.length:
                 raise oefmt(space.w_IndexError,
                             "initializer unicode string is too long for '%s' "
@@ -328,11 +332,12 @@
             length = len(s) + 1
         elif space.isinstance_w(w_init, space.w_unicode):
             from pypy.module._cffi_backend import wchar_helper
-            u = space.unicode_w(w_init)
+            w_u = space.convert_arg_to_w_unicode(w_init)
             if self.ctitem.size == 2:
-                length = wchar_helper.unicode_size_as_char16(u)
+                length = wchar_helper.unicode_size_as_char16(w_u._utf8,
+                                                             w_u._len())
             else:
-                length = wchar_helper.unicode_size_as_char32(u)
+                length = w_u._len()
             length += 1
         elif self.is_file:
             result = self.prepare_file(w_init)
diff --git a/pypy/module/_cffi_backend/wchar_helper.py b/pypy/module/_cffi_backend/wchar_helper.py
--- a/pypy/module/_cffi_backend/wchar_helper.py
+++ b/pypy/module/_cffi_backend/wchar_helper.py
@@ -1,10 +1,12 @@
+from rpython.rlib import rutf8
 from rpython.rlib.objectmodel import specialize
+from rpython.rlib.rstring import StringBuilder
 from rpython.rlib.rarithmetic import r_uint, r_ulonglong, intmask
 from rpython.rtyper.annlowlevel import llunicode
 from rpython.rtyper.lltypesystem import lltype, rffi
 from rpython.rtyper.lltypesystem.rstr import copy_unicode_to_raw
 
-SIZE_UNICODE = rffi.sizeof(lltype.UniChar)
+SIZE_UNICODE = 4
 
 
 if SIZE_UNICODE == 4:
@@ -48,7 +50,7 @@
         self.ordinal = ordinal
 
 def _unicode_from_wchar(ptr, length):
-    return rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, ptr), length)
+    return rffi.wcharpsize2utf8(rffi.cast(rffi.CWCHARP, ptr), length)
 
 
 if SIZE_UNICODE == 2:
@@ -86,7 +88,7 @@
     def unicode_from_char16(ptr, length):
         # 'ptr' is a pointer to 'length' 16-bit integers
         ptr = rffi.cast(rffi.USHORTP, ptr)
-        u = [u'\x00'] * length
+        u = StringBuilder(length)
         i = 0
         j = 0
         while j < length:
@@ -97,10 +99,9 @@
                 if 0xDC00 <= ch2 <= 0xDFFF:
                     ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000
                     j += 1
-            u[i] = unichr(ch)
+            rutf8.unichr_as_utf8_append(u, ch)
             i += 1
-        del u[i:]
-        return u''.join(u)
+        return u.build()
 
 
 @specialize.ll()
@@ -121,23 +122,16 @@
     return _measure_length(rffi.cast(rffi.UINTP, ptr), maxlen)
 
 
-def unicode_size_as_char16(u):
-    result = len(u)
-    if SIZE_UNICODE == 4:
-        for i in range(result):
-            if ord(u[i]) > 0xFFFF:
-                result += 1
+def unicode_size_as_char16(u, length):
+    # number of 16-bit units for utf-8 string 'u' of 'length' code points
+    result = length
+    i = 0
+    while i < len(u):
+        if rutf8.codepoint_at_pos(u, i) > 0xFFFF:
+            result += 1
+        i = rutf8.next_codepoint_pos(u, i)
     return result
 
-def unicode_size_as_char32(u):
-    result = len(u)
-    if SIZE_UNICODE == 2 and result > 1:
-        for i in range(result - 1):
-            if is_surrogate(u, i):
-                result -= 1
-    return result
-
-
 def _unicode_to_wchar(u, target_ptr, target_length, add_final_zero):
     # 'target_ptr' is a raw pointer to 'target_length' wchars;
     # we assume here that target_length == len(u).