[pypy-commit] pypy cffi-char16-char32: in-progress

Sun Jun 4 03:59:01 EDT 2017

Author: Armin Rigo <arigo at tunes.org>
Branch: cffi-char16-char32
Changeset: r91504:8bc39f008ba8
Date: 2017-06-04 09:58 +0200
http://bitbucket.org/pypy/pypy/changeset/8bc39f008ba8/

Log:	in-progress

diff --git a/pypy/module/_cffi_backend/ctypearray.py b/pypy/module/_cffi_backend/ctypearray.py
--- a/pypy/module/_cffi_backend/ctypearray.py
+++ b/pypy/module/_cffi_backend/ctypearray.py
@@ -36,8 +36,7 @@
         datasize = self.size
         #
         if datasize < 0:
-            from pypy.module._cffi_backend import misc
-            w_init, length = misc.get_new_array_length(space, w_init)
+            w_init, length = self.get_new_array_length(w_init)
             try:
                 datasize = ovfcheck(length * self.ctitem.size)
             except OverflowError:
@@ -53,6 +52,29 @@
                 self.convert_from_object(ptr, w_init)
         return cdata
 
+    def get_new_array_length(self, w_value):  # -> (w_init or w_None, length)
+        space = self.space
+        if (space.isinstance_w(w_value, space.w_list) or
+            space.isinstance_w(w_value, space.w_tuple)):
+            return (w_value, space.int_w(space.len(w_value)))  # one per item
+        elif space.isinstance_w(w_value, space.w_bytes):
+            # from a byte string: its length plus one for the null terminator
+            s = space.bytes_w(w_value)
+            return (w_value, len(s) + 1)
+        elif space.isinstance_w(w_value, space.w_unicode):
+            from pypy.module._cffi_backend import wchar_helper
+            u = space.unicode_w(w_value)
+            if self.ctitem.size == 2:  # 2-byte items: count 16-bit units
+                length = wchar_helper.unicode_size_as_char16(u)
+            else:  # every other item size is measured as 32-bit units
+                length = wchar_helper.unicode_size_as_char32(u)
+            return (w_value, length + 1)  # + 1 as in the bytes case above
+        else:  # anything else is taken as the explicit length (an index)
+            explicitlength = space.getindex_w(w_value, space.w_OverflowError)
+            if explicitlength < 0:
+                raise oefmt(space.w_ValueError, "negative array length")
+            return (space.w_None, explicitlength)  # no initializer to copy
+
     def _check_subscript_index(self, w_cdata, i):
         space = self.space
         if i < 0:
diff --git a/pypy/module/_cffi_backend/ctypeprim.py b/pypy/module/_cffi_backend/ctypeprim.py
--- a/pypy/module/_cffi_backend/ctypeprim.py
+++ b/pypy/module/_cffi_backend/ctypeprim.py
@@ -42,12 +42,13 @@
     def cast_unicode(self, w_ob):
         space = self.space
         s = space.unicode_w(w_ob)
-        XXXXXXXXXXXXXX
-        if len(s) != 1:
+        try:
+            ordinal = wchar_helper.unicode_to_ordinal(s)
+        except ValueError:
             raise oefmt(space.w_TypeError,
                         "cannot cast unicode string of length %d to ctype '%s'",
                         len(s), self.name)
-        return ord(s[0])
+        return intmask(ordinal)
 
     def cast(self, w_ob):
         from pypy.module._cffi_backend import ctypeptr
diff --git a/pypy/module/_cffi_backend/ctypeptr.py b/pypy/module/_cffi_backend/ctypeptr.py
--- a/pypy/module/_cffi_backend/ctypeptr.py
+++ b/pypy/module/_cffi_backend/ctypeptr.py
@@ -4,9 +4,9 @@
 
 from rpython.rlib import rposix
 from rpython.rlib.rarithmetic import ovfcheck
-from rpython.rtyper.annlowlevel import llstr, llunicode
+from rpython.rtyper.annlowlevel import llstr
 from rpython.rtyper.lltypesystem import lltype, rffi
-from rpython.rtyper.lltypesystem.rstr import copy_string_to_raw, copy_unicode_to_raw
+from rpython.rtyper.lltypesystem.rstr import copy_string_to_raw
 
 from pypy.interpreter.error import OperationError, oefmt, wrap_oserror
 from pypy.module._cffi_backend import cdataobj, misc, ctypeprim, ctypevoid
@@ -88,31 +88,23 @@
             if n != self.length:
                 cdata[n] = '\x00'
         elif isinstance(self.ctitem, ctypeprim.W_CTypePrimitiveUniChar):
+            from pypy.module._cffi_backend import wchar_helper
             if not space.isinstance_w(w_ob, space.w_unicode):
                 raise self._convert_error("unicode or list or tuple", w_ob)
             s = space.unicode_w(w_ob)
-            XXXXXXXXXXXXXXX
-            n = len(s)
+            if self.ctitem.size == 2:
+                n = wchar_helper.unicode_size_as_char16(s)
+            else:
+                n = wchar_helper.unicode_size_as_char32(s)
             if self.length >= 0 and n > self.length:
                 raise oefmt(space.w_IndexError,
                             "initializer unicode string is too long for '%s' "
                             "(got %d characters)", self.name, n)
-
-
-
-
+            add_final_zero = (n != self.length)
             if self.ctitem.size == 2:
-                length = wchar_helper.measure_length_16(ptr, length)
+                wchar_helper.unicode_to_char16(s, cdata, n, add_final_zero)
             else:
-                length = wchar_helper.measure_length_32(ptr, length)
-            XXXX
-
-
-
-            unichardata = rffi.cast(rffi.CWCHARP, cdata)
-            copy_unicode_to_raw(llunicode(s), unichardata, 0, n)
-            if n != self.length:
-                unichardata[n] = u'\x00'
+                wchar_helper.unicode_to_char32(s, cdata, n, add_final_zero)
         else:
             raise self._convert_error("list or tuple", w_ob)
 
@@ -315,10 +307,18 @@
         if (space.isinstance_w(w_init, space.w_list) or
             space.isinstance_w(w_init, space.w_tuple)):
             length = space.int_w(space.len(w_init))
-        elif space.isinstance_w(w_init, space.w_basestring):
+        elif space.isinstance_w(w_init, space.w_bytes):
             # from a string, we add the null terminator
-            XXXXXXXXXXXXXXX
-            length = space.int_w(space.len(w_init)) + 1
+            s = space.bytes_w(w_init)
+            length = len(s) + 1
+        elif space.isinstance_w(w_init, space.w_unicode):
+            from pypy.module._cffi_backend import wchar_helper
+            u = space.unicode_w(w_init)
+            if self.ctitem.size == 2:
+                length = wchar_helper.unicode_size_as_char16(u)
+            else:
+                length = wchar_helper.unicode_size_as_char32(u)
+            length += 1
         elif self.is_file:
             result = self.prepare_file(w_init)
             if result:
diff --git a/pypy/module/_cffi_backend/ctypestruct.py b/pypy/module/_cffi_backend/ctypestruct.py
--- a/pypy/module/_cffi_backend/ctypestruct.py
+++ b/pypy/module/_cffi_backend/ctypestruct.py
@@ -244,7 +244,7 @@
         ct = self.ctype
         if isinstance(ct, ctypearray.W_CTypeArray) and ct.length < 0:
             space = ct.space
-            w_ob, varsizelength = misc.get_new_array_length(space, w_ob)
+            w_ob, varsizelength = ct.get_new_array_length(w_ob)
             if optvarsize != -1:
                 # in this mode, the only purpose of this function is to compute
                 # the real size of the structure from a var-sized C99 array
diff --git a/pypy/module/_cffi_backend/misc.py b/pypy/module/_cffi_backend/misc.py
--- a/pypy/module/_cffi_backend/misc.py
+++ b/pypy/module/_cffi_backend/misc.py
@@ -290,22 +290,6 @@
 
 # ____________________________________________________________
 
-def get_new_array_length(space, w_value):
-    if (space.isinstance_w(w_value, space.w_list) or
-        space.isinstance_w(w_value, space.w_tuple)):
-        return (w_value, space.int_w(space.len(w_value)))
-    elif space.isinstance_w(w_value, space.w_basestring):
-        # from a string, we add the null terminator
-        XXXXXXXXXX
-        return (w_value, space.int_w(space.len(w_value)) + 1)
-    else:
-        explicitlength = space.getindex_w(w_value, space.w_OverflowError)
-        if explicitlength < 0:
-            raise oefmt(space.w_ValueError, "negative array length")
-        return (space.w_None, explicitlength)
-
-# ____________________________________________________________
-
 @specialize.arg(0)
 def _raw_memcopy_tp(TPP, source, dest):
     # in its own function: LONGLONG may make the whole function jit-opaque
diff --git a/pypy/module/_cffi_backend/wchar_helper.py b/pypy/module/_cffi_backend/wchar_helper.py
--- a/pypy/module/_cffi_backend/wchar_helper.py
+++ b/pypy/module/_cffi_backend/wchar_helper.py
@@ -1,6 +1,8 @@
 from rpython.rlib.objectmodel import specialize
 from rpython.rlib.rarithmetic import r_uint, r_ulonglong, intmask
+from rpython.rtyper.annlowlevel import llunicode
 from rpython.rtyper.lltypesystem import lltype, rffi
+from rpython.rtyper.lltypesystem.rstr import copy_unicode_to_raw
 
 SIZE_UNICODE = rffi.sizeof(lltype.UniChar)
 
@@ -18,8 +20,7 @@
                     unichr(0xDC00 | (ordinal & 0x3FF)))
 
 def is_surrogate(u, index):
-    return (index + 1 < len(u) and
-            unichr(0xD800) <= u[index + 0] <= unichr(0xDBFF) and
+    return (unichr(0xD800) <= u[index + 0] <= unichr(0xDBFF) and
             unichr(0xDC00) <= u[index + 1] <= unichr(0xDFFF))
 
 def as_surrogate(u, index):
@@ -42,9 +43,13 @@
         ordinal = intmask(rffi.cast(rffi.INT, ordinal))
         self.ordinal = ordinal
 
+def _unicode_from_wchar(ptr, length):  # read 'length' wchars from raw 'ptr'
+    return rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, ptr), length)
+
 
 if SIZE_UNICODE == 2:
     def unicode_from_char32(ptr, length):
+        # 'ptr' is a pointer to 'length' 32-bit integers
         ptr = rffi.cast(rffi.UINTP, ptr)
         alloc = length
         for i in range(length):
@@ -69,14 +74,13 @@
         assert j == len(u)
         return u''.join(u)
 
-    def unicode_from_char16(ptr, length):
-        return rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, ptr), length)
+    unicode_from_char16 = _unicode_from_wchar
 
 else:
-    def unicode_from_char32(ptr, length):
-        return rffi.wcharpsize2unicode(rffi.cast(rffi.CWCHARP, ptr), length)
+    unicode_from_char32 = _unicode_from_wchar
 
     def unicode_from_char16(ptr, length):
+        # 'ptr' is a pointer to 'length' 16-bit integers
         ptr = rffi.cast(rffi.USHORTP, ptr)
         u = [u'\x00'] * length
         i = 0
@@ -113,5 +117,71 @@
     return _measure_length(rffi.cast(rffi.UINTP, ptr), maxlen)
 
 
-def unicode_to_char16(u, ptr):
-    XXX
+def unicode_size_as_char16(u):  # number of 16-bit units needed to store 'u'
+    result = len(u)
+    if SIZE_UNICODE == 4:  # only 4-byte unichars can hold ordinals > 0xFFFF
+        for i in range(result):  # the range is fixed before 'result' grows
+            if ord(u[i]) > 0xFFFF:
+                result += 1  # such an item is written as two 16-bit units
+    return result
+
+def unicode_size_as_char32(u):  # number of 32-bit units needed to store 'u'
+    result = len(u)
+    if SIZE_UNICODE == 2 and result > 1:  # a pair needs at least two items
+        for i in range(result - 1):  # stop early: is_surrogate reads u[i + 1]
+            if is_surrogate(u, i):
+                result -= 1  # the two items of a pair count as one unit
+    return result
+
+
+def _unicode_to_wchar(u, target_ptr, target_length, add_final_zero):
+    # Copy 'u' unchanged to 'target_ptr', a raw pointer to 'target_length'
+    # wchars; we assume here that target_length == len(u).
+    unichardata = rffi.cast(rffi.CWCHARP, target_ptr)
+    copy_unicode_to_raw(llunicode(u), unichardata, 0, target_length)
+    if add_final_zero:  # the terminator goes in one extra slot after the data
+        unichardata[target_length] = u'\x00'
+
+
+if SIZE_UNICODE == 2:
+    def unicode_to_char32(u, target_ptr, target_length, add_final_zero):
+        # Store 'u' at raw 'target_ptr' as 'target_length' 32-bit integers;
+        # we assume here that target_length == unicode_size_as_char32(u).
+        ptr = rffi.cast(rffi.UINTP, target_ptr)
+        src_index = 0
+        for i in range(target_length):
+            if src_index + 1 < len(u) and is_surrogate(u, src_index):
+                ordinal = as_surrogate(u, src_index)  # presumably the merged pair
+                src_index += 2  # a pair may be the very last two items of 'u'
+            else:
+                ordinal = r_uint(ord(u[src_index]))
+                src_index += 1
+            ptr[i] = rffi.cast(rffi.UINT, ordinal)
+        if add_final_zero:  # the terminator goes in one extra slot
+            ptr[target_length] = rffi.cast(rffi.UINT, 0)
+
+    unicode_to_char16 = _unicode_to_wchar
+
+else:
+    unicode_to_char32 = _unicode_to_wchar
+
+    def unicode_to_char16(u, target_ptr, target_length, add_final_zero):
+        # Store 'u' at raw 'target_ptr' as 'target_length' 16-bit integers;
+        # we assume here that target_length == unicode_size_as_char16(u).
+        ptr = rffi.cast(rffi.USHORTP, target_ptr)
+        for uc in u:
+            ordinal = ord(uc)
+            if ordinal > 0xFFFF:  # does not fit one unit: write a pair
+                # NB. like CPython, ignore the problem of unicode string
+                # objects containing characters greater than sys.maxunicode
+                ordinal -= 0x10000
+                ptr[0] = rffi.cast(rffi.USHORT, 0xD800 | (ordinal >> 10))
+                ptr[1] = rffi.cast(rffi.USHORT, 0xDC00 | (ordinal & 0x3FF))
+                ptr = rffi.ptradd(ptr, 2)  # advance past both units
+            else:
+                ptr[0] = rffi.cast(rffi.USHORT, ordinal)
+                ptr = rffi.ptradd(ptr, 1)
+        assert ptr == (  # exactly 'target_length' units were written
+            rffi.ptradd(rffi.cast(rffi.USHORTP, target_ptr), target_length))
+        if add_final_zero:  # 'ptr' is now just past the data
+            ptr[0] = rffi.cast(rffi.USHORT, 0)