[pypy-commit] pypy unicode-utf8: Fixes for _cffi_backend

Thu Nov 23 09:41:28 EST 2017

Author: Armin Rigo <arigo at tunes.org>
Branch: unicode-utf8
Changeset: r93137:a94b5860dbb3
Date: 2017-11-23 15:40 +0100
http://bitbucket.org/pypy/pypy/changeset/a94b5860dbb3/

Log:	Fixes for _cffi_backend

diff --git a/pypy/module/_cffi_backend/ctypearray.py b/pypy/module/_cffi_backend/ctypearray.py
--- a/pypy/module/_cffi_backend/ctypearray.py
+++ b/pypy/module/_cffi_backend/ctypearray.py
@@ -64,13 +64,10 @@
         elif space.isinstance_w(w_value, space.w_unicode):
             from pypy.module._cffi_backend import wchar_helper
             w_u = space.convert_arg_to_w_unicode(w_value)
-            if self.citem.size == 4:
+            if self.ctitem.size == 2:
+                length = wchar_helper.utf8_size_as_char16(w_u._utf8)
+            else:
                 length = w_u._len()
-            else:
-                if not w_u._has_surrogates():
-                    length = w_u._len()
-                else:
-                    length = wchar_helper.unicode_size_as_char16(w_u._utf8, w_u._len())
             return (w_value, length + 1)
         else:
             explicitlength = space.getindex_w(w_value, space.w_OverflowError)
diff --git a/pypy/module/_cffi_backend/ctypeprim.py b/pypy/module/_cffi_backend/ctypeprim.py
--- a/pypy/module/_cffi_backend/ctypeprim.py
+++ b/pypy/module/_cffi_backend/ctypeprim.py
@@ -40,16 +40,13 @@
         return ord(s[0])
 
     def cast_unicode(self, w_ob):
-        import pdb
-        pdb.set_trace()
         space = self.space
         w_u = space.convert_arg_to_w_unicode(w_ob)
         if w_u._len() != 1:
             raise oefmt(space.w_TypeError,
                         "cannot cast unicode string of length %d to ctype '%s'",
                         w_u._len(), self.name)
-        ordinal = rutf8.codepoint_at_pos(w_u._utf8, 0)
-        return intmask(ordinal)
+        return rutf8.codepoint_at_pos(w_u._utf8, 0)
 
     def cast(self, w_ob):
         from pypy.module._cffi_backend import ctypeptr
@@ -175,21 +172,19 @@
                 return self.space.newint(value)    # r_uint => 'long' object
 
     def convert_to_object(self, cdata):
-        if self.is_signed_wchar:
-            code = ord(rffi.cast(rffi.CWCHARP, cdata)[0])
-            return self.space.newutf8(
-                rutf8.unichr_as_utf8(code), 1,
-                rutf8.get_flag_from_code(code))
-        else:
-            value = misc.read_raw_ulong_data(cdata, self.size)   # r_uint
-            try:
-                u = wchar_helper.ordinal_to_unicode(value)
-            except wchar_helper.OutOfRange as e:
-                raise oefmt(self.space.w_ValueError,
-                            "char32_t out of range for "
-                            "conversion to unicode: %s", hex(e.ordinal))
-            return self.space.newutf8(rutf8.unichr_as_utf8(ord(u)), 1,
-                rutf8.get_flag_from_code(ord(u)))
+        value = misc.read_raw_ulong_data(cdata, self.size)   # r_uint
+        try:
+            utf8 = rutf8.unichr_as_utf8(value, allow_surrogates=True)
+        except ValueError:
+            if self.is_signed_wchar:
+                s = hex(intmask(value))
+            else:
+                s = hex(value)
+            raise oefmt(self.space.w_ValueError,
+                        "%s out of range for conversion to unicode: %s",
+                        self.name, s)
+        flag = rutf8.get_flag_from_code(intmask(value))
+        return self.space.newutf8(utf8, 1, flag)
 
     def string(self, cdataobj, maxlen):
         with cdataobj as ptr:
@@ -200,7 +195,13 @@
         # returns a r_uint.  If self.size == 2, it is smaller than 0x10000
         space = self.space
         if space.isinstance_w(w_ob, space.w_unicode):
-            return rutf8.codepoint_at_pos(space.utf8_w(w_ob), 0)
+            w_u = space.convert_arg_to_w_unicode(w_ob)
+            if w_u._len() != 1:
+                raise self._convert_error("single character", w_ob)
+            ordinal = rutf8.codepoint_at_pos(w_u._utf8, 0)
+            if self.size == 2 and ordinal > 0xFFFF:
+                raise self._convert_error("single character <= 0xFFFF", w_ob)
+            return r_uint(ordinal)
         elif (isinstance(w_ob, cdataobj.W_CData) and
                isinstance(w_ob.ctype, W_CTypePrimitiveUniChar) and
                w_ob.ctype.size == self.size):
@@ -214,15 +215,15 @@
 
     def unpack_ptr(self, w_ctypeptr, ptr, length):
         if self.size == 2:
-            u = wchar_helper.unicode_from_char16(ptr, length)
+            utf8, lgt, flag = wchar_helper.utf8_from_char16(ptr, length)
         else:
             try:
-                u = wchar_helper.unicode_from_char32(ptr, length)
+                utf8, lgt, flag = wchar_helper.utf8_from_char32(ptr, length)
             except wchar_helper.OutOfRange as e:
                 raise oefmt(self.space.w_ValueError,
-                            "char32_t out of range for "
-                            "conversion to unicode: %s", hex(e.ordinal))
-        return self.space.newunicode(u)
+                            "%s out of range for conversion to unicode: %s",
+                            self.name, hex(e.ordinal))
+        return self.space.newutf8(utf8, lgt, flag)
 
 
 class W_CTypePrimitiveSigned(W_CTypePrimitive):
diff --git a/pypy/module/_cffi_backend/ctypeptr.py b/pypy/module/_cffi_backend/ctypeptr.py
--- a/pypy/module/_cffi_backend/ctypeptr.py
+++ b/pypy/module/_cffi_backend/ctypeptr.py
@@ -92,28 +92,20 @@
             if not space.isinstance_w(w_ob, space.w_unicode):
                 raise self._convert_error("unicode or list or tuple", w_ob)
             w_u = space.convert_arg_to_w_unicode(w_ob)
-            if self.size == 4:
+            s = w_u._utf8
+            if self.ctitem.size == 2:
+                n = wchar_helper.utf8_size_as_char16(s)
+            else:
                 n = w_u._len()
-            else:
-                if not w_u._has_surrogates():
-                    n = w_u._len()
-                else:
-                    n = wchar_helper.unicode_size_as_char16(w_u._utf8,
-                                                            w_u._len())
             if self.length >= 0 and n > self.length:
                 raise oefmt(space.w_IndexError,
                             "initializer unicode string is too long for '%s' "
                             "(got %d characters)", self.name, n)
             add_final_zero = (n != self.length)
             if self.ctitem.size == 2:
-                try:
-                    wchar_helper.unicode_to_char16(s, cdata, n, add_final_zero)
-                except wchar_helper.OutOfRange as e:
-                    raise oefmt(self.space.w_ValueError,
-                                "unicode character ouf of range for "
-                                "conversion to char16_t: %s", hex(e.ordinal))
+                wchar_helper.utf8_to_char16(s, cdata, n, add_final_zero)
             else:
-                wchar_helper.unicode_to_char32(s, cdata, n, add_final_zero)
+                wchar_helper.utf8_to_char32(s, cdata, n, add_final_zero)
         else:
             raise self._convert_error("list or tuple", w_ob)
 
@@ -334,8 +326,7 @@
             from pypy.module._cffi_backend import wchar_helper
             w_u = space.convert_arg_to_w_unicode(w_init)
             if self.ctitem.size == 2:
-                length = wchar_helper.unicode_size_as_char16(w_u._utf8,
-                                                             w_u._len())
+                length = wchar_helper.utf8_size_as_char16(w_u._utf8)
             else:
                 length = w_u._len()
             length += 1
diff --git a/pypy/module/_cffi_backend/test/test_wchar_helper.py b/pypy/module/_cffi_backend/test/test_wchar_helper.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_cffi_backend/test/test_wchar_helper.py
@@ -0,0 +1,10 @@
+from hypothesis import given, strategies
+from pypy.module._cffi_backend.wchar_helper import utf8_size_as_char16
+
+
+
+@given(strategies.text())
+def test_utf8_size_as_char16(u):
+    assert type(u) is unicode
+    length = utf8_size_as_char16(''.join(uc.encode('utf8') for uc in u))
+    assert length == sum((1 if uc <= u'\uFFFF' else 2) for uc in u)
diff --git a/pypy/module/_cffi_backend/wchar_helper.py b/pypy/module/_cffi_backend/wchar_helper.py
--- a/pypy/module/_cffi_backend/wchar_helper.py
+++ b/pypy/module/_cffi_backend/wchar_helper.py
@@ -6,41 +6,6 @@
 from rpython.rtyper.lltypesystem import lltype, rffi
 from rpython.rtyper.lltypesystem.rstr import copy_unicode_to_raw
 
-SIZE_UNICODE = 4
-
-
-if SIZE_UNICODE == 4:
-    def ordinal_to_unicode(ordinal):    # 'ordinal' is a r_uint
-        return unichr(intmask(ordinal))
-else:
-    def ordinal_to_unicode(ordinal):    # 'ordinal' is a r_uint
-        if ordinal <= 0xffff:
-            return unichr(intmask(ordinal))
-        elif ordinal <= 0x10ffff:
-            ordinal = intmask(ordinal - 0x10000)
-            return (unichr(0xD800 | (ordinal >> 10)) +
-                    unichr(0xDC00 | (ordinal & 0x3FF)))
-        else:
-            raise OutOfRange(ordinal)
-
-def is_surrogate(u, index):
-    return (unichr(0xD800) <= u[index + 0] <= unichr(0xDBFF) and
-            unichr(0xDC00) <= u[index + 1] <= unichr(0xDFFF))
-
-def as_surrogate(u, index):
-    ordinal = (ord(u[index + 0]) - 0xD800) << 10
-    ordinal |= (ord(u[index + 1]) - 0xDC00)
-    return r_uint(ordinal + 0x10000)
-
-def unicode_to_ordinal(u):
-    if len(u) == 1:
-        u = ord(u[0])
-        return r_uint(u)
-    elif SIZE_UNICODE == 2:
-        if len(u) == 2 and is_surrogate(u, 0):
-            return r_uint(as_surrogate(u, 0))
-    raise ValueError
-
 
 class OutOfRange(Exception):
     ordinal = 0
@@ -49,59 +14,41 @@
         ordinal = intmask(rffi.cast(rffi.INT, ordinal))
         self.ordinal = ordinal
 
-def _unicode_from_wchar(ptr, length):
-    return rffi.wcharpsize2utf8(rffi.cast(rffi.CWCHARP, ptr), length)
+def utf8_from_char32(ptr, length):
+    # 'ptr' is a pointer to 'length' 32-bit integers
+    ptr = rffi.cast(rffi.UINTP, ptr)
+    u = StringBuilder(length)
+    j = 0
+    flag = rutf8.FLAG_ASCII
+    while j < length:
+        ch = intmask(ptr[j])
+        j += 1
+        flag = rutf8.combine_flags(flag, rutf8.get_flag_from_code(ch))
+        try:
+            rutf8.unichr_as_utf8_append(u, ch, allow_surrogates=True)
+        except ValueError:
+            raise OutOfRange(ch)
+    return u.build(), length, flag
 
-
-if SIZE_UNICODE == 2:
-    def unicode_from_char32(ptr, length):
-        # 'ptr' is a pointer to 'length' 32-bit integers
-        ptr = rffi.cast(rffi.UINTP, ptr)
-        alloc = length
-        for i in range(length):
-            if rffi.cast(lltype.Unsigned, ptr[i]) > 0xFFFF:
-                alloc += 1
-
-        u = [u'\x00'] * alloc
-        j = 0
-        for i in range(length):
-            ordinal = rffi.cast(lltype.Unsigned, ptr[i])
-            if ordinal > 0xFFFF:
-                if ordinal > 0x10FFFF:
-                    raise OutOfRange(ordinal)
-                ordinal = intmask(ordinal - 0x10000)
-                u[j] = unichr(0xD800 | (ordinal >> 10))
+def utf8_from_char16(ptr, length):
+    # 'ptr' is a pointer to 'length' 16-bit integers
+    ptr = rffi.cast(rffi.USHORTP, ptr)
+    u = StringBuilder(length)
+    j = 0
+    result_length = length
+    flag = rutf8.FLAG_ASCII
+    while j < length:
+        ch = intmask(ptr[j])
+        j += 1
+        if 0xD800 <= ch <= 0xDBFF and j < length:
+            ch2 = intmask(ptr[j])
+            if 0xDC00 <= ch2 <= 0xDFFF:
+                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000
                 j += 1
-                u[j] = unichr(0xDC00 | (ordinal & 0x3FF))
-                j += 1
-            else:
-                u[j] = unichr(intmask(ordinal))
-                j += 1
-        assert j == len(u)
-        return u''.join(u)
-
-    unicode_from_char16 = _unicode_from_wchar
-
-else:
-    unicode_from_char32 = _unicode_from_wchar
-
-    def unicode_from_char16(ptr, length):
-        # 'ptr' is a pointer to 'length' 16-bit integers
-        ptr = rffi.cast(rffi.USHORTP, ptr)
-        u = StringBuilder(length)
-        i = 0
-        j = 0
-        while j < length:
-            ch = intmask(ptr[j])
-            j += 1
-            if 0xD800 <= ch <= 0xDBFF and j < length:
-                ch2 = intmask(ptr[j])
-                if 0xDC00 <= ch2 <= 0xDFFF:
-                    ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000
-                    j += 1
-            rutf8.unichr_as_utf8_append(u, ch)
-            i += 1
-        return u.build()
+                result_length -= 1
+        flag = rutf8.combine_flags(flag, rutf8.get_flag_from_code(ch))
+        rutf8.unichr_as_utf8_append(u, ch, allow_surrogates=True)
+    return u.build(), result_length, flag
 
 
 @specialize.ll()
@@ -122,65 +69,44 @@
     return _measure_length(rffi.cast(rffi.UINTP, ptr), maxlen)
 
 
-def unicode_size_as_char16(u, len):
-    result = len
-    i = 0
-    while i < len(u):
-        code = rutf8.codepoint_at_pos(u, i)
-        if code > 0xFFFF:
-            result += 1
-        i = rutf8.next_codepoint_pos(u, i)
+def utf8_size_as_char16(u):
+    # Counts one per unichar in 'u', or two if they are greater than 0xffff.
+    TABLE = "\x01\x01\x01\x01\x01\x01\x01\x01\x00\x00\x00\x00\x01\x01\x01\x02"
+    result = 0
+    for c in u:
+        result += ord(TABLE[ord(c) >> 4])
     return result
 
-def _unicode_to_wchar(u, target_ptr, target_length, add_final_zero):
-    # 'target_ptr' is a raw pointer to 'target_length' wchars;
-    # we assume here that target_length == len(u).
-    unichardata = rffi.cast(rffi.CWCHARP, target_ptr)
-    copy_unicode_to_raw(llunicode(u), unichardata, 0, target_length)
+def utf8_to_char32(utf8, target_ptr, target_length, add_final_zero):
+    # 'target_ptr' is a raw pointer to 'target_length' 32-bit integers;
+    # we assume (and check) that target_length == number of unichars in utf8.
+    unichardata = rffi.cast(rffi.UINTP, target_ptr)
+    i = 0
+    for j in range(target_length):
+        code = rutf8.codepoint_at_pos(utf8, i)
+        unichardata[j] = rffi.cast(rffi.UINT, code)
+        i = rutf8.next_codepoint_pos(utf8, i)
+    assert i == len(utf8)
     if add_final_zero:
-        unichardata[target_length] = u'\x00'
+        unichardata[target_length] = rffi.cast(rffi.UINT, 0)
 
-
-if SIZE_UNICODE == 2:
-    def unicode_to_char32(u, target_ptr, target_length, add_final_zero):
-        # 'target_ptr' is a raw pointer to 'target_length' 32-bit integers;
-        # we assume here that target_length == unicode_size_as_char32(u).
-        ptr = rffi.cast(rffi.UINTP, target_ptr)
-        src_index = 0
-        last_surrogate_pos = len(u) - 2
-        for i in range(target_length):
-            if src_index <= last_surrogate_pos and is_surrogate(u, src_index):
-                ordinal = as_surrogate(u, src_index)
-                src_index += 2
-            else:
-                ordinal = r_uint(ord(u[src_index]))
-                src_index += 1
-            ptr[i] = rffi.cast(rffi.UINT, ordinal)
-        if add_final_zero:
-            ptr[target_length] = rffi.cast(rffi.UINT, 0)
-
-    unicode_to_char16 = _unicode_to_wchar
-
-else:
-    unicode_to_char32 = _unicode_to_wchar
-
-    def unicode_to_char16(u, target_ptr, target_length, add_final_zero):
-        # 'target_ptr' is a raw pointer to 'target_length' 16-bit integers;
-        # we assume here that target_length == unicode_size_as_char16(u).
-        ptr = rffi.cast(rffi.USHORTP, target_ptr)
-        for uc in u:
-            ordinal = ord(uc)
-            if ordinal > 0xFFFF:
-                if ordinal > 0x10FFFF:
-                    raise OutOfRange(ordinal)
-                ordinal -= 0x10000
-                ptr[0] = rffi.cast(rffi.USHORT, 0xD800 | (ordinal >> 10))
-                ptr[1] = rffi.cast(rffi.USHORT, 0xDC00 | (ordinal & 0x3FF))
-                ptr = rffi.ptradd(ptr, 2)
-            else:
-                ptr[0] = rffi.cast(rffi.USHORT, ordinal)
-                ptr = rffi.ptradd(ptr, 1)
-        assert ptr == (
-            rffi.ptradd(rffi.cast(rffi.USHORTP, target_ptr), target_length))
-        if add_final_zero:
-            ptr[0] = rffi.cast(rffi.USHORT, 0)
+def utf8_to_char16(utf8, target_ptr, target_length, add_final_zero):
+    # 'target_ptr' is a raw pointer to 'target_length' 16-bit integers;
+    # we assume (and check) that target_length == utf8_size_as_char16(utf8).
+    ptr = rffi.cast(rffi.USHORTP, target_ptr)
+    i = 0
+    while i < len(utf8):
+        ordinal = rutf8.codepoint_at_pos(utf8, i)
+        if ordinal > 0xFFFF:
+            ordinal -= 0x10000
+            ptr[0] = rffi.cast(rffi.USHORT, 0xD800 | (ordinal >> 10))
+            ptr[1] = rffi.cast(rffi.USHORT, 0xDC00 | (ordinal & 0x3FF))
+            ptr = rffi.ptradd(ptr, 2)
+        else:
+            ptr[0] = rffi.cast(rffi.USHORT, ordinal)
+            ptr = rffi.ptradd(ptr, 1)
+        i = rutf8.next_codepoint_pos(utf8, i)
+    assert ptr == (
+        rffi.ptradd(rffi.cast(rffi.USHORTP, target_ptr), target_length))
+    if add_final_zero:
+        ptr[0] = rffi.cast(rffi.USHORT, 0)
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -453,6 +453,7 @@
     )))))
 
 def get_flag_from_code(oc):
+    assert isinstance(oc, int)
     if oc <= 0x7F:
         return FLAG_ASCII
     if 0xD800 <= oc <= 0xDFFF: