[pypy-commit] cffi char16_char32_t: A branch for issue #315

arigo pypy.commits at gmail.com
Wed May 31 06:27:45 EDT 2017


Author: Armin Rigo <arigo at tunes.org>
Branch: char16_char32_t
Changeset: r2951:bb0d933723be
Date: 2017-05-31 12:27 +0200
http://bitbucket.org/cffi/cffi/changeset/bb0d933723be/

Log:	A branch for issue #315

diff --git a/c/_cffi_backend.c b/c/_cffi_backend.c
--- a/c/_cffi_backend.c
+++ b/c/_cffi_backend.c
@@ -118,7 +118,7 @@
 /* base type flag: exactly one of the following: */
 #define CT_PRIMITIVE_SIGNED   0x001   /* signed integer */
 #define CT_PRIMITIVE_UNSIGNED 0x002   /* unsigned integer */
-#define CT_PRIMITIVE_CHAR     0x004   /* char, wchar_t */
+#define CT_PRIMITIVE_CHAR     0x004   /* char, wchar_t, charN_t */
 #define CT_PRIMITIVE_FLOAT    0x008   /* float, double, long double */
 #define CT_POINTER            0x010   /* pointer, excluding ptr-to-func */
 #define CT_ARRAY              0x020   /* array */
@@ -285,9 +285,7 @@
 # include "file_emulator.h"
 #endif
 
-#ifdef HAVE_WCHAR_H
-# include "wchar_helper.h"
-#endif
+#include "wchar_helper.h"
 
 typedef struct _cffi_allocator_s {
     PyObject *ca_alloc, *ca_free;
@@ -1049,12 +1047,14 @@
     }
     else if (ct->ct_flags & CT_PRIMITIVE_CHAR) {
         /*READ(data, ct->ct_size)*/
-        if (ct->ct_size == sizeof(char))
+        switch (ct->ct_size) {
+        case sizeof(char):
             return PyBytes_FromStringAndSize(data, 1);
-#ifdef HAVE_WCHAR_H
-        else
-            return _my_PyUnicode_FromWideChar((wchar_t *)data, 1);
-#endif
+        case 2:
+            return _my_PyUnicode_FromChar16((cffi_char16_t *)data, 1);
+        case 4:
+            return _my_PyUnicode_FromChar32((cffi_char32_t *)data, 1);
+        }
     }
     else if (ct->ct_flags & CT_PRIMITIVE_COMPLEX) {
         Py_complex value = read_raw_complex_data(data, ct->ct_size);
@@ -1133,27 +1133,53 @@
     return -1;
 }
 
-#ifdef HAVE_WCHAR_H
-static wchar_t _convert_to_wchar_t(PyObject *init)
-{
+static cffi_char16_t _convert_to_char16_t(PyObject *init)
+{
+    char err_got[80];
+    err_got[0] = 0;
+
     if (PyUnicode_Check(init)) {
-        wchar_t ordinal;
-        if (_my_PyUnicode_AsSingleWideChar(init, &ordinal) == 0)
+        cffi_char16_t ordinal;
+        if (_my_PyUnicode_AsSingleChar16(init, &ordinal, err_got) == 0)
             return ordinal;
     }
     if (CData_Check(init) &&
            (((CDataObject *)init)->c_type->ct_flags & CT_PRIMITIVE_CHAR) &&
-           (((CDataObject *)init)->c_type->ct_size == sizeof(wchar_t))) {
+           (((CDataObject *)init)->c_type->ct_size == 2)) {
         char *data = ((CDataObject *)init)->c_data;
-        /*READ(data, sizeof(wchar_t))*/
-        return *(wchar_t *)data;
+        /*READ(data, 2)*/
+        return *(cffi_char16_t *)data;
     }
     PyErr_Format(PyExc_TypeError,
-                 "initializer for ctype 'wchar_t' must be a unicode string "
-                 "of length 1, not %.200s", Py_TYPE(init)->tp_name);
-    return (wchar_t)-1;
-}
-#endif
+                 "initializer for ctype 'char16_t' must be a unicode string "
+                 "of length 1, not %.200s",
+                 err_got[0] == 0 ? Py_TYPE(init)->tp_name : err_got);
+    return (cffi_char16_t)-1;
+}
+
+static cffi_char32_t _convert_to_char32_t(PyObject *init)
+{   /* Convert 'init' to one char32_t; on failure set TypeError, return -1 */
+    char err_got[80];   /* failure description written by the helper, if any */
+    err_got[0] = 0;
+    /* Accepted: a unicode object the helper can reduce to one value... */
+    if (PyUnicode_Check(init)) {
+        cffi_char32_t ordinal;
+        if (_my_PyUnicode_AsSingleChar32(init, &ordinal, err_got) == 0)
+            return ordinal;
+    }
+    if (CData_Check(init) &&   /* ...or a cdata of a 4-byte char type */
+           (((CDataObject *)init)->c_type->ct_flags & CT_PRIMITIVE_CHAR) &&
+           (((CDataObject *)init)->c_type->ct_size == 4)) {
+        char *data = ((CDataObject *)init)->c_data;
+        /*READ(data, 4)*/
+        return *(cffi_char32_t *)data;
+    }
+    PyErr_Format(PyExc_TypeError,
+                 "initializer for ctype 'char32_t' must be a unicode string "
+                 "of length 1, not %.200s",   /* helper's text, else type name */
+                 err_got[0] == 0 ? Py_TYPE(init)->tp_name : err_got);
+    return (cffi_char32_t)-1;   /* also a storable value: check PyErr_Occurred() */
+}
 
 static int _convert_error(PyObject *init, const char *ct_name,
                           const char *expected)
@@ -1191,7 +1217,7 @@
 convert_from_object_bitfield(char *data, CFieldObject *cf, PyObject *init);
 
 static Py_ssize_t
-get_new_array_length(PyObject **pvalue)
+get_new_array_length(CTypeDescrObject *ctitem, PyObject **pvalue)
 {
     PyObject *value = *pvalue;
 
@@ -1204,7 +1230,12 @@
     }
     else if (PyUnicode_Check(value)) {
         /* from a unicode, we add the null terminator */
-        return _my_PyUnicode_SizeAsWideChar(value) + 1;
+        int length;
+        if (ctitem->ct_size == 4)
+            length = _my_PyUnicode_SizeAsChar32(value);
+        else
+            length = PyUnicode_GET_SIZE(value);
+        return length + 1;
     }
     else {
         Py_ssize_t explicitlength;
@@ -1336,14 +1367,18 @@
             memcpy(data, srcdata, n);
             return 0;
         }
-#ifdef HAVE_WCHAR_H
         else {
             Py_ssize_t n;
             if (!PyUnicode_Check(init)) {
                 expected = "unicode or list or tuple";
                 goto cannot_convert;
             }
-            n = _my_PyUnicode_SizeAsWideChar(init);
+
+            if (ctitem->ct_size == 4)
+                n = _my_PyUnicode_SizeAsChar32(init);
+            else
+                n = _my_PyUnicode_SizeAsChar16(init);
+
             if (ct->ct_length >= 0 && n > ct->ct_length) {
                 PyErr_Format(PyExc_IndexError,
                              "initializer unicode is too long for '%s' "
@@ -1352,10 +1387,12 @@
             }
             if (n != ct->ct_length)
                 n++;
-            _my_PyUnicode_AsWideChar(init, (wchar_t *)data, n);
+            if (ctitem->ct_size == 4)
+                _my_PyUnicode_AsChar32(init, (cffi_char32_t *)data, n);
+            else
+                _my_PyUnicode_AsChar16(init, (cffi_char16_t *)data, n);
             return 0;
         }
-#endif
     }
     else {
         expected = "list or tuple";
@@ -1537,22 +1574,28 @@
         return 0;
     }
     if (ct->ct_flags & CT_PRIMITIVE_CHAR) {
-        if (ct->ct_size == sizeof(char)) {
+        switch (ct->ct_size) {
+        case sizeof(char): {
             int res = _convert_to_char(init);
             if (res < 0)
                 return -1;
             data[0] = res;
             return 0;
         }
-#ifdef HAVE_WCHAR_H
-        else {
-            wchar_t res = _convert_to_wchar_t(init);
-            if (res == (wchar_t)-1 && PyErr_Occurred())
+        case 2: {
+            cffi_char16_t res = _convert_to_char16_t(init);
+            if (res == (cffi_char16_t)-1 && PyErr_Occurred())
                 return -1;
-            *(wchar_t *)data = res;
+            *(cffi_char16_t *)data = res;
             return 0;
         }
-#endif
+        case 4: {
+            int res = _convert_to_char32_t(init);
+            if (res == -1 && PyErr_Occurred())
+                return -1;
+            *(cffi_char32_t *)data = res;
+            return 0;
+        }
     }
     if (ct->ct_flags & (CT_STRUCT|CT_UNION)) {
 
@@ -2033,12 +2076,16 @@
     }
     else if (cd->c_type->ct_flags & CT_PRIMITIVE_CHAR) {
         /*READ(cd->c_data, cd->c_type->ct_size)*/
-        if (cd->c_type->ct_size == sizeof(char))
+        switch (cd->c_type->ct_size) {
+        case sizeof(char):
             return PyInt_FromLong((unsigned char)cd->c_data[0]);
-#ifdef HAVE_WCHAR_H
-        else
-            return PyInt_FromLong((long)*(wchar_t *)cd->c_data);
-#endif
+        case 2:
+            return PyInt_FromLong((long)*(cffi_char16_t *)cd->c_data);
+        case 4:
+            /* NB. cast via int32_t instead of cffi_char32_t, so that
+               we expose a signed result to the user */
+            return PyInt_FromLong((long)*(int32_t *)cd->c_data);
+        }
     }
     else if (cd->c_type->ct_flags & CT_PRIMITIVE_FLOAT) {
         PyObject *o = cdata_float(cd);
@@ -3626,18 +3673,17 @@
         value = (unsigned char)PyString_AS_STRING(ob)[0];
     }
 #endif
-#ifdef HAVE_WCHAR_H
     else if (PyUnicode_Check(ob)) {
-        wchar_t ordinal;
-        if (_my_PyUnicode_AsSingleWideChar(ob, &ordinal) < 0) {
+        char err_buf[80];
+        cffi_char32_t ordinal;
+        if (_my_PyUnicode_AsSingleChar32(ob, &ordinal, err_buf) < 0) {
             PyErr_Format(PyExc_TypeError,
-                      "cannot cast unicode string of length %zd to ctype '%s'",
-                         PyUnicode_GET_SIZE(ob), ct->ct_name);
+                         "cannot cast %s to ctype '%s'", err_buf, ct->ct_name);
             return NULL;
         }
-        value = (long)ordinal;
-    }
-#endif
+        /* the user sees char32_t being signed, but not char16_t */
+        value = (int32_t)ordinal;
+    }
     else if (PyBytes_Check(ob)) {
         int res = _convert_to_char(ob);
         if (res < 0)
@@ -3674,17 +3720,16 @@
         *out_value = (unsigned char)PyBytes_AS_STRING(io)[0];
         return 1;
     }
-#if HAVE_WCHAR_H
     else if (PyUnicode_Check(io)) {
-        wchar_t ordinal;
-        if (_my_PyUnicode_AsSingleWideChar(io, &ordinal) < 0) {
+        char ignored[80];
+        cffi_char32_t ordinal;
+        if (_my_PyUnicode_AsSingleChar32(io, &ordinal, ignored) < 0) {
             Py_DECREF(io);
             return -1;
         }
-        *out_value = (long)ordinal;
+        *out_value = (int32_t)ordinal;
         return 1;
     }
-#endif
     return 0;
 }
 
@@ -4106,6 +4151,8 @@
        EPTYPE2(fc, "float _Complex", cffi_float_complex_t, CT_PRIMITIVE_COMPLEX ) \
        EPTYPE2(dc, "double _Complex", cffi_double_complex_t, CT_PRIMITIVE_COMPLEX ) \
        ENUM_PRIMITIVE_TYPES_WCHAR                               \
+       EPTYPE2(c16, "char16_t", cffi_char16_t, CT_PRIMITIVE_CHAR ) \
+       EPTYPE2(c32, "char32_t", cffi_char32_t, CT_PRIMITIVE_CHAR ) \
        EPTYPE(b, _Bool, CT_PRIMITIVE_UNSIGNED | CT_IS_BOOL )    \
      /* the following types are not primitive in the C sense */ \
        EPTYPE(i8, int8_t, CT_PRIMITIVE_SIGNED)                  \
@@ -6036,27 +6083,45 @@
             }
             return PyBytes_FromStringAndSize(start, length);
         }
-#ifdef HAVE_WCHAR_H
         else if (cd->c_type->ct_itemdescr->ct_flags & CT_PRIMITIVE_CHAR) {
-            const wchar_t *start = (wchar_t *)cd->c_data;
-            assert(cd->c_type->ct_itemdescr->ct_size == sizeof(wchar_t));
-            if (length < 0) {
-                /*READ(start, sizeof(wchar_t))*/
-                length = 0;
-                while (start[length])
-                    length++;
-                /*READ(start, sizeof(wchar_t) * length)*/
+            switch (cd->c_type->ct_itemdescr->ct_size) {
+            case 2: {
+                const cffi_char16_t *start = (cffi_char16_t *)cd->c_data;
+                if (length < 0) {
+                    /*READ(start, 2)*/
+                    length = 0;
+                    while (start[length])
+                        length++;
+                    /*READ(start, 2 * length)*/
+                }
+                else {
+                    /*READ(start, 2 * length)*/
+                    maxlen = length;
+                    length = 0;
+                    while (length < maxlen && start[length])
+                        length++;
+                }
+                return _my_PyUnicode_FromChar16(start, length);
             }
-            else {
-                /*READ(start, sizeof(wchar_t) * length)*/
-                maxlen = length;
-                length = 0;
-                while (length < maxlen && start[length])
-                    length++;
+            case 4: {
+                const cffi_char32_t *start = (cffi_char32_t *)cd->c_data;
+                if (length < 0) {
+                    /*READ(start, 4)*/
+                    length = 0;
+                    while (start[length])
+                        length++;
+                    /*READ(start, 4 * length)*/
+                }
+                else {
+                    /*READ(start, 4 * length)*/
+                    maxlen = length;
+                    length = 0;
+                    while (length < maxlen && start[length])
+                        length++;
+                }
+                return _my_PyUnicode_FromChar32(start, length);
             }
-            return _my_PyUnicode_FromWideChar(start, length);
-        }
-#endif
+        }
     }
     else if (cd->c_type->ct_flags & CT_IS_ENUM) {
         return convert_cdata_to_enum_string(cd, 0);
@@ -6070,12 +6135,14 @@
         /*READ(cd->c_data, cd->c_type->ct_size)*/
         if (cd->c_type->ct_size == sizeof(char))
             return PyBytes_FromStringAndSize(cd->c_data, 1);
-#ifdef HAVE_WCHAR_H
         else if (cd->c_type->ct_flags & CT_PRIMITIVE_CHAR) {
-            assert(cd->c_type->ct_size == sizeof(wchar_t));
-            return _my_PyUnicode_FromWideChar((wchar_t *)cd->c_data, 1);
-        }
-#endif
+            switch (cd->c_type->ct_size) {
+            case 2:
+                return _my_PyUnicode_FromChar16((cffi_char16_t *)cd->c_data, 1);
+            case 4:
+                return _my_PyUnicode_FromChar32((cffi_char32_t *)cd->c_data, 1);
+            }
+        }
     }
     PyErr_Format(PyExc_TypeError, "string(): unexpected cdata '%s' argument",
                  cd->c_type->ct_name);
@@ -6120,12 +6187,14 @@
     /* byte- and unicode strings */
     ctitem = cd->c_type->ct_itemdescr;
     if (ctitem->ct_flags & CT_PRIMITIVE_CHAR) {
-        if (ctitem->ct_size == sizeof(char))
+        switch (ctitem->ct_size) {
+        case sizeof(char):
             return PyBytes_FromStringAndSize(cd->c_data, length);
-#ifdef HAVE_WCHAR_H
-        else if (ctitem->ct_size == sizeof(wchar_t))
-            return _my_PyUnicode_FromWideChar((wchar_t *)cd->c_data, length);
-#endif
+        case 2:
+            return _my_PyUnicode_FromChar16((cffi_char16_t *)cd->c_data,length);
+        case 4:
+            return _my_PyUnicode_FromChar32((cffi_char32_t *)cd->c_data,length);
+        }
     }
 
     /* else, the result is a list.  This implementation should be
@@ -6992,12 +7061,51 @@
     return PyBytes_FromStringAndSize(&x, 1);
 }
 
+/* backward-compatibility hack: instead of _cffi_to_c_char16_t() and
+ * _cffi_to_c_char32_t(), we have _cffi_to_c_wchar_t() handling whichever
+ * of char16_t/char32_t has the size of wchar_t, and _cffi_to_c_wchar3216_t()
+ * handling the other one.  The _cffi_from_c_*() functions follow suit. */
 #ifdef HAVE_WCHAR_H
-static PyObject *_cffi_from_c_wchar_t(wchar_t x) {
-    return _my_PyUnicode_FromWideChar(&x, 1);
-}
+typedef wchar_t cffi_wchar_t;
+#else
+typedef uint16_t cffi_wchar_t;   /* random pick... */
 #endif
 
+static cffi_wchar_t _cffi_to_c_wchar_t(PyObject *init)
+{   /* Python object -> wchar_t, via the converter matching wchar_t's size */
+    if (sizeof(cffi_wchar_t) == 2)
+        return (cffi_wchar_t)_convert_to_char16_t(init);
+    else   /* every other size is routed to the 32-bit converter */
+        return (cffi_wchar_t)_convert_to_char32_t(init);
+}
+static PyObject *_cffi_from_c_wchar_t(cffi_wchar_t x) {   /* wchar_t -> unicode */
+    if (sizeof(cffi_wchar_t) == 2) {   /* 16-bit wchar_t: use the char16 path */
+        cffi_char16_t input = x;
+        return _my_PyUnicode_FromChar16(&input, 1);
+    }
+    else {   /* otherwise build the result through the char32 path */
+        cffi_char32_t input = x;
+        return _my_PyUnicode_FromChar32(&input, 1);
+    }
+}
+static int _cffi_to_c_wchar3216_t(PyObject *init)
+{   /* Converter for whichever of char16_t/char32_t is NOT wchar_t-sized */
+    if (sizeof(cffi_wchar_t) == 4)   /* wchar_t is 32 bits: this one is char16_t */
+        return (int)_convert_to_char16_t(init);
+    else   /* otherwise this one is char32_t */
+        return (int)_convert_to_char32_t(init);
+}
+static PyObject *_cffi_from_c_wchar3216_t(int x) {   /* the non-wchar_t size */
+    if (sizeof(cffi_wchar_t) == 4) {   /* wchar_t is 32 bits: x is a char16_t */
+        cffi_char16_t input = x;
+        return _my_PyUnicode_FromChar16(&input, 1);
+    }
+    else {   /* otherwise x is treated as a char32_t */
+        cffi_char32_t input = x;
+        return _my_PyUnicode_FromChar32(&input, 1);
+    }
+}
+
 struct _cffi_externpy_s;      /* forward declaration */
 static void cffi_call_python(struct _cffi_externpy_s *, char *args);
 
@@ -7021,18 +7129,15 @@
     convert_to_object,
     convert_from_object,
     convert_struct_to_owning_object,
-#ifdef HAVE_WCHAR_H
-    _convert_to_wchar_t,
+    _cffi_to_c_wchar_t,
     _cffi_from_c_wchar_t,
-#else
-    0,
-    0,
-#endif
     _cffi_to_c_long_double,
     _cffi_to_c__Bool,
     _prepare_pointer_call_argument,
     convert_array_from_object,
     cffi_call_python,
+    _cffi_to_c_wchar3216_t,
+    _cffi_from_c_wchar3216_t,
 };
 
 static struct { const char *name; int value; } all_dlopen_flags[] = {
diff --git a/c/wchar_helper.h b/c/wchar_helper.h
--- a/c/wchar_helper.h
+++ b/c/wchar_helper.h
@@ -2,31 +2,28 @@
  * wchar_t helpers
  */
 
-#if (Py_UNICODE_SIZE == 2) && (SIZEOF_WCHAR_T == 4)
-# define CONVERT_WCHAR_TO_SURROGATES
-#endif
+typedef uint16_t cffi_char16_t;
+typedef uint32_t cffi_char32_t;
+/* NB. cffi_char32_t is unsigned to make the logic here a bit easier */
 
 
-#ifdef CONVERT_WCHAR_TO_SURROGATES
+#if Py_UNICODE_SIZE == 2
 
 /* Before Python 2.7, PyUnicode_FromWideChar is not able to convert
    wchar_t values greater than 65535 into two-unicode-characters surrogates.
    But even the Python 2.7 version doesn't detect wchar_t values that are
    out of range(1114112), and just returns nonsense.
+
+   From cffi 1.11 we can't use it anyway, because we need a version
+   with char32_t input types.
 */
 static PyObject *
-_my_PyUnicode_FromWideChar(register const wchar_t *w,
-                           Py_ssize_t size)
+_my_PyUnicode_FromChar32(const cffi_char32_t *w, Py_ssize_t size)
 {
     PyObject *unicode;
     register Py_ssize_t i;
     Py_ssize_t alloc;
-    const wchar_t *orig_w;
-
-    if (w == NULL) {
-        PyErr_BadInternalCall();
-        return NULL;
-    }
+    const cffi_char32_t *orig_w;
 
     alloc = size;
     orig_w = w;
@@ -45,11 +42,11 @@
         register Py_UNICODE *u;
         u = PyUnicode_AS_UNICODE(unicode);
         for (i = size; i > 0; i--) {
-            if (((unsigned int)*w) > 0xFFFF) {
-                wchar_t ordinal;
-                if (((unsigned int)*w) > 0x10FFFF) {
+            if (*w > 0xFFFF) {
+                cffi_char32_t ordinal;
+                if (*w > 0x10FFFF) {
                     PyErr_Format(PyExc_ValueError,
-                                 "wchar_t out of range for "
+                                 "char32_t out of range for "
                                  "conversion to unicode: 0x%x", (int)*w);
                     Py_DECREF(unicode);
                     return NULL;
@@ -66,9 +63,53 @@
     return unicode;
 }
 
-#else
+static PyObject *
+_my_PyUnicode_FromChar16(const cffi_char16_t *w, Py_ssize_t size)
+{   /* Py_UNICODE_SIZE == 2 branch: hand the buffer over as Py_UNICODE as-is */
+    return PyUnicode_FromUnicode((const Py_UNICODE *)w, size);
+}
 
-# define _my_PyUnicode_FromWideChar PyUnicode_FromWideChar
+#else   /* Py_UNICODE_SIZE == 4 */
+
+static PyObject *
+_my_PyUnicode_FromChar32(const cffi_char32_t *w, Py_ssize_t size)
+{   /* Py_UNICODE_SIZE == 4 branch: hand the buffer over as Py_UNICODE as-is */
+    return PyUnicode_FromUnicode((const Py_UNICODE *)w, size);
+}
+
+static PyObject *
+_my_PyUnicode_FromChar16(const cffi_char16_t *w, Py_ssize_t size)
+{   /* Decode 'size' UTF-16 units from 'w'; returns NULL on error */
+    PyObject *result = PyUnicode_FromUnicode(NULL, size);
+    Py_ssize_t allocated = size;   /* kept: 'size' is counted down to 0 below */
+    if (result != NULL) {
+        Py_UNICODE *u_base = PyUnicode_AS_UNICODE(result);
+        Py_UNICODE *u = u_base;
+        Py_ssize_t produced;
+
+        while (size > 0) {
+            cffi_char32_t ch = *w++;
+            size--;
+            if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
+                cffi_char32_t ch2 = *w;
+                if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
+                    ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
+                    w++;
+                    size--;
+                }
+            }
+            *u++ = ch;
+        }
+        produced = u - u_base;
+        if (produced < allocated) {   /* pairs were merged: drop unused tail */
+            if (PyUnicode_Resize(&result, produced) < 0) {
+                Py_DECREF(result);
+                return NULL;
+            }
+        }
+    }
+    return result;
+}
 
 #endif
 
@@ -78,28 +119,70 @@
 #define AS_SURROGATE(u)   (0x10000 + (((u)[0] - 0xD800) << 10) +     \
                                      ((u)[1] - 0xDC00))
 
-static int _my_PyUnicode_AsSingleWideChar(PyObject *unicode, wchar_t *result)
+static int
+_my_PyUnicode_AsSingleChar16(PyObject *unicode, cffi_char16_t *result,
+                             char *err_got)
+{   /* 0: *result is set; -1: err_got describes why (no exception is set) */
+    Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
+    if (PyUnicode_GET_SIZE(unicode) != 1) {   /* need exactly one Py_UNICODE */
+        sprintf(err_got, "unicode string of length %zd",   /* unbounded write */
+                PyUnicode_GET_SIZE(unicode));
+        return -1;
+    }
+#if Py_UNICODE_SIZE == 4
+    if (((unsigned int)u[0]) > 0xFFFF)   /* would not fit in a single char16_t */
+    {
+        sprintf(err_got, "unicode character too large for 16 bits");
+        return -1;
+    }
+#endif
+    *result = (cffi_char16_t)u[0];
+    return 0;
+}
+
+static int
+_my_PyUnicode_AsSingleChar32(PyObject *unicode, cffi_char32_t *result,
+                             char *err_got)
 {
     Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
     if (PyUnicode_GET_SIZE(unicode) == 1) {
-        *result = (wchar_t)(u[0]);
+        *result = (cffi_char32_t)u[0];
         return 0;
     }
-#ifdef CONVERT_WCHAR_TO_SURROGATES
+#if Py_UNICODE_SIZE == 2
     if (PyUnicode_GET_SIZE(unicode) == 2 && IS_SURROGATE(u)) {
         *result = AS_SURROGATE(u);
         return 0;
     }
 #endif
+    sprintf(err_got, "unicode string of length %zd",
+            PyUnicode_GET_SIZE(unicode));
     return -1;
 }
 
-static Py_ssize_t _my_PyUnicode_SizeAsWideChar(PyObject *unicode)
+static Py_ssize_t _my_PyUnicode_SizeAsChar16(PyObject *unicode)
 {
     Py_ssize_t length = PyUnicode_GET_SIZE(unicode);
     Py_ssize_t result = length;
 
-#ifdef CONVERT_WCHAR_TO_SURROGATES
+#if Py_UNICODE_SIZE == 4
+    Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
+    Py_ssize_t i;
+
+    for (i=0; i<length; i++) {
+        if (u[i] > 0xFFFF)
+            result++;
+    }
+#endif
+    return result;
+}
+
+static Py_ssize_t _my_PyUnicode_SizeAsChar32(PyObject *unicode)
+{
+    Py_ssize_t length = PyUnicode_GET_SIZE(unicode);
+    Py_ssize_t result = length;
+
+#if Py_UNICODE_SIZE == 2
     Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
     Py_ssize_t i;
 
@@ -111,15 +194,41 @@
     return result;
 }
 
-static void _my_PyUnicode_AsWideChar(PyObject *unicode,
-                                     wchar_t *result,
-                                     Py_ssize_t resultlen)
+static void _my_PyUnicode_AsChar16(PyObject *unicode,
+                                   cffi_char16_t *result,
+                                   Py_ssize_t resultlen)
+{   /* Copy 'unicode' into result[] as 16-bit units; 'resultlen' is not read */
+    Py_ssize_t len = PyUnicode_GET_SIZE(unicode);
+    Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
+    Py_ssize_t i;
+    for (i=0; i<len; i++) {   /* stops after u[len-1]: writes no terminator */
+#if Py_UNICODE_SIZE == 2
+        cffi_char16_t ordinal = u[i];
+#else
+        cffi_char32_t ordinal = u[i];
+        if (ordinal > 0xFFFF) {
+            /* NB. like CPython, ignore the problem of unicode string objects
+             * containing characters greater than sys.maxunicode.  It is
+             * easier to not add exception handling here */
+            ordinal -= 0x10000;
+            *result++ = 0xD800 | (ordinal >> 10);     /* high surrogate */
+            *result++ = 0xDC00 | (ordinal & 0x3FF);   /* low surrogate */
+            continue;
+        }
+#endif
+        *result++ = ordinal;
+    }
+}
+
+static void _my_PyUnicode_AsChar32(PyObject *unicode,
+                                   cffi_char32_t *result,
+                                   Py_ssize_t resultlen)
 {
     Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
     Py_ssize_t i;
     for (i=0; i<resultlen; i++) {
-        wchar_t ordinal = *u;
-#ifdef CONVERT_WCHAR_TO_SURROGATES
+        cffi_char32_t ordinal = *u;
+#if Py_UNICODE_SIZE == 2
         if (IS_SURROGATE(u)) {
             ordinal = AS_SURROGATE(u);
             u++;


More information about the pypy-commit mailing list