[pypy-commit] cffi wchar_t: in-progress

Mon Jul 9 16:59:09 CEST 2012

Author: Armin Rigo <arigo at tunes.org>
Branch: wchar_t
Changeset: r605:a0c1585fe7d5
Date: 2012-07-09 16:03 +0200
http://bitbucket.org/cffi/cffi/changeset/a0c1585fe7d5/

Log:	in-progress

diff --git a/c/_cffi_backend.c b/c/_cffi_backend.c
--- a/c/_cffi_backend.c
+++ b/c/_cffi_backend.c
@@ -762,24 +762,46 @@
             return 0;
         }
         else if (ctitem->ct_flags & CT_PRIMITIVE_CHAR) {
-            char *srcdata;
-            Py_ssize_t n;
-            if (!PyString_Check(init)) {
-                expected = "str or list or tuple";
-                goto cannot_convert;
+            if (ctitem->ct_size == sizeof(char)) {
+                char *srcdata;
+                Py_ssize_t n;
+                if (!PyString_Check(init)) {
+                    expected = "str or list or tuple";
+                    goto cannot_convert;
+                }
+                n = PyString_GET_SIZE(init);
+                if (ct->ct_length >= 0 && n > ct->ct_length) {
+                    PyErr_Format(PyExc_IndexError,
+                                 "initializer string is too long for '%s' "
+                                 "(got %zd characters)", ct->ct_name, n);
+                    return -1;
+                }
+                if (n != ct->ct_length)
+                    n++;
+                srcdata = PyString_AS_STRING(init);
+                memcpy(data, srcdata, n);
+                return 0;
             }
-            n = PyString_GET_SIZE(init);
-            if (ct->ct_length >= 0 && n > ct->ct_length) {
-                PyErr_Format(PyExc_IndexError,
-                             "initializer string is too long for '%s' "
-                             "(got %zd characters)", ct->ct_name, n);
-                return -1;
+#ifdef HAVE_WCHAR_H
+            else {
+                Py_ssize_t n;
+                if (!PyUnicode_Check(init)) {
+                    expected = "unicode or list or tuple";
+                    goto cannot_convert;
+                }
+                n = _my_PyUnicode_SizeAsWideChar(init);
+                if (ct->ct_length >= 0 && n > ct->ct_length) {
+                    PyErr_Format(PyExc_IndexError,
+                                 "initializer unicode is too long for '%s' "
+                                 "(got %zd characters)", ct->ct_name, n);
+                    return -1;
+                }
+                if (n != ct->ct_length)
+                    n++;
+                _my_PyUnicode_AsWideChar(init, (wchar_t *)data, n);
+                return 0;
             }
-            if (n != ct->ct_length)
-                n++;
-            srcdata = PyString_AS_STRING(init);
-            memcpy(data, srcdata, n);
-            return 0;
+#endif
         }
         else {
             expected = "list or tuple";
@@ -1153,18 +1175,17 @@
     else if (cd->c_type->ct_itemdescr != NULL &&
              cd->c_type->ct_itemdescr->ct_flags & CT_PRIMITIVE_CHAR &&
              cd->c_type->ct_itemdescr->ct_size > sizeof(char)) {
-        abort();
         Py_ssize_t length;
 
         if (cd->c_type->ct_flags & CT_ARRAY) {
-            const char *start = cd->c_data;
-            const char *end;
-            length = get_array_length(cd);
-            end = (const char *)memchr(start, 0, length);
-            if (end != NULL)
-                length = end - start;
+            const wchar_t *start = (wchar_t *)cd->c_data;
+            const Py_ssize_t lenmax = get_array_length(cd);
+            length = 0;
+            while (length < lenmax && start[length])
+                length++;
         }
         else {
+            abort();
             if (cd->c_data == NULL) {
                 PyObject *s = cdata_repr(cd);
                 if (s != NULL) {
@@ -1178,7 +1199,7 @@
             length = strlen(cd->c_data);
         }
 
-        return PyString_FromStringAndSize(cd->c_data, length);
+        return _my_PyUnicode_FromWideChar((wchar_t *)cd->c_data, length);
     }
     else
         return cdata_repr(cd);
@@ -1949,6 +1970,10 @@
                 /* from a string, we add the null terminator */
                 explicitlength = PyString_GET_SIZE(init) + 1;
             }
+            else if (PyUnicode_Check(init)) {
+                /* from a unicode, we add the null terminator */
+                explicitlength = PyUnicode_GET_SIZE(init) + 1;
+            }
             else {
                 explicitlength = PyNumber_AsSsize_t(init, PyExc_OverflowError);
                 if (explicitlength < 0) {
diff --git a/c/test_c.py b/c/test_c.py
--- a/c/test_c.py
+++ b/c/test_c.py
@@ -1311,21 +1311,36 @@
     else:
         py.test.raises(ValueError, "s.a1 = u'\U00012345'")
     #
-    a = new_array_type(BWCharP, u'hello \u1234 world')
+    BWCharArray = new_array_type(BWCharP, None)
+    a = newp(BWCharArray, u'hello \u1234 world')
     assert len(a) == 14   # including the final null
     assert unicode(a) == u'hello \u1234 world'
-    py.test.raises(UnicodeEncodeError, str, a)
+    a[13] = u'!'
+    assert unicode(a) == u'hello \u1234 world!'
+    assert str(a) == repr(a)
     assert a[6] == u'\u1234'
     a[6] = '-'
     assert str(a) == 'hello - world'
     #
+    if wchar4:
+        u = u'\U00012345\U00012346\U00012347'
+        a = newp(BWCharArray, u)
+        assert len(a) == 4
+        assert unicode(a) == u
+        assert len(list(a)) == 4
+        expected = [u'\U00012345', u'\U00012346', u'\U00012347', unichr(0)]
+        assert list(a) == expected
+        got = [a[i] for i in range(4)]
+        assert got == expected
+        py.test.raises(IndexError, 'a[4]')
+    #
     w = cast(BWChar, 'a')
     assert repr(w) == "<cdata 'wchar_t' u'a'>"
     assert str(w) == 'a'
     assert unicode(w) == u'a'
     w = cast(BWChar, 0x1234)
     assert repr(w) == "<cdata 'wchar_t' u'\u1234'>"
-    py.test.raises(UnicodeEncodeError, str, w)
+    py.test.raises(xxUnicodeEncodeError, str, w)
     assert unicode(w) == u'\u1234'
     assert int(w) == 0x1234
     #
@@ -1333,13 +1348,13 @@
     assert str(p) == 'hello - world'
     assert unicode(p) == u'hello - world'
     p[6] = u'\u2345'
-    py.test.raises(UnicodeEncodeError, str, p)
+    py.test.raises(xxUnicodeEncodeError, str, p)
     assert unicode(p) == u'hello \u2345 world'
     #
     s = newp(BStructPtr, [u'\u1234', p])
     assert s.a1 == u'\u1234'
     assert s.a2 == p
-    py.test.raises(UnicodeEncodeError, str, s.a2)
+    py.test.raises(xxUnicodeEncodeError, str, s.a2)
     assert unicode(s.a2) == u'hello \u2345 world'
     #
     q = cast(BWCharP, 0)
diff --git a/c/wchar_helper.h b/c/wchar_helper.h
--- a/c/wchar_helper.h
+++ b/c/wchar_helper.h
@@ -63,6 +63,11 @@
 #endif
 
 
+#define IS_SURROGATE(u)   (0xD800 <= (u)[0] && (u)[0] <= 0xDBFF &&   \
+                           0xDC00 <= (u)[1] && (u)[1] <= 0xDFFF)  /* true when (u)[0] is in 0xD800..0xDBFF and (u)[1] is in 0xDC00..0xDFFF; reads (u)[1] only if the first test passes */
+#define AS_SURROGATE(u)   (0x10000 + (((u)[0] - 0xD800) << 10) +     \
+                                     ((u)[1] - 0xDC00))  /* single value built from the pair (u)[0],(u)[1]; meaningful only when IS_SURROGATE(u) holds */
+
 static int _my_PyUnicode_AsSingleWideChar(PyObject *unicode, wchar_t *result)
 {
     Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
@@ -71,12 +76,46 @@
         return 0;
     }
 #ifdef CONVERT_WCHAR_TO_SURROGATES
-    if (PyUnicode_GET_SIZE(unicode) == 2 &&
-            0xD800 <= u[0] && u[0] <= 0xDBFF &&
-            0xDC00 <= u[1] && u[1] <= 0xDFFF) {
-        *result = 0x10000 + ((u[0] - 0xD800) << 10) + (u[1] - 0xDC00);
+    if (PyUnicode_GET_SIZE(unicode) == 2 && IS_SURROGATE(u)) {
+        *result = AS_SURROGATE(u);
         return 0;
     }
 #endif
     return -1;
 }
+
+static Py_ssize_t _my_PyUnicode_SizeAsWideChar(PyObject *unicode)
+{   /* Returns how many wchar_t items 'unicode' converts to; no terminator is added to the count. */
+    Py_ssize_t length = PyUnicode_GET_SIZE(unicode);
+    Py_ssize_t result = length;   /* start from the Py_UNICODE item count */
+
+#ifdef CONVERT_WCHAR_TO_SURROGATES
+    Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
+    Py_ssize_t i;
+
+    for (i=0; i<length-1; i++) {   /* stops at length-1 because IS_SURROGATE looks at u[i] and u[i+1] */
+        if (IS_SURROGATE(u+i))
+            result--;   /* a surrogate pair occupies two Py_UNICODE items but one wchar_t */
+    }
+#endif
+    return result;
+}
+
+static void _my_PyUnicode_AsWideChar(PyObject *unicode,
+                                     wchar_t *result,
+                                     Py_ssize_t resultlen)
+{   /* Writes exactly 'resultlen' wchar_t items into 'result', read from the Py_UNICODE data of 'unicode'. */
+    Py_UNICODE *u = PyUnicode_AS_UNICODE(unicode);
+    Py_ssize_t i;
+    for (i=0; i<resultlen; i++) {   /* bounded only by resultlen; the source length is not checked here */
+        wchar_t ordinal = *u;
+#ifdef CONVERT_WCHAR_TO_SURROGATES
+        if (IS_SURROGATE(u)) {   /* pair at u[0],u[1] is merged into one output item */
+            ordinal = AS_SURROGATE(u);
+            u++;   /* step over the first half; the u++ below steps over the second */
+        }
+#endif
+        result[i] = ordinal;
+        u++;
+    }
+}