[pypy-commit] pypy default: PyUnicode_AsUTF{16,32}String()

Sun Jan 21 03:58:43 EST 2018

Author: Armin Rigo <arigo at tunes.org>
Branch: 
Changeset: r93692:84180176fef1
Date: 2018-01-21 09:58 +0100
http://bitbucket.org/pypy/pypy/changeset/84180176fef1/

Log:	PyUnicode_AsUTF{16,32}String()

diff --git a/pypy/module/cpyext/stubs.py b/pypy/module/cpyext/stubs.py
--- a/pypy/module/cpyext/stubs.py
+++ b/pypy/module/cpyext/stubs.py
@@ -1552,14 +1552,6 @@
     """
     raise NotImplementedError
 
- at cpython_api([PyObject], PyObject)
-def PyUnicode_AsUTF32String(space, unicode):
-    """Return a Python string using the UTF-32 encoding in native byte order. The
-    string always starts with a BOM mark.  Error handling is "strict".  Return
-    NULL if an exception was raised by the codec.
-    """
-    raise NotImplementedError
-
 @cpython_api([rffi.CCHARP, Py_ssize_t, rffi.CCHARP, rffi.INTP, Py_ssize_t], PyObject)
 def PyUnicode_DecodeUTF16Stateful(space, s, size, errors, byteorder, consumed):
     """If consumed is NULL, behave like PyUnicode_DecodeUTF16(). If
@@ -1595,13 +1587,6 @@
     changes in your code for properly supporting 64-bit systems."""
     raise NotImplementedError
 
- at cpython_api([PyObject], PyObject)
-def PyUnicode_AsUTF16String(space, unicode):
-    """Return a Python string using the UTF-16 encoding in native byte order. The
-    string always starts with a BOM mark.  Error handling is "strict".  Return
-    NULL if an exception was raised by the codec."""
-    raise NotImplementedError
-
 @cpython_api([rffi.CCHARP, Py_ssize_t, rffi.CCHARP], PyObject)
 def PyUnicode_DecodeUTF7(space, s, size, errors):
     """Create a Unicode object by decoding size bytes of the UTF-7 encoded string
diff --git a/pypy/module/cpyext/test/test_unicodeobject.py b/pypy/module/cpyext/test/test_unicodeobject.py
--- a/pypy/module/cpyext/test/test_unicodeobject.py
+++ b/pypy/module/cpyext/test/test_unicodeobject.py
@@ -145,6 +145,20 @@
         res = module.test_unicode_format(1, "xyz")
         assert res == u"bla 1 ble xyz\n"
 
+    def test_AsUTFNString(self):
+        module = self.import_extension('foo', [
+            ("asutf8", "METH_O", "return PyUnicode_AsUTF8String(args);"),
+            ("asutf16", "METH_O", "return PyUnicode_AsUTF16String(args);"),
+            ("asutf32", "METH_O", "return PyUnicode_AsUTF32String(args);"),
+            ])
+        u = u'sp\x09m\u1234\U00012345'
+        s = module.asutf8(u)
+        assert s == u.encode('utf-8')
+        s = module.asutf16(u)
+        assert s == u.encode('utf-16')
+        s = module.asutf32(u)
+        assert s == u.encode('utf-32')
+
 
 class TestUnicode(BaseApiTest):
     def test_unicodeobject(self, space):
@@ -247,10 +261,24 @@
         lltype.free(ar, flavor='raw')
 
     def test_AsUTF8String(self, space):
-        w_u = space.wrap(u'sp\x09m')
+        w_u = space.wrap(u'sp\x09m\u1234')
         w_res = PyUnicode_AsUTF8String(space, w_u)
         assert space.type(w_res) is space.w_bytes
-        assert space.unwrap(w_res) == 'sp\tm'
+        assert space.unwrap(w_res) == 'sp\tm\xe1\x88\xb4'
+
+    def test_AsUTF16String(self, space):
+        u = u'sp\x09m\u1234\U00012345'
+        w_u = space.wrap(u)
+        w_res = PyUnicode_AsUTF16String(space, w_u)
+        assert space.type(w_res) is space.w_bytes
+        assert space.unwrap(w_res) == u.encode('utf-16')
+
+    def test_AsUTF32String(self, space):
+        u = u'sp\x09m\u1234\U00012345'
+        w_u = space.wrap(u)
+        w_res = PyUnicode_AsUTF32String(space, w_u)
+        assert space.type(w_res) is space.w_bytes
+        assert space.unwrap(w_res) == u.encode('utf-32')
 
     def test_decode_utf8(self, space):
         u = rffi.str2charp(u'sp\x134m'.encode("utf-8"))
diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -474,7 +474,7 @@
     ref[0] = rffi.cast(PyObject, py_newuni)
     return 0
 
-def make_conversion_functions(suffix, encoding):
+def make_conversion_functions(suffix, encoding, only_for_asstring=False):
     @cpython_api([PyObject], PyObject)
     @func_renamer('PyUnicode_As%sString' % suffix)
     def PyUnicode_AsXXXString(space, w_unicode):
@@ -486,6 +486,9 @@
         return unicodeobject.encode_object(space, w_unicode, encoding, "strict")
     globals()['PyUnicode_As%sString' % suffix] = PyUnicode_AsXXXString
 
+    if only_for_asstring:
+        return
+
     @cpython_api([CONST_STRING, Py_ssize_t, CONST_STRING], PyObject)
     @func_renamer('PyUnicode_Decode%s' % suffix)
     def PyUnicode_DecodeXXX(space, s, size, errors):
@@ -516,6 +519,8 @@
     globals()['PyUnicode_Encode%s' % suffix] = PyUnicode_EncodeXXX
 
 make_conversion_functions('UTF8', 'utf-8')
+make_conversion_functions('UTF16', 'utf-16', only_for_asstring=True)
+make_conversion_functions('UTF32', 'utf-32', only_for_asstring=True)
 make_conversion_functions('ASCII', 'ascii')
 make_conversion_functions('Latin1', 'latin-1')
 if sys.platform == 'win32':