[pypy-commit] pypy py3.5: Fix 2BYTE case in _PyUnicode_Ready(): don't prepend a BOM to the data

Sat Sep 16 13:29:09 EDT 2017

Author: Ronan Lamy <ronan.lamy at gmail.com>
Branch: py3.5
Changeset: r92410:b6ba2262940e
Date: 2017-09-16 18:28 +0100
http://bitbucket.org/pypy/pypy/changeset/b6ba2262940e/

Log:	Fix 2BYTE case in _PyUnicode_Ready(): don't prepend a BOM to the
	data

diff --git a/pypy/module/cpyext/test/test_unicodeobject.py b/pypy/module/cpyext/test/test_unicodeobject.py
--- a/pypy/module/cpyext/test/test_unicodeobject.py
+++ b/pypy/module/cpyext/test/test_unicodeobject.py
@@ -190,6 +190,26 @@
         b = s.encode('utf-32')[4:]  # Skip the BOM
         assert module.from_ucs4(b) == s
 
+    def test_substring(self):
+        module = self.import_extension('foo', [
+            ("slice_start", "METH_VARARGS",
+             '''
+             PyObject* text;
+             Py_ssize_t start, length;
+             if (!PyArg_ParseTuple(args, "On", &text, &start))
+                return NULL;
+             if (PyUnicode_READY(text) == -1) return NULL;
+             length = PyUnicode_GET_LENGTH(text);
+             if (start > length) return PyLong_FromSsize_t(start);
+             return PyUnicode_FromKindAndData(PyUnicode_KIND(text),
+                 PyUnicode_1BYTE_DATA(text) + start*PyUnicode_KIND(text),
+                 length-start);
+             ''')])
+        s = 'aАbБcСdД'
+        assert module.slice_start(s, 2) == 'bБcСdД'
+        s = 'xx\N{PILE OF POO}'
+        assert module.slice_start(s, 2) == '\N{PILE OF POO}'
+
     def test_aswidecharstring(self):
         module = self.import_extension('foo', [
             ("aswidecharstring", "METH_O",
diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -1,6 +1,6 @@
 from pypy.interpreter.error import OperationError, oefmt
 from rpython.rtyper.lltypesystem import rffi, lltype
-from rpython.rlib.runicode import unicode_encode_latin_1, unicode_encode_utf_16
+from rpython.rlib.runicode import unicode_encode_latin_1, unicode_encode_utf_16_helper
 from rpython.rlib.rarithmetic import widen
 
 from pypy.module.unicodedata import unicodedb
@@ -289,8 +289,9 @@
             set_utf8_len(py_obj, 0)
     elif maxchar < 65536:
         # XXX: assumes that sizeof(wchar_t) == 4
-        ucs2_str = unicode_encode_utf_16(
-            w_obj._value, len(w_obj._value), errors='strict')
+        ucs2_str = unicode_encode_utf_16_helper(
+            w_obj._value, len(w_obj._value), errors='strict',
+            byteorder=runicode.BYTEORDER)
         ucs2_data = cts.cast('Py_UCS2 *', rffi.str2charp(ucs2_str))
         set_data(py_obj, cts.cast('void*', ucs2_data))
         set_len(py_obj, get_wsize(py_obj))