[pypy-commit] pypy default: Cherry-pick a few changes from the cpyext-ext branch:

Mon Mar 21 13:09:02 EDT 2016

Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: 
Changeset: r83223:edf35def96ce
Date: 2016-03-21 17:55 +0100
http://bitbucket.org/pypy/pypy/changeset/edf35def96ce/

Log:	Cherry-pick a few changes from the cpyext-ext branch: 3df26326119c
	43629fab94e1 931af853eaab

	- expose "defenc" and "hash" fields of PyUnicodeObject
	- Allow PyString_AsString to process unicode objects. The "defenc"
	field is returned.

diff --git a/pypy/module/cpyext/bytesobject.py b/pypy/module/cpyext/bytesobject.py
--- a/pypy/module/cpyext/bytesobject.py
+++ b/pypy/module/cpyext/bytesobject.py
@@ -1,4 +1,4 @@
-from pypy.interpreter.error import OperationError
+from pypy.interpreter.error import OperationError, oefmt
 from rpython.rtyper.lltypesystem import rffi, lltype
 from pypy.module.cpyext.api import (
     cpython_api, cpython_struct, bootstrap_function, build_type_checkers,
@@ -134,8 +134,14 @@
     if from_ref(space, rffi.cast(PyObject, ref.c_ob_type)) is space.w_str:
         pass    # typecheck returned "ok" without forcing 'ref' at all
     elif not PyString_Check(space, ref):   # otherwise, use the alternate way
-        raise OperationError(space.w_TypeError, space.wrap(
-            "PyString_AsString only support strings"))
+        from pypy.module.cpyext.unicodeobject import (
+            PyUnicode_Check, _PyUnicode_AsDefaultEncodedString)
+        if PyUnicode_Check(space, ref):
+            ref = _PyUnicode_AsDefaultEncodedString(space, ref, None)
+        else:
+            raise oefmt(space.w_TypeError,
+                        "expected string or Unicode object, %T found",
+                        from_ref(space, ref))
     ref_str = rffi.cast(PyStringObject, ref)
     if not ref_str.c_buffer:
         # copy string buffer
@@ -147,8 +153,14 @@
 @cpython_api([PyObject, rffi.CCHARPP, rffi.CArrayPtr(Py_ssize_t)], rffi.INT_real, error=-1)
 def PyString_AsStringAndSize(space, ref, buffer, length):
     if not PyString_Check(space, ref):
-        raise OperationError(space.w_TypeError, space.wrap(
-            "PyString_AsStringAndSize only support strings"))
+        from pypy.module.cpyext.unicodeobject import (
+            PyUnicode_Check, _PyUnicode_AsDefaultEncodedString)
+        if PyUnicode_Check(space, ref):
+            ref = _PyUnicode_AsDefaultEncodedString(space, ref, None)
+        else:
+            raise oefmt(space.w_TypeError,
+                        "expected string or Unicode object, %T found",
+                        from_ref(space, ref))
     ref_str = rffi.cast(PyStringObject, ref)
     if not ref_str.c_buffer:
         # copy string buffer
diff --git a/pypy/module/cpyext/include/unicodeobject.h b/pypy/module/cpyext/include/unicodeobject.h
--- a/pypy/module/cpyext/include/unicodeobject.h
+++ b/pypy/module/cpyext/include/unicodeobject.h
@@ -20,8 +20,12 @@
 
 typedef struct {
     PyObject_HEAD
-    Py_UNICODE *buffer;
+    Py_UNICODE *str;
     Py_ssize_t size;
+    long hash;                  /* Hash value; -1 if not set */
+    PyObject *defenc;           /* (Default) Encoded version as Python
+                                   string, or NULL; this is used for
+                                   implementing the buffer protocol */
 } PyUnicodeObject;
 
 
diff --git a/pypy/module/cpyext/test/test_bytesobject.py b/pypy/module/cpyext/test/test_bytesobject.py
--- a/pypy/module/cpyext/test/test_bytesobject.py
+++ b/pypy/module/cpyext/test/test_bytesobject.py
@@ -139,6 +139,44 @@
             ])
         module.getstring()
 
+    def test_py_string_as_string_Unicode(self):
+        module = self.import_extension('foo', [
+            ("getstring_unicode", "METH_NOARGS",
+             """
+                 Py_UNICODE chars[] = {'t', 'e', 's', 't'};
+                 PyObject* u1 = PyUnicode_FromUnicode(chars, 4);
+                 char *buf;
+                 buf = PyString_AsString(u1);
+                 if (buf == NULL)
+                     return NULL;
+                 if (buf[3] != 't') {
+                     PyErr_SetString(PyExc_AssertionError, "Bad conversion");
+                     return NULL;
+                 }
+                 Py_DECREF(u1);
+                 Py_INCREF(Py_None);
+                 return Py_None;
+             """),
+            ("getstringandsize_unicode", "METH_NOARGS",
+             """
+                 Py_UNICODE chars[] = {'t', 'e', 's', 't'};
+                 PyObject* u1 = PyUnicode_FromUnicode(chars, 4);
+                 char *buf;
+                 Py_ssize_t len;
+                 if (PyString_AsStringAndSize(u1, &buf, &len) < 0)
+                     return NULL;
+                 if (len != 4) {
+                     PyErr_SetString(PyExc_AssertionError, "Bad Length");
+                     return NULL;
+                 }
+                 Py_DECREF(u1);
+                 Py_INCREF(Py_None);
+                 return Py_None;
+             """),
+            ])
+        module.getstring_unicode()
+        module.getstringandsize_unicode()
+
     def test_format_v(self):
         module = self.import_extension('foo', [
             ("test_string_format_v", "METH_VARARGS",
diff --git a/pypy/module/cpyext/test/test_unicodeobject.py b/pypy/module/cpyext/test/test_unicodeobject.py
--- a/pypy/module/cpyext/test/test_unicodeobject.py
+++ b/pypy/module/cpyext/test/test_unicodeobject.py
@@ -24,7 +24,7 @@
                  if(PyUnicode_GetSize(s) == 11) {
                      result = 1;
                  }
-                 if(s->ob_type->tp_basicsize != sizeof(void*)*5)
+                 if(s->ob_type->tp_basicsize != sizeof(void*)*7)
                      result = 0;
                  Py_DECREF(s);
                  return PyBool_FromLong(result);
@@ -66,6 +66,7 @@
                  c = PyUnicode_AsUnicode(s);
                  c[0] = 'a';
                  c[1] = 0xe9;
+                 c[2] = 0x00;
                  c[3] = 'c';
                  return s;
              """),
@@ -74,7 +75,35 @@
         assert len(s) == 4
         assert s == u'a�\x00c'
 
+    def test_hash(self):
+        module = self.import_extension('foo', [
+            ("test_hash", "METH_VARARGS",
+             '''
+                PyObject* obj = (PyTuple_GetItem(args, 0));
+                long hash = ((PyUnicodeObject*)obj)->hash;
+                return PyLong_FromLong(hash);  
+             '''
+             ),
+            ])
+        res = module.test_hash(u"xyz")
+        assert res == hash(u'xyz')
 
+    def test_default_encoded_string(self):
+        module = self.import_extension('foo', [
+            ("test_default_encoded_string", "METH_O",
+             '''
+                PyObject* result = _PyUnicode_AsDefaultEncodedString(args, "replace");
+                Py_INCREF(result);
+                return result;
+             '''
+             ),
+            ])
+        res = module.test_default_encoded_string(u"xyz")
+        assert isinstance(res, str)
+        assert res == 'xyz'
+        res = module.test_default_encoded_string(u"caf\xe9")
+        assert isinstance(res, str)
+        assert res == 'caf?'
 
 class TestUnicode(BaseApiTest):
     def test_unicodeobject(self, space, api):
@@ -155,22 +184,22 @@
     def test_unicode_resize(self, space, api):
         py_uni = new_empty_unicode(space, 10)
         ar = lltype.malloc(PyObjectP.TO, 1, flavor='raw')
-        py_uni.c_buffer[0] = u'a'
-        py_uni.c_buffer[1] = u'b'
-        py_uni.c_buffer[2] = u'c'
+        py_uni.c_str[0] = u'a'
+        py_uni.c_str[1] = u'b'
+        py_uni.c_str[2] = u'c'
         ar[0] = rffi.cast(PyObject, py_uni)
         api.PyUnicode_Resize(ar, 3)
         py_uni = rffi.cast(PyUnicodeObject, ar[0])
         assert py_uni.c_size == 3
-        assert py_uni.c_buffer[1] == u'b'
-        assert py_uni.c_buffer[3] == u'\x00'
+        assert py_uni.c_str[1] == u'b'
+        assert py_uni.c_str[3] == u'\x00'
         # the same for growing
         ar[0] = rffi.cast(PyObject, py_uni)
         api.PyUnicode_Resize(ar, 10)
         py_uni = rffi.cast(PyUnicodeObject, ar[0])
         assert py_uni.c_size == 10
-        assert py_uni.c_buffer[1] == 'b'
-        assert py_uni.c_buffer[10] == '\x00'
+        assert py_uni.c_str[1] == 'b'
+        assert py_uni.c_str[10] == '\x00'
         Py_DecRef(space, ar[0])
         lltype.free(ar, flavor='raw')
 
diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -22,7 +22,8 @@
 PyUnicodeObjectStruct = lltype.ForwardReference()
 PyUnicodeObject = lltype.Ptr(PyUnicodeObjectStruct)
 PyUnicodeObjectFields = (PyObjectFields +
-    (("buffer", rffi.CWCHARP), ("size", Py_ssize_t)))
+    (("str", rffi.CWCHARP), ("size", Py_ssize_t),
+     ("hash", rffi.LONG), ("defenc", PyObject)))
 cpython_struct("PyUnicodeObject", PyUnicodeObjectFields, PyUnicodeObjectStruct)
 
 @bootstrap_function
@@ -54,16 +55,20 @@
 
     buflen = length + 1
     py_uni.c_size = length
-    py_uni.c_buffer = lltype.malloc(rffi.CWCHARP.TO, buflen,
-                                    flavor='raw', zero=True,
-                                    add_memory_pressure=True)
+    py_uni.c_str = lltype.malloc(rffi.CWCHARP.TO, buflen,
+                                 flavor='raw', zero=True,
+                                 add_memory_pressure=True)
+    py_uni.c_hash = -1
+    py_uni.c_defenc = lltype.nullptr(PyObject.TO)
     return py_uni
 
 def unicode_attach(space, py_obj, w_obj):
     "Fills a newly allocated PyUnicodeObject with a unicode string"
     py_unicode = rffi.cast(PyUnicodeObject, py_obj)
     py_unicode.c_size = len(space.unicode_w(w_obj))
-    py_unicode.c_buffer = lltype.nullptr(rffi.CWCHARP.TO)
+    py_unicode.c_str = lltype.nullptr(rffi.CWCHARP.TO)
+    py_unicode.c_hash = space.hash_w(w_obj)
+    py_unicode.c_defenc = lltype.nullptr(PyObject.TO)
 
 def unicode_realize(space, py_obj):
     """
@@ -71,17 +76,20 @@
     be modified after this call.
     """
     py_uni = rffi.cast(PyUnicodeObject, py_obj)
-    s = rffi.wcharpsize2unicode(py_uni.c_buffer, py_uni.c_size)
+    s = rffi.wcharpsize2unicode(py_uni.c_str, py_uni.c_size)
     w_obj = space.wrap(s)
+    py_uni.c_hash = space.hash_w(w_obj)
     track_reference(space, py_obj, w_obj)
     return w_obj
 
 @cpython_api([PyObject], lltype.Void, header=None)
 def unicode_dealloc(space, py_obj):
     py_unicode = rffi.cast(PyUnicodeObject, py_obj)
-    if py_unicode.c_buffer:
-        lltype.free(py_unicode.c_buffer, flavor="raw")
+    if py_unicode.c_str:
+        lltype.free(py_unicode.c_str, flavor="raw")
     from pypy.module.cpyext.object import PyObject_dealloc
+    if py_unicode.c_defenc:
+        PyObject_dealloc(space, py_unicode.c_defenc)
     PyObject_dealloc(space, py_obj)
 
 @cpython_api([Py_UNICODE], rffi.INT_real, error=CANNOT_FAIL)
@@ -205,12 +213,12 @@
     """Return a pointer to the internal Py_UNICODE buffer of the object.  ref
     has to be a PyUnicodeObject (not checked)."""
     ref_unicode = rffi.cast(PyUnicodeObject, ref)
-    if not ref_unicode.c_buffer:
+    if not ref_unicode.c_str:
         # Copy unicode buffer
         w_unicode = from_ref(space, ref)
         u = space.unicode_w(w_unicode)
-        ref_unicode.c_buffer = rffi.unicode2wcharp(u)
-    return ref_unicode.c_buffer
+        ref_unicode.c_str = rffi.unicode2wcharp(u)
+    return ref_unicode.c_str
 
 @cpython_api([PyObject], rffi.CWCHARP)
 def PyUnicode_AsUnicode(space, ref):
@@ -241,7 +249,7 @@
     string may or may not be 0-terminated.  It is the responsibility of the caller
     to make sure that the wchar_t string is 0-terminated in case this is
     required by the application."""
-    c_buffer = PyUnicode_AS_UNICODE(space, rffi.cast(PyObject, ref))
+    c_str = PyUnicode_AS_UNICODE(space, rffi.cast(PyObject, ref))
     c_size = ref.c_size
 
     # If possible, try to copy the 0-termination as well
@@ -251,7 +259,7 @@
 
     i = 0
     while i < size:
-        buf[i] = c_buffer[i]
+        buf[i] = c_str[i]
         i += 1
 
     if size > c_size:
@@ -343,8 +351,15 @@
     return PyUnicode_FromUnicode(space, wchar_p, length)
 
 @cpython_api([PyObject, CONST_STRING], PyObject)
-def _PyUnicode_AsDefaultEncodedString(space, w_unicode, errors):
-    return PyUnicode_AsEncodedString(space, w_unicode, lltype.nullptr(rffi.CCHARP.TO), errors)
+def _PyUnicode_AsDefaultEncodedString(space, ref, errors):
+    # Returns a borrowed reference.
+    py_uni = rffi.cast(PyUnicodeObject, ref)
+    if not py_uni.c_defenc:
+        py_uni.c_defenc = make_ref(
+            space, PyUnicode_AsEncodedString(
+                space, ref,
+                lltype.nullptr(rffi.CCHARP.TO), errors))
+    return py_uni.c_defenc
 
 @cpython_api([CONST_STRING, Py_ssize_t, CONST_STRING, CONST_STRING], PyObject)
 def PyUnicode_Decode(space, s, size, encoding, errors):
@@ -444,7 +459,7 @@
 def PyUnicode_Resize(space, ref, newsize):
     # XXX always create a new string so far
     py_uni = rffi.cast(PyUnicodeObject, ref[0])
-    if not py_uni.c_buffer:
+    if not py_uni.c_str:
         raise OperationError(space.w_SystemError, space.wrap(
             "PyUnicode_Resize called on already created string"))
     try:
@@ -458,7 +473,7 @@
     if oldsize < newsize:
         to_cp = oldsize
     for i in range(to_cp):
-        py_newuni.c_buffer[i] = py_uni.c_buffer[i]
+        py_newuni.c_str[i] = py_uni.c_str[i]
     Py_DecRef(space, ref[0])
     ref[0] = rffi.cast(PyObject, py_newuni)
     return 0