[pypy-svn] pypy default: Add support for PyUnicode_FromUnicode(NULL, size), which allocates a (temporarily) mutable unicode string.

Sat Mar 26 01:12:49 CET 2011

Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: 
Changeset: r42951:88b090e851cc
Date: 2011-03-26 01:12 +0100
http://bitbucket.org/pypy/pypy/changeset/88b090e851cc/

Log:	Add support for PyUnicode_FromUnicode(NULL, size), which allocates a
	(temporarily) mutable unicode string. Also implement
	PyUnicode_Resize.

	See comments in stringobject.py for a complete explanation

diff --git a/pypy/module/cpyext/test/test_unicodeobject.py b/pypy/module/cpyext/test/test_unicodeobject.py
--- a/pypy/module/cpyext/test/test_unicodeobject.py
+++ b/pypy/module/cpyext/test/test_unicodeobject.py
@@ -1,9 +1,81 @@
 # encoding: iso-8859-15
 from pypy.module.cpyext.test.test_api import BaseApiTest
-from pypy.module.cpyext.unicodeobject import Py_UNICODE
+from pypy.module.cpyext.test.test_cpyext import AppTestCpythonExtensionBase
+from pypy.module.cpyext.unicodeobject import (
+    Py_UNICODE, PyUnicodeObject, new_empty_unicode)
+from pypy.module.cpyext.api import PyObjectP, PyObject
+from pypy.module.cpyext.pyobject import Py_DecRef
 from pypy.rpython.lltypesystem import rffi, lltype
 import sys, py
 
+class AppTestUnicodeObject(AppTestCpythonExtensionBase):
+    def test_unicodeobject(self):
+        module = self.import_extension('foo', [
+            ("get_hello1", "METH_NOARGS",
+             """
+                 return PyUnicode_FromStringAndSize(
+                     "Hello world<should not be included>", 11);
+             """),
+            ("test_GetSize", "METH_NOARGS",
+             """
+                 PyObject* s = PyUnicode_FromString("Hello world");
+                 int result = 0;
+
+                 if(PyUnicode_GetSize(s) == 11) {
+                     result = 1;
+                 }
+                 if(s->ob_type->tp_basicsize != sizeof(void*)*4)
+                     result = 0;
+                 Py_DECREF(s);
+                 return PyBool_FromLong(result);
+             """),
+            ("test_GetSize_exception", "METH_NOARGS",
+             """
+                 PyObject* f = PyFloat_FromDouble(1.0);
+                 Py_ssize_t size = PyUnicode_GetSize(f);
+
+                 Py_DECREF(f);
+                 return NULL;
+             """),
+             ("test_is_unicode", "METH_VARARGS",
+             """
+                return PyBool_FromLong(PyUnicode_Check(PyTuple_GetItem(args, 0)));
+             """)])
+        assert module.get_hello1() == u'Hello world'
+        assert module.test_GetSize()
+        raises(TypeError, module.test_GetSize_exception)
+
+        assert module.test_is_unicode(u"")
+        assert not module.test_is_unicode(())
+
+    def test_unicode_buffer_init(self):
+        module = self.import_extension('foo', [
+            ("getunicode", "METH_NOARGS",
+             """
+                 PyObject *s, *t;
+                 Py_UNICODE* c;
+                 Py_ssize_t len;
+
+                 s = PyUnicode_FromUnicode(NULL, 4);
+                 if (s == NULL)
+                    return NULL;
+                 t = PyUnicode_FromUnicode(NULL, 3);
+                 if (t == NULL)
+                    return NULL;
+                 Py_DECREF(t);
+                 c = PyUnicode_AsUnicode(s);
+                 c[0] = 'a';
+                 c[1] = 0xe9;
+                 c[3] = 'c';
+                 return s;
+             """),
+            ])
+        s = module.getunicode()
+        assert len(s) == 4
+        assert s == u'aé\x00c'
+
+
+
 class TestUnicode(BaseApiTest):
     def test_unicodeobject(self, space, api):
         assert api.PyUnicode_GET_SIZE(space.wrap(u'sp&#65533;m')) == 4
@@ -77,6 +149,28 @@
         assert space.unwrap(w_res) == u'sp&#65533;'
         rffi.free_charp(s)
 
+    def test_unicode_resize(self, space, api):
+        py_uni = new_empty_unicode(space, 10)
+        ar = lltype.malloc(PyObjectP.TO, 1, flavor='raw')
+        py_uni.c_buffer[0] = u'a'
+        py_uni.c_buffer[1] = u'b'
+        py_uni.c_buffer[2] = u'c'
+        ar[0] = rffi.cast(PyObject, py_uni)
+        api.PyUnicode_Resize(ar, 3)
+        py_uni = rffi.cast(PyUnicodeObject, ar[0])
+        assert py_uni.c_size == 3
+        assert py_uni.c_buffer[1] == u'b'
+        assert py_uni.c_buffer[3] == u'\x00'
+        # the same for growing
+        ar[0] = rffi.cast(PyObject, py_uni)
+        api.PyUnicode_Resize(ar, 10)
+        py_uni = rffi.cast(PyUnicodeObject, ar[0])
+        assert py_uni.c_size == 10
+        assert py_uni.c_buffer[1] == 'b'
+        assert py_uni.c_buffer[10] == '\x00'
+        Py_DecRef(space, ar[0])
+        lltype.free(ar, flavor='raw')
+
     def test_AsUTF8String(self, space, api):
         w_u = space.wrap(u'sp&#65533;m')
         w_res = api.PyUnicode_AsUTF8String(w_u)
@@ -235,13 +329,13 @@
 
         x_chunk = api.PyUnicode_AS_UNICODE(w_x)
         api.Py_UNICODE_COPY(target_chunk, x_chunk, 4)
-        w_y = api.PyUnicode_FromUnicode(target_chunk, 4)
+        w_y = space.wrap(rffi.wcharpsize2unicode(target_chunk, 4))
 
         assert space.eq_w(w_y, space.wrap(u"abcd"))
 
         size = api.PyUnicode_GET_SIZE(w_x)
         api.Py_UNICODE_COPY(target_chunk, x_chunk, size)
-        w_y = api.PyUnicode_FromUnicode(target_chunk, size)
+        w_y = space.wrap(rffi.wcharpsize2unicode(target_chunk, size))
 
         assert space.eq_w(w_y, w_x)
 

diff --git a/pypy/module/cpyext/stringobject.py b/pypy/module/cpyext/stringobject.py
--- a/pypy/module/cpyext/stringobject.py
+++ b/pypy/module/cpyext/stringobject.py
@@ -15,7 +15,7 @@
 ## The problem
 ## -----------
 ##
-## PyString_AsString() must returns a (non-movable) pointer to the underlying
+## PyString_AsString() must return a (non-movable) pointer to the underlying
 ## buffer, whereas pypy strings are movable.  C code may temporarily store
 ## this address and use it, as long as it owns a reference to the PyObject.
 ## There is no "release" function to specify that the pointer is not needed

diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -7,15 +7,16 @@
     bootstrap_function, PyObjectFields, cpython_struct, CONST_STRING,
     CONST_WSTRING)
 from pypy.module.cpyext.pyerrors import PyErr_BadArgument
-from pypy.module.cpyext.pyobject import PyObject, from_ref, make_typedescr
+from pypy.module.cpyext.pyobject import (
+    PyObject, PyObjectP, Py_DecRef, make_ref, from_ref, track_reference,
+    make_typedescr, get_typedescr)
 from pypy.module.cpyext.stringobject import PyString_Check
 from pypy.module.sys.interp_encoding import setdefaultencoding
 from pypy.objspace.std import unicodeobject, unicodetype
 from pypy.rlib import runicode
 import sys
 
-## See comment in stringobject.py.  PyUnicode_FromUnicode(NULL, size) is not
-## yet supported.
+## See comment in stringobject.py.
 
 PyUnicodeObjectStruct = lltype.ForwardReference()
 PyUnicodeObject = lltype.Ptr(PyUnicodeObjectStruct)
@@ -28,7 +29,8 @@
     make_typedescr(space.w_unicode.instancetypedef,
                    basestruct=PyUnicodeObject.TO,
                    attach=unicode_attach,
-                   dealloc=unicode_dealloc)
+                   dealloc=unicode_dealloc,
+                   realize=unicode_realize)
 
 # Buffer for the default encoding (used by PyUnicde_GetDefaultEncoding)
 DEFAULT_ENCODING_SIZE = 100
@@ -39,12 +41,39 @@
 
 Py_UNICODE = lltype.UniChar
 
+def new_empty_unicode(space, length):
+    """
+    Allocates a PyUnicodeObject and its buffer, but without a corresponding
+    interpreter object.  The buffer may be mutated, until unicode_realize() is
+    called.
+    """
+    typedescr = get_typedescr(space.w_unicode.instancetypedef)
+    py_obj = typedescr.allocate(space, space.w_unicode)
+    py_uni = rffi.cast(PyUnicodeObject, py_obj)
+
+    buflen = length + 1
+    py_uni.c_size = length
+    py_uni.c_buffer = lltype.malloc(rffi.CWCHARP.TO, buflen,
+                                    flavor='raw', zero=True)
+    return py_uni
+
 def unicode_attach(space, py_obj, w_obj):
     "Fills a newly allocated PyUnicodeObject with a unicode string"
     py_unicode = rffi.cast(PyUnicodeObject, py_obj)
     py_unicode.c_size = len(space.unicode_w(w_obj))
     py_unicode.c_buffer = lltype.nullptr(rffi.CWCHARP.TO)
 
+def unicode_realize(space, py_obj):
+    """
+    Creates the unicode in the interpreter. The PyUnicodeObject buffer must not
+    be modified after this call.
+    """
+    py_uni = rffi.cast(PyUnicodeObject, py_obj)
+    s = rffi.wcharpsize2unicode(py_uni.c_buffer, py_uni.c_size)
+    w_obj = space.wrap(s)
+    track_reference(space, py_obj, w_obj)
+    return w_obj
+
 @cpython_api([PyObject], lltype.Void, external=False)
 def unicode_dealloc(space, py_obj):
     py_unicode = rffi.cast(PyUnicodeObject, py_obj)
@@ -128,7 +157,9 @@
 def PyUnicode_AsUnicode(space, ref):
     """Return a read-only pointer to the Unicode object's internal Py_UNICODE
     buffer, NULL if unicode is not a Unicode object."""
-    if not PyUnicode_Check(space, ref):
+    # Don't use PyUnicode_Check, it will realize the object :-(
+    w_type = from_ref(space, rffi.cast(PyObject, ref.c_ob_type))
+    if not space.is_true(space.issubtype(w_type, space.w_unicode)):
         raise OperationError(space.w_TypeError,
                              space.wrap("expected unicode object"))
     return PyUnicode_AS_UNICODE(space, ref)
@@ -237,10 +268,11 @@
     object. If the buffer is not NULL, the return value might be a shared object.
     Therefore, modification of the resulting Unicode object is only allowed when u
     is NULL."""
-    if not wchar_p:
-        raise NotImplementedError
-    s = rffi.wcharpsize2unicode(wchar_p, length)
-    return space.wrap(s)
+    if wchar_p:
+        s = rffi.wcharpsize2unicode(wchar_p, length)
+        return make_ref(space, space.wrap(s))
+    else:
+        return rffi.cast(PyObject, new_empty_unicode(space, length))
 
 @cpython_api([CONST_WSTRING, Py_ssize_t], PyObject)
 def PyUnicode_FromWideChar(space, wchar_p, length):
@@ -330,6 +362,29 @@
     w_str = space.wrap(rffi.charpsize2str(s, size))
     return space.call_method(w_str, 'decode', space.wrap("utf-8"))
 
+@cpython_api([PyObjectP, Py_ssize_t], rffi.INT_real, error=-1)
+def PyUnicode_Resize(space, ref, newsize):
+    # XXX always create a new string so far
+    py_uni = rffi.cast(PyUnicodeObject, ref[0])
+    if not py_uni.c_buffer:
+        raise OperationError(space.w_SystemError, space.wrap(
+            "PyUnicode_Resize called on already created string"))
+    try:
+        py_newuni = new_empty_unicode(space, newsize)
+    except MemoryError:
+        Py_DecRef(space, ref[0])
+        ref[0] = lltype.nullptr(PyObject.TO)
+        raise
+    to_cp = newsize
+    oldsize = py_uni.c_size
+    if oldsize < newsize:
+        to_cp = oldsize
+    for i in range(to_cp):
+        py_newuni.c_buffer[i] = py_uni.c_buffer[i]
+    Py_DecRef(space, ref[0])
+    ref[0] = rffi.cast(PyObject, py_newuni)
+    return 0
+
 @cpython_api([PyObject], PyObject)
 def PyUnicode_AsUTF8String(space, w_unicode):
     """Encode a Unicode object using UTF-8 and return the result as Python string