[Python-checkins] cpython: Rewrite PyUnicode_Append(); unicode_modifiable() is more strict

Mon Dec 12 01:25:58 CET 2011

http://hg.python.org/cpython/rev/561f5e86fa47
changeset:   73940:561f5e86fa47
user:        Victor Stinner <victor.stinner at haypocalc.com>
date:        Mon Dec 12 00:01:39 2011 +0100
summary:
  Rewrite PyUnicode_Append(); unicode_modifiable() is more strict

 * Rename unicode_resizable() to unicode_modifiable()
 * Rename _PyUnicode_Dirty() to unicode_check_modifiable() to make it clear
   that the function is private
 * Inline PyUnicode_Concat() and unicode_append_inplace() into
   PyUnicode_Append() to simplify the code
 * unicode_modifiable() returns 0 if the hash has been computed or if the
   string is not an exact unicode string
 * Remove _PyUnicode_DIRTY(): there is no need to reset the hash anymore,
   because once the hash has been computed, the string can no longer be
   modified in place
 * PyUnicode_Concat() checks for integer overflow

files:
  Objects/unicodeobject.c |  197 ++++++++++++++-------------
  1 files changed, 101 insertions(+), 96 deletions(-)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -165,9 +165,6 @@
             *_to++ = (to_type) *_iter++;                \
     } while (0)
 
-/* The Unicode string has been modified: reset the hash */
-#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
-
 /* This dictionary holds all interned unicode strings.  Note that references
    to strings in this dictionary are *not* counted in the string's ob_refcnt.
    When the interned string reaches a refcnt of 0 the string deallocation
@@ -226,6 +223,8 @@
     PyObject *to, Py_ssize_t to_start,
     PyObject *from, Py_ssize_t from_start,
     Py_ssize_t how_many);
+static int unicode_modifiable(PyObject *unicode);
+
 
 static PyObject *
 unicode_fromascii(const unsigned char *s, Py_ssize_t size);
@@ -645,10 +644,11 @@
     Py_ssize_t new_size;
     int share_wstr;
     PyObject *new_unicode;
-
     assert(PyUnicode_IS_READY(unicode));
+    assert(PyUnicode_IS_COMPACT(unicode));
+
     char_size = PyUnicode_KIND(unicode);
-    if (PyUnicode_IS_COMPACT_ASCII(unicode))
+    if (PyUnicode_IS_ASCII(unicode))
         struct_size = sizeof(PyASCIIObject);
     else
         struct_size = sizeof(PyCompactUnicodeObject);
@@ -676,7 +676,7 @@
     _PyUnicode_LENGTH(unicode) = length;
     if (share_wstr) {
         _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
-        if (!PyUnicode_IS_COMPACT_ASCII(unicode))
+        if (!PyUnicode_IS_ASCII(unicode))
             _PyUnicode_WSTR_LENGTH(unicode) = length;
     }
     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
@@ -691,8 +691,6 @@
     assert(!PyUnicode_IS_COMPACT(unicode));
     assert(Py_REFCNT(unicode) == 1);
 
-    _PyUnicode_DIRTY(unicode);
-
     if (PyUnicode_IS_READY(unicode)) {
         Py_ssize_t char_size;
         Py_ssize_t new_size;
@@ -1115,15 +1113,13 @@
 #endif
 
 static int
-_PyUnicode_Dirty(PyObject *unicode)
-{
-    assert(_PyUnicode_CHECK(unicode));
-    if (Py_REFCNT(unicode) != 1) {
+unicode_check_modifiable(PyObject *unicode)
+{
+    if (!unicode_modifiable(unicode)) {
         PyErr_SetString(PyExc_SystemError,
-                        "Cannot modify a string having more than 1 reference");
+                        "Cannot modify a string currently used");
         return -1;
     }
-    _PyUnicode_DIRTY(unicode);
     return 0;
 }
 
@@ -1289,7 +1285,7 @@
     if (how_many == 0)
         return 0;
 
-    if (_PyUnicode_Dirty(to))
+    if (unicode_check_modifiable(to))
         return -1;
 
     err = _copy_characters(to, to_start, from, from_start, how_many, 1);
@@ -1537,12 +1533,17 @@
 #endif
 
 static int
-unicode_resizable(PyObject *unicode)
-{
+unicode_modifiable(PyObject *unicode)
+{
+    assert(_PyUnicode_CHECK(unicode));
     if (Py_REFCNT(unicode) != 1)
         return 0;
+    if (_PyUnicode_HASH(unicode) != -1)
+        return 0;
     if (PyUnicode_CHECK_INTERNED(unicode))
         return 0;
+    if (!PyUnicode_CheckExact(unicode))
+        return 0;
 #ifdef Py_DEBUG
     /* singleton refcount is greater than 1 */
     assert(!unicode_is_singleton(unicode));
@@ -1577,7 +1578,7 @@
         return 0;
     }
 
-    if (!unicode_resizable(unicode)) {
+    if (!unicode_modifiable(unicode)) {
         PyObject *copy = resize_copy(unicode, length);
         if (copy == NULL)
             return -1;
@@ -3591,11 +3592,12 @@
         PyErr_BadArgument();
         return -1;
     }
+    assert(PyUnicode_IS_READY(unicode));
     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
         PyErr_SetString(PyExc_IndexError, "string index out of range");
         return -1;
     }
-    if (_PyUnicode_Dirty(unicode))
+    if (unicode_check_modifiable(unicode))
         return -1;
     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
                     index, ch);
@@ -10566,6 +10568,7 @@
 {
     PyObject *u = NULL, *v = NULL, *w;
     Py_UCS4 maxchar, maxchar2;
+    Py_ssize_t u_len, v_len, new_len;
 
     /* Coerce the two arguments */
     u = PyUnicode_FromObject(left);
@@ -10585,18 +10588,25 @@
         return v;
     }
 
+    u_len = PyUnicode_GET_LENGTH(u);
+    v_len = PyUnicode_GET_LENGTH(v);
+    if (u_len > PY_SSIZE_T_MAX - v_len) {
+        PyErr_SetString(PyExc_OverflowError,
+                        "strings are too large to concat");
+        goto onError;
+    }
+    new_len = u_len + v_len;
+
     maxchar = PyUnicode_MAX_CHAR_VALUE(u);
     maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
     maxchar = Py_MAX(maxchar, maxchar2);
 
     /* Concat the two Unicode strings */
-    w = PyUnicode_New(
-        PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
-        maxchar);
+    w = PyUnicode_New(new_len, maxchar);
     if (w == NULL)
         goto onError;
-    copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
-    copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
+    copy_characters(w, 0, u, 0, u_len);
+    copy_characters(w, u_len, v, 0, v_len);
     Py_DECREF(u);
     Py_DECREF(v);
     assert(_PyUnicode_CheckConsistency(w, 1));
@@ -10608,15 +10618,41 @@
     return NULL;
 }
 
-static void
-unicode_append_inplace(PyObject **p_left, PyObject *right)
-{
+void
+PyUnicode_Append(PyObject **p_left, PyObject *right)
+{
+    PyObject *left, *res;
+    Py_UCS4 maxchar, maxchar2;
     Py_ssize_t left_len, right_len, new_len;
 
-    assert(PyUnicode_IS_READY(*p_left));
-    assert(PyUnicode_IS_READY(right));
-
-    left_len = PyUnicode_GET_LENGTH(*p_left);
+    if (p_left == NULL) {
+        if (!PyErr_Occurred())
+            PyErr_BadInternalCall();
+        return;
+    }
+    left = *p_left;
+    if (right == NULL || !PyUnicode_Check(left)) {
+        if (!PyErr_Occurred())
+            PyErr_BadInternalCall();
+        goto error;
+    }
+
+    if (PyUnicode_READY(left))
+        goto error;
+    if (PyUnicode_READY(right))
+        goto error;
+
+    /* Shortcuts */
+    if (left == unicode_empty) {
+        Py_DECREF(left);
+        Py_INCREF(right);
+        *p_left = right;
+        return;
+    }
+    if (right == unicode_empty)
+        return;
+
+    left_len = PyUnicode_GET_LENGTH(left);
     right_len = PyUnicode_GET_LENGTH(right);
     if (left_len > PY_SSIZE_T_MAX - right_len) {
         PyErr_SetString(PyExc_OverflowError,
@@ -10625,78 +10661,47 @@
     }
     new_len = left_len + right_len;
 
-    /* Now we own the last reference to 'left', so we can resize it
-     * in-place.
-     */
-    if (unicode_resize(p_left, new_len) != 0) {
-        /* XXX if _PyUnicode_Resize() fails, 'left' has been
-         * deallocated so it cannot be put back into
-         * 'variable'.  The MemoryError is raised when there
-         * is no value in 'variable', which might (very
-         * remotely) be a cause of incompatibilities.
-         */
-        goto error;
-    }
-    /* copy 'right' into the newly allocated area of 'left' */
-    copy_characters(*p_left, left_len, right, 0, right_len);
-    _PyUnicode_DIRTY(*p_left);
-    return;
-
-error:
-    Py_DECREF(*p_left);
-    *p_left = NULL;
-}
-
-void
-PyUnicode_Append(PyObject **p_left, PyObject *right)
-{
-    PyObject *left, *res;
-
-    if (p_left == NULL) {
-        if (!PyErr_Occurred())
-            PyErr_BadInternalCall();
-        return;
-    }
-    left = *p_left;
-    if (right == NULL || !PyUnicode_Check(left)) {
-        if (!PyErr_Occurred())
-            PyErr_BadInternalCall();
-        goto error;
-    }
-
-    if (PyUnicode_READY(left))
-        goto error;
-    if (PyUnicode_READY(right))
-        goto error;
-
-    if (PyUnicode_CheckExact(left) && left != unicode_empty
-        && PyUnicode_CheckExact(right) && right != unicode_empty
-        && unicode_resizable(left)
-        && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
-            || _PyUnicode_WSTR(left) != NULL))
-    {
+    if (unicode_modifiable(left)
+        && PyUnicode_CheckExact(right)
+        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
         /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
            to change the structure size, but characters are stored just after
            the structure, and so it requires to move all characters which is
            not so different than duplicating the string. */
-        if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
-        {
-            unicode_append_inplace(p_left, right);
-            assert(p_left == NULL || _PyUnicode_CheckConsistency(*p_left, 1));
-            return;
-        }
-    }
-
-    res = PyUnicode_Concat(left, right);
-    if (res == NULL)
-        goto error;
-    Py_DECREF(left);
-    *p_left = res;
+        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
+    {
+        /* append inplace */
+        if (unicode_resize(p_left, new_len) != 0) {
+            /* XXX if _PyUnicode_Resize() fails, 'left' has been
+             * deallocated so it cannot be put back into
+             * 'variable'.  The MemoryError is raised when there
+             * is no value in 'variable', which might (very
+             * remotely) be a cause of incompatibilities.
+             */
+            goto error;
+        }
+        /* copy 'right' into the newly allocated area of 'left' */
+        copy_characters(*p_left, left_len, right, 0, right_len);
+    }
+    else {
+        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
+        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
+        maxchar = Py_MAX(maxchar, maxchar2);
+
+        /* Concat the two Unicode strings */
+        res = PyUnicode_New(new_len, maxchar);
+        if (res == NULL)
+            goto error;
+        copy_characters(res, 0, left, 0, left_len);
+        copy_characters(res, left_len, right, 0, right_len);
+        Py_DECREF(left);
+        *p_left = res;
+    }
+    assert(_PyUnicode_CheckConsistency(*p_left, 1));
     return;
 
 error:
-    Py_DECREF(*p_left);
-    *p_left = NULL;
+    Py_CLEAR(*p_left);
 }
 
 void

-- 
Repository URL: http://hg.python.org/cpython