[Python-checkins] cpython: Move the slowest UTF-8 decoder to its own subfunction

Mon Dec 12 01:25:53 CET 2011

http://hg.python.org/cpython/rev/8815966e881a
changeset:   73933:8815966e881a
user:        Victor Stinner <victor.stinner at haypocalc.com>
date:        Sun Dec 11 20:09:03 2011 +0100
summary:
  Move the slowest UTF-8 decoder to its own subfunction

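 * Create decode_utf8_errors() to hold the generic decoding loop that
   handles errors
 * Reuse unicode_fromascii() for the pure-ASCII fast path
 * decode_utf8_errors() no longer refits the partially decoded string
   before entering its loop
 * Remove refit_partial_string(), use unicode_adjust_maxchar() instead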

files:
  Objects/unicodeobject.c |  230 ++++++++++++---------------
  1 files changed, 100 insertions(+), 130 deletions(-)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1784,7 +1784,7 @@
 static PyObject*
 unicode_fromascii(const unsigned char* s, Py_ssize_t size)
 {
-    PyObject *res;
+    PyObject *unicode;
 #ifdef Py_DEBUG
     const unsigned char *p;
     const unsigned char *end = s + size;
@@ -1794,11 +1794,12 @@
 #endif
     if (size == 1)
         return get_latin1_char(s[0]);
-    res = PyUnicode_New(size, 127);
-    if (!res)
-        return NULL;
-    memcpy(PyUnicode_1BYTE_DATA(res), s, size);
-    return res;
+    unicode = PyUnicode_New(size, 127);
+    if (!unicode)
+        return NULL;
+    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
+    assert(_PyUnicode_CheckConsistency(unicode, 1));
+    return unicode;
 }
 
 static Py_UCS4
@@ -4320,126 +4321,38 @@
     return 65537;
 }
 
-/* Called when we encountered some error that wasn't detected in the original
-   scan, e.g. an encoded surrogate character. The original maxchar computation
-   may have been incorrect, so redo it. */
-static int
-refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
-{
-    PyObject *tmp;
-    Py_ssize_t k;
-    Py_UCS4 maxchar;
-    for (k = 0, maxchar = 0; k < n; k++)
-        maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
-    tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar);
-    if (tmp == NULL)
-        return -1;
-    PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n);
-    Py_DECREF(*unicode);
-    *unicode = tmp;
-    return 0;
-}
-
 /* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
-   in case of errors. Implicit parameters: unicode, kind, data, has_errors,
-   onError. Potential resizing overallocates, so the result needs to shrink
-   at the end.
+   in case of errors. Implicit parameters: unicode, kind, data, onError.
+   Potential resizing overallocates, so the result needs to shrink at the end.
 */
-#define WRITE_MAYBE_FAIL(index, value)                                  \
-    do {                                                                \
-        if (has_errors) {                                               \
-            Py_ssize_t pos = index;                                     \
-            if (pos > PyUnicode_GET_LENGTH(unicode) &&                  \
-                unicode_resize(&unicode, pos + pos/8) < 0)              \
-                goto onError;                                           \
-            if (unicode_putchar(&unicode, &pos, value) < 0)             \
-                goto onError;                                           \
-        }                                                               \
-        else                                                            \
-            PyUnicode_WRITE(kind, data, index, value);                  \
+#define WRITE_MAYBE_FAIL(index, value)                              \
+    do {                                                            \
+        Py_ssize_t pos = index;                                     \
+        if (pos > PyUnicode_GET_LENGTH(unicode) &&                  \
+            unicode_resize(&unicode, pos + pos/8) < 0)              \
+            goto onError;                                           \
+        if (unicode_putchar(&unicode, &pos, value) < 0)             \
+            goto onError;                                           \
     } while (0)
 
 PyObject *
-PyUnicode_DecodeUTF8Stateful(const char *s,
-                             Py_ssize_t size,
-                             const char *errors,
-                             Py_ssize_t *consumed)
-{
-    const char *starts = s;
+decode_utf8_errors(const char *starts,
+                   Py_ssize_t size,
+                   const char *errors,
+                   Py_ssize_t *consumed,
+                   const char *s,
+                   PyObject *unicode,
+                   Py_ssize_t i)
+{
     int n;
     int k;
     Py_ssize_t startinpos;
     Py_ssize_t endinpos;
-    const char *e, *aligned_end;
-    PyObject *unicode;
+    const char *e = starts + size;
+    const char *aligned_end;
     const char *errmsg = "";
     PyObject *errorHandler = NULL;
     PyObject *exc = NULL;
-    Py_UCS4 maxchar = 0;
-    Py_ssize_t unicode_size;
-    Py_ssize_t i;
-    int kind;
-    void *data;
-    int has_errors = 0;
-
-    if (size == 0) {
-        if (consumed)
-            *consumed = 0;
-        return (PyObject *)PyUnicode_New(0, 0);
-    }
-    maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
-    /* When the string is ASCII only, just use memcpy and return.
-       unicode_size may be != size if there is an incomplete UTF-8
-       sequence at the end of the ASCII block.  */
-    if (maxchar < 128 && size == unicode_size) {
-        if (consumed)
-            *consumed = size;
-
-        if (size == 1)
-            return get_latin1_char((unsigned char)s[0]);
-
-        unicode = PyUnicode_New(unicode_size, maxchar);
-        if (!unicode)
-            return NULL;
-        Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
-        assert(_PyUnicode_CheckConsistency(unicode, 1));
-        return unicode;
-    }
-
-    /* In case of errors, maxchar and size computation might be incorrect;
-       code below refits and resizes as necessary. */
-    unicode = PyUnicode_New(unicode_size, maxchar);
-    if (!unicode)
-        return NULL;
-    kind = PyUnicode_KIND(unicode);
-    data = PyUnicode_DATA(unicode);
-
-    /* Unpack UTF-8 encoded data */
-    i = 0;
-    e = s + size;
-    switch (kind) {
-    case PyUnicode_1BYTE_KIND:
-        has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
-        break;
-    case PyUnicode_2BYTE_KIND:
-        has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
-        break;
-    case PyUnicode_4BYTE_KIND:
-        has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
-        break;
-    }
-    if (!has_errors) {
-        /* Ensure the unicode size calculation was correct */
-        assert(i == unicode_size);
-        assert(s == e);
-        if (consumed)
-            *consumed = s-starts;
-        return unicode;
-    }
-    /* Fall through to the generic decoding loop for the rest of
-       the string */
-    if (refit_partial_string(&unicode, kind, data, i) < 0)
-        goto onError;
 
     aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
 
@@ -4591,11 +4504,6 @@
         continue;
 
       utf8Error:
-        if (!has_errors) {
-            if (refit_partial_string(&unicode, kind, data, i) < 0)
-                goto onError;
-            has_errors = 1;
-        }
         if (unicode_decode_call_errorhandler(
                 errors, &errorHandler,
                 "utf8", errmsg,
@@ -4604,22 +4512,18 @@
             goto onError;
         /* Update data because unicode_decode_call_errorhandler might have
            re-created or resized the unicode object. */
-        data = PyUnicode_DATA(unicode);
-        kind = PyUnicode_KIND(unicode);
         aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
     }
-    /* Ensure the unicode_size calculation above was correct: */
-    assert(has_errors || i == unicode_size);
-
     if (consumed)
         *consumed = s-starts;
 
     /* Adjust length and ready string when it contained errors and
        is of the old resizable kind. */
-    if (has_errors) {
-        if (PyUnicode_Resize(&unicode, i) < 0)
-            goto onError;
-    }
+    if (unicode_resize(&unicode, i) < 0)
+        goto onError;
+    unicode_adjust_maxchar(&unicode);
+    if (unicode == NULL)
+        goto onError;
 
     Py_XDECREF(errorHandler);
     Py_XDECREF(exc);
@@ -4629,12 +4533,78 @@
   onError:
     Py_XDECREF(errorHandler);
     Py_XDECREF(exc);
-    Py_DECREF(unicode);
+    Py_XDECREF(unicode);
     return NULL;
 }
-
 #undef WRITE_MAYBE_FAIL
 
+PyObject *
+PyUnicode_DecodeUTF8Stateful(const char *s,
+                             Py_ssize_t size,
+                             const char *errors,
+                             Py_ssize_t *consumed)
+{
+    Py_UCS4 maxchar = 0;
+    Py_ssize_t unicode_size;
+    int has_errors = 0;
+    PyObject *unicode;
+    int kind;
+    void *data;
+    const char *starts = s;
+    const char *e;
+    Py_ssize_t i;
+
+    if (size == 0) {
+        if (consumed)
+            *consumed = 0;
+        return (PyObject *)PyUnicode_New(0, 0);
+    }
+
+    maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
+
+    /* When the string is ASCII only, just use memcpy and return.
+       unicode_size may be != size if there is an incomplete UTF-8
+       sequence at the end of the ASCII block.  */
+    if (maxchar < 128 && size == unicode_size) {
+        if (consumed)
+            *consumed = size;
+        return unicode_fromascii(s, size);
+    }
+
+    unicode = PyUnicode_New(unicode_size, maxchar);
+    if (!unicode)
+        return NULL;
+    kind = PyUnicode_KIND(unicode);
+    data = PyUnicode_DATA(unicode);
+
+    /* Unpack UTF-8 encoded data */
+    i = 0;
+    e = starts + size;
+    switch (kind) {
+    case PyUnicode_1BYTE_KIND:
+        has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
+        break;
+    case PyUnicode_2BYTE_KIND:
+        has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
+        break;
+    case PyUnicode_4BYTE_KIND:
+        has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
+        break;
+    }
+    if (!has_errors) {
+        /* Ensure the unicode size calculation was correct */
+        assert(i == unicode_size);
+        assert(s == e);
+        if (consumed)
+            *consumed = size;
+        return unicode;
+    }
+
+    /* In case of errors, maxchar and size computation might be incorrect;
+       code below refits and resizes as necessary. */
+    return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
+}
+
 #ifdef __APPLE__
 
 /* Simplified UTF-8 decoder using surrogateescape error handler,

-- 
Repository URL: http://hg.python.org/cpython