[Python-checkins] cpython: Close #14716: str.format() now uses the new "unicode writer" API instead of the PyAccu API

Mon May 7 13:08:10 CEST 2012

http://hg.python.org/cpython/rev/7be716a47e9d
changeset:   76820:7be716a47e9d
user:        Victor Stinner <victor.stinner at gmail.com>
date:        Mon May 07 12:47:02 2012 +0200
summary:
  Close #14716: str.format() now uses the new "unicode writer" API instead of the
PyAccu API. For example, this makes str.format() 25% to 30% faster on Linux.

files:
  Objects/stringlib/unicode_format.h |   60 +--
  Objects/unicodeobject.c            |  258 ++++++++--------
  2 files changed, 148 insertions(+), 170 deletions(-)

diff --git a/Objects/stringlib/unicode_format.h b/Objects/stringlib/unicode_format.h
--- a/Objects/stringlib/unicode_format.h
+++ b/Objects/stringlib/unicode_format.h
@@ -2,8 +2,6 @@
     unicode_format.h -- implementation of str.format().
 */
 
-#include "accu.h"
-
 /* Defines for more efficiently reallocating the string buffer */
 #define INITIAL_SIZE_INCREMENT 100
 #define SIZE_MULTIPLIER 2
@@ -112,33 +110,6 @@
 
 
 /************************************************************************/
-/***********    Output string management functions       ****************/
-/************************************************************************/
-
-/*
-    output_data dumps characters into our output string
-    buffer.
-
-    In some cases, it has to reallocate the string.
-
-    It returns a status:  0 for a failed reallocation,
-    1 for success.
-*/
-static int
-output_data(_PyAccu *acc, PyObject *s, Py_ssize_t start, Py_ssize_t end)
-{
-    PyObject *substring;
-    int r;
-
-    substring = PyUnicode_Substring(s, start, end);
-    if (substring == NULL)
-        return 0;
-    r = _PyAccu_Accumulate(acc, substring);
-    Py_DECREF(substring);
-    return r == 0;
-}
-
-/************************************************************************/
 /***********  Format string parsing -- integers and identifiers *********/
 /************************************************************************/
 
@@ -523,7 +494,7 @@
     appends to the output.
 */
 static int
-render_field(PyObject *fieldobj, SubString *format_spec, _PyAccu *acc)
+render_field(PyObject *fieldobj, SubString *format_spec, unicode_writer_t *writer)
 {
     int ok = 0;
     PyObject *result = NULL;
@@ -566,7 +537,8 @@
         goto done;
 
     assert(PyUnicode_Check(result));
-    ok = output_data(acc, result, 0, PyUnicode_GET_LENGTH(result));
+
+    ok = (unicode_writer_write_str(writer, result, 0, PyUnicode_GET_LENGTH(result)) == 0);
 done:
     Py_XDECREF(format_spec_object);
     Py_XDECREF(result);
@@ -831,7 +803,7 @@
 static int
 output_markup(SubString *field_name, SubString *format_spec,
               int format_spec_needs_expanding, Py_UCS4 conversion,
-              _PyAccu *acc, PyObject *args, PyObject *kwargs,
+              unicode_writer_t *writer, PyObject *args, PyObject *kwargs,
               int recursion_depth, AutoNumber *auto_number)
 {
     PyObject *tmp = NULL;
@@ -872,7 +844,7 @@
     else
         actual_format_spec = format_spec;
 
-    if (render_field(fieldobj, actual_format_spec, acc) == 0)
+    if (render_field(fieldobj, actual_format_spec, writer) == 0)
         goto done;
 
     result = 1;
@@ -892,7 +864,7 @@
 */
 static int
 do_markup(SubString *input, PyObject *args, PyObject *kwargs,
-          _PyAccu *acc, int recursion_depth, AutoNumber *auto_number)
+          unicode_writer_t *writer, int recursion_depth, AutoNumber *auto_number)
 {
     MarkupIterator iter;
     int format_spec_needs_expanding;
@@ -902,17 +874,21 @@
     SubString field_name;
     SubString format_spec;
     Py_UCS4 conversion;
+    int err;
 
     MarkupIterator_init(&iter, input->str, input->start, input->end);
     while ((result = MarkupIterator_next(&iter, &literal, &field_present,
                                          &field_name, &format_spec,
                                          &conversion,
                                          &format_spec_needs_expanding)) == 2) {
-        if (!output_data(acc, literal.str, literal.start, literal.end))
+        err = unicode_writer_write_str(writer,
+                                       literal.str, literal.start,
+                                       literal.end - literal.start);
+        if (err == -1)
             return 0;
         if (field_present)
             if (!output_markup(&field_name, &format_spec,
-                               format_spec_needs_expanding, conversion, acc,
+                               format_spec_needs_expanding, conversion, writer,
                                args, kwargs, recursion_depth, auto_number))
                 return 0;
     }
@@ -928,7 +904,8 @@
 build_string(SubString *input, PyObject *args, PyObject *kwargs,
              int recursion_depth, AutoNumber *auto_number)
 {
-    _PyAccu acc;
+    unicode_writer_t writer;
+    Py_ssize_t initlen;
 
     /* check the recursion level */
     if (recursion_depth <= 0) {
@@ -937,16 +914,17 @@
         return NULL;
     }
 
-    if (_PyAccu_Init(&acc))
+    initlen = PyUnicode_GET_LENGTH(input->str) + 100;
+    if (unicode_writer_init(&writer, initlen, 127) == -1)
         return NULL;
 
-    if (!do_markup(input, args, kwargs, &acc, recursion_depth,
+    if (!do_markup(input, args, kwargs, &writer, recursion_depth,
                    auto_number)) {
-        _PyAccu_Destroy(&acc);
+        unicode_writer_dealloc(&writer);
         return NULL;
     }
 
-    return _PyAccu_Finish(&acc);
+    return unicode_writer_finish(&writer);
 }
 
 /************************************************************************/
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -13200,6 +13200,135 @@
     return PyBool_FromLong(result);
 }
 
+typedef struct {
+    PyObject *buffer;
+    void *data;
+    enum PyUnicode_Kind kind;
+    Py_UCS4 maxchar;
+    Py_ssize_t pos;
+} unicode_writer_t;
+
+Py_LOCAL_INLINE(void)
+unicode_writer_update(unicode_writer_t *writer)
+{
+    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
+    writer->data = PyUnicode_DATA(writer->buffer);
+    writer->kind = PyUnicode_KIND(writer->buffer);
+}
+
+Py_LOCAL(int)
+unicode_writer_init(unicode_writer_t *writer,
+                    Py_ssize_t length, Py_UCS4 maxchar)
+{
+    writer->pos = 0;
+    writer->buffer = PyUnicode_New(length, maxchar);
+    if (writer->buffer == NULL)
+        return -1;
+    unicode_writer_update(writer);
+    return 0;
+}
+
+Py_LOCAL_INLINE(int)
+unicode_writer_prepare(unicode_writer_t *writer,
+                       Py_ssize_t length, Py_UCS4 maxchar)
+{
+    Py_ssize_t newlen;
+    PyObject *newbuffer;
+
+    if (length > PY_SSIZE_T_MAX - writer->pos) {
+        PyErr_NoMemory();
+        return -1;
+    }
+    newlen = writer->pos + length;
+
+    if (newlen > PyUnicode_GET_LENGTH(writer->buffer)) {
+        /* overallocate 25% to limit the number of resize */
+        if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
+            newlen += newlen / 4;
+
+        if (maxchar > writer->maxchar) {
+            /* resize + widen */
+            newbuffer = PyUnicode_New(newlen, maxchar);
+            if (newbuffer == NULL)
+                return -1;
+            PyUnicode_CopyCharacters(newbuffer, 0,
+                                     writer->buffer, 0, writer->pos);
+            Py_DECREF(writer->buffer);
+        }
+        else {
+            newbuffer = resize_compact(writer->buffer, newlen);
+            if (newbuffer == NULL)
+                return -1;
+        }
+        writer->buffer = newbuffer;
+        unicode_writer_update(writer);
+    }
+    else if (maxchar > writer->maxchar) {
+        if (unicode_widen(&writer->buffer, writer->pos, maxchar) < 0)
+            return -1;
+        unicode_writer_update(writer);
+    }
+    return 0;
+}
+
+Py_LOCAL_INLINE(int)
+unicode_writer_write_str(
+    unicode_writer_t *writer,
+    PyObject *str, Py_ssize_t start, Py_ssize_t length)
+{
+    Py_UCS4 maxchar;
+
+    assert(str != NULL);
+    assert(PyUnicode_Check(str));
+    if (PyUnicode_READY(str) == -1)
+        return -1;
+
+    assert(0 <= start);
+    assert(0 <= length);
+    assert(start + length <= PyUnicode_GET_LENGTH(str));
+    if (length == 0)
+        return 0;
+
+    maxchar = _PyUnicode_FindMaxChar(str, start, start + length);
+    if (unicode_writer_prepare(writer, length, maxchar) == -1)
+        return -1;
+
+    assert((writer->pos + length) <= PyUnicode_GET_LENGTH(writer->buffer));
+    copy_characters(writer->buffer, writer->pos,
+                    str, start, length);
+    writer->pos += length;
+    return 0;
+}
+
+Py_LOCAL_INLINE(int)
+unicode_writer_write_char(
+    unicode_writer_t *writer,
+    Py_UCS4 ch)
+{
+    if (unicode_writer_prepare(writer, 1, ch) == -1)
+        return -1;
+    assert((writer->pos + 1) <= PyUnicode_GET_LENGTH(writer->buffer));
+    PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
+    writer->pos += 1;
+    return 0;
+}
+
+Py_LOCAL(PyObject *)
+unicode_writer_finish(unicode_writer_t *writer)
+{
+    if (PyUnicode_Resize(&writer->buffer, writer->pos) < 0) {
+        Py_DECREF(writer->buffer);
+        return NULL;
+    }
+    return writer->buffer;
+}
+
+Py_LOCAL(void)
+unicode_writer_dealloc(unicode_writer_t *writer)
+{
+    Py_CLEAR(writer->buffer);
+}
+
 #include "stringlib/unicode_format.h"
 
 PyDoc_STRVAR(format__doc__,
@@ -13649,135 +13778,6 @@
     return (Py_UCS4) -1;
 }
 
-typedef struct {
-    PyObject *buffer;
-    void *data;
-    enum PyUnicode_Kind kind;
-    Py_UCS4 maxchar;
-    Py_ssize_t pos;
-} unicode_writer_t;
-
-Py_LOCAL_INLINE(void)
-unicode_writer_update(unicode_writer_t *writer)
-{
-    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
-    writer->data = PyUnicode_DATA(writer->buffer);
-    writer->kind = PyUnicode_KIND(writer->buffer);
-}
-
-Py_LOCAL(int)
-unicode_writer_init(unicode_writer_t *writer,
-                    Py_ssize_t length, Py_UCS4 maxchar)
-{
-    writer->pos = 0;
-    writer->buffer = PyUnicode_New(length, maxchar);
-    if (writer->buffer == NULL)
-        return -1;
-    unicode_writer_update(writer);
-    return 0;
-}
-
-Py_LOCAL_INLINE(int)
-unicode_writer_prepare(unicode_writer_t *writer,
-                       Py_ssize_t length, Py_UCS4 maxchar)
-{
-    Py_ssize_t newlen;
-    PyObject *newbuffer;
-
-    if (length > PY_SSIZE_T_MAX - writer->pos) {
-        PyErr_NoMemory();
-        return -1;
-    }
-    newlen = writer->pos + length;
-
-    if (newlen > PyUnicode_GET_LENGTH(writer->buffer)) {
-        /* overallocate 25% to limit the number of resize */
-        if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
-            newlen += newlen / 4;
-
-        if (maxchar > writer->maxchar) {
-            /* resize + widen */
-            newbuffer = PyUnicode_New(newlen, maxchar);
-            if (newbuffer == NULL)
-                return -1;
-            PyUnicode_CopyCharacters(newbuffer, 0,
-                                     writer->buffer, 0, writer->pos);
-            Py_DECREF(writer->buffer);
-        }
-        else {
-            newbuffer = resize_compact(writer->buffer, newlen);
-            if (newbuffer == NULL)
-                return -1;
-        }
-        writer->buffer = newbuffer;
-        unicode_writer_update(writer);
-    }
-    else if (maxchar > writer->maxchar) {
-        if (unicode_widen(&writer->buffer, writer->pos, maxchar) < 0)
-            return -1;
-        unicode_writer_update(writer);
-    }
-    return 0;
-}
-
-Py_LOCAL_INLINE(int)
-unicode_writer_write_str(
-    unicode_writer_t *writer,
-    PyObject *str, Py_ssize_t start, Py_ssize_t length)
-{
-    Py_UCS4 maxchar;
-
-    assert(str != NULL);
-    assert(PyUnicode_Check(str));
-    if (PyUnicode_READY(str) == -1)
-        return -1;
-
-    assert(0 <= start);
-    assert(0 <= length);
-    assert(start + length <= PyUnicode_GET_LENGTH(str));
-    if (length == 0)
-        return 0;
-
-    maxchar = _PyUnicode_FindMaxChar(str, start, start + length);
-    if (unicode_writer_prepare(writer, length, maxchar) == -1)
-        return -1;
-
-    assert((writer->pos + length) <= PyUnicode_GET_LENGTH(writer->buffer));
-    copy_characters(writer->buffer, writer->pos,
-                    str, start, length);
-    writer->pos += length;
-    return 0;
-}
-
-Py_LOCAL_INLINE(int)
-unicode_writer_write_char(
-    unicode_writer_t *writer,
-    Py_UCS4 ch)
-{
-    if (unicode_writer_prepare(writer, 1, ch) == -1)
-        return -1;
-    assert((writer->pos + 1) <= PyUnicode_GET_LENGTH(writer->buffer));
-    PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
-    writer->pos += 1;
-    return 0;
-}
-
-Py_LOCAL(PyObject *)
-unicode_writer_finish(unicode_writer_t *writer)
-{
-    if (PyUnicode_Resize(&writer->buffer, writer->pos) < 0) {
-        Py_DECREF(writer->buffer);
-        return NULL;
-    }
-    return writer->buffer;
-}
-
-Py_LOCAL(void)
-unicode_writer_dealloc(unicode_writer_t *writer)
-{
-    Py_CLEAR(writer->buffer);
-}
-
 PyObject *
 PyUnicode_Format(PyObject *format, PyObject *args)
 {

-- 
Repository URL: http://hg.python.org/cpython