[Python-checkins] cpython: Unicode: optimize creating of 1-character strings

Thu May 3 02:33:56 CEST 2012

http://hg.python.org/cpython/rev/3d83e2297166
changeset:   76722:3d83e2297166
user:        Victor Stinner <victor.stinner at gmail.com>
date:        Thu May 03 02:17:04 2012 +0200
summary:
  Unicode: optimize creating of 1-character strings

files:
  Objects/unicodeobject.c |  60 ++++++++++++++++++++++++----
  1 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1919,8 +1919,18 @@
         return unicode_empty;
     }
     assert(size > 0);
-    if (size == 1 && u[0] < 256)
-        return get_latin1_char((unsigned char)u[0]);
+    if (size == 1) {
+        Py_UCS4 ch = u[0];
+        if (ch < 256)
+            return get_latin1_char((unsigned char)ch);
+
+        res = PyUnicode_New(1, ch);
+        if (res == NULL)
+            return NULL;
+        PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
+        assert(_PyUnicode_CheckConsistency(res, 1));
+        return res;
+    }
 
     max_char = ucs2lib_find_max_char(u, u + size);
     res = PyUnicode_New(size, max_char);
@@ -1947,8 +1957,18 @@
         return unicode_empty;
     }
     assert(size > 0);
-    if (size == 1 && u[0] < 256)
-        return get_latin1_char((unsigned char)u[0]);
+    if (size == 1) {
+        Py_UCS4 ch = u[0];
+        if (ch < 256)
+            return get_latin1_char((unsigned char)ch);
+
+        res = PyUnicode_New(1, ch);
+        if (res == NULL)
+            return NULL;
+        PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
+        assert(_PyUnicode_CheckConsistency(res, 1));
+        return res;
+    }
 
     max_char = ucs4lib_find_max_char(u, u + size);
     res = PyUnicode_New(size, max_char);
@@ -11368,10 +11388,33 @@
 static PyObject *
 unicode_getitem(PyObject *self, Py_ssize_t index)
 {
-    Py_UCS4 ch = PyUnicode_ReadChar(self, index);
-    if (ch == (Py_UCS4)-1)
-        return NULL;
-    return PyUnicode_FromOrdinal(ch);
+    void *data;
+    enum PyUnicode_Kind kind;
+    Py_UCS4 ch;
+    PyObject *res;
+
+    if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
+        PyErr_BadArgument();
+        return NULL;
+    }
+    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
+        PyErr_SetString(PyExc_IndexError, "string index out of range");
+        return NULL;
+    }
+    kind = PyUnicode_KIND(self);
+    data = PyUnicode_DATA(self);
+    ch = PyUnicode_READ(kind, data, index);
+    if (ch < 256)
+        return get_latin1_char(ch);
+
+    res = PyUnicode_New(1, ch);
+    if (res == NULL)
+        return NULL;
+    kind = PyUnicode_KIND(res);
+    data = PyUnicode_DATA(res);
+    PyUnicode_WRITE(kind, data, 0, ch);
+    assert(_PyUnicode_CheckConsistency(res, 1));
+    return res;
 }
 
 /* Believe it or not, this produces the same value for ASCII strings
@@ -12039,7 +12082,6 @@
     }
 
     if (PyUnicode_IS_ASCII(self)) {
-        kind = PyUnicode_KIND(self);
         data = PyUnicode_1BYTE_DATA(self);
         return unicode_fromascii(data + start, length);
     }

-- 
Repository URL: http://hg.python.org/cpython