[Python-checkins] r68626 - sandbox/trunk/io-c/_textio.c

Fri Jan 16 03:24:24 CET 2009

Author: antoine.pitrou
Date: Fri Jan 16 03:24:23 2009
New Revision: 68626

Log:
Speed up small writes a lot for a few common encodings
+ fix utf-16 BOM read problem when seeking to the start of the file



Modified:
   sandbox/trunk/io-c/_textio.c

Modified: sandbox/trunk/io-c/_textio.c
==============================================================================

--- sandbox/trunk/io-c/_textio.c	(original)
+++ sandbox/trunk/io-c/_textio.c	Fri Jan 16 03:24:23 2009
@@ -428,7 +428,6 @@
     PyType_GenericNew,          /* tp_new */
 };
 
-
 
 /* TextIOWrapper */
 
@@ -455,6 +454,9 @@
     "write contains a newline character."
     );
 
+typedef PyObject *
+        (*encodefunc_t)(PyObject *, PyObject *);
+
 typedef struct
 {
     PyObject_HEAD
@@ -465,6 +467,7 @@
     PyObject *encoder;
     PyObject *decoder;
     PyObject *readnl;
+    PyObject *errors;
     const char *writenl; /* utf-8 encoded, NULL stands for \n */
     int line_buffering:1;
     int readuniversal:1;
@@ -472,6 +475,8 @@
     int writetranslate:1;
     int seekable:1;
     int telling:1;
+    /* Specialized encoding func (see below) */
+    encodefunc_t encodefunc;
 
     /* Reads and writes are internally buffered in order to speed things up.
        However, any read will first flush the write buffer if itsn't empty.
@@ -498,6 +503,87 @@
     PyObject *dict;
 } PyTextIOWrapperObject;
 
+
+/* A couple of specialized cases in order to bypass the slow incremental
+   encoding methods for the most popular encodings. */
+
+static PyObject *
+ascii_encode(PyTextIOWrapperObject *self, PyObject *text)
+{
+    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(text),
+                                 PyUnicode_GET_SIZE(text),
+                                 PyBytes_AS_STRING(self->errors));
+}
+
+static PyObject *
+utf16be_encode(PyTextIOWrapperObject *self, PyObject *text)
+{
+    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text),
+                                 PyUnicode_GET_SIZE(text),
+                                 PyBytes_AS_STRING(self->errors), 1);
+}
+
+static PyObject *
+utf16le_encode(PyTextIOWrapperObject *self, PyObject *text)
+{
+    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text),
+                                 PyUnicode_GET_SIZE(text),
+                                 PyBytes_AS_STRING(self->errors), -1);
+}
+
+static PyObject *
+utf16_encode(PyTextIOWrapperObject *self, PyObject *text)
+{
+    PyObject *res;
+    res = PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text),
+                                PyUnicode_GET_SIZE(text),
+                                PyBytes_AS_STRING(self->errors), 0);
+    if (res == NULL)
+        return NULL;
+    /* Next writes will skip the BOM and use native byte ordering */
+#if defined(WORDS_BIGENDIAN)
+    self->encodefunc = (encodefunc_t) utf16be_encode;
+#else
+    self->encodefunc = (encodefunc_t) utf16le_encode;
+#endif
+    return res;
+}
+
+
+static PyObject *
+utf8_encode(PyTextIOWrapperObject *self, PyObject *text)
+{
+    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(text),
+                                PyUnicode_GET_SIZE(text),
+                                PyBytes_AS_STRING(self->errors));
+}
+
+static PyObject *
+latin1_encode(PyTextIOWrapperObject *self, PyObject *text)
+{
+    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(text),
+                                  PyUnicode_GET_SIZE(text),
+                                  PyBytes_AS_STRING(self->errors));
+}
+
+/* Map normalized encoding names onto the specialized encoding funcs */
+
+typedef struct {
+    const char *name;
+    encodefunc_t encodefunc;
+} encodefuncentry;
+
+encodefuncentry encodefuncs[] = {
+    {"ascii",       (encodefunc_t) ascii_encode},
+    {"iso8859-1",   (encodefunc_t) latin1_encode},
+    {"utf-16-be",   (encodefunc_t) utf16be_encode},
+    {"utf-16-le",   (encodefunc_t) utf16le_encode},
+    {"utf-16",      (encodefunc_t) utf16_encode},
+    {"utf-8",       (encodefunc_t) utf8_encode},
+    {NULL, NULL}
+};
+
+
 static int
 TextIOWrapper_init(PyTextIOWrapperObject *self, PyObject *args, PyObject *kwds)
 {
@@ -536,14 +622,16 @@
     Py_CLEAR(self->decoded_chars);
     Py_CLEAR(self->pending_bytes);
     Py_CLEAR(self->snapshot);
+    Py_CLEAR(self->errors);
     self->decoded_chars_used = 0;
     self->pending_bytes_count = 0;
+    self->encodefunc = NULL;
 
     if (encoding == NULL) {
         /* Try os.device_encoding(fileno) */
         PyObject *os = PyImport_ImportModule("os");
         if (os == NULL)
-            return -1;
+            goto error;
         self->encoding = PyObject_CallMethod(
             os, "device_encoding", "N",
             PyObject_CallMethod(buffer, "fileno", NULL));
@@ -566,7 +654,7 @@
                 locale, "getpreferredencoding", NULL);
             Py_DECREF(locale);
             if (self->encoding == NULL)
-                return -1;
+                goto error;
             if (!PyUnicode_Check(self->encoding))
                 Py_CLEAR(self->encoding);
         }
@@ -576,6 +664,9 @@
 
     if (errors == NULL)
         errors = "strict";
+    self->errors = PyBytes_FromString(errors);
+    if (self->errors == NULL)
+        goto error;
 
     self->chunk_size = 8192;
     self->readuniversal = (newline == NULL || newline[0] == '\0');
@@ -628,10 +719,30 @@
     if (r == -1)
         goto error;
     if (r == 1) {
+        PyObject *ci;
         self->encoder = PyCodec_IncrementalEncoder(
             encoding, errors);
         if (self->encoder == NULL)
             goto error;
+        /* Get the normalized name of the codec */
+        ci = _PyCodec_Lookup(encoding);
+        if (ci == NULL)
+            goto error;
+        res = PyObject_GetAttrString(ci, "name");
+        Py_DECREF(ci);
+        if (res == NULL)
+            PyErr_Clear();
+        else if (PyUnicode_Check(res)) {
+            encodefuncentry *e = encodefuncs;
+            while (e->name != NULL) {
+                if (!PyUnicode_CompareWithASCIIString(res, e->name)) {
+                    self->encodefunc = e->encodefunc;
+                    break;
+                }
+                e++;
+            }
+            Py_DECREF(res);
+        }
     }
 
     self->buffer = buffer;
@@ -805,7 +916,11 @@
         needflush = 1;
 
     /* XXX What if we were just reading? */
-    b = PyObject_CallMethodObjArgs(self->encoder, _PyIO_str_encode, text, NULL);
+    if (self->encodefunc != NULL)
+        b = (*self->encodefunc)((PyObject *) self, text);
+    else
+        b = PyObject_CallMethodObjArgs(self->encoder,
+                                       _PyIO_str_encode, text, NULL);
     Py_DECREF(text);
     if (b == NULL)
         return NULL;
@@ -1458,6 +1573,28 @@
 }
 #undef IS_LITTLE_ENDIAN
 
+static int
+_TextIOWrapper_decoder_setstate(PyTextIOWrapperObject *self,
+                                CookieStruct *cookie)
+{
+    PyObject *res;
+    /* When seeking to the start of the stream, we call decoder.reset()
+       rather than decoder.setstate().
+       This is for a few decoders such as utf-16 for which the state value
+       at start is not (b"", 0) but e.g. (b"", 2) (meaning, in the case of
+       utf-16, that we are expecting a BOM).
+    */
+    if (cookie->start_pos == 0 && cookie->dec_flags == 0)
+        res = PyObject_CallMethod(self->decoder, "reset", NULL);
+    else
+        res = PyObject_CallMethod(self->decoder, "setstate",
+                                  "((yi))", "", cookie->dec_flags);
+    if (res == NULL)
+        return -1;
+    Py_DECREF(res);
+    return 0;
+}
+
 static PyObject *
 TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
 {
@@ -1576,13 +1713,9 @@
     Py_CLEAR(self->snapshot);
 
     /* Restore the decoder to its state from the safe start point. */
-    if (self->decoder || cookie.dec_flags || cookie.chars_to_skip) {
-        res = PyObject_CallMethod(self->decoder, "setstate",
-                                  "((yi))", "", cookie.dec_flags);
-        if (res == NULL)
+    if (self->decoder) {
+        if (_TextIOWrapper_decoder_setstate(self, &cookie) < 0)
             goto fail;
-        Py_DECREF(res);
-
         self->snapshot = Py_BuildValue("iy", cookie.dec_flags, "");
         if (self->snapshot == NULL)
             goto fail;
@@ -1703,11 +1836,8 @@
         goto fail;
 
     /* Note our initial start point. */
-    res = PyObject_CallMethod(self->decoder, "setstate",
-                              "((yi))", "", cookie.dec_flags);
-    if (res == NULL)
+    if (_TextIOWrapper_decoder_setstate(self, &cookie) < 0)
         goto fail;
-    Py_DECREF(res);
 
     /* Feed the decoder one byte at a time.  As we go, note the
      * nearest "safe start point" before the current location