[Python-checkins] r68626 - sandbox/trunk/io-c/_textio.c
antoine.pitrou
python-checkins at python.org
Fri Jan 16 03:24:24 CET 2009
Author: antoine.pitrou
Date: Fri Jan 16 03:24:23 2009
New Revision: 68626
Log:
Speedup small writes a lot for a few common encodings
+ fix utf-16 BOM read problem when seeking to the start of the file
Modified:
sandbox/trunk/io-c/_textio.c
Modified: sandbox/trunk/io-c/_textio.c
==============================================================================
--- sandbox/trunk/io-c/_textio.c (original)
+++ sandbox/trunk/io-c/_textio.c Fri Jan 16 03:24:23 2009
@@ -428,7 +428,6 @@
PyType_GenericNew, /* tp_new */
};
-
/* TextIOWrapper */
@@ -455,6 +454,9 @@
"write contains a newline character."
);
+typedef PyObject * /* signature of a specialized encoder: called as (self, text) */
+ (*encodefunc_t)(PyObject *, PyObject *);
+
typedef struct
{
PyObject_HEAD
@@ -465,6 +467,7 @@
PyObject *encoder;
PyObject *decoder;
PyObject *readnl;
+ PyObject *errors;
const char *writenl; /* utf-8 encoded, NULL stands for \n */
int line_buffering:1;
int readuniversal:1;
@@ -472,6 +475,8 @@
int writetranslate:1;
int seekable:1;
int telling:1;
+ /* Specialized encoding func (see below) */
+ encodefunc_t encodefunc;
/* Reads and writes are internally buffered in order to speed things up.
However, any read will first flush the write buffer if it isn't empty.
@@ -498,6 +503,87 @@
PyObject *dict;
} PyTextIOWrapperObject;
+
+/* A couple of specialized cases in order to bypass the slow incremental
+ encoding methods for the most popular encodings. */
+
+static PyObject *
+ascii_encode(PyTextIOWrapperObject *self, PyObject *text)
+{ /* Encode `text` via PyUnicode_EncodeASCII, passing the errors string kept in self->errors. */
+ return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(text),
+ PyUnicode_GET_SIZE(text),
+ PyBytes_AS_STRING(self->errors));
+}
+
+static PyObject *
+utf16be_encode(PyTextIOWrapperObject *self, PyObject *text)
+{ /* Encode `text` via PyUnicode_EncodeUTF16 with byteorder 1 (big-endian, per this function's name). */
+ return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text),
+ PyUnicode_GET_SIZE(text),
+ PyBytes_AS_STRING(self->errors), 1);
+}
+
+static PyObject *
+utf16le_encode(PyTextIOWrapperObject *self, PyObject *text)
+{ /* Encode `text` via PyUnicode_EncodeUTF16 with byteorder -1 (little-endian, per this function's name). */
+ return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text),
+ PyUnicode_GET_SIZE(text),
+ PyBytes_AS_STRING(self->errors), -1);
+}
+
+static PyObject *
+utf16_encode(PyTextIOWrapperObject *self, PyObject *text)
+{ /* Encode `text` with byteorder 0, then repoint self->encodefunc at a fixed-order variant. */
+ PyObject *res;
+ res = PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(text),
+ PyUnicode_GET_SIZE(text),
+ PyBytes_AS_STRING(self->errors), 0);
+ if (res == NULL)
+ return NULL; /* encoding failed: self->encodefunc is left untouched */
+ /* Next writes will skip the BOM and use native byte ordering */
+#if defined(WORDS_BIGENDIAN)
+ self->encodefunc = (encodefunc_t) utf16be_encode;
+#else
+ self->encodefunc = (encodefunc_t) utf16le_encode;
+#endif
+ return res; /* result of this first call, encoded with byteorder 0 */
+}
+
+
+static PyObject *
+utf8_encode(PyTextIOWrapperObject *self, PyObject *text)
+{ /* Encode `text` via PyUnicode_EncodeUTF8, passing the errors string kept in self->errors. */
+ return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(text),
+ PyUnicode_GET_SIZE(text),
+ PyBytes_AS_STRING(self->errors));
+}
+
+static PyObject *
+latin1_encode(PyTextIOWrapperObject *self, PyObject *text)
+{ /* Encode `text` via PyUnicode_EncodeLatin1, passing the errors string kept in self->errors. */
+ return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(text),
+ PyUnicode_GET_SIZE(text),
+ PyBytes_AS_STRING(self->errors));
+}
+
+/* Map normalized encoding names onto the specialized encoding funcs */
+
+typedef struct {
+ const char *name; /* codec name, matched against the looked-up codec's "name" attribute */
+ encodefunc_t encodefunc; /* specialized encoder installed when the name matches */
+} encodefuncentry;
+
+static encodefuncentry encodefuncs[] = { /* file-local table; scanned until the NULL-name sentinel */
+ {"ascii", (encodefunc_t) ascii_encode},
+ {"iso8859-1", (encodefunc_t) latin1_encode},
+ {"utf-16-be", (encodefunc_t) utf16be_encode},
+ {"utf-16-le", (encodefunc_t) utf16le_encode},
+ {"utf-16", (encodefunc_t) utf16_encode},
+ {"utf-8", (encodefunc_t) utf8_encode},
+ {NULL, NULL} /* sentinel: ends the lookup loop */
+};
+
+
static int
TextIOWrapper_init(PyTextIOWrapperObject *self, PyObject *args, PyObject *kwds)
{
@@ -536,14 +622,16 @@
Py_CLEAR(self->decoded_chars);
Py_CLEAR(self->pending_bytes);
Py_CLEAR(self->snapshot);
+ Py_CLEAR(self->errors);
self->decoded_chars_used = 0;
self->pending_bytes_count = 0;
+ self->encodefunc = NULL;
if (encoding == NULL) {
/* Try os.device_encoding(fileno) */
PyObject *os = PyImport_ImportModule("os");
if (os == NULL)
- return -1;
+ goto error;
self->encoding = PyObject_CallMethod(
os, "device_encoding", "N",
PyObject_CallMethod(buffer, "fileno", NULL));
@@ -566,7 +654,7 @@
locale, "getpreferredencoding", NULL);
Py_DECREF(locale);
if (self->encoding == NULL)
- return -1;
+ goto error;
if (!PyUnicode_Check(self->encoding))
Py_CLEAR(self->encoding);
}
@@ -576,6 +664,9 @@
if (errors == NULL)
errors = "strict";
+ self->errors = PyBytes_FromString(errors);
+ if (self->errors == NULL)
+ goto error;
self->chunk_size = 8192;
self->readuniversal = (newline == NULL || newline[0] == '\0');
@@ -628,10 +719,30 @@
if (r == -1)
goto error;
if (r == 1) {
+ PyObject *ci;
self->encoder = PyCodec_IncrementalEncoder(
encoding, errors);
if (self->encoder == NULL)
goto error;
+ /* Get the normalized name of the codec */
+ ci = _PyCodec_Lookup(encoding);
+ if (ci == NULL)
+ goto error;
+ res = PyObject_GetAttrString(ci, "name");
+ Py_DECREF(ci);
+ if (res == NULL)
+ PyErr_Clear();
+ else if (PyUnicode_Check(res)) {
+ encodefuncentry *e = encodefuncs;
+ while (e->name != NULL) {
+ if (!PyUnicode_CompareWithASCIIString(res, e->name)) {
+ self->encodefunc = e->encodefunc;
+ break;
+ }
+ e++;
+ }
+ Py_DECREF(res);
+ }
}
self->buffer = buffer;
@@ -805,7 +916,11 @@
needflush = 1;
/* XXX What if we were just reading? */
- b = PyObject_CallMethodObjArgs(self->encoder, _PyIO_str_encode, text, NULL);
+ if (self->encodefunc != NULL)
+ b = (*self->encodefunc)((PyObject *) self, text);
+ else
+ b = PyObject_CallMethodObjArgs(self->encoder,
+ _PyIO_str_encode, text, NULL);
Py_DECREF(text);
if (b == NULL)
return NULL;
@@ -1458,6 +1573,28 @@
}
#undef IS_LITTLE_ENDIAN
+static int
+_TextIOWrapper_decoder_setstate(PyTextIOWrapperObject *self,
+ CookieStruct *cookie)
+{ /* Put self->decoder into the state described by `cookie`; returns 0, or -1 if the call failed. */
+ PyObject *res;
+ /* When seeking to the start of the stream, we call decoder.reset()
+ rather than decoder.setstate().
+ This is for a few decoders such as utf-16 for which the state value
+ at start is not (b"", 0) but e.g. (b"", 2) (meaning, in the case of
+ utf-16, that we are expecting a BOM).
+ */
+ if (cookie->start_pos == 0 && cookie->dec_flags == 0)
+ res = PyObject_CallMethod(self->decoder, "reset", NULL);
+ else
+ res = PyObject_CallMethod(self->decoder, "setstate",
+ "((yi))", "", cookie->dec_flags);
+ if (res == NULL)
+ return -1;
+ Py_DECREF(res); /* only success matters; the call's result object is discarded */
+ return 0;
+}
+
static PyObject *
TextIOWrapper_seek(PyTextIOWrapperObject *self, PyObject *args)
{
@@ -1576,13 +1713,9 @@
Py_CLEAR(self->snapshot);
/* Restore the decoder to its state from the safe start point. */
- if (self->decoder || cookie.dec_flags || cookie.chars_to_skip) {
- res = PyObject_CallMethod(self->decoder, "setstate",
- "((yi))", "", cookie.dec_flags);
- if (res == NULL)
+ if (self->decoder) {
+ if (_TextIOWrapper_decoder_setstate(self, &cookie) < 0)
goto fail;
- Py_DECREF(res);
-
self->snapshot = Py_BuildValue("iy", cookie.dec_flags, "");
if (self->snapshot == NULL)
goto fail;
@@ -1703,11 +1836,8 @@
goto fail;
/* Note our initial start point. */
- res = PyObject_CallMethod(self->decoder, "setstate",
- "((yi))", "", cookie.dec_flags);
- if (res == NULL)
+ if (_TextIOWrapper_decoder_setstate(self, &cookie) < 0)
goto fail;
- Py_DECREF(res);
/* Feed the decoder one byte at a time. As we go, note the
* nearest "safe start point" before the current location
More information about the Python-checkins
mailing list