[Python-checkins] r68518 - sandbox/trunk/io-c/_textio.c

Sun Jan 11 02:38:50 CET 2009

Author: antoine.pitrou
Date: Sun Jan 11 02:38:50 2009
New Revision: 68518

Log:
Text IO changes:
- readline() twice as fast
- introduce a CHECK_CLOSED macro
- introduce a CHECK_INITIALIZED macro



Modified:
   sandbox/trunk/io-c/_textio.c

Modified: sandbox/trunk/io-c/_textio.c
==============================================================================

--- sandbox/trunk/io-c/_textio.c	(original)
+++ sandbox/trunk/io-c/_textio.c	Sun Jan 11 02:38:50 2009
@@ -450,6 +450,7 @@
 typedef struct
 {
     PyObject_HEAD
+    int ok; /* initialized? */
     Py_ssize_t chunk_size;
     PyObject *buffer;
     PyObject *encoding;
@@ -492,6 +493,7 @@
     PyObject *res;
     int r;
 
+    self->ok = 0;
     if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|zzzi:fileio",
                                      kwlist, &buffer, &encoding, &errors,
                                      &newline, &line_buffering))
@@ -609,6 +611,7 @@
     self->seekable = self->telling = PyObject_IsTrue(res);
     Py_DECREF(res);
 
+    self->ok = 1;
     return 0;
 
   error:
@@ -620,7 +623,7 @@
 {
     PyObject *res;
     /* XXX this is inelegant */
-    if (Py_TYPE(self)->tp_del == NULL) {
+    if (Py_TYPE(self)->tp_del == NULL && self->ok) {
         /* We need to resurrect the object as calling close() can invoke
            arbitrary code. */
         ((PyObject *) self)->ob_refcnt++;
@@ -635,6 +638,7 @@
         if (--((PyObject *) self)->ob_refcnt != 0)
             return;
     }
+    self->ok = 0;
     Py_CLEAR(self->buffer);
     Py_CLEAR(self->encoding);
     Py_CLEAR(self->encoder);
@@ -648,6 +652,42 @@
     Py_TYPE(self)->tp_free((PyObject *)self);
 }
 
+static PyObject *
+TextIOWrapper_closed_get(PyTextIOWrapperObject *self, void *context);
+
+/* Fast path: for exact TextIOWrapper instances, call the `closed` getter
+   directly instead of going through _PyIOBase_checkClosed(). We could
+   specialize further by detecting that the buffer is a BufferedObject. */
+#define CHECK_CLOSED(self) \
+    do { \
+        int r; \
+        PyObject *_res; \
+        if (Py_TYPE(self) == &PyTextIOWrapper_Type) { \
+            _res = TextIOWrapper_closed_get(self, NULL); \
+            if (_res == NULL) \
+                return NULL; \
+            r = PyObject_IsTrue(_res); \
+            Py_DECREF(_res); \
+            if (r < 0) \
+                return NULL; \
+            if (r > 0) { \
+                PyErr_SetString(PyExc_ValueError, \
+                                "I/O operation on closed file."); \
+                return NULL; \
+            } \
+        } \
+        else if (_PyIOBase_checkClosed((PyObject *)self, Py_True) == NULL) \
+            return NULL; \
+    } while (0)
+
+#define CHECK_INITIALIZED(self) \
+    if (self->ok <= 0) { \
+        PyErr_SetString(PyExc_ValueError, \
+            "I/O operation on uninitialized object"); \
+        return NULL; \
+    }
+
+
 Py_LOCAL_INLINE(const Py_UNICODE *)
 findchar(const Py_UNICODE *s, Py_ssize_t size, Py_UNICODE ch)
 {
@@ -670,12 +710,13 @@
     int haslf = 0;
     int needflush = 0;
 
+    CHECK_INITIALIZED(self);
+
     if (!PyArg_ParseTuple(args, "U:write", &text)) {
         return NULL;
     }
 
-    if (_PyIOBase_checkClosed((PyObject *)self, Py_True) == NULL)
-        return NULL;
+    CHECK_CLOSED(self);
 
     Py_INCREF(text);
 
@@ -873,15 +914,20 @@
 
 }
 
+
 static PyObject *
 TextIOWrapper_read(PyTextIOWrapperObject *self, PyObject *args)
 {
     Py_ssize_t n = -1;
     PyObject *result;
 
+    CHECK_INITIALIZED(self);
+
     if (!PyArg_ParseTuple(args, "|n:read", &n))
         return NULL;
 
+    CHECK_CLOSED(self);
+
     if (n < 0) {
         /* Read everything */
         PyObject *bytes = PyObject_CallMethod(self->buffer, "read", NULL);
@@ -935,9 +981,10 @@
     return NULL;
 }
 
-/* It is assumed that end points to the real end of the Py_UNICODE storage,
-   that is to the NUL character. Otherwise the function will produce incorrect
-   results. */
+
+/* NOTE: `end` must point to the real end of the Py_UNICODE storage,
+   that is to the NUL character. Otherwise the function will produce
+   incorrect results. */
 static Py_UNICODE *
 find_control_char(Py_UNICODE *start, Py_UNICODE *end, Py_UNICODE ch)
 {
@@ -953,151 +1000,236 @@
     }
 }
 
-static PyObject *
-_TextIOWrapper_readline(PyTextIOWrapperObject *self, Py_ssize_t limit)
+/* Finds the first line ending between `start` and `end`.
+   If not found, returns -1 and sets (*consumed) to the number of characters
+   (counted from `start`) which can be safely put aside before another search.
+   If found, returns the offset from `start` of the character just after the
+   line ending, and doesn't touch (*consumed).
+
+   NOTE: `end` must point to the real end of the Py_UNICODE storage,
+   that is to the NUL character. Otherwise the function will produce
+   incorrect results. */
+static Py_ssize_t
+find_line_ending(PyTextIOWrapperObject *self,
+                 Py_UNICODE *start, Py_UNICODE *end, Py_ssize_t *consumed)
 {
-    PyObject *line;
-    Py_ssize_t start, endpos;
-    int res;
-
-    if (_PyIOBase_checkClosed((PyObject *)self, Py_True) == NULL)
-        return NULL;
+    Py_ssize_t len = end - start;
 
-    /* Grab all the decoded text (we will rewind any extra bits later). */
-    line = TextIOWrapper_get_decoded_chars(self, -1);
-    if (line == NULL)
-        return NULL;
-
-    start = 0;
-
-    endpos = -1;
-
-    while (1) {
-        Py_UNICODE* ptr = PyUnicode_AS_UNICODE(line);
-        if (self->readtranslate) {
-            /* Newlines are already translated, only search for \n */
-            Py_UNICODE *pos = find_control_char(ptr + start,
-                                                ptr + PyUnicode_GET_SIZE(line),
-                                                '\n');
-            if (pos != NULL) {
-                endpos = pos - ptr + 1;
-                break;
-            }
-            else
-                start = PyUnicode_GET_SIZE(line);
+    if (self->readtranslate) {
+        /* Newlines are already translated, only search for \n */
+        Py_UNICODE *pos = find_control_char(start, end, '\n');
+        if (pos != NULL)
+            return pos - start + 1;
+        else {
+            *consumed = len;
+            return -1;
         }
-        else if (self->readuniversal) {
-            /* Universal newline search. Find any of \r, \r\n, \n
-             * The decoder ensures that \r\n are not split in two pieces
-             */
-            Py_UNICODE *s = ptr + start;
-            Py_UNICODE *e = ptr + PyUnicode_GET_SIZE(line);
-            for (;;) {
-                Py_UNICODE ch;
-                /* Fast path for non-control chars. The loop always ends
-                   since the Py_UNICODE storage is NUL-terminated. */
-                while (*s > '\r')
-                    s++;
-                if (s == e)
-                    goto _universal_not_found;
-                ch = *s++;
-                if (ch == '\n') {
-                    endpos = s - ptr;
-                    break;
-                }
-                if (ch == '\r') {
-                    if (*s == '\n')
-                        endpos = s - ptr + 1;
-                    else
-                        endpos = s - ptr;
-                    break;
-                }
+    }
+    else if (self->readuniversal) {
+        /* Universal newline search. Find any of \r, \r\n, \n
+         * The decoder ensures that \r\n are not split in two pieces
+         */
+        Py_UNICODE *s = start;
+        for (;;) {
+            Py_UNICODE ch;
+            /* Fast path for non-control chars. The loop always ends
+               since the Py_UNICODE storage is NUL-terminated. */
+            while (*s > '\r')
+                s++;
+            if (s >= end) {
+                *consumed = len;
+                return -1;
             }
-            break;
-          _universal_not_found:
-            start = PyUnicode_GET_SIZE(line);
+            ch = *s++;
+            if (ch == '\n')
+                return s - start;
+            if (ch == '\r') {
+                if (*s == '\n')
+                    return s - start + 1;
+                else
+                    return s - start;
+            }
+        }
+    }
+    else {
+        /* Non-universal mode. */
+        Py_ssize_t readnl_len = PyUnicode_GET_SIZE(self->readnl);
+        Py_UNICODE *nl = PyUnicode_AS_UNICODE(self->readnl);
+        if (readnl_len == 1) {
+            Py_UNICODE *pos = find_control_char(start, end, nl[0]);
+            if (pos != NULL)
+                return pos - start + 1;
+            *consumed = len;
+            return -1;
         }
         else {
-            /* Non-universal mode. */
-            Py_ssize_t readnl_len = PyUnicode_GET_SIZE(self->readnl);
-            Py_ssize_t line_len = PyUnicode_GET_SIZE(line);
-            if (readnl_len <= line_len) {
-                if (readnl_len == 1) {
-                    Py_UNICODE *pos = find_control_char(
-                            ptr + start, 
-                            ptr + line_len,
-                            PyUnicode_AS_UNICODE(self->readnl)[0]);
-                    if (pos != NULL) {
-                        endpos = pos - ptr + 1;
-                        break;
-                    }
-                    start = PyUnicode_GET_SIZE(line);
-                }
-                else {
-                    Py_ssize_t pos = PyUnicode_Find(line, self->readnl,
-                                                    start, line_len, 1);
-                    if (pos >= 0) {
-                        endpos = pos + readnl_len;
+            Py_UNICODE *s = start;
+            Py_UNICODE *e = end - readnl_len + 1;
+            Py_UNICODE *pos;
+            if (e < s)
+                e = s;
+            while (s < e) {
+                Py_ssize_t i;
+                Py_UNICODE *pos = find_control_char(s, end, nl[0]);
+                if (pos == NULL || pos >= e)
+                    break;
+                for (i = 1; i < readnl_len; i++) {
+                    if (pos[i] != nl[i])
                         break;
-                    }
-                    start = line_len - readnl_len + 1;
                 }
+                if (i == readnl_len)
+                    return pos - start + readnl_len;
+                s = pos + 1;
             }
+            pos = find_control_char(e, end, nl[0]);
+            if (pos == NULL)
+                *consumed = len;
+            else
+                *consumed = pos - start;
+            return -1;
         }
+    }
+}
 
-        if (limit >= 0 && PyUnicode_GET_SIZE(line) >= limit) {
-            /* reached length limit */
-            endpos = limit;
-            break;
-        }
+static PyObject *
+_TextIOWrapper_readline(PyTextIOWrapperObject *self, Py_ssize_t limit)
+{
+    PyObject *line = NULL, *chunks = NULL, *remaining = NULL;
+    Py_ssize_t start, endpos, chunked, offset_to_buffer;
+    int res;
+
+    CHECK_CLOSED(self);
+
+    chunked = 0;
 
-        /* No line ending seen yet - get more data */
-        while (1) {
+    while (1) {
+        Py_UNICODE *ptr;
+        Py_ssize_t line_len;
+        Py_ssize_t consumed = 0;
+
+        /* First, get some data if necessary */
+        res = 1;
+        while (!self->decoded_chars ||
+               !PyUnicode_GET_SIZE(self->decoded_chars)) {
             res = TextIOWrapper_read_chunk(self);
             if (res < 0)
                 goto error;
             if (res == 0)
                 break;
-            if (self->decoded_chars &&
-                PyUnicode_GET_SIZE(self->decoded_chars))
-                break;
         }
         if (res == 0) {
             /* end of file */
             TextIOWrapper_set_decoded_chars(self, NULL);
             Py_CLEAR(self->snapshot);
-            return line;
+            start = endpos = offset_to_buffer = 0;
+            break;
+        }
+
+        if (remaining == NULL) {
+            line = self->decoded_chars;
+            start = self->decoded_chars_used;
+            offset_to_buffer = 0;
+            Py_INCREF(line);
         }
         else {
-            PyUnicode_AppendAndDel(&line,
-                                   TextIOWrapper_get_decoded_chars(
-                                       self, -1));
+            assert(self->decoded_chars_used == 0);
+            line = PyUnicode_Concat(remaining, self->decoded_chars);
+            start = 0;
+            offset_to_buffer = PyUnicode_GET_SIZE(remaining);
+            Py_CLEAR(remaining);
             if (line == NULL)
                 goto error;
         }
-    }
 
-    if (limit >= 0 && endpos > limit)
-        endpos = limit; /* don't exceed limit */
+        ptr = PyUnicode_AS_UNICODE(line);
+        line_len = PyUnicode_GET_SIZE(line);
 
-    /* Rewind decoded_chars to just after the line ending we found. */
-    TextIOWrapper_rewind_decoded_chars(
-        self, PyUnicode_GET_SIZE(line) - endpos);
-
-    if (PyUnicode_GET_SIZE(line) != endpos) {
-        PyObject *resized = PyUnicode_FromUnicode(
-            PyUnicode_AS_UNICODE(line), endpos);
+        endpos = find_line_ending(self, ptr + start, ptr + line_len, &consumed);
+        if (endpos >= 0) {
+            endpos += start;
+            if (limit >= 0 && (endpos - start) + chunked >= limit)
+                endpos = start + limit - chunked;
+            break;
+        }
 
-        if (resized == NULL)
-            goto error;
+        /* We can put aside up to `endpos` */
+        endpos = consumed + start;
+        if (limit >= 0 && (endpos - start) + chunked >= limit) {
+            /* Didn't find line ending, but reached length limit */
+            endpos = start + limit - chunked;
+            break;
+        }
 
-        Py_DECREF(line);
-        line = resized;
+        if (endpos > start) {
+            /* No line ending seen yet - put aside current data */
+            PyObject *s;
+            if (chunks == NULL) {
+                chunks = PyList_New(0);
+                if (chunks == NULL)
+                    goto error;
+            }
+            s = PyUnicode_FromUnicode(ptr + start, endpos - start);
+            if (s == NULL)
+                goto error;
+            if (PyList_Append(chunks, s) < 0) {
+                Py_DECREF(s);
+                goto error;
+            }
+            chunked += PyUnicode_GET_SIZE(s);
+            Py_DECREF(s);
+        }
+        /* There may be some remaining characters we'll have to prepend to
+           the next chunk of data */
+        if (endpos < line_len) {
+            remaining = PyUnicode_FromUnicode(
+                    ptr + endpos, line_len - endpos);
+            if (remaining == NULL)
+                goto error;
+        }
+        Py_CLEAR(line);
+        /* We have consumed the buffer */
+        TextIOWrapper_set_decoded_chars(self, NULL);
+    }
+
+    if (line != NULL) {
+        /* Our line ends in the current buffer */
+        self->decoded_chars_used = endpos - offset_to_buffer;
+        if (start > 0 || endpos < PyUnicode_GET_SIZE(line)) {
+            if (start == 0 && Py_REFCNT(line) == 1) {
+                if (PyUnicode_Resize(&line, endpos) < 0)
+                    goto error;
+            }
+            else {
+                PyObject *s = PyUnicode_FromUnicode(
+                        PyUnicode_AS_UNICODE(line) + start, endpos - start);
+                Py_CLEAR(line);
+                if (s == NULL)
+                    goto error;
+                line = s;
+            }
+        }
+    }
+    if (chunks != NULL) {
+        if (remaining != NULL && PyList_Append(chunks, remaining) < 0)
+            goto error;
+        Py_CLEAR(remaining);
+        if (line != NULL && PyList_Append(chunks, line) < 0)
+            goto error;
+        Py_CLEAR(line);
+        line = PyUnicode_Join(PyUnicode_FromStringAndSize(NULL, 0), chunks);
+        if (line == NULL)
+            goto error;
+        Py_DECREF(chunks);
     }
+    if (line == NULL)
+        line = PyUnicode_FromStringAndSize(NULL, 0);
+
     return line;
 
   error:
-    Py_DECREF(line);
+    Py_XDECREF(chunks);
+    Py_XDECREF(remaining);
+    Py_XDECREF(line);
     return NULL;
 }
 
@@ -1106,6 +1238,7 @@
 {
     Py_ssize_t limit = -1;
 
+    CHECK_INITIALIZED(self);
     if (!PyArg_ParseTuple(args, "|n:readline", &limit)) {
         return NULL;
     }
@@ -1234,6 +1367,8 @@
     PyObject *res;
     int cmp;
 
+    CHECK_INITIALIZED(self);
+
     if (zero == NULL) {
         zero = PyLong_FromLong(0L);
         if (zero == NULL)
@@ -1244,8 +1379,7 @@
         return NULL;
     Py_INCREF(cookieObj);
 
-    if (_PyIOBase_checkClosed((PyObject *)self, Py_True) == NULL)
-        goto fail;
+    CHECK_CLOSED(self);
 
     if (!self->seekable) {
         PyErr_SetString(PyExc_IOError,
@@ -1404,6 +1538,9 @@
     PyObject *saved_state = NULL;
     char *input, *input_end;
 
+    CHECK_INITIALIZED(self);
+    CHECK_CLOSED(self);
+
     if (!self->seekable) {
         PyErr_SetString(PyExc_IOError,
                         "underlying stream is not seekable");
@@ -1561,36 +1698,43 @@
 static PyObject *
 TextIOWrapper_fileno(PyTextIOWrapperObject *self, PyObject *args)
 {
+    CHECK_INITIALIZED(self);
     return PyObject_CallMethod(self->buffer, "fileno", NULL);
 }
 
 static PyObject *
 TextIOWrapper_seekable(PyTextIOWrapperObject *self, PyObject *args)
 {
+    CHECK_INITIALIZED(self);
     return PyObject_CallMethod(self->buffer, "seekable", NULL);
 }
 
 static PyObject *
 TextIOWrapper_readable(PyTextIOWrapperObject *self, PyObject *args)
 {
+    CHECK_INITIALIZED(self);
     return PyObject_CallMethod(self->buffer, "readable", NULL);
 }
 
 static PyObject *
 TextIOWrapper_writable(PyTextIOWrapperObject *self, PyObject *args)
 {
+    CHECK_INITIALIZED(self);
     return PyObject_CallMethod(self->buffer, "writable", NULL);
 }
 
 static PyObject *
 TextIOWrapper_isatty(PyTextIOWrapperObject *self, PyObject *args)
 {
+    CHECK_INITIALIZED(self);
     return PyObject_CallMethod(self->buffer, "isatty", NULL);
 }
 
 static PyObject *
 TextIOWrapper_flush(PyTextIOWrapperObject *self, PyObject *args)
 {
+    CHECK_INITIALIZED(self);
+    CHECK_CLOSED(self);
     self->telling = self->seekable;
     return PyObject_CallMethod(self->buffer, "flush", NULL);
 }
@@ -1598,7 +1742,9 @@
 static PyObject *
 TextIOWrapper_close(PyTextIOWrapperObject *self, PyObject *args)
 {
-    PyObject *res = PyObject_CallMethod((PyObject *)self, "flush", NULL);
+    PyObject *res;
+    CHECK_INITIALIZED(self);
+    res = PyObject_CallMethod((PyObject *)self, "flush", NULL);
     if (res == NULL) {
         /* If flush() fails, just give up */
         PyErr_Clear();
@@ -1614,8 +1760,9 @@
 {
     PyObject *line;
 
-    self->telling = 0;
+    CHECK_INITIALIZED(self);
 
+    self->telling = 0;
     if (Py_TYPE(self) == &PyTextIOWrapper_Type) {
         /* Skip method call overhead for speed */
         line = _TextIOWrapper_readline(self, -1);
@@ -1645,12 +1792,14 @@
 static PyObject *
 TextIOWrapper_name_get(PyTextIOWrapperObject *self, void *context)
 {
+    CHECK_INITIALIZED(self);
     return PyObject_GetAttrString(self->buffer, "name");
 }
 
 static PyObject *
 TextIOWrapper_closed_get(PyTextIOWrapperObject *self, void *context)
 {
+    CHECK_INITIALIZED(self);
     return PyObject_GetAttr(self->buffer, _PyIO_str_closed);
 }
 
@@ -1658,6 +1807,7 @@
 TextIOWrapper_newlines_get(PyTextIOWrapperObject *self, void *context)
 {
     PyObject *res;
+    CHECK_INITIALIZED(self);
     if (self->decoder == NULL)
         Py_RETURN_NONE;
     res = PyObject_GetAttr(self->decoder, _PyIO_str_newlines);