[Python-checkins] r68510 - sandbox/trunk/io-c/_textio.c

Sat Jan 10 23:35:55 CET 2009

Author: antoine.pitrou
Date: Sat Jan 10 23:35:55 2009
New Revision: 68510

Log:
Slightly faster newline detection



Modified:
   sandbox/trunk/io-c/_textio.c

Modified: sandbox/trunk/io-c/_textio.c
==============================================================================

--- sandbox/trunk/io-c/_textio.c	(original)
+++ sandbox/trunk/io-c/_textio.c	Sat Jan 10 23:35:55 2009
@@ -180,7 +180,7 @@
        all in one pass. */
     {
         Py_UNICODE *in_str;
-        Py_ssize_t in, len;
+        Py_ssize_t len;
         int seennl = self->seennl;
         int only_lf;
 
@@ -197,26 +197,33 @@
                 && !memchr(in_str, '\r', len * sizeof(Py_UNICODE)));
 
         if (!self->translate) {
+            Py_UNICODE *s, *end;
             if (seennl == SEEN_ALL)
                 goto endscan;
             if (only_lf)
                 goto endscan;
-            for (in = 0; in < len;) {
-                Py_UNICODE c = in_str[in++];
-                if (c >= 0x20)
-                    continue;
+            s = in_str;
+            end = in_str + len;
+            for (;;) {
+                Py_UNICODE c;
+                /* Fast loop: skip every character above '\r' */
+                while (*s > '\r')
+                    s++;
+                c = *s++;
                 if (c == '\n')
                     seennl |= SEEN_LF;
                 else if (c == '\r') {
-                    if (in_str[in] == '\n') {
+                    if (*s == '\n') {
                         seennl |= SEEN_CRLF;
-                        in++;
+                        s++;
                     }
                     else
                         seennl |= SEEN_CR;
                 }
+                if (s > end)
+                    break;
                 if (seennl == SEEN_ALL)
-                    goto endscan;
+                    break;
             }
         endscan:
             ;
@@ -224,7 +231,7 @@
         else if (!only_lf) {
             PyObject *translated = NULL;
             Py_UNICODE *out_str;
-            Py_ssize_t out;
+            Py_UNICODE *in, *out, *end;
             if (Py_REFCNT(output) != 1) {
                 /* We could try to optimize this so that we only do a copy
                    when there is something to translate. On the other hand,
@@ -242,35 +249,39 @@
                 translated = output;
             }
             out_str = PyUnicode_AS_UNICODE(translated);
-            for (in = 0, out = 0; in < len;) {
-                Py_UNICODE c = in_str[in++];
-                if (c >= 0x20) {
-                    out_str[out++] = c;
-                    continue;
-                }
+            in = in_str;
+            out = out_str;
+            end = in_str + len;
+            for (;;) {
+                Py_UNICODE c;
+                /* Fast loop: copy every character above '\r' */
+                while ((c = *in++) > '\r')
+                    *out++ = c;
                 if (c == '\n') {
-                    out_str[out++] = c;
+                    *out++ = c;
                     seennl |= SEEN_LF;
                     continue;
                 }
-                if (c != '\r') {
-                    out_str[out++] = c;
+                if (c == '\r') {
+                    if (*in == '\n') {
+                        in++;
+                        seennl |= SEEN_CRLF;
+                    }
+                    else
+                        seennl |= SEEN_CR;
+                    *out++ = '\n';
                     continue;
                 }
-                if (in_str[in] == '\n') {
-                    in++;
-                    seennl |= SEEN_CRLF;
-                }
-                else
-                    seennl |= SEEN_CR;
-                out_str[out++] = '\n';
+                if (in > end)
+                    break;
+                *out++ = c;
             }
             if (translated != output) {
                 Py_DECREF(output);
                 output = translated;
             }
-            if (out != in) {
-                if (PyUnicode_Resize(&output, out) < 0)
+            if (out - out_str != len) {
+                if (PyUnicode_Resize(&output, out - out_str) < 0)
                     goto error;
             }
         }
@@ -924,28 +935,22 @@
     return NULL;
 }
 
+/* `end` must point to the real end of the Py_UNICODE storage, that is, to the
+   terminating NUL character: the inner scanning loop has no bounds check and
+   relies on it to stop. Otherwise the function produces incorrect results. */
 static Py_UNICODE *
-find_LF(Py_UNICODE *start, Py_UNICODE *end)
-{
-    Py_UNICODE *s = start;
-    while (s < end) {
-        if (*s == '\n')
-            return s;
-        s++;
-    }
-    return NULL;
-}
-
-static Py_UNICODE *
-find_CR(Py_UNICODE *start, Py_UNICODE *end)
+find_control_char(Py_UNICODE *start, Py_UNICODE *end, Py_UNICODE ch)
 {
     Py_UNICODE *s = start;
-    while (s < end) {
-        if (*s == '\r')
+    for (;;) {
+        while (*s > ch)
+            s++;
+        if (*s == ch)
             return s;
+        if (s == end)
+            return NULL;
         s++;
     }
-    return NULL;
 }
 
 static PyObject *
@@ -971,8 +976,9 @@
         Py_UNICODE* ptr = PyUnicode_AS_UNICODE(line);
         if (self->readtranslate) {
             /* Newlines are already translated, only search for \n */
-            Py_UNICODE *pos = find_LF(ptr + start,
-                                      ptr + PyUnicode_GET_SIZE(line));
+            Py_UNICODE *pos = find_control_char(ptr + start,
+                                                ptr + PyUnicode_GET_SIZE(line),
+                                                '\n');
             if (pos != NULL) {
                 endpos = pos - ptr + 1;
                 break;
@@ -984,53 +990,58 @@
             /* Universal newline search. Find any of \r, \r\n, \n
              * The decoder ensures that \r\n are not split in two pieces
              */
-
-            /* In C we'd look for these in parallel of course.
-             * XXX Hey!
-             */
-            Py_UNICODE* nlpos = find_LF(ptr + start,
-                                        ptr + PyUnicode_GET_SIZE(line));
-            Py_UNICODE* crpos = find_CR(ptr + start,
-                                        ptr + PyUnicode_GET_SIZE(line));
-            if (crpos == NULL) {
-                if (nlpos == NULL) {
-                    /* Nothing found */
-                    start = PyUnicode_GET_SIZE(line);
+            Py_UNICODE *s = ptr + start;
+            Py_UNICODE *e = ptr + PyUnicode_GET_SIZE(line);
+            for (;;) {
+                Py_UNICODE ch;
+                /* Fast path for non-control chars. The loop always ends
+                   since the Py_UNICODE storage is NUL-terminated. */
+                while (*s > '\r')
+                    s++;
+                if (s == e)
+                    goto _universal_not_found;
+                ch = *s++;
+                if (ch == '\n') {
+                    endpos = s - ptr;
+                    break;
                 }
-                else {
-                    /* Found \n */
-                    endpos = nlpos - ptr + 1;
+                if (ch == '\r') {
+                    if (*s == '\n')
+                        endpos = s - ptr + 1;
+                    else
+                        endpos = s - ptr;
                     break;
                 }
             }
-            else if (nlpos == NULL) {
-                /* Found lone \r */
-                endpos = crpos - ptr + 1;
-                break;
-            }
-            else if (nlpos < crpos) {
-                /* Found \n */
-                endpos = nlpos - ptr + 1;
-                break;
-            }
-            else if (nlpos == crpos + 1) {
-                /* Found \r\n */
-                endpos = crpos - ptr + 2;
-                break;
-            }
-            else {
-                /* Found \r */
-                endpos = crpos - ptr + 1;
-                break;
-            }
+            break;
+          _universal_not_found:
+            start = PyUnicode_GET_SIZE(line);
         }
         else {
-            /* non-universal */
-            Py_ssize_t pos = PyUnicode_Find(line, self->readnl,
-                                            start, -1, 1);
-            if (pos >= 0) {
-                endpos = pos + PyUnicode_GET_SIZE(self->readnl);
-                break;
+            /* Non-universal mode. */
+            Py_ssize_t readnl_len = PyUnicode_GET_SIZE(self->readnl);
+            Py_ssize_t line_len = PyUnicode_GET_SIZE(line);
+            if (readnl_len <= line_len) {
+                if (readnl_len == 1) {
+                    Py_UNICODE *pos = find_control_char(
+                            ptr + start, 
+                            ptr + line_len,
+                            PyUnicode_AS_UNICODE(self->readnl)[0]);
+                    if (pos != NULL) {
+                        endpos = pos - ptr + 1;
+                        break;
+                    }
+                    start = PyUnicode_GET_SIZE(line);
+                }
+                else {
+                    Py_ssize_t pos = PyUnicode_Find(line, self->readnl,
+                                                    start, line_len, 1);
+                    if (pos >= 0) {
+                        endpos = pos + readnl_len;
+                        break;
+                    }
+                    start = line_len - readnl_len + 1;
+                }
             }
         }