[Python-checkins] r68510 - sandbox/trunk/io-c/_textio.c
antoine.pitrou
python-checkins at python.org
Sat Jan 10 23:35:55 CET 2009
Author: antoine.pitrou
Date: Sat Jan 10 23:35:55 2009
New Revision: 68510
Log:
Slightly faster newline detection
Modified:
sandbox/trunk/io-c/_textio.c
Modified: sandbox/trunk/io-c/_textio.c
==============================================================================
--- sandbox/trunk/io-c/_textio.c (original)
+++ sandbox/trunk/io-c/_textio.c Sat Jan 10 23:35:55 2009
@@ -180,7 +180,7 @@
all in one pass. */
{
Py_UNICODE *in_str;
- Py_ssize_t in, len;
+ Py_ssize_t len;
int seennl = self->seennl;
int only_lf;
@@ -197,26 +197,33 @@
&& !memchr(in_str, '\r', len * sizeof(Py_UNICODE)));
if (!self->translate) {
+ Py_UNICODE *s, *end;
if (seennl == SEEN_ALL)
goto endscan;
if (only_lf)
goto endscan;
- for (in = 0; in < len;) {
- Py_UNICODE c = in_str[in++];
- if (c >= 0x20)
- continue;
+ s = in_str;
+ end = in_str + len;
+ for (;;) {
+ Py_UNICODE c;
+ /* Fast loop for non-control characters */
+ while (*s > '\r')
+ s++;
+ c = *s++;
if (c == '\n')
seennl |= SEEN_LF;
else if (c == '\r') {
- if (in_str[in] == '\n') {
+ if (*s == '\n') {
seennl |= SEEN_CRLF;
- in++;
+ s++;
}
else
seennl |= SEEN_CR;
}
+ if (s > end)
+ break;
if (seennl == SEEN_ALL)
- goto endscan;
+ break;
}
endscan:
;
@@ -224,7 +231,7 @@
else if (!only_lf) {
PyObject *translated = NULL;
Py_UNICODE *out_str;
- Py_ssize_t out;
+ Py_UNICODE *in, *out, *end;
if (Py_REFCNT(output) != 1) {
/* We could try to optimize this so that we only do a copy
when there is something to translate. On the other hand,
@@ -242,35 +249,39 @@
translated = output;
}
out_str = PyUnicode_AS_UNICODE(translated);
- for (in = 0, out = 0; in < len;) {
- Py_UNICODE c = in_str[in++];
- if (c >= 0x20) {
- out_str[out++] = c;
- continue;
- }
+ in = in_str;
+ out = out_str;
+ end = in_str + len;
+ for (;;) {
+ Py_UNICODE c;
+ /* Fast loop for non-control characters */
+ while ((c = *in++) > '\r')
+ *out++ = c;
if (c == '\n') {
- out_str[out++] = c;
+ *out++ = c;
seennl |= SEEN_LF;
continue;
}
- if (c != '\r') {
- out_str[out++] = c;
+ if (c == '\r') {
+ if (*in == '\n') {
+ in++;
+ seennl |= SEEN_CRLF;
+ }
+ else
+ seennl |= SEEN_CR;
+ *out++ = '\n';
continue;
}
- if (in_str[in] == '\n') {
- in++;
- seennl |= SEEN_CRLF;
- }
- else
- seennl |= SEEN_CR;
- out_str[out++] = '\n';
+ if (in > end)
+ break;
+ *out++ = c;
}
if (translated != output) {
Py_DECREF(output);
output = translated;
}
- if (out != in) {
- if (PyUnicode_Resize(&output, out) < 0)
+ if (out - out_str != len) {
+ if (PyUnicode_Resize(&output, out - out_str) < 0)
goto error;
}
}
@@ -924,28 +935,22 @@
return NULL;
}
+/* `end` must point to the real end of the Py_UNICODE storage, i.e. to its
+ terminating NUL character: the inner scan loop has no bounds check and relies
+ on that NUL to stop. Otherwise it may run past `end` and give incorrect results. */
static Py_UNICODE *
-find_LF(Py_UNICODE *start, Py_UNICODE *end)
-{
- Py_UNICODE *s = start;
- while (s < end) {
- if (*s == '\n')
- return s;
- s++;
- }
- return NULL;
-}
-
-static Py_UNICODE *
-find_CR(Py_UNICODE *start, Py_UNICODE *end)
+find_control_char(Py_UNICODE *start, Py_UNICODE *end, Py_UNICODE ch)
{
Py_UNICODE *s = start;
- while (s < end) {
- if (*s == '\r')
+ for (;;) {
+ while (*s > ch)
+ s++;
+ if (*s == ch)
return s;
+ if (s == end)
+ return NULL;
s++;
}
- return NULL;
}
static PyObject *
@@ -971,8 +976,9 @@
Py_UNICODE* ptr = PyUnicode_AS_UNICODE(line);
if (self->readtranslate) {
/* Newlines are already translated, only search for \n */
- Py_UNICODE *pos = find_LF(ptr + start,
- ptr + PyUnicode_GET_SIZE(line));
+ Py_UNICODE *pos = find_control_char(ptr + start,
+ ptr + PyUnicode_GET_SIZE(line),
+ '\n');
if (pos != NULL) {
endpos = pos - ptr + 1;
break;
@@ -984,53 +990,58 @@
/* Universal newline search. Find any of \r, \r\n, \n
* The decoder ensures that \r\n are not split in two pieces
*/
-
- /* In C we'd look for these in parallel of course.
- * XXX Hey!
- */
- Py_UNICODE* nlpos = find_LF(ptr + start,
- ptr + PyUnicode_GET_SIZE(line));
- Py_UNICODE* crpos = find_CR(ptr + start,
- ptr + PyUnicode_GET_SIZE(line));
- if (crpos == NULL) {
- if (nlpos == NULL) {
- /* Nothing found */
- start = PyUnicode_GET_SIZE(line);
+ Py_UNICODE *s = ptr + start;
+ Py_UNICODE *e = ptr + PyUnicode_GET_SIZE(line);
+ for (;;) {
+ Py_UNICODE ch;
+ /* Fast path for non-control chars. The loop always ends
+ since the Py_UNICODE storage is NUL-terminated. */
+ while (*s > '\r')
+ s++;
+ if (s == e)
+ goto _universal_not_found;
+ ch = *s++;
+ if (ch == '\n') {
+ endpos = s - ptr;
+ break;
}
- else {
- /* Found \n */
- endpos = nlpos - ptr + 1;
+ if (ch == '\r') {
+ if (*s == '\n')
+ endpos = s - ptr + 1;
+ else
+ endpos = s - ptr;
break;
}
}
- else if (nlpos == NULL) {
- /* Found lone \r */
- endpos = crpos - ptr + 1;
- break;
- }
- else if (nlpos < crpos) {
- /* Found \n */
- endpos = nlpos - ptr + 1;
- break;
- }
- else if (nlpos == crpos + 1) {
- /* Found \r\n */
- endpos = crpos - ptr + 2;
- break;
- }
- else {
- /* Found \r */
- endpos = crpos - ptr + 1;
- break;
- }
+ break;
+ _universal_not_found:
+ start = PyUnicode_GET_SIZE(line);
}
else {
- /* non-universal */
- Py_ssize_t pos = PyUnicode_Find(line, self->readnl,
- start, -1, 1);
- if (pos >= 0) {
- endpos = pos + PyUnicode_GET_SIZE(self->readnl);
- break;
+ /* Non-universal mode. */
+ Py_ssize_t readnl_len = PyUnicode_GET_SIZE(self->readnl);
+ Py_ssize_t line_len = PyUnicode_GET_SIZE(line);
+ if (readnl_len <= line_len) {
+ if (readnl_len == 1) {
+ Py_UNICODE *pos = find_control_char(
+ ptr + start,
+ ptr + line_len,
+ PyUnicode_AS_UNICODE(self->readnl)[0]);
+ if (pos != NULL) {
+ endpos = pos - ptr + 1;
+ break;
+ }
+ start = PyUnicode_GET_SIZE(line);
+ }
+ else {
+ Py_ssize_t pos = PyUnicode_Find(line, self->readnl,
+ start, line_len, 1);
+ if (pos >= 0) {
+ endpos = pos + readnl_len;
+ break;
+ }
+ start = line_len - readnl_len + 1;
+ }
}
}
More information about the Python-checkins mailing list