[issue16061] performance regression in string replace for 3.3
Serhiy Storchaka
report at bugs.python.org
Thu Oct 11 10:36:27 CEST 2012
Serhiy Storchaka added the comment:
> I compared performances of the two methods: dummy loop vs find.
You can hybridize them. First just compare the chars, and if they do not match,
then use memchr(). This speeds up the case of repeated chars.
----------
Added file: http://bugs.python.org/file27526/unicode_2.patch
_______________________________________
Python tracker <report at bugs.python.org>
<http://bugs.python.org/issue16061>
_______________________________________
-------------- next part --------------
diff -r 9475fc82768e Objects/unicodeobject.c
--- a/Objects/unicodeobject.c Wed Oct 10 08:36:43 2012 -0700
+++ b/Objects/unicodeobject.c Thu Oct 11 11:30:16 2012 +0300
@@ -9881,6 +9881,64 @@
return 0;
}
+/* Fill u with a copy of self in which occurrences of the character u1 are
+   replaced by u2, stopping after maxcount replacements.
+
+   self     - source string; only read.
+   maxcount - maximum number of replacements.  NOTE(review): both branches
+              pre-decrement it and stop when it reaches zero, so a value < 1
+              is presumably excluded by the caller -- confirm.
+   pos      - index in self at which the generic branch performs the first
+              replacement without re-checking it; the visible call site gets
+              it from findchar().  Unused by the 1-byte fast path, which
+              scans from the start.
+   sbuf     - character buffer of self, used as the search base by the
+              generic branch.
+   u1, u2   - character to search for and its replacement.
+   u        - destination string; it must hold at least as many characters
+              as self, since the whole of self is copied into it. */
+static void
+replace_1char(PyObject *self, Py_ssize_t maxcount, Py_ssize_t pos, char *sbuf,
+              Py_UCS4 u1, Py_UCS4 u2, PyObject *u)
+{
+    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
+    int skind = PyUnicode_KIND(self);
+    int rkind = PyUnicode_KIND(u);
+
+    if (skind == 1 && rkind == 1) {
+        /* Both strings use one byte per character: copy everything, then
+           patch the copy in place.  The bytes are accessed as unsigned char
+           because plain char may be signed; a byte >= 0x80 would then
+           promote to a negative value, never compare equal to u1, and the
+           matching character would be stepped over unreplaced. */
+        const unsigned char *sdata = PyUnicode_DATA(self);
+        unsigned char *udata = PyUnicode_DATA(u);
+        unsigned char *uend = udata + len;
+
+        memcpy(udata, sdata, len);
+
+        while (udata < uend) {
+            /* Cheap direct test first: in runs of repeated u1 this avoids
+               one memchr() call per character. */
+            if (*udata == u1) {
+                *udata = (unsigned char)u2;
+                if (!--maxcount)
+                    break;
+                udata++;
+                continue;
+            }
+            /* The current character is not u1; search the remainder. */
+            udata++;
+            len = uend - udata;
+            if (!len)
+                break;
+            udata = memchr(udata, (int)u1, len);
+            if (udata == NULL)
+                break;
+            *udata = (unsigned char)u2;
+            if (!--maxcount)
+                break;
+            udata++;
+        }
+    }
+    else {
+        Py_ssize_t index = 0;   /* index in self of the character at src */
+        char *src = sbuf;
+
+        _PyUnicode_FastCopyCharacters(u, 0, self, 0, len);
+        PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
+
+        while (--maxcount)
+        {
+            /* Resume just past the previous match.  The result of findchar()
+               is treated as an offset relative to src, so index carries the
+               absolute position of src within self. */
+            pos++;
+            src += pos * skind;
+            len -= pos;
+            index += pos;
+            pos = findchar(src, skind, len, u1, 1);
+            if (pos < 0)
+                break;
+            PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
+        }
+    }
+}
+
static PyObject *
replace(PyObject *self, PyObject *str1,
PyObject *str2, Py_ssize_t maxcount)
@@ -9924,9 +9982,7 @@
if (len1 == 1) {
/* replace characters */
Py_UCS4 u1, u2;
- int rkind;
- Py_ssize_t index, pos;
- char *src;
+ Py_ssize_t pos;
u1 = PyUnicode_READ_CHAR(str1, 0);
pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
@@ -9936,23 +9992,8 @@
u = PyUnicode_New(slen, maxchar);
if (!u)
goto error;
- _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
- rkind = PyUnicode_KIND(u);
-
- PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
- index = 0;
- src = sbuf;
- while (--maxcount)
- {
- pos++;
- src += pos * PyUnicode_KIND(self);
- slen -= pos;
- index += pos;
- pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
- if (pos < 0)
- break;
- PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
- }
+
+ replace_1char(self, maxcount, pos, sbuf, u1, u2, u);
}
else {
int rkind = skind;
More information about the Python-bugs-list
mailing list