[pypy-commit] pypy default: Speed up a little bit str.split('char'). Unsure why CPython is so much faster

Sat May 28 16:59:57 CEST 2011

Author: Armin Rigo <arigo at tunes.org>
Branch: 
Changeset: r44578:a9ccee57412b
Date: 2011-05-28 15:13 +0000
http://bitbucket.org/pypy/pypy/changeset/a9ccee57412b/

Log:	Speed up str.split('char') a little bit. Unsure why CPython is so
	much faster at doing this on a big string (benchmark: a 60MB string
	containing 3 million occurrences of the separator character).

diff --git a/pypy/objspace/std/stringobject.py b/pypy/objspace/std/stringobject.py
--- a/pypy/objspace/std/stringobject.py
+++ b/pypy/objspace/std/stringobject.py
@@ -252,15 +252,30 @@
 
     res_w = []
     start = 0
-    while maxsplit != 0:
-        next = value.find(by, start)
-        if next < 0:
-            break
-        res_w.append(sliced(space, value, start, next, w_self))
-        start = next + bylen
-        maxsplit -= 1   # NB. if it's already < 0, it stays < 0
+    if bylen == 1 and maxsplit < 0:
+        # fast path: uses str.rfind(character) and str.count(character)
+        by = by[0]    # annotator hack: string -> char
+        count = value.count(by)
+        res_w = [None] * (count + 1)
+        end = len(value)
+        while count >= 0:
+            assert end >= 0
+            prev = value.rfind(by, 0, end)
+            start = prev + 1
+            assert start >= 0
+            res_w[count] = sliced(space, value, start, end, w_self)
+            count -= 1
+            end = prev
+    else:
+        while maxsplit != 0:
+            next = value.find(by, start)
+            if next < 0:
+                break
+            res_w.append(sliced(space, value, start, next, w_self))
+            start = next + bylen
+            maxsplit -= 1   # NB. if it's already < 0, it stays < 0
+        res_w.append(sliced(space, value, start, len(value), w_self))
 
-    res_w.append(sliced(space, value, start, len(value), w_self))
     return space.newlist(res_w)
 
 def str_rsplit__String_None_ANY(space, w_self, w_none, w_maxsplit=-1):