[pypy-commit] pypy refactor-str-types: Make descr_splitlines() unicode-aware.

Tue Jul 30 14:15:42 CEST 2013

Author: Manuel Jacob
Branch: refactor-str-types
Changeset: r65818:93f93f772e11
Date: 2013-07-29 19:40 +0200
http://bitbucket.org/pypy/pypy/changeset/93f93f772e11/

Log:	Make descr_splitlines() unicode-aware.

diff --git a/pypy/objspace/std/bytearrayobject.py b/pypy/objspace/std/bytearrayobject.py
--- a/pypy/objspace/std/bytearrayobject.py
+++ b/pypy/objspace/std/bytearrayobject.py
@@ -74,6 +74,9 @@
 
     _iscased = _isalpha
 
+    def _islinebreak(self, ch):
+        return (ch == '\n') or (ch == '\r')
+
     def _upper(self, ch):
         if ch.islower():
             o = ord(ch) - 32
diff --git a/pypy/objspace/std/bytesobject.py b/pypy/objspace/std/bytesobject.py
--- a/pypy/objspace/std/bytesobject.py
+++ b/pypy/objspace/std/bytesobject.py
@@ -117,6 +117,9 @@
 
     _iscased = _isalpha
 
+    def _islinebreak(self, ch):
+        return (ch == '\n') or (ch == '\r')
+
     def _upper(self, ch):
         if ch.islower():
             o = ord(ch) - 32
diff --git a/pypy/objspace/std/stringmethods.py b/pypy/objspace/std/stringmethods.py
--- a/pypy/objspace/std/stringmethods.py
+++ b/pypy/objspace/std/stringmethods.py
@@ -628,26 +628,24 @@
     @unwrap_spec(keepends=bool)
     @specialize.argtype(0)
     def descr_splitlines(self, space, keepends=False):
-        data = self._val(space)
-        selflen = len(data)
+        value = self._val(space)
+        length = len(value)
         strs = []
-        i = j = 0
-        while i < selflen:
-            # Find a line and append it
-            while i < selflen and data[i] != '\n' and data[i] != '\r':
-                i += 1
-            # Skip the line break reading CRLF as one line break
-            eol = i
-            i += 1
-            if i < selflen and data[i-1] == '\r' and data[i] == '\n':
-                i += 1
+        pos = 0
+        while pos < length:
+            sol = pos
+            while pos < length and not self._islinebreak(value[pos]):
+                pos += 1
+            eol = pos
+            pos += 1
+            # read CRLF as one line break
+            if pos < length and value[eol] == '\r' and value[pos] == '\n':
+                pos += 1
             if keepends:
-                eol = i
-            strs.append(data[j:eol])
-            j = i
-
-        if j < selflen:
-            strs.append(data[j:len(data)])
+                eol = pos
+            strs.append(value[sol:eol])
+        if pos < length:
+            strs.append(value[pos:length])
         return self._newlist_unwrapped(space, strs)
 
     @specialize.argtype(0)
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -125,6 +125,9 @@
     def _iscased(self, ch):
         return unicodedb.iscased(ord(ch))
 
+    def _islinebreak(self, ch):
+        return unicodedb.islinebreak(ord(ch))
+
     def _upper(self, ch):
         return unichr(unicodedb.toupper(ord(ch)))