[pypy-commit] pypy StringIO-perf: Store UnicodeIO data as a list of unichars instead of GC strings

Fri Jan 31 00:21:54 EST 2020

Author: Ronan Lamy <ronan.lamy at gmail.com>
Branch: StringIO-perf
Changeset: r98603:b53569f5531d
Date: 2020-01-31 05:20 +0000
http://bitbucket.org/pypy/pypy/changeset/b53569f5531d/

Log:	Store UnicodeIO data as a list of unichars instead of GC strings

diff --git a/pypy/module/_io/interp_stringio.py b/pypy/module/_io/interp_stringio.py
--- a/pypy/module/_io/interp_stringio.py
+++ b/pypy/module/_io/interp_stringio.py
@@ -21,7 +21,7 @@
         if len(self.data) > newlength:
             self.data = self.data[:newlength]
         if len(self.data) < newlength:
-            self.data.extend(['\0'] * (newlength - len(self.data)))
+            self.data.extend([u'\0'] * (newlength - len(self.data)))
 
     def read(self, size):
         start = self.pos
@@ -34,7 +34,7 @@
             end = len(self.data)
         assert 0 <= start <= end
         self.pos = end
-        return ''.join(self.data[start:end])
+        return u''.join(self.data[start:end]).encode('utf-8')
 
     def _convert_limit(self, limit):
         if limit < 0 or limit > len(self.data) - self.pos:
@@ -51,18 +51,18 @@
         while pos < end:
             ch = self.data[pos]
             pos += 1
-            if ch == '\n':
+            if ch == u'\n':
                 break
-            if ch == '\r':
+            if ch == u'\r':
                 if pos >= end:
                     break
-                if self.data[pos] == '\n':
+                if self.data[pos] == u'\n':
                     pos += 1
                     break
                 else:
                     break
         self.pos = pos
-        result = ''.join(self.data[start:pos])
+        result = u''.join(self.data[start:pos]).encode('utf-8')
         return result
 
     def readline(self, marker, limit):
@@ -70,6 +70,7 @@
         limit = self._convert_limit(limit)
         end = start + limit
         found = False
+        marker = marker.decode('utf-8')
         for pos in range(start, end - len(marker) + 1):
             ch = self.data[pos]
             if ch == marker[0]:
@@ -83,19 +84,17 @@
         if not found:
             pos = end
         self.pos = pos
-        result = ''.join(self.data[start:pos])
+        result = u''.join(self.data[start:pos]).encode('utf-8')
         return result
 
     def write(self, string):
-        length = codepoints_in_utf8(string)
-        if self.pos + length > len(self.data):
-            self.resize(self.pos + length)
-        pos = 0
-        for i in range(length):
-            nextpos = next_codepoint_pos(string, pos)
-            self.data[self.pos + i] = string[pos:nextpos]
-            pos = nextpos
-        self.pos += length
+        ustr = string.decode('utf-8')
+        newlen = self.pos + len(ustr)
+        if newlen > len(self.data):
+            self.resize(newlen)
+        for i in range(len(ustr)):
+            self.data[self.pos + i] = ustr[i]
+        self.pos += len(ustr)
 
     def seek(self, pos):
         self.pos = pos
@@ -105,7 +104,7 @@
             self.resize(size)
 
     def getvalue(self):
-        return ''.join(self.data)
+        return u''.join(self.data).encode('utf-8')
 
 
 class W_StringIO(W_TextIOBase):