[pypy-commit] pypy unicode-utf8: test, fix for StringIO(unicode).read(cnt)

Wed Jan 16 17:40:22 EST 2019

Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8
Changeset: r95655:6185982e509f
Date: 2019-01-16 23:42 +0200
http://bitbucket.org/pypy/pypy/changeset/6185982e509f/

Log:	test, fix for StringIO(unicode).read(cnt)

diff --git a/pypy/module/_io/interp_stringio.py b/pypy/module/_io/interp_stringio.py
--- a/pypy/module/_io/interp_stringio.py
+++ b/pypy/module/_io/interp_stringio.py
@@ -1,4 +1,4 @@
-from rpython.rlib.rutf8 import get_utf8_length
+from rpython.rlib.rutf8 import get_utf8_length, next_codepoint_pos
 
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.interpreter.typedef import (
@@ -11,8 +11,16 @@
 class UnicodeIO(object):
     def __init__(self, data=None, pos=0):
         if data is None:
-            data = []
-        self.data = data
+            data = ''
+        self.data = []
+        self.pos = 0
+        # break the data into unicode codepoints
+        _pos = 0
+        while _pos < pos:
+            _pos = next_codepoint_pos(data, _pos)
+            if _pos >= len(data):
+                break
+        self.write(data[_pos:])
         self.pos = pos
 
     def resize(self, newlength):
@@ -85,12 +93,14 @@
         return result
 
     def write(self, string):
-        length = len(string)
+        length = get_utf8_length(string)
         if self.pos + length > len(self.data):
             self.resize(self.pos + length)
-
+        pos = 0
         for i in range(length):
-            self.data[self.pos + i] = string[i]
+            nextpos = next_codepoint_pos(string, pos)
+            self.data[self.pos + i] = string[pos:nextpos]
+            pos = nextpos
         self.pos += length
 
     def seek(self, pos):
@@ -186,7 +196,7 @@
         if pos < 0:
             raise oefmt(space.w_ValueError,
                         "position value cannot be negative")
-        self.buf = UnicodeIO(list(initval), pos)
+        self.buf = UnicodeIO(initval, pos)
         if not space.is_w(w_dict, space.w_None):
             if not space.isinstance_w(w_dict, space.w_dict):
                 raise oefmt(
diff --git a/pypy/module/_io/test/test_stringio.py b/pypy/module/_io/test/test_stringio.py
--- a/pypy/module/_io/test/test_stringio.py
+++ b/pypy/module/_io/test/test_stringio.py
@@ -42,6 +42,17 @@
         assert buf[5:] == sio.read(900)
         assert u"" == sio.read()
 
+    def test_read_binary(self):
+        # data is from a test_imghdr test for a GIF file
+        import io
+        buf_in = (u'\x47\x49\x46\x38\x39\x61\x10\x00\x10\x00\xf6\x64\x00\xeb'
+                  u'\xbb\x18\xeb\xbe\x21\xf3\xc1\x1a\xfa\xc7\x19\xfd\xcb\x1b'
+                  u'\xff\xcc\x1c\xeb')
+        assert len(buf_in) == 32
+        sio = io.StringIO(buf_in)
+        buf_out = sio.read(32)
+        assert buf_in == buf_out
+
     def test_readline(self):
         import io
         sio = io.StringIO(u'123\n456')