[pypy-commit] pypy default: backport eee2717be5e2 to default:
cfbolz
pypy.commits at gmail.com
Wed Sep 11 08:37:45 EDT 2019
Author: Carl Friedrich Bolz-Tereick <cfbolz at gmx.de>
Branch:
Changeset: r97434:2352eded240c
Date: 2019-09-11 13:42 +0200
http://bitbucket.org/pypy/pypy/changeset/2352eded240c/
Log: backport eee2717be5e2 to default:
more improvements to the performance of _io:
make get_chars track the number of unicode codepoints. Also fix a
bug in W_TextIOWrapper._read that assumed ASCII
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -293,14 +293,18 @@
class DecodeBuffer(object):
- def __init__(self, text=None):
+ def __init__(self, text=None, ulen=-1):
# self.text is a valid utf-8 string
+ if text is not None:
+ assert ulen >= 0
self.text = text
self.pos = 0
self.upos = 0
+ self.ulen = ulen
def set(self, space, w_decoded):
check_decoded(space, w_decoded)
+ self.ulen = space.len_w(w_decoded)
self.text = space.utf8_w(w_decoded)
self.pos = 0
self.upos = 0
@@ -309,13 +313,14 @@
self.text = None
self.pos = 0
self.upos = 0
+ self.ulen = -1
def get_chars(self, size):
""" returns a tuple (utf8, lgt) """
if self.text is None or size == 0:
- return ""
+ return "", 0
- lgt = codepoints_in_utf8(self.text)
+ lgt = self.ulen
available = lgt - self.upos
if size < 0 or size > available:
size = available
@@ -323,7 +328,6 @@
if self.pos > 0 or size < available:
start = self.pos
- ret = []
pos = start
for i in range(size):
pos = next_codepoint_pos(self.text, pos)
@@ -336,8 +340,9 @@
chars = self.text
self.pos = len(self.text)
self.upos = lgt
+ size = lgt
- return chars
+ return chars, size
def has_data(self):
return (self.text is not None and not self.exhausted())
@@ -709,8 +714,7 @@
w_bytes = space.call_method(self.w_buffer, "read")
w_decoded = space.call_method(self.w_decoder, "decode", w_bytes, space.w_True)
check_decoded(space, w_decoded)
- chars = self.decoded.get_chars(-1)
- lgt = get_utf8_length(chars)
+ chars, lgt = self.decoded.get_chars(-1)
w_result = space.newutf8(chars, lgt)
w_final = space.add(w_result, w_decoded)
self.snapshot = None
@@ -723,9 +727,9 @@
while remaining > 0:
if not self._ensure_data(space):
break
- data = self.decoded.get_chars(remaining)
- builder.append(data)
- remaining -= len(data)
+ data, size = self.decoded.get_chars(remaining)
+ builder.append_utf8(data, size)
+ remaining -= size
return space.newutf8(builder.build(), builder.getlength())
@@ -756,6 +760,7 @@
def _readline(self, space, limit):
# This is a separate function so that readline_w() can be jitted.
remnant = None
+ remnant_ulen = -1
builder = Utf8StringBuilder()
while True:
# First, get some data if necessary
@@ -763,7 +768,7 @@
if not has_data:
# end of file
if remnant:
- builder.append(remnant) # XXX
+ builder.append_utf8(remnant, remnant_ulen)
break
if remnant:
@@ -772,11 +777,14 @@
if remnant == '\r' and self.decoded.text[0] == '\n':
builder.append_utf8('\r\n', 2)
self.decoded.pos = 1
+ self.decoded.upos = 1
remnant = None
+ remnant_ulen = -1
break
else:
- builder.append(remnant) # XXX
+ builder.append_utf8(remnant, remnant_ulen)
remnant = None
+ remnant_ulen = -1
continue
if limit >= 0:
@@ -800,7 +808,7 @@
# There may be some remaining chars we'll have to prepend to the
# next chunk of data
if not self.decoded.exhausted():
- remnant = self.decoded.get_chars(-1)
+ remnant, remnant_ulen = self.decoded.get_chars(-1)
# We have consumed the buffer
self.decoded.reset()
diff --git a/pypy/module/_io/test/test_interp_textio.py b/pypy/module/_io/test/test_interp_textio.py
--- a/pypy/module/_io/test/test_interp_textio.py
+++ b/pypy/module/_io/test/test_interp_textio.py
@@ -58,27 +58,31 @@
@given(st.text())
def test_read_buffer(text):
- buf = DecodeBuffer(text.encode('utf-8'))
- assert buf.get_chars(-1) == text.encode('utf-8')
+ buf = DecodeBuffer(text.encode('utf8'), len(text))
+ chars, size = buf.get_chars(-1)
+ assert chars.decode('utf8') == text
+ assert len(text) == size
assert buf.exhausted()
@given(st.text(), st.lists(st.integers(min_value=0)))
@example(u'\x80', [1])
def test_readn_buffer(text, sizes):
- buf = DecodeBuffer(text.encode('utf-8'))
+ buf = DecodeBuffer(text.encode('utf8'), len(text))
strings = []
for n in sizes:
- s = buf.get_chars(n)
+ chars, size = buf.get_chars(n)
+ s = chars.decode('utf8')
+ assert size == len(s)
if not buf.exhausted():
- assert len(s.decode('utf-8')) == n
+ assert len(s) == n
else:
- assert len(s.decode('utf-8')) <= n
+ assert len(s) <= n
strings.append(s)
- assert ''.join(strings) == text[:sum(sizes)].encode('utf-8')
+ assert ''.join(strings) == text[:sum(sizes)]
@given(st.text())
def test_next_char(text):
- buf = DecodeBuffer(text.encode('utf-8'))
+ buf = DecodeBuffer(text.encode('utf8'), len(text))
for i in range(len(text)):
ch = buf.next_char()
assert ch == text[i].encode('utf-8')
diff --git a/pypy/module/_io/test/test_textio.py b/pypy/module/_io/test/test_textio.py
--- a/pypy/module/_io/test/test_textio.py
+++ b/pypy/module/_io/test/test_textio.py
@@ -1,3 +1,5 @@
+#encoding: utf-8
+
class AppTestTextIO:
spaceconfig = dict(usemodules=['_io', '_locale'])
@@ -103,6 +105,16 @@
reads += t.readline()
assert reads == u"abc\ndef\n"
+ def test_read_bug_unicode(self):
+ import _io
+ inp = b"\xc3\xa4bc\ndef\n"
+ r = _io.BytesIO(inp)
+ t = _io.TextIOWrapper(r, encoding="utf-8")
+ reads = t.read(4)
+ assert reads == inp[:5].decode("utf-8")
+ reads += t.readline()
+ assert reads == inp.decode("utf-8")
+
def test_encoded_writes(self):
import _io
data = u"1234567890"
More information about the pypy-commit
mailing list