[pypy-commit] pypy default: More refactoring: deal with the remnant more explicitly and handle size limit inside _find_line_ending()

Fri Nov 24 14:45:56 EST 2017

Author: Ronan Lamy <ronan.lamy at gmail.com>
Branch: 
Changeset: r93168:189c2cce360e
Date: 2017-11-24 19:43 +0000
http://bitbucket.org/pypy/pypy/changeset/189c2cce360e/

Log:	More refactoring: deal with the remnant more explicitly and handle
	size limit inside _find_line_ending()

diff --git a/extra_tests/test_textio.py b/extra_tests/test_textio.py
new file mode 100644
--- /dev/null
+++ b/extra_tests/test_textio.py
@@ -0,0 +1,27 @@
+from hypothesis import given, strategies as st
+
+from io import BytesIO, TextIOWrapper
+
+LINESEP = ['', '\r', '\n', '\r\n']
+
+@st.composite
+def text_with_newlines(draw):
+    sep = draw(st.sampled_from(LINESEP))
+    lines = draw(st.lists(st.text(max_size=10), max_size=10))
+    return sep.join(lines)
+
+@given(txt=text_with_newlines(),
+       mode=st.sampled_from(['\r', '\n', '\r\n', '']),
+       limit=st.integers(min_value=-1))
+def test_readline(txt, mode, limit):
+    textio = TextIOWrapper(BytesIO(txt.encode('utf-8')), newline=mode)
+    lines = []
+    while True:
+        line = textio.readline(limit)
+        if limit > 0:
+            assert len(line) < limit
+        if line:
+            lines.append(line)
+        else:
+            break
+    assert u''.join(lines) == txt
diff --git a/pypy/module/_io/interp_stringio.py b/pypy/module/_io/interp_stringio.py
--- a/pypy/module/_io/interp_stringio.py
+++ b/pypy/module/_io/interp_stringio.py
@@ -174,18 +174,16 @@
         start = self.pos
         if limit < 0 or limit > len(self.buf) - self.pos:
             limit = len(self.buf) - self.pos
-
         assert limit >= 0
-        end = start + limit
 
         endpos, consumed = self._find_line_ending(
             # XXX: super inefficient, makes a copy of the entire contents.
             u"".join(self.buf),
             start,
-            end
+            limit
         )
         if endpos < 0:
-            endpos = end
+            endpos = start + limit
         assert endpos >= 0
         self.pos = endpos
         return space.newunicode(u"".join(self.buf[start:endpos]))
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -214,53 +214,49 @@
     def newlines_get_w(self, space):
         return space.w_None
 
-    def _find_newline_universal(self, line, start, end):
+    def _find_newline_universal(self, line, start, limit):
         # Universal newline search. Find any of \r, \r\n, \n
         # The decoder ensures that \r\n are not split in two pieces
+        limit = min(limit, len(line) - start)
+        end = start + limit
         i = start
         while i < end:
             ch = line[i]
             i += 1
             if ch == '\n':
-                return i
+                return i, 0
             if ch == '\r':
-                if start + i >= end:
-                    return i
+                if i >= end:
+                    break
                 if line[i] == '\n':
-                    return i + 1
+                    return i + 1, 0
                 else:
-                    return i
-        return -1
+                    return i, 0
+        return -1, end
 
-    def _find_marker(self, marker, line, start, end):
+    def _find_marker(self, marker, line, start, limit):
+        limit = min(limit, len(line) - start)
+        end = start + limit
         for i in range(start, end - len(marker) + 1):
             ch = line[i]
             if ch == marker[0]:
                 for j in range(1, len(marker)):
                     if line[i + j] != marker[j]:
-                        break
+                        break  # from inner loop
                 else:
-                    return i + len(marker)
-        return -1
+                    return i + len(marker), 0
+        return -1, end - len(marker) + 1
 
-    def _find_line_ending(self, line, start, end):
+    def _find_line_ending(self, line, start, limit):
         if self.readuniversal:
-            i = self._find_newline_universal(line, start, end)
-            if i < 0:
-                return i, end
-            else:
-                return i, 0
+            return self._find_newline_universal(line, start, limit)
         if self.readtranslate:
             # Newlines are already translated, only search for \n
             newline = u'\n'
         else:
             # Non-universal mode.
             newline = self.readnl
-        i = self._find_marker(newline, line, start, end)
-        if i < 0:
-            return i, end - len(newline) + 1
-        else:
-            return i, 0
+        return self._find_marker(newline, line, start, limit)
 
 W_TextIOBase.typedef = TypeDef(
     '_io._TextIOBase', W_IOBase.typedef,
@@ -671,35 +667,42 @@
             has_data = self._ensure_data(space)
             if not has_data:
                 # end of file
-                start = endpos = offset_to_buffer = 0
+                start = endpos = 0
                 break
 
-            if not remnant:
-                line = self.decoded_chars
-                start = self.decoded_chars_used
-                offset_to_buffer = 0
-            else:
+            if remnant:
+                assert not self.readtranslate and self.readnl == u'\r\n'
                 assert self.decoded_chars_used == 0
-                line = remnant + self.decoded_chars
-                start = 0
-                offset_to_buffer = len(remnant)
-                remnant = None
+                if remnant == u'\r' and self.decoded_chars[0] == u'\n':
+                    builder.append(u'\r\n')
+                    self.decoded_chars_used = 1
+                    line = remnant = None
+                    start = endpos = 0
+                    break
+                else:
+                    builder.append(remnant)
+                    remnant = None
+                    continue
+
+            line = self.decoded_chars
+            start = self.decoded_chars_used
 
             line_len = len(line)
-            endpos, end_scan = self._find_line_ending(line, start, line_len)
-            chunked = builder.getlength()
+            if limit > 0:
+                remaining = limit - builder.getlength()
+                assert remaining >= 0
+            else:
+                remaining = sys.maxint
+            endpos, end_scan = self._find_line_ending(line, start, remaining)
+
             if endpos >= 0:
-                if limit >= 0 and endpos >= start + limit - chunked:
-                    endpos = start + limit - chunked
-                    assert endpos >= 0
                 break
+
             assert end_scan >= 0
-
             # We can put aside up to `end_scan`
-            if limit >= 0 and end_scan >= limit - chunked:
+            if limit >= 0 and end_scan - start >= remaining:
                 # Didn't find line ending, but reached length limit
-                endpos = start + limit - chunked
-                assert endpos >= 0
+                endpos = end_scan
                 break
 
             # No line ending seen yet - put aside current data
@@ -709,7 +712,7 @@
 
             # There may be some remaining chars we'll have to prepend to the
             # next chunk of data
-            if end_scan < line_len:
+            if end_scan < len(line):
                 remnant = line[end_scan:]
             line = None
             # We have consumed the buffer
@@ -717,9 +720,7 @@
 
         if line:
             # Our line ends in the current buffer
-            decoded_chars_used = endpos - offset_to_buffer
-            assert decoded_chars_used >= 0
-            self.decoded_chars_used = decoded_chars_used
+            self.decoded_chars_used = endpos
             if start > 0 or endpos < len(line):
                 line = line[start:endpos]
             builder.append(line)
diff --git a/pypy/module/_io/test/test_interp_textio.py b/pypy/module/_io/test/test_interp_textio.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_io/test/test_interp_textio.py
@@ -0,0 +1,33 @@
+from hypothesis import given, strategies as st, assume
+from pypy.module._io.interp_bytesio import W_BytesIO
+from pypy.module._io.interp_textio import W_TextIOWrapper
+
+LINESEP = ['', '\r', '\n', '\r\n']
+
+@st.composite
+def text_with_newlines(draw):
+    sep = draw(st.sampled_from(LINESEP))
+    lines = draw(st.lists(st.text(max_size=10), max_size=10))
+    return sep.join(lines)
+
+@given(txt=text_with_newlines(),
+       mode=st.sampled_from(['\r', '\n', '\r\n', '']),
+       limit=st.integers(min_value=-1))
+def test_readline(space, txt, mode, limit):
+    assume(limit != 0)
+    w_stream = W_BytesIO(space)
+    w_stream.descr_init(space, space.newbytes(txt.encode('utf-8')))
+    w_textio = W_TextIOWrapper(space)
+    w_textio.descr_init(
+        space, w_stream, encoding='utf-8',
+        w_newline=space.newtext(mode))
+    lines = []
+    while True:
+        line = space.unicode_w(w_textio.readline_w(space, space.newint(limit)))
+        if limit > 0:
+            assert len(line) <= limit
+        if line:
+            lines.append(line)
+        else:
+            break
+    assert u''.join(lines) == txt