[pypy-commit] pypy unicode-utf8: hg merge default

rlamy pypy.commits at gmail.com
Fri Nov 24 15:26:03 EST 2017


Author: Ronan Lamy <ronan.lamy at gmail.com>
Branch: unicode-utf8
Changeset: r93170:f9a1926628b2
Date: 2017-11-24 20:22 +0000
http://bitbucket.org/pypy/pypy/changeset/f9a1926628b2/

Log:	hg merge default

diff --git a/extra_tests/test_textio.py b/extra_tests/test_textio.py
new file mode 100644
--- /dev/null
+++ b/extra_tests/test_textio.py
@@ -0,0 +1,27 @@
+from hypothesis import given, strategies as st
+
+from io import BytesIO, TextIOWrapper
+
+LINESEP = ['', '\r', '\n', '\r\n']
+
+@st.composite
+def text_with_newlines(draw):
+    sep = draw(st.sampled_from(LINESEP))
+    lines = draw(st.lists(st.text(max_size=10), max_size=10))
+    return sep.join(lines)
+
+@given(txt=text_with_newlines(),
+       mode=st.sampled_from(['\r', '\n', '\r\n', '']),
+       limit=st.integers(min_value=-1))
+def test_readline(txt, mode, limit):
+    textio = TextIOWrapper(BytesIO(txt.encode('utf-8')), newline=mode)
+    lines = []
+    while True:
+        line = textio.readline(limit)
+        if limit > 0:
+            assert len(line) <= limit
+        if line:
+            lines.append(line)
+        else:
+            break
+    assert u''.join(lines) == txt
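
The property test above leans on the documented TextIOWrapper.readline(limit)
contract: each call returns at most 'limit' characters, and concatenating the
successive results reproduces the whole stream. A minimal hand-written sketch
of that contract against CPython's io module (made-up input, not taken from
the test):

    from io import BytesIO, TextIOWrapper

    buf = TextIOWrapper(BytesIO(b'ab\r\ncd'), encoding='ascii', newline='\r\n')
    pieces = []
    while True:
        piece = buf.readline(3)       # at most 3 characters per call
        if not piece:
            break
        assert len(piece) <= 3
        pieces.append(piece)
    assert u''.join(pieces) == u'ab\r\ncd'
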
diff --git a/pypy/module/_continuation/test/conftest.py b/pypy/module/_continuation/test/conftest.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/test/conftest.py
@@ -0,0 +1,7 @@
+import pytest
+import sys
+
+def pytest_configure(config):
+    if sys.platform.startswith('linux'):
+        from rpython.rlib.rvmprof.cintf import configure_libbacktrace_linux
+        configure_libbacktrace_linux()
diff --git a/pypy/module/_io/interp_stringio.py b/pypy/module/_io/interp_stringio.py
--- a/pypy/module/_io/interp_stringio.py
+++ b/pypy/module/_io/interp_stringio.py
@@ -174,18 +174,16 @@
         start = self.pos
         if limit < 0 or limit > len(self.buf) - self.pos:
             limit = len(self.buf) - self.pos
+        assert limit >= 0
 
-        assert limit >= 0
-        end = start + limit
-
-        endpos, consumed = self._find_line_ending(
+        endpos, found = self._find_line_ending(
             # XXX: super inefficient, makes a copy of the entire contents.
             u"".join(self.buf),
             start,
-            end
+            limit
         )
-        if endpos < 0:
-            endpos = end
+        if not found:
+            endpos = start + limit
         assert endpos >= 0
         self.pos = endpos
         return space.newunicode(u"".join(self.buf[start:endpos]))
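
The change above also switches _find_line_ending() from the old
(endpos, consumed) convention, where a negative endpos meant "no line ending
found", to an explicit (endpos, found) pair driven by a character limit
instead of an absolute end index. A rough pure-Python sketch of the new
contract for the simple translated-newlines case (illustration only, not the
RPython code):

    def find_line_ending(line, start, limit):
        # Scan at most `limit` characters starting at `start` for '\n'.
        # Return (index just past the newline, True) when found, otherwise
        # (end of the scanned region, False).
        limit = min(limit, len(line) - start)
        end = start + limit
        idx = line.find(u'\n', start, end)
        if idx >= 0:
            return idx + 1, True
        return end, False

    assert find_line_ending(u"ab\ncd", 0, 100) == (3, True)
    assert find_line_ending(u"abcde", 0, 2) == (2, False)
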
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -221,44 +221,49 @@
     def newlines_get_w(self, space):
         return space.w_None
 
-    def _find_line_ending(self, line, start, end):
-        size = end - start
+    def _find_newline_universal(self, line, start, limit):
+        # Universal newline search. Find any of \r, \r\n, \n
+        # The decoder ensures that \r\n are not split in two pieces
+        limit = min(limit, len(line) - start)
+        end = start + limit
+        i = start
+        while i < end:
+            ch = line[i]
+            i += 1
+            if ch == '\n':
+                return i, True
+            if ch == '\r':
+                if i >= end:
+                    break
+                if line[i] == '\n':
+                    return i + 1, True
+                else:
+                    return i, True
+        return end, False
+
+    def _find_marker(self, marker, line, start, limit):
+        limit = min(limit, len(line) - start)
+        end = start + limit
+        for i in range(start, end - len(marker) + 1):
+            ch = line[i]
+            if ch == marker[0]:
+                for j in range(1, len(marker)):
+                    if line[i + j] != marker[j]:
+                        break  # from inner loop
+                else:
+                    return i + len(marker), True
+        return end - len(marker) + 1, False
+
+    def _find_line_ending(self, line, start, limit):
         if self.readuniversal:
-            # Universal newline search. Find any of \r, \r\n, \n
-            # The decoder ensures that \r\n are not split in two pieces
-            i = start
-            while True:
-                # Fast path for non-control chars.
-                while i < end and line[i] > '\r':
-                    i += 1
-                if i >= end:
-                    return -1, size
-                ch = line[i]
-                i += 1
-                if ch == '\n':
-                    return i, 0
-                if ch == '\r':
-                    if line[i] == '\n':
-                        return i + 1, 0
-                    else:
-                        return i, 0
+            return self._find_newline_universal(line, start, limit)
         if self.readtranslate:
             # Newlines are already translated, only search for \n
             newline = '\n'
         else:
             # Non-universal mode.
             newline = self.readnl
-        end_scan = end - len(newline) + 1
-        for i in range(start, end_scan):
-            ch = line[i]
-            if ch == newline[0]:
-                for j in range(1, len(newline)):
-                    if line[i + j] != newline[j]:
-                        break
-                else:
-                    return i + len(newline), 0
-        return -1, end_scan
-
+        return self._find_marker(newline, line, start, limit)
 
 W_TextIOBase.typedef = TypeDef(
     '_io._TextIOBase', W_IOBase.typedef,
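
A plain-Python sketch of the universal-newline scan added above in
_find_newline_universal (illustration only). The subtle case is a '\r'
sitting at the very end of the scanned region: the next character, once
available, might turn it into '\r\n', so the scan reports "not found"
rather than guessing:

    def find_newline_universal(line, start, limit):
        limit = min(limit, len(line) - start)
        end = start + limit
        i = start
        while i < end:
            ch = line[i]
            i += 1
            if ch == u'\n':
                return i, True
            if ch == u'\r':
                if i >= end:
                    break                  # bare '\r' at the scan boundary
                if line[i] == u'\n':
                    return i + 1, True     # '\r\n' counts as one ending
                return i, True             # lone '\r' followed by other data
        return end, False

    assert find_newline_universal(u"a\r\nb", 0, 10) == (3, True)
    assert find_newline_universal(u"a\r", 0, 10) == (2, False)
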
@@ -661,7 +666,7 @@
         limit = convert_size(space, w_limit)
 
         line = None
-        remaining = None
+        remnant = None
         builder = StringBuilder()
 
         while True:
@@ -669,61 +674,60 @@
             has_data = self._ensure_data(space)
             if not has_data:
                 # end of file
-                start = endpos = offset_to_buffer = 0
+                start = end_scan = 0
                 break
 
-            if not remaining:
-                line = self.decoded_chars
-                start = self.decoded_chars_used
-                offset_to_buffer = 0
+            if remnant:
+                assert not self.readtranslate and self.readnl == '\r\n'
+                assert self.decoded_chars_used == 0
+                if remnant == '\r' and self.decoded_chars[0] == '\n':
+                    builder.append('\r\n')
+                    self.decoded_chars_used = 1
+                    line = remnant = None
+                    start = end_scan = 0
+                    break
+                else:
+                    builder.append(remnant)
+                    remnant = None
+                    continue
+
+            line = self.decoded_chars
+            start = self.decoded_chars_used
+            if limit > 0:
+                remaining = limit - builder.getlength()
+                assert remaining >= 0
             else:
-                assert self.decoded_chars_used == 0
-                line = remaining + self.decoded_chars
-                start = 0
-                offset_to_buffer = len(remaining)
-                remaining = None
+                remaining = sys.maxint
+            end_scan, found = self._find_line_ending(line, start, remaining)
+            assert end_scan >= 0
+            if found:
+                break
 
-            line_len = len(line)
-            endpos, consumed = self._find_line_ending(line, start, line_len)
-            chunked = builder.getlength()
-            if endpos >= 0:
-                if limit >= 0 and endpos >= start + limit - chunked:
-                    endpos = start + limit - chunked
-                    assert endpos >= 0
-                break
-            assert consumed >= 0
-
-            # We can put aside up to `endpos`
-            endpos = consumed + start
-            if limit >= 0 and endpos >= start + limit - chunked:
+            if limit >= 0 and end_scan - start >= remaining:
                 # Didn't find line ending, but reached length limit
-                endpos = start + limit - chunked
-                assert endpos >= 0
                 break
 
             # No line ending seen yet - put aside current data
-            if endpos > start:
-                s = line[start:endpos]
+            if end_scan > start:
+                s = line[start:end_scan]
                 builder.append(s)
 
-            # There may be some remaining bytes we'll have to prepend to the
+            # There may be some remaining chars we'll have to prepend to the
             # next chunk of data
-            if endpos < line_len:
-                remaining = line[endpos:]
+            if end_scan < len(line):
+                remnant = line[end_scan:]
             line = None
             # We have consumed the buffer
             self._unset_decoded()
 
         if line:
             # Our line ends in the current buffer
-            decoded_chars_used = endpos - offset_to_buffer
-            assert decoded_chars_used >= 0
-            self.decoded_chars_used = decoded_chars_used
-            if start > 0 or endpos < len(line):
-                line = line[start:endpos]
+            self.decoded_chars_used = end_scan
+            if start > 0 or end_scan < len(line):
+                line = line[start:end_scan]
             builder.append(line)
-        elif remaining:
-            builder.append(remaining)
+        elif remnant:
+            builder.append(remnant)
 
         result = builder.build()
         return space.new_from_utf8(result)
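
The remnant handling above only matters when newlines are not translated and
readnl is '\r\n': a decoded chunk can then end with a bare '\r' whose
matching '\n' only shows up in the next chunk. A toy plain-Python
illustration of that carry-over (hypothetical helper name, not the
interpreter code):

    def join_crlf_chunks(chunks):
        # Reassemble a stream whose '\r\n' terminators may be split across
        # chunk boundaries; a trailing '\r' is carried over as a remnant.
        out = []
        remnant = u''
        for chunk in chunks:
            if remnant:
                if chunk.startswith(u'\n'):
                    out.append(remnant + u'\n')   # complete the split '\r\n'
                    chunk = chunk[1:]
                else:
                    out.append(remnant)
                remnant = u''
            if chunk.endswith(u'\r'):
                remnant = u'\r'
                chunk = chunk[:-1]
            out.append(chunk)
        out.append(remnant)
        return u''.join(out)

    assert join_crlf_chunks([u'ab\r', u'\ncd']) == u'ab\r\ncd'
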
diff --git a/pypy/module/_io/test/test_interp_textio.py b/pypy/module/_io/test/test_interp_textio.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_io/test/test_interp_textio.py
@@ -0,0 +1,33 @@
+from hypothesis import given, strategies as st, assume
+from pypy.module._io.interp_bytesio import W_BytesIO
+from pypy.module._io.interp_textio import W_TextIOWrapper
+
+LINESEP = ['', '\r', '\n', '\r\n']
+
+@st.composite
+def text_with_newlines(draw):
+    sep = draw(st.sampled_from(LINESEP))
+    lines = draw(st.lists(st.text(max_size=10), max_size=10))
+    return sep.join(lines)
+
+@given(txt=text_with_newlines(),
+       mode=st.sampled_from(['\r', '\n', '\r\n', '']),
+       limit=st.integers(min_value=-1))
+def test_readline(space, txt, mode, limit):
+    assume(limit != 0)
+    w_stream = W_BytesIO(space)
+    w_stream.descr_init(space, space.newbytes(txt.encode('utf-8')))
+    w_textio = W_TextIOWrapper(space)
+    w_textio.descr_init(
+        space, w_stream, encoding='utf-8',
+        w_newline=space.newtext(mode))
+    lines = []
+    while True:
+        line = space.unicode_w(w_textio.readline_w(space, space.newint(limit)))
+        if limit > 0:
+            assert len(line) <= limit
+        if line:
+            lines.append(line)
+        else:
+            break
+    assert u''.join(lines) == txt
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,7 @@
 cffi>=1.4.0
-vmprof>=0.4.10  # required to parse log files in rvmprof tests
+
+# parse log files in rvmprof tests
+vmprof>=0.4.10; 'x86' in platform.machine #skip arm, s390x
 
 # hypothesis is used for test generation on untranslated tests
 hypothesis
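
The new vmprof line uses an environment marker (PEP 508) so that pip skips
the dependency on architectures where vmprof does not build, such as ARM and
s390x. To check what a marker evaluates to on a given host, the packaging
library can be used; a small sketch (illustrative only; note that PEP 508
spells the variable platform_machine, while the requirements line uses the
older dotted alias that pip also accepts):

    from packaging.markers import Marker

    marker = Marker("'x86' in platform_machine")
    # True when platform.machine() contains 'x86' (e.g. 'x86_64'),
    # False on e.g. 'armv7l' or 's390x'.
    print(marker.evaluate())
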
diff --git a/rpython/rlib/rvmprof/cintf.py b/rpython/rlib/rvmprof/cintf.py
--- a/rpython/rlib/rvmprof/cintf.py
+++ b/rpython/rlib/rvmprof/cintf.py
@@ -9,6 +9,7 @@
 from rpython.rtyper.tool import rffi_platform as platform
 from rpython.rlib import rthread, jit
 from rpython.rlib.objectmodel import we_are_translated
+from rpython.config.translationoption import get_translation_config
 
 class VMProfPlatformUnsupported(Exception):
     pass
@@ -133,11 +134,17 @@
 #endif
 """])
 
+if get_translation_config() is None:
+    # tests need the full eci here
+    _eci = global_eci
+else:
+    _eci = auto_eci
+
 vmprof_stop_sampling = rffi.llexternal("vmprof_stop_sampling", [],
-                                       rffi.INT, compilation_info=auto_eci,
+                                       rffi.INT, compilation_info=_eci,
                                        _nowrapper=True)
 vmprof_start_sampling = rffi.llexternal("vmprof_start_sampling", [],
-                                        lltype.Void, compilation_info=auto_eci,
+                                        lltype.Void, compilation_info=_eci,
                                         _nowrapper=True)
 
 

