[pypy-commit] pypy unicode-utf8: merge heads

arigo pypy.commits at gmail.com
Mon Nov 27 16:17:18 EST 2017

Author: Armin Rigo <arigo at tunes.org>
Branch: unicode-utf8
Changeset: r93186:350cb9b5b92b
Date: 2017-11-27 22:16 +0100

Log:	merge heads

diff too long, truncating to 2000 out of 2094 lines

diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -9,3 +9,5 @@
 * remove assertions from W_UnicodeObject.__init__ if all the builders pass
 * what to do with error handlers that go backwards. There were tests
   in test_codecs that would check for that
+* fix _pypyjson to not use a wrapped dict when decoding an object
diff --git a/extra_tests/test_textio.py b/extra_tests/test_textio.py
new file mode 100644
--- /dev/null
+++ b/extra_tests/test_textio.py
@@ -0,0 +1,28 @@
+from hypothesis import given, strategies as st
+from io import BytesIO, TextIOWrapper
+LINESEP = ['', '\r', '\n', '\r\n']
+@st.composite
+def text_with_newlines(draw):
+    sep = draw(st.sampled_from(LINESEP))
+    lines = draw(st.lists(st.text(max_size=10), max_size=10))
+    return sep.join(lines)
+@given(txt=text_with_newlines(),
+       mode=st.sampled_from(['\r', '\n', '\r\n', '']),
+       limit=st.integers(min_value=-1))
+def test_readline(txt, mode, limit):
+    textio = TextIOWrapper(
+        BytesIO(txt.encode('utf-8')), encoding='utf-8', newline=mode)
+    lines = []
+    while True:
+        line = textio.readline(limit)
+        if limit > 0:
+            assert len(line) < limit
+        if line:
+            lines.append(line)
+        else:
+            break
+    assert u''.join(lines) == txt
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -1760,10 +1760,6 @@
     def utf8_w(self, w_obj):
         return w_obj.utf8_w(self)
-    def unicode_w(self, w_obj):
-        # XXX: kill me!
-        return w_obj.utf8_w(self).decode('utf-8')
     def convert_to_w_unicode(self, w_obj):
         return w_obj.convert_to_w_unicode(self)
diff --git a/pypy/module/_continuation/test/conftest.py b/pypy/module/_continuation/test/conftest.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/test/conftest.py
@@ -0,0 +1,7 @@
+import pytest
+import sys
+def pytest_configure(config):
+    if sys.platform.startswith('linux'):
+        from rpython.rlib.rvmprof.cintf import configure_libbacktrace_linux
+        configure_libbacktrace_linux()
diff --git a/pypy/module/_io/interp_stringio.py b/pypy/module/_io/interp_stringio.py
--- a/pypy/module/_io/interp_stringio.py
+++ b/pypy/module/_io/interp_stringio.py
@@ -2,21 +2,115 @@
 from pypy.interpreter.typedef import (
     TypeDef, generic_new_descr, GetSetProperty)
 from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
-from pypy.module._io.interp_textio import W_TextIOBase, W_IncrementalNewlineDecoder
+from pypy.module._io.interp_textio import (
+        W_TextIOBase, W_IncrementalNewlineDecoder)
 from pypy.module._io.interp_iobase import convert_size
+class UnicodeIO(object):
+    def __init__(self, data=None, pos=0):
+        if data is None:
+            data = []
+        self.data = data
+        self.pos = pos
+    def resize(self, newlength):
+        if len(self.data) > newlength:
+            self.data = self.data[:newlength]
+        if len(self.data) < newlength:
+            self.data.extend([u'\0'] * (newlength - len(self.data)))
+    def read(self, size):
+        start = self.pos
+        available = len(self.data) - start
+        if available <= 0:
+            return u''
+        if size >= 0 and size <= available:
+            end = start + size
+        else:
+            end = len(self.data)
+        assert 0 <= start <= end
+        self.pos = end
+        return u''.join(self.data[start:end])
+    def _convert_limit(self, limit):
+        if limit < 0 or limit > len(self.data) - self.pos:
+            limit = len(self.data) - self.pos
+        assert limit >= 0
+        return limit
+    def readline_universal(self, limit):
+        # Universal newline search. Find any of \r, \r\n, \n
+        limit = self._convert_limit(limit)
+        start = self.pos
+        end = start + limit
+        pos = start
+        while pos < end:
+            ch = self.data[pos]
+            pos += 1
+            if ch == '\n':
+                break
+            if ch == '\r':
+                if pos >= end:
+                    break
+                if self.data[pos] == '\n':
+                    pos += 1
+                    break
+                else:
+                    break
+        self.pos = pos
+        result = u''.join(self.data[start:pos])
+        return result
+    def readline(self, marker, limit):
+        start = self.pos
+        limit = self._convert_limit(limit)
+        end = start + limit
+        found = False
+        for pos in range(start, end - len(marker) + 1):
+            ch = self.data[pos]
+            if ch == marker[0]:
+                for j in range(1, len(marker)):
+                    if self.data[pos + j] != marker[j]:
+                        break  # from inner loop
+                else:
+                    pos += len(marker)
+                    found = True
+                    break
+        if not found:
+            pos = end
+        self.pos = pos
+        result = u''.join(self.data[start:pos])
+        return result
+    def write(self, string):
+        length = len(string)
+        if self.pos + length > len(self.data):
+            self.resize(self.pos + length)
+        for i in range(length):
+            self.data[self.pos + i] = string[i]
+        self.pos += length
+    def seek(self, pos):
+        self.pos = pos
+    def truncate(self, size):
+        if size < len(self.data):
+            self.resize(size)
+    def getvalue(self):
+        return u''.join(self.data)
 class W_StringIO(W_TextIOBase):
     def __init__(self, space):
         W_TextIOBase.__init__(self, space)
-        self.buf = []
-        self.pos = 0
+        self.buf = UnicodeIO()
-    @unwrap_spec(w_newline = WrappedDefault("\n"))
+    @unwrap_spec(w_newline=WrappedDefault("\n"))
     def descr_init(self, space, w_initvalue=None, w_newline=None):
         # In case __init__ is called multiple times
-        self.buf = []
-        self.pos = 0
+        self.buf = UnicodeIO()
         self.w_decoder = None
         self.readnl = None
         self.writenl = None
@@ -27,7 +121,7 @@
             newline = space.unicode_w(w_newline)
         if (newline is not None and newline != u"" and newline != u"\n" and
-            newline != u"\r" and newline != u"\r\n"):
+                newline != u"\r" and newline != u"\r\n"):
             # Not using oefmt() because I don't know how to use it
             # with unicode
             raise OperationError(space.w_ValueError,
@@ -50,7 +144,7 @@
         if not space.is_none(w_initvalue):
             self.write_w(space, w_initvalue)
-            self.pos = 0
+            self.buf.pos = 0
     def descr_getstate(self, space):
         w_initialval = self.getvalue_w(space)
@@ -58,9 +152,9 @@
         if self.readnl is None:
             w_readnl = space.w_None
-            w_readnl = space.str(space.newunicode(self.readnl)) # YYY
+            w_readnl = space.str(space.newunicode(self.readnl))  # YYY
         return space.newtuple([
-            w_initialval, w_readnl, space.newint(self.pos), w_dict
+            w_initialval, w_readnl, space.newint(self.buf.pos), w_dict
     def descr_setstate(self, space, w_state):
@@ -69,34 +163,33 @@
         # We allow the state tuple to be longer than 4, because we may need
         # someday to extend the object's state without breaking
         # backwards-compatibility
-        if not space.isinstance_w(w_state, space.w_tuple) or space.len_w(w_state) < 4:
+        if (not space.isinstance_w(w_state, space.w_tuple)
+                or space.len_w(w_state) < 4):
             raise oefmt(space.w_TypeError,
                         "%T.__setstate__ argument should be a 4-tuple, got %T",
                         self, w_state)
         w_initval, w_readnl, w_pos, w_dict = space.unpackiterable(w_state, 4)
+        if not space.isinstance_w(w_initval, space.w_unicode):
+            raise oefmt(space.w_TypeError,
+                        "unicode argument expected, got '%T'", w_initval)
         # Initialize state
-        self.descr_init(space, w_initval, w_readnl)
+        self.descr_init(space, None, w_readnl)
-        # Restore the buffer state. Even if __init__ did initialize the buffer,
-        # we have to initialize it again since __init__ may translates the
-        # newlines in the inital_value string. We clearly do not want that
+        # Restore the buffer state. We're not doing it via __init__
         # because the string value in the state tuple has already been
         # translated once by __init__. So we do not take any chance and replace
         # object's buffer completely
         initval = space.unicode_w(w_initval)
-        size = len(initval)
-        self.resize_buffer(size)
-        self.buf = list(initval)
         pos = space.getindex_w(w_pos, space.w_TypeError)
         if pos < 0:
             raise oefmt(space.w_ValueError,
                         "position value cannot be negative")
-        self.pos = pos
+        self.buf = UnicodeIO(list(initval), pos)
         if not space.is_w(w_dict, space.w_None):
             if not space.isinstance_w(w_dict, space.w_dict):
-                raise oefmt(space.w_TypeError,
-                            "fourth item of state should be a dict, got a %T",
-                            w_dict)
+                raise oefmt(
+                    space.w_TypeError,
+                    "fourth item of state should be a dict, got a %T", w_dict)
             # Alternatively, we could replace the internal dictionary
             # completely. However, it seems more practical to just update it.
             space.call_method(self.w_dict, "update", w_dict)
@@ -107,88 +200,47 @@
                 message = "I/O operation on closed file"
             raise OperationError(space.w_ValueError, space.newtext(message))
-    def resize_buffer(self, newlength):
-        if len(self.buf) > newlength:
-            self.buf = self.buf[:newlength]
-        if len(self.buf) < newlength:
-            self.buf.extend([u'\0'] * (newlength - len(self.buf)))
-    def write(self, string):
-        length = len(string)
-        if self.pos + length > len(self.buf):
-            self.resize_buffer(self.pos + length)
-        for i in range(length):
-            self.buf[self.pos + i] = string[i]
-        self.pos += length
     def write_w(self, space, w_obj):
         if not space.isinstance_w(w_obj, space.w_unicode):
             raise oefmt(space.w_TypeError,
                         "unicode argument expected, got '%T'", w_obj)
         orig_size = space.len_w(w_obj)
         if self.w_decoder is not None:
             w_decoded = space.call_method(
-                self.w_decoder, "decode", w_obj, space.w_True
-            )
+                self.w_decoder, "decode", w_obj, space.w_True)
             w_decoded = w_obj
         if self.writenl:
             w_decoded = space.call_method(
-                w_decoded, "replace", space.newtext("\n"), space.newunicode(self.writenl)
-            )
+                w_decoded, "replace",
+                space.newtext("\n"), space.newunicode(self.writenl))
+        string = space.unicode_w(w_decoded)
+        if string:
+            self.buf.write(string)
-        string = space.unicode_w(w_decoded)
-        size = len(string)
-        if size:
-            self.write(string)
         return space.newint(orig_size)
     def read_w(self, space, w_size=None):
         size = convert_size(space, w_size)
-        start = self.pos
-        available = len(self.buf) - start
-        if available <= 0:
-            return space.newunicode(u"")
-        if size >= 0 and size <= available:
-            end = start + size
-        else:
-            end = len(self.buf)
-        assert 0 <= start <= end
-        self.pos = end
-        return space.newunicode(u''.join(self.buf[start:end]))
+        return space.newunicode(self.buf.read(size))
     def readline_w(self, space, w_limit=None):
         limit = convert_size(space, w_limit)
+        if self.readuniversal:
+            result = self.buf.readline_universal(limit)
+        else:
+            if self.readtranslate:
+                # Newlines are already translated, only search for \n
+                newline = u'\n'
+            else:
+                newline = self.readnl
+            result = self.buf.readline(newline, limit)
+        return space.newunicode(result)
-        if self.pos >= len(self.buf):
-            return space.newunicode(u"")
-        start = self.pos
-        if limit < 0 or limit > len(self.buf) - self.pos:
-            limit = len(self.buf) - self.pos
-        assert limit >= 0
-        end = start + limit
-        endpos, consumed = self._find_line_ending(
-            # XXX: super inefficient, makes a copy of the entire contents.
-            u"".join(self.buf),
-            start,
-            end
-        )
-        if endpos < 0:
-            endpos = end
-        assert endpos >= 0
-        self.pos = endpos
-        return space.newunicode(u"".join(self.buf[start:endpos]))
     @unwrap_spec(pos=int, mode=int)
     def seek_w(self, space, pos, mode=0):
@@ -204,32 +256,27 @@
         # XXX: this makes almost no sense, but its how CPython does it.
         if mode == 1:
-            pos = self.pos
+            pos = self.buf.pos
         elif mode == 2:
-            pos = len(self.buf)
+            pos = len(self.buf.data)
         assert pos >= 0
-        self.pos = pos
+        self.buf.seek(pos)
         return space.newint(pos)
     def truncate_w(self, space, w_size=None):
         if space.is_none(w_size):
-            size = self.pos
+            size = self.buf.pos
             size = space.int_w(w_size)
         if size < 0:
             raise oefmt(space.w_ValueError, "Negative size value %d", size)
-        if size < len(self.buf):
-            self.resize_buffer(size)
+        self.buf.truncate(size)
         return space.newint(size)
     def getvalue_w(self, space):
-        return space.newunicode(u''.join(self.buf))
+        return space.newunicode(self.buf.getvalue())
     def readable_w(self, space):
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -221,46 +221,6 @@
     def newlines_get_w(self, space):
         return space.w_None
-    def _find_line_ending(self, line, start, end):
-        size = end - start
-        if self.readtranslate:
-            # Newlines are already translated, only search for \n
-            pos = line.find('\n', start, end)
-            if pos >= 0:
-                return pos + 1, 0
-            else:
-                return -1, size
-        elif self.readuniversal:
-            # Universal newline search. Find any of \r, \r\n, \n
-            # The decoder ensures that \r\n are not split in two pieces
-            i = start
-            while True:
-                # Fast path for non-control chars.
-                while i < end and line[i] > '\r':
-                    i += 1
-                if i >= end:
-                    return -1, size
-                ch = line[i]
-                i += 1
-                if ch == '\n':
-                    return i, 0
-                if ch == '\r':
-                    if line[i] == '\n':
-                        return i + 1, 0
-                    else:
-                        return i, 0
-        else:
-            # Non-universal mode.
-            pos = line.find(self.readnl, start, end)
-            if pos >= 0:
-                return pos + len(self.readnl), 0
-            else:
-                pos = line.find(self.readnl[0], start, end)
-                if pos >= 0:
-                    return -1, pos - start
-                return -1, size
 W_TextIOBase.typedef = TypeDef(
     '_io._TextIOBase', W_IOBase.typedef,
     __new__ = generic_new_descr(W_TextIOBase),
@@ -336,6 +296,126 @@
         self.input = input
+class DecodeBuffer(object):
+    def __init__(self, text=None):
+        self.text = text
+        self.pos = 0
+    def set(self, space, w_decoded):
+        check_decoded(space, w_decoded)
+        self.text = space.unicode_w(w_decoded)
+        self.pos = 0
+    def reset(self):
+        self.text = None
+        self.pos = 0
+    def get_chars(self, size):
+        if self.text is None:
+            return u""
+        available = len(self.text) - self.pos
+        if size < 0 or size > available:
+            size = available
+        assert size >= 0
+        if self.pos > 0 or size < available:
+            start = self.pos
+            end = self.pos + size
+            assert start >= 0
+            assert end >= 0
+            chars = self.text[start:end]
+        else:
+            chars = self.text
+        self.pos += size
+        return chars
+    def has_data(self):
+        return (self.text is not None and not self.exhausted())
+    def exhausted(self):
+        return self.pos >= len(self.text)
+    def next_char(self):
+        if self.exhausted():
+            raise StopIteration
+        ch = self.text[self.pos]
+        self.pos += 1
+        return ch
+    def peek_char(self):
+        # like next_char, but doesn't advance pos
+        if self.exhausted():
+            raise StopIteration
+        ch = self.text[self.pos]
+        return ch
+    def find_newline_universal(self, limit):
+        # Universal newline search. Find any of \r, \r\n, \n
+        # The decoder ensures that \r\n are not split in two pieces
+        if limit < 0:
+            limit = sys.maxint
+        scanned = 0
+        while scanned < limit:
+            try:
+                ch = self.next_char()
+            except StopIteration:
+                return False
+            if ch == u'\n':
+                return True
+            if ch == u'\r':
+                if scanned >= limit:
+                    return False
+                try:
+                    ch = self.peek_char()
+                except StopIteration:
+                    return False
+                if ch == u'\n':
+                    self.next_char()
+                    return True
+                else:
+                    return True
+        return False
+    def find_crlf(self, limit):
+        if limit < 0:
+            limit = sys.maxint
+        scanned = 0
+        while scanned < limit:
+            try:
+                ch = self.next_char()
+            except StopIteration:
+                return False
+            scanned += 1
+            if ch == u'\r':
+                if scanned >= limit:
+                    return False
+                try:
+                    if self.peek_char() == u'\n':
+                        self.next_char()
+                        return True
+                except StopIteration:
+                    # This is the tricky case: we found a \r right at the end
+                    self.pos -= 1
+                    return False
+        return False
+    def find_char(self, marker, limit):
+        if limit < 0:
+            limit = sys.maxint
+        scanned = 0
+        while scanned < limit:
+            try:
+                ch = self.next_char()
+            except StopIteration:
+                return False
+            if ch == marker:
+                return True
+            scanned += 1
+        return False
 def check_decoded(space, w_decoded):
     if not space.isinstance_w(w_decoded, space.w_unicode):
         msg = "decoder should return a string result, not '%T'"
@@ -349,8 +429,7 @@
         self.w_encoder = None
         self.w_decoder = None
-        self.decoded_chars = None   # buffer for text returned from decoder
-        self.decoded_chars_used = 0 # offset into _decoded_chars for read()
+        self.decoded = DecodeBuffer()
         self.pending_bytes = None   # list of bytes objects waiting to be
                                     # written, or NULL
         self.chunk_size = 8192
@@ -518,40 +597,10 @@
     # _____________________________________________________________
     # read methods
-    def _unset_decoded(self):
-        self.decoded_chars = None
-        self.decoded_chars_used = 0
-    def _set_decoded(self, space, w_decoded):
-        check_decoded(space, w_decoded)
-        self.decoded_chars = space.utf8_w(w_decoded)
-        self.decoded_chars_used = 0
-    def _get_decoded_chars(self, size):
-        if self.decoded_chars is None:
-            return ""
-        available = len(self.decoded_chars) - self.decoded_chars_used
-        if size < 0 or size > available:
-            size = available
-        assert size >= 0
-        if self.decoded_chars_used > 0 or size < available:
-            start = self.decoded_chars_used
-            end = self.decoded_chars_used + size
-            assert start >= 0
-            assert end >= 0
-            chars = self.decoded_chars[start:end]
-        else:
-            chars = self.decoded_chars
-        self.decoded_chars_used += size
-        return chars
     def _read_chunk(self, space):
         """Read and decode the next chunk of data from the BufferedReader.
         The return value is True unless EOF was reached.  The decoded string
-        is placed in self._decoded_chars (replacing its previous value).
+        is placed in self.decoded (replacing its previous value).
         The entire input chunk is sent to the decoder, though some of it may
         remain buffered in the decoder, yet to be converted."""
@@ -571,7 +620,7 @@
             dec_buffer = None
             dec_flags = 0
-        # Read a chunk, decode it, and put the result in self._decoded_chars
+        # Read a chunk, decode it, and put the result in self.decoded
         w_input = space.call_method(self.w_buffer, "read1",
@@ -583,7 +632,7 @@
         eof = space.len_w(w_input) == 0
         w_decoded = space.call_method(self.w_decoder, "decode",
                                       w_input, space.newbool(eof))
-        self._set_decoded(space, w_decoded)
+        self.decoded.set(space, w_decoded)
         if space.len_w(w_decoded) > 0:
             eof = False
@@ -595,6 +644,19 @@
         return not eof
+    def _ensure_data(self, space):
+        while not self.decoded.has_data():
+            try:
+                if not self._read_chunk(space):
+                    self.decoded.reset()
+                    self.snapshot = None
+                    return False
+            except OperationError as e:
+                if trap_eintr(space, e):
+                    continue
+                raise
+        return True
     def next_w(self, space):
         self.telling = False
@@ -619,7 +681,7 @@
             w_bytes = space.call_method(self.w_buffer, "read")
             w_decoded = space.call_method(self.w_decoder, "decode", w_bytes, space.w_True)
             check_decoded(space, w_decoded)
-            w_result = space.new_from_utf8(self._get_decoded_chars(-1))
+            w_result = space.new_from_utf8(self.decoded.get_chars(-1))
             w_final = space.add(w_result, w_decoded)
             self.snapshot = None
             return w_final
@@ -628,24 +690,29 @@
         builder = StringBuilder(size)
         # Keep reading chunks until we have n characters to return
-        while True:
-            data = self._get_decoded_chars(remaining)
+        while remaining > 0:
+            if not self._ensure_data(space):
+                break
+            data = self.decoded.get_chars(remaining)
             remaining -= len(data)
-            if remaining <= 0: # Done
-                break
+        return space.new_from_utf8(builder.build())
-            try:
-                if not self._read_chunk(space):
-                    # EOF
-                    break
-            except OperationError as e:
-                if trap_eintr(space, e):
-                    continue
-                raise
-        return space.new_from_utf8(builder.build())
+    def _scan_line_ending(self, limit):
+        if self.readuniversal:
+            return self.decoded.find_newline_universal(limit)
+        else:
+            if self.readtranslate:
+                # Newlines are already translated, only search for \n
+                newline = u'\n'
+            else:
+                # Non-universal mode.
+                newline = self.readnl
+            if newline == u'\r\n':
+                return self.decoded.find_crlf(limit)
+            else:
+                return self.decoded.find_char(newline[0], limit)
     def readline_w(self, space, w_limit=None):
@@ -653,82 +720,52 @@
         limit = convert_size(space, w_limit)
-        line = None
-        remaining = None
+        remnant = None
         builder = StringBuilder()
         while True:
             # First, get some data if necessary
-            has_data = True
-            while not self.decoded_chars:
-                try:
-                    if not self._read_chunk(space):
-                        has_data = False
-                        break
-                except OperationError as e:
-                    if trap_eintr(space, e):
-                        continue
-                    raise
+            has_data = self._ensure_data(space)
             if not has_data:
                 # end of file
-                self._unset_decoded()
-                self.snapshot = None
-                start = endpos = offset_to_buffer = 0
+                if remnant:
+                    builder.append(remnant)
-            if not remaining:
-                line = self.decoded_chars
-                start = self.decoded_chars_used
-                offset_to_buffer = 0
+            if remnant:
+                assert not self.readtranslate and self.readnl == '\r\n'
+                assert self.decoded.pos == 0
+                if remnant == '\r' and self.decoded.text[0] == '\n':
+                    builder.append('\r\n')
+                    self.decoded.pos = 1
+                    remnant = None
+                    break
+                else:
+                    builder.append(remnant)
+                    remnant = None
+                    continue
+            if limit > 0:
+                remaining = limit - builder.getlength()
+                assert remaining >= 0
-                assert self.decoded_chars_used == 0
-                line = remaining + self.decoded_chars
-                start = 0
-                offset_to_buffer = len(remaining)
-                remaining = None
+                remaining = -1
+            start = self.decoded.pos
+            assert start >= 0
+            found = self._scan_line_ending(remaining)
+            end_scan = self.decoded.pos
+            if end_scan > start:
+                s = self.decoded.text[start:end_scan]
+                builder.append(s)
-            line_len = len(line)
-            endpos, consumed = self._find_line_ending(line, start, line_len)
-            chunked = builder.getlength()
-            if endpos >= 0:
-                if limit >= 0 and endpos >= start + limit - chunked:
-                    endpos = start + limit - chunked
-                    assert endpos >= 0
-                break
-            assert consumed >= 0
-            # We can put aside up to `endpos`
-            endpos = consumed + start
-            if limit >= 0 and endpos >= start + limit - chunked:
-                # Didn't find line ending, but reached length limit
-                endpos = start + limit - chunked
-                assert endpos >= 0
+            if found or (limit >= 0 and builder.getlength() >= limit):
-            # No line ending seen yet - put aside current data
-            if endpos > start:
-                s = line[start:endpos]
-                builder.append(s)
-            # There may be some remaining bytes we'll have to prepend to the
+            # There may be some remaining chars we'll have to prepend to the
             # next chunk of data
-            if endpos < line_len:
-                remaining = line[endpos:]
-            line = None
+            if not self.decoded.exhausted():
+                remnant = self.decoded.get_chars(-1)
             # We have consumed the buffer
-            self._unset_decoded()
-        if line:
-            # Our line ends in the current buffer
-            decoded_chars_used = endpos - offset_to_buffer
-            assert decoded_chars_used >= 0
-            self.decoded_chars_used = decoded_chars_used
-            if start > 0 or endpos < len(line):
-                line = line[start:endpos]
-            builder.append(line)
-        elif remaining:
-            builder.append(remaining)
+            self.decoded.reset()
         result = builder.build()
         return space.new_from_utf8(result)
@@ -862,7 +899,7 @@
                 raise oefmt(space.w_IOError,
                             "can't do nonzero end-relative seeks")
             space.call_method(self, "flush")
-            self._unset_decoded()
+            self.decoded.reset()
             self.snapshot = None
             if self.w_decoder:
                 space.call_method(self.w_decoder, "reset")
@@ -887,7 +924,7 @@
         # Seek back to the safe start point
         space.call_method(self.w_buffer, "seek", space.newint(cookie.start_pos))
-        self._unset_decoded()
+        self.decoded.reset()
         self.snapshot = None
         # Restore the decoder to its state from the safe start point.
@@ -908,13 +945,13 @@
             w_decoded = space.call_method(self.w_decoder, "decode",
                                           w_chunk, space.newbool(bool(cookie.need_eof)))
-            self._set_decoded(space, w_decoded)
+            self.decoded.set(space, w_decoded)
             # Skip chars_to_skip of the decoded characters
-            if len(self.decoded_chars) < cookie.chars_to_skip:
+            if len(self.decoded.text) < cookie.chars_to_skip:
                 raise oefmt(space.w_IOError,
                             "can't restore logical file position")
-            self.decoded_chars_used = cookie.chars_to_skip
+            self.decoded.pos = cookie.chars_to_skip
             self.snapshot = PositionSnapshot(cookie.dec_flags, "")
@@ -940,7 +977,7 @@
         w_pos = space.call_method(self.w_buffer, "tell")
         if self.w_decoder is None or self.snapshot is None:
-            assert not self.decoded_chars
+            assert not self.decoded.text
             return w_pos
         cookie = PositionCookie(space.bigint_w(w_pos))
@@ -951,11 +988,11 @@
         cookie.start_pos -= len(input)
         # How many decoded characters have been used up since the snapshot?
-        if not self.decoded_chars_used:
+        if not self.decoded.pos:
             # We haven't moved from the snapshot point.
             return space.newlong_from_rbigint(cookie.pack())
-        chars_to_skip = self.decoded_chars_used
+        chars_to_skip = self.decoded.pos
         # Starting from the snapshot position, we will walk the decoder
         # forward until it gives us enough decoded characters.
diff --git a/pypy/module/_io/test/test_interp_textio.py b/pypy/module/_io/test/test_interp_textio.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_io/test/test_interp_textio.py
@@ -0,0 +1,68 @@
+import pytest
+try:
+    from hypothesis import given, strategies as st, assume
+except ImportError:
+    pytest.skip("hypothesis required")
+from pypy.module._io.interp_bytesio import W_BytesIO
+from pypy.module._io.interp_textio import W_TextIOWrapper, DecodeBuffer
+LINESEP = ['', '\r', '\n', '\r\n']
+@st.composite
+def text_with_newlines(draw):
+    sep = draw(st.sampled_from(LINESEP))
+    lines = draw(st.lists(st.text(max_size=10), max_size=10))
+    return sep.join(lines)
+@given(txt=text_with_newlines(),
+       mode=st.sampled_from(['\r', '\n', '\r\n', '']),
+       limit=st.integers(min_value=-1))
+def test_readline(space, txt, mode, limit):
+    assume(limit != 0)
+    w_stream = W_BytesIO(space)
+    w_stream.descr_init(space, space.newbytes(txt.encode('utf-8')))
+    w_textio = W_TextIOWrapper(space)
+    w_textio.descr_init(
+        space, w_stream, encoding='utf-8',
+        w_newline=space.newtext(mode))
+    lines = []
+    while True:
+        line = space.unicode_w(w_textio.readline_w(space, space.newint(limit)))
+        if limit > 0:
+            assert len(line) <= limit
+        if line:
+            lines.append(line)
+        else:
+            break
+    assert u''.join(lines) == txt
+@given(st.text())
+def test_read_buffer(text):
+    buf = DecodeBuffer(text)
+    assert buf.get_chars(-1) == text
+    assert buf.exhausted()
+@given(st.text(), st.lists(st.integers(min_value=0)))
+def test_readn_buffer(text, sizes):
+    buf = DecodeBuffer(text)
+    strings = []
+    for n in sizes:
+        s = buf.get_chars(n)
+        if not buf.exhausted():
+            assert len(s) == n
+        else:
+            assert len(s) <= n
+        strings.append(s)
+    assert ''.join(strings) == text[:sum(sizes)]
+@given(st.text())
+def test_next_char(text):
+    buf = DecodeBuffer(text)
+    chars = []
+    try:
+        while True:
+            chars.append(buf.next_char())
+    except StopIteration:
+        pass
+    assert buf.exhausted()
+    assert u''.join(chars) == text
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -197,19 +197,21 @@
-def encode(codec, unicodedata, errors="strict", errorcb=None, namecb=None):
+def encode(codec, unicodedata, length, errors="strict", errorcb=None,
+           namecb=None):
     encodebuf = pypy_cjk_enc_new(codec)
     if not encodebuf:
         raise MemoryError
-        return encodeex(encodebuf, unicodedata, errors, errorcb, namecb)
+        return encodeex(encodebuf, unicodedata, length, errors, errorcb, namecb)
-def encodeex(encodebuf, unicodedata, errors="strict", errorcb=None,
+def encodeex(encodebuf, utf8data, length, errors="strict", errorcb=None,
              namecb=None, ignore_error=0):
-    inleft = len(unicodedata)
-    with rffi.scoped_nonmoving_unicodebuffer(unicodedata) as inbuf:
+    inleft = length
+    inbuf = rffi.utf82wcharp(utf8data, length)
+    try:
         if pypy_cjk_enc_init(encodebuf, inbuf, inleft) < 0:
             raise MemoryError
         if ignore_error == 0:
@@ -221,16 +223,18 @@
             if r == 0 or r == ignore_error:
             multibytecodec_encerror(encodebuf, r, errors,
-                                    errorcb, namecb, unicodedata)
+                                    errorcb, namecb, utf8data)
         while flags & MBENC_RESET:
             r = pypy_cjk_enc_reset(encodebuf)
             if r == 0:
             multibytecodec_encerror(encodebuf, r, errors,
-                                    errorcb, namecb, unicodedata)
+                                    errorcb, namecb, utf8data)
         src = pypy_cjk_enc_outbuf(encodebuf)
         length = pypy_cjk_enc_outlen(encodebuf)
         return rffi.charpsize2str(src, length)
+    finally:
+        lltype.free(inbuf, flavor='raw')
 def multibytecodec_encerror(encodebuf, e, errors,
                             errorcb, namecb, unicodedata):
@@ -256,21 +260,16 @@
     elif errors == "replace":
         codec = pypy_cjk_enc_getcodec(encodebuf)
-            replace = encode(codec, u"?")
+            replace = encode(codec, "?", 1)
         except EncodeDecodeError:
             replace = "?"
         assert errorcb
-        XXX
-        retu, rets, end = errorcb(errors, namecb, reason,
-                                  unicodedata.encode("utf8"), start, end)
-        if rets is not None:
-            # py3k only
-            replace = rets
-        else:
-            assert retu is not None
-            codec = pypy_cjk_enc_getcodec(encodebuf)
-            replace = encode(codec, retu, "strict", errorcb, namecb)
+        rets, end = errorcb(errors, namecb, reason,
+                            unicodedata, start, end)
+        codec = pypy_cjk_enc_getcodec(encodebuf)
+        lgt, _ = rutf8.get_utf8_length_flag(rets)
+        replace = encode(codec, rets, lgt, "strict", errorcb, namecb)
     with rffi.scoped_nonmovingbuffer(replace) as inbuf:
         r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end)
     if r == MBERR_NOMEMORY:
diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py
--- a/pypy/module/_multibytecodec/interp_incremental.py
+++ b/pypy/module/_multibytecodec/interp_incremental.py
@@ -1,4 +1,5 @@
 from rpython.rtyper.lltypesystem import lltype
+from rpython.rlib import rutf8
 from pypy.module._multibytecodec import c_codecs
 from pypy.module._multibytecodec.interp_multibytecodec import (
     MultibyteCodec, wrap_unicodedecodeerror, wrap_runtimeerror,
@@ -65,7 +66,8 @@
         pos = c_codecs.pypy_cjk_dec_inbuf_consumed(self.decodebuf)
         assert 0 <= pos <= len(object)
         self.pending = object[pos:]
-        return space.newunicode(output)
+        lgt, flag = rutf8.get_utf8_length_flag(output)
+        return space.newutf8(output, lgt, flag)
@@ -88,7 +90,8 @@
     def _initialize(self):
         self.encodebuf = c_codecs.pypy_cjk_enc_new(self.codec)
-        self.pending = u""
+        self.pending = ""
+        self.pending_len = 0
     def _free(self):
         self.pending = None
@@ -96,25 +99,37 @@
             self.encodebuf = lltype.nullptr(c_codecs.ENCODEBUF_P.TO)
-    @unwrap_spec(object='utf8', final=bool)
-    def encode_w(self, object, final=False):
-        u_object = object.decode('utf8')
+    @unwrap_spec(final=bool)
+    def encode_w(self, space, w_object, final=False):
+        utf8data, length = space.utf8_len_w(w_object)
         space = self.space
         state = space.fromcache(CodecState)
         if len(self.pending) > 0:
-            u_object = self.pending + u_object
+            utf8data = self.pending + utf8data
+            length += self.pending_len
-            output = c_codecs.encodeex(self.encodebuf, u_object, self.errors,
+            output = c_codecs.encodeex(self.encodebuf, utf8data, length,
+                                       self.errors,
                                        state.encode_error_handler, self.name,
         except c_codecs.EncodeDecodeError as e:
-            raise wrap_unicodeencodeerror(space, e, object, len(u_object),
+            raise wrap_unicodeencodeerror(space, e, utf8data, length,
         except RuntimeError:
             raise wrap_runtimeerror(space)
         pos = c_codecs.pypy_cjk_enc_inbuf_consumed(self.encodebuf)
-        assert 0 <= pos <= len(u_object)
-        self.pending = u_object[pos:]
+        assert 0 <= pos <= length
+        # scan the utf8 string until we hit pos
+        i = 0
+        stop = length - pos
+        self.pending_len = stop
+        if stop > 0:
+            while pos > 0:
+                i = rutf8.next_codepoint_pos(utf8data, i)
+                pos -= 1
+            self.pending = utf8data[i:]
+        else:
+            self.pending = ""
         return space.newbytes(output)
diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py
--- a/pypy/module/_multibytecodec/interp_multibytecodec.py
+++ b/pypy/module/_multibytecodec/interp_multibytecodec.py
@@ -31,23 +31,23 @@
         return space.newtuple([space.newutf8(utf8_output, lgt, flag),
-    @unwrap_spec(input='utf8', errors="text_or_none")
-    def encode(self, space, input, errors=None):
+    @unwrap_spec(errors="text_or_none")
+    def encode(self, space, w_input, errors=None):
         if errors is None:
             errors = 'strict'
         state = space.fromcache(CodecState)
+        input, length = space.utf8_len_w(w_input)
-        u_input = input.decode('utf8')
-            output = c_codecs.encode(self.codec, u_input, errors,
+            output = c_codecs.encode(self.codec, input, length, errors,
                                      state.encode_error_handler, self.name)
         except c_codecs.EncodeDecodeError as e:
-            raise wrap_unicodeencodeerror(space, e, input, len(u_input),
+            raise wrap_unicodeencodeerror(space, e, input, length,
         except RuntimeError:
             raise wrap_runtimeerror(space)
         return space.newtuple([space.newbytes(output),
-                               space.newint(len(u_input))])
+                               space.newint(length)])
 MultibyteCodec.typedef = TypeDef(
diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py b/pypy/module/_multibytecodec/test/test_c_codecs.py
--- a/pypy/module/_multibytecodec/test/test_c_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_c_codecs.py
@@ -14,27 +14,27 @@
 def test_decode_gbk():
     c = getcodec("gbk")
     u = decode(c, "\xA1\xAA")
-    assert u == unichr(0x2014)
+    assert u == unichr(0x2014).encode('utf8')
     u = decode(c, "foobar")
-    assert u == u"foobar"
+    assert u == "foobar"
 def test_decode_hz():
     # stateful
     c = getcodec("hz")
     u = decode(c, "~{abc}")
-    assert u == u'\u5f95\u6cef'
+    assert u == u'\u5f95\u6cef'.encode('utf8')
     u = decode(c, "~{")
-    assert u == u''
+    assert u == ''
 def test_decodeex_hz():
     c = getcodec("hz")
     decodebuf = c_codecs.pypy_cjk_dec_new(c)
     u = c_codecs.decodeex(decodebuf, "~{abcd~}")
-    assert u == u'\u5f95\u6c85'
+    assert u == u'\u5f95\u6c85'.encode('utf8')
     u = c_codecs.decodeex(decodebuf, "~{efgh~}")
-    assert u == u'\u5f50\u73b7'
+    assert u == u'\u5f50\u73b7'.encode('utf8')
     u = c_codecs.decodeex(decodebuf, "!~{abcd~}xyz~{efgh")
-    assert u == u'!\u5f95\u6c85xyz\u5f50\u73b7'
+    assert u == u'!\u5f95\u6c85xyz\u5f50\u73b7'.encode('utf8')
 def test_decodeex_hz_incomplete():
@@ -64,7 +64,7 @@
         buf += c
         u = c_codecs.decodeex(decodebuf, buf,
                               ignore_error = c_codecs.MBERR_TOOFEW)
-        assert u == output
+        assert u == output.encode('utf8')
         incompletepos = c_codecs.pypy_cjk_dec_inbuf_consumed(decodebuf)
         buf = buf[incompletepos:]
     assert buf == ''
@@ -86,46 +86,47 @@
 def test_decode_hz_ignore():
     c = getcodec("hz")
     u = decode(c, 'def~{}abc', 'ignore')
-    assert u == u'def\u5fcf'
+    assert u == u'def\u5fcf'.encode('utf8')
 def test_decode_hz_replace():
     c = getcodec("hz")
     u = decode(c, 'def~{}abc', 'replace')
-    assert u == u'def\ufffd\u5fcf'
+    assert u == u'def\ufffd\u5fcf'.encode('utf8')
 def test_encode_hz():
     c = getcodec("hz")
-    s = encode(c, u'foobar')
+    s = encode(c, u'foobar'.encode('utf8'), 6)
     assert s == 'foobar' and type(s) is str
-    s = encode(c, u'\u5f95\u6cef')
+    s = encode(c, u'\u5f95\u6cef'.encode('utf8'), 2)
     assert s == '~{abc}~}'
 def test_encode_hz_error():
     # error
     c = getcodec("hz")
-    e = py.test.raises(EncodeDecodeError, encode, c, u'abc\u1234def').value
+    e = py.test.raises(EncodeDecodeError, encode, c, u'abc\u1234def'.encode('utf8'), 7).value
     assert e.start == 3
     assert e.end == 4
     assert e.reason == "illegal multibyte sequence"
 def test_encode_hz_ignore():
     c = getcodec("hz")
-    s = encode(c, u'abc\u1234def', 'ignore')
+    s = encode(c, u'abc\u1234def'.encode('utf8'), 7, 'ignore')
     assert s == 'abcdef'
 def test_encode_hz_replace():
     c = getcodec("hz")
-    s = encode(c, u'abc\u1234def', 'replace')
+    s = encode(c, u'abc\u1234def'.encode('utf8'), 7, 'replace')
     assert s == 'abc?def'
 def test_encode_jisx0208():
     c = getcodec('iso2022_jp')
-    s = encode(c, u'\u83ca\u5730\u6642\u592b')
+    s = encode(c, u'\u83ca\u5730\u6642\u592b'.encode('utf8'), 4)
     assert s == '\x1b$B5FCO;~IW\x1b(B' and type(s) is str
 def test_encode_custom_error_handler_bytes():
+    py.test.skip("needs revamping in py3k")
     c = getcodec("hz")
     def errorhandler(errors, enc, msg, t, startingpos, endingpos):
-        return None, '\xc3', endingpos
-    s = encode(c, u'abc\u1234def', 'foo', errorhandler)
+        return u'\xc3'.encode('utf8'), endingpos
+    s = encode(c, u'abc\u1234def'.encode('utf8'), 7, 'foo', errorhandler)
     assert '\xc3' in s
diff --git a/pypy/module/_multibytecodec/test/test_translation.py b/pypy/module/_multibytecodec/test/test_translation.py
--- a/pypy/module/_multibytecodec/test/test_translation.py
+++ b/pypy/module/_multibytecodec/test/test_translation.py
@@ -1,6 +1,7 @@
 from pypy.module._multibytecodec import c_codecs
 from rpython.translator.c.test import test_standalone
 from rpython.config.translationoption import get_combined_translation_config
+from rpython.rlib import rutf8
 class TestTranslation(test_standalone.StandaloneTests):
@@ -13,7 +14,8 @@
             codecname, string = argv[1], argv[2]
             c = c_codecs.getcodec(codecname)
             u = c_codecs.decode(c, string)
-            r = c_codecs.encode(c, u)
+            lgt, _ = rutf8.get_utf8_length_flag(u)
+            r = c_codecs.encode(c, u, lgt)
             print r
             return 0
diff --git a/pypy/module/_pypyjson/interp_decoder.py b/pypy/module/_pypyjson/interp_decoder.py
--- a/pypy/module/_pypyjson/interp_decoder.py
+++ b/pypy/module/_pypyjson/interp_decoder.py
@@ -1,7 +1,7 @@
 import sys
 from rpython.rlib.rstring import StringBuilder
 from rpython.rlib.objectmodel import specialize, always_inline, r_dict
-from rpython.rlib import rfloat, runicode
+from rpython.rlib import rfloat, runicode, rutf8
 from rpython.rtyper.lltypesystem import lltype, rffi
 from pypy.interpreter.error import oefmt
 from pypy.interpreter import unicodehelper
@@ -19,29 +19,6 @@
         return 0.0
     return x * NEG_POW_10[exp]
-def strslice2unicode_latin1(s, start, end):
-    """
-    Convert s[start:end] to unicode. s is supposed to be an RPython string
-    encoded in latin-1, which means that the numeric value of each char is the
-    same as the corresponding unicode code point.
-    Internally it's implemented at the level of low-level helpers, to avoid
-    the extra copy we would need if we take the actual slice first.
-    No bound checking is done, use carefully.
-    """
-    from rpython.rtyper.annlowlevel import llstr, hlunicode
-    from rpython.rtyper.lltypesystem.rstr import malloc, UNICODE
-    from rpython.rtyper.lltypesystem.lltype import cast_primitive, UniChar
-    length = end-start
-    ll_s = llstr(s)
-    ll_res = malloc(UNICODE, length)
-    ll_res.hash = 0
-    for i in range(length):
-        ch = ll_s.chars[start+i]
-        ll_res.chars[i] = cast_primitive(UniChar, ch)
-    return hlunicode(ll_res)
 def slice_eq(a, b):
     (ll_chars1, start1, length1, _) = a
     (ll_chars2, start2, length2, _) = b
@@ -270,10 +247,11 @@
             self.pos = i+1
             return self.space.newdict()
-        d = {}
+        # XXX this should be improved to use an unwrapped dict
+        w_dict = self.space.newdict()
         while True:
             # parse a key: value
-            name = self.decode_key(i)
+            w_name = self.decode_key(i)
             i = self.skip_whitespace(self.pos)
             ch = self.ll_chars[i]
             if ch != ':':
@@ -282,13 +260,13 @@
             i = self.skip_whitespace(i)
             w_value = self.decode_any(i)
-            d[name] = w_value
+            self.space.setitem(w_dict, w_name, w_value)
             i = self.skip_whitespace(self.pos)
             ch = self.ll_chars[i]
             i += 1
             if ch == '}':
                 self.pos = i
-                return self._create_dict(d)
+                return w_dict
             elif ch == ',':
             elif ch == '\0':
@@ -297,10 +275,6 @@
                 self._raise("Unexpected '%s' when decoding object (char %d)",
                             ch, i-1)
-    def _create_dict(self, d):
-        from pypy.objspace.std.dictmultiobject import from_unicode_key_dict
-        return from_unicode_key_dict(self.space, d)
     def decode_string(self, i):
         start = i
         bits = 0
@@ -312,8 +286,7 @@
             bits |= ord(ch)
             if ch == '"':
                 self.pos = i
-                return self.space.newunicode(
-                        self._create_string(start, i - 1, bits))
+                return self._create_string(start, i - 1, bits)
             elif ch == '\\' or ch < '\x20':
                 self.pos = i-1
                 return self.decode_string_escaped(start)
@@ -322,12 +295,15 @@
         if bits & 0x80:
             # the 8th bit is set, it's an utf8 string
             content_utf8 = self.getslice(start, end)
-            return unicodehelper.decode_utf8(self.space, content_utf8)
+            lgt, flag = unicodehelper.check_utf8_or_raise(self.space,
+                                                          content_utf8)
+            return self.space.newutf8(content_utf8, lgt, flag)
             # ascii only, fast path (ascii is a strict subset of
             # latin1, and we already checked that all the chars are <
             # 128)
-            return strslice2unicode_latin1(self.s, start, end)
+            return self.space.newutf8(self.getslice(start, end),
+                                      end - start, rutf8.FLAG_ASCII)
     def decode_string_escaped(self, start):
         i = self.pos
@@ -340,9 +316,10 @@
             i += 1
             if ch == '"':
                 content_utf8 = builder.build()
-                content_unicode = unicodehelper.decode_utf8(self.space, content_utf8)
+                lgt, f = unicodehelper.check_utf8_or_raise(self.space,
+                                                           content_utf8)
                 self.pos = i
-                return self.space.newunicode(content_unicode)
+                return self.space.newutf8(content_utf8, lgt, f)
             elif ch == '\\':
                 i = self.decode_escape_sequence(i, builder)
             elif ch < '\x20':
@@ -389,8 +366,7 @@
             return # help the annotator to know that we'll never go beyond
                    # this point
-        uchr = runicode.code_to_unichr(val)     # may be a surrogate pair again
-        utf8_ch = unicodehelper.encode_utf8(self.space, uchr)
+        utf8_ch = rutf8.unichr_as_utf8(val, allow_surrogates=True)
         return i
@@ -404,7 +380,7 @@
         return 0x10000 + (((highsurr - 0xd800) << 10) | (lowsurr - 0xdc00))
     def decode_key(self, i):
-        """ returns an unwrapped unicode """
+        """ returns a wrapped unicode """
         from rpython.rlib.rarithmetic import intmask
         i = self.skip_whitespace(i)
diff --git a/pypy/module/_pypyjson/interp_encoder.py b/pypy/module/_pypyjson/interp_encoder.py
--- a/pypy/module/_pypyjson/interp_encoder.py
+++ b/pypy/module/_pypyjson/interp_encoder.py
@@ -1,5 +1,5 @@
 from rpython.rlib.rstring import StringBuilder
-from rpython.rlib.runicode import str_decode_utf_8
+from rpython.rlib import rutf8
 from pypy.interpreter import unicodehelper
@@ -30,11 +30,8 @@
             # the input is a string with only non-special ascii chars
             return w_string
-        eh = unicodehelper.decode_error_handler(space)
-        u = str_decode_utf_8(
-                s, len(s), None, final=True, errorhandler=eh,
-                allow_surrogates=True)[0]
-        sb = StringBuilder(len(u))
+        unicodehelper.check_utf8_or_raise(space, s)
+        sb = StringBuilder(len(s))
         sb.append_slice(s, 0, first)
         # We used to check if 'u' contains only safe characters, and return
@@ -44,29 +41,31 @@
         # a string (with the ascii encoding).  This requires two passes
         # over the characters.  So we may as well directly turn it into a
         # string here --- only one pass.
-        u = space.unicode_w(w_string)
-        sb = StringBuilder(len(u))
+        s = space.utf8_w(w_string)
+        sb = StringBuilder(len(s))
         first = 0
-    for i in range(first, len(u)):
-        c = u[i]
-        if c <= u'~':
-            if c == u'"' or c == u'\\':
+    it = rutf8.Utf8StringIterator(s)
+    for i in range(first):
+        it.next()
+    for c in it:
+        if c <= ord('~'):
+            if c == ord('"') or c == ord('\\'):
-            elif c < u' ':
-                sb.append(ESCAPE_BEFORE_SPACE[ord(c)])
+            elif c < ord(' '):
+                sb.append(ESCAPE_BEFORE_SPACE[c])
-            sb.append(chr(ord(c)))
+            sb.append(chr(c))
-            if c <= u'\uffff':
+            if c <= ord(u'\uffff'):
-                sb.append(HEX[ord(c) >> 12])
-                sb.append(HEX[(ord(c) >> 8) & 0x0f])
-                sb.append(HEX[(ord(c) >> 4) & 0x0f])
-                sb.append(HEX[ord(c) & 0x0f])
+                sb.append(HEX[c >> 12])
+                sb.append(HEX[(c >> 8) & 0x0f])
+                sb.append(HEX[(c >> 4) & 0x0f])
+                sb.append(HEX[c & 0x0f])
                 # surrogate pair
-                n = ord(c) - 0x10000
+                n = c - 0x10000
                 s1 = 0xd800 | ((n >> 10) & 0x3ff)
                 sb.append(HEX[(s1 >> 8) & 0x0f])
diff --git a/pypy/module/_pypyjson/test/test__pypyjson.py b/pypy/module/_pypyjson/test/test__pypyjson.py
--- a/pypy/module/_pypyjson/test/test__pypyjson.py
+++ b/pypy/module/_pypyjson/test/test__pypyjson.py
@@ -10,10 +10,14 @@
     assert dec.skip_whitespace(8) == len(s)
+class FakeSpace(object):
+    def newutf8(self, s, l, f):
+        return s
 def test_decode_key():
     s1 = "123" * 100
     s = ' "%s"   "%s" ' % (s1, s1)
-    dec = JSONDecoder('fake space', s)
+    dec = JSONDecoder(FakeSpace(), s)
     assert dec.pos == 0
     x = dec.decode_key(0)
     assert x == s1
diff --git a/pypy/module/_rawffi/alt/interp_funcptr.py b/pypy/module/_rawffi/alt/interp_funcptr.py
--- a/pypy/module/_rawffi/alt/interp_funcptr.py
+++ b/pypy/module/_rawffi/alt/interp_funcptr.py
@@ -167,8 +167,8 @@
         addr = rffi.cast(rffi.ULONG, buf)
-    def handle_unichar_p(self, w_ffitype, w_obj, unicodeval):
-        buf = rffi.unicode2wcharp(unicodeval)
+    def handle_unichar_p(self, w_ffitype, w_obj, utf8val, utf8len):
+        buf = rffi.utf82wcharp(utf8val, utf8len)
         self.w_func.to_free.append(rffi.cast(rffi.VOIDP, buf))
         addr = rffi.cast(rffi.ULONG, buf)
diff --git a/pypy/module/_rawffi/alt/test/test_type_converter.py b/pypy/module/_rawffi/alt/test/test_type_converter.py
--- a/pypy/module/_rawffi/alt/test/test_type_converter.py
+++ b/pypy/module/_rawffi/alt/test/test_type_converter.py
@@ -6,7 +6,7 @@
 class DummyFromAppLevelConverter(FromAppLevelConverter):
-    def handle_all(self, w_ffitype, w_obj, val):
+    def handle_all(self, w_ffitype, w_obj, val, lgt=None):
         self.lastval = val
     handle_signed = handle_all
@@ -120,8 +120,8 @@
     def test_strings(self):
         # first, try automatic conversion from applevel
         self.check(app_types.char_p, self.space.newbytes('foo'), 'foo')
-        self.check(app_types.unichar_p, self.space.wrap(u'foo\u1234'), u'foo\u1234')
-        self.check(app_types.unichar_p, self.space.wrap('foo'), u'foo')
+        self.check(app_types.unichar_p, self.space.wrap(u'foo\u1234'), u'foo\u1234'.encode('utf8'))
+        self.check(app_types.unichar_p, self.space.wrap('foo'), 'foo')
         # then, try to pass explicit pointers
         self.check(app_types.char_p, self.space.wrap(42), 42)
         self.check(app_types.unichar_p, self.space.wrap(42), 42)
diff --git a/pypy/module/_rawffi/alt/type_converter.py b/pypy/module/_rawffi/alt/type_converter.py
--- a/pypy/module/_rawffi/alt/type_converter.py
+++ b/pypy/module/_rawffi/alt/type_converter.py
@@ -1,6 +1,6 @@
 from rpython.rlib import libffi
-from rpython.rlib import jit
-from rpython.rlib.rarithmetic import r_uint
+from rpython.rlib import jit, rutf8
+from rpython.rlib.rarithmetic import r_uint, intmask
 from pypy.interpreter.error import oefmt
 from pypy.module._rawffi.structure import W_StructureInstance, W_Structure
 from pypy.module._rawffi.alt.interp_ffitype import app_types
@@ -85,8 +85,8 @@
             return True
         elif w_ffitype.is_unichar_p() and (w_type is self.space.w_bytes or
                                            w_type is self.space.w_unicode):
-            unicodeval = self.space.unicode_w(w_obj)
-            self.handle_unichar_p(w_ffitype, w_obj, unicodeval)
+            utf8, lgt = self.space.utf8_len_w(w_obj)
+            self.handle_unichar_p(w_ffitype, w_obj, utf8, lgt)
             return True
         return False
@@ -147,7 +147,7 @@
         self.error(w_ffitype, w_obj)
-    def handle_unichar_p(self, w_ffitype, w_obj, unicodeval):
+    def handle_unichar_p(self, w_ffitype, w_obj, utf8val, utf8len):
         unicodeval: interp-level unicode
@@ -228,7 +228,8 @@
             return space.newbytes(chr(ucharval))
         elif w_ffitype.is_unichar():
             wcharval = self.get_unichar(w_ffitype)
-            return space.newunicode(unichr(wcharval))
+            return space.newutf8(rutf8.unichr_as_utf8(wcharval), 1,
+                                 rutf8.get_flag_from_code(intmask(wcharval)))
         elif w_ffitype.is_double():
             return self._float(w_ffitype)
         elif w_ffitype.is_singlefloat():
diff --git a/pypy/module/_rawffi/interp_rawffi.py b/pypy/module/_rawffi/interp_rawffi.py
--- a/pypy/module/_rawffi/interp_rawffi.py
+++ b/pypy/module/_rawffi/interp_rawffi.py
@@ -10,6 +10,7 @@
 from rpython.rtyper.lltypesystem import lltype, rffi
 from rpython.rtyper.tool import rffi_platform
 from rpython.rlib.unroll import unrolling_iterable
+from rpython.rlib import rutf8
 from rpython.rlib.objectmodel import specialize
 import rpython.rlib.rposix as rposix
@@ -416,13 +417,13 @@
         val = s[0]
         push_func(add_arg, argdesc, val)
     elif letter == 'u':
-        s = space.unicode_w(w_arg)
-        if len(s) != 1:
+        s, lgt = space.utf8_len_w(w_arg)
+        if lgt != 1:
             raise oefmt(space.w_TypeError,
                         "Expected unicode string of length one as wide "
-        val = s[0]
-        push_func(add_arg, argdesc, val)
+        val = rutf8.codepoint_at_pos(s, 0)
+        push_func(add_arg, argdesc, rffi.cast(rffi.WCHAR_T, val))
         for c in unroll_letters_for_numbers:
             if letter == c:
diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -7,7 +7,8 @@
 from pypy.interpreter.error import OperationError, oefmt
 from rpython.rlib.rarithmetic import intmask
 from rpython.rlib import jit
-from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
+from rpython.rlib.rstring import StringBuilder
+from rpython.rlib.rutf8 import Utf8StringBuilder
 # ____________________________________________________________
@@ -237,8 +238,8 @@
             filter_is_callable = True
             if space.isinstance_w(w_ptemplate, space.w_unicode):
-                filter_as_unicode = space.unicode_w(w_ptemplate)
-                literal = u'\\' not in filter_as_unicode
+                filter_as_unicode = space.utf8_w(w_ptemplate)
+                literal = '\\' not in filter_as_unicode
                 use_builder = (
                     space.isinstance_w(w_string, space.w_unicode) and literal)
@@ -267,7 +268,7 @@
         sublist_w = strbuilder = unicodebuilder = None
         if use_builder:
             if filter_as_unicode is not None:
-                unicodebuilder = UnicodeBuilder(ctx.end)
+                unicodebuilder = Utf8StringBuilder(ctx.end)
                 assert filter_as_string is not None
                 strbuilder = StringBuilder(ctx.end)
@@ -335,7 +336,9 @@
                 return space.newbytes(strbuilder.build()), n
                 assert unicodebuilder is not None
-                return space.newunicode(unicodebuilder.build()), n
+                return space.newutf8(unicodebuilder.build(),
+                                     unicodebuilder.get_length(),
+                                     unicodebuilder.get_flag()), n
             if space.isinstance_w(w_string, space.w_unicode):
                 w_emptystr = space.newunicode(u'')
diff --git a/pypy/module/_ssl/interp_ssl.py b/pypy/module/_ssl/interp_ssl.py
--- a/pypy/module/_ssl/interp_ssl.py
+++ b/pypy/module/_ssl/interp_ssl.py
@@ -1566,12 +1566,13 @@
                 cadata = space.bufferstr_w(w_cadata)
                 ca_file_type = SSL_FILETYPE_PEM
-                try:
-                    cadata = space.unicode_w(w_cadata).encode('ascii')
-                except UnicodeEncodeError:
+                w_uni = space.convert_arg_to_w_unicode(w_cadata)
+                if not w_uni.is_ascii():
                     raise oefmt(space.w_TypeError,
                                 "cadata should be a ASCII string or a "
                                 "bytes-like object")
+                cadata = space.utf8_w(w_uni)
         if cafile is None and capath is None and cadata is None:
             raise oefmt(space.w_TypeError,
                         "cafile and capath cannot be both omitted")
diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py
--- a/pypy/objspace/std/dictmultiobject.py
+++ b/pypy/objspace/std/dictmultiobject.py
@@ -1257,12 +1257,6 @@
-def from_unicode_key_dict(space, d):
-    strategy = space.fromcache(UnicodeDictStrategy)
-    storage = strategy.erase(d)
-    return W_DictObject(space, strategy, storage)
 class IntDictStrategy(AbstractTypedStrategy, DictStrategy):
     erase, unerase = rerased.new_erasing_pair("int")
     erase = staticmethod(erase)
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -367,23 +367,10 @@
         assert isinstance(utf8s, str)
         return W_UnicodeObject(utf8s, length, flag)
-    def new_from_utf8(self, utf8s):
-        # XXX: kill me!
-        assert isinstance(utf8s, str)
-        length, flag = rutf8.check_utf8(utf8s, True)
-        return W_UnicodeObject(utf8s, length, flag)
     def newfilename(self, s):
         assert isinstance(s, str) # on pypy3, this decodes the byte string
         return W_BytesObject(s)   # with the filesystem encoding
-    def newunicode(self, unistr):
-        # XXX: kill me!
-        assert isinstance(unistr, unicode)
-        utf8s = unistr.encode("utf-8")
-        length, flag = rutf8.check_utf8(utf8s, True)
-        return self.newutf8(utf8s, length, flag)
     def type(self, w_obj):
         return w_obj.getclass(self)
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -64,6 +64,11 @@
         # - malloced object, which means it has index, then
         #   _index_storage.flags determines the kind
+    @staticmethod
+    def from_utf8builder(builder):
+        return W_UnicodeObject(
+            builder.build(), builder.get_length(), builder.get_flag())
     def __repr__(self):
         """representation for debugging purposes"""
         return "%s(%r)" % (self.__class__.__name__, self._utf8)
@@ -344,57 +349,38 @@
         return mod_format(space, w_values, self, do_unicode=True)
     def descr_swapcase(self, space):
-        selfvalue = self._utf8
-        builder = StringBuilder(len(selfvalue))
-        flag = self._get_flag()
-        i = 0
-        while i < len(selfvalue):
-            ch = rutf8.codepoint_at_pos(selfvalue, i)
-            i = rutf8.next_codepoint_pos(selfvalue, i)
+        input = self._utf8
+        builder = rutf8.Utf8StringBuilder(len(input))
+        for ch in rutf8.Utf8StringIterator(input):
             if unicodedb.isupper(ch):
                 ch = unicodedb.tolower(ch)
             elif unicodedb.islower(ch):
                 ch = unicodedb.toupper(ch)
-            if ch >= 0x80:
-                flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
-            rutf8.unichr_as_utf8_append(builder, ch, allow_surrogates=True)
-        return W_UnicodeObject(builder.build(), self._length, flag)
+            builder.append_code(ch)
+        return self.from_utf8builder(builder)
     def descr_title(self, space):
         if len(self._utf8) == 0:
             return self
-        utf8, flag = self.title_unicode(self._utf8)
-        return W_UnicodeObject(utf8, self._len(), flag)
+        return self.title_unicode(self._utf8)
     def title_unicode(self, value):
         input = self._utf8
-        builder = StringBuilder(len(input))
-        i = 0
+        builder = rutf8.Utf8StringBuilder(len(input))
         previous_is_cased = False
-        flag = self._get_flag()
-        while i < len(input):
-            ch = rutf8.codepoint_at_pos(input, i)
-            i = rutf8.next_codepoint_pos(input, i)
+        for ch in rutf8.Utf8StringIterator(input):
             if not previous_is_cased:
                 ch = unicodedb.totitle(ch)
                 ch = unicodedb.tolower(ch)
-            if ch >= 0x80:
-                flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
-            rutf8.unichr_as_utf8_append(builder, ch, allow_surrogates=True)
+            builder.append_code(ch)
             previous_is_cased = unicodedb.iscased(ch)
-        return builder.build(), flag
+        return self.from_utf8builder(builder)
     def descr_translate(self, space, w_table):
-        input = self._utf8
-        result = StringBuilder(len(input))
-        result_length = 0
-        flag = self._get_flag()
-        i = 0
-        while i < len(input):
-            codepoint = rutf8.codepoint_at_pos(input, i)
-            i = rutf8.next_codepoint_pos(input, i)
+        builder = rutf8.Utf8StringBuilder(len(self._utf8))
+        for codepoint in rutf8.Utf8StringIterator(self._utf8):
                 w_newval = space.getitem(w_table, space.newint(codepoint))
             except OperationError as e:
@@ -406,24 +392,19 @@
                 elif space.isinstance_w(w_newval, space.w_int):
                     codepoint = space.int_w(w_newval)
                 elif isinstance(w_newval, W_UnicodeObject):
-                    result.append(w_newval._utf8)
-                    flag = rutf8.combine_flags(flag, w_newval._get_flag())
-                    result_length += w_newval._length
+                    builder.append_utf8(
+                        w_newval._utf8, w_newval._length, w_newval._get_flag())
                     raise oefmt(space.w_TypeError,
                                 "character mapping must return integer, None "
                                 "or unicode")
-                if codepoint >= 0x80:
-                    flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
-                rutf8.unichr_as_utf8_append(result, codepoint,
-                                            allow_surrogates=True)
-                result_length += 1
+                builder.append_code(codepoint)
             except ValueError:
                 raise oefmt(space.w_TypeError,
                             "character mapping must be in range(0x110000)")
-        return W_UnicodeObject(result.build(), result_length, flag)
+        return self.from_utf8builder(builder)
     def descr_find(self, space, w_sub, w_start=None, w_end=None):
         w_result = self._unwrap_and_search(space, w_sub, w_start, w_end)
@@ -517,12 +498,6 @@
     def _join_return_one(self, space, w_obj):
         return space.is_w(space.type(w_obj), space.w_unicode)
-    def _join_check_item(self, space, w_obj):
-        if (space.isinstance_w(w_obj, space.w_bytes) or
-            space.isinstance_w(w_obj, space.w_unicode)):
-            return 0
-        return 1
     def descr_formatter_parser(self, space):
         from pypy.objspace.std.newformat import unicode_template_formatter
         tformat = unicode_template_formatter(space, space.utf8_w(self))
@@ -534,16 +509,11 @@
         return tformat.formatter_field_name_split()
     def descr_lower(self, space):
-        builder = StringBuilder(len(self._utf8))
-        pos = 0
-        flag = self._get_flag()
-        while pos < len(self._utf8):
-            lower = unicodedb.tolower(rutf8.codepoint_at_pos(self._utf8, pos))
-            if lower >= 0x80:
-                flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
-            rutf8.unichr_as_utf8_append(builder, lower, allow_surrogates=True)
-            pos = rutf8.next_codepoint_pos(self._utf8, pos)
-        return W_UnicodeObject(builder.build(), self._len(), flag)
+        builder = rutf8.Utf8StringBuilder(len(self._utf8))
+        for ch in rutf8.Utf8StringIterator(self._utf8):
+            lower = unicodedb.tolower(ch)
+            builder.append_code(lower)
+        return self.from_utf8builder(builder)
     def descr_isdecimal(self, space):
         return self._is_generic(space, '_isdecimal')
@@ -657,13 +627,11 @@
         flag = self._get_flag()
         for i in range(size):
             w_s = list_w[i]
-            check_item = self._join_check_item(space, w_s)
-            if check_item == 1:
+            if not (space.isinstance_w(w_s, space.w_bytes) or
+                    space.isinstance_w(w_s, space.w_unicode)):
                 raise oefmt(space.w_TypeError,
-                            "sequence item %d: expected string, %T found",
+                            "sequence item %d: expected string or unicode, %T found",
                             i, w_s)
-            elif check_item == 2:
-                return self._join_autoconvert(space, list_w)
             # XXX Maybe the extra copy here is okay? It was basically going to
             #     happen anyway, what with being placed into the builder
             w_u = self.convert_arg_to_w_unicode(space, w_s)
@@ -711,18 +679,11 @@
         return space.newlist(strs_w)
     def descr_upper(self, space):
-        value = self._utf8
-        builder = StringBuilder(len(value))
-        flag = self._get_flag()
-        i = 0
-        while i < len(value):
-            uchar = rutf8.codepoint_at_pos(value, i)
-            uchar = unicodedb.toupper(uchar)
-            if uchar >= 0x80:
-                flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
-            i = rutf8.next_codepoint_pos(value, i)
-            rutf8.unichr_as_utf8_append(builder, uchar, allow_surrogates=True)
-        return W_UnicodeObject(builder.build(), self._length, flag)
+        builder = rutf8.Utf8StringBuilder(len(self._utf8))
+        for ch in rutf8.Utf8StringIterator(self._utf8):
+            ch = unicodedb.toupper(ch)
+            builder.append_code(ch)
+        return self.from_utf8builder(builder)
     def descr_zfill(self, space, width):
@@ -826,22 +787,15 @@
         if len(value) == 0:
             return self._empty()
-        flag = self._get_flag()
-        builder = StringBuilder(len(value))
-        uchar = rutf8.codepoint_at_pos(value, 0)
-        i = rutf8.next_codepoint_pos(value, 0)
+        builder = rutf8.Utf8StringBuilder(len(self._utf8))
+        it = rutf8.Utf8StringIterator(self._utf8)
+        uchar = it.next()
         ch = unicodedb.toupper(uchar)
-        rutf8.unichr_as_utf8_append(builder, ch, allow_surrogates=True)
-        if ch >= 0x80:
-            flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
-        while i < len(value):
-            uchar = rutf8.codepoint_at_pos(value, i)
-            i = rutf8.next_codepoint_pos(value, i)
-            ch = unicodedb.tolower(uchar)
-            rutf8.unichr_as_utf8_append(builder, ch, allow_surrogates=True)
-            if ch >= 0x80:
-                flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
-        return W_UnicodeObject(builder.build(), self._len(), flag)
+        builder.append_code(ch)
+        for ch in it:
+            ch = unicodedb.tolower(ch)
+            builder.append_code(ch)
+        return self.from_utf8builder(builder)
     @unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
     def descr_center(self, space, width, w_fillchar):
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,7 @@
-vmprof>=0.4.10  # required to parse log files in rvmprof tests
+# parse log files in rvmprof tests
+vmprof>=0.4.10; 'x86' in platform.machine #skip arm, s390x
 # hypothesis is used for test generation on untranslated tests
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -687,6 +687,11 @@
         self._lgt += 1
         unichr_as_utf8_append(self._s, code, True)
+    def append_utf8(self, utf8, length, flag):
+        self._flag = combine_flags(self._flag, flag)
+        self._lgt += length
+        self._s.append(utf8)
     def build(self):
         return self._s.build()
@@ -702,10 +707,12 @@
         self._end = len(utf8s)
         self._pos = 0
-    def done(self):
-        return self._pos == self._end
+    def __iter__(self):
+        return self
     def next(self):

More information about the pypy-commit mailing list