[pypy-commit] pypy unicode-utf8-py3: merge unicode-utf8 into branch (way too painful)

mattip pypy.commits at gmail.com
Wed Jan 2 09:09:48 EST 2019


Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r95568:43c80d4b68c9
Date: 2019-01-02 13:49 +0200
http://bitbucket.org/pypy/pypy/changeset/43c80d4b68c9/

Log:	merge unicode-utf8 into branch (way too painful)

diff --git a/LICENSE b/LICENSE
--- a/LICENSE
+++ b/LICENSE
@@ -30,7 +30,7 @@
     DEALINGS IN THE SOFTWARE.
 
 
-PyPy Copyright holders 2003-2018
+PyPy Copyright holders 2003-2019
 --------------------------------
 
 Except when otherwise stated (look for LICENSE files or information at
diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -4,6 +4,7 @@
 * improve performance of splitlines
 * fix _pypyjson to not use a wrapped dict when decoding an object
 * make sure we review all the places that call ord(unichr) to check for ValueErrors
+* Find a more elegant way to define MAXUNICODE in rpython/rlib/runicode.py
 * rewrite unicodeobject.unicode_to_decimal_w to only use utf8 encoded bytes
 * revisit why runicode import str_decode_utf_8_impl needed instead of runicode import str_decode_utf_8
 * revisit all places where we do utf8.decode('utf-8'), they should work directly with utf8
diff --git a/extra_tests/test_cPickle.py b/extra_tests/test_cPickle.py
new file mode 100644
--- /dev/null
+++ b/extra_tests/test_cPickle.py
@@ -0,0 +1,34 @@
+import pytest
+import cPickle
+
+def test_stack_underflow():
+    with pytest.raises(cPickle.UnpicklingError):
+        cPickle.loads("a string")
+
+def test_bad_key():
+    with pytest.raises(cPickle.UnpicklingError) as excinfo:
+        cPickle.loads("v")
+    assert str(excinfo.value) == "invalid load key, 'v'."
+
+def test_find_global():
+    import time, cStringIO
+    entry = time.strptime('Fri Mar 27 22:20:42 2017')
+    f = cStringIO.StringIO()
+    cPickle.Pickler(f).dump(entry)
+
+    f = cStringIO.StringIO(f.getvalue())
+    e = cPickle.Unpickler(f).load()
+    assert e == entry
+
+    f = cStringIO.StringIO(f.getvalue())
+    up = cPickle.Unpickler(f)
+    up.find_global = None
+    with pytest.raises(cPickle.UnpicklingError) as e:
+        up.load()
+    assert str(e.value) == "Global and instance pickles are not supported."
+
+    f = cStringIO.StringIO(f.getvalue())
+    up = cPickle.Unpickler(f)
+    up.find_global = lambda module, name: lambda a, b: (name, a, b)
+    e = up.load()
+    assert e == ('struct_time', (2017, 3, 27, 22, 20, 42, 4, 86, -1), {})
diff --git a/extra_tests/test_cStringIO.py b/extra_tests/test_cStringIO.py
new file mode 100644
--- /dev/null
+++ b/extra_tests/test_cStringIO.py
@@ -0,0 +1,23 @@
+"""
+Tests for the PyPy cStringIO implementation.
+"""
+from cStringIO import StringIO
+
+data = b"some bytes"
+
+def test_reset():
+    """
+    Test that the reset method of cStringIO objects sets the position
+    marker to the beginning of the stream.
+    """
+    stream = StringIO()
+    stream.write(data)
+    assert stream.read() == ''
+    stream.reset()
+    assert stream.read() == data
+
+    stream = StringIO(data)
+    assert stream.read() == data
+    assert stream.read() == ''
+    stream.reset()
+    assert stream.read() == data
diff --git a/extra_tests/test_string.py b/extra_tests/test_string.py
new file mode 100644
--- /dev/null
+++ b/extra_tests/test_string.py
@@ -0,0 +1,46 @@
+
+"""
+Test module for functions in string.py
+"""
+import pytest
+
+def test_maketrans():
+    import string
+    assert string.maketrans('', '') == (
+        '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12'
+        '\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0'
+        '123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstu'
+        'vwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d'
+        '\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e'
+        '\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf'
+        '\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0'
+        '\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1'
+        '\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2'
+        '\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3'
+        '\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff')
+    assert string.maketrans('a', 'b') == (
+        '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12'
+        '\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0'
+        '123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`bbcdefghijklmnopqrstu'
+        'vwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d'
+        '\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e'
+        '\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf'
+        '\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0'
+        '\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1'
+        '\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2'
+        '\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3'
+        '\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff')
+    assert string.maketrans('ab', 'cd') == (
+        '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12'
+        '\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0'
+        '123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`cdcdefghijklmnopqrstu'
+        'vwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d'
+        '\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e'
+        '\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf'
+        '\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0'
+        '\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1'
+        '\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2'
+        '\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3'
+        '\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff')
+    with pytest.raises(ValueError):
+        string.maketrans('aa', '')
diff --git a/pypy/doc/conf.py b/pypy/doc/conf.py
--- a/pypy/doc/conf.py
+++ b/pypy/doc/conf.py
@@ -59,7 +59,7 @@
 
 # General information about the project.
 project = u'PyPy'
-copyright = u'2018, The PyPy Project'
+copyright = u'2019, The PyPy Project'
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst
--- a/pypy/doc/whatsnew-head.rst
+++ b/pypy/doc/whatsnew-head.rst
@@ -74,4 +74,8 @@
 Make it possible to manually manage the GC by using a combination of
 gc.disable() and gc.collect_step(). Make sure to write a proper release
 announcement in which we explain that existing programs could leak memory if
-they run for too much time between a gc.disable()/gc.enable()
\ No newline at end of file
+they run for too much time between a gc.disable()/gc.enable()
+
+.. branch: unicode-utf8
+
+Use utf8 internally to represent unicode
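[editor's note] The branch note above is the heart of this merge: unicode objects are kept as UTF-8 bytes plus a cached codepoint count. A minimal illustrative sketch of that representation (a hypothetical Utf8Str class, not PyPy's actual W_UnicodeObject):

    # Illustrative only: a string kept as UTF-8 bytes plus a cached
    # codepoint count, so len() stays O(1) while storage is byte-oriented.
    class Utf8Str(object):
        def __init__(self, utf8_bytes, length):
            self._utf8 = utf8_bytes     # e.g. b'caf\xc3\xa9' for u'caf\xe9'
            self._length = length       # number of codepoints, not bytes

        def __len__(self):
            return self._length

        def as_unicode(self):
            # decoding happens only at the boundary, not per operation
            return self._utf8.decode('utf-8')

    s = Utf8Str(u'caf\xe9'.encode('utf-8'), 4)
    assert len(s) == 4 and s.as_unicode() == u'caf\xe9'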
diff --git a/pypy/interpreter/argument.py b/pypy/interpreter/argument.py
--- a/pypy/interpreter/argument.py
+++ b/pypy/interpreter/argument.py
@@ -583,31 +583,33 @@
         if num_remainingkwds == 1:
             for i in range(len(keywords)):
                 if i not in kwds_mapping:
-                    name = keywords[i]
-                    if name is None:
-                        # We'll assume it's unicode. Encode it.
-                        # Careful, I *think* it should not be possible to
-                        # get an IndexError here but you never know.
-                        try:
-                            if keyword_names_w is None:
-                                raise IndexError
-                            # note: negative-based indexing from the end
-                            w_name = keyword_names_w[i - len(keywords)]
-                        except IndexError:
+                    name = '?'
+                    # We'll assume it's unicode. Encode it.
+                    # Careful, I *think* it should not be possible to
+                    # get an IndexError here but you never know.
+                    try:
+                        if keyword_names_w is None:
+                            raise IndexError
+                        # note: negative-based indexing from the end
+                        w_name = keyword_names_w[i - len(keywords)]
+                    except IndexError:
+                        if keywords is None:
                             name = '?'
                         else:
-                            name = space.text_w(w_name)
+                            name = keywords[i]
+                    else:
+                        w_enc = space.newtext(space.sys.defaultencoding)
+                        w_err = space.newtext("replace")
+                        w_name = space.call_method(w_name, "encode", w_enc,
+                                                   w_err)
+                        name = space.text_w(w_name)
                     break
         self.kwd_name = name
 
     def getmsg(self):
         if self.num_kwds == 1:
-            if isinstance(self.kwd_name, unicode):
-                uname = unicode_encode_utf_8(self.kwd_name, len(self.kwd_name),
-                        'strict', allow_surrogates=False)
-            else:
-                uname = self.kwd_name
-            msg = "got an unexpected keyword argument '%s'" % uname
+            msg = "got an unexpected keyword argument '%s'" % (
+                self.kwd_name)
         else:
             msg = "got %d unexpected keyword arguments" % (
                 self.num_kwds)
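[editor's note] The hunk above drops the old unicode_encode_utf_8 path: the wrapped keyword name is re-encoded with the default encoding and errors='replace' before being interpolated into the error message, falling back to the raw byte name or '?'. A plain-Python sketch of that fallback chain (pick_kwd_name is a hypothetical helper, not interpreter code):

    # Hypothetical helper mirroring the fallback chain above: prefer the
    # wrapped unicode name re-encoded with errors='replace', then the raw
    # byte name, and finally '?'.
    def pick_kwd_name(keywords, keyword_names_w, i, defaultencoding='utf-8'):
        try:
            if keyword_names_w is None:
                raise IndexError
            # note: negative-based indexing from the end
            w_name = keyword_names_w[i - len(keywords)]
        except IndexError:
            return keywords[i] if keywords is not None else '?'
        return w_name.encode(defaultencoding, 'replace')

    assert pick_kwd_name(['a', 'b'], None, 1) == 'b'
    assert pick_kwd_name(['a'], [u'caf\xe9'], 0) == u'caf\xe9'.encode('utf-8')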
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -1605,6 +1605,8 @@
         else:
             assert False
 
+        if self.isinstance_w(w_obj, self.w_unicode):
+            return w_obj.charbuf_w(self)
     def text_or_none_w(self, w_obj):
         return None if self.is_none(w_obj) else self.text_w(w_obj)
 
diff --git a/pypy/interpreter/test/test_argument.py b/pypy/interpreter/test/test_argument.py
--- a/pypy/interpreter/test/test_argument.py
+++ b/pypy/interpreter/test/test_argument.py
@@ -55,6 +55,9 @@
     pass
 
 class DummySpace(object):
+    class sys:
+        defaultencoding = 'utf-8'
+
     def newtuple(self, items):
         return tuple(items)
 
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -1,6 +1,10 @@
 import py
 import pytest
-from hypothesis import given, strategies
+try:
+    from hypothesis import given, strategies
+    HAS_HYPOTHESIS = True
+except ImportError:
+    HAS_HYPOTHESIS = False
 import struct
 import sys
 from pypy.interpreter.unicodehelper import (
@@ -130,13 +134,6 @@
     with pytest.raises(UnicodeDecodeError):
         str_decode_utf_32_be(b"\x00\x00\xdc\x80", 4, None)
 
-
- at given(strategies.text())
-def test_utf8_encode_ascii_2(u):
-    def eh(errors, encoding, reason, p, start, end):
-        return "?" * (end - start), end, 'b'
-    assert utf8_encode_ascii(u.encode("utf8"), "replace", eh) == u.encode("ascii", "replace")
-
 def test_str_decode_ascii():
     assert str_decode_ascii("abc", "??", True, "??") == ("abc", 3, 3)
     def eh(errors, encoding, reason, p, start, end):
@@ -156,16 +153,6 @@
                    ("??", "ascii", input, 5, 6),
                    ("??", "ascii", input, 6, 7)]
 
- at given(strategies.text())
-def test_unicode_raw_escape(u):
-    r = uh.utf8_encode_raw_unicode_escape(u.encode("utf8"), 'strict', None)
-    assert r == u.encode("raw-unicode-escape")
-
- at given(strategies.text())
-def test_unicode_escape(u):
-    r = uh.utf8_encode_unicode_escape(u.encode("utf8"), "strict", None)
-    assert r == u.encode("unicode-escape")
-
 def test_encode_decimal(space):
     assert uh.unicode_encode_decimal(u' 12, 34 ', None) == ' 12, 34 '
     with pytest.raises(ValueError):
@@ -178,3 +165,21 @@
     result = uh.unicode_encode_decimal(
         u'12\u1234'.encode('utf8'), 'xmlcharrefreplace', handler)
     assert result == '12ሴ'
+
+if HAS_HYPOTHESIS:
+    @given(strategies.text())
+    def test_utf8_encode_ascii_2(u):
+        def eh(errors, encoding, reason, p, start, end):
+            return "?" * (end - start), end, 'b'
+        assert utf8_encode_ascii(u.encode("utf8"), "replace", eh) == u.encode("ascii", "replace")
+
+    @given(strategies.text())
+    def test_unicode_raw_escape(u):
+        r = uh.utf8_encode_raw_unicode_escape(u.encode("utf8"), 'strict', None)
+        assert r == u.encode("raw-unicode-escape")
+
+    @given(strategies.text())
+    def test_unicode_escape(u):
+        r = uh.utf8_encode_unicode_escape(u.encode("utf8"), "strict", None)
+        assert r == u.encode("unicode-escape")
+
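[editor's note] The pattern above, repeated in the other test files touched by this merge, keeps a test module importable when hypothesis is not installed: the import is guarded and the property-based tests are only defined under the flag. A minimal sketch of the same guard:

    # Sketch of the guard: the module still imports (and plain tests still
    # run) when hypothesis is missing; property tests exist only if it is.
    try:
        from hypothesis import given, strategies
        HAS_HYPOTHESIS = True
    except ImportError:
        HAS_HYPOTHESIS = False

    def test_plain():
        assert u'\xe9'.encode('utf-8') == b'\xc3\xa9'

    if HAS_HYPOTHESIS:
        @given(strategies.text())
        def test_roundtrip(u):
            assert u.encode('utf-8').decode('utf-8') == u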
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1291,7 +1291,6 @@
                                         allow_surrogates, "little",
                                         'utf-16-le')
 
-
 # ____________________________________________________________
 # utf-32
 
diff --git a/pypy/module/__builtin__/operation.py b/pypy/module/__builtin__/operation.py
--- a/pypy/module/__builtin__/operation.py
+++ b/pypy/module/__builtin__/operation.py
@@ -38,29 +38,34 @@
     return space.len(w_obj)
 
 
-def checkattrname(space, w_name):
+def checkattrname(space, w_name, msg):
     # This is a check to ensure that getattr/setattr/delattr only pass a
-    # string to the rest of the code.  XXX not entirely sure if these three
+    # ascii string to the rest of the code.  XXX not entirely sure if these
     # functions are the only way for non-string objects to reach
     # space.{get,set,del}attr()...
-    # Note that if w_name is already an exact string it must be returned
-    # unmodified (and not e.g. unwrapped-rewrapped).
-    if not space.is_w(space.type(w_name), space.w_text):
-        name = space.text_w(w_name)    # typecheck
-        w_name = space.newtext(name)     # rewrap as a real string
+    # Note that if w_name is already an exact string it must be ascii encoded
+    if not space.isinstance_w(w_name, space.w_text):
+        try:
+            name = space.text_w(w_name)    # typecheck
+        except OperationError as e:
+            if e.match(space, space.w_UnicodeError):
+                raise e
+            raise oefmt(space.w_TypeError,
+                 "%s(): attribute name must be string", msg)
+        w_name = space.newtext(name)
     return w_name
 
 def delattr(space, w_object, w_name):
     """Delete a named attribute on an object.
 delattr(x, 'y') is equivalent to ``del x.y''."""
-    w_name = checkattrname(space, w_name)
+    w_name = checkattrname(space, w_name, 'delattr')
     space.delattr(w_object, w_name)
     return space.w_None
 
 def getattr(space, w_object, w_name, w_defvalue=None):
     """Get a named attribute from an object.
 getattr(x, 'y') is equivalent to ``x.y''."""
-    w_name = checkattrname(space, w_name)
+    w_name = checkattrname(space, w_name, 'getattr')
     try:
         return space.getattr(w_object, w_name)
     except OperationError as e:
@@ -72,7 +77,7 @@
 def hasattr(space, w_object, w_name):
     """Return whether the object has an attribute with the given name.
     (This is done by calling getattr(object, name) and catching exceptions.)"""
-    w_name = checkattrname(space, w_name)
+    w_name = checkattrname(space, w_name, 'hasattr')
     try:
         space.getattr(w_object, w_name)
     except OperationError as e:
@@ -174,7 +179,7 @@
 def setattr(space, w_object, w_name, w_val):
     """Store a named attribute into an object.
 setattr(x, 'y', z) is equivalent to ``x.y = z''."""
-    w_name = checkattrname(space, w_name)
+    w_name = checkattrname(space, w_name, 'setattr')
     space.setattr(w_object, w_name, w_val)
     return space.w_None
 
diff --git a/pypy/module/_cffi_backend/misc.py b/pypy/module/_cffi_backend/misc.py
--- a/pypy/module/_cffi_backend/misc.py
+++ b/pypy/module/_cffi_backend/misc.py
@@ -408,8 +408,9 @@
 def dlopen_w(space, w_filename, flags):
     if WIN32 and space.isinstance_w(w_filename, space.w_unicode):
         fname = space.text_w(space.repr(w_filename))
-        unicode_name = space.realunicode_w(w_filename)
-        with rffi.scoped_unicode2wcharp(unicode_name) as ll_libname:
+        utf8_name = space.utf8_w(w_filename)
+        uni_len = space.len_w(w_filename)
+        with rffi.scoped_utf82wcharp(utf8_name, uni_len) as ll_libname:
             try:
                 handle = dlopenU(ll_libname, flags)
             except DLOpenError as e:
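[editor's note] The dlopen_w change above passes UTF-8 bytes plus a codepoint count instead of a unicode object when building the wide-char filename. A plain-Python sketch of what scoped_utf82wcharp conceptually provides, using ctypes rather than RPython's rffi (utf8_to_wcharp is a hypothetical stand-in):

    # ctypes-based sketch: a NUL-terminated wide-char buffer built from
    # UTF-8 bytes plus a known codepoint count.
    import ctypes

    def utf8_to_wcharp(utf8_bytes, codepoints):
        text = utf8_bytes.decode('utf-8')
        assert len(text) == codepoints              # the length is tracked, not recomputed
        return ctypes.create_unicode_buffer(text, codepoints + 1)  # +1 for the NUL

    buf = utf8_to_wcharp(u'lib\xe9.dll'.encode('utf-8'), 8)
    assert buf.value == u'lib\xe9.dll'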
diff --git a/pypy/module/_cffi_backend/test/test_wchar_helper.py b/pypy/module/_cffi_backend/test/test_wchar_helper.py
--- a/pypy/module/_cffi_backend/test/test_wchar_helper.py
+++ b/pypy/module/_cffi_backend/test/test_wchar_helper.py
@@ -1,10 +1,15 @@
-from hypothesis import given, strategies
+try:
+    from hypothesis import given, strategies
+    HAS_HYPOTHESIS = True
+except ImportError:
+    HAS_HYPOTHESIS = False
+
 from pypy.module._cffi_backend.wchar_helper import utf8_size_as_char16
 
 
-
- at given(strategies.text())
-def test_utf8_size_as_char16(u):
-    assert type(u) is unicode
-    length = utf8_size_as_char16(''.join(uc.encode('utf8') for uc in u))
-    assert length == sum((1 if uc <= u'\uFFFF' else 2) for uc in u)
+if HAS_HYPOTHESIS:
+    @given(strategies.text())
+    def test_utf8_size_as_char16(u):
+        assert type(u) is unicode
+        length = utf8_size_as_char16(''.join(uc.encode('utf8') for uc in u))
+        assert length == sum((1 if uc <= u'\uFFFF' else 2) for uc in u)
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -1,10 +1,9 @@
 import sys
-from rpython.rlib import jit
+from rpython.rlib import jit, rutf8
 from rpython.rlib.objectmodel import we_are_translated, not_rpython
 from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
 from rpython.rlib import runicode
 from rpython.rlib.runicode import raw_unicode_escape_helper
-from rpython.rlib import rutf8
 
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
@@ -248,6 +247,7 @@
 
 def xmlcharrefreplace_errors(space, w_exc):
 
+
     check_exception(space, w_exc)
     if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
         w_obj = space.getattr(w_exc, space.newtext('object'))
@@ -276,6 +276,7 @@
 
 def backslashreplace_errors(space, w_exc):
 
+
     check_exception(space, w_exc)
     if (space.isinstance_w(w_exc, space.w_UnicodeEncodeError) or
             space.isinstance_w(w_exc, space.w_UnicodeTranslateError)):
@@ -303,9 +304,9 @@
         builder = StringBuilder()
         pos = start
         while pos < end:
-            oc = ord(obj[pos])
+            oc = rutf8.codepoint_at_pos(obj, pos)
             raw_unicode_escape_helper(builder, oc)
-            pos += 1
+            pos = rutf8.next_codepoint_pos(obj, pos)
         return space.newtuple([space.newtext(builder.build()), w_end])
     else:
         raise oefmt(space.w_TypeError,
@@ -663,6 +664,7 @@
     def wrap_encoder(space, w_arg, errors="strict"):
         # w_arg is a W_Unicode or W_Bytes?
         w_arg = space.convert_arg_to_w_unicode(w_arg, errors)
+        w_arg = space.convert_arg_to_w_unicode(w_arg)
         if errors is None:
             errors = 'strict'
         allow_surrogates = False
@@ -683,6 +685,7 @@
                  w_final=WrappedDefault(False))
     def wrap_decoder(space, string, errors="strict", w_final=None):
 
+
         if errors is None:
             errors = 'strict'
         final = space.is_true(w_final)
@@ -743,6 +746,7 @@
              w_final = WrappedDefault(False))
 def utf_8_decode(space, string, errors="strict", w_final=None):
 
+
     if errors is None:
         errors = 'strict'
     final = space.is_true(w_final)
@@ -883,6 +887,7 @@
 @unwrap_spec(string='bufferstr', errors='text_or_none')
 def charmap_decode(space, string, errors="strict", w_mapping=None):
 
+
     if errors is None:
         errors = 'strict'
     if len(string) == 0:
@@ -953,6 +958,7 @@
 def unicode_escape_decode(space, w_string, errors="strict", w_final=None):
     string = space.getarg_w('s*', w_string).as_str()
 
+
     if errors is None:
         errors = 'strict'
     final = space.is_true(w_final)
@@ -987,6 +993,7 @@
 @unwrap_spec(errors='text_or_none')
 def unicode_internal_decode(space, w_string, errors="strict"):
 
+
     if errors is None:
         errors = 'strict'
     # special case for this codec: unicodes are returned as is
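[editor's note] Several handlers above now walk the UTF-8 buffer by byte position with rutf8.codepoint_at_pos and rutf8.next_codepoint_pos instead of indexing a unicode string. A pure-Python sketch of that style of iteration (the helpers below are illustrative re-implementations, not the rutf8 API, and assume valid UTF-8 input):

    # Walk a UTF-8 byte string one codepoint at a time by byte position.
    def next_codepoint_pos(s, pos):
        c = ord(s[pos:pos + 1])
        if c < 0x80:
            return pos + 1
        elif c < 0xe0:
            return pos + 2
        elif c < 0xf0:
            return pos + 3
        return pos + 4

    def codepoint_at_pos(s, pos):
        end = next_codepoint_pos(s, pos)
        return ord(s[pos:end].decode('utf-8'))

    utf8 = u'a\u20ac'.encode('utf-8')     # 'a' + EURO SIGN
    pos, codepoints = 0, []
    while pos < len(utf8):
        codepoints.append(codepoint_at_pos(utf8, pos))
        pos = next_codepoint_pos(utf8, pos)
    assert codepoints == [0x61, 0x20ac]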
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -298,8 +298,8 @@
             ]
 
         buffer = b''
-        result = ""
-        for (c, partialresult) in zip("\x00\xff\u07ff\u0800\uffff\U00010000".encode(encoding), check_partial):
+        result = u""
+        for (c, partialresult) in zip(u"\x00\xff\u07ff\u0800\uffff\U00010000".encode(encoding), check_partial):
             buffer += bytes([c])
             res = _codecs.utf_8_decode(buffer,'strict',False)
             if res[1] >0 :
@@ -327,8 +327,8 @@
                     u"\x00\xff\u0100\uffff\U00010000",
                 ]
         buffer = b''
-        result = ""
-        for (c, partialresult) in zip("\x00\xff\u0100\uffff\U00010000".encode(encoding), check_partial):
+        result = u""
+        for (c, partialresult) in zip(u"\x00\xff\u0100\uffff\U00010000".encode(encoding), check_partial):
             buffer += bytes([c])
             res = _codecs.utf_16_decode(buffer,'strict',False)
             if res[1] >0 :
@@ -630,12 +630,12 @@
 
     def test_charmap_decode_1(self):
         import codecs
-        assert codecs.charmap_encode('xxx') == (b'xxx', 3)
-        assert codecs.charmap_encode('xxx', 'strict', {ord('x'): b'XX'}) == (b'XXXXXX', 3)
+        assert codecs.charmap_encode(u'xxx') == (b'xxx', 3)
+        assert codecs.charmap_encode(u'xxx', 'strict', {ord('x'): b'XX'}) == (b'XXXXXX', 3)
 
-        res = codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab")
+        res = codecs.charmap_decode(b"\x00\x01\x02", "replace", u"ab")
         assert res == ("ab\ufffd", 3)
-        res = codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe")
+        res = codecs.charmap_decode(b"\x00\x01\x02", "replace", u"ab\ufffe")
         assert res == ('ab\ufffd', 3)
 
     def test_decode_errors(self):
@@ -654,28 +654,28 @@
     def test_errors(self):
         import codecs
         assert codecs.replace_errors(UnicodeEncodeError(
-            "ascii", "\u3042", 0, 1, "ouch")) == ("?", 1)
+            "ascii", u"\u3042", 0, 1, "ouch")) == (u"?", 1)
         assert codecs.replace_errors(UnicodeDecodeError(
-            "ascii", b"\xff", 0, 1, "ouch")) == ("\ufffd", 1)
+            "ascii", b"\xff", 0, 1, "ouch")) == (u"\ufffd", 1)
         assert codecs.replace_errors(UnicodeTranslateError(
             "\u3042", 0, 1, "ouch")) == ("\ufffd", 1)
 
         assert codecs.replace_errors(UnicodeEncodeError(
-            "ascii", "\u3042\u3042", 0, 2, "ouch")) == ("??", 2)
+            "ascii", "\u3042\u3042", 0, 2, "ouch")) == (u"??", 2)
         assert codecs.replace_errors(UnicodeDecodeError(
-            "ascii", b"\xff\xff", 0, 2, "ouch")) == ("\ufffd", 2)
+            "ascii", b"\xff\xff", 0, 2, "ouch")) == (u"\ufffd", 2)
         assert codecs.replace_errors(UnicodeTranslateError(
             "\u3042\u3042", 0, 2, "ouch")) == ("\ufffd\ufffd", 2)
 
         class BadStartUnicodeEncodeError(UnicodeEncodeError):
             def __init__(self):
-                UnicodeEncodeError.__init__(self, "ascii", "", 0, 1, "bad")
+                UnicodeEncodeError.__init__(self, "ascii", u"", 0, 1, "bad")
                 self.start = []
 
         # A UnicodeEncodeError object with a bad object attribute
         class BadObjectUnicodeEncodeError(UnicodeEncodeError):
             def __init__(self):
-                UnicodeEncodeError.__init__(self, "ascii", "", 0, 1, "bad")
+                UnicodeEncodeError.__init__(self, "ascii", u"", 0, 1, "bad")
                 self.object = []
 
         # A UnicodeDecodeError object without an end attribute
@@ -693,19 +693,19 @@
         # A UnicodeTranslateError object without a start attribute
         class NoStartUnicodeTranslateError(UnicodeTranslateError):
             def __init__(self):
-                UnicodeTranslateError.__init__(self, "", 0, 1, "bad")
+                UnicodeTranslateError.__init__(self, u"", 0, 1, "bad")
                 del self.start
 
         # A UnicodeTranslateError object without an end attribute
         class NoEndUnicodeTranslateError(UnicodeTranslateError):
             def __init__(self):
-                UnicodeTranslateError.__init__(self,  "", 0, 1, "bad")
+                UnicodeTranslateError.__init__(self,  u"", 0, 1, "bad")
                 del self.end
 
         # A UnicodeTranslateError object without an object attribute
         class NoObjectUnicodeTranslateError(UnicodeTranslateError):
             def __init__(self):
-                UnicodeTranslateError.__init__(self, "", 0, 1, "bad")
+                UnicodeTranslateError.__init__(self, u"", 0, 1, "bad")
                 del self.object
 
         import codecs
@@ -716,7 +716,7 @@
         raises(TypeError, codecs.replace_errors, BadObjectUnicodeEncodeError())
         raises(TypeError, codecs.replace_errors, BadObjectUnicodeDecodeError()
         )
-        # With the correct exception, "replace" returns an "?" or "\ufffd" replacement
+        # With the correct exception, "replace" returns an "?" or u"\ufffd" replacement
 
     def test_decode_ignore(self):
         assert b'\xff'.decode('utf-7', 'ignore') == ''
@@ -724,7 +724,6 @@
 
     def test_backslashreplace(self):
         import sys
-        import codecs
         sin = u"a\xac\u1234\u20ac\u8000\U0010ffff"
         if sys.maxunicode > 65535:
             expected_ascii = b"a\\xac\\u1234\\u20ac\\u8000\\U0010ffff"
@@ -827,7 +826,7 @@
 
     def test_badhandler(self):
         import codecs
-        results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
+        results = ( 42, u"foo", (1,2,3), (u"foo", 1, 3), (u"foo", None), (u"foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
         encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
 
         for res in results:
@@ -856,7 +855,7 @@
         import codecs
         import sys
         errors = 'test.badhandler_longindex'
-        codecs.register_error(errors, lambda x: ('', sys.maxsize + 1))
+        codecs.register_error(errors, lambda x: (u'', sys.maxsize + 1))
         # CPython raises OverflowError here
         raises((IndexError, OverflowError), b'apple\x92ham\x93spam'.decode, 'utf-8', errors)
 
@@ -872,15 +871,15 @@
 
         res = b"\x00\x00\x00\x00\x00".decode("unicode-internal", "replace")
         if sys.maxunicode > 65535:
-            assert res == "\u0000\ufffd"    # UCS4 build
+            assert res == u"\u0000\ufffd"    # UCS4 build
         else:
-            assert res == "\x00\x00\ufffd"  # UCS2 build
+            assert res == u"\x00\x00\ufffd"  # UCS2 build
 
         res = b"\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore")
         if sys.maxunicode > 65535:
-            assert res == "\u0000"   # UCS4 build
+            assert res == u"\u0000"   # UCS4 build
         else:
-            assert res == "\x00\x00" # UCS2 build
+            assert res == u"\x00\x00" # UCS2 build
 
         def handler_unicodeinternal(exc):
             if not isinstance(exc, UnicodeDecodeError):
@@ -889,9 +888,9 @@
         codecs.register_error("test.hui", handler_unicodeinternal)
         res = b"\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui")
         if sys.maxunicode > 65535:
-            assert res == "\u0000\u0001\u0000"   # UCS4 build
+            assert res == u"\u0000\u0001"   # UCS4 build
         else:
-            assert res == "\x00\x00\x01\x00\x00" # UCS2 build
+            assert res == u"\x00\x00\x01" # UCS2 build
 
         def handler1(exc):
             if not isinstance(exc, UnicodeEncodeError) \
@@ -946,12 +945,12 @@
     def test_encode_error_bad_handler(self):
         import codecs
         codecs.register_error("test.bad_handler", lambda e: (repl, 1))
-        assert "xyz".encode("latin-1", "test.bad_handler") == b"xyz"
-        repl = "\u1234"
-        raises(UnicodeEncodeError, "\u5678".encode, "latin-1",
+        assert u"xyz".encode("latin-1", "test.bad_handler") == "xyz"
+        repl = u"\u1234"
+        raises(UnicodeEncodeError, u"\u5678".encode, "latin-1",
                "test.bad_handler")
-        repl = "\u00E9"
-        s = "\u5678".encode("latin-1", "test.bad_handler")
+        repl = u"\u00E9"
+        s = u"\u5678".encode("latin-1", "test.bad_handler")
         assert s == b'\xe9'
         raises(UnicodeEncodeError, "\u5678".encode, "ascii",
                "test.bad_handler")
@@ -993,7 +992,7 @@
         charmap = dict([(c, bytes([c, c]).upper()) for c in b"abcdefgh"])
         charmap[ord("?")] = b"XYZ"
         import codecs
-        sin = "abcDEF"
+        sin = u"abcDEF"
         sout = codecs.charmap_encode(sin, "replace", charmap)[0]
         assert sout == b"AABBCCXYZXYZXYZ"
 
@@ -1002,7 +1001,7 @@
 
     def test_charmap_build(self):
         import codecs
-        assert codecs.charmap_build('123456') == {49: 0, 50: 1, 51: 2,
+        assert codecs.charmap_build(u'123456') == {49: 0, 50: 1, 51: 2,
                                                    52: 3, 53: 4, 54: 5}
 
     def test_utf7_start_end_in_exception(self):
@@ -1013,7 +1012,7 @@
             assert exc.end == 3
 
     def test_utf7_surrogate(self):
-        assert b'+3ADYAA-'.decode('utf-7') == '\udc00\ud800'
+        assert b'+3ADYAA-'.decode('utf-7') == u'\udc00\ud800'
 
     def test_utf7_errors(self):
         import codecs
@@ -1044,7 +1043,7 @@
 
     def test_utf_16_encode_decode(self):
         import codecs, sys
-        x = '123abc'
+        x = u'123abc'
         if sys.byteorder == 'big':
             assert codecs.getencoder('utf-16')(x) == (
                     b'\xfe\xff\x001\x002\x003\x00a\x00b\x00c', 6)
@@ -1058,10 +1057,10 @@
 
     def test_unicode_escape(self):
         import _codecs
-        assert '\\'.encode('unicode-escape') == b'\\\\'
-        assert b'\\\\'.decode('unicode-escape') == '\\'
-        assert '\ud801'.encode('unicode-escape') == b'\\ud801'
-        assert '\u0013'.encode('unicode-escape') == b'\\x13'
+        assert u'\\'.encode('unicode-escape') == b'\\\\'
+        assert b'\\\\'.decode('unicode-escape') == u'\\'
+        assert u'\ud801'.encode('unicode-escape') == b'\\ud801'
+        assert u'\u0013'.encode('unicode-escape') == b'\\x13'
         assert _codecs.unicode_escape_decode(r"\u1234") == ("\u1234", 6)
 
     def test_mbcs(self):
diff --git a/pypy/module/_io/test/test_interp_textio.py b/pypy/module/_io/test/test_interp_textio.py
--- a/pypy/module/_io/test/test_interp_textio.py
+++ b/pypy/module/_io/test/test_interp_textio.py
@@ -1,6 +1,6 @@
 import pytest
 try:
-    from hypothesis import given, strategies as st, settings
+    from hypothesis import given, strategies as st, settings, example
 except ImportError:
     pytest.skip("hypothesis required")
 import os
@@ -63,6 +63,7 @@
     assert buf.exhausted()
 
 @given(st.text(), st.lists(st.integers(min_value=0)))
+ at example(u'\x80', [1])
 def test_readn_buffer(text, sizes):
     buf = DecodeBuffer(text.encode('utf-8'))
     strings = []
@@ -80,5 +81,5 @@
     buf = DecodeBuffer(text.encode('utf-8'))
     for i in range(len(text)):
         ch = buf.next_char()
-        assert ch == text[i].encode('utf-8')[0]
+        assert ch == text[i].encode('utf-8')
     assert buf.exhausted()
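[editor's note] The updated assertion above expects DecodeBuffer.next_char to return the full UTF-8 encoding of a character rather than only its first byte. A tiny sketch of the invariant the test relies on (DecodeBuffer itself is not reproduced here):

    # Reading "one character" from the UTF-8 buffer yields all of its
    # bytes, not just the first one.
    text = u'\x80'
    encoded = text.encode('utf-8')                 # two bytes: b'\xc2\x80'
    assert text[0].encode('utf-8') == encoded      # the whole character
    assert text[0].encode('utf-8') != encoded[:1]  # not merely its first byte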
diff --git a/pypy/module/_io/test/test_ztranslation.py b/pypy/module/_io/test/test_ztranslation.py
deleted file mode 100644
--- a/pypy/module/_io/test/test_ztranslation.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from pypy.interpreter.typedef import GetSetProperty
-from pypy.module.exceptions.interp_exceptions import W_BaseException
-from pypy.objspace.fake.checkmodule import checkmodule
-
-def test_checkmodule():
-    # XXX: PyTraceback usage in these methods blows up checkmodule
-    def descr_gettraceback(self, space):
-        return space.w_None
-    def descr_settraceback(self, space, w_newtraceback):
-        pass
-    W_BaseException.descr_gettraceback = descr_gettraceback
-    W_BaseException.descr_settraceback = descr_settraceback
-    W_BaseException.typedef.add_entries(
-        __traceback__=GetSetProperty(descr_gettraceback, descr_settraceback))
-    checkmodule('_io')
diff --git a/pypy/module/_rawffi/structure.py b/pypy/module/_rawffi/structure.py
--- a/pypy/module/_rawffi/structure.py
+++ b/pypy/module/_rawffi/structure.py
@@ -14,7 +14,7 @@
 from pypy.module._rawffi.interp_rawffi import unroll_letters_for_numbers
 from pypy.module._rawffi.interp_rawffi import size_alignment
 from pypy.module._rawffi.interp_rawffi import read_ptr, write_ptr
-from rpython.rlib import clibffi, rgc
+from rpython.rlib import clibffi, rgc, rutf8
 from rpython.rlib.rarithmetic import intmask, signedtype, r_uint, \
     r_ulonglong
 from rpython.rtyper.lltypesystem import lltype, rffi
@@ -163,6 +163,10 @@
                 if name in name_to_index:
                     raise oefmt(space.w_ValueError,
                                 "duplicate field name %s", name)
+                try:
+                    rutf8.check_ascii(name)
+                except rutf8.CheckError:
+                    raise oefmt(space.w_TypeError, 'non-ascii field name')
                 name_to_index[name] = i
             size, alignment, pos, bitsizes = size_alignment_pos(
                 fields, is_union, pack)
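[editor's note] The structure.py hunk above rejects non-ASCII field names via rutf8.check_ascii. A minimal plain-Python sketch of the same check (check_field_name is a hypothetical helper, not the _rawffi code):

    # Reject field names that are not pure ASCII.
    def check_field_name(name):
        try:
            name.decode('ascii')
        except UnicodeDecodeError:
            raise TypeError('non-ascii field name')
        return name

    assert check_field_name(b'x') == b'x'
    try:
        check_field_name(b'\xc3\xa9')
    except TypeError as e:
        assert str(e) == 'non-ascii field name'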
diff --git a/pypy/module/_rawffi/test/test__rawffi.py b/pypy/module/_rawffi/test/test__rawffi.py
--- a/pypy/module/_rawffi/test/test__rawffi.py
+++ b/pypy/module/_rawffi/test/test__rawffi.py
@@ -352,9 +352,10 @@
         import _rawffi
         A = _rawffi.Array('u')
         a = A(6, 'xx\x00\x00xx')
-        res = _rawffi.wcharp2unicode(a.buffer)
-        assert isinstance(res, str)
-        assert res == 'xx'
+        for i in (-1, 6):
+            res = _rawffi.wcharp2unicode(a.buffer, i)
+            assert isinstance(res, str)
+            assert res == u'xx'
         a.free()
 
     def test_rawstring2charp(self):
diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py
--- a/pypy/module/array/interp_array.py
+++ b/pypy/module/array/interp_array.py
@@ -439,7 +439,8 @@
         if len(s) % self.itemsize != 0:
             raise oefmt(space.w_ValueError,
                         "bytes length not a multiple of item size")
-        #self.check_valid_unicode(space, s) # empty for non-u arrays
+        # CPython accepts invalid unicode
+        # self.check_valid_unicode(space, s) # empty for non-u arrays
         oldlen = self.len
         new = len(s) / self.itemsize
         if not new:
@@ -1150,11 +1151,25 @@
             elif mytype.typecode == 'c':
                 return space.newbytes(item)
             elif mytype.typecode == 'u':
-                if ord(item) >= 0x110000:
-                    raise oefmt(space.w_ValueError,
-                                "array contains a unicode character out of "
-                                "range(0x110000)")
-                return space.newtext(rutf8.unichr_as_utf8(ord(item)), 1)
+                code = r_uint(ord(item))
+                # cpython will allow values > sys.maxunicode
+                # while silently truncating the top bits
+                if code <= r_uint(0x7F):
+                    # Encode ASCII
+                    item = chr(code)
+                elif code <= r_uint(0x07FF):
+                    item = (chr((0xc0 | (code >> 6))) + 
+                            chr((0x80 | (code & 0x3f))))
+                elif code <= r_uint(0xFFFF):
+                    item = (chr((0xe0 | (code >> 12))) +
+                            chr((0x80 | ((code >> 6) & 0x3f))) +
+                            chr((0x80 | (code & 0x3f))))
+                else:
+                    item = (chr((0xf0 | (code >> 18)) & 0xff) +
+                            chr((0x80 | ((code >> 12) & 0x3f))) +
+                            chr((0x80 | ((code >> 6) & 0x3f))) +
+                            chr((0x80 | (code & 0x3f))))
+                return space.newutf8(item, 1)
             assert 0, "unreachable"
 
         # interface
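[editor's note] The 'u' typecode branch above hand-encodes the item's codepoint into one to four UTF-8 bytes, deliberately truncating the top bits of out-of-range values as its comment notes CPython does. The same encoder written as plain Python, for illustration only (the real code builds an RPython string and wraps it with space.newutf8):

    # 1-4 byte UTF-8 encoder; values above 0x10ffff get their top bits
    # truncated by the `& 0xff` on the lead byte, matching the hunk above.
    def encode_codepoint_utf8(code):
        if code <= 0x7F:
            return chr(code)
        elif code <= 0x07FF:
            return chr(0xc0 | (code >> 6)) + chr(0x80 | (code & 0x3f))
        elif code <= 0xFFFF:
            return (chr(0xe0 | (code >> 12)) +
                    chr(0x80 | ((code >> 6) & 0x3f)) +
                    chr(0x80 | (code & 0x3f)))
        else:
            return (chr((0xf0 | (code >> 18)) & 0xff) +
                    chr(0x80 | ((code >> 12) & 0x3f)) +
                    chr(0x80 | ((code >> 6) & 0x3f)) +
                    chr(0x80 | (code & 0x3f)))

    assert encode_codepoint_utf8(0x41) == 'A'
    assert [ord(c) for c in encode_codepoint_utf8(0x20ac)] == [0xe2, 0x82, 0xac]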
diff --git a/pypy/module/array/test/test_array.py b/pypy/module/array/test/test_array.py
--- a/pypy/module/array/test/test_array.py
+++ b/pypy/module/array/test/test_array.py
@@ -635,6 +635,12 @@
             assert a[1] == 2
             assert a[2] == 3
 
+    def test_deepcopy(self):
+        a = self.array('u', u'\x01\u263a\x00\ufeff')
+        from copy import deepcopy
+        b = deepcopy(a)
+        assert a == b
+
     def test_addmul(self):
         a = self.array('i', [1, 2, 3])
         assert repr(a + a) == "array('i', [1, 2, 3, 1, 2, 3])"
@@ -892,14 +898,6 @@
         b.byteswap()
         raises(ValueError, "a != b")
 
-    def test_unicode_ord_positive(self):
-        import sys
-        if sys.maxunicode == 0xffff:
-            skip("test for 32-bit unicodes")
-        a = self.array('u', b'\xff\xff\xff\xff')
-        assert len(a) == 1
-        raises(ValueError, "a[0]")
-
     def test_weakref(self):
         import weakref
         a = self.array('u', 'Hi!')
diff --git a/pypy/objspace/std/bytesobject.py b/pypy/objspace/std/bytesobject.py
--- a/pypy/objspace/std/bytesobject.py
+++ b/pypy/objspace/std/bytesobject.py
@@ -431,10 +431,17 @@
     def utf8_w(self, space):
         return self._value
 
+    def utf8_w(self, space):
+        return self._value
+
     def buffer_w(self, space, flags):
         space.check_buf_flags(flags, True)
         return SimpleView(StringBuffer(self._value))
 
+    def descr_encode(self, space, w_encoding=None, w_errors=None):
+        w_uni = self.descr_decode(space, space.newtext('ascii'), space.newtext('strict'))
+        return space.call_method(w_uni, 'encode', w_encoding, w_errors)
+
     def descr_getbuffer(self, space, w_flags):
         #from pypy.objspace.std.bufferobject import W_Buffer
         #return W_Buffer(StringBuffer(self._value))
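[editor's note] The new descr_encode above implements bytes.encode() with Python 2 semantics: decode the bytes as ASCII, then encode the resulting unicode with the requested codec. A short plain-Python sketch of that behaviour (bytes_encode is a hypothetical helper):

    # Decode as ASCII first, then re-encode with the requested codec.
    def bytes_encode(b, encoding, errors='strict'):
        return b.decode('ascii', 'strict').encode(encoding, errors)

    assert bytes_encode(b'abc', 'utf-16-le') == b'a\x00b\x00c\x00'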
diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py
--- a/pypy/objspace/std/formatting.py
+++ b/pypy/objspace/std/formatting.py
@@ -11,6 +11,7 @@
 from rpython.tool.sourcetools import func_with_new_name
 
 from pypy.interpreter.error import OperationError, oefmt
+from pypy.interpreter.unicodehelper import check_ascii_or_raise
 
 
 class BaseStringFormatter(object):
@@ -498,7 +499,6 @@
                 else:
                     s = ''
                 if len(s) == 1:
-                    self.std_wp(s)
                     return
                 raise oefmt(space.w_TypeError, "%c requires int or single byte")
             else:
@@ -581,7 +581,7 @@
     formatter = UnicodeFormatter(space, fmt, values_w, w_valuedict)
     result = formatter.format()
     # this can force strings, not sure if it's a problem or not
-    lgt = rutf8.check_utf8(result, True)
+    lgt = rutf8.codepoints_in_utf8(result)
     return space.newutf8(result, lgt)
 
 def mod_format(space, w_format, w_values, fmt_type=FORMAT_STR):
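[editor's note] The formatter change above counts codepoints in the already-built UTF-8 result instead of re-validating it with check_utf8. A pure-Python sketch of what codepoints_in_utf8 computes (illustrative, not the rutf8 implementation):

    # Count the bytes that are not UTF-8 continuation bytes (0b10xxxxxx).
    # Assumes the input is already valid UTF-8, so no validation is redone.
    def codepoints_in_utf8(s):
        return sum(1 for b in bytearray(s) if (b & 0xC0) != 0x80)

    utf8 = u'a\xe9\u20ac'.encode('utf-8')          # 1 + 2 + 3 bytes
    assert len(utf8) == 6 and codepoints_in_utf8(utf8) == 3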
diff --git a/pypy/objspace/std/test/test_bytesobject.py b/pypy/objspace/std/test/test_bytesobject.py
--- a/pypy/objspace/std/test/test_bytesobject.py
+++ b/pypy/objspace/std/test/test_bytesobject.py
@@ -1,7 +1,9 @@
 # coding: utf-8
+import pytest
 
 from pypy.interpreter.error import OperationError
 
+
 class TestW_BytesObject:
 
     def teardown_method(self, method):
@@ -637,6 +639,7 @@
     def test_unicode_join_str_arg_ascii(self):
         raises(TypeError, ''.join, [b'\xc3\xa1'])
 
+    @pytest.mark.xfail(reason='setdefaultencoding does not work?')
     def test_unicode_join_endcase(self):
         # This class inserts a Unicode object into its argument's natural
         # iteration, in the 3rd position.
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -1,7 +1,12 @@
 # -*- encoding: utf-8 -*-
 import py
 import sys
-from hypothesis import given, strategies, settings, example
+try:
+    from hypothesis import given, strategies, settings, example
+    HAS_HYPOTHESIS = True
+except ImportError:
+    HAS_HYPOTHESIS = False
+    
 from rpython.rlib import rutf8
 from pypy.interpreter.error import OperationError
 
@@ -33,86 +38,145 @@
                 space.w_unicode, "__new__", space.w_unicode, w_uni)
         assert w_new is w_uni
 
-    @given(strategies.text(), strategies.integers(min_value=0, max_value=10),
-                              strategies.integers(min_value=-1, max_value=10))
-    def test_hypo_index_find(self, u, start, len1):
-        if start + len1 < 0:
-            return   # skip this case
-        v = u[start : start + len1]
-        space = self.space
-        w_u = space.newutf8(u.encode('utf8'), len(u))
-        w_v = space.newutf8(v.encode('utf8'), len(v))
-        expected = u.find(v, start, start + len1)
-        try:
-            w_index = space.call_method(w_u, 'index', w_v,
+    if HAS_HYPOTHESIS:
+        @given(strategies.text(), strategies.integers(min_value=0, max_value=10),
+                                  strategies.integers(min_value=-1, max_value=10))
+        def test_hypo_index_find(self, u, start, len1):
+            if start + len1 < 0:
+                return   # skip this case
+            v = u[start : start + len1]
+            space = self.space
+            w_u = space.newutf8(u.encode('utf8'), len(u))
+            w_v = space.newutf8(v.encode('utf8'), len(v))
+            expected = u.find(v, start, start + len1)
+            try:
+                w_index = space.call_method(w_u, 'index', w_v,
+                                            space.newint(start),
+                                            space.newint(start + len1))
+            except OperationError as e:
+                if not e.match(space, space.w_ValueError):
+                    raise
+                assert expected == -1
+            else:
+                assert space.int_w(w_index) == expected >= 0
+
+            w_index = space.call_method(w_u, 'find', w_v,
                                         space.newint(start),
                                         space.newint(start + len1))
-        except OperationError as e:
-            if not e.match(space, space.w_ValueError):
-                raise
-            assert expected == -1
-        else:
-            assert space.int_w(w_index) == expected >= 0
+            assert space.int_w(w_index) == expected
+            rexpected = u.rfind(v, start, start + len1)
+            try:
+                w_index = space.call_method(w_u, 'rindex', w_v,
+                                            space.newint(start),
+                                            space.newint(start + len1))
+            except OperationError as e:
+                if not e.match(space, space.w_ValueError):
+                    raise
+                assert rexpected == -1
+            else:
+                assert space.int_w(w_index) == rexpected >= 0
 
-        w_index = space.call_method(w_u, 'find', w_v,
-                                    space.newint(start),
-                                    space.newint(start + len1))
-        assert space.int_w(w_index) == expected
-
-        rexpected = u.rfind(v, start, start + len1)
-        try:
-            w_index = space.call_method(w_u, 'rindex', w_v,
+            w_index = space.call_method(w_u, 'rfind', w_v,
                                         space.newint(start),
                                         space.newint(start + len1))
-        except OperationError as e:
-            if not e.match(space, space.w_ValueError):
-                raise
-            assert rexpected == -1
-        else:
-            assert space.int_w(w_index) == rexpected >= 0
+            assert space.int_w(w_index) == rexpected
 
-        w_index = space.call_method(w_u, 'rfind', w_v,
-                                    space.newint(start),
-                                    space.newint(start + len1))
-        assert space.int_w(w_index) == rexpected
+            expected = u.startswith(v, start)
+            w_res = space.call_method(w_u, 'startswith', w_v,
+                                      space.newint(start))
+            assert w_res is space.newbool(expected)
 
-        expected = u.startswith(v, start)
-        if expected and start > len(u):
-            expected = False # python2 vs. python3
-        w_res = space.call_method(w_u, 'startswith', w_v,
-                                  space.newint(start))
-        assert w_res is space.newbool(expected)
+            expected = u.startswith(v, start, start + len1)
+            w_res = space.call_method(w_u, 'startswith', w_v,
+                                      space.newint(start),
+                                      space.newint(start + len1))
+            assert w_res is space.newbool(expected)
 
-        expected = u.startswith(v, start, start + len1)
-        if expected and start > len(u):
-            expected = False # python2 vs. python3
-        w_res = space.call_method(w_u, 'startswith', w_v,
-                                  space.newint(start),
-                                  space.newint(start + len1))
-        assert w_res is space.newbool(expected)
+            expected = u.endswith(v, start)
+            w_res = space.call_method(w_u, 'endswith', w_v,
+                                      space.newint(start))
+            assert w_res is space.newbool(expected)
 
-        expected = u.endswith(v, start)
-        if expected and start > len(u):
-            expected = False # python2 vs. python3
-        w_res = space.call_method(w_u, 'endswith', w_v,
-                                  space.newint(start))
-        assert w_res is space.newbool(expected)
+            expected = u.endswith(v, start, start + len1)
+            w_res = space.call_method(w_u, 'endswith', w_v,
+                                      space.newint(start),
+                                      space.newint(start + len1))
+            assert w_res is space.newbool(expected)
 
-        expected = u.endswith(v, start, start + len1)
-        if expected and start > len(u):
-            expected = False # python2 vs. python3
-        w_res = space.call_method(w_u, 'endswith', w_v,
-                                  space.newint(start),
-                                  space.newint(start + len1))
-        assert w_res is space.newbool(expected)
 
-    def test_text_w(self):
-        space = self.space
-        w_uni = space.wrap(u'abcd')
-        assert space.text_w(w_uni) == 'abcd'
-        w_uni = space.wrap(unichr(0xd921) + unichr(0xdddd))
-        # XXXX Test is from py3.5, should this still fail?
-        space.raises_w(space.w_UnicodeEncodeError, space.text_w, w_uni)
+        @given(u=strategies.text(),
+               start=strategies.integers(min_value=0, max_value=10),
+               len1=strategies.integers(min_value=-1, max_value=10))
+        def test_hypo_index_find(self, u, start, len1):
+            space = self.space
+            if start + len1 < 0:
+                return   # skip this case
+            v = u[start : start + len1]
+            w_u = space.wrap(u)
+            w_v = space.wrap(v)
+            expected = u.find(v, start, start + len1)
+            try:
+                w_index = space.call_method(w_u, 'index', w_v,
+                                            space.newint(start),
+                                            space.newint(start + len1))
+            except OperationError as e:
+                if not e.match(space, space.w_ValueError):
+                    raise
+                assert expected == -1
+            else:
+                assert space.int_w(w_index) == expected >= 0
+
+            w_index = space.call_method(w_u, 'find', w_v,
+                                        space.newint(start),
+                                        space.newint(start + len1))
+            assert space.int_w(w_index) == expected
+
+            rexpected = u.rfind(v, start, start + len1)
+            try:
+                w_index = space.call_method(w_u, 'rindex', w_v,
+                                            space.newint(start),
+                                            space.newint(start + len1))
+            except OperationError as e:
+                if not e.match(space, space.w_ValueError):
+                    raise
+                assert rexpected == -1
+            else:
+                assert space.int_w(w_index) == rexpected >= 0
+
+            w_index = space.call_method(w_u, 'rfind', w_v,
+                                        space.newint(start),
+                                        space.newint(start + len1))
+            assert space.int_w(w_index) == rexpected
+
+            expected = u.startswith(v, start)
+            if expected and start > len(u):
+                expected = False # python2 vs. python3
+            w_res = space.call_method(w_u, 'startswith', w_v,
+                                      space.newint(start))
+            assert w_res is space.newbool(expected)
+
+            expected = u.startswith(v, start, start + len1)
+            if expected and start > len(u):
+                expected = False # python2 vs. python3
+            w_res = space.call_method(w_u, 'startswith', w_v,
+                                      space.newint(start),
+                                      space.newint(start + len1))
+            assert w_res is space.newbool(expected)
+
+            expected = u.endswith(v, start)
+            if expected and start > len(u):
+                expected = False # python2 vs. python3
+            w_res = space.call_method(w_u, 'endswith', w_v,
+                                      space.newint(start))
+            assert w_res is space.newbool(expected)
+
+            expected = u.endswith(v, start, start + len1)
+            if expected and start > len(u):
+                expected = False # python2 vs. python3
+            w_res = space.call_method(w_u, 'endswith', w_v,
+                                      space.newint(start),
+                                      space.newint(start + len1))
+            assert w_res is space.newbool(expected)
 
 
 class AppTestUnicodeStringStdOnly:
@@ -853,18 +917,20 @@
     def test_rfind_corner_case(self):
         assert 'abc'.rfind('', 4) == -1
 
-    def test_count(self):
-        assert "".count("x") ==0
-        assert "".count("") ==1
-        assert "Python".count("") ==7
-        assert "ab aaba".count("ab") ==2
-        assert 'aaa'.count('a') == 3
-        assert 'aaa'.count('b') == 0
-        assert 'aaa'.count('a', -1) == 1
-        assert 'aaa'.count('a', -10) == 3
-        assert 'aaa'.count('a', 0, -1) == 2
-        assert 'aaa'.count('a', 0, -10) == 0
-        assert 'ababa'.count('aba') == 1
+    def test_count_unicode(self):
+        assert u'aaa'.count(u'', 10) == 0
+        assert u'aaa'.count(u'', 3) == 1
+        assert u"".count(u"x") ==0
+        assert u"".count(u"") ==1
+        assert u"Python".count(u"") ==7
+        assert u"ab aaba".count(u"ab") ==2
+        assert u'aaa'.count(u'a') == 3
+        assert u'aaa'.count(u'b') == 0
+        assert u'aaa'.count(u'a', -1) == 1
+        assert u'aaa'.count(u'a', -10) == 3
+        assert u'aaa'.count(u'a', 0, -1) == 2
+        assert u'aaa'.count(u'a', 0, -10) == 0
+        assert u'ababa'.count(u'aba') == 1
 
     def test_swapcase(self):
         assert '\xe4\xc4\xdf'.swapcase() == '\xc4\xe4SS'
@@ -1210,9 +1276,8 @@
         assert type(str(z)) is str
         assert str(z) == u'foobaz'
         #
-        # two completely corner cases where we differ from CPython:
-        #assert unicode(encoding='supposedly_the_encoding') == u''
-        #assert unicode(errors='supposedly_the_error') == u''
+        assert unicode(encoding='supposedly_the_encoding') == u''
+        assert unicode(errors='supposedly_the_error') == u''
         e = raises(TypeError, str, u'', 'supposedly_the_encoding')
         assert str(e.value) == 'decoding str is not supported'
         e = raises(TypeError, str, u'', errors='supposedly_the_error')
diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -460,6 +460,7 @@
         ptr = self.start_ptr
         if not self.next_char_ok(ctx, pattern, ptr, self.ppos3):
             return
+        assert not isinstance(ctx, AbstractMatchContext)
         self.start_ptr = ctx.next(ptr)
         return self.find_first_result(ctx, pattern)
 
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -8,6 +8,7 @@
 from rpython.rlib import jit, nonconst
 
 
+# We always use MAXUNICODE = 0x10ffff when unicode objects use utf8
 if 1 or rffi.sizeof(lltype.UniChar) == 4:
     MAXUNICODE = 0x10ffff
     allow_surrogate_by_default = False
diff --git a/rpython/rtyper/lltypesystem/rffi.py b/rpython/rtyper/lltypesystem/rffi.py
--- a/rpython/rtyper/lltypesystem/rffi.py
+++ b/rpython/rtyper/lltypesystem/rffi.py
@@ -1046,7 +1046,7 @@
 
     s = rutf8.Utf8StringBuilder(maxlen)
     i = 0
-    while i < maxlen and w[i] != '\x00':
+    while i < maxlen and ord(w[i]):
         s.append_code(ord(w[i]))
         i += 1
     return s.build(), i
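[editor's note] The rffi loop above now stops on a NUL wide character via ord(w[i]) while copying codepoints into a Utf8StringBuilder. A plain-Python sketch of the same scan (wcharp2utf8n is a hypothetical stand-in; the builder is approximated by a list of encoded characters):

    # Copy wide characters into a UTF-8 result until either maxlen or a
    # NUL character is reached.
    def wcharp2utf8n(w, maxlen):
        parts = []
        i = 0
        while i < maxlen and ord(w[i]):            # stop at NUL, like the loop above
            parts.append(w[i].encode('utf-8'))
            i += 1
        return b''.join(parts), i

    s, size = wcharp2utf8n(u'ab\x00cd', 10)
    assert (s, size) == (b'ab', 2)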

