[pypy-commit] pypy unicode-utf8-py3: one test uncovered two fixes

Thu Nov 29 08:26:00 EST 2018

Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r95385:96d5bc8106a8
Date: 2018-11-29 05:24 -0800
http://bitbucket.org/pypy/pypy/changeset/96d5bc8106a8/

Log:	one test uncovered two fixes

diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -749,7 +749,7 @@
     state = space.fromcache(CodecState)
     # call the fast version for checking
     try:
-        lgt = rutf8.check_utf8(string, allow_surrogates=True)
+        lgt = rutf8.check_utf8(string, allow_surrogates=False)
     except rutf8.CheckError:
         res, lgt, pos = unicodehelper.str_decode_utf8(string,
             errors, final, state.decode_error_handler)
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -625,6 +625,8 @@
         assert '[\uDC80]'.encode('utf-8', 'namereplace') == b'[\\udc80]'
 
     def test_surrogateescape(self):
+        uni = b"\xed\xb0\x80".decode("utf-8", "surrogateescape")
+        assert uni == "\udced\udcb0\udc80"
         assert "\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1",
                              "surrogateescape") == b"\xe4\xeb\xef\xf6\xfc"
         assert b'a\x80b'.decode('utf-8', 'surrogateescape') == 'a\udc80b'
diff --git a/pypy/objspace/std/stringmethods.py b/pypy/objspace/std/stringmethods.py
--- a/pypy/objspace/std/stringmethods.py
+++ b/pypy/objspace/std/stringmethods.py
@@ -198,11 +198,13 @@
         if encoding is None:
             encoding = 'utf8'
         if encoding == 'utf8' or encoding == 'utf-8':
+            # fast path - do not call into app-level codecs.py
             from pypy.module._codecs.interp_codecs import CodecState
             state = space.fromcache(CodecState)
             eh = state.decode_error_handler
             s = space.charbuf_w(self)
             ret, lgt, pos = str_decode_utf8(s, errors, True, eh)
+            return space.newtext(ret, lgt)
         return decode_object(space, self, encoding, errors)
 
     @unwrap_spec(tabsize=int)