[pypy-commit] pypy unicode-utf8-py3: use encode_utf8, str_decode_utf8, and maybe handle surrogates in the latter

Sun Sep 2 05:52:56 EDT 2018

Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r95073:b040f44dc71b
Date: 2018-09-02 10:18 +0200
http://bitbucket.org/pypy/pypy/changeset/b040f44dc71b/

Log:	use encode_utf8, str_decode_utf8, and maybe handle surrogates in the
	latter

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -50,6 +50,23 @@
         return u'', None, 0
     return raise_unicode_exception_encode
 
+@specialize.memo()
+def encode_unicode_error_handler(space):
+    # Fast version of the "strict" errors handler.
+    def raise_unicode_exception_encode(errors, encoding, msg, uni,
+                                       startingpos, endingpos):
+        assert isinstance(uni, unicode)
+        u_len = len(uni)
+        utf8 = runicode.unicode_encode_utf8sp(uni, u_len)
+        raise OperationError(space.w_UnicodeEncodeError,
+                             space.newtuple([space.newtext(encoding),
+                                             space.newtext(utf8, u_len),
+                                             space.newint(startingpos),
+                                             space.newint(endingpos),
+                                             space.newtext(msg)]))
+        return u'', None, 0
+    return raise_unicode_exception_encode
+
 def default_error_encode(
         errors, encoding, msg, u, startingpos, endingpos):
     """A default handler, for tests"""
@@ -322,7 +339,6 @@
     valid so we're trying to either raise or pack stuff with error handler.
     The key difference is that this is call_may_force
     """
-    # XXX need to handle allow_surrogates
     slen = len(s)
     res = StringBuilder(slen)
     pos = 0
@@ -377,7 +393,7 @@
             ordch2 = ord(s[pos])
             ordch3 = ord(s[pos + 1])
 
-            if rutf8._invalid_byte_2_of_3(ordch1, ordch2, True):
+            if rutf8._invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
                 r, pos = errorhandler(errors, "utf8", "invalid continuation byte",
                     s, pos - 1, pos)
                 res.append(r)
@@ -994,7 +1010,7 @@
     assert isinstance(uni, unicode)
     return runicode.unicode_encode_utf_8(
         uni, len(uni), "strict",
-        errorhandler=encode_error_handler(space),
+        errorhandler=encode_unicode_error_handler(space),
         allow_surrogates=allow_surrogates)
 
 def encode_utf8sp(space, uni):
diff --git a/pypy/objspace/std/stringmethods.py b/pypy/objspace/std/stringmethods.py
--- a/pypy/objspace/std/stringmethods.py
+++ b/pypy/objspace/std/stringmethods.py
@@ -7,6 +7,7 @@
     find, rfind, count, endswith, replace, rsplit, split, startswith)
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.interpreter.gateway import WrappedDefault, unwrap_spec
+from pypy.interpreter.unicodehelper import str_decode_utf8
 from pypy.objspace.std.sliceobject import W_SliceObject, unwrap_start_stop
 
 
@@ -197,6 +198,12 @@
             errors = 'strict'
         if encoding is None:
             encoding = 'utf8'
+        if encoding == 'utf8' or encoding == 'utf-8':
+            from pypy.module._codecs.interp_codecs import CodecState
+            state = space.fromcache(CodecState)
+            eh = state.decode_error_handler
+            s = space.charbuf_w(self)
+            ret, lgt, pos = str_decode_utf8(s, errors, True, eh)
         return decode_object(space, self, encoding, errors)
 
     @unwrap_spec(tabsize=int)
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1898,12 +1898,8 @@
         raise oefmt(space.w_TypeError, "expected unicode, got '%T'", w_unistr)
     value = _rpy_unicode_to_decimal_w(space, w_unistr.utf8_w(space).decode('utf8'))
     # XXX this is the only place in the code that this funcion is called.
-    # It does not translate, since it uses a pypy-level error handler
-    # to throw the UnicodeEncodeError not the rpython default handler
-    #return unicodehelper.encode_utf8(space, value,
-    #                                 allow_surrogates=allow_surrogates)
-    assert isinstance(value, unicode)
-    return value.encode('utf8')
+    return unicodehelper.encode_utf8(space, value,
+                                     allow_surrogates=allow_surrogates)
 
 def _rpy_unicode_to_decimal_w(space, unistr):
     # XXX rewrite this to accept a utf8 string and use a StringBuilder