[pypy-commit] pypy unicode-utf8-py3: remove round-tripping str-unicode from errors

Fri Feb 1 08:53:46 EST 2019

Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r95768:be2b370c0652
Date: 2019-02-01 15:49 +0200
http://bitbucket.org/pypy/pypy/changeset/be2b370c0652/

Log:	remove round-tripping str-unicode from errors

diff --git a/pypy/interpreter/error.py b/pypy/interpreter/error.py
--- a/pypy/interpreter/error.py
+++ b/pypy/interpreter/error.py
@@ -472,19 +472,7 @@
     assert len(formats) > 0, "unsupported: no % command found"
     return tuple(parts), tuple(formats)
 
-def _decode_utf8(string):
-    # when building the error message, don't crash if the byte string
-    # provided is not valid UTF-8
-    if isinstance(string, unicode):
-        return string
-    assert isinstance(string, str)
-    #return string.decode('utf8')
-    result, consumed = runicode.str_decode_utf_8(
-        string, len(string), "replace", final=True)
-    return result
-
 def get_operrcls2(valuefmt):
-    valuefmt = valuefmt.decode('ascii')
     strings, formats = decompose_valuefmt(valuefmt)
     assert len(strings) == len(formats) + 1
     try:
@@ -503,49 +491,49 @@
                 self.setup(w_type)
 
             def _compute_value(self, space):
-                # TODO: avoid utf8->unicode->utf8 dance
                 lst = [None] * (len(formats) + len(formats) + 1)
+                lgt = 0
                 for i, fmt, attr in entries:
                     lst[i + i] = self.xstrings[i]
+                    lgt += len(self.xstrings[i])
                     value = getattr(self, attr)
                     if fmt == 'd':
-                        result = str(value).decode('ascii')
+                        result = str(value)
+                        lgt += len(result)
                     elif fmt == 'R':
-                        result = space.realunicode_w(space.repr(value))
+                        result = space.utf8_w(space.repr(value))
+                        lgt += len(result)
                     elif fmt == 'S':
-                        result = space.realunicode_w(space.str(value))
+                        result = space.utf8_w(space.str(value))
+                        lgt += len(result)
                     elif fmt == 'T':
-                        result = _decode_utf8(space.type(value).name)
+                        result = space.type(value).name
+                        lgt += len(result)
                     elif fmt == 'N':
-                        name = value.getname(space)
-                        if isinstance(name, unicode):
-                            result = name
-                        else:
-                            result = _decode_utf8(name)
+                        result = value.getname(space)
+                        lgt += len(result)
                     elif fmt == '8':
                         # u'str\uxxxx' -> 'str\xXX\xXX' -> u"'str\xXX\xXX'"
-                        if isinstance(value, unicode):
-                            result = runicode.unicode_encode_utf_8(value,
-                                     len(value), 'strict', allow_surrogates=True)
-                        else:
-                            from pypy.interpreter import unicodehelper
-                            result = _decode_utf8(unicodehelper.str_decode_utf8(
-                                value, 'replace', True,
-                                unicodehelper.decode_never_raise, True)[0])
+                        from pypy.interpreter import unicodehelper
+                        result, _lgt, pos  = unicodehelper.str_decode_utf8(
+                            value, 'replace', True,
+                            unicodehelper.decode_never_raise, True)
+                        lgt += _lgt
+                    elif isinstance(value, unicode):
+                        # 's'
+                        result = str(value.encode('utf-8'))
+                        lgt += len(value)
                     else:
-                        if isinstance(value, unicode):
-                            result = value
-                        else:
-                            result = _decode_utf8(str(value))
+                        from rpython.rlib import rutf8
+                        result = str(value)
+                        # Assumes valid utf-8
+                        lgt += rutf8.check_utf8(result, True)
                     lst[i + i + 1] = result
                 lst[-1] = self.xstrings[-1]
-                retval = u''.join(lst)
-                # We need to annotate both allow_surrogates=True,False
-                # since this function is used to replace uni.encode('utf8')
-                # deep in rpython
-                return runicode.unicode_encode_utf_8(retval, len(retval),
-                             'strict', allow_surrogates=False), len(retval)
-        #
+                lgt += len(self.xstrings[-1])
+                retval = ''.join(lst)
+                return retval, lgt
+
         _fmtcache2[formats] = OpErrFmt
     return OpErrFmt, strings