[pypy-commit] pypy unicode-utf8-py3: fix some utf8 - unicode confusion, edge cases

Sat Aug 4 19:45:46 EDT 2018

Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r94944:d9685a6896aa
Date: 2018-08-04 16:44 -0700
http://bitbucket.org/pypy/pypy/changeset/d9685a6896aa/

Log:	fix some utf8 - unicode confusion, edge cases

diff --git a/pypy/interpreter/argument.py b/pypy/interpreter/argument.py
--- a/pypy/interpreter/argument.py
+++ b/pypy/interpreter/argument.py
@@ -602,11 +602,11 @@
 
     def getmsg(self):
         if self.num_kwds == 1:
-            if isinstance(self.kwd_name, str):
-                uname = self.kwd_name.decode('utf8')
+            if isinstance(self.kwd_name, unicode):
+                uname = self.kwd_name.encode('utf8')
             else:
                 uname = self.kwd_name
-            msg = u"got an unexpected keyword argument '%s'" % uname
+            msg = "got an unexpected keyword argument '%s'" % uname
         else:
             msg = "got %d unexpected keyword arguments" % (
                 self.num_kwds)
diff --git a/pypy/interpreter/error.py b/pypy/interpreter/error.py
--- a/pypy/interpreter/error.py
+++ b/pypy/interpreter/error.py
@@ -520,7 +520,7 @@
                         if isinstance(value, unicode):
                             result = value.encode('utf8')
                         else:
-                            result = value
+                            result = value.decode('utf8', errors='replace')
                     else:
                         if isinstance(value, unicode):
                             result = value
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -117,7 +117,7 @@
         # instead
         from pypy.module._codecs.locale import (
             unicode_encode_locale_surrogateescape)
-        uni = space.realunicode_w(w_uni).decode('utf8')
+        uni = space.realunicode_w(w_uni)
         if u'\x00' in uni:
             raise oefmt(space.w_ValueError, "embedded null character")
         bytes = unicode_encode_locale_surrogateescape(
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -513,7 +513,7 @@
     def descr_encode(self, space, w_encoding=None, w_errors=None):
         encoding, errors = _get_encoding_and_errors(space, w_encoding,
                                                     w_errors)
-        return encode_object(space, self, encoding, errors)
+        return encode_object(space, self, encoding, errors, allow_surrogates=True)
 
     @unwrap_spec(tabsize=int)
     def descr_expandtabs(self, space, tabsize=8):
@@ -1183,16 +1183,17 @@
     return encoding, errors
 
 
-def encode_object(space, w_object, encoding, errors):
+def encode_object(space, w_object, encoding, errors, allow_surrogates=False):
     utf8 = space.utf8_w(w_object)
     # TODO: refactor unnatrual use of error hanlders here,
     # we should make a single pass over the utf8 str
-    pos = rutf8.surrogate_in_utf8(utf8)
-    if pos >= 0:
-        eh = unicodehelper.encode_error_handler(space)
-        eh(None, "utf8", "surrogates not allowed", utf8,
-            pos, pos + 1)
-        assert False, "always raises"
+    if not allow_surrogates:
+        pos = rutf8.surrogate_in_utf8(utf8)
+        if pos >= 0:
+            eh = unicodehelper.encode_error_handler(space)
+            eh(None, "utf8", "surrogates not allowed", utf8,
+                pos, pos + 1)
+            assert False, "always raises"
     if errors is None or errors == 'strict':
         if encoding is None or encoding == 'utf-8':
             #if rutf8.has_surrogates(utf8):