[pypy-commit] pypy unicode-utf8-py3: fix some utf8 - unicode confusion, edge cases
mattip
pypy.commits at gmail.com
Sat Aug 4 19:45:46 EDT 2018
Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r94944:d9685a6896aa
Date: 2018-08-04 16:44 -0700
http://bitbucket.org/pypy/pypy/changeset/d9685a6896aa/
Log: fix some utf8 - unicode confusion, edge cases
diff --git a/pypy/interpreter/argument.py b/pypy/interpreter/argument.py
--- a/pypy/interpreter/argument.py
+++ b/pypy/interpreter/argument.py
@@ -602,11 +602,11 @@
def getmsg(self):
if self.num_kwds == 1:
- if isinstance(self.kwd_name, str):
- uname = self.kwd_name.decode('utf8')
+ if isinstance(self.kwd_name, unicode):
+ uname = self.kwd_name.encode('utf8')
else:
uname = self.kwd_name
- msg = u"got an unexpected keyword argument '%s'" % uname
+ msg = "got an unexpected keyword argument '%s'" % uname
else:
msg = "got %d unexpected keyword arguments" % (
self.num_kwds)
diff --git a/pypy/interpreter/error.py b/pypy/interpreter/error.py
--- a/pypy/interpreter/error.py
+++ b/pypy/interpreter/error.py
@@ -520,7 +520,7 @@
if isinstance(value, unicode):
result = value.encode('utf8')
else:
- result = value
+ result = value.decode('utf8', errors='replace')
else:
if isinstance(value, unicode):
result = value
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -117,7 +117,7 @@
# instead
from pypy.module._codecs.locale import (
unicode_encode_locale_surrogateescape)
- uni = space.realunicode_w(w_uni).decode('utf8')
+ uni = space.realunicode_w(w_uni)
if u'\x00' in uni:
raise oefmt(space.w_ValueError, "embedded null character")
bytes = unicode_encode_locale_surrogateescape(
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -513,7 +513,7 @@
def descr_encode(self, space, w_encoding=None, w_errors=None):
encoding, errors = _get_encoding_and_errors(space, w_encoding,
w_errors)
- return encode_object(space, self, encoding, errors)
+ return encode_object(space, self, encoding, errors, allow_surrogates=True)
@unwrap_spec(tabsize=int)
def descr_expandtabs(self, space, tabsize=8):
@@ -1183,16 +1183,17 @@
return encoding, errors
-def encode_object(space, w_object, encoding, errors):
+def encode_object(space, w_object, encoding, errors, allow_surrogates=False):
utf8 = space.utf8_w(w_object)
# TODO: refactor unnatrual use of error hanlders here,
# we should make a single pass over the utf8 str
- pos = rutf8.surrogate_in_utf8(utf8)
- if pos >= 0:
- eh = unicodehelper.encode_error_handler(space)
- eh(None, "utf8", "surrogates not allowed", utf8,
- pos, pos + 1)
- assert False, "always raises"
+ if not allow_surrogates:
+ pos = rutf8.surrogate_in_utf8(utf8)
+ if pos >= 0:
+ eh = unicodehelper.encode_error_handler(space)
+ eh(None, "utf8", "surrogates not allowed", utf8,
+ pos, pos + 1)
+ assert False, "always raises"
if errors is None or errors == 'strict':
if encoding is None or encoding == 'utf-8':
#if rutf8.has_surrogates(utf8):
More information about the pypy-commit
mailing list