[pypy-commit] pypy unicode-utf8-py3: use encode_utf8, str_decode_utf8, and maybe handle surrogates in the latter
mattip
pypy.commits at gmail.com
Sun Sep 2 05:52:56 EDT 2018
Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r95073:b040f44dc71b
Date: 2018-09-02 10:18 +0200
http://bitbucket.org/pypy/pypy/changeset/b040f44dc71b/
Log: use encode_utf8, str_decode_utf8, and maybe handle surrogates in the
latter
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -50,6 +50,23 @@
return u'', None, 0
return raise_unicode_exception_encode
+@specialize.memo()
+def encode_unicode_error_handler(space):
+ # Fast version of the "strict" errors handler.
+ def raise_unicode_exception_encode(errors, encoding, msg, uni,
+ startingpos, endingpos):
+ assert isinstance(uni, unicode)
+ u_len = len(uni)
+ utf8 = runicode.unicode_encode_utf8sp(uni, u_len)
+ raise OperationError(space.w_UnicodeEncodeError,
+ space.newtuple([space.newtext(encoding),
+ space.newtext(utf8, u_len),
+ space.newint(startingpos),
+ space.newint(endingpos),
+ space.newtext(msg)]))
+ return u'', None, 0
+ return raise_unicode_exception_encode
+
def default_error_encode(
errors, encoding, msg, u, startingpos, endingpos):
"""A default handler, for tests"""
@@ -322,7 +339,6 @@
valid so we're trying to either raise or pack stuff with error handler.
The key difference is that this is call_may_force
"""
- # XXX need to handle allow_surrogates
slen = len(s)
res = StringBuilder(slen)
pos = 0
@@ -377,7 +393,7 @@
ordch2 = ord(s[pos])
ordch3 = ord(s[pos + 1])
- if rutf8._invalid_byte_2_of_3(ordch1, ordch2, True):
+ if rutf8._invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
r, pos = errorhandler(errors, "utf8", "invalid continuation byte",
s, pos - 1, pos)
res.append(r)
@@ -994,7 +1010,7 @@
assert isinstance(uni, unicode)
return runicode.unicode_encode_utf_8(
uni, len(uni), "strict",
- errorhandler=encode_error_handler(space),
+ errorhandler=encode_unicode_error_handler(space),
allow_surrogates=allow_surrogates)
def encode_utf8sp(space, uni):
diff --git a/pypy/objspace/std/stringmethods.py b/pypy/objspace/std/stringmethods.py
--- a/pypy/objspace/std/stringmethods.py
+++ b/pypy/objspace/std/stringmethods.py
@@ -7,6 +7,7 @@
find, rfind, count, endswith, replace, rsplit, split, startswith)
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.gateway import WrappedDefault, unwrap_spec
+from pypy.interpreter.unicodehelper import str_decode_utf8
from pypy.objspace.std.sliceobject import W_SliceObject, unwrap_start_stop
@@ -197,6 +198,12 @@
errors = 'strict'
if encoding is None:
encoding = 'utf8'
+ if encoding == 'utf8' or encoding == 'utf-8':
+ from pypy.module._codecs.interp_codecs import CodecState
+ state = space.fromcache(CodecState)
+ eh = state.decode_error_handler
+ s = space.charbuf_w(self)
+ ret, lgt, pos = str_decode_utf8(s, errors, True, eh)
return decode_object(space, self, encoding, errors)
@unwrap_spec(tabsize=int)
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1898,12 +1898,8 @@
raise oefmt(space.w_TypeError, "expected unicode, got '%T'", w_unistr)
value = _rpy_unicode_to_decimal_w(space, w_unistr.utf8_w(space).decode('utf8'))
# XXX this is the only place in the code that this funcion is called.
- # It does not translate, since it uses a pypy-level error handler
- # to throw the UnicodeEncodeError not the rpython default handler
- #return unicodehelper.encode_utf8(space, value,
- # allow_surrogates=allow_surrogates)
- assert isinstance(value, unicode)
- return value.encode('utf8')
+ return unicodehelper.encode_utf8(space, value,
+ allow_surrogates=allow_surrogates)
def _rpy_unicode_to_decimal_w(space, unistr):
# XXX rewrite this to accept a utf8 string and use a StringBuilder
More information about the pypy-commit
mailing list