[pypy-commit] pypy unicode-utf8: apply fix from 0cca4bcffdbf, reduce diff to unicode-utf8-py3, fix test
mattip
pypy.commits at gmail.com
Tue Jan 1 13:29:11 EST 2019
Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8
Changeset: r95562:5d13e76c2ee0
Date: 2019-01-01 20:28 +0200
http://bitbucket.org/pypy/pypy/changeset/5d13e76c2ee0/
Log: apply fix from 0cca4bcffdbf, reduce diff to unicode-utf8-py3, fix
test
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -3,7 +3,7 @@
from pypy.interpreter.error import OperationError, oefmt
from rpython.rlib.objectmodel import specialize
from rpython.rlib.rstring import StringBuilder
-from rpython.rlib import rutf8
+from rpython.rlib import rutf8, runicode
from rpython.rlib.rarithmetic import r_uint, intmask
from rpython.rtyper.lltypesystem import rffi
from pypy.module.unicodedata import unicodedb
@@ -21,6 +21,11 @@
space.newtext(msg)]))
return raise_unicode_exception_decode
+def decode_never_raise(errors, encoding, msg, s, startingpos, endingpos):
+ assert startingpos >= 0
+ ux = ['\ux' + hex(ord(x))[2:].upper() for x in s[startingpos:endingpos]]
+ return ''.join(ux), endingpos, 'b'
+
@specialize.memo()
def encode_error_handler(space):
# Fast version of the "strict" errors handler.
@@ -35,6 +40,23 @@
space.newtext(msg)]))
return raise_unicode_exception_encode
+@specialize.memo()
+def encode_unicode_error_handler(space):
+ # Fast version of the "strict" errors handler.
+ def raise_unicode_exception_encode(errors, encoding, msg, uni,
+ startingpos, endingpos):
+ assert isinstance(uni, unicode)
+ u_len = len(uni)
+ utf8 = runicode.unicode_encode_utf8sp(uni, u_len)
+ raise OperationError(space.w_UnicodeEncodeError,
+ space.newtuple([space.newtext(encoding),
+ space.newtext(utf8, u_len),
+ space.newint(startingpos),
+ space.newint(endingpos),
+ space.newtext(msg)]))
+ return u'', None, 0
+ return raise_unicode_exception_encode
+
def default_error_encode(
errors, encoding, msg, u, startingpos, endingpos):
"""A default handler, for tests"""
@@ -45,10 +67,10 @@
return '', endingpos
raise ValueError
-def convert_arg_to_w_unicode(space, w_arg, strict=None):
- return space.convert_arg_to_w_unicode(w_arg)
+# ____________________________________________________________
+_WIN32 = sys.platform == 'win32'
+_MACOSX = sys.platform == 'darwin'
-# ____________________________________________________________
def encode(space, w_data, encoding=None, errors='strict'):
from pypy.objspace.std.unicodeobject import encode_object
@@ -245,18 +267,21 @@
res = StringBuilder(slen)
pos = 0
end = len(s)
+ suppressing = False # we are in a chain of "bad" unicode, only emit one fix
while pos < end:
ordch1 = ord(s[pos])
# fast path for ASCII
if ordch1 <= 0x7F:
pos += 1
res.append(chr(ordch1))
+ suppressing = False
continue
if ordch1 <= 0xC1:
r, pos = errorhandler(errors, "utf8", "invalid start byte",
s, pos, pos + 1)
- res.append(r)
+ if not suppressing:
+ res.append(r)
continue
pos += 1
@@ -268,14 +293,16 @@
break
r, pos = errorhandler(errors, "utf8", "unexpected end of data",
s, pos - 1, pos)
- res.append(r)
+ if not suppressing:
+ res.append(r)
continue
ordch2 = ord(s[pos])
if rutf8._invalid_byte_2_of_2(ordch2):
r, pos = errorhandler(errors, "utf8", "invalid continuation byte",
s, pos - 1, pos)
- res.append(r)
+ if not suppressing:
+ res.append(r)
continue
# 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
pos += 1
@@ -289,8 +316,9 @@
pos -= 1
break
r, pos = errorhandler(errors, "utf8", "unexpected end of data",
- s, pos - 1, pos + 1)
+ s, pos - 1, pos)
res.append(r)
+ suppressing = True
continue
ordch2 = ord(s[pos])
ordch3 = ord(s[pos + 1])
@@ -298,12 +326,14 @@
if rutf8._invalid_byte_2_of_3(ordch1, ordch2, True):
r, pos = errorhandler(errors, "utf8", "invalid continuation byte",
s, pos - 1, pos)
- res.append(r)
+ if not suppressing:
+ res.append(r)
continue
elif rutf8._invalid_byte_3_of_3(ordch3):
r, pos = errorhandler(errors, "utf8", "invalid continuation byte",
s, pos - 1, pos + 1)
- res.append(r)
+ if not suppressing:
+ res.append(r)
continue
pos += 2
@@ -311,6 +341,7 @@
res.append(chr(ordch1))
res.append(chr(ordch2))
res.append(chr(ordch3))
+ suppressing = False
continue
if ordch1 <= 0xF4:
@@ -321,6 +352,7 @@
r, pos = errorhandler(errors, "utf8", "unexpected end of data",
s, pos - 1, pos)
res.append(r)
+ suppressing = True
continue
ordch2 = ord(s[pos])
ordch3 = ord(s[pos + 1])
@@ -329,7 +361,8 @@
if rutf8._invalid_byte_2_of_4(ordch1, ordch2):
r, pos = errorhandler(errors, "utf8", "invalid continuation byte",
s, pos - 1, pos)
- res.append(r)
+ if not suppressing:
+ res.append(r)
continue
elif rutf8._invalid_byte_3_of_4(ordch3):
r, pos = errorhandler(errors, "utf8", "invalid continuation byte",
@@ -339,7 +372,8 @@
elif rutf8._invalid_byte_4_of_4(ordch4):
r, pos = errorhandler(errors, "utf8", "invalid continuation byte",
s, pos - 1, pos + 2)
- res.append(r)
+ if not suppressing:
+ res.append(r)
continue
pos += 3
@@ -348,11 +382,13 @@
res.append(chr(ordch2))
res.append(chr(ordch3))
res.append(chr(ordch4))
+ suppressing = False
continue
r, pos = errorhandler(errors, "utf8", "invalid start byte",
s, pos - 1, pos)
- res.append(r)
+ if not suppressing:
+ res.append(r)
r = res.build()
return r, pos, rutf8.check_utf8(r, True)
@@ -899,6 +935,33 @@
return result.build()
+def encode_utf8(space, uni, allow_surrogates=False):
+ # Note that Python3 tends to forbid *all* surrogates in utf-8.
+ # If allow_surrogates=True, then revert to the Python 2 behavior
+ # which never raises UnicodeEncodeError. Surrogate pairs are then
+ # allowed, either paired or lone. A paired surrogate is considered
+ # like the non-BMP character it stands for. See also *_utf8sp().
+ assert isinstance(uni, unicode)
+ return runicode.unicode_encode_utf_8(
+ uni, len(uni), "strict",
+ errorhandler=encode_unicode_error_handler(space),
+ allow_surrogates=allow_surrogates)
+
+def encode_utf8sp(space, uni, allow_surrogates=True):
+ # Surrogate-preserving utf-8 encoding. Any surrogate character
+ # turns into its 3-bytes encoding, whether it is paired or not.
+ # This should always be reversible, and the reverse is
+ # decode_utf8sp().
+ return runicode.unicode_encode_utf8sp(uni, len(uni))
+
+def decode_utf8sp(space, string):
+ # Surrogate-preserving utf-8 decoding. Assuming there is no
+ # encoding error, it should always be reversible, and the reverse is
+ # encode_utf8sp().
+ return str_decode_utf8(string, "string", True, decode_never_raise,
+ allow_surrogates=True)
+
+
# ____________________________________________________________
# utf-16
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -221,7 +221,7 @@
if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
w_obj = space.getattr(w_exc, space.newtext('object'))
space.realutf8_w(w_obj) # weeoes
- w_obj = unicodehelper.convert_arg_to_w_unicode(space, w_obj)
+ w_obj = space.convert_arg_to_w_unicode(w_obj)
start = space.int_w(space.getattr(w_exc, space.newtext('start')))
w_end = space.getattr(w_exc, space.newtext('end'))
end = space.int_w(w_end)
@@ -250,7 +250,7 @@
if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
w_obj = space.getattr(w_exc, space.newtext('object'))
space.realutf8_w(w_obj) # for errors
- w_obj = unicodehelper.convert_arg_to_w_unicode(space, w_obj)
+ w_obj = space.convert_arg_to_w_unicode(w_obj)
start = space.int_w(space.getattr(w_exc, space.newtext('start')))
w_end = space.getattr(w_exc, space.newtext('end'))
end = space.int_w(w_end)
@@ -395,7 +395,7 @@
def wrap_encoder(space, w_arg, errors="strict"):
from pypy.interpreter import unicodehelper
- w_arg = unicodehelper.convert_arg_to_w_unicode(space, w_arg, rname)
+ w_arg = space.convert_arg_to_w_unicode(w_arg)
if errors is None:
errors = 'strict'
state = space.fromcache(CodecState)
@@ -650,7 +650,7 @@
mapping = Charmap_Encode(space, w_mapping)
state = space.fromcache(CodecState)
- w_uni = unicodehelper.convert_arg_to_w_unicode(space, w_unicode)
+ w_uni = space.convert_arg_to_w_unicode(w_unicode)
result = unicodehelper.utf8_encode_charmap(
space.utf8_w(w_uni), errors, state.encode_error_handler, mapping)
return space.newtuple([space.newbytes(result), space.newint(w_uni._len())])
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -140,6 +140,7 @@
assert unicode_escape_encode(u'abc') == (u'abc'.encode('unicode_escape'), 3)
assert unicode_escape_decode('abc') == (u'abc'.decode('unicode_escape'), 3)
assert unicode_escape_decode('\\x61\\x62\\x63') == (u'abc', 12)
+
def test_unicode_replace(self):
# CPython #8271: during the decoding of an invalid UTF-8 byte sequence,
# only the start byte and the continuation byte(s) are now considered
@@ -216,14 +217,13 @@
(b'\xfe', FFFD),
(b'\xfe\x80\x80', FFFD*3),
# other sequences
- (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
- (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
- (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
+ (b'\xf1\x80\x41\x42\x43', u'\ufffd\x41\x42\x43'),
+ (b'\xf1\x80\xff\x42\x43', u'\ufffd\ufffd\x42\x43'),
+ (b'\xf1\x80\xc2\x81\x43', u'\ufffd\x81\x43'),
(b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
- '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
+ u'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
]
for n, (seq, res) in enumerate(sequences):
- print(seq, res)
raises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
uni = seq.decode('utf-8', 'replace')
assert uni == res
@@ -233,7 +233,6 @@
assert uni == res.replace(u'\uFFFD', '')
-
class AppTestPartialEvaluation:
spaceconfig = dict(usemodules=['array',])
if sys.platform == 'win32':
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1155,7 +1155,7 @@
# test_unicode_conversion_with__str__
if w_unicode_method is None:
if space.isinstance_w(w_obj, space.w_unicode):
- return unicodehelper.convert_arg_to_w_unicode(space, w_obj)
+ return space.convert_arg_to_w_unicode(w_obj)
w_unicode_method = space.lookup(w_obj, "__str__")
if w_unicode_method is not None:
w_res = space.get_and_call_function(w_unicode_method, w_obj)
More information about the pypy-commit
mailing list