[pypy-commit] pypy unicode-utf8: add failing test from unicode-utf8-py3, and minimize diff
mattip
pypy.commits at gmail.com
Tue Jan 1 13:29:09 EST 2019
Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8
Changeset: r95561:347997cdfae4
Date: 2019-01-01 19:49 +0200
http://bitbucket.org/pypy/pypy/changeset/347997cdfae4/
Log: add failing test from unicode-utf8-py3, and minimize diff
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -140,6 +140,98 @@
assert unicode_escape_encode(u'abc') == (u'abc'.encode('unicode_escape'), 3)
assert unicode_escape_decode('abc') == (u'abc'.decode('unicode_escape'), 3)
assert unicode_escape_decode('\\x61\\x62\\x63') == (u'abc', 12)
+ def test_unicode_replace(self):
+ # CPython #8271: during the decoding of an invalid UTF-8 byte sequence,
+ # only the start byte and the continuation byte(s) are now considered
+ # invalid, instead of the number of bytes specified by the start byte.
+ # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 42,
+ # table 3-8, Row 2) for more information about the algorithm used.
+ FFFD = u'\ufffd'
+ sequences = [
+ # invalid start bytes
+ (b'\x80', FFFD), # continuation byte
+ (b'\x80\x80', FFFD*2), # 2 continuation bytes
+ (b'\xc0', FFFD),
+ (b'\xc0\xc0', FFFD*2),
+ (b'\xc1', FFFD),
+ (b'\xc1\xc0', FFFD*2),
+ (b'\xc0\xc1', FFFD*2),
+ # with start byte of a 2-byte sequence
+ (b'\xc2', FFFD), # only the start byte
+ (b'\xc2\xc2', FFFD*2), # 2 start bytes
+ (b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes
+ (b'\xc2\x41', FFFD+'A'), # invalid continuation byte
+ # with start byte of a 3-byte sequence
+ (b'\xe1', FFFD), # only the start byte
+ (b'\xe1\xe1', FFFD*2), # 2 start bytes
+ (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
+ (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
+ (b'\xe1\x80', FFFD), # only 1 continuation byte
+ (b'\xe1\x41', FFFD+'A'), # invalid continuation byte
+ (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
+ (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
+ (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
+ (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
+ (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
+ # with start byte of a 4-byte sequence
+ (b'\xf1', FFFD), # only the start byte
+ (b'\xf1\xf1', FFFD*2), # 2 start bytes
+ (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
+ (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
+ (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
+ (b'\xf1\x80', FFFD), # only 1 continuation byte
+ (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
+ (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
+ (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
+ (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
+ (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cb and 1 valid
+ (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
+ (b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
+ (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
+ (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
+ (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
+ (b'\xf1\xf1\x80\x41', FFFD*2+'A'),
+ (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
+ # with invalid start byte of a 4-byte sequence (rfc2279)
+ (b'\xf5', FFFD), # only the start byte
+ (b'\xf5\xf5', FFFD*2), # 2 start bytes
+ (b'\xf5\x80', FFFD*2), # only 1 continuation byte
+ (b'\xf5\x80\x80', FFFD*3), # only 2 continuation bytes
+ (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
+ (b'\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
+ (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
+ (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
+ # with invalid start byte of a 5-byte sequence (rfc2279)
+ (b'\xf8', FFFD), # only the start byte
+ (b'\xf8\xf8', FFFD*2), # 2 start bytes
+ (b'\xf8\x80', FFFD*2), # only one continuation byte
+ (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
+ (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
+ # with invalid start byte of a 6-byte sequence (rfc2279)
+ (b'\xfc', FFFD), # only the start byte
+ (b'\xfc\xfc', FFFD*2), # 2 start bytes
+ (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
+ (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
+ # invalid start byte
+ (b'\xfe', FFFD),
+ (b'\xfe\x80\x80', FFFD*3),
+ # other sequences
+ (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
+ (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
+ (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
+ (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
+ '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
+ ]
+ for n, (seq, res) in enumerate(sequences):
+ print(seq, res)
+ raises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
+ uni = seq.decode('utf-8', 'replace')
+ assert uni == res
+ uni = (seq+b'b').decode('utf-8', 'replace')
+ assert uni == res+'b'
+ uni = seq.decode('utf-8', 'ignore')
+ assert uni == res.replace(u'\uFFFD', '')
+
class AppTestPartialEvaluation:
@@ -359,32 +451,31 @@
def search_function(encoding):
def f(input, errors="strict"):
return 42
- print encoding
if encoding == 'test.mytestenc':
return (f, f, None, None)
return None
_codecs.register(search_function)
- raises(TypeError, "hello".decode, "test.mytestenc")
+ raises(TypeError, b"hello".decode, "test.mytestenc")
raises(TypeError, u"hello".encode, "test.mytestenc")
def test_cpytest_decode(self):
import codecs
- assert codecs.decode('\xe4\xf6\xfc', 'latin-1') == u'\xe4\xf6\xfc'
+ assert codecs.decode(b'\xe4\xf6\xfc', 'latin-1') == u'\xe4\xf6\xfc'
raises(TypeError, codecs.decode)
- assert codecs.decode('abc') == u'abc'
- raises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
+ assert codecs.decode(b'abc') == u'abc'
+ raises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')
def test_bad_errorhandler_return(self):
import codecs
def baddecodereturn1(exc):
return 42
codecs.register_error("test.baddecodereturn1", baddecodereturn1)
- raises(TypeError, "\xff".decode, "ascii", "test.baddecodereturn1")
- raises(TypeError, "\\".decode, "unicode-escape", "test.baddecodereturn1")
- raises(TypeError, "\\x0".decode, "unicode-escape", "test.baddecodereturn1")
- raises(TypeError, "\\x0y".decode, "unicode-escape", "test.baddecodereturn1")
- raises(TypeError, "\\Uffffeeee".decode, "unicode-escape", "test.baddecodereturn1")
- raises(TypeError, "\\uyyyy".decode, "raw-unicode-escape", "test.baddecodereturn1")
+ raises(TypeError, b"\xff".decode, "ascii", "test.baddecodereturn1")
+ raises(TypeError, b"\\".decode, "unicode-escape", "test.baddecodereturn1")
+ raises(TypeError, b"\\x0".decode, "unicode-escape", "test.baddecodereturn1")
+ raises(TypeError, b"\\x0y".decode, "unicode-escape", "test.baddecodereturn1")
+ raises(TypeError, b"\\Uffffeeee".decode, "unicode-escape", "test.baddecodereturn1")
+ raises(TypeError, b"\\uyyyy".decode, "raw-unicode-escape", "test.baddecodereturn1")
def test_cpy_bug1175396(self):
import codecs, StringIO
@@ -420,7 +511,7 @@
' self.abort("cannot load articles")\r\n',
]
stream = StringIO.StringIO("".join(s).encode("utf7"))
- assert "aborrt" not in stream.getvalue()
+ assert b"aborrt" not in stream.getvalue()
reader = codecs.getreader("utf7")(stream)
for (i, line) in enumerate(reader):
assert line == s[i]
@@ -439,30 +530,44 @@
s = u"spam"
assert d.decode(s.encode("utf-8-sig")) == s
+ def test_decoder_state(self):
+ import codecs
+ encoding = 'utf16'
+ u = 'abc123'
+ s = u.encode(encoding)
+ for i in range(len(u) + 1):
+ d = codecs.getincrementalencoder(encoding)()
+ part1 = d.encode(u[:i])
+ state = d.getstate()
+ d = codecs.getincrementalencoder(encoding)()
+ d.setstate(state)
+ part2 = d.encode(u[i:], True)
+ assert s == part1 + part2
+
def test_escape_decode_escaped_newline(self):
import _codecs
- s = '\\\n'
+ s = b'\\\n'
decoded = _codecs.unicode_escape_decode(s)[0]
assert decoded == ''
def test_charmap_decode_1(self):
import codecs
- assert codecs.charmap_encode(u'xxx') == ('xxx', 3)
- assert codecs.charmap_encode(u'xxx', 'strict', {ord('x'): 'XX'}) == ('XXXXXX', 3)
+ assert codecs.charmap_encode(u'xxx') == (b'xxx', 3)
+ assert codecs.charmap_encode(u'xxx', 'strict', {ord('x'): b'XX'}) == (b'XXXXXX', 3)
- res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab")
+ res = codecs.charmap_decode(b"\x00\x01\x02", "replace", u"ab")
assert res == (u"ab\ufffd", 3)
- res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe")
+ res = codecs.charmap_decode(b"\x00\x01\x02", "replace", u"ab\ufffe")
assert res == (u'ab\ufffd', 3)
def test_decode_errors(self):
import sys
if sys.maxunicode > 0xffff:
try:
- "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
+ b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
except UnicodeDecodeError as ex:
assert "unicode_internal" == ex.encoding
- assert "\x00\x00\x00\x00\x00\x11\x11\x00" == ex.object
+ assert b"\x00\x00\x00\x00\x00\x11\x11\x00" == ex.object
assert ex.start == 4
assert ex.end == 8
else:
@@ -473,14 +578,14 @@
assert codecs.replace_errors(UnicodeEncodeError(
"ascii", u"\u3042", 0, 1, "ouch")) == (u"?", 1)
assert codecs.replace_errors(UnicodeDecodeError(
- "ascii", "\xff", 0, 1, "ouch")) == (u"\ufffd", 1)
+ "ascii", b"\xff", 0, 1, "ouch")) == (u"\ufffd", 1)
assert codecs.replace_errors(UnicodeTranslateError(
u"\u3042", 0, 1, "ouch")) == (u"\ufffd", 1)
assert codecs.replace_errors(UnicodeEncodeError(
"ascii", u"\u3042\u3042", 0, 2, "ouch")) == (u"??", 2)
assert codecs.replace_errors(UnicodeDecodeError(
- "ascii", "\xff\xff", 0, 2, "ouch")) == (u"\ufffd", 2)
+ "ascii", b"\xff\xff", 0, 2, "ouch")) == (u"\ufffd", 2)
assert codecs.replace_errors(UnicodeTranslateError(
u"\u3042\u3042", 0, 2, "ouch")) == (u"\ufffd\ufffd", 2)
@@ -498,13 +603,13 @@
# A UnicodeDecodeError object without an end attribute
class NoEndUnicodeDecodeError(UnicodeDecodeError):
def __init__(self):
- UnicodeDecodeError.__init__(self, "ascii", "", 0, 1, "bad")
+ UnicodeDecodeError.__init__(self, "ascii", b"", 0, 1, "bad")
del self.end
# A UnicodeDecodeError object with a bad object attribute
class BadObjectUnicodeDecodeError(UnicodeDecodeError):
def __init__(self):
- UnicodeDecodeError.__init__(self, "ascii", "", 0, 1, "bad")
+ UnicodeDecodeError.__init__(self, "ascii", b"", 0, 1, "bad")
self.object = []
# A UnicodeTranslateError object without a start attribute
@@ -536,18 +641,18 @@
# With the correct exception, "replace" returns an "?" or u"\ufffd" replacement
def test_decode_ignore(self):
- assert '\xff'.decode('utf-7', 'ignore') == ''
- assert '\x00'.decode('unicode-internal', 'ignore') == ''
+ assert b'\xff'.decode('utf-7', 'ignore') == ''
+ assert b'\x00'.decode('unicode-internal', 'ignore') == ''
def test_backslashreplace(self):
import sys
sin = u"a\xac\u1234\u20ac\u8000\U0010ffff"
if sys.maxunicode > 65535:
- expected_ascii = "a\\xac\\u1234\\u20ac\\u8000\\U0010ffff"
- expected_8859 = "a\xac\\u1234\xa4\\u8000\\U0010ffff"
+ expected_ascii = b"a\\xac\\u1234\\u20ac\\u8000\\U0010ffff"
+ expected_8859 = b"a\xac\\u1234\xa4\\u8000\\U0010ffff"
else:
- expected_ascii = "a\\xac\\u1234\\u20ac\\u8000\\udbff\\udfff"
- expected_8859 = "a\xac\\u1234\xa4\\u8000\\udbff\\udfff"
+ expected_ascii = b"a\\xac\\u1234\\u20ac\\u8000\\udbff\\udfff"
+ expected_8859 = b"a\xac\\u1234\xa4\\u8000\\udbff\\udfff"
assert sin.encode('ascii', 'backslashreplace') == expected_ascii
assert sin.encode("iso-8859-15", "backslashreplace") == expected_8859
@@ -566,10 +671,10 @@
"test.badhandler"
)
for (enc, bytes) in (
- ("utf-8", "\xff"),
- ("ascii", "\xff"),
- ("utf-7", "+x-"),
- ("unicode-internal", "\x00"),
+ ("utf-8", b"\xff"),
+ ("ascii", b"\xff"),
+ ("utf-7", b"+x-"),
+ ("unicode-internal", b"\x00"),
):
raises(
TypeError,
@@ -578,17 +683,25 @@
"test.badhandler"
)
+ def test_badhandler_longindex(self):
+ import codecs
+ import sys
+ errors = 'test.badhandler_longindex'
+ codecs.register_error(errors, lambda x: (u'', sys.maxsize + 1))
+ # CPython raises OverflowError here
+ raises((IndexError, OverflowError), b'apple\x92ham\x93spam'.decode, 'utf-8', errors)
+
def test_unicode_internal(self):
import codecs
import sys
try:
- '\x00'.decode('unicode-internal')
+ b'\x00'.decode('unicode-internal')
except UnicodeDecodeError:
pass
else:
raise Exception("DID NOT RAISE")
- res = "\x00\x00\x00\x00\x00".decode("unicode-internal", "replace")
+ res = b"\x00\x00\x00\x00\x00".decode("unicode-internal", "replace")
if sys.maxunicode > 65535:
assert res == u"\u0000\ufffd" # UCS4 build
else:
@@ -605,7 +718,7 @@
raise TypeError("don't know how to handle %r" % exc)
return (u"\x01", 5)
codecs.register_error("test.hui", handler_unicodeinternal)
- res = "\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui")
+ res = b"\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui")
if sys.maxunicode > 65535:
assert res == u"\u0000\u0001" # UCS4 build
else:
@@ -720,18 +833,18 @@
x = u'123abc'
if sys.byteorder == 'big':
assert codecs.getencoder('utf-16')(x) == (
- '\xfe\xff\x001\x002\x003\x00a\x00b\x00c', 6)
+ b'\xfe\xff\x001\x002\x003\x00a\x00b\x00c', 6)
assert codecs.getdecoder('utf-16')(
- '\xfe\xff\x001\x002\x003\x00a\x00b\x00c') == (x, 14)
+ b'\xfe\xff\x001\x002\x003\x00a\x00b\x00c') == (x, 14)
else:
assert codecs.getencoder('utf-16')(x) == (
- '\xff\xfe1\x002\x003\x00a\x00b\x00c\x00', 6)
+ b'\xff\xfe1\x002\x003\x00a\x00b\x00c\x00', 6)
assert codecs.getdecoder('utf-16')(
- '\xff\xfe1\x002\x003\x00a\x00b\x00c\x00') == (x, 14)
+ b'\xff\xfe1\x002\x003\x00a\x00b\x00c\x00') == (x, 14)
def test_unicode_escape(self):
assert u'\\'.encode('unicode-escape') == '\\\\'
- assert '\\\\'.decode('unicode-escape') == u'\\'
+ assert b'\\\\'.decode('unicode-escape') == u'\\'
assert u'\ud801'.encode('unicode-escape') == '\\ud801'
assert u'\u0013'.encode('unicode-escape') == '\\x13'
More information about the pypy-commit
mailing list