[pypy-commit] pypy unicode-utf8-py3: add failing test
mattip
pypy.commits at gmail.com
Sun Dec 9 05:22:04 EST 2018
Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r95442:987e7f0a2866
Date: 2018-12-02 18:48 -0800
http://bitbucket.org/pypy/pypy/changeset/987e7f0a2866/
Log: add failing test
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -148,6 +148,98 @@
lgt = 12
assert unicode_escape_decode(b'\\x61\\x62\\x63') == ('abc', lgt)
+ def test_unicode_replace(self):
+ # CPython #8271: during the decoding of an invalid UTF-8 byte sequence,
+ # only the start byte and the continuation byte(s) are now considered
+ # invalid, instead of the number of bytes specified by the start byte.
+ # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 42,
+ # table 3-8, Row 2) for more information about the algorithm used.
+ FFFD = '\ufffd'
+ sequences = [
+ # invalid start bytes
+ (b'\x80', FFFD), # continuation byte
+ (b'\x80\x80', FFFD*2), # 2 continuation bytes
+ (b'\xc0', FFFD),
+ (b'\xc0\xc0', FFFD*2),
+ (b'\xc1', FFFD),
+ (b'\xc1\xc0', FFFD*2),
+ (b'\xc0\xc1', FFFD*2),
+ # with start byte of a 2-byte sequence
+ (b'\xc2', FFFD), # only the start byte
+ (b'\xc2\xc2', FFFD*2), # 2 start bytes
+ (b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes
+ (b'\xc2\x41', FFFD+'A'), # invalid continuation byte
+ # with start byte of a 3-byte sequence
+ (b'\xe1', FFFD), # only the start byte
+ (b'\xe1\xe1', FFFD*2), # 2 start bytes
+ (b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
+ (b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
+ (b'\xe1\x80', FFFD), # only 1 continuation byte
+ (b'\xe1\x41', FFFD+'A'), # invalid continuation byte
+ (b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
+ (b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
+ (b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
+ (b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
+ (b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
+ # with start byte of a 4-byte sequence
+ (b'\xf1', FFFD), # only the start byte
+ (b'\xf1\xf1', FFFD*2), # 2 start bytes
+ (b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
+ (b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
+ (b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
+ (b'\xf1\x80', FFFD), # only 1 continuation byte
+ (b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
+ (b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
+ (b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 2 invalid
+ (b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
+ (b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cb and 1 valid
+ (b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 valid
+ (b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 valid
+ (b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 2 invalid cb and 1 valid
+ (b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
+ (b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
+ (b'\xf1\xf1\x80\x41', FFFD*2+'A'),
+ (b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
+ # with invalid start byte of a 4-byte sequence (rfc2279)
+ (b'\xf5', FFFD), # only the start byte
+ (b'\xf5\xf5', FFFD*2), # 2 start bytes
+ (b'\xf5\x80', FFFD*2), # only 1 continuation byte
+ (b'\xf5\x80\x80', FFFD*3), # only 2 continuation bytes
+ (b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
+ (b'\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
+ (b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
+ (b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
+ # with invalid start byte of a 5-byte sequence (rfc2279)
+ (b'\xf8', FFFD), # only the start byte
+ (b'\xf8\xf8', FFFD*2), # 2 start bytes
+ (b'\xf8\x80', FFFD*2), # only one continuation byte
+ (b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
+ (b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
+ # with invalid start byte of a 6-byte sequence (rfc2279)
+ (b'\xfc', FFFD), # only the start byte
+ (b'\xfc\xfc', FFFD*2), # 2 start bytes
+ (b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
+ (b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
+ # invalid start byte
+ (b'\xfe', FFFD),
+ (b'\xfe\x80\x80', FFFD*3),
+ # other sequences
+ (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
+ (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
+ (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
+ (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
+ '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
+ ]
+ for n, (seq, res) in enumerate(sequences):
+ print([hex(x) for x in seq], [hex(ord(x)) for x in res])
+ raises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
+ uni = seq.decode('utf-8', 'replace')
+ assert uni == res
+ uni = (seq+b'b').decode('utf-8', 'replace')
+ assert uni == res+'b'
+ uni = seq.decode('utf-8', 'ignore')
+ assert uni == res.replace('\uFFFD', '')
+
def test_unexpected_end_of_data(self):
"""
Test that an 'unexpected end of data' error is raised when the string
@@ -171,7 +263,6 @@
]
FFFD = '\ufffd'
for seq in sequences:
- print(seq)
bseq = bytes(int(c, 16) for c in seq.split())
exc = raises(UnicodeDecodeError, bseq.decode, 'utf-8')
assert 'unexpected end of data' in str(exc.value)
@@ -413,7 +504,6 @@
def search_function(encoding):
def f(input, errors="strict"):
return 42
- print(encoding)
if encoding == 'test.mytestenc':
return (f, f, None, None)
return None
@@ -871,7 +961,6 @@
encodings = ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
'utf-32', 'utf-32-le', 'utf-32-be')
for encoding in encodings:
- print('encoding', encoding)
raises(UnicodeEncodeError, u'\ud800'.encode, encoding)
assert (u'[\udc80]'.encode(encoding, "backslashreplace") ==
'[\\udc80]'.encode(encoding))
@@ -884,7 +973,6 @@
('utf-16-be', b'\xdc\x80'),
('utf-32-le', b'\x80\xdc\x00\x00'),
('utf-32-be', b'\x00\x00\xdc\x80')]:
- print(encoding)
before, after = "[", "]"
before_sequence = before.encode(encoding)
after_sequence = after.encode(encoding)
@@ -1053,7 +1141,6 @@
assert w[0].category == DeprecationWarning
with warnings.catch_warnings(record=True) as w:
- print(type(encoded_abc))
decoder(encoded_abc)
assert len(w) == 1
assert str(w[0].message) == warning_msg
More information about the pypy-commit
mailing list