[pypy-commit] pypy unicode-utf8-py3: more tests, fix backslashreplace_errors: decode handles utf8 not unicode
mattip
pypy.commits at gmail.com
Tue Jan 15 18:18:26 EST 2019
Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r95641:f9702f2e0e19
Date: 2019-01-15 21:17 +0200
http://bitbucket.org/pypy/pypy/changeset/f9702f2e0e19/
Log: more tests, fix backslashreplace_errors: decode handles utf8, not unicode
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -302,9 +302,9 @@
builder = StringBuilder()
pos = start
while pos < end:
- oc = rutf8.codepoint_at_pos(obj, pos)
+ oc = ord(obj[pos])
raw_unicode_escape_helper(builder, oc)
- pos = rutf8.next_codepoint_pos(obj, pos)
+ pos += 1
return space.newtuple([space.newtext(builder.build()), w_end])
else:
raise oefmt(space.w_TypeError,
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -971,12 +971,27 @@
('utf-16-be', b'\xdc\x80'),
('utf-32-le', b'\x80\xdc\x00\x00'),
('utf-32-be', b'\x00\x00\xdc\x80')]:
- before, after = "[", "]"
- before_sequence = before.encode(encoding)
- after_sequence = after.encode(encoding)
- test_string = before + "\uDC80" + after
- test_sequence = before_sequence + ill_surrogate + after_sequence
- raises(UnicodeDecodeError, test_sequence.decode, encoding)
+ ill_formed_sequence_replace = "\ufffd"
+ if encoding == 'utf-8':
+ ill_formed_sequence_replace *= 3
+ bom = "".encode(encoding)
+ for before, after in [("\U00010fff", "A"), ("[", "]"),
+ ("A", "\U00010fff")]:
+ before_sequence = before.encode(encoding)[len(bom):]
+ after_sequence = after.encode(encoding)[len(bom):]
+ test_string = before + "\uDC80" + after
+ test_sequence = (bom + before_sequence + ill_surrogate + after_sequence)
+ raises(UnicodeDecodeError, test_sequence.decode, encoding)
+ assert test_string.encode(encoding, 'surrogatepass') == test_sequence
+ assert test_sequence.decode(encoding, 'surrogatepass') == test_string
+ assert test_sequence.decode(encoding, 'ignore') == before + after
+ assert test_sequence.decode(encoding, 'replace') == (before +
+ ill_formed_sequence_replace + after), str(
+ (encoding, test_sequence, before + ill_formed_sequence_replace + after))
+ backslashreplace = ''.join('\\x%02x' % b for b in ill_surrogate)
+ assert test_sequence.decode(encoding, "backslashreplace") == (before +
+ backslashreplace + after)
+
def test_charmap_encode(self):
assert 'xxx'.encode('charmap') == b'xxx'
More information about the pypy-commit mailing list