[pypy-commit] pypy default: The py3k version of the utf32 decoder should not allow lone surrogates.
amauryfa
pypy.commits at gmail.com
Wed Dec 13 17:29:27 EST 2017
Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch:
Changeset: r93411:33178f62171f
Date: 2017-12-13 10:04 +0100
http://bitbucket.org/pypy/pypy/changeset/33178f62171f/
Log: The py3k version of the utf32 decoder should not allow lone
surrogates.
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -489,21 +489,21 @@
return result, length
def py3k_str_decode_utf_16(s, size, errors, final=True,
- errorhandler=None):
+ errorhandler=None):
result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final,
errorhandler, "native",
'utf-16-' + BYTEORDER2)
return result, length
def py3k_str_decode_utf_16_be(s, size, errors, final=True,
- errorhandler=None):
+ errorhandler=None):
result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final,
errorhandler, "big",
'utf-16-be')
return result, length
def py3k_str_decode_utf_16_le(s, size, errors, final=True,
- errorhandler=None):
+ errorhandler=None):
result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final,
errorhandler, "little",
'utf-16-le')
@@ -714,41 +714,41 @@
def str_decode_utf_32(s, size, errors, final=True,
errorhandler=None):
- result, length, byteorder = str_decode_utf_32_helper(s, size, errors, final,
- errorhandler, "native")
+ result, length, byteorder = str_decode_utf_32_helper(
+ s, size, errors, final, errorhandler, "native")
return result, length
def str_decode_utf_32_be(s, size, errors, final=True,
errorhandler=None):
- result, length, byteorder = str_decode_utf_32_helper(s, size, errors, final,
- errorhandler, "big")
+ result, length, byteorder = str_decode_utf_32_helper(
+ s, size, errors, final, errorhandler, "big")
return result, length
def str_decode_utf_32_le(s, size, errors, final=True,
errorhandler=None):
- result, length, byteorder = str_decode_utf_32_helper(s, size, errors, final,
- errorhandler, "little")
+ result, length, byteorder = str_decode_utf_32_helper(
+ s, size, errors, final, errorhandler, "little")
return result, length
def py3k_str_decode_utf_32(s, size, errors, final=True,
errorhandler=None):
- result, length, byteorder = str_decode_utf_32_helper(s, size, errors, final,
- errorhandler, "native",
- 'utf-32-' + BYTEORDER2)
+ result, length, byteorder = str_decode_utf_32_helper(
+ s, size, errors, final, errorhandler, "native",
+ 'utf-32-' + BYTEORDER2, allow_surrogates=False)
return result, length
def py3k_str_decode_utf_32_be(s, size, errors, final=True,
errorhandler=None):
- result, length, byteorder = str_decode_utf_32_helper(s, size, errors, final,
- errorhandler, "big",
- 'utf-32-be')
+ result, length, byteorder = str_decode_utf_32_helper(
+ s, size, errors, final, errorhandler, "big",
+ 'utf-32-be', allow_surrogates=False)
return result, length
def py3k_str_decode_utf_32_le(s, size, errors, final=True,
errorhandler=None):
- result, length, byteorder = str_decode_utf_32_helper(s, size, errors, final,
- errorhandler, "little",
- 'utf-32-le')
+ result, length, byteorder = str_decode_utf_32_helper(
+ s, size, errors, final, errorhandler, "little",
+ 'utf-32-le', allow_surrogates=False)
return result, length
BOM32_DIRECT = intmask(0x0000FEFF)
@@ -757,7 +757,8 @@
def str_decode_utf_32_helper(s, size, errors, final=True,
errorhandler=None,
byteorder="native",
- public_encoding_name='utf32'):
+ public_encoding_name='utf32',
+ allow_surrogates=True):
if errorhandler is None:
errorhandler = default_unicode_error_decode
bo = 0
@@ -821,7 +822,13 @@
continue
ch = ((ord(s[pos + iorder[3]]) << 24) | (ord(s[pos + iorder[2]]) << 16) |
(ord(s[pos + iorder[1]]) << 8) | ord(s[pos + iorder[0]]))
- if ch >= 0x110000:
+ if not allow_surrogates and 0xD800 <= ch <= 0xDFFF:
+ r, pos = errorhandler(errors, public_encoding_name,
+ "code point in surrogate code point "
+ "range(0xd800, 0xe000)",
+ s, pos, len(s))
+ result.append(r)
+ elif ch >= 0x110000:
r, pos = errorhandler(errors, public_encoding_name,
"codepoint not in range(0x110000)",
s, pos, len(s))
diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -284,6 +284,11 @@
errorhandler, allow_surrogates=False)
assert replace_with(u'rep', None) == u'<rep>'.encode('utf-32-be')
assert replace_with(None, '\xca\xfe\xca\xfe') == '\x00\x00\x00<\xca\xfe\xca\xfe\x00\x00\x00>'
+ #
+ assert runicode.str_decode_utf_32_be(
+ b"\x00\x00\xdc\x80", 4, None) == (u'\udc80', 4)
+ py.test.raises(UnicodeDecodeError, runicode.py3k_str_decode_utf_32_be,
+ b"\x00\x00\xdc\x80", 4, None)
def test_utf7_bugs(self):
u = u'A\u2262\u0391.'
More information about the pypy-commit
mailing list