[pypy-commit] pypy default: Support for py3k's more precise encoding, as reported to the error handler, when using utf-16 or utf-32
arigo
pypy.commits at gmail.com
Thu Jul 27 11:17:29 EDT 2017
Author: Armin Rigo <arigo at tunes.org>
Branch:
Changeset: r91977:c9e1134edc4a
Date: 2017-07-27 17:05 +0200
http://bitbucket.org/pypy/pypy/changeset/c9e1134edc4a/
Log: Support for py3k's more precise encoding, as reported to the error
handler, when using utf-16 or utf-32
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -16,6 +16,8 @@
allow_surrogate_by_default = True
BYTEORDER = sys.byteorder
+BYTEORDER2 = BYTEORDER[0] + 'e' # either "le" or "be"
+assert BYTEORDER2 in ('le', 'be')
# python 2.7 has a preview of py3k behavior, so those functions
# are used either when we're testing wide pypy on narrow cpython
@@ -486,9 +488,31 @@
errorhandler, "little")
return result, length
+def py3k_str_decode_utf_16(s, size, errors, final=True,
+ errorhandler=None):
+ result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final,
+ errorhandler, "native",
+ 'utf-16-' + BYTEORDER2)
+ return result, length
+
+def py3k_str_decode_utf_16_be(s, size, errors, final=True,
+ errorhandler=None):
+ result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final,
+ errorhandler, "big",
+ 'utf-16-be')
+ return result, length
+
+def py3k_str_decode_utf_16_le(s, size, errors, final=True,
+ errorhandler=None):
+ result, length, byteorder = str_decode_utf_16_helper(s, size, errors, final,
+ errorhandler, "little",
+ 'utf-16-le')
+ return result, length
+
def str_decode_utf_16_helper(s, size, errors, final=True,
errorhandler=None,
- byteorder="native"):
+ byteorder="native",
+ public_encoding_name='utf16'):
if errorhandler is None:
errorhandler = default_unicode_error_decode
bo = 0
@@ -546,7 +570,8 @@
if len(s) - pos < 2:
if not final:
break
- r, pos = errorhandler(errors, 'utf16', "truncated data",
+ r, pos = errorhandler(errors, public_encoding_name,
+ "truncated data",
s, pos, len(s))
result.append(r)
if len(s) - pos < 2:
@@ -562,7 +587,8 @@
if not final:
break
errmsg = "unexpected end of data"
- r, pos = errorhandler(errors, 'utf16', errmsg, s, pos, len(s))
+ r, pos = errorhandler(errors, public_encoding_name,
+ errmsg, s, pos, len(s))
result.append(r)
if len(s) - pos < 2:
break
@@ -578,12 +604,12 @@
(ch2 & 0x3FF)) + 0x10000))
continue
else:
- r, pos = errorhandler(errors, 'utf16',
+ r, pos = errorhandler(errors, public_encoding_name,
"illegal UTF-16 surrogate",
s, pos - 4, pos - 2)
result.append(r)
else:
- r, pos = errorhandler(errors, 'utf16',
+ r, pos = errorhandler(errors, public_encoding_name,
"illegal encoding",
s, pos - 2, pos)
result.append(r)
@@ -592,7 +618,8 @@
def unicode_encode_utf_16_helper(s, size, errors,
errorhandler=None,
allow_surrogates=True,
- byteorder='little'):
+ byteorder='little',
+ public_encoding_name='utf16'):
if errorhandler is None:
errorhandler = default_unicode_error_encode
if size == 0:
@@ -620,13 +647,13 @@
elif ch >= 0xE000 or allow_surrogates:
_STORECHAR(result, ch, byteorder)
else:
- ru, rs, pos = errorhandler(errors, 'utf16',
+ ru, rs, pos = errorhandler(errors, public_encoding_name,
'surrogates not allowed',
s, pos-1, pos)
if rs is not None:
# py3k only
if len(rs) % 2 != 0:
- errorhandler('strict', 'utf16',
+ errorhandler('strict', public_encoding_name,
'surrogates not allowed',
s, pos-1, pos)
result.append(rs)
@@ -635,7 +662,7 @@
if ord(ch) < 0xD800:
_STORECHAR(result, ord(ch), byteorder)
else:
- errorhandler('strict', 'utf16',
+ errorhandler('strict', public_encoding_name,
'surrogates not allowed',
s, pos-1, pos)
continue
@@ -648,20 +675,39 @@
return unicode_encode_utf_16_helper(s, size, errors, errorhandler,
allow_surrogates, "native")
-
def unicode_encode_utf_16_be(s, size, errors,
errorhandler=None,
allow_surrogates=True):
return unicode_encode_utf_16_helper(s, size, errors, errorhandler,
allow_surrogates, "big")
-
def unicode_encode_utf_16_le(s, size, errors,
errorhandler=None,
allow_surrogates=True):
return unicode_encode_utf_16_helper(s, size, errors, errorhandler,
allow_surrogates, "little")
+def py3k_unicode_encode_utf_16(s, size, errors,
+ errorhandler=None,
+ allow_surrogates=True):
+ return unicode_encode_utf_16_helper(s, size, errors, errorhandler,
+ allow_surrogates, "native",
+ 'utf-16-' + BYTEORDER2)
+
+def py3k_unicode_encode_utf_16_be(s, size, errors,
+ errorhandler=None,
+ allow_surrogates=True):
+ return unicode_encode_utf_16_helper(s, size, errors, errorhandler,
+ allow_surrogates, "big",
+ 'utf-16-be')
+
+def py3k_unicode_encode_utf_16_le(s, size, errors,
+ errorhandler=None,
+ allow_surrogates=True):
+ return unicode_encode_utf_16_helper(s, size, errors, errorhandler,
+ allow_surrogates, "little",
+ 'utf-16-le')
+
# ____________________________________________________________
# utf-32
@@ -684,12 +730,34 @@
errorhandler, "little")
return result, length
+def py3k_str_decode_utf_32(s, size, errors, final=True,
+ errorhandler=None):
+ result, length, byteorder = str_decode_utf_32_helper(s, size, errors, final,
+ errorhandler, "native",
+ 'utf-32-' + BYTEORDER2)
+ return result, length
+
+def py3k_str_decode_utf_32_be(s, size, errors, final=True,
+ errorhandler=None):
+ result, length, byteorder = str_decode_utf_32_helper(s, size, errors, final,
+ errorhandler, "big",
+ 'utf-32-be')
+ return result, length
+
+def py3k_str_decode_utf_32_le(s, size, errors, final=True,
+ errorhandler=None):
+ result, length, byteorder = str_decode_utf_32_helper(s, size, errors, final,
+ errorhandler, "little",
+ 'utf-32-le')
+ return result, length
+
BOM32_DIRECT = intmask(0x0000FEFF)
BOM32_REVERSE = intmask(0xFFFE0000)
def str_decode_utf_32_helper(s, size, errors, final=True,
errorhandler=None,
- byteorder="native"):
+ byteorder="native",
+ public_encoding_name='utf32'):
if errorhandler is None:
errorhandler = default_unicode_error_decode
bo = 0
@@ -744,7 +812,8 @@
if len(s) - pos < 4:
if not final:
break
- r, pos = errorhandler(errors, 'utf32', "truncated data",
+ r, pos = errorhandler(errors, public_encoding_name,
+ "truncated data",
s, pos, len(s))
result.append(r)
if len(s) - pos < 4:
@@ -753,7 +822,8 @@
ch = ((ord(s[pos + iorder[3]]) << 24) | (ord(s[pos + iorder[2]]) << 16) |
(ord(s[pos + iorder[1]]) << 8) | ord(s[pos + iorder[0]]))
if ch >= 0x110000:
- r, pos = errorhandler(errors, 'utf32', "codepoint not in range(0x110000)",
+ r, pos = errorhandler(errors, public_encoding_name,
+ "codepoint not in range(0x110000)",
s, pos, len(s))
result.append(r)
continue
@@ -786,7 +856,8 @@
def unicode_encode_utf_32_helper(s, size, errors,
errorhandler=None,
allow_surrogates=True,
- byteorder='little'):
+ byteorder='little',
+ public_encoding_name='utf32'):
if errorhandler is None:
errorhandler = default_unicode_error_encode
if size == 0:
@@ -808,13 +879,13 @@
ch2 = 0
if 0xD800 <= ch < 0xDC00:
if not allow_surrogates:
- ru, rs, pos = errorhandler(errors, 'utf32',
+ ru, rs, pos = errorhandler(errors, public_encoding_name,
'surrogates not allowed',
s, pos-1, pos)
if rs is not None:
# py3k only
if len(rs) % 4 != 0:
- errorhandler('strict', 'utf32',
+ errorhandler('strict', public_encoding_name,
'surrogates not allowed',
s, pos-1, pos)
result.append(rs)
@@ -823,7 +894,7 @@
if ord(ch) < 0xD800:
_STORECHAR32(result, ord(ch), byteorder)
else:
- errorhandler('strict', 'utf32',
+ errorhandler('strict', public_encoding_name,
'surrogates not allowed',
s, pos-1, pos)
continue
@@ -841,18 +912,34 @@
return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
allow_surrogates, "native")
-
def unicode_encode_utf_32_be(s, size, errors,
errorhandler=None, allow_surrogates=True):
return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
allow_surrogates, "big")
-
def unicode_encode_utf_32_le(s, size, errors,
errorhandler=None, allow_surrogates=True):
return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
allow_surrogates, "little")
+def py3k_unicode_encode_utf_32(s, size, errors,
+ errorhandler=None, allow_surrogates=True):
+ return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
+ allow_surrogates, "native",
+ 'utf-32-' + BYTEORDER2)
+
+def py3k_unicode_encode_utf_32_be(s, size, errors,
+ errorhandler=None, allow_surrogates=True):
+ return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
+ allow_surrogates, "big",
+ 'utf-32-be')
+
+def py3k_unicode_encode_utf_32_le(s, size, errors,
+ errorhandler=None, allow_surrogates=True):
+ return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
+ allow_surrogates, "little",
+ 'utf-32-le')
+
# ____________________________________________________________
# utf-7
diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -38,8 +38,10 @@
assert x == y
assert type(x) is type(y)
- def getdecoder(self, encoding):
- return getattr(runicode, "str_decode_%s" % encoding.replace("-", "_"))
+ def getdecoder(self, encoding, look_for_py3k=False):
+ prefix = "py3k_" if look_for_py3k else ""
+ return getattr(runicode, "%sstr_decode_%s" %
+ (prefix, encoding.replace("-", "_")))
def getencoder(self, encoding):
return getattr(runicode,
@@ -96,14 +98,17 @@
assert '\xc3' in result
def checkdecodeerror(self, s, encoding, start, stop,
- addstuff=True, msg=None):
+ addstuff=True, msg=None,
+ expected_reported_encoding=None,
+ look_for_py3k=False):
called = [0]
def errorhandler(errors, enc, errmsg, t, startingpos,
endingpos):
called[0] += 1
if called[0] == 1:
assert errors == "foo!"
- assert enc == encoding.replace('-', '')
+ assert enc == (expected_reported_encoding or
+ encoding.replace('-', ''))
assert t is s
assert start == startingpos
assert stop == endingpos
@@ -111,7 +116,7 @@
assert errmsg == msg
return u"42424242", stop
return u"", endingpos
- decoder = self.getdecoder(encoding)
+ decoder = self.getdecoder(encoding, look_for_py3k=look_for_py3k)
if addstuff:
s += "some rest in ascii"
result, _ = decoder(s, len(s), "foo!", True, errorhandler)
@@ -218,6 +223,27 @@
]:
self.checkdecodeerror(s, "utf-16", 2, 4, addstuff=False)
+ def test_utf16_errors_py3k(self):
+ letter = sys.byteorder[0]
+ self.checkdecodeerror("\xff", "utf-16", 0, 1, addstuff=False,
+ expected_reported_encoding='utf-16-%se' % letter,
+ look_for_py3k=True)
+ self.checkdecodeerror("\xff", "utf-16-be", 0, 1, addstuff=False,
+ expected_reported_encoding='utf-16-be',
+ look_for_py3k=True)
+ self.checkdecodeerror("\xff", "utf-16-le", 0, 1, addstuff=False,
+ expected_reported_encoding='utf-16-le',
+ look_for_py3k=True)
+ self.checkdecodeerror("\xff", "utf-32", 0, 1, addstuff=False,
+ expected_reported_encoding='utf-32-%se' % letter,
+ look_for_py3k=True)
+ self.checkdecodeerror("\xff", "utf-32-be", 0, 1, addstuff=False,
+ expected_reported_encoding='utf-32-be',
+ look_for_py3k=True)
+ self.checkdecodeerror("\xff", "utf-32-le", 0, 1, addstuff=False,
+ expected_reported_encoding='utf-32-le',
+ look_for_py3k=True)
+
def test_utf16_bugs(self):
s = '\x80-\xe9\xdeL\xa3\x9b'
py.test.raises(UnicodeDecodeError, runicode.str_decode_utf_16_le,
More information about the pypy-commit
mailing list