[pypy-commit] pypy fix_test_codecs: Addressing code review feedback on #612
Yusuke Tsutsumi
pypy.commits at gmail.com
Tue May 29 02:08:39 EDT 2018
Author: Yusuke Tsutsumi <yusuke at tsutsumi.io>
Branch: fix_test_codecs
Changeset: r94703:c6a20c1af5c0
Date: 2018-05-26 21:56 -0700
http://bitbucket.org/pypy/pypy/changeset/c6a20c1af5c0/
Log: Addressing code review feedback on #612
* removing all changes to rpython, as nothing needs to change there
to ensure pypy3 is python3.6 compliant.
* adding tests for new behavior introduced in pypy3, to satsify
pyhton3.6 behavior
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -3,7 +3,10 @@
import struct
import sys
from pypy.interpreter.unicodehelper import (
- encode_utf8, decode_utf8, unicode_encode_utf_32_be, str_decode_utf_32_be)
+ encode_utf8, decode_utf8,
+ unicode_encode_utf_8,
+ unicode_encode_utf_32_be, str_decode_utf_32_be
+)
from pypy.interpreter.unicodehelper import encode_utf8sp, decode_utf8sp
@@ -28,6 +31,35 @@
c = u"\udc00"
py.test.raises(Hit, encode_utf8, space, u"\ud800" + c)
+
+def test_encode_utf_8_combine_surrogates():
+ """
+ In the case of a surrogate pair, the error handler should
+ return back a start and stop position of the full surrogate
+ pair (new behavior inherited from python3.6)
+ """
+ u = u"\udc80\ud800\udfff"
+
+ handler_num = 0
+
+ def errorhandler(errors, encoding, msg, s, start, end):
+ """
+ This handler will be called twice, so asserting both times:
+
+ 1. the first time, 0xDC80 will be handled as a single surrogate,
+ since it is a standalone character and an invalid surrogate.
+ 2. the second time, the characters will be 0xD800 and 0xDFFF, since
+ that is a valid surrogate pair.
+ """
+ assert s[start:end] in [u'\udc80', u'\uD800\uDFFF']
+ return [], None, end
+
+ unicode_encode_utf_8(
+ u, len(u), True,
+ errorhandler=errorhandler,
+ allow_surrogates=False
+ )
+
def test_encode_utf8_allow_surrogates():
sp = FakeSpace()
assert encode_utf8(sp, u"\ud800", allow_surrogates=True) == "\xed\xa0\x80"
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -796,6 +796,14 @@
test_sequence = before_sequence + ill_surrogate + after_sequence
raises(UnicodeDecodeError, test_sequence.decode, encoding)
+ def test_lone_surrogates_utf_8(self):
+ """
+ utf-8 should not longer allow surrogates,
+ and should return back full surrogate pairs.
+ """
+ e = raises(UnicodeEncodeError, u"\udc80\ud800\udfff".encode, "utf-8")
+ assert e.object[e.start:e.end] == u'\ud800\udfff'
+
def test_charmap_encode(self):
assert 'xxx'.encode('charmap') == b'xxx'
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -361,27 +361,20 @@
else:
# Encode UCS2 Unicode ordinals
if ch < 0x10000:
- # Special case: check for surrogates
+ # Special case: check for high surrogate
if 0xD800 <= ch <= 0xDFFF:
- error_start_pos = pos - 1
if pos != size:
ch2 = ord(s[pos])
- # check if the first character is a high surrogate,
- # and the second character is a low surrogate. If so,
- # they should be handled collectively.
- if ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFFF:
- # pos should be incremented regardless.
- # by doing so, it ensures the lower surrogate
- # is also included in the characters considered
- # in the errorhandler.
+ # Check for low surrogate and combine the two to
+ # form a UCS4 value
+ if ((allow_surrogates or MAXUNICODE < 65536
+ or is_narrow_host()) and
+ ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF):
+ ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
+ assert ch3 >= 0
pos += 1
- # if we allow surrogates, we should combine
- # the two and form a UCS4 value
- if allow_surrogates or MAXUNICODE < 65535 or is_narrow_host():
- ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
- assert ch3 >= 0
- _encodeUCS4(result, ch3)
- continue
+ _encodeUCS4(result, ch3)
+ continue
# note: if the program only ever calls this with
# allow_surrogates=True, then we'll never annotate
# the following block of code, and errorhandler()
@@ -390,7 +383,7 @@
if not allow_surrogates or nonconst.NonConstant(False):
ru, rs, pos = errorhandler(errors, 'utf8',
'surrogates not allowed',
- s, error_start_pos, pos)
+ s, pos-1, pos)
if rs is not None:
# py3k only
result.append(rs)
@@ -401,7 +394,7 @@
else:
errorhandler('strict', 'utf8',
'surrogates not allowed',
- s, pos - 1 , pos)
+ s, pos-1, pos)
continue
# else: Fall through and handles isolated high surrogates
result.append((chr((0xe0 | (ch >> 12)))))
@@ -1442,11 +1435,10 @@
errorhandler = default_unicode_error_decode
if size == 0:
- return u'', 0, None
+ return u'', 0
builder = UnicodeBuilder(size)
pos = 0
- first_escape_error_char = None
while pos < size:
ch = s[pos]
@@ -1549,11 +1541,10 @@
message, s, pos-1, look+1)
builder.append(res)
else:
- first_escape_error_char = unichr(ord(ch))
builder.append(u'\\')
builder.append(unichr(ord(ch)))
- return builder.build(), pos, first_escape_error_char
+ return builder.build(), pos
def make_unicode_escape_function(pass_printable=False, unicode_output=False,
quotes=False, prefix=None):
diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -963,32 +963,3 @@
py.test.raises(
UnicodeEncodeError, runicode.unicode_encode_utf_8,
u, len(u), True, allow_surrogates=False)
-
- def test_encode_utf_8_combine_surrogates(self):
- """
- In the case of a surrogate pair, the error handler should
- return back a start and stop position of the full surrogate
- pair (new behavior inherited from python3.6)
- """
- u = runicode.UNICHR(0xDC80) + runicode.UNICHR(0xD800) + \
- runicode.UNICHR(0xDFFF)
-
- handler_num = 0
-
- def errorhandler(errors, encoding, msg, s, start, end):
- """
- This handler will be called twice, so asserting both times:
-
- 1. the first time, 0xDC80 will be handled as a single surrogate,
- since it is a standalone character and an invalid surrogate.
- 2. the second time, the characters will be 0xD800 and 0xDFFF, since
- that is a valid surrogate pair.
- """
- assert s[start:end] in [u'\udc80', u'\uD800\uDFFF']
- return [], None, end
-
- runicode.unicode_encode_utf_8(
- u, len(u), True,
- errorhandler=errorhandler,
- allow_surrogates=False
- )
More information about the pypy-commit
mailing list