[pypy-commit] pypy py3.5: more tests, fix the comments
arigo
pypy.commits at gmail.com
Sat Dec 10 09:40:08 EST 2016
Author: Armin Rigo <arigo at tunes.org>
Branch: py3.5
Changeset: r88996:40d2fd7302c4
Date: 2016-12-10 15:39 +0100
http://bitbucket.org/pypy/pypy/changeset/40d2fd7302c4/
Log: more tests, fix the comments
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -23,6 +23,14 @@
c = u"\udc00"
py.test.raises(Hit, encode_utf8, space, u"\ud800" + c)
+def test_encode_utf8_allow_surrogates():
+ sp = FakeSpace()
+ assert encode_utf8(sp, u"\ud800", allow_surrogates=True) == "\xed\xa0\x80"
+ assert encode_utf8(sp, u"\udc00", allow_surrogates=True) == "\xed\xb0\x80"
+ c = u"\udc00"
+ got = encode_utf8(sp, u"\ud800" + c, allow_surrogates=True)
+ assert got == "\xf0\x90\x80\x80"
+
def test_decode_utf8():
space = FakeSpace()
assert decode_utf8(space, "abc") == u"abc"
@@ -32,3 +40,12 @@
py.test.raises(Hit, decode_utf8, space, "\xed\xa0\x80\xed\xb0\x80")
got = decode_utf8(space, "\xf0\x90\x80\x80")
assert map(ord, got) == [0x10000]
+
+def test_decode_utf8_allow_surrogates():
+ sp = FakeSpace()
+ assert decode_utf8(sp, "\xed\xa0\x80", allow_surrogates=True) == u"\ud800"
+ assert decode_utf8(sp, "\xed\xb0\x80", allow_surrogates=True) == u"\udc00"
+ got = decode_utf8(sp, "\xed\xa0\x80\xed\xb0\x80", allow_surrogates=True)
+ assert map(ord, got) == [0xd800, 0xdc00]
+ got = decode_utf8(sp, "\xf0\x90\x80\x80", allow_surrogates=True)
+ assert map(ord, got) == [0x10000]
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -134,6 +134,11 @@
return result
def decode_utf8(space, string, allow_surrogates=False):
+ # Note that Python3 tends to forbid *all* surrogates in utf-8.
+ # If allow_surrogates=True, then revert to the Python 2 behavior,
+ # i.e. surrogates are accepted and not treated specially at all.
+ # If there happen to be two 3-byte sequences encoding a pair of surrogates,
+ # you still get two surrogate unicode characters in the result.
result, consumed = runicode.str_decode_utf_8(
string, len(string), "strict",
final=True, errorhandler=decode_error_handler(space),
@@ -141,9 +146,11 @@
return result
def encode_utf8(space, uni, allow_surrogates=False):
- # Note that Python3 tends to forbid lone surrogates
- # Also, note that the two characters \d800\dc00 are considered as
- # a paired surrogate, and turn into a single 4-byte utf8 char.
+ # Note that Python3 tends to forbid *all* surrogates in utf-8.
+ # If allow_surrogates=True, then revert to the Python 2 behavior
+ # which never raises UnicodeEncodeError. Surrogates are then
+ # allowed, either paired or lone. A surrogate pair is treated
+ # like the non-BMP character it stands for.
return runicode.unicode_encode_utf_8(
uni, len(uni), "strict",
errorhandler=encode_error_handler(space),
More information about the pypy-commit
mailing list