[pypy-commit] pypy py3.5: more tests, fix the comments

Sat Dec 10 09:40:08 EST 2016

Author: Armin Rigo <arigo at tunes.org>
Branch: py3.5
Changeset: r88996:40d2fd7302c4
Date: 2016-12-10 15:39 +0100
http://bitbucket.org/pypy/pypy/changeset/40d2fd7302c4/

Log:	more tests, fix the comments

diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -23,6 +23,14 @@
     c = u"\udc00"
     py.test.raises(Hit, encode_utf8, space, u"\ud800" + c)
 
+def test_encode_utf8_allow_surrogates():
+    sp = FakeSpace()
+    assert encode_utf8(sp, u"\ud800", allow_surrogates=True) == "\xed\xa0\x80"
+    assert encode_utf8(sp, u"\udc00", allow_surrogates=True) == "\xed\xb0\x80"
+    c = u"\udc00"
+    got = encode_utf8(sp, u"\ud800" + c, allow_surrogates=True)
+    assert got == "\xf0\x90\x80\x80"
+
 def test_decode_utf8():
     space = FakeSpace()
     assert decode_utf8(space, "abc") == u"abc"
@@ -32,3 +40,12 @@
     py.test.raises(Hit, decode_utf8, space, "\xed\xa0\x80\xed\xb0\x80")
     got = decode_utf8(space, "\xf0\x90\x80\x80")
     assert map(ord, got) == [0x10000]
+
+def test_decode_utf8_allow_surrogates():
+    sp = FakeSpace()
+    assert decode_utf8(sp, "\xed\xa0\x80", allow_surrogates=True) == u"\ud800"
+    assert decode_utf8(sp, "\xed\xb0\x80", allow_surrogates=True) == u"\udc00"
+    got = decode_utf8(sp, "\xed\xa0\x80\xed\xb0\x80", allow_surrogates=True)
+    assert map(ord, got) == [0xd800, 0xdc00]
+    got = decode_utf8(sp, "\xf0\x90\x80\x80", allow_surrogates=True)
+    assert map(ord, got) == [0x10000]
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -134,6 +134,11 @@
     return result
 
 def decode_utf8(space, string, allow_surrogates=False):
+    # Note that Python3 tends to forbid *all* surrogates in utf-8.
+    # If allow_surrogates=True, then revert to the Python 2 behavior,
+    # i.e. surrogates are accepted and not treated specially at all.
+    # If two 3-byte sequences happen to encode a pair of surrogates,
+    # you still get two surrogate unicode characters in the result.
     result, consumed = runicode.str_decode_utf_8(
         string, len(string), "strict",
         final=True, errorhandler=decode_error_handler(space),
@@ -141,9 +146,11 @@
     return result
 
 def encode_utf8(space, uni, allow_surrogates=False):
-    # Note that Python3 tends to forbid lone surrogates
-    # Also, note that the two characters \d800\dc00 are considered as
-    # a paired surrogate, and turn into a single 4-byte utf8 char.
+    # Note that Python3 tends to forbid *all* surrogates in utf-8.
+    # If allow_surrogates=True, then revert to the Python 2 behavior
+    # which never raises UnicodeEncodeError.  Surrogates are then
+    # allowed, either paired or lone.  A surrogate pair is encoded
+    # like the non-BMP character it stands for.
     return runicode.unicode_encode_utf_8(
         uni, len(uni), "strict",
         errorhandler=encode_error_handler(space),