[pypy-commit] pypy fix_test_codecs: Addressing code review feedback on #612

Tue May 29 02:08:39 EDT 2018

Author: Yusuke Tsutsumi <yusuke at tsutsumi.io>
Branch: fix_test_codecs
Changeset: r94703:c6a20c1af5c0
Date: 2018-05-26 21:56 -0700
http://bitbucket.org/pypy/pypy/changeset/c6a20c1af5c0/

Log:	Addressing code review feedback on #612

	* removing all changes to rpython, as nothing needs to change there
	to ensure pypy3 is python3.6 compliant.
	* adding tests for new behavior introduced in pypy3, to satisfy
	python3.6 behavior

diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -3,7 +3,10 @@
 import struct
 import sys
 from pypy.interpreter.unicodehelper import (
-    encode_utf8, decode_utf8, unicode_encode_utf_32_be, str_decode_utf_32_be)
+    encode_utf8, decode_utf8,
+    unicode_encode_utf_8,
+    unicode_encode_utf_32_be, str_decode_utf_32_be
+)
 from pypy.interpreter.unicodehelper import encode_utf8sp, decode_utf8sp
 
 
@@ -28,6 +31,35 @@
     c = u"\udc00"
     py.test.raises(Hit, encode_utf8, space, u"\ud800" + c)
 
+
+def test_encode_utf_8_combine_surrogates():
+    """
+    In the case of a surrogate pair, the error handler should
+    return back a start and stop position of the full surrogate
+    pair (new behavior inherited from python3.6)
+    """
+    u = u"\udc80\ud800\udfff"
+
+    handler_num = 0
+
+    def errorhandler(errors, encoding, msg, s, start, end):
+        """
+        This handler will be called twice, so asserting both times:
+
+        1. the first time, 0xDC80 will be handled as a single surrogate,
+           since it is a standalone character and an invalid surrogate.
+        2. the second time, the characters will be 0xD800 and 0xDFFF, since
+           that is a valid surrogate pair.
+        """
+        assert s[start:end] in [u'\udc80', u'\uD800\uDFFF']
+        return [], None, end
+
+    unicode_encode_utf_8(
+        u, len(u), True,
+        errorhandler=errorhandler,
+        allow_surrogates=False
+    )
+
 def test_encode_utf8_allow_surrogates():
     sp = FakeSpace()
     assert encode_utf8(sp, u"\ud800", allow_surrogates=True) == "\xed\xa0\x80"
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -796,6 +796,14 @@
             test_sequence = before_sequence + ill_surrogate + after_sequence
             raises(UnicodeDecodeError, test_sequence.decode, encoding)
 
+    def test_lone_surrogates_utf_8(self):
+        """
+        utf-8 should no longer allow surrogates,
+        and should return back full surrogate pairs.
+        """
+        e = raises(UnicodeEncodeError, u"\udc80\ud800\udfff".encode, "utf-8")
+        assert e.object[e.start:e.end] == u'\ud800\udfff'
+
     def test_charmap_encode(self):
         assert 'xxx'.encode('charmap') == b'xxx'
 
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -361,27 +361,20 @@
         else:
             # Encode UCS2 Unicode ordinals
             if ch < 0x10000:
-                # Special case: check for surrogates
+                # Special case: check for high surrogate
                 if 0xD800 <= ch <= 0xDFFF:
-                    error_start_pos = pos - 1
                     if pos != size:
                         ch2 = ord(s[pos])
-                        # check if the first character is a high surrogate,
-                        # and the second character is a low surrogate. If so,
-                        # they should be handled collectively.
-                        if ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFFF:
-                            # pos should be incremented regardless.
-                            # by doing so, it ensures the lower surrogate
-                            # is also included in the characters considered
-                            # in the errorhandler.
+                        # Check for low surrogate and combine the two to
+                        # form a UCS4 value
+                        if ((allow_surrogates or MAXUNICODE < 65536
+                             or is_narrow_host()) and
+                            ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF):
+                            ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
+                            assert ch3 >= 0
                             pos += 1
-                            # if we allow surrogates, we should combine
-                            # the two and form a UCS4 value
-                            if allow_surrogates or MAXUNICODE < 65535 or is_narrow_host():
-                                ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
-                                assert ch3 >= 0
-                                _encodeUCS4(result, ch3)
-                                continue
+                            _encodeUCS4(result, ch3)
+                            continue
                     # note: if the program only ever calls this with
                     # allow_surrogates=True, then we'll never annotate
                     # the following block of code, and errorhandler()
@@ -390,7 +383,7 @@
                     if not allow_surrogates or nonconst.NonConstant(False):
                         ru, rs, pos = errorhandler(errors, 'utf8',
                                                    'surrogates not allowed',
-                                                   s, error_start_pos, pos)
+                                                   s, pos-1, pos)
                         if rs is not None:
                             # py3k only
                             result.append(rs)
@@ -401,7 +394,7 @@
                             else:
                                 errorhandler('strict', 'utf8',
                                              'surrogates not allowed',
-                                             s, pos - 1 , pos)
+                                             s, pos-1, pos)
                         continue
                     # else: Fall through and handles isolated high surrogates
                 result.append((chr((0xe0 | (ch >> 12)))))
@@ -1442,11 +1435,10 @@
         errorhandler = default_unicode_error_decode
 
     if size == 0:
-        return u'', 0, None
+        return u'', 0
 
     builder = UnicodeBuilder(size)
     pos = 0
-    first_escape_error_char = None
     while pos < size:
         ch = s[pos]
 
@@ -1549,11 +1541,10 @@
                                         message, s, pos-1, look+1)
                 builder.append(res)
         else:
-            first_escape_error_char = unichr(ord(ch))
             builder.append(u'\\')
             builder.append(unichr(ord(ch)))
 
-    return builder.build(), pos, first_escape_error_char
+    return builder.build(), pos
 
 def make_unicode_escape_function(pass_printable=False, unicode_output=False,
                                  quotes=False, prefix=None):
diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -963,32 +963,3 @@
             py.test.raises(
                 UnicodeEncodeError, runicode.unicode_encode_utf_8,
                 u, len(u), True, allow_surrogates=False)
-
-    def test_encode_utf_8_combine_surrogates(self):
-        """
-        In the case of a surrogate pair, the error handler should
-        return back a start and stop position of the full surrogate
-        pair (new behavior inherited from python3.6)
-        """
-        u = runicode.UNICHR(0xDC80) + runicode.UNICHR(0xD800) + \
-            runicode.UNICHR(0xDFFF)
-
-        handler_num = 0
-
-        def errorhandler(errors, encoding, msg, s, start, end):
-            """
-            This handler will be called twice, so asserting both times:
-
-            1. the first time, 0xDC80 will be handled as a single surrogate,
-               since it is a standalone character and an invalid surrogate.
-            2. the second time, the characters will be 0xD800 and 0xDFFF, since
-               that is a valid surrogate pair.
-            """
-            assert s[start:end] in [u'\udc80', u'\uD800\uDFFF']
-            return [], None, end
-
-        runicode.unicode_encode_utf_8(
-            u, len(u), True,
-            errorhandler=errorhandler,
-            allow_surrogates=False
-        )