[pypy-commit] pypy fix_test_codecs: Adding unit test for new behavior in runicode. Fixing bug

Tue May 29 02:08:37 EDT 2018

Author: Yusuke Tsutsumi <yusuke at tsutsumi.io>
Branch: fix_test_codecs
Changeset: r94702:c2a3d03741e2
Date: 2018-05-25 23:09 -0700
http://bitbucket.org/pypy/pypy/changeset/c2a3d03741e2/

Log:	Adding unit test for new behavior in runicode. Fixing bug

	Fixing a bug in the UTF-8 handling, which did not treat code points
	in the range 0xDC00-0xDC7F (below 0xDC80) as low surrogates.

	Adding unit tests for the new behavior in runicode, which combines
	high and low surrogates into a single errorhandler call.

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -199,7 +199,7 @@
                         # check if the first character is a high surrogate,
                         # and the second character is a low surrogate. If so,
                         # they should be handled collectively.
-                        if ch <= 0xDBFF and 0xDC80 <= ch2 <= 0xDFFFF:
+                        if ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFFF:
                             # pos should be incremented regardless.
                             # by doing so, it ensures the lower surrogate
                             # is also included in the characters considered
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -369,7 +369,7 @@
                         # check if the first character is a high surrogate,
                         # and the second character is a low surrogate. If so,
                         # they should be handled collectively.
-                        if ch <= 0xDBFF and 0xDC80 <= ch2 <= 0xDFFFF:
+                        if ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFFF:
                             # pos should be incremented regardless.
                             # by doing so, it ensures the lower surrogate
                             # is also included in the characters considered
diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -963,3 +963,32 @@
             py.test.raises(
                 UnicodeEncodeError, runicode.unicode_encode_utf_8,
                 u, len(u), True, allow_surrogates=False)
+
+    def test_encode_utf_8_combine_surrogates(self):
+        """
+        When encoding hits a surrogate pair, the error handler should be
+        called with start and stop positions that span the full surrogate
+        pair (new behavior inherited from python3.6).
+        """
+        u = runicode.UNICHR(0xDC80) + runicode.UNICHR(0xD800) + \
+            runicode.UNICHR(0xDFFF)
+
+        handler_num = 0  # NOTE(review): unused below; call count is unchecked
+
+        def errorhandler(errors, encoding, msg, s, start, end):
+            """
+            Expected to be called twice; the assertion accepts either slice:
+
+            1. first for 0xDC80 on its own: it is a low surrogate with no
+               preceding high surrogate, so it cannot be part of a pair.
+            2. then for 0xD800 followed by 0xDFFF, which together form a
+               valid surrogate pair and are reported in one call.
+            """
+            assert s[start:end] in [u'\udc80', u'\uD800\uDFFF']
+            return [], None, end  # presumably "resume at end" -- confirm
+
+        runicode.unicode_encode_utf_8(
+            u, len(u), True,
+            errorhandler=errorhandler,
+            allow_surrogates=False
+        )