[pypy-commit] pypy fix_test_codecs: Adding low surrogates into errorhandlers for utf-8 encoding

Tue May 29 02:08:31 EDT 2018

Author: Yusuke Tsutsumi <yusuke at tsutsumi.io>
Branch: fix_test_codecs
Changeset: r94699:6d4fc7830371
Date: 2018-05-20 06:34 -0700
http://bitbucket.org/pypy/pypy/changeset/6d4fc7830371/

Log:	Adding low surrogates into errorhandlers for utf-8 encoding

	In CPython 3.6, behavior was defined to include low surrogates
	when handling Unicode encoding errors. This change adds that
	behavior.

	References:

	https://hg.python.org/cpython/rev/2b5357b38366
	https://bugs.python.org/issue25267

diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -361,20 +361,27 @@
         else:
             # Encode UCS2 Unicode ordinals
             if ch < 0x10000:
-                # Special case: check for high surrogate
+                # Special case: check for surrogates
                 if 0xD800 <= ch <= 0xDFFF:
+                    error_start_pos = pos - 1
                     if pos != size:
                         ch2 = ord(s[pos])
-                        # Check for low surrogate and combine the two to
-                        # form a UCS4 value
-                        if ((allow_surrogates or MAXUNICODE < 65536
-                             or is_narrow_host()) and
-                            ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF):
-                            ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
-                            assert ch3 >= 0
+                        # check if the first character is a high surrogate,
+                        # and the second character is a low surrogate. If so,
+                        # they should be handled collectively.
+                        if ch <= 0xDBFF and 0xDC80 <= ch2 <= 0xDFFFF:
+                            # pos should be incremented regardless.
+                            # by doing so, it ensures the lower surrogate
+                            # is also included in the characters considered
+                            # in the errorhandler.
                             pos += 1
-                            _encodeUCS4(result, ch3)
-                            continue
+                            # if we allow surrogates, we should combine
+                            # the two and form a UCS4 value
+                            if allow_surrogates or MAXUNICODE < 65535 or is_narrow_host():
+                                ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
+                                assert ch3 >= 0
+                                _encodeUCS4(result, ch3)
+                                continue
                     # note: if the program only ever calls this with
                     # allow_surrogates=True, then we'll never annotate
                     # the following block of code, and errorhandler()
@@ -383,7 +390,7 @@
                     if not allow_surrogates or nonconst.NonConstant(False):
                         ru, rs, pos = errorhandler(errors, 'utf8',
                                                    'surrogates not allowed',
-                                                   s, pos-1, pos)
+                                                   s, error_start_pos, pos)
                         if rs is not None:
                             # py3k only
                             result.append(rs)
@@ -394,7 +401,7 @@
                             else:
                                 errorhandler('strict', 'utf8',
                                              'surrogates not allowed',
-                                             s, pos-1, pos)
+                                             s, pos - 1 , pos)
                         continue
                     # else: Fall through and handles isolated high surrogates
                 result.append((chr((0xe0 | (ch >> 12)))))