[pypy-commit] pypy unicode-utf8: apply fix from 0cca4bcffdbf, reduce diff to unicode-utf8-py3, fix test

mattip pypy.commits at gmail.com
Tue Jan 1 13:29:11 EST 2019


Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8
Changeset: r95562:5d13e76c2ee0
Date: 2019-01-01 20:28 +0200
http://bitbucket.org/pypy/pypy/changeset/5d13e76c2ee0/

Log:	apply fix from 0cca4bcffdbf, reduce diff to unicode-utf8-py3, fix
	test

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -3,7 +3,7 @@
 from pypy.interpreter.error import OperationError, oefmt
 from rpython.rlib.objectmodel import specialize
 from rpython.rlib.rstring import StringBuilder
-from rpython.rlib import rutf8
+from rpython.rlib import rutf8, runicode
 from rpython.rlib.rarithmetic import r_uint, intmask
 from rpython.rtyper.lltypesystem import rffi
 from pypy.module.unicodedata import unicodedb
@@ -21,6 +21,11 @@
                                              space.newtext(msg)]))
     return raise_unicode_exception_decode
 
+def decode_never_raise(errors, encoding, msg, s, startingpos, endingpos):
+    assert startingpos >= 0
+    ux = ['\ux' + hex(ord(x))[2:].upper() for x in s[startingpos:endingpos]]
+    return ''.join(ux), endingpos, 'b'
+
 @specialize.memo()
 def encode_error_handler(space):
     # Fast version of the "strict" errors handler.
@@ -35,6 +40,23 @@
                                              space.newtext(msg)]))
     return raise_unicode_exception_encode
 
+@specialize.memo()
+def encode_unicode_error_handler(space):
+    # Fast version of the "strict" errors handler.
+    def raise_unicode_exception_encode(errors, encoding, msg, uni,
+                                       startingpos, endingpos):
+        assert isinstance(uni, unicode)
+        u_len = len(uni)
+        utf8 = runicode.unicode_encode_utf8sp(uni, u_len)
+        raise OperationError(space.w_UnicodeEncodeError,
+                             space.newtuple([space.newtext(encoding),
+                                             space.newtext(utf8, u_len),
+                                             space.newint(startingpos),
+                                             space.newint(endingpos),
+                                             space.newtext(msg)]))
+        return u'', None, 0
+    return raise_unicode_exception_encode
+
 def default_error_encode(
         errors, encoding, msg, u, startingpos, endingpos):
     """A default handler, for tests"""
@@ -45,10 +67,10 @@
         return '', endingpos
     raise ValueError
 
-def convert_arg_to_w_unicode(space, w_arg, strict=None):
-    return space.convert_arg_to_w_unicode(w_arg)
+# ____________________________________________________________
+_WIN32 = sys.platform == 'win32'
+_MACOSX = sys.platform == 'darwin'
 
-# ____________________________________________________________
 
 def encode(space, w_data, encoding=None, errors='strict'):
     from pypy.objspace.std.unicodeobject import encode_object
@@ -245,18 +267,21 @@
     res = StringBuilder(slen)
     pos = 0
     end = len(s)
+    suppressing = False # we are in a chain of "bad" unicode, only emit one fix
     while pos < end:
         ordch1 = ord(s[pos])
         # fast path for ASCII
         if ordch1 <= 0x7F:
             pos += 1
             res.append(chr(ordch1))
+            suppressing = False
             continue
 
         if ordch1 <= 0xC1:
             r, pos = errorhandler(errors, "utf8", "invalid start byte",
                     s, pos, pos + 1)
-            res.append(r)
+            if not suppressing:
+                res.append(r)
             continue
 
         pos += 1
@@ -268,14 +293,16 @@
                     break
                 r, pos = errorhandler(errors, "utf8", "unexpected end of data",
                     s, pos - 1, pos)
-                res.append(r)
+                if not suppressing:
+                    res.append(r)
                 continue
             ordch2 = ord(s[pos])
 
             if rutf8._invalid_byte_2_of_2(ordch2):
                 r, pos = errorhandler(errors, "utf8", "invalid continuation byte",
                     s, pos - 1, pos)
-                res.append(r)
+                if not suppressing:
+                    res.append(r)
                 continue
             # 110yyyyy 10zzzzzz -> 00000000 00000yyy yyzzzzzz
             pos += 1
@@ -289,8 +316,9 @@
                     pos -= 1
                     break
                 r, pos = errorhandler(errors, "utf8", "unexpected end of data",
-                    s, pos - 1, pos + 1)
+                    s, pos - 1, pos)
                 res.append(r)
+                suppressing = True
                 continue
             ordch2 = ord(s[pos])
             ordch3 = ord(s[pos + 1])
@@ -298,12 +326,14 @@
             if rutf8._invalid_byte_2_of_3(ordch1, ordch2, True):
                 r, pos = errorhandler(errors, "utf8", "invalid continuation byte",
                     s, pos - 1, pos)
-                res.append(r)
+                if not suppressing:
+                    res.append(r)
                 continue
             elif rutf8._invalid_byte_3_of_3(ordch3):
                 r, pos = errorhandler(errors, "utf8", "invalid continuation byte",
                     s, pos - 1, pos + 1)
-                res.append(r)
+                if not suppressing:
+                    res.append(r)
                 continue
             pos += 2
 
@@ -311,6 +341,7 @@
             res.append(chr(ordch1))
             res.append(chr(ordch2))
             res.append(chr(ordch3))
+            suppressing = False
             continue
 
         if ordch1 <= 0xF4:
@@ -321,6 +352,7 @@
                 r, pos = errorhandler(errors, "utf8", "unexpected end of data",
                     s, pos - 1, pos)
                 res.append(r)
+                suppressing = True
                 continue
             ordch2 = ord(s[pos])
             ordch3 = ord(s[pos + 1])
@@ -329,7 +361,8 @@
             if rutf8._invalid_byte_2_of_4(ordch1, ordch2):
                 r, pos = errorhandler(errors, "utf8", "invalid continuation byte",
                     s, pos - 1, pos)
-                res.append(r)
+                if not suppressing:
+                    res.append(r)
                 continue
             elif rutf8._invalid_byte_3_of_4(ordch3):
                 r, pos = errorhandler(errors, "utf8", "invalid continuation byte",
@@ -339,7 +372,8 @@
             elif rutf8._invalid_byte_4_of_4(ordch4):
                 r, pos = errorhandler(errors, "utf8", "invalid continuation byte",
                     s, pos - 1, pos + 2)
-                res.append(r)
+                if not suppressing:
+                    res.append(r)
                 continue
 
             pos += 3
@@ -348,11 +382,13 @@
             res.append(chr(ordch2))
             res.append(chr(ordch3))
             res.append(chr(ordch4))
+            suppressing = False
             continue
 
         r, pos = errorhandler(errors, "utf8", "invalid start byte",
                 s, pos - 1, pos)
-        res.append(r)
+        if not suppressing:
+            res.append(r)
 
     r = res.build()
     return r, pos, rutf8.check_utf8(r, True)
@@ -899,6 +935,33 @@
 
     return result.build()
 
+def encode_utf8(space, uni, allow_surrogates=False):
+    # Note that Python3 tends to forbid *all* surrogates in utf-8.
+    # If allow_surrogates=True, then revert to the Python 2 behavior
+    # which never raises UnicodeEncodeError.  Surrogate pairs are then
+    # allowed, either paired or lone.  A paired surrogate is considered
+    # like the non-BMP character it stands for.  See also *_utf8sp().
+    assert isinstance(uni, unicode)
+    return runicode.unicode_encode_utf_8(
+        uni, len(uni), "strict",
+        errorhandler=encode_unicode_error_handler(space),
+        allow_surrogates=allow_surrogates)
+
+def encode_utf8sp(space, uni, allow_surrogates=True):
+    # Surrogate-preserving utf-8 encoding.  Any surrogate character
+    # turns into its 3-bytes encoding, whether it is paired or not.
+    # This should always be reversible, and the reverse is
+    # decode_utf8sp().
+    return runicode.unicode_encode_utf8sp(uni, len(uni))
+
+def decode_utf8sp(space, string):
+    # Surrogate-preserving utf-8 decoding.  Assuming there is no
+    # encoding error, it should always be reversible, and the reverse is
+    # encode_utf8sp().
+    return str_decode_utf8(string, "string", True, decode_never_raise,
+                           allow_surrogates=True)
+
+
 # ____________________________________________________________
 # utf-16
 
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -221,7 +221,7 @@
     if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
         w_obj = space.getattr(w_exc, space.newtext('object'))
         space.realutf8_w(w_obj) # weeoes
-        w_obj = unicodehelper.convert_arg_to_w_unicode(space, w_obj)
+        w_obj = space.convert_arg_to_w_unicode(w_obj)
         start = space.int_w(space.getattr(w_exc, space.newtext('start')))
         w_end = space.getattr(w_exc, space.newtext('end'))
         end = space.int_w(w_end)
@@ -250,7 +250,7 @@
     if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
         w_obj = space.getattr(w_exc, space.newtext('object'))
         space.realutf8_w(w_obj) # for errors
-        w_obj = unicodehelper.convert_arg_to_w_unicode(space, w_obj)
+        w_obj = space.convert_arg_to_w_unicode(w_obj)
         start = space.int_w(space.getattr(w_exc, space.newtext('start')))
         w_end = space.getattr(w_exc, space.newtext('end'))
         end = space.int_w(w_end)
@@ -395,7 +395,7 @@
     def wrap_encoder(space, w_arg, errors="strict"):
         from pypy.interpreter import unicodehelper
 
-        w_arg = unicodehelper.convert_arg_to_w_unicode(space, w_arg, rname)
+        w_arg = space.convert_arg_to_w_unicode(w_arg)
         if errors is None:
             errors = 'strict'
         state = space.fromcache(CodecState)
@@ -650,7 +650,7 @@
         mapping = Charmap_Encode(space, w_mapping)
 
     state = space.fromcache(CodecState)
-    w_uni = unicodehelper.convert_arg_to_w_unicode(space, w_unicode)
+    w_uni = space.convert_arg_to_w_unicode(w_unicode)
     result = unicodehelper.utf8_encode_charmap(
         space.utf8_w(w_uni), errors, state.encode_error_handler, mapping)
     return space.newtuple([space.newbytes(result), space.newint(w_uni._len())])
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -140,6 +140,7 @@
         assert unicode_escape_encode(u'abc') == (u'abc'.encode('unicode_escape'), 3)
         assert unicode_escape_decode('abc') == (u'abc'.decode('unicode_escape'), 3)
         assert unicode_escape_decode('\\x61\\x62\\x63') == (u'abc', 12)
+
     def test_unicode_replace(self):
         # CPython #8271: during the decoding of an invalid UTF-8 byte sequence,
         # only the start byte and the continuation byte(s) are now considered
@@ -216,14 +217,13 @@
             (b'\xfe', FFFD),
             (b'\xfe\x80\x80', FFFD*3),
             # other sequences
-            (b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
-            (b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
-            (b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
+            (b'\xf1\x80\x41\x42\x43', u'\ufffd\x41\x42\x43'),
+            (b'\xf1\x80\xff\x42\x43', u'\ufffd\ufffd\x42\x43'),
+            (b'\xf1\x80\xc2\x81\x43', u'\ufffd\x81\x43'),
             (b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
-             '\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
+             u'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
         ]
         for n, (seq, res) in enumerate(sequences):
-            print(seq, res)
             raises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
             uni = seq.decode('utf-8', 'replace')
             assert uni == res
@@ -233,7 +233,6 @@
             assert uni == res.replace(u'\uFFFD', '')
 
 
-
 class AppTestPartialEvaluation:
     spaceconfig = dict(usemodules=['array',])
     if sys.platform == 'win32':
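
The adjusted expectations in the sequences list above can be cross-checked
against CPython 3's own UTF-8 codec (illustration only; the app-level test
itself exercises PyPy's str_decode_utf8 through seq.decode):

    # A truncated multi-byte start (\xf1\x80) yields a single U+FFFD,
    # matching the "suppressing" behaviour added in unicodehelper.py.
    assert b'\xf1\x80\x41\x42\x43'.decode('utf-8', 'replace') == u'\ufffdABC'
    assert b'\xf1\x80\xc2\x81\x43'.decode('utf-8', 'replace') == u'\ufffd\x81C'
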
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1155,7 +1155,7 @@
         # test_unicode_conversion_with__str__
         if w_unicode_method is None:
             if space.isinstance_w(w_obj, space.w_unicode):
-                return unicodehelper.convert_arg_to_w_unicode(space, w_obj)
+                return space.convert_arg_to_w_unicode(w_obj)
             w_unicode_method = space.lookup(w_obj, "__str__")
         if w_unicode_method is not None:
             w_res = space.get_and_call_function(w_unicode_method, w_obj)

