[pypy-commit] pypy unicode-utf8-py3: remove consumed from decoding functions

Fri Jul 27 16:02:22 EDT 2018

Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r94903:97cd2e230f8a
Date: 2018-07-11 23:33 -0500
http://bitbucket.org/pypy/pypy/changeset/97cd2e230f8a/

Log:	remove consumed from decoding functions

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -31,7 +31,7 @@
 
 def decode_never_raise(errors, encoding, msg, s, startingpos, endingpos):
     ux = ['\ux' + hex(ord(x))[2:].upper() for x in s[startingpos:endingpos]]
-    return ''.join(ux), endingpos
+    return ''.join(ux), endingpos, endingpos
 
 @specialize.memo()
 def encode_error_handler(space):
@@ -144,7 +144,7 @@
     from pypy.module._codecs import interp_codecs
     state = space.fromcache(interp_codecs.CodecState)
     unicodedata_handler = state.get_unicodedata_handler(space)
-    result_utf8, consumed, length = str_decode_unicode_escape(
+    result_utf8, length = str_decode_unicode_escape(
         string, "strict",
         final=True,
         errorhandler=decode_error_handler(space),
@@ -152,7 +152,7 @@
     return result_utf8, length
 
 def decode_raw_unicode_escape(space, string):
-    result_utf8, consumed, lgt = str_decode_raw_unicode_escape(
+    result_utf8, lgt = str_decode_raw_unicode_escape(
         string, "strict",
         final=True, errorhandler=decode_error_handler(space))
     return result_utf8, lgt
@@ -172,7 +172,7 @@
     # If there happen to be two 3-bytes encoding a pair of surrogates,
     # you still get two surrogate unicode characters in the result.
     assert isinstance(string, str)
-    result, consumed = runicode.str_decode_utf_8(
+    result, lgth = runicode.str_decode_utf_8(
         string, len(string), "strict",
         final=True, errorhandler=decode_error_handler(space),
         # XXX handle surrogates
@@ -182,7 +182,7 @@
 def str_decode_ascii(s, errors, final, errorhandler):
     try:
         rutf8.check_ascii(s)
-        return s, len(s), len(s)
+        return s, len(s)
     except rutf8.CheckError:
         return _str_decode_ascii_slowpath(s, errors, final, errorhandler)
 
@@ -200,12 +200,12 @@
             i += 1
     ress = res.build()
     lgt = rutf8.check_utf8(ress, True)
-    return ress, len(s), lgt
+    return ress, lgt
 
 def str_decode_latin_1(s, errors, final, errorhandler):
     try:
         rutf8.check_ascii(s)
-        return s, len(s), len(s)
+        return s, len(s)
     except rutf8.CheckError:
         return _str_decode_latin_1_slowpath(s, errors, final, errorhandler)
 
@@ -225,7 +225,7 @@
             res.append_slice(s, start, end)
             i = end
     # cannot be ASCII, cannot have surrogates, I believe
-    return res.build(), len(s), len(s)
+    return res.build(), len(s)
 
 def utf8_encode_latin_1(s, errors, errorhandler):
     try:
@@ -310,7 +310,7 @@
         slen = len(s)
         res, size = runicode.str_decode_mbcs(s, slen, final=final, errors=errors,
                                            errorhandler=errorhandler)
-        return res.encode('utf8'), size, len(res)
+        return res.encode('utf8'), len(res)
 
 def str_decode_utf8(s, errors, final, errorhandler, allow_surrogates=False):
     """ Same as checking for the valid utf8, but we know the utf8 is not
@@ -432,7 +432,7 @@
         res.append(r)
 
     r = res.build()
-    return r, pos, rutf8.check_utf8(r, True)
+    return r, rutf8.check_utf8(r, True)
 
 hexdigits = "0123456789ABCDEFabcdef"
 
@@ -471,7 +471,7 @@
 def str_decode_unicode_escape(s, errors, final, errorhandler, ud_handler):
     size = len(s)
     if size == 0:
-        return '', 0, 0
+        return '', 0
 
     builder = rutf8.Utf8StringBuilder(size)
     pos = 0
@@ -590,7 +590,7 @@
             builder.append_char('\\')
             builder.append_code(ord(ch))
 
-    return builder.build(), pos, builder.getlength()
+    return builder.build(), builder.getlength()
 
 def wcharpsize2utf8(space, wcharp, size):
     """Safe version of rffi.wcharpsize2utf8.
@@ -612,7 +612,7 @@
                                   errorhandler=None):
     size = len(s)
     if size == 0:
-        return '', 0, 0
+        return '', 0
 
     builder = rutf8.Utf8StringBuilder(size)
     pos = 0
@@ -652,7 +652,7 @@
         pos = hexescape(builder, s, pos, digits,
                            "rawunicodeescape", errorhandler, message, errors)
 
-    return builder.build(), pos, builder.getlength()
+    return builder.build(), builder.getlength()
 
 _utf8_encode_unicode_escape = rutf8.make_utf8_escape_function()
 
@@ -787,7 +787,7 @@
                      errorhandler=None):
     size = len(s)
     if size == 0:
-        return '', 0, 0
+        return '', 0
 
     inShift = False
     base64bits = 0
@@ -922,7 +922,7 @@
         final_length = shiftOutStartPos # back off output
 
     assert final_length >= 0
-    return result.build()[:final_length], pos, outsize
+    return result.build()[:final_length], outsize
 
 def utf8_encode_utf_7(s, errors, errorhandler):
     size = len(s)
@@ -1012,21 +1012,21 @@
 
 def str_decode_utf_16(s, errors, final=True,
                       errorhandler=None):
-    result, c, lgt, _ = str_decode_utf_16_helper(s, errors, final,
+    result, lgt = str_decode_utf_16_helper(s, errors, final,
                                                          errorhandler, "native")
-    return result, c, lgt
+    return result, lgt
 
 def str_decode_utf_16_be(s, errors, final=True,
                         errorhandler=None):
-    result, c, lgt, _ = str_decode_utf_16_helper(s, errors, final,
+    result, lgt = str_decode_utf_16_helper(s, errors, final,
                                                          errorhandler, "big")
-    return result, c, lgt
+    return result, lgt
 
 def str_decode_utf_16_le(s, errors, final=True,
                          errorhandler=None):
-    result, c, lgt, _ = str_decode_utf_16_helper(s, errors, final,
+    result, lgt = str_decode_utf_16_helper(s, errors, final,
                                                          errorhandler, "little")
-    return result, c, lgt
+    return result, lgt
 
 def str_decode_utf_16_helper(s, errors, final=True,
                              errorhandler=None,
@@ -1069,7 +1069,7 @@
     else:
         bo = 1
     if size == 0:
-        return '', 0, 0, bo
+        return '', 0
     if bo == -1:
         # force little endian
         ihi = 1
@@ -1129,7 +1129,7 @@
             result.append(r)
     r = result.build()
     lgt = rutf8.check_utf8(r, True)
-    return result.build(), pos, lgt, bo
+    return result.build(), lgt
 
 def _STORECHAR(result, CH, byteorder):
     hi = chr(((CH) >> 8) & 0xff)
@@ -1162,8 +1162,26 @@
     pos = 0
     index = 0
     while pos < size:
-        ch = rutf8.codepoint_at_pos(s, pos)
-
+        try:
+            ch = rutf8.codepoint_at_pos(s, pos)
+        except IndexError:
+            # malformed codepoint, blindly use ch
+            ch = ord(s[pos])
+            pos += 1
+            if errorhandler:
+                res_8, newindex = errorhandler(
+                    errors, public_encoding_name, 'malformed unicode',
+                    s, pos - 1, pos)
+                for cp in rutf8.Utf8StringIterator(res_8):
+                    if cp < 0xD800:
+                        _STORECHAR(result, cp, byteorder)
+                    else:
+                        errorhandler('strict', public_encoding_name,
+                                     'malformed unicode',
+                                 s, pos-1, pos)
+            else:
+                _STORECHAR(result, ch, byteorder)
+                continue
         if ch < 0xD800:
             _STORECHAR(result, ch, byteorder)
         elif ch >= 0x10000:
@@ -1219,21 +1237,21 @@
 
 def str_decode_utf_32(s, errors, final=True,
                       errorhandler=None):
-    result, c, lgt, _ = str_decode_utf_32_helper(
+    result, lgt = str_decode_utf_32_helper(
         s, errors, final, errorhandler, "native", 'utf-32-' + BYTEORDER2,
         allow_surrogates=False)
-    return result, c, lgt
+    return result, lgt
 
 def str_decode_utf_32_be(s, errors, final=True,
                          errorhandler=None):
-    result, c, lgt, _ = str_decode_utf_32_helper(
+    result, lgt = str_decode_utf_32_helper(
         s, errors, final, errorhandler, "big", 'utf-32-be',
         allow_surrogates=False)
-    return result, c, lgt
+    return result, lgt
 
 def str_decode_utf_32_le(s, errors, final=True,
                          errorhandler=None):
-    result, c, lgt, _ = str_decode_utf_32_helper(
+    result, lgt = str_decode_utf_32_helper(
         s, errors, final, errorhandler, "little", 'utf-32-le',
         allow_surrogates=False)
-    return result, c, lgt
+    return result, lgt
@@ -1284,7 +1302,7 @@
     else:
         bo = 1
     if size == 0:
-        return '', 0, 0, bo
+        return '', 0
     if bo == -1:
         # force little endian
         iorder = [0, 1, 2, 3]
@@ -1326,7 +1344,7 @@
         pos += 4
     r = result.build()
     lgt = rutf8.check_utf8(r, True)
-    return r, pos, lgt, bo
+    return r, lgt
 
 def _STORECHAR32(result, CH, byteorder):
     c0 = chr(((CH) >> 24) & 0xff)
@@ -1365,8 +1383,31 @@
     pos = 0
     index = 0
     while pos < size:
-        ch = rutf8.codepoint_at_pos(s, pos)
-        pos = rutf8.next_codepoint_pos(s, pos)
+        try:
+            ch = rutf8.codepoint_at_pos(s, pos)
+            pos = rutf8.next_codepoint_pos(s, pos)
+        except IndexError:
+            # malformed codepoint, blindly use ch
+            ch = ord(s[pos])
+            pos += 1
+            if errorhandler:
+                res_8, newindex = errorhandler(
+                    errors, public_encoding_name, 'malformed unicode',
+                    s, pos - 1, pos)
+                if res_8:
+                    for cp in rutf8.Utf8StringIterator(res_8):
+                        if cp < 0xD800:
+                            _STORECHAR32(result, cp, byteorder)
+                        else:
+                            errorhandler('strict', public_encoding_name,
+                                     'malformed unicode',
+                                 s, pos-1, pos)
+                else:
+                    _STORECHAR32(result, ch, byteorder)
+            else:
+                _STORECHAR32(result, ch, byteorder)
+            index += 1
+            continue
         if not allow_surrogates and 0xD800 <= ch < 0xE000:
             res_8, newindex = errorhandler(
                 errors, public_encoding_name, 'surrogates not allowed',
@@ -1389,19 +1430,19 @@
 
 def utf8_encode_utf_32(s, errors,
                           errorhandler=None, allow_surrogates=True):
-    return unicode_encode_utf_32_helper(s, errors, errorhandler,
+    return unicode_encode_utf_32_helper(s.decode('utf8'), errors, errorhandler,
                                         allow_surrogates, "native",
                                         'utf-32-' + BYTEORDER2)
 
 def utf8_encode_utf_32_be(s, errors,
                                   errorhandler=None, allow_surrogates=True):
-    return unicode_encode_utf_32_helper(s, errors, errorhandler,
+    return unicode_encode_utf_32_helper(s.decode('utf8'), errors, errorhandler,
                                         allow_surrogates, "big",
                                         'utf-32-be')
 
 def utf8_encode_utf_32_le(s, errors,
                                   errorhandler=None, allow_surrogates=True):
-    return unicode_encode_utf_32_helper(s, errors, errorhandler,
+    return unicode_encode_utf_32_helper(s.decode('utf8'), errors, errorhandler,
                                         allow_surrogates, "little",
                                         'utf-32-le')
 # ____________________________________________________________
@@ -1411,7 +1452,7 @@
                                 errorhandler=None):
     size = len(s)
     if size == 0:
-        return '', 0, 0
+        return '', 0
 
     unicode_bytes = 4
     if BYTEORDER == "little":
@@ -1449,7 +1490,7 @@
         pos += unicode_bytes
     r = result.build()
     lgt = rutf8.check_utf8(r, True)
-    return r, pos, lgt
+    return r, lgt
 
 def utf8_encode_unicode_internal(s, errors, errorhandler):
     size = len(s)
@@ -1490,7 +1531,7 @@
                                   errorhandler=errorhandler)
     size = len(s)
     if size == 0:
-        return '', 0, 0
+        return '', 0
 
     pos = 0
     result = StringBuilder(size)
@@ -1508,7 +1549,7 @@
         pos += 1
     r = result.build()
     lgt = rutf8.check_utf8(r, True)
-    return r, pos, lgt
+    return r, lgt
 
 def utf8_encode_charmap(s, errors, errorhandler=None, mapping=None):
     size = len(s)
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -384,10 +384,10 @@
         if isinstance(s, unicode):
             s, lgt = s.encode('utf8'), len(s)
         elif isinstance(s, str):
-            s, uf8lgt, lgt = decode_utf8sp(self, s)
+            s, lgt = decode_utf8sp(self, s)
         elif isinstance(s, tuple):
             # result of decode_utf8
-            s, utf8lgt, lgt = s
+            s, lgt = s
         else:
             # XXX what is s ?
             lgt = rutf8.check_utf8(s, True)