[pypy-commit] pypy unicode-utf8: start working on more obscure codecs and completely remove hacks that go via UCS4 from unicodehelper. Now unicodehelper no longer uses runicode

fijal pypy.commits at gmail.com
Mon Nov 20 05:14:36 EST 2017


Author: fijal
Branch: unicode-utf8
Changeset: r93087:3e5acb0a1e81
Date: 2017-11-20 11:13 +0100
http://bitbucket.org/pypy/pypy/changeset/3e5acb0a1e81/

Log:	start working on more obscure codecs and completely remove hacks
	that go via UCS4 from unicodehelper. Now unicodehelper no longer
	uses runicode
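
For context: the removed wrappers below worked by decoding the utf8 input
to a full unicode (UCS4) object, calling the old runicode codec on it, and
re-encoding the result to utf8.  The replacement helpers instead walk the
utf8 string one codepoint at a time via rutf8.codepoint_at_pos() and
rutf8.next_codepoint_pos().  A minimal pure-Python sketch of that idea
(the helper name is made up for illustration; this is not the rutf8 API):

def iter_codepoints(utf8_bytes):
    # Yield codepoints straight out of a utf8 byte string, without
    # building an intermediate unicode (UCS4) object.  Assumes the
    # input is well-formed utf8.
    pos = 0
    while pos < len(utf8_bytes):
        b0 = ord(utf8_bytes[pos:pos + 1])
        if b0 < 0x80:            # 1-byte (ASCII) sequence
            ch, size = b0, 1
        elif b0 < 0xE0:          # 2-byte sequence
            ch, size = b0 & 0x1F, 2
        elif b0 < 0xF0:          # 3-byte sequence
            ch, size = b0 & 0x0F, 3
        else:                    # 4-byte sequence
            ch, size = b0 & 0x07, 4
        for i in range(1, size):
            ch = (ch << 6) | (ord(utf8_bytes[pos + i:pos + i + 1]) & 0x3F)
        yield ch
        pos += size

# list(iter_codepoints(u"a\u20ac".encode("utf8"))) == [0x61, 0x20AC]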

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1,7 +1,9 @@
+import sys
+
 from pypy.interpreter.error import OperationError
 from rpython.rlib.objectmodel import specialize
-from rpython.rlib import runicode, rutf8
-from rpython.rlib.rarithmetic import r_uint
+from rpython.rlib import rutf8
+from rpython.rlib.rarithmetic import r_uint, intmask
 from rpython.rlib.rstring import StringBuilder
 from pypy.module._codecs import interp_codecs
 
@@ -168,47 +170,6 @@
     r = res.build()
     return r
 
-class DecodeWrapper(object):
-    def __init__(self, handler):
-        self.orig = handler
-
-    def handle(self, errors, encoding, msg, s, pos, endpos):
-        return self.orig(errors, encoding, msg, s, pos, endpos)
-
-class EncodeWrapper(object):
-    def __init__(self, handler):
-        self.orig = handler
-
-    def handle(self, errors, encoding, msg, s, pos, endpos):
-        return self.orig(errors, encoding, msg, s.encode("utf8"), pos, endpos)
-
-def setup_new_encoders_legacy(encoding):
-    encoder_name = 'utf8_encode_' + encoding
-    encoder_call_name = 'unicode_encode_' + encoding
-    decoder_name = 'str_decode_' + encoding
-    def encoder(utf8, errors, errorhandler):
-        u = utf8.decode("utf8")
-        w = EncodeWrapper(errorhandler)
-        return getattr(runicode, encoder_call_name)(u, len(u), errors,
-                       w.handle)
-    def decoder(s, slen, errors, final, errorhandler):
-        w = DecodeWrapper((errorhandler))
-        u, pos = getattr(runicode, decoder_name)(s, slen, errors, final, w.handle)
-        return u.encode('utf8'), pos, len(u), _get_flag(u)
-    encoder.__name__ = encoder_name
-    decoder.__name__ = decoder_name
-    if encoder_name not in globals():
-        globals()[encoder_name] = encoder
-    if decoder_name not in globals():
-        globals()[decoder_name] = decoder
-
-def setup():
-    for encoding in ['utf_16', 'utf_16_le', 'utf_16_be', 'utf_32_le', 'utf_32',
-                     'utf_32_be', 'unicode_internal']:
-        setup_new_encoders_legacy(encoding)
-
-setup()
-
 def utf8_encode_ascii(utf8, errors, errorhandler):
     """ Don't be confused - this is a slowpath for errors e.g. "ignore"
     or an obscure errorhandler
@@ -618,6 +579,41 @@
     lgt, flag = rutf8.check_utf8(r, True)
     return r, pos, lgt, flag
 
+
+TABLE = '0123456789abcdef'
+
+def raw_unicode_escape_helper(result, char):
+    if char >= 0x10000 or char < 0:
+        result.append("\\U")
+        zeros = 8
+    elif char >= 0x100:
+        result.append("\\u")
+        zeros = 4
+    else:
+        result.append("\\x")
+        zeros = 2
+    for i in range(zeros-1, -1, -1):
+        result.append(TABLE[(char >> (4 * i)) & 0x0f])
+
+def utf8_encode_raw_unicode_escape(s, errors, errorhandler=None):
+    # errorhandler is not used: this function cannot cause Unicode errors
+    size = len(s)
+    if size == 0:
+        return ''
+    result = StringBuilder(size)
+    pos = 0
+    while pos < size:
+        oc = ord(s[pos])
+
+        if oc < 0x100:
+            result.append(chr(oc))
+        else:
+            raw_unicode_escape_helper(result, oc)
+        pos += 1
+
+    return result.build()
+
+
 # ____________________________________________________________
 # utf-7
 
@@ -896,3 +892,395 @@
         result.append('-')
 
     return result.build()
+
+# ____________________________________________________________
+# utf-16
+
+BYTEORDER = sys.byteorder
+BYTEORDER2 = BYTEORDER[0] + 'e'      # either "le" or "be"
+assert BYTEORDER2 in ('le', 'be')
+
+def str_decode_utf_16(s, errors, final=True,
+                      errorhandler=None):
+    result, c, lgt, flag, _ = str_decode_utf_16_helper(s, errors, final,
+                                                         errorhandler, "native")
+    return result, c, lgt, flag
+
+def str_decode_utf_16_be(s, errors, final=True,
+                        errorhandler=None):
+    result, c, lgt, flag, _ = str_decode_utf_16_helper(s, errors, final,
+                                                         errorhandler, "big")
+    return result, c, lgt, flag
+
+def str_decode_utf_16_le(s, errors, final=True,
+                         errorhandler=None):
+    result, c, lgt, flag, _ = str_decode_utf_16_helper(s, errors, final,
+                                                         errorhandler, "little")
+    return result, c, lgt, flag
+
+def str_decode_utf_16_helper(s, errors, final=True,
+                             errorhandler=None,
+                             byteorder="native",
+                             public_encoding_name='utf16'):
+    size = len(s)
+    bo = 0
+
+    if BYTEORDER == 'little':
+        ihi = 1
+        ilo = 0
+    else:
+        ihi = 0
+        ilo = 1
+
+    #  Check for BOM marks (U+FEFF) in the input and adjust current
+    #  byte order setting accordingly. In native mode, the leading BOM
+    #  mark is skipped, in all other modes, it is copied to the output
+    #  stream as-is (giving a ZWNBSP character).
+    pos = 0
+    if byteorder == 'native':
+        if size >= 2:
+            bom = (ord(s[ihi]) << 8) | ord(s[ilo])
+            if BYTEORDER == 'little':
+                if bom == 0xFEFF:
+                    pos += 2
+                    bo = -1
+                elif bom == 0xFFFE:
+                    pos += 2
+                    bo = 1
+            else:
+                if bom == 0xFEFF:
+                    pos += 2
+                    bo = 1
+                elif bom == 0xFFFE:
+                    pos += 2
+                    bo = -1
+    elif byteorder == 'little':
+        bo = -1
+    else:
+        bo = 1
+    if size == 0:
+        lgt, flag = rutf8.check_utf8('', True)
+        return '', 0, lgt, flag, bo
+    if bo == -1:
+        # force little endian
+        ihi = 1
+        ilo = 0
+
+    elif bo == 1:
+        # force big endian
+        ihi = 0
+        ilo = 1
+
+    result = StringBuilder(size // 2)
+
+    #XXX I think the errors are not correctly handled here
+    while pos < size:
+        # remaining bytes at the end? (size should be even)
+        if len(s) - pos < 2:
+            if not final:
+                break
+            r, pos = errorhandler(errors, public_encoding_name,
+                                  "truncated data",
+                                  s, pos, len(s))
+            result.append(r)
+            if len(s) - pos < 2:
+                break
+        ch = (ord(s[pos + ihi]) << 8) | ord(s[pos + ilo])
+        pos += 2
+        if ch < 0xD800 or ch > 0xDFFF:
+            rutf8.unichr_as_utf8_append(result, ch)
+            continue
+        # UTF-16 code pair:
+        if len(s) - pos < 2:
+            pos -= 2
+            if not final:
+                break
+            errmsg = "unexpected end of data"
+            r, pos = errorhandler(errors, public_encoding_name,
+                                  errmsg, s, pos, len(s))
+            result.append(r)
+            if len(s) - pos < 2:
+                break
+        elif 0xD800 <= ch <= 0xDBFF:
+            ch2 = (ord(s[pos+ihi]) << 8) | ord(s[pos+ilo])
+            pos += 2
+            if 0xDC00 <= ch2 <= 0xDFFF:
+                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000
+                rutf8.unichr_as_utf8_append(result, ch)
+                continue
+            else:
+                r, pos = errorhandler(errors, public_encoding_name,
+                                      "illegal UTF-16 surrogate",
+                                      s, pos - 4, pos - 2)
+                result.append(r)
+        else:
+            r, pos = errorhandler(errors, public_encoding_name,
+                                  "illegal encoding",
+                                  s, pos - 2, pos)
+            result.append(r)
+    r = result.build()
+    lgt, flag = rutf8.check_utf8(r, True)
+    return r, pos, lgt, flag, bo
+
+def _STORECHAR(result, CH, byteorder):
+    hi = chr(((CH) >> 8) & 0xff)
+    lo = chr((CH) & 0xff)
+    if byteorder == 'little':
+        result.append(lo)
+        result.append(hi)
+    else:
+        result.append(hi)
+        result.append(lo)
+
+def unicode_encode_utf_16_helper(s, errors,
+                                 errorhandler=None,
+                                 allow_surrogates=True,
+                                 byteorder='little',
+                                 public_encoding_name='utf16'):
+    size = len(s)
+    if size == 0:
+        if byteorder == 'native':
+            result = StringBuilder(2)
+            _STORECHAR(result, 0xFEFF, BYTEORDER)
+            return result.build()
+        return ""
+
+    result = StringBuilder(size * 2 + 2)
+    if byteorder == 'native':
+        _STORECHAR(result, 0xFEFF, BYTEORDER)
+        byteorder = BYTEORDER
+
+    pos = 0
+    while pos < size:
+        ch = rutf8.codepoint_at_pos(s, pos)
+        pos = rutf8.next_codepoint_pos(s, pos)
+
+        if ch < 0xD800:
+            _STORECHAR(result, ch, byteorder)
+        elif ch >= 0x10000:
+            _STORECHAR(result, 0xD800 | ((ch-0x10000) >> 10), byteorder)
+            _STORECHAR(result, 0xDC00 | ((ch-0x10000) & 0x3FF), byteorder)
+        elif ch >= 0xE000 or allow_surrogates:
+            _STORECHAR(result, ch, byteorder)
+        else:
+            ru, pos = errorhandler(errors, public_encoding_name,
+                                   'surrogates not allowed',
+                                    s, pos-1, pos)
+            # XXX bytes-returning error handlers (py3k) are not handled yet;
+            # the block below is kept disabled until they are
+            #if rs is not None:
+            #    # py3k only
+            #    if len(rs) % 2 != 0:
+            #        errorhandler('strict', public_encoding_name,
+            #                     'surrogates not allowed',
+            #                     s, pos-1, pos)
+            #    result.append(rs)
+            #    continue
+            for ch in ru:
+                if ord(ch) < 0xD800:
+                    _STORECHAR(result, ord(ch), byteorder)
+                else:
+                    errorhandler('strict', public_encoding_name,
+                                 'surrogates not allowed',
+                                 s, pos-1, pos)
+            continue
+
+    return result.build()
+
+def utf8_encode_utf_16(s, errors,
+                          errorhandler=None,
+                          allow_surrogates=True):
+    return unicode_encode_utf_16_helper(s, errors, errorhandler,
+                                        allow_surrogates, "native")
+
+def utf8_encode_utf_16_be(s, errors,
+                             errorhandler=None,
+                             allow_surrogates=True):
+    return unicode_encode_utf_16_helper(s, errors, errorhandler,
+                                        allow_surrogates, "big")
+
+def utf8_encode_utf_16_le(s, errors,
+                             errorhandler=None,
+                             allow_surrogates=True):
+    return unicode_encode_utf_16_helper(s, errors, errorhandler,
+                                        allow_surrogates, "little")
+
+# ____________________________________________________________
+# utf-32
+
+def str_decode_utf_32(s, errors, final=True,
+                      errorhandler=None):
+    result, c, lgt, flag, _ = str_decode_utf_32_helper(s, errors, final,
+                                                         errorhandler, "native")
+    return result, c, lgt, flag
+
+def str_decode_utf_32_be(s, errors, final=True,
+                         errorhandler=None):
+    result, c, lgt, flag, _ = str_decode_utf_32_helper(s, errors, final,
+                                                         errorhandler, "big")
+    return result, c, lgt, flag
+
+def str_decode_utf_32_le(s, errors, final=True,
+                         errorhandler=None):
+    result, c, lgt, flag, _ = str_decode_utf_32_helper(s, errors, final,
+                                                         errorhandler, "little")
+    return result, c, lgt, flag
+
+BOM32_DIRECT  = intmask(0x0000FEFF)
+BOM32_REVERSE = intmask(0xFFFE0000)
+
+def str_decode_utf_32_helper(s, errors, final=True,
+                             errorhandler=None,
+                             byteorder="native",
+                             public_encoding_name='utf32'):
+    bo = 0
+    size = len(s)
+
+    if BYTEORDER == 'little':
+        iorder = [0, 1, 2, 3]
+    else:
+        iorder = [3, 2, 1, 0]
+
+    #  Check for BOM marks (U+FEFF) in the input and adjust current
+    #  byte order setting accordingly. In native mode, the leading BOM
+    #  mark is skipped, in all other modes, it is copied to the output
+    #  stream as-is (giving a ZWNBSP character).
+    pos = 0
+    if byteorder == 'native':
+        if size >= 4:
+            bom = intmask(
+                (ord(s[iorder[3]]) << 24) | (ord(s[iorder[2]]) << 16) |
+                (ord(s[iorder[1]]) << 8)  | ord(s[iorder[0]]))
+            if BYTEORDER == 'little':
+                if bom == BOM32_DIRECT:
+                    pos += 4
+                    bo = -1
+                elif bom == BOM32_REVERSE:
+                    pos += 4
+                    bo = 1
+            else:
+                if bom == BOM32_DIRECT:
+                    pos += 4
+                    bo = 1
+                elif bom == BOM32_REVERSE:
+                    pos += 4
+                    bo = -1
+    elif byteorder == 'little':
+        bo = -1
+    else:
+        bo = 1
+    if size == 0:
+        lgt, flag = rutf8.check_utf8('', True)
+        return '', 0, lgt, flag, bo
+    if bo == -1:
+        # force little endian
+        iorder = [0, 1, 2, 3]
+
+    elif bo == 1:
+        # force big endian
+        iorder = [3, 2, 1, 0]
+
+    result = StringBuilder(size // 4)
+
+    while pos < size:
+        # remaining bytes at the end? (size should be divisible by 4)
+        if len(s) - pos < 4:
+            if not final:
+                break
+            r, pos = errorhandler(errors, public_encoding_name,
+                                  "truncated data",
+                                  s, pos, len(s))
+            result.append(r)
+            if len(s) - pos < 4:
+                break
+            continue
+        ch = ((ord(s[pos + iorder[3]]) << 24) | (ord(s[pos + iorder[2]]) << 16) |
+              (ord(s[pos + iorder[1]]) << 8)  | ord(s[pos + iorder[0]]))
+        if ch >= 0x110000:
+            r, pos = errorhandler(errors, public_encoding_name,
+                                  "codepoint not in range(0x110000)",
+                                  s, pos, len(s))
+            result.append(r)
+            continue
+
+        rutf8.unichr_as_utf8_append(result, ch)
+        pos += 4
+    r = result.build()
+    lgt, flag = rutf8.check_utf8(r, True)
+    return r, pos, lgt, flag, bo
+
+def _STORECHAR32(result, CH, byteorder):
+    c0 = chr(((CH) >> 24) & 0xff)
+    c1 = chr(((CH) >> 16) & 0xff)
+    c2 = chr(((CH) >> 8) & 0xff)
+    c3 = chr((CH) & 0xff)
+    if byteorder == 'little':
+        result.append(c3)
+        result.append(c2)
+        result.append(c1)
+        result.append(c0)
+    else:
+        result.append(c0)
+        result.append(c1)
+        result.append(c2)
+        result.append(c3)
+
+def unicode_encode_utf_32_helper(s, errors,
+                                 errorhandler=None,
+                                 allow_surrogates=True,
+                                 byteorder='little',
+                                 public_encoding_name='utf32'):
+    size = len(s)
+    if size == 0:
+        if byteorder == 'native':
+            result = StringBuilder(4)
+            _STORECHAR32(result, 0xFEFF, BYTEORDER)
+            return result.build()
+        return ""
+
+    result = StringBuilder(size * 4 + 4)
+    if byteorder == 'native':
+        _STORECHAR32(result, 0xFEFF, BYTEORDER)
+        byteorder = BYTEORDER
+
+    pos = 0
+    while pos < size:
+        ch = rutf8.codepoint_at_pos(s, pos)
+        pos = rutf8.next_codepoint_pos(s, pos)
+        if not allow_surrogates and 0xD800 <= ch < 0xE000:
+            ru, pos = errorhandler(errors, public_encoding_name,
+                                        'surrogates not allowed',
+                                        s, pos-1, pos)
+            # XXX bytes-returning error handlers (py3k) are not handled yet;
+            # the block below is kept disabled until they are
+            #if rs is not None:
+            #    # py3k only
+            #    if len(rs) % 4 != 0:
+            #        errorhandler('strict', public_encoding_name,
+            #                     'surrogates not allowed',
+            #                     s, pos-1, pos)
+            #    result.append(rs)
+            #    continue
+            for ch in ru:
+                if ord(ch) < 0xD800:
+                    _STORECHAR32(result, ord(ch), byteorder)
+                else:
+                    errorhandler('strict', public_encoding_name,
+                                    'surrogates not allowed',
+                                    s, pos-1, pos)
+            continue
+        _STORECHAR32(result, ch, byteorder)
+
+    return result.build()
+
+def utf8_encode_utf_32(s, errors,
+                          errorhandler=None, allow_surrogates=True):
+    return unicode_encode_utf_32_helper(s, errors, errorhandler,
+                                        allow_surrogates, "native")
+
+def utf8_encode_utf_32_be(s, errors,
+                             errorhandler=None, allow_surrogates=True):
+    return unicode_encode_utf_32_helper(s, errors, errorhandler,
+                                        allow_surrogates, "big")
+
+def utf8_encode_utf_32_le(s, errors,
+                             errorhandler=None, allow_surrogates=True):
+    return unicode_encode_utf_32_helper(s, errors, errorhandler,
+                                        allow_surrogates, "little")
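
The UTF-16 helpers above split codepoints outside the BMP into surrogate
pairs on encoding and combine pairs back on decoding.  A stand-alone sketch
of that arithmetic (plain Python, not taken from the patch; the function
names are made up):

def combine_surrogates(hi, lo):
    # decode direction: high surrogate + low surrogate -> one codepoint
    assert 0xD800 <= hi <= 0xDBFF and 0xDC00 <= lo <= 0xDFFF
    return (((hi & 0x3FF) << 10) | (lo & 0x3FF)) + 0x10000

def split_surrogates(ch):
    # encode direction: non-BMP codepoint -> high surrogate, low surrogate
    assert 0x10000 <= ch <= 0x10FFFF
    return (0xD800 | ((ch - 0x10000) >> 10),
            0xDC00 | ((ch - 0x10000) & 0x3FF))

assert combine_surrogates(0xD83D, 0xDE00) == 0x1F600
assert split_surrogates(0x1F600) == (0xD83D, 0xDE00)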
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -30,6 +30,10 @@
                               endpos):
             """Generic wrapper for calling into error handlers.
 
+            Note that the error handler receives and returns positions
+            counted in unicode characters, not in utf8 bytes, so the
+            codec has to convert between the two.
+
             Returns (unicode_or_none, str_or_none, newpos) as error
             handlers may return unicode or on Python 3, bytes.
             """
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -15,6 +15,7 @@
                          'utf-32', 'utf-32-le', 'utf-32-be',
                          'raw_unicode_escape',
                          'unicode_escape', 'unicode_internal'):
+            print encoding
             assert unicode(u.encode(encoding),encoding) == u
 
     def test_ucs4(self):
@@ -115,10 +116,10 @@
         raises(TypeError, charmap_decode, '\xff', "strict",  {0xff: 0x110000})
         assert (charmap_decode("\x00\x01\x02", "strict",
                                {0: 0x10FFFF, 1: ord('b'), 2: ord('c')}) ==
-                u"\U0010FFFFbc", 3)
+                (u"\U0010FFFFbc", 3))
         assert (charmap_decode("\x00\x01\x02", "strict",
                                {0: u'\U0010FFFF', 1: u'b', 2: u'c'}) ==
-                u"\U0010FFFFbc", 3)
+                (u"\U0010FFFFbc", 3))
 
     def test_escape_decode_errors(self):
         from _codecs import escape_decode as decode
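
The extra parentheses added to the charmap_decode asserts above are not
cosmetic: without them the assert checks a two-element tuple, which is
always true.  A minimal illustration (plain Python):

result = ("whatever", 3)
assert (result == "something else", 3)   # always passes: a non-empty tuple is truthy
assert result == ("whatever", 3)         # actually compares against the expected tuple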
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1194,7 +1194,7 @@
                 assert False, "always raises"
             return space.newbytes(s)
         if ((encoding is None and space.sys.defaultencoding == 'utf8') or
-             encoding == 'utf-8' or encoding == 'utf8'):
+             encoding == 'utf-8' or encoding == 'utf8' or encoding == 'UTF-8'):
             return space.newbytes(space.utf8_w(w_object))
     if w_encoder is None:
         from pypy.module._codecs.interp_codecs import lookup_codec
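
Both new decoders follow the usual BOM convention in "native" mode: the
encoder writes U+FEFF first, and the decoder looks at the leading bytes to
pick the byte order and skips the mark.  A stand-alone sketch of the UTF-16
case (plain Python; the helper name is made up for illustration):

import sys

def detect_utf16_byteorder(data):
    # Return (byteorder, bytes_to_skip) for a possibly BOM-prefixed buffer.
    if data[:2] == b"\xff\xfe":
        return "little", 2
    if data[:2] == b"\xfe\xff":
        return "big", 2
    return sys.byteorder, 0      # no BOM: fall back to the platform order

assert detect_utf16_byteorder(b"\xff\xfeh\x00i\x00") == ("little", 2)
assert detect_utf16_byteorder(b"\xfe\xff\x00h\x00i") == ("big", 2)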

