[pypy-commit] pypy default: Move utf_32 implementation from runicode to unicodehelper

Tue Dec 12 17:01:40 EST 2017

Author: Ronan Lamy <ronan.lamy at gmail.com>
Branch: 
Changeset: r93400:5d091c15169c
Date: 2017-12-12 22:00 +0000
http://bitbucket.org/pypy/pypy/changeset/5d091c15169c/

Log:	Move utf_32 implementation from runicode to unicodehelper

diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -1,4 +1,7 @@
-from pypy.interpreter.unicodehelper import encode_utf8, decode_utf8
+import pytest
+import struct
+from pypy.interpreter.unicodehelper import (
+    encode_utf8, decode_utf8, unicode_encode_utf_32_be)
 
 class FakeSpace:
     pass
@@ -24,3 +27,23 @@
     assert map(ord, got) == [0xd800, 0xdc00]
     got = decode_utf8(space, "\xf0\x90\x80\x80")
     assert map(ord, got) == [0x10000]
+
+ at pytest.mark.parametrize('unich', [u"\ud800", u"\udc80"])
+def test_utf32_surrogates(unich):
+    assert (unicode_encode_utf_32_be(unich, 1, None) ==
+            struct.pack('>i', ord(unich)))
+    with pytest.raises(UnicodeEncodeError):
+        unicode_encode_utf_32_be(unich, 1, None, allow_surrogates=False)
+
+    def replace_with(ru, rs):
+        def errorhandler(errors, enc, msg, u, startingpos, endingpos):
+            if errors == 'strict':
+                raise UnicodeEncodeError(enc, u, startingpos, endingpos, msg)
+            return ru, rs, endingpos
+        return unicode_encode_utf_32_be(
+            u"<%s>" % unich, 3, None,
+            errorhandler, allow_surrogates=False)
+
+    assert replace_with(u'rep', None) == u'<rep>'.encode('utf-32-be')
+    assert (replace_with(None, '\xca\xfe\xca\xfe') ==
+            '\x00\x00\x00<\xca\xfe\xca\xfe\x00\x00\x00>')
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1,7 +1,11 @@
+from rpython.rlib.objectmodel import specialize
+from rpython.rlib.rarithmetic import intmask
+from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
+from rpython.rlib import runicode
+from rpython.rlib.runicode import (
+    default_unicode_error_encode, default_unicode_error_decode,
+    MAXUNICODE, BYTEORDER, BYTEORDER2, UNICHR)
 from pypy.interpreter.error import OperationError
-from rpython.rlib.objectmodel import specialize
-from rpython.rlib import runicode
-from pypy.module._codecs import interp_codecs
 
 @specialize.memo()
 def decode_error_handler(space):
@@ -37,6 +41,7 @@
 
 # These functions take and return unwrapped rpython strings and unicodes
 def decode_unicode_escape(space, string):
+    from pypy.module._codecs import interp_codecs
     state = space.fromcache(interp_codecs.CodecState)
     unicodedata_handler = state.get_unicodedata_handler(space)
     result, consumed = runicode.str_decode_unicode_escape(
@@ -71,3 +76,229 @@
         uni, len(uni), "strict",
         errorhandler=None,
         allow_surrogates=True)
+
+# ____________________________________________________________
+# utf-32
+
+def str_decode_utf_32(s, size, errors, final=True,
+                      errorhandler=None):
+    result, length, byteorder = str_decode_utf_32_helper(
+        s, size, errors, final, errorhandler, "native")
+    return result, length
+
+def str_decode_utf_32_be(s, size, errors, final=True,
+                         errorhandler=None):
+    result, length, byteorder = str_decode_utf_32_helper(
+        s, size, errors, final, errorhandler, "big")
+    return result, length
+
+def str_decode_utf_32_le(s, size, errors, final=True,
+                         errorhandler=None):
+    result, length, byteorder = str_decode_utf_32_helper(
+        s, size, errors, final, errorhandler, "little")
+    return result, length
+
+def py3k_str_decode_utf_32(s, size, errors, final=True,
+                           errorhandler=None):
+    result, length, byteorder = str_decode_utf_32_helper(
+        s, size, errors, final, errorhandler, "native", 'utf-32-' + BYTEORDER2)
+    return result, length
+
+def py3k_str_decode_utf_32_be(s, size, errors, final=True,
+                              errorhandler=None):
+    result, length, byteorder = str_decode_utf_32_helper(
+        s, size, errors, final, errorhandler, "big", 'utf-32-be')
+    return result, length
+
+def py3k_str_decode_utf_32_le(s, size, errors, final=True,
+                              errorhandler=None):
+    result, length, byteorder = str_decode_utf_32_helper(
+        s, size, errors, final, errorhandler, "little", 'utf-32-le')
+    return result, length
+
+BOM32_DIRECT = intmask(0x0000FEFF)
+BOM32_REVERSE = intmask(0xFFFE0000)
+
+def str_decode_utf_32_helper(s, size, errors, final=True,
+                             errorhandler=None,
+                             byteorder="native",
+                             public_encoding_name='utf32'):
+    if errorhandler is None:
+        errorhandler = default_unicode_error_decode
+    bo = 0
+
+    if BYTEORDER == 'little':
+        iorder = [0, 1, 2, 3]
+    else:
+        iorder = [3, 2, 1, 0]
+
+    #  Check for BOM marks (U+FEFF) in the input and adjust current
+    #  byte order setting accordingly. In native mode, the leading BOM
+    #  mark is skipped, in all other modes, it is copied to the output
+    #  stream as-is (giving a ZWNBSP character).
+    pos = 0
+    if byteorder == 'native':
+        if size >= 4:
+            bom = intmask(
+                (ord(s[iorder[3]]) << 24) | (ord(s[iorder[2]]) << 16) |
+                (ord(s[iorder[1]]) << 8) | ord(s[iorder[0]]))
+            if BYTEORDER == 'little':
+                if bom == BOM32_DIRECT:
+                    pos += 4
+                    bo = -1
+                elif bom == BOM32_REVERSE:
+                    pos += 4
+                    bo = 1
+            else:
+                if bom == BOM32_DIRECT:
+                    pos += 4
+                    bo = 1
+                elif bom == BOM32_REVERSE:
+                    pos += 4
+                    bo = -1
+    elif byteorder == 'little':
+        bo = -1
+    else:
+        bo = 1
+    if size == 0:
+        return u'', 0, bo
+    if bo == -1:
+        # force little endian
+        iorder = [0, 1, 2, 3]
+    elif bo == 1:
+        # force big endian
+        iorder = [3, 2, 1, 0]
+
+    result = UnicodeBuilder(size // 4)
+
+    while pos < size:
+        # remaining bytes at the end? (size should be divisible by 4)
+        if len(s) - pos < 4:
+            if not final:
+                break
+            r, pos = errorhandler(errors, public_encoding_name,
+                                  "truncated data",
+                                  s, pos, len(s))
+            result.append(r)
+            if len(s) - pos < 4:
+                break
+            continue
+        ch = ((ord(s[pos + iorder[3]]) << 24) | (ord(s[pos + iorder[2]]) << 16) |
+            (ord(s[pos + iorder[1]]) << 8) | ord(s[pos + iorder[0]]))
+        if ch >= 0x110000:
+            r, pos = errorhandler(errors, public_encoding_name,
+                                  "codepoint not in range(0x110000)",
+                                  s, pos, len(s))
+            result.append(r)
+            continue
+
+        if MAXUNICODE < 65536 and ch >= 0x10000:
+            ch -= 0x10000L
+            result.append(unichr(0xD800 + (ch >> 10)))
+            result.append(unichr(0xDC00 + (ch & 0x03FF)))
+        else:
+            result.append(UNICHR(ch))
+        pos += 4
+    return result.build(), pos, bo
+
+def _STORECHAR32(result, CH, byteorder):
+    c0 = chr(((CH) >> 24) & 0xff)
+    c1 = chr(((CH) >> 16) & 0xff)
+    c2 = chr(((CH) >> 8) & 0xff)
+    c3 = chr((CH) & 0xff)
+    if byteorder == 'little':
+        result.append(c3)
+        result.append(c2)
+        result.append(c1)
+        result.append(c0)
+    else:
+        result.append(c0)
+        result.append(c1)
+        result.append(c2)
+        result.append(c3)
+
+def unicode_encode_utf_32_helper(s, size, errors,
+                                 errorhandler=None,
+                                 allow_surrogates=True,
+                                 byteorder='little',
+                                 public_encoding_name='utf32'):
+    if errorhandler is None:
+        errorhandler = default_unicode_error_encode
+    if size == 0:
+        if byteorder == 'native':
+            result = StringBuilder(4)
+            _STORECHAR32(result, 0xFEFF, BYTEORDER)
+            return result.build()
+        return ""
+
+    result = StringBuilder(size * 4 + 4)
+    if byteorder == 'native':
+        _STORECHAR32(result, 0xFEFF, BYTEORDER)
+        byteorder = BYTEORDER
+
+    pos = 0
+    while pos < size:
+        ch = ord(s[pos])
+        pos += 1
+        ch2 = 0
+        if not allow_surrogates and 0xD800 <= ch < 0xE000:
+            ru, rs, pos = errorhandler(
+                errors, public_encoding_name, 'surrogates not allowed',
+                s, pos - 1, pos)
+            if rs is not None:
+                # py3k only
+                if len(rs) % 4 != 0:
+                    errorhandler(
+                        'strict', public_encoding_name, 'surrogates not allowed',
+                        s, pos - 1, pos)
+                result.append(rs)
+                continue
+            for ch in ru:
+                if ord(ch) < 0xD800:
+                    _STORECHAR32(result, ord(ch), byteorder)
+                else:
+                    errorhandler(
+                        'strict', public_encoding_name,
+                        'surrogates not allowed', s, pos - 1, pos)
+            continue
+        if 0xD800 <= ch < 0xDC00 and MAXUNICODE < 65536 and pos < size:
+            ch2 = ord(s[pos])
+            if 0xDC00 <= ch2 < 0xE000:
+                ch = (((ch & 0x3FF) << 10) | (ch2 & 0x3FF)) + 0x10000
+                pos += 1
+        _STORECHAR32(result, ch, byteorder)
+
+    return result.build()
+
+def unicode_encode_utf_32(s, size, errors,
+                          errorhandler=None, allow_surrogates=True):
+    return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
+                                        allow_surrogates, "native")
+
+def unicode_encode_utf_32_be(s, size, errors,
+                             errorhandler=None, allow_surrogates=True):
+    return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
+                                        allow_surrogates, "big")
+
+def unicode_encode_utf_32_le(s, size, errors,
+                             errorhandler=None, allow_surrogates=True):
+    return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
+                                        allow_surrogates, "little")
+
+def py3k_unicode_encode_utf_32(s, size, errors,
+                               errorhandler=None, allow_surrogates=True):
+    return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
+                                        allow_surrogates, "native",
+                                        'utf-32-' + BYTEORDER2)
+
+def py3k_unicode_encode_utf_32_be(s, size, errors,
+                                  errorhandler=None, allow_surrogates=True):
+    return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
+                                        allow_surrogates, "big",
+                                        'utf-32-be')
+
+def py3k_unicode_encode_utf_32_le(s, size, errors,
+                                  errorhandler=None, allow_surrogates=True):
+    return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
+                                        allow_surrogates, "little",
+                                        'utf-32-le')
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -1,10 +1,12 @@
 from rpython.rlib import jit
 from rpython.rlib.objectmodel import we_are_translated, not_rpython
 from rpython.rlib.rstring import UnicodeBuilder
+from rpython.rlib import runicode
 from rpython.rlib.runicode import code_to_unichr, MAXUNICODE
 
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
+from pypy.interpreter import unicodehelper
 
 
 class VersionTag(object):
@@ -365,19 +367,19 @@
         raise oefmt(space.w_TypeError, "handler must be callable")
 
 # ____________________________________________________________
-# delegation to runicode
-
-from rpython.rlib import runicode
+# delegation to runicode/unicodehelper
 
 def make_encoder_wrapper(name):
     rname = "unicode_encode_%s" % (name.replace("_encode", ""), )
-    assert hasattr(runicode, rname)
+    try:
+        func = getattr(unicodehelper, rname)
+    except AttributeError:
+        func = getattr(runicode, rname)
     @unwrap_spec(uni=unicode, errors='text_or_none')
     def wrap_encoder(space, uni, errors="strict"):
         if errors is None:
             errors = 'strict'
         state = space.fromcache(CodecState)
-        func = getattr(runicode, rname)
         result = func(uni, len(uni), errors, state.encode_error_handler)
         return space.newtuple([space.newbytes(result), space.newint(len(uni))])
     wrap_encoder.func_name = rname
@@ -385,7 +387,10 @@
 
 def make_decoder_wrapper(name):
     rname = "str_decode_%s" % (name.replace("_decode", ""), )
-    assert hasattr(runicode, rname)
+    try:
+        func = getattr(unicodehelper, rname)
+    except AttributeError:
+        func = getattr(runicode, rname)
     @unwrap_spec(string='bufferstr', errors='text_or_none',
                  w_final=WrappedDefault(False))
     def wrap_decoder(space, string, errors="strict", w_final=None):
@@ -393,7 +398,6 @@
             errors = 'strict'
         final = space.is_true(w_final)
         state = space.fromcache(CodecState)
-        func = getattr(runicode, rname)
         result, consumed = func(string, len(string), errors,
                                 final, state.decode_error_handler)
         return space.newtuple([space.newunicode(result), space.newint(consumed)])
diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -12,6 +12,7 @@
 from pypy.module.cpyext.bytesobject import PyString_Check
 from pypy.module.sys.interp_encoding import setdefaultencoding
 from pypy.module._codecs.interp_codecs import CodecState
+from pypy.interpreter import unicodehelper
 from pypy.objspace.std import unicodeobject
 from rpython.rlib import rstring, runicode
 from rpython.tool.sourcetools import func_renamer
@@ -620,7 +621,7 @@
     else:
         errors = None
 
-    result, length, byteorder = runicode.str_decode_utf_32_helper(
+    result, length, byteorder = unicodehelper.str_decode_utf_32_helper(
         string, size, errors,
         True, # final ? false for multiple passes?
         None, # errorhandler
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -710,7 +710,7 @@
 
 
 # ____________________________________________________________
-# utf-32
+# utf-32 (not used in PyPy any more)
 
 def str_decode_utf_32(s, size, errors, final=True,
                       errorhandler=None):