[pypy-commit] pypy default: Move utf_32 implementation from runicode to unicodehelper
rlamy
pypy.commits at gmail.com
Tue Dec 12 17:01:40 EST 2017
Author: Ronan Lamy <ronan.lamy at gmail.com>
Branch:
Changeset: r93400:5d091c15169c
Date: 2017-12-12 22:00 +0000
http://bitbucket.org/pypy/pypy/changeset/5d091c15169c/
Log: Move utf_32 implementation from runicode to unicodehelper
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -1,4 +1,7 @@
-from pypy.interpreter.unicodehelper import encode_utf8, decode_utf8
+import pytest
+import struct
+from pypy.interpreter.unicodehelper import (
+ encode_utf8, decode_utf8, unicode_encode_utf_32_be)
class FakeSpace:
pass
@@ -24,3 +27,23 @@
assert map(ord, got) == [0xd800, 0xdc00]
got = decode_utf8(space, "\xf0\x90\x80\x80")
assert map(ord, got) == [0x10000]
+
+@pytest.mark.parametrize('unich', [u"\ud800", u"\udc80"])
+def test_utf32_surrogates(unich):
+ assert (unicode_encode_utf_32_be(unich, 1, None) ==
+ struct.pack('>i', ord(unich)))
+ with pytest.raises(UnicodeEncodeError):
+ unicode_encode_utf_32_be(unich, 1, None, allow_surrogates=False)
+
+ def replace_with(ru, rs):
+ def errorhandler(errors, enc, msg, u, startingpos, endingpos):
+ if errors == 'strict':
+ raise UnicodeEncodeError(enc, u, startingpos, endingpos, msg)
+ return ru, rs, endingpos
+ return unicode_encode_utf_32_be(
+ u"<%s>" % unich, 3, None,
+ errorhandler, allow_surrogates=False)
+
+ assert replace_with(u'rep', None) == u'<rep>'.encode('utf-32-be')
+ assert (replace_with(None, '\xca\xfe\xca\xfe') ==
+ '\x00\x00\x00<\xca\xfe\xca\xfe\x00\x00\x00>')
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1,7 +1,11 @@
+from rpython.rlib.objectmodel import specialize
+from rpython.rlib.rarithmetic import intmask
+from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
+from rpython.rlib import runicode
+from rpython.rlib.runicode import (
+ default_unicode_error_encode, default_unicode_error_decode,
+ MAXUNICODE, BYTEORDER, BYTEORDER2, UNICHR)
from pypy.interpreter.error import OperationError
-from rpython.rlib.objectmodel import specialize
-from rpython.rlib import runicode
-from pypy.module._codecs import interp_codecs
@specialize.memo()
def decode_error_handler(space):
@@ -37,6 +41,7 @@
# These functions take and return unwrapped rpython strings and unicodes
def decode_unicode_escape(space, string):
+ from pypy.module._codecs import interp_codecs
state = space.fromcache(interp_codecs.CodecState)
unicodedata_handler = state.get_unicodedata_handler(space)
result, consumed = runicode.str_decode_unicode_escape(
@@ -71,3 +76,229 @@
uni, len(uni), "strict",
errorhandler=None,
allow_surrogates=True)
+
+# ____________________________________________________________
+# utf-32
+
+def str_decode_utf_32(s, size, errors, final=True,
+ errorhandler=None):
+ result, length, byteorder = str_decode_utf_32_helper(
+ s, size, errors, final, errorhandler, "native")
+ return result, length
+
+def str_decode_utf_32_be(s, size, errors, final=True,
+ errorhandler=None):
+ result, length, byteorder = str_decode_utf_32_helper(
+ s, size, errors, final, errorhandler, "big")
+ return result, length
+
+def str_decode_utf_32_le(s, size, errors, final=True,
+ errorhandler=None):
+ result, length, byteorder = str_decode_utf_32_helper(
+ s, size, errors, final, errorhandler, "little")
+ return result, length
+
+def py3k_str_decode_utf_32(s, size, errors, final=True,
+ errorhandler=None):
+ result, length, byteorder = str_decode_utf_32_helper(
+ s, size, errors, final, errorhandler, "native", 'utf-32-' + BYTEORDER2)
+ return result, length
+
+def py3k_str_decode_utf_32_be(s, size, errors, final=True,
+ errorhandler=None):
+ result, length, byteorder = str_decode_utf_32_helper(
+ s, size, errors, final, errorhandler, "big", 'utf-32-be')
+ return result, length
+
+def py3k_str_decode_utf_32_le(s, size, errors, final=True,
+ errorhandler=None):
+ result, length, byteorder = str_decode_utf_32_helper(
+ s, size, errors, final, errorhandler, "little", 'utf-32-le')
+ return result, length
+
+BOM32_DIRECT = intmask(0x0000FEFF)
+BOM32_REVERSE = intmask(0xFFFE0000)
+
+def str_decode_utf_32_helper(s, size, errors, final=True,
+ errorhandler=None,
+ byteorder="native",
+ public_encoding_name='utf32'):
+ if errorhandler is None:
+ errorhandler = default_unicode_error_decode
+ bo = 0
+
+ if BYTEORDER == 'little':
+ iorder = [0, 1, 2, 3]
+ else:
+ iorder = [3, 2, 1, 0]
+
+ # Check for BOM marks (U+FEFF) in the input and adjust current
+ # byte order setting accordingly. In native mode, the leading BOM
+ # mark is skipped, in all other modes, it is copied to the output
+ # stream as-is (giving a ZWNBSP character).
+ pos = 0
+ if byteorder == 'native':
+ if size >= 4:
+ bom = intmask(
+ (ord(s[iorder[3]]) << 24) | (ord(s[iorder[2]]) << 16) |
+ (ord(s[iorder[1]]) << 8) | ord(s[iorder[0]]))
+ if BYTEORDER == 'little':
+ if bom == BOM32_DIRECT:
+ pos += 4
+ bo = -1
+ elif bom == BOM32_REVERSE:
+ pos += 4
+ bo = 1
+ else:
+ if bom == BOM32_DIRECT:
+ pos += 4
+ bo = 1
+ elif bom == BOM32_REVERSE:
+ pos += 4
+ bo = -1
+ elif byteorder == 'little':
+ bo = -1
+ else:
+ bo = 1
+ if size == 0:
+ return u'', 0, bo
+ if bo == -1:
+ # force little endian
+ iorder = [0, 1, 2, 3]
+ elif bo == 1:
+ # force big endian
+ iorder = [3, 2, 1, 0]
+
+ result = UnicodeBuilder(size // 4)
+
+ while pos < size:
+ # remaining bytes at the end? (size should be divisible by 4)
+ if len(s) - pos < 4:
+ if not final:
+ break
+ r, pos = errorhandler(errors, public_encoding_name,
+ "truncated data",
+ s, pos, len(s))
+ result.append(r)
+ if len(s) - pos < 4:
+ break
+ continue
+ ch = ((ord(s[pos + iorder[3]]) << 24) | (ord(s[pos + iorder[2]]) << 16) |
+ (ord(s[pos + iorder[1]]) << 8) | ord(s[pos + iorder[0]]))
+ if ch >= 0x110000:
+ r, pos = errorhandler(errors, public_encoding_name,
+ "codepoint not in range(0x110000)",
+ s, pos, len(s))
+ result.append(r)
+ continue
+
+ if MAXUNICODE < 65536 and ch >= 0x10000:
+ ch -= 0x10000L
+ result.append(unichr(0xD800 + (ch >> 10)))
+ result.append(unichr(0xDC00 + (ch & 0x03FF)))
+ else:
+ result.append(UNICHR(ch))
+ pos += 4
+ return result.build(), pos, bo
+
+def _STORECHAR32(result, CH, byteorder):
+ c0 = chr(((CH) >> 24) & 0xff)
+ c1 = chr(((CH) >> 16) & 0xff)
+ c2 = chr(((CH) >> 8) & 0xff)
+ c3 = chr((CH) & 0xff)
+ if byteorder == 'little':
+ result.append(c3)
+ result.append(c2)
+ result.append(c1)
+ result.append(c0)
+ else:
+ result.append(c0)
+ result.append(c1)
+ result.append(c2)
+ result.append(c3)
+
+def unicode_encode_utf_32_helper(s, size, errors,
+ errorhandler=None,
+ allow_surrogates=True,
+ byteorder='little',
+ public_encoding_name='utf32'):
+ if errorhandler is None:
+ errorhandler = default_unicode_error_encode
+ if size == 0:
+ if byteorder == 'native':
+ result = StringBuilder(4)
+ _STORECHAR32(result, 0xFEFF, BYTEORDER)
+ return result.build()
+ return ""
+
+ result = StringBuilder(size * 4 + 4)
+ if byteorder == 'native':
+ _STORECHAR32(result, 0xFEFF, BYTEORDER)
+ byteorder = BYTEORDER
+
+ pos = 0
+ while pos < size:
+ ch = ord(s[pos])
+ pos += 1
+ ch2 = 0
+ if not allow_surrogates and 0xD800 <= ch < 0xE000:
+ ru, rs, pos = errorhandler(
+ errors, public_encoding_name, 'surrogates not allowed',
+ s, pos - 1, pos)
+ if rs is not None:
+ # py3k only
+ if len(rs) % 4 != 0:
+ errorhandler(
+ 'strict', public_encoding_name, 'surrogates not allowed',
+ s, pos - 1, pos)
+ result.append(rs)
+ continue
+ for ch in ru:
+ if ord(ch) < 0xD800:
+ _STORECHAR32(result, ord(ch), byteorder)
+ else:
+ errorhandler(
+ 'strict', public_encoding_name,
+ 'surrogates not allowed', s, pos - 1, pos)
+ continue
+ if 0xD800 <= ch < 0xDC00 and MAXUNICODE < 65536 and pos < size:
+ ch2 = ord(s[pos])
+ if 0xDC00 <= ch2 < 0xE000:
+ ch = (((ch & 0x3FF) << 10) | (ch2 & 0x3FF)) + 0x10000
+ pos += 1
+ _STORECHAR32(result, ch, byteorder)
+
+ return result.build()
+
+def unicode_encode_utf_32(s, size, errors,
+ errorhandler=None, allow_surrogates=True):
+ return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
+ allow_surrogates, "native")
+
+def unicode_encode_utf_32_be(s, size, errors,
+ errorhandler=None, allow_surrogates=True):
+ return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
+ allow_surrogates, "big")
+
+def unicode_encode_utf_32_le(s, size, errors,
+ errorhandler=None, allow_surrogates=True):
+ return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
+ allow_surrogates, "little")
+
+def py3k_unicode_encode_utf_32(s, size, errors,
+ errorhandler=None, allow_surrogates=True):
+ return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
+ allow_surrogates, "native",
+ 'utf-32-' + BYTEORDER2)
+
+def py3k_unicode_encode_utf_32_be(s, size, errors,
+ errorhandler=None, allow_surrogates=True):
+ return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
+ allow_surrogates, "big",
+ 'utf-32-be')
+
+def py3k_unicode_encode_utf_32_le(s, size, errors,
+ errorhandler=None, allow_surrogates=True):
+ return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
+ allow_surrogates, "little",
+ 'utf-32-le')
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -1,10 +1,12 @@
from rpython.rlib import jit
from rpython.rlib.objectmodel import we_are_translated, not_rpython
from rpython.rlib.rstring import UnicodeBuilder
+from rpython.rlib import runicode
from rpython.rlib.runicode import code_to_unichr, MAXUNICODE
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
+from pypy.interpreter import unicodehelper
class VersionTag(object):
@@ -365,19 +367,19 @@
raise oefmt(space.w_TypeError, "handler must be callable")
# ____________________________________________________________
-# delegation to runicode
-
-from rpython.rlib import runicode
+# delegation to runicode/unicodehelper
def make_encoder_wrapper(name):
rname = "unicode_encode_%s" % (name.replace("_encode", ""), )
- assert hasattr(runicode, rname)
+ try:
+ func = getattr(unicodehelper, rname)
+ except AttributeError:
+ func = getattr(runicode, rname)
@unwrap_spec(uni=unicode, errors='text_or_none')
def wrap_encoder(space, uni, errors="strict"):
if errors is None:
errors = 'strict'
state = space.fromcache(CodecState)
- func = getattr(runicode, rname)
result = func(uni, len(uni), errors, state.encode_error_handler)
return space.newtuple([space.newbytes(result), space.newint(len(uni))])
wrap_encoder.func_name = rname
@@ -385,7 +387,10 @@
def make_decoder_wrapper(name):
rname = "str_decode_%s" % (name.replace("_decode", ""), )
- assert hasattr(runicode, rname)
+ try:
+ func = getattr(unicodehelper, rname)
+ except AttributeError:
+ func = getattr(runicode, rname)
@unwrap_spec(string='bufferstr', errors='text_or_none',
w_final=WrappedDefault(False))
def wrap_decoder(space, string, errors="strict", w_final=None):
@@ -393,7 +398,6 @@
errors = 'strict'
final = space.is_true(w_final)
state = space.fromcache(CodecState)
- func = getattr(runicode, rname)
result, consumed = func(string, len(string), errors,
final, state.decode_error_handler)
return space.newtuple([space.newunicode(result), space.newint(consumed)])
diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -12,6 +12,7 @@
from pypy.module.cpyext.bytesobject import PyString_Check
from pypy.module.sys.interp_encoding import setdefaultencoding
from pypy.module._codecs.interp_codecs import CodecState
+from pypy.interpreter import unicodehelper
from pypy.objspace.std import unicodeobject
from rpython.rlib import rstring, runicode
from rpython.tool.sourcetools import func_renamer
@@ -620,7 +621,7 @@
else:
errors = None
- result, length, byteorder = runicode.str_decode_utf_32_helper(
+ result, length, byteorder = unicodehelper.str_decode_utf_32_helper(
string, size, errors,
True, # final ? false for multiple passes?
None, # errorhandler
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -710,7 +710,7 @@
# ____________________________________________________________
-# utf-32
+# utf-32 (not used in PyPy any more)
def str_decode_utf_32(s, size, errors, final=True,
errorhandler=None):
More information about the pypy-commit
mailing list