[pypy-commit] pypy unicode-utf8: Add utf8-based replacement for runicode.unicode_encode_decimal() to unicodehelper and fix PyUnicode_EncodeDecimal()
rlamy
pypy.commits at gmail.com
Fri Dec 8 20:36:44 EST 2017
Author: Ronan Lamy <ronan.lamy at gmail.com>
Branch: unicode-utf8
Changeset: r93319:ac75e33e51bb
Date: 2017-12-09 01:36 +0000
http://bitbucket.org/pypy/pypy/changeset/ac75e33e51bb/
Log: Add utf8-based replacement for runicode.unicode_encode_decimal() to
unicodehelper and fix PyUnicode_EncodeDecimal()
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -1,3 +1,4 @@
+import pytest
from hypothesis import given, strategies
from rpython.rlib import rutf8
@@ -5,6 +6,7 @@
from pypy.interpreter.unicodehelper import str_decode_utf8
from pypy.interpreter.unicodehelper import utf8_encode_ascii, str_decode_ascii
from pypy.interpreter import unicodehelper as uh
+from pypy.module._codecs.interp_codecs import CodecState
def decode_utf8(u):
return str_decode_utf8(u, True, "strict", None)
@@ -68,3 +70,16 @@
def test_unicode_escape(u):
r = uh.utf8_encode_unicode_escape(u.encode("utf8"), "strict", None)
assert r == u.encode("unicode-escape")
+
+def test_encode_decimal(space):
+ assert uh.unicode_encode_decimal(u' 12, 34 ', None) == ' 12, 34 '
+ with pytest.raises(ValueError):
+ uh.unicode_encode_decimal(u' 12, \u1234 '.encode('utf8'), None)
+ state = space.fromcache(CodecState)
+ handler = state.encode_error_handler
+ assert uh.unicode_encode_decimal(
+ u'u\u1234\u1235v'.encode('utf8'), 'replace', handler) == 'u??v'
+
+ result = uh.unicode_encode_decimal(
+ u'12\u1234'.encode('utf8'), 'xmlcharrefreplace', handler)
+ assert result == '12&#4660;'
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -7,6 +7,7 @@
from rpython.rlib.rstring import StringBuilder
from rpython.rtyper.lltypesystem import rffi
from pypy.module._codecs import interp_codecs
+from pypy.module.unicodedata import unicodedb
@specialize.memo()
def decode_error_handler(space):
@@ -35,6 +36,16 @@
space.newtext(msg)]))
return raise_unicode_exception_encode
+def default_error_encode(
+ errors, encoding, msg, u, startingpos, endingpos):
+ """A default handler, for tests"""
+ assert endingpos >= 0
+ if errors == 'replace':
+ return '?', endingpos
+ if errors == 'ignore':
+ return '', endingpos
+ raise ValueError
+
def convert_arg_to_w_unicode(space, w_arg, strict=None):
return space.convert_arg_to_w_unicode(w_arg)
@@ -1458,3 +1469,70 @@
pos = rutf8.next_codepoint_pos(s, pos)
return result.build()
+# ____________________________________________________________
+# Decimal Encoder
+def unicode_encode_decimal(s, errors, errorhandler=None):
+ """Converts whitespace to ' ', decimal characters to their
+ corresponding ASCII digit and all other Latin-1 characters except
+ \0 as-is. Characters outside this range (Unicode ordinals 1-256)
+ are treated as errors. This includes embedded NULL bytes.
+ """
+ if errorhandler is None:
+ errorhandler = default_error_encode
+ result = StringBuilder(len(s))
+ pos = 0
+ i = 0
+ it = rutf8.Utf8StringIterator(s)
+ for ch in it:
+ if unicodedb.isspace(ch):
+ result.append(' ')
+ i += 1
+ continue
+ try:
+ decimal = unicodedb.decimal(ch)
+ except KeyError:
+ pass
+ else:
+ result.append(chr(48 + decimal))
+ i += 1
+ continue
+ if 0 < ch < 256:
+ result.append(chr(ch))
+ i += 1
+ continue
+ # All other characters are considered unencodable
+ start_index = i
+ i += 1
+ while not it.done():
+ ch = rutf8.codepoint_at_pos(s, it.get_pos())
+ try:
+ if (0 < ch < 256 or unicodedb.isspace(ch) or
+ unicodedb.decimal(ch) >= 0):
+ break
+ except KeyError:
+ # not a decimal
+ pass
+ if it.done():
+ break
+ ch = next(it)
+ i += 1
+ end_index = i
+ msg = "invalid decimal Unicode string"
+ r, pos = errorhandler(
+ errors, 'decimal', msg, s, start_index, end_index)
+ for ch in rutf8.Utf8StringIterator(r):
+ if unicodedb.isspace(ch):
+ result.append(' ')
+ continue
+ try:
+ decimal = unicodedb.decimal(ch)
+ except KeyError:
+ pass
+ else:
+ result.append(chr(48 + decimal))
+ continue
+ if 0 < ch < 256:
+ result.append(chr(ch))
+ continue
+ errorhandler('strict', 'decimal', msg, s, start_index, end_index)
+ return result.build()
diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -4,7 +4,8 @@
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.unicodehelper import (
- wcharpsize2utf8, str_decode_utf_16_helper, str_decode_utf_32_helper)
+ wcharpsize2utf8, str_decode_utf_16_helper, str_decode_utf_32_helper,
+ unicode_encode_decimal)
from pypy.module.unicodedata import unicodedb
from pypy.module.cpyext.api import (
CANNOT_FAIL, Py_ssize_t, build_type_checkers_flags, cpython_api,
@@ -643,14 +644,13 @@
Returns 0 on success, -1 on failure.
"""
- u = rffi.wcharpsize2unicode(s, length)
+ u = rffi.wcharpsize2utf8(s, length)
if llerrors:
errors = rffi.charp2str(llerrors)
else:
errors = None
state = space.fromcache(CodecState)
- result = runicode.unicode_encode_decimal(u, length, errors,
- state.encode_error_handler)
+ result = unicode_encode_decimal(u, errors, state.encode_error_handler)
i = len(result)
output[i] = '\0'
i -= 1
More information about the pypy-commit
mailing list