[pypy-commit] pypy unicode-utf8-py3: return utf8, len, pos from decoders
mattip
pypy.commits at gmail.com
Sat Aug 4 18:01:28 EDT 2018
Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r94937:8364a4fb4acd
Date: 2018-07-31 14:17 -0700
http://bitbucket.org/pypy/pypy/changeset/8364a4fb4acd/
Log: return utf8, len, pos from decoders
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -152,10 +152,9 @@
return result_utf8, length
def decode_raw_unicode_escape(space, string):
- result_utf8, lgt = str_decode_raw_unicode_escape(
+ return str_decode_raw_unicode_escape(
string, "strict",
final=True, errorhandler=decode_error_handler(space))
- return result_utf8, lgt
def check_ascii_or_raise(space, string):
try:
@@ -181,7 +180,7 @@
def str_decode_ascii(s, errors, final, errorhandler):
try:
rutf8.check_ascii(s)
- return s, len(s)
+ return s, len(s), len(s)
except rutf8.CheckError:
return _str_decode_ascii_slowpath(s, errors, final, errorhandler)
@@ -199,12 +198,12 @@
i += 1
ress = res.build()
lgt = rutf8.check_utf8(ress, True)
- return ress, lgt
+ return ress, lgt, lgt
def str_decode_latin_1(s, errors, final, errorhandler):
try:
rutf8.check_ascii(s)
- return s, len(s)
+ return s, len(s), len(s)
except rutf8.CheckError:
return _str_decode_latin_1_slowpath(s, errors, final, errorhandler)
@@ -224,7 +223,7 @@
res.append_slice(s, start, end)
i = end
# cannot be ASCII, cannot have surrogates, I believe
- return res.build(), len(s)
+ return res.build(), len(s), len(s)
def utf8_encode_latin_1(s, errors, errorhandler):
try:
@@ -430,7 +429,7 @@
res.append(r)
r = res.build()
- return r, rutf8.check_utf8(r, True)
+ return r, rutf8.check_utf8(r, True), pos
hexdigits = "0123456789ABCDEFabcdef"
@@ -650,7 +649,7 @@
pos = hexescape(builder, s, pos, digits,
"rawunicodeescape", errorhandler, message, errors)
- return builder.build(), builder.getlength()
+ return builder.build(), builder.getlength(), pos
_utf8_encode_unicode_escape = rutf8.make_utf8_escape_function()
@@ -785,7 +784,7 @@
errorhandler=None):
size = len(s)
if size == 0:
- return '', 0
+ return '', 0, 0
inShift = False
base64bits = 0
@@ -920,7 +919,7 @@
final_length = shiftOutStartPos # back off output
assert final_length >= 0
- return result.build()[:final_length], outsize
+ return result.build()[:final_length], outsize, size
def utf8_encode_utf_7(s, errors, errorhandler):
size = len(s)
@@ -1010,21 +1009,15 @@
def str_decode_utf_16(s, errors, final=True,
errorhandler=None):
- result, lgt = str_decode_utf_16_helper(s, errors, final,
- errorhandler, "native")
- return result, lgt
+ return str_decode_utf_16_helper(s, errors, final, errorhandler, "native")
def str_decode_utf_16_be(s, errors, final=True,
errorhandler=None):
- result, lgt = str_decode_utf_16_helper(s, errors, final,
- errorhandler, "big")
- return result, lgt
+ return str_decode_utf_16_helper(s, errors, final, errorhandler, "big")
def str_decode_utf_16_le(s, errors, final=True,
errorhandler=None):
- result, lgt = str_decode_utf_16_helper(s, errors, final,
- errorhandler, "little")
- return result, lgt
+ return str_decode_utf_16_helper(s, errors, final, errorhandler, "little")
def str_decode_utf_16_helper(s, errors, final=True,
errorhandler=None,
@@ -1067,7 +1060,7 @@
else:
bo = 1
if size == 0:
- return '', 0
+ return '', 0, 0
if bo == -1:
# force little endian
ihi = 1
@@ -1127,7 +1120,7 @@
result.append(r)
r = result.build()
lgt = rutf8.check_utf8(r, True)
- return result.build(), lgt
+ return result.build(), lgt, pos
def _STORECHAR(result, CH, byteorder):
hi = chr(((CH) >> 8) & 0xff)
@@ -1235,24 +1228,21 @@
def str_decode_utf_32(s, errors, final=True,
errorhandler=None):
- result, lgt = str_decode_utf_32_helper(
+ return str_decode_utf_32_helper(
s, errors, final, errorhandler, "native", 'utf-32-' + BYTEORDER2,
allow_surrogates=False)
- return result, lgt
def str_decode_utf_32_be(s, errors, final=True,
errorhandler=None):
- result, lgt = str_decode_utf_32_helper(
+ return str_decode_utf_32_helper(
s, errors, final, errorhandler, "big", 'utf-32-be',
allow_surrogates=False)
- return result, lgt
def str_decode_utf_32_le(s, errors, final=True,
errorhandler=None):
- result, lgt = str_decode_utf_32_helper(
+ return str_decode_utf_32_helper(
s, errors, final, errorhandler, "little", 'utf-32-le',
allow_surrogates=False)
- return result, lgt
BOM32_DIRECT = intmask(0x0000FEFF)
BOM32_REVERSE = intmask(0xFFFE0000)
@@ -1300,7 +1290,7 @@
else:
bo = 1
if size == 0:
- return '', 0
+ return '', 0, 0
if bo == -1:
# force little endian
iorder = [0, 1, 2, 3]
@@ -1342,7 +1332,7 @@
pos += 4
r = result.build()
lgt = rutf8.check_utf8(r, True)
- return r, lgt
+ return r, lgt, pos
def _STORECHAR32(result, CH, byteorder):
c0 = chr(((CH) >> 24) & 0xff)
@@ -1615,7 +1605,7 @@
pos += 1
r = result.build()
lgt = rutf8.codepoints_in_utf8(r)
- return r, lgt
+ return r, lgt, pos
def utf8_encode_charmap(s, errors, errorhandler=None, mapping=None):
if mapping is None:
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -50,7 +50,7 @@
else:
w_cls = space.w_UnicodeEncodeError
length = rutf8.codepoints_in_utf8(input)
- w_input = space.newtext((input, length))
+ w_input = space.newtext((input, length, length))
w_exc = space.call_function(
w_cls,
space.newtext(encoding),
@@ -266,7 +266,6 @@
def backslashreplace_errors(space, w_exc):
- import pdb;pdb.set_trace()
check_exception(space, w_exc)
if (space.isinstance_w(w_exc, space.w_UnicodeEncodeError) or
space.isinstance_w(w_exc, space.w_UnicodeTranslateError)):
@@ -650,8 +649,9 @@
errors = 'strict'
final = space.is_true(w_final)
state = space.fromcache(CodecState)
- result, length = func(string, errors, final, state.decode_error_handler)
- return space.newtuple([space.newutf8(result, length), space.newint(length)])
+ result, length, pos = func(string, errors, final, state.decode_error_handler)
+ # must return bytes, len_of_original_string
+ return space.newtuple([space.newutf8(result, length), space.newint(pos)])
wrap_decoder.__name__ = func.__name__
globals()[name] = wrap_decoder
@@ -705,11 +705,11 @@
errors = 'strict'
final = space.is_true(w_final)
state = space.fromcache(CodecState)
- result, length = runicode.str_decode_mbcs(
+ result, length, pos = runicode.str_decode_mbcs(
string, len(string), errors,
final, state.decode_error_handler,
force_ignore=False)
- return space.newtuple([space.newtext(result, length), space.newint(length)])
+ return space.newtuple([space.newtext(result, length), space.newint(pos)])
# utf-8 functions are not regular, because we have to pass
# "allow_surrogates=False"
@@ -739,7 +739,7 @@
try:
lgt = rutf8.check_utf8(string, allow_surrogates=True)
except rutf8.CheckError:
- res, lgt = unicodehelper.str_decode_utf8(string,
+ res, lgt, pos = unicodehelper.str_decode_utf8(string,
errors, final, state.decode_error_handler)
return space.newtuple([space.newutf8(res, lgt),
space.newint(lgt)])
@@ -762,7 +762,7 @@
byteorder = 'little'
else:
byteorder = 'big'
- res, lgt = str_decode_utf_16_helper(
+ res, lgt, pos = str_decode_utf_16_helper(
data, errors, final,
state.decode_error_handler, byteorder)
return space.newtuple([space.newutf8(res, lgt),
@@ -781,7 +781,7 @@
byteorder = 'little'
else:
byteorder = 'big'
- res, lgt = str_decode_utf_32_helper(
+ res, lgt, pos = str_decode_utf_32_helper(
data, errors, final,
state.decode_error_handler, byteorder)
return space.newtuple([space.newutf8(res, lgt),
@@ -882,7 +882,7 @@
final = True
state = space.fromcache(CodecState)
- result, lgt = unicodehelper.str_decode_charmap(
+ result, lgt, pos = unicodehelper.str_decode_charmap(
string, errors, final, state.decode_error_handler, mapping)
return space.newtuple([space.newutf8(result, lgt),
space.newint(len(string))])
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -142,14 +142,10 @@
assert decode(br"[\x0]\x0", "replace") == (b"[?]?", 8)
def test_unicode_escape(self):
- import sys
from _codecs import unicode_escape_encode, unicode_escape_decode
assert unicode_escape_encode('abc') == ('abc'.encode('unicode_escape'), 3)
assert unicode_escape_decode(b'abc') == (b'abc'.decode('unicode_escape'), 3)
- if sys.version_info[0] < 3:
- lgt = 12
- else:
- lgt = 3
+ lgt = 12
assert unicode_escape_decode(b'\\x61\\x62\\x63') == ('abc', lgt)
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -384,10 +384,10 @@
if isinstance(s, unicode):
s, lgt = s.encode('utf8'), len(s)
elif isinstance(s, str):
- s, lgt = decode_utf8sp(self, s)
+ s, lgt, codepoints = decode_utf8sp(self, s)
elif isinstance(s, tuple):
# result of decode_utf8
- s, lgt = s
+ s, lgt, codepoints = s
else:
# XXX what is s ?
lgt = rutf8.check_utf8(s, True)
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1185,14 +1185,6 @@
def encode_object(space, w_object, encoding, errors):
utf8 = space.utf8_w(w_object)
- idx = rutf8.surrogate_in_utf8(utf8)
- if idx >= 0:
- print 'surrogate in unicodeobject.encode_object(', w_object, ',', encoding, ',', errors, ')', 'raising'
- if errors is None:
- w_err_handler = unicodehelper.encode_error_handler(space)
- else:
- w_err_handler = lookup_error(space, errors)
- w_err_handler(None, "utf8", "surrogates not allowed", utf8, idx, idx + 1)
if errors is None or errors == 'strict':
if encoding is None or encoding == 'utf-8':
#if rutf8.has_surrogates(utf8):
More information about the pypy-commit mailing list