[pypy-commit] pypy unicode-utf8-py3: fix tests, then fix implementations to pass more tests
mattip
pypy.commits at gmail.com
Fri Jul 27 16:02:25 EDT 2018
Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r94904:fed03b2241ff
Date: 2018-07-27 16:00 -0400
http://bitbucket.org/pypy/pypy/changeset/fed03b2241ff/
Log: fix tests, then fix implementations to pass more tests
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -1721,7 +1721,9 @@
return w_obj.convert_to_w_unicode(self)
def realunicode_w(self, w_obj):
- return w_obj.realunicode_w(self)
+ from rpython.rlib.runicode import str_decode_utf_8
+ utf8 = self.utf8_w(w_obj)
+ return str_decode_utf_8(utf8, len(utf8), 'strict', True)[0]
def utf8_0_w(self, w_obj):
"Like utf8_w, but rejects strings with NUL bytes."
diff --git a/pypy/interpreter/error.py b/pypy/interpreter/error.py
--- a/pypy/interpreter/error.py
+++ b/pypy/interpreter/error.py
@@ -472,16 +472,16 @@
assert len(formats) > 0, "unsupported: no % command found"
return tuple(parts), tuple(formats)
- at specialize.arg(1)
def _decode_utf8(string):
# when building the error message, don't crash if the byte string
# provided is not valid UTF-8
if isinstance(string, unicode):
return string
assert isinstance(string, str)
- result, consumed = runicode.str_decode_utf_8(
- string, len(string), "replace", final=True)
- return result
+ return string.decode('utf8')
+ #result, consumed = runicode.str_decode_utf_8(
+ # string, len(string), "replace", final=True)
+ #return result
def get_operrcls2(valuefmt):
valuefmt = valuefmt.decode('ascii')
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -147,7 +147,7 @@
result_utf8, length = str_decode_unicode_escape(
string, "strict",
final=True,
- errorhandler=decode_error_handler(space),
+ errorhandler=state.decode_error_handler,
ud_handler=unicodedata_handler)
return result_utf8, length
@@ -161,7 +161,6 @@
try:
rutf8.check_ascii(string)
except rutf8.CheckError as e:
- print 'check_ascii_or_raise', string
decode_error_handler(space)('strict', 'ascii',
'ordinal not in range(128)', string,
e.pos, e.pos + 1)
@@ -172,12 +171,12 @@
# If there happen to be two 3-bytes encoding a pair of surrogates,
# you still get two surrogate unicode characters in the result.
assert isinstance(string, str)
- result, lgth = runicode.str_decode_utf_8(
- string, len(string), "strict",
- final=True, errorhandler=decode_error_handler(space),
- # XXX handle surrogates
- allow_surrogates=False)
- return len(result)
+ try:
+ return rutf8.check_utf8(string, True, start, end)
+ except rutf8.CheckError as e:
+ decode_error_handler(space)('strict', 'utf8',
+ 'unexpected end of data', string,
+ e.pos, e.pos + 1)
def str_decode_ascii(s, errors, final, errorhandler):
try:
@@ -298,10 +297,9 @@
return result.build()
if sys.platform == 'win32':
- def utf8_encode_mbcs(s, errors, errorhandler):
+ def utf8_encode_mbcs(s, slen, errors, errorhandler):
from rpython.rlib import runicode
s = s.decode('utf-8')
- slen = len(s)
res = runicode.unicode_encode_mbcs(s, slen, errors, errorhandler)
return res
@@ -647,7 +645,7 @@
continue
digits = 4 if s[pos] == 'u' else 8
- message = "truncated \\uXXXX"
+ message = "truncated \\uXXXX escape"
pos += 1
pos = hexescape(builder, s, pos, digits,
"rawunicodeescape", errorhandler, message, errors)
@@ -1141,7 +1139,7 @@
result.append(hi)
result.append(lo)
-def unicode_encode_utf_16_helper(s, errors,
+def utf8_encode_utf_16_helper(s, errors,
errorhandler=None,
allow_surrogates=True,
byteorder='little',
@@ -1213,21 +1211,21 @@
def utf8_encode_utf_16(s, errors,
errorhandler=None,
allow_surrogates=True):
- return unicode_encode_utf_16_helper(s, errors, errorhandler,
+ return utf8_encode_utf_16_helper(s, errors, errorhandler,
allow_surrogates, "native",
'utf-16-' + BYTEORDER2)
def utf8_encode_utf_16_be(s, errors,
errorhandler=None,
allow_surrogates=True):
- return unicode_encode_utf_16_helper(s, errors, errorhandler,
+ return utf8_encode_utf_16_helper(s, errors, errorhandler,
allow_surrogates, "big",
'utf-16-be')
def utf8_encode_utf_16_le(s, errors,
errorhandler=None,
allow_surrogates=True):
- return unicode_encode_utf_16_helper(s, errors, errorhandler,
+ return utf8_encode_utf_16_helper(s, errors, errorhandler,
allow_surrogates, "little",
'utf-16-le')
@@ -1237,7 +1235,7 @@
def str_decode_utf_32(s, errors, final=True,
errorhandler=None):
- result, lgt_ = str_decode_utf_32_helper(
+ result, lgt = str_decode_utf_32_helper(
s, errors, final, errorhandler, "native", 'utf-32-' + BYTEORDER2,
allow_surrogates=False)
return result, lgt
@@ -1251,10 +1249,10 @@
def str_decode_utf_32_le(s, errors, final=True,
errorhandler=None):
- result, lgt_ = str_decode_utf_32_helper(
+ result, lgt = str_decode_utf_32_helper(
s, errors, final, errorhandler, "little", 'utf-32-le',
allow_surrogates=False)
- return result, c, lgt
+ return result, lgt
BOM32_DIRECT = intmask(0x0000FEFF)
BOM32_REVERSE = intmask(0xFFFE0000)
@@ -1362,11 +1360,79 @@
result.append(c2)
result.append(c3)
+def utf8_encode_utf_32_helper(s, errors,
+ errorhandler=None,
+ allow_surrogates=True,
+ byteorder='little',
+ public_encoding_name='utf32'):
+ # s is utf8
+ size = len(s)
+ if size == 0:
+ if byteorder == 'native':
+ result = StringBuilder(4)
+ _STORECHAR32(result, 0xFEFF, BYTEORDER)
+ return result.build()
+ return ""
+
+ result = StringBuilder(size * 4 + 4)
+ if byteorder == 'native':
+ _STORECHAR32(result, 0xFEFF, BYTEORDER)
+ byteorder = BYTEORDER
+
+ pos = 0
+ index = 0
+ while pos < size:
+ try:
+ ch = rutf8.codepoint_at_pos(s, pos)
+ pos = rutf8.next_codepoint_pos(s, pos)
+ except IndexError:
+ # malformed codepoint, blindly use ch
+ ch = ord(s[pos])
+ pos += 1
+ if errorhandler:
+ res_8, newindex = errorhandler(
+ errors, public_encoding_name, 'malformed unicode',
+ s, pos - 1, pos)
+ if res_8:
+ for cp in rutf8.Utf8StringIterator(res_8):
+ if cp < 0xD800:
+ _STORECHAR32(result, cp, byteorder)
+ else:
+ errorhandler('strict', public_encoding_name,
+ 'malformed unicode',
+ s, pos-1, pos)
+ else:
+ _STORECHAR32(result, ch, byteorder)
+ else:
+ _STORECHAR32(result, ch, byteorder)
+ index += 1
+ continue
+ if not allow_surrogates and 0xD800 <= ch < 0xE000:
+ res_8, newindex = errorhandler(
+ errors, public_encoding_name, 'surrogates not allowed',
+ s, pos - 1, pos)
+ for ch in rutf8.Utf8StringIterator(res_8):
+ if ch < 0xD800:
+ _STORECHAR32(result, ch, byteorder)
+ else:
+ errorhandler(
+ 'strict', public_encoding_name, 'surrogates not allowed',
+ s, pos - 1, pos)
+ if index != newindex: # Should be uncommon
+ index = newindex
+ pos = rutf8._pos_at_index(s, newindex)
+ continue
+ _STORECHAR32(result, ch, byteorder)
+ index += 1
+
+ return result.build()
+
def unicode_encode_utf_32_helper(s, errors,
errorhandler=None,
allow_surrogates=True,
byteorder='little',
public_encoding_name='utf32'):
+ # s is unicode
size = len(s)
if size == 0:
if byteorder == 'native':
@@ -1430,19 +1496,19 @@
def utf8_encode_utf_32(s, errors,
errorhandler=None, allow_surrogates=True):
- return unicode_encode_utf_32_helper(s.decode('utf8'), errors, errorhandler,
+ return utf8_encode_utf_32_helper(s, errors, errorhandler,
allow_surrogates, "native",
'utf-32-' + BYTEORDER2)
def utf8_encode_utf_32_be(s, errors,
errorhandler=None, allow_surrogates=True):
- return unicode_encode_utf_32_helper(s.decode('utf8'), errors, errorhandler,
+ return utf8_encode_utf_32_helper(s, errors, errorhandler,
allow_surrogates, "big",
'utf-32-be')
def utf8_encode_utf_32_le(s, errors,
errorhandler=None, allow_surrogates=True):
- return unicode_encode_utf_32_helper(s.decode('utf8'), errors, errorhandler,
+ return utf8_encode_utf_32_helper(s, errors, errorhandler,
allow_surrogates, "little",
'utf-32-le')
# ____________________________________________________________
@@ -1548,14 +1614,13 @@
result.append(c)
pos += 1
r = result.build()
- lgt = rutf8.check_utf8(r, True)
+ lgt = rutf8.codepoints_in_utf8(r)
return r, lgt
def utf8_encode_charmap(s, errors, errorhandler=None, mapping=None):
- size = len(s)
if mapping is None:
return utf8_encode_latin_1(s, errors, errorhandler=errorhandler)
-
+ size = len(s)
if size == 0:
return ''
result = StringBuilder(size)
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -49,8 +49,8 @@
length = len(input)
else:
w_cls = space.w_UnicodeEncodeError
- length = rutf8.check_utf8(input, allow_surrogates=True)
- w_input = space.newutf8(input, length)
+ length = rutf8.codepoints_in_utf8(input)
+ w_input = space.newtext((input, length))
w_exc = space.call_function(
w_cls,
space.newtext(encoding),
@@ -89,8 +89,7 @@
raise oefmt(space.w_IndexError,
"position %d from error handler out of bounds",
newpos)
- w_replace = space.convert_to_w_unicode(w_replace)
- return w_replace._utf8, newpos
+ return space.utf8_w(w_replace), newpos
return call_errorhandler
def make_decode_errorhandler(self, space):
@@ -238,7 +237,6 @@
"don't know how to handle %T in error callback", w_exc)
def xmlcharrefreplace_errors(space, w_exc):
- from pypy.interpreter import unicodehelper
check_exception(space, w_exc)
if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
@@ -267,8 +265,8 @@
"don't know how to handle %T in error callback", w_exc)
def backslashreplace_errors(space, w_exc):
- from pypy.interpreter import unicodehelper
+ import pdb;pdb.set_trace()
check_exception(space, w_exc)
if (space.isinstance_w(w_exc, space.w_UnicodeEncodeError) or
space.isinstance_w(w_exc, space.w_UnicodeTranslateError)):
@@ -630,34 +628,14 @@
func = _find_implementation(rname)
@unwrap_spec(errors='text_or_none')
def wrap_encoder(space, w_arg, errors="strict"):
- from pypy.interpreter import unicodehelper
-
+ # w_arg is a W_Unicode or W_Bytes?
w_arg = unicodehelper.convert_arg_to_w_unicode(space, w_arg, rname)
if errors is None:
errors = 'strict'
state = space.fromcache(CodecState)
- utf8len = w_arg._length
- # XXX deal with func() returning length or not
+ ulen = w_arg._length
result = func(w_arg._utf8, errors, state.encode_error_handler)
- return space.newtuple([space.newbytes(result.encode('utf8')), space.newint(utf8len)])
- wrap_encoder.__name__ = func.__name__
- globals()[name] = wrap_encoder
-
-def make_utf_encoder_wrapper(name):
- rname = "unicode_encode_%s" % (name.replace("_encode", ""), )
- func = _find_implementation(rname)
- @unwrap_spec(errors='text_or_none')
- def wrap_encoder(space, w_arg, errors="strict"):
- from pypy.interpreter import unicodehelper
-
- w_arg = unicodehelper.convert_arg_to_w_unicode(space, w_arg, rname)
- if errors is None:
- errors = 'strict'
- state = space.fromcache(CodecState)
- utf8len = w_arg._length
- result = func(w_arg._utf8, errors, state.encode_error_handler,
- allow_surrogates=False)
- return space.newtuple([space.newbytes(result), space.newint(utf8len)])
+ return space.newtuple([space.newbytes(result), space.newint(ulen)])
wrap_encoder.__name__ = func.__name__
globals()[name] = wrap_encoder
@@ -667,16 +645,13 @@
@unwrap_spec(string='bufferstr', errors='text_or_none',
w_final=WrappedDefault(False))
def wrap_decoder(space, string, errors="strict", w_final=None):
- from pypy.interpreter import unicodehelper
if errors is None:
errors = 'strict'
final = space.is_true(w_final)
state = space.fromcache(CodecState)
- result, consumed, length = func(string, errors,
- final, state.decode_error_handler)
- return space.newtuple([space.newutf8(result, length),
- space.newint(consumed)])
+ result, length = func(string, errors, final, state.decode_error_handler)
+ return space.newtuple([space.newutf8(result, length), space.newint(length)])
wrap_decoder.__name__ = func.__name__
globals()[name] = wrap_decoder
@@ -730,11 +705,11 @@
errors = 'strict'
final = space.is_true(w_final)
state = space.fromcache(CodecState)
- result, consumed = runicode.str_decode_mbcs(
+ result, length = runicode.str_decode_mbcs(
string, len(string), errors,
final, state.decode_error_handler,
force_ignore=False)
- return space.newtuple([space.newtext(result), space.newint(consumed)])
+ return space.newtuple([space.newtext(result, length), space.newint(length)])
# utf-8 functions are not regular, because we have to pass
# "allow_surrogates=False"
@@ -755,7 +730,6 @@
@unwrap_spec(string='bufferstr', errors='text_or_none',
w_final = WrappedDefault(False))
def utf_8_decode(space, string, errors="strict", w_final=None):
- from pypy.interpreter import unicodehelper
if errors is None:
errors = 'strict'
@@ -765,10 +739,10 @@
try:
lgt = rutf8.check_utf8(string, allow_surrogates=True)
except rutf8.CheckError:
- res, consumed, lgt = unicodehelper.str_decode_utf8(string,
+ res, lgt = unicodehelper.str_decode_utf8(string,
errors, final, state.decode_error_handler)
return space.newtuple([space.newutf8(res, lgt),
- space.newint(consumed)])
+ space.newint(lgt)])
else:
return space.newtuple([space.newutf8(string, lgt),
space.newint(len(string))])
@@ -788,15 +762,11 @@
byteorder = 'little'
else:
byteorder = 'big'
- consumed = len(data)
- if final:
- consumed = 0
- res, consumed, lgt, byteorder = str_decode_utf_16_helper(
+ res, lgt = str_decode_utf_16_helper(
data, errors, final,
state.decode_error_handler, byteorder)
return space.newtuple([space.newutf8(res, lgt),
- space.newint(consumed),
- space.newint(byteorder)])
+ space.newint(lgt)])
@unwrap_spec(data='bufferstr', errors='text_or_none', byteorder=int,
w_final=WrappedDefault(False))
@@ -811,15 +781,11 @@
byteorder = 'little'
else:
byteorder = 'big'
- consumed = len(data)
- if final:
- consumed = 0
- res, consumed, lgt, byteorder = str_decode_utf_32_helper(
+ res, lgt = str_decode_utf_32_helper(
data, errors, final,
state.decode_error_handler, byteorder)
return space.newtuple([space.newutf8(res, lgt),
- space.newint(consumed),
- space.newint(byteorder)])
+ space.newint(lgt)])
# ____________________________________________________________
# Charmap
@@ -902,7 +868,6 @@
@unwrap_spec(string='bufferstr', errors='text_or_none')
def charmap_decode(space, string, errors="strict", w_mapping=None):
- from pypy.interpreter import unicodehelper
if errors is None:
errors = 'strict'
@@ -917,14 +882,13 @@
final = True
state = space.fromcache(CodecState)
- result, consumed, lgt = unicodehelper.str_decode_charmap(
+ result, lgt = unicodehelper.str_decode_charmap(
string, errors, final, state.decode_error_handler, mapping)
return space.newtuple([space.newutf8(result, lgt),
- space.newint(consumed)])
+ space.newint(len(string))])
@unwrap_spec(errors='text_or_none')
def charmap_encode(space, w_unicode, errors="strict", w_mapping=None):
- from pypy.interpreter import unicodehelper
if errors is None:
errors = 'strict'
@@ -974,7 +938,6 @@
@unwrap_spec(errors='text_or_none', w_final=WrappedDefault(False))
def unicode_escape_decode(space, w_string, errors="strict", w_final=None):
string = space.getarg_w('s*', w_string).as_str()
- from pypy.interpreter import unicodehelper
if errors is None:
errors = 'strict'
@@ -983,12 +946,13 @@
unicode_name_handler = state.get_unicodedata_handler(space)
- result, consumed, lgt = unicodehelper.str_decode_unicode_escape(
+ result, lgt = unicodehelper.str_decode_unicode_escape(
string, errors,
final, state.decode_error_handler,
unicode_name_handler)
- return space.newtuple([space.newutf8(result, lgt), space.newint(consumed)])
+ s_len = len(string)
+ return space.newtuple([space.newutf8(result, lgt), space.newint(s_len)])
# ____________________________________________________________
# Raw Unicode escape (accepts bytes or str)
@@ -1000,17 +964,16 @@
errors = 'strict'
final = space.is_true(w_final)
state = space.fromcache(CodecState)
- result, consumed = runicode.str_decode_raw_unicode_escape(
+ result, lgt = runicode.str_decode_raw_unicode_escape(
string, len(string), errors,
final, state.decode_error_handler)
- return space.newtuple([space.newtext(result), space.newint(consumed)])
+ return space.newtuple([space.newtext(result), space.newint(lgt)])
# ____________________________________________________________
# Unicode-internal
@unwrap_spec(errors='text_or_none')
def unicode_internal_decode(space, w_string, errors="strict"):
- from pypy.interpreter import unicodehelper
if errors is None:
errors = 'strict'
@@ -1028,11 +991,11 @@
final = True
state = space.fromcache(CodecState)
- result, consumed, lgt = unicodehelper.str_decode_unicode_internal(
+ result, lgt = unicodehelper.str_decode_unicode_internal(
string, errors,
final, state.decode_error_handler)
return space.newtuple([space.newutf8(result, lgt),
- space.newint(consumed)])
+ space.newint(lgt)])
@unwrap_spec(errors='text_or_none')
def unicode_internal_encode(space, w_uni, errors="strict"):
@@ -1041,11 +1004,12 @@
if errors is None:
errors = 'strict'
if space.isinstance_w(w_uni, space.w_unicode):
- uni = space.utf8_w(w_uni)
+ utf8 = space.utf8_w(w_uni)
state = space.fromcache(CodecState)
- result = runicode.unicode_encode_unicode_internal(
- uni, len(uni), errors, state.encode_error_handler)
- return space.newtuple([space.newbytes(result), space.newint(len(uni))])
+ result = unicodehelper.utf8_encode_unicode_internal(
+ utf8, errors, state.encode_error_handler)
+ w_lgt = space.newint(space.len_w(w_uni))
+ return space.newtuple([space.newbytes(result), w_lgt])
else:
# special case for this codec: bytes are returned as is
string = space.charbuf_w(w_uni)
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -107,7 +107,8 @@
import sys
assert charmap_decode(b'', 'strict', 'blablabla') == ('', 0)
assert charmap_decode(b'xxx') == ('xxx', 3)
- assert charmap_decode(b'xxx', 'strict', {ord('x'): 'XX'}) == ('XXXXXX', 3)
+ res = charmap_decode(b'xxx', 'strict', {ord('x'): 'XX'})
+ assert res == ('XXXXXX', 3)
map = tuple([chr(i) for i in range(256)])
assert charmap_decode(b'xxx\xff', 'strict', map) == ('xxx\xff', 4)
@@ -123,7 +124,11 @@
def test_escape_decode(self):
from _codecs import unicode_escape_decode as decode
- assert decode('\\\x80') == (u'\\\x80', 2)
+ import sys
+ if sys.version_info[0] < 3:
+ assert decode('\\\x80') == (u'\\\x80', 2)
+ else:
+ assert decode('\\\x80') == (u'\\\xc2\x80', 3)
def test_escape_decode_errors(self):
from _codecs import escape_decode as decode
@@ -137,10 +142,15 @@
assert decode(br"[\x0]\x0", "replace") == (b"[?]?", 8)
def test_unicode_escape(self):
+ import sys
from _codecs import unicode_escape_encode, unicode_escape_decode
assert unicode_escape_encode('abc') == ('abc'.encode('unicode_escape'), 3)
assert unicode_escape_decode(b'abc') == (b'abc'.decode('unicode_escape'), 3)
- assert unicode_escape_decode(b'\\x61\\x62\\x63') == ('abc', 12)
+ if sys.version_info[0] < 3:
+ lgt = 12
+ else:
+ lgt = 3
+ assert unicode_escape_decode(b'\\x61\\x62\\x63') == ('abc', lgt)
class AppTestPartialEvaluation:
@@ -338,23 +348,26 @@
def test_unicode_escape_decode_errors(self):
from _codecs import unicode_escape_decode, raw_unicode_escape_decode
+ import sys
for decode in [unicode_escape_decode, raw_unicode_escape_decode]:
for c, d in ('u', 4), ('U', 4):
for i in range(d):
raises(UnicodeDecodeError, decode, "\\" + c + "0"*i)
raises(UnicodeDecodeError, decode, "[\\" + c + "0"*i + "]")
data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
- assert decode(data, "ignore") == (u"[]", len(data))
- assert decode(data, "replace") == (u"[\ufffd]\ufffd", len(data))
+ lgt = len(data)
+ assert decode(data, "ignore") == (u"[]", lgt)
+ assert decode(data, "replace") == (u"[\ufffd]\ufffd", lgt)
raises(UnicodeDecodeError, decode, r"\U00110000")
- assert decode(r"\U00110000", "ignore") == (u"", 10)
- assert decode(r"\U00110000", "replace") == (u"\ufffd", 10)
+ lgt = 10
+ assert decode(r"\U00110000", "ignore") == (u"", lgt)
+ assert decode(r"\U00110000", "replace") == (u"\ufffd", lgt)
exc = raises(UnicodeDecodeError, unicode_escape_decode, b"\u1z32z3", 'strict')
assert str(exc.value) == r"'unicodeescape' codec can't decode bytes in position 0-2: truncated \uXXXX escape"
exc = raises(UnicodeDecodeError, raw_unicode_escape_decode, b"\u1z32z3", 'strict')
- assert str(exc.value) == r"'rawunicodeescape' codec can't decode bytes in position 0-2: truncated \uXXXX"
+ assert str(exc.value) == r"'rawunicodeescape' codec can't decode bytes in position 0-2: truncated \uXXXX escape"
exc = raises(UnicodeDecodeError, raw_unicode_escape_decode, b"\U1z32z3", 'strict')
- assert str(exc.value) == r"'rawunicodeescape' codec can't decode bytes in position 0-2: truncated \uXXXX"
+ assert str(exc.value) == r"'rawunicodeescape' codec can't decode bytes in position 0-2: truncated \UXXXXXXXX escape"
def test_escape_encode(self):
import _codecs
@@ -920,11 +933,11 @@
assert _codecs.utf_16_be_decode(b) == (u'', 0)
assert _codecs.utf_16_decode(b) == (u'', 0)
assert _codecs.utf_16_le_decode(b) == (u'', 0)
- assert _codecs.utf_16_ex_decode(b) == (u'', 0, 0)
+ assert _codecs.utf_16_ex_decode(b) == (u'', 0)
assert _codecs.utf_32_decode(b) == (u'', 0)
assert _codecs.utf_32_be_decode(b) == (u'', 0)
assert _codecs.utf_32_le_decode(b) == (u'', 0)
- assert _codecs.utf_32_ex_decode(b) == (u'', 0, 0)
+ assert _codecs.utf_32_ex_decode(b) == (u'', 0)
assert _codecs.charmap_decode(b) == (u'', 0)
assert _codecs.unicode_escape_decode(b) == (u'', 0)
assert _codecs.raw_unicode_escape_decode(b) == (u'', 0)
diff --git a/pypy/module/_codecs/test/test_locale.py b/pypy/module/_codecs/test/test_locale.py
--- a/pypy/module/_codecs/test/test_locale.py
+++ b/pypy/module/_codecs/test/test_locale.py
@@ -4,7 +4,8 @@
from pypy.module._codecs.locale import (
str_decode_locale_surrogateescape,
unicode_encode_locale_surrogateescape)
-from rpython.rlib import rlocale, runicode
+from rpython.rlib import rlocale
+from pypy.interpreter import unicodehelper
class TestLocaleCodec(object):
@@ -18,11 +19,11 @@
rlocale.setlocale(rlocale.LC_ALL, cls.oldlocale)
def getdecoder(self, encoding):
- return getattr(runicode, "str_decode_%s" % encoding.replace("-", "_"))
+ return getattr(unicodehelper, "str_decode_%s" % encoding.replace("-", ""))
def getencoder(self, encoding):
- return getattr(runicode,
- "unicode_encode_%s" % encoding.replace("-", "_"))
+ return getattr(unicodehelper,
+ "utf8_encode_%s" % encoding.replace("-", "_"))
def getstate(self):
return self.space.fromcache(interp_codecs.CodecState)
@@ -39,8 +40,8 @@
locale_encoder = unicode_encode_locale_surrogateescape
utf8_encoder = self.getencoder('utf-8')
for val in u'foo', u' 日本', u'\U0001320C':
- assert (locale_encoder(val) ==
- utf8_encoder(val, len(val), None))
+ assert (locale_encoder(val).encode('utf8') ==
+ utf8_encoder(val, 'strict', True, None))
def test_encode_locale_errorhandler(self):
self.setlocale("en_US.UTF-8")
@@ -48,17 +49,17 @@
utf8_encoder = self.getencoder('utf-8')
encode_error_handler = self.getstate().encode_error_handler
for val in u'foo\udc80bar', u'\udcff\U0001320C':
- expected = utf8_encoder(val, len(val), 'surrogateescape',
+ expected = utf8_encoder(val, 'surrogateescape',
encode_error_handler)
- assert locale_encoder(val) == expected
+ assert locale_encoder(val).encode('utf8') == expected
def test_decode_locale(self):
self.setlocale("en_US.UTF-8")
locale_decoder = str_decode_locale_surrogateescape
utf8_decoder = self.getdecoder('utf-8')
for val in 'foo', ' \xe6\x97\xa5\xe6\x9c\xac', '\xf0\x93\x88\x8c':
- assert (locale_decoder(val) ==
- utf8_decoder(val, len(val), None)[0])
+ assert (locale_decoder(val).encode('utf8') ==
+ utf8_decoder(val, 'strict', True, None)[0])
def test_decode_locale_errorhandler(self):
self.setlocale("en_US.UTF-8")
@@ -66,6 +67,6 @@
utf8_decoder = self.getdecoder('utf-8')
decode_error_handler = self.getstate().decode_error_handler
val = 'foo\xe3bar'
- expected = utf8_decoder(val, len(val), 'surrogateescape', True,
+ expected = utf8_decoder(val, 'surrogateescape', True,
decode_error_handler)[0]
- assert locale_decoder(val) == expected
+ assert locale_decoder(val).encode('utf8') == expected
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -399,7 +399,7 @@
return self.newtext(s)
def newutf8(self, utf8s, length):
- assert utf8s is not None
+ assert isinstance(utf8s, str)
return W_UnicodeObject(utf8s, length)
def newfilename(self, s):
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -43,7 +43,7 @@
# XXX checking, remove before any performance measurements
# ifdef not_running_in_benchmark
if not we_are_translated():
- lgt = rutf8.check_utf8(utf8str, True)
+ lgt = rutf8.codepoints_in_utf8(utf8str)
assert lgt == length
@staticmethod
@@ -57,7 +57,7 @@
def unwrap(self, space):
# for testing
- return self.realunicode_w(space)
+ return space.realunicode_w(self)
def is_w(self, space, w_other):
if not isinstance(w_other, W_UnicodeObject):
@@ -95,9 +95,6 @@
def utf8_w(self, space):
return self._utf8
- def realunicode_w(self, space):
- return self._utf8.decode('utf8')
-
def listview_utf8(self):
assert self.is_ascii()
return _create_list_from_unicode(self._utf8)
@@ -1190,9 +1187,12 @@
utf8 = space.utf8_w(w_object)
idx = rutf8.surrogate_in_utf8(utf8)
if idx >= 0:
- eh = unicodehelper.encode_error_handler(space)
- eh(None, "utf8", "surrogates not allowed", utf8,
- idx, idx + 1)
+ print 'surrogate in unicodeobject.encode_object(', w_object, ',', encoding, ',', errors, ')', 'raising'
+ if errors is None:
+ w_err_handler = unicodehelper.encode_error_handler(space)
+ else:
+ w_err_handler = lookup_error(space, errors)
+ w_err_handler(None, "utf8", "surrogates not allowed", utf8, idx, idx + 1)
if errors is None or errors == 'strict':
if encoding is None or encoding == 'utf-8':
#if rutf8.has_surrogates(utf8):
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -1401,7 +1401,7 @@
endinpos += 1
res, pos = errorhandler(errors, encoding,
message, s, pos-2, endinpos)
- builder.append(res)
+ builder.append(res.decode('utf8'))
else:
try:
chr = r_uint(int(s[pos:pos+digits], 16))
@@ -1411,7 +1411,7 @@
endinpos += 1
res, pos = errorhandler(errors, encoding,
message, s, pos-2, endinpos)
- builder.append(res)
+ builder.append(res.decode('utf8'))
else:
# when we get here, chr is a 32-bit unicode character
if chr <= MAXUNICODE:
@@ -1427,7 +1427,7 @@
message = "illegal Unicode character"
res, pos = errorhandler(errors, encoding,
message, s, pos-2, pos+digits)
- builder.append(res)
+ builder.append(res.decode('utf8'))
return pos
def str_decode_unicode_escape(s, size, errors, final=False,
@@ -1708,8 +1708,12 @@
pos += 1
continue
- digits = 4 if s[pos] == 'u' else 8
- message = "truncated \\uXXXX"
+ if s[pos] == 'u':
+ digits = 4
+ message = "truncated \\uXXXX escape"
+ else:
+ digits = 8
+ message = "truncated \\UXXXXXXXX escape"
pos += 1
pos = hexescape(result, s, pos, digits,
"rawunicodeescape", errorhandler, message, errors)
More information about the pypy-commit
mailing list