[pypy-commit] pypy unicode-utf8: fix all the tests in codecs until test_ztranslation
fijal
pypy.commits at gmail.com
Tue Nov 21 08:03:46 EST 2017
Author: fijal
Branch: unicode-utf8
Changeset: r93110:c7109cb7f6be
Date: 2017-11-21 14:03 +0100
http://bitbucket.org/pypy/pypy/changeset/c7109cb7f6be/
Log: fix all the tests in codecs until test_ztranslation
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -173,8 +173,13 @@
r, pos = errorhandler(errors, 'latin1',
'ordinal not in range(256)', s, cur,
cur + 1)
- res.append(r)
for j in range(pos - cur):
+ c = rutf8.codepoint_at_pos(r, j)
+ if c > 0xFF:
+ errorhandler("strict", 'latin1',
+ 'ordinal not in range(256)', s,
+ cur, cur + 1)
+ res.append(chr(c))
i = rutf8.next_codepoint_pos(s, i)
cur = pos
cur += 1
@@ -200,7 +205,12 @@
msg = "ordinal not in range(128)"
r, newpos = errorhandler(errors, 'ascii', msg, utf8,
pos, endpos)
- for _ in range(newpos - pos):
+ for j in range(newpos - pos):
+ c = rutf8.codepoint_at_pos(r, j)
+ if c > 0x7F:
+ errorhandler("strict", 'ascii',
+ 'ordinal not in range(128)', utf8,
+ pos, pos + 1)
i = rutf8.next_codepoint_pos(utf8, i)
pos = newpos
res.append(r)
@@ -364,7 +374,7 @@
message = "illegal Unicode character"
res, pos = errorhandler(errors, encoding,
message, s, pos-2, pos+digits)
- size, flag = rutf8.check_utf8(res)
+ size, flag = rutf8.check_utf8(res, True)
builder.append(res)
else:
rutf8.unichr_as_utf8_append(builder, chr, True)
@@ -778,21 +788,25 @@
if base64bits > 0: # left-over bits
if base64bits >= 6:
# We've seen at least one base-64 character
- aaa
pos += 1
msg = "partial character in shift sequence"
res, pos = errorhandler(errors, 'utf7',
msg, s, pos-1, pos)
+ reslen, resflags = rutf8.check_utf8(res, True)
+ outsize += reslen
+ flag = combine_flags(flag, resflags)
result.append(res)
continue
else:
# Some bits remain; they should be zero
if base64buffer != 0:
- bbb
pos += 1
msg = "non-zero padding bits in shift sequence"
res, pos = errorhandler(errors, 'utf7',
msg, s, pos-1, pos)
+ reslen, resflags = rutf8.check_utf8(res, True)
+ outsize += reslen
+ flag = combine_flags(flag, resflags)
result.append(res)
continue
@@ -826,11 +840,13 @@
outsize += 1
pos += 1
else:
- yyy
startinpos = pos
pos += 1
msg = "unexpected special character"
res, pos = errorhandler(errors, 'utf7', msg, s, pos-1, pos)
+ reslen, resflags = rutf8.check_utf8(res, True)
+ outsize += reslen
+ flag = combine_flags(flag, resflags)
result.append(res)
# end of string
@@ -973,7 +989,7 @@
else:
bo = 1
if size == 0:
- return u'', 0, bo
+ return '', 0, 0, rutf8.FLAG_ASCII, bo
if bo == -1:
# force little endian
ihi = 1
@@ -1182,7 +1198,7 @@
else:
bo = 1
if size == 0:
- return u'', 0, bo
+ return '', 0, 0, rutf8.FLAG_ASCII, bo
if bo == -1:
# force little endian
iorder = [0, 1, 2, 3]
@@ -1409,40 +1425,43 @@
mapping=None):
size = len(s)
if mapping is None:
- return utf8_encode_latin_1(s, size, errors,
- errorhandler=errorhandler)
+ return utf8_encode_latin_1(s, errors, errorhandler=errorhandler)
if size == 0:
return ''
result = StringBuilder(size)
pos = 0
+ index = 0
while pos < size:
ch = rutf8.codepoint_at_pos(s, pos)
c = mapping.get(ch, '')
if len(c) == 0:
# collect all unencodable chars. Important for narrow builds.
- collend = pos + 1
- while collend < size and mapping.get(s[collend], '') == '':
- collend += 1
- rs, pos = errorhandler(errors, "charmap",
+ collend = rutf8.next_codepoint_pos(s, pos)
+ endindex = index + 1
+ while collend < size and mapping.get(rutf8.codepoint_at_pos(s, collend), '') == '':
+ collend = rutf8.next_codepoint_pos(s, collend)
+ endindex += 1
+ rs, endindex = errorhandler(errors, "charmap",
"character maps to <undefined>",
- s, pos, collend)
- XXXX
- if rs is not None:
- # py3k only
- result.append(rs)
- continue
- for ch2 in ru:
- c2 = mapping.get(ch2, '')
- if len(c2) == 0:
+ s, index, endindex)
+ j = 0
+ for _ in range(endindex - index):
+ ch2 = rutf8.codepoint_at_pos(rs, j)
+ ch2 = mapping.get(ch2, '')
+ if not ch2:
errorhandler(
"strict", "charmap",
"character maps to <undefined>",
- s, pos, pos + 1)
- result.append(c2)
+ s, index, index + 1)
+ result.append(ch2)
+ index += 1
+ j = rutf8.next_codepoint_pos(rs, j)
+ pos = rutf8.next_codepoint_pos(s, pos)
continue
result.append(c)
+ index += 1
pos = rutf8.next_codepoint_pos(s, pos)
return result.build()
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -1,6 +1,6 @@
from rpython.rlib import jit, rutf8
from rpython.rlib.objectmodel import we_are_translated, not_rpython
-from rpython.rlib.rstring import UnicodeBuilder
+from rpython.rlib.rstring import StringBuilder
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
@@ -241,33 +241,42 @@
"don't know how to handle %T in error callback", w_exc)
def backslashreplace_errors(space, w_exc):
+ from pypy.interpreter import unicodehelper
+
check_exception(space, w_exc)
if space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
- obj = space.realunicode_w(space.getattr(w_exc, space.newtext('object')))
+ w_obj = space.getattr(w_exc, space.newtext('object'))
+ space.realutf8_w(w_obj) # for errors
+ w_obj = unicodehelper.convert_arg_to_w_unicode(space, w_obj)
start = space.int_w(space.getattr(w_exc, space.newtext('start')))
w_end = space.getattr(w_exc, space.newtext('end'))
end = space.int_w(w_end)
- builder = UnicodeBuilder()
+ start = w_obj._index_to_byte(start)
+ end = w_obj._index_to_byte(end)
+ builder = StringBuilder()
+ obj = w_obj._utf8
pos = start
while pos < end:
- oc = ord(obj[pos])
+ oc = rutf8.codepoint_at_pos(obj, pos)
num = hex(oc)
if (oc >= 0x10000):
- builder.append(u"\\U")
+ builder.append("\\U")
zeros = 8
elif (oc >= 0x100):
- builder.append(u"\\u")
+ builder.append("\\u")
zeros = 4
else:
- builder.append(u"\\x")
+ builder.append("\\x")
zeros = 2
lnum = len(num)
nb = zeros + 2 - lnum # num starts with '0x'
if nb > 0:
- builder.append_multiple_char(u'0', nb)
- builder.append_slice(unicode(num), 2, lnum)
- pos += 1
- return space.newtuple([space.newunicode(builder.build()), w_end])
+ builder.append_multiple_char('0', nb)
+ builder.append_slice(num, 2, lnum)
+ pos = rutf8.next_codepoint_pos(obj, pos)
+ r = builder.build()
+ lgt, flag = rutf8.check_utf8(r, True)
+ return space.newtuple([space.newutf8(r, lgt, flag), w_end])
else:
raise oefmt(space.w_TypeError,
"don't know how to handle %T in error callback", w_exc)
@@ -489,7 +498,7 @@
@unwrap_spec(data='bufferstr', errors='text_or_none', byteorder=int,
w_final=WrappedDefault(False))
def utf_16_ex_decode(space, data, errors='strict', byteorder=0, w_final=None):
- from pypy.interpreter.unicodehelper import DecodeWrapper
+ from pypy.interpreter.unicodehelper import str_decode_utf_16_helper
if errors is None:
errors = 'strict'
@@ -504,16 +513,17 @@
consumed = len(data)
if final:
consumed = 0
- res, consumed, byteorder = runicode.str_decode_utf_16_helper(
- data, len(data), errors, final,
- DecodeWrapper(state.decode_error_handler).handle, byteorder)
- return space.newtuple([space.newunicode(res), space.newint(consumed),
+ res, consumed, lgt, flag, byteorder = str_decode_utf_16_helper(
+ data, errors, final,
+ state.decode_error_handler, byteorder)
+ return space.newtuple([space.newutf8(res, lgt, flag),
+ space.newint(consumed),
space.newint(byteorder)])
@unwrap_spec(data='bufferstr', errors='text_or_none', byteorder=int,
w_final=WrappedDefault(False))
def utf_32_ex_decode(space, data, errors='strict', byteorder=0, w_final=None):
- from pypy.interpreter.unicodehelper import DecodeWrapper
+ from pypy.interpreter.unicodehelper import str_decode_utf_32_helper
final = space.is_true(w_final)
state = space.fromcache(CodecState)
@@ -526,10 +536,11 @@
consumed = len(data)
if final:
consumed = 0
- res, consumed, byteorder = runicode.str_decode_utf_32_helper(
- data, len(data), errors, final,
- DecodeWrapper(state.decode_error_handler).handle, byteorder)
- return space.newtuple([space.newunicode(res), space.newint(consumed),
+ res, consumed, lgt, flag, byteorder = str_decode_utf_32_helper(
+ data, errors, final,
+ state.decode_error_handler, byteorder)
+ return space.newtuple([space.newutf8(res, lgt, flag),
+ space.newint(consumed),
space.newint(byteorder)])
# ____________________________________________________________
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -592,11 +592,11 @@
def handler_unicodeinternal(exc):
if not isinstance(exc, UnicodeDecodeError):
raise TypeError("don't know how to handle %r" % exc)
- return (u"\x01", 1)
+ return (u"\x01", 4)
codecs.register_error("test.hui", handler_unicodeinternal)
res = "\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui")
if sys.maxunicode > 65535:
- assert res == u"\u0000\u0001\u0000" # UCS4 build
+ assert res == u"\u0000\u0001" # UCS4 build
else:
assert res == u"\x00\x00\x01\x00\x00" # UCS2 build
More information about the pypy-commit mailing list