[pypy-commit] pypy py3k: Lot of fixes in the _codecs module
amauryfa
noreply at buildbot.pypy.org
Wed Oct 19 23:11:15 CEST 2011
Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: py3k
Changeset: r48241:e0cf3d8b87a2
Date: 2011-10-19 22:31 +0200
http://bitbucket.org/pypy/pypy/changeset/e0cf3d8b87a2/
Log: Lot of fixes in the _codecs module
diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -49,7 +49,7 @@
"(unicode, int) tuple, not %s")
raise operationerrfmt(
space.w_TypeError, msg,
- space.str_w(space.repr(w_res)))
+ space.unicode_w(space.repr(w_res)))
w_replace, w_newpos = space.fixedview(w_res, 2)
newpos = space.int_w(w_newpos)
if newpos < 0:
@@ -487,7 +487,7 @@
make_encoder_wrapper('mbcs_encode')
make_decoder_wrapper('mbcs_decode')
-@unwrap_spec(data=str, errors='str_or_None', byteorder=int)
+@unwrap_spec(data="bufferstr", errors='str_or_None', byteorder=int)
def utf_16_ex_decode(space, data, errors='strict', byteorder=0, w_final=False):
if errors is None:
errors = 'strict'
@@ -507,7 +507,7 @@
return space.newtuple([space.wrap(res), space.wrap(consumed),
space.wrap(byteorder)])
-@unwrap_spec(data=str, errors='str_or_None', byteorder=int)
+@unwrap_spec(data="bufferstr", errors='str_or_None', byteorder=int)
def utf_32_ex_decode(space, data, errors='strict', byteorder=0, w_final=False):
final = space.is_true(w_final)
state = space.fromcache(CodecState)
@@ -599,7 +599,7 @@
# Charmap may return a string
try:
- x = space.realstr_w(w_ch)
+ x = space.bytes_w(w_ch)
except OperationError, e:
if not e.match(space, space.w_TypeError):
raise
@@ -626,7 +626,7 @@
raise OperationError(space.w_TypeError, space.wrap("invalid mapping"))
-@unwrap_spec(string=str, errors='str_or_None')
+@unwrap_spec(string="bufferstr", errors='str_or_None')
def charmap_decode(space, string, errors="strict", w_mapping=None):
if errors is None:
errors = 'strict'
@@ -658,7 +658,7 @@
result = runicode.unicode_encode_charmap(
uni, len(uni), errors,
state.encode_error_handler, mapping)
- return space.newtuple([space.wrap(result), space.wrap(len(uni))])
+ return space.newtuple([space.wrapbytes(result), space.wrap(len(uni))])
@unwrap_spec(chars=unicode)
@@ -716,7 +716,7 @@
if space.isinstance_w(w_string, space.w_unicode):
return space.newtuple([w_string, space.len(w_string)])
- string = space.str_w(w_string)
+ string = space.bytes_w(w_string)
if len(string) == 0:
return space.newtuple([space.wrap(u''), space.wrap(0)])
@@ -729,21 +729,21 @@
return space.newtuple([space.wrap(result), space.wrap(consumed)])
# ____________________________________________________________
-# support for the "string escape" codec
+# support for the "string escape" translation
# This is a bytes-to bytes transformation
-@unwrap_spec(data=str, errors='str_or_None')
+@unwrap_spec(data="bufferstr", errors='str_or_None')
def escape_encode(space, data, errors='strict'):
from pypy.objspace.std.stringobject import string_escape_encode
result = string_escape_encode(data, quote="'")
start = 1
end = len(result) - 1
assert end >= 0
- w_result = space.wrap(result[start:end])
+ w_result = space.wrapbytes(result[start:end])
return space.newtuple([w_result, space.wrap(len(data))])
-@unwrap_spec(data=str, errors='str_or_None')
+@unwrap_spec(data="bufferstr", errors='str_or_None')
def escape_decode(space, data, errors='strict'):
from pypy.interpreter.pyparser.parsestring import PyString_DecodeEscape
result = PyString_DecodeEscape(space, data, None)
- return space.newtuple([space.wrap(result), space.wrap(len(data))])
+ return space.newtuple([space.wrapbytes(result), space.wrap(len(data))])
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -17,7 +17,7 @@
'utf-32', 'utf-32-le', 'utf-32-be',
'raw_unicode_escape',
'unicode_escape', 'unicode_internal'):
- assert unicode(u.encode(encoding),encoding) == u
+ assert str(u.encode(encoding),encoding) == u
def test_ucs4(self):
x = u'\U00100000'
@@ -25,14 +25,14 @@
assert x == y
def test_named_unicode(self):
- assert unicode('\\N{SPACE}','unicode-escape') == u" "
- raises( UnicodeDecodeError, unicode,'\\N{SPACE','unicode-escape')
- raises( UnicodeDecodeError, unicode,'\\NSPACE}','unicode-escape')
- raises( UnicodeDecodeError, unicode,'\\NSPACE','unicode-escape')
- raises( UnicodeDecodeError, unicode,'\\N','unicode-escape')
- assert unicode('\\N{SPACE}\\N{SPACE}','unicode-escape') == u" "
- assert unicode('\\N{SPACE}a\\N{SPACE}','unicode-escape') == u" a "
- assert "\\N{foo}xx".decode("unicode-escape", "ignore") == u"xx"
+ assert str(b'\\N{SPACE}','unicode-escape') == u" "
+ raises( UnicodeDecodeError, str,b'\\N{SPACE','unicode-escape')
+ raises( UnicodeDecodeError, str,b'\\NSPACE}','unicode-escape')
+ raises( UnicodeDecodeError, str,b'\\NSPACE','unicode-escape')
+ raises( UnicodeDecodeError, str,b'\\N','unicode-escape')
+ assert str(b'\\N{SPACE}\\N{SPACE}','unicode-escape') == u" "
+ assert str(b'\\N{SPACE}a\\N{SPACE}','unicode-escape') == u" a "
+ assert b"\\N{foo}xx".decode("unicode-escape", "ignore") == u"xx"
assert 1 <= len(u"\N{CJK UNIFIED IDEOGRAPH-20000}") <= 2
def test_literals(self):
@@ -40,26 +40,26 @@
def test_insecure_pickle(self):
import pickle
- insecure = ["abc", "2 + 2", # not quoted
+ insecure = [b"abc", b"2 + 2", # not quoted
#"'abc' + 'def'", # not a single quoted string
- "'abc", # quote is not closed
- "'abc\"", # open quote and close quote don't match
- "'abc' ?", # junk after close quote
- "'\\'", # trailing backslash
+ b"'abc", # quote is not closed
+ b"'abc\"", # open quote and close quote don't match
+ b"'abc' ?", # junk after close quote
+ b"'\\'", # trailing backslash
# some tests of the quoting rules
#"'abc\"\''",
#"'\\\\a\'\'\'\\\'\\\\\''",
]
for s in insecure:
- buf = "S" + s + "\012p0\012."
+ buf = b"S" + s + b"\012p0\012."
raises (ValueError, pickle.loads, buf)
def test_unicodedecodeerror(self):
assert str(UnicodeDecodeError(
- "ascii", "g\xfcrk", 1, 2, "ouch")) == "'ascii' codec can't decode byte 0xfc in position 1: ouch"
+ "ascii", b"g\xfcrk", 1, 2, "ouch")) == "'ascii' codec can't decode byte 0xfc in position 1: ouch"
assert str(UnicodeDecodeError(
- "ascii", "g\xfcrk", 1, 3, "ouch")) == "'ascii' codec can't decode bytes in position 1-2: ouch"
+ "ascii", b"g\xfcrk", 1, 3, "ouch")) == "'ascii' codec can't decode bytes in position 1-2: ouch"
def test_unicodetranslateerror(self):
@@ -73,7 +73,7 @@
assert str(UnicodeTranslateError(
u"g\uffffrk", 1, 2, "ouch"))== "can't translate character u'\\uffff' in position 1: ouch"
- if sys.maxunicode > 0xffff and len(unichr(0x10000)) == 1:
+ if sys.maxunicode > 0xffff and len(chr(0x10000)) == 1:
assert str(UnicodeTranslateError(
u"g\U00010000rk", 1, 2, "ouch"))== "can't translate character u'\\U00010000' in position 1: ouch"
@@ -96,30 +96,31 @@
assert str(UnicodeEncodeError(
"ascii", u"\uffffx", 0, 1, "ouch"))=="'ascii' codec can't encode character u'\\uffff' in position 0: ouch"
- if sys.maxunicode > 0xffff and len(unichr(0x10000)) == 1:
+ if sys.maxunicode > 0xffff and len(chr(0x10000)) == 1:
assert str(UnicodeEncodeError(
"ascii", u"\U00010000x", 0, 1, "ouch")) =="'ascii' codec can't encode character u'\\U00010000' in position 0: ouch"
def test_indexerror(self):
- test = "\\" # trailing backslash
- raises (ValueError, test.decode,'string-escape')
+ import _codecs
+ test = b"\\" # trailing backslash
+ raises (ValueError, _codecs.escape_decode, test)
def test_charmap_decode(self):
from _codecs import charmap_decode
import sys
- assert charmap_decode('', 'strict', 'blablabla') == ('', 0)
- assert charmap_decode('xxx') == ('xxx', 3)
- assert charmap_decode('xxx', 'strict', {ord('x'): u'XX'}) == ('XXXXXX', 3)
- map = tuple([unichr(i) for i in range(256)])
- assert charmap_decode('xxx\xff', 'strict', map) == (u'xxx\xff', 4)
+ assert charmap_decode(b'', 'strict', 'blablabla') == ('', 0)
+ assert charmap_decode(b'xxx') == ('xxx', 3)
+ assert charmap_decode(b'xxx', 'strict', {ord('x'): u'XX'}) == ('XXXXXX', 3)
+ map = tuple([chr(i) for i in range(256)])
+ assert charmap_decode(b'xxx\xff', 'strict', map) == (u'xxx\xff', 4)
raises(TypeError, charmap_decode, '\xff', "replace", {0xff: 0x10001})
def test_unicode_escape(self):
from _codecs import unicode_escape_encode, unicode_escape_decode
assert unicode_escape_encode(u'abc') == (u'abc'.encode('unicode_escape'), 3)
- assert unicode_escape_decode('abc') == (u'abc'.decode('unicode_escape'), 3)
- assert unicode_escape_decode('\\x61\\x62\\x63') == (u'abc', 12)
+ assert unicode_escape_decode(b'abc') == (b'abc'.decode('unicode_escape'), 3)
+ assert unicode_escape_decode(b'\\x61\\x62\\x63') == (u'abc', 12)
class AppTestPartialEvaluation:
@@ -144,13 +145,13 @@
u"\x00\xff\u07ff\u0800\uffff",
]
- buffer = ''
+ buffer = b''
result = u""
for (c, partialresult) in zip(u"\x00\xff\u07ff\u0800\uffff".encode(encoding), check_partial):
- buffer += c
+ buffer += bytes([c])
res = _codecs.utf_8_decode(buffer,'strict',False)
if res[1] >0 :
- buffer = ''
+ buffer = b''
result += res[0]
assert result == partialresult
@@ -169,26 +170,26 @@
u"\x00\xff\u0100",
u"\x00\xff\u0100\uffff",
]
- buffer = ''
+ buffer = b''
result = u""
for (c, partialresult) in zip(u"\x00\xff\u0100\uffff".encode(encoding), check_partial):
- buffer += c
+ buffer += bytes([c])
res = _codecs.utf_16_decode(buffer,'strict',False)
if res[1] >0 :
- buffer = ''
+ buffer = b''
result += res[0]
assert result == partialresult
def test_bug1098990_a(self):
- import codecs, StringIO
+ import codecs, io
self.encoding = 'utf-8'
s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
s3 = u"next line.\r\n"
s = (s1+s2+s3).encode(self.encoding)
- stream = StringIO.StringIO(s)
+ stream = io.BytesIO(s)
reader = codecs.getreader(self.encoding)(stream)
assert reader.readline() == s1
assert reader.readline() == s2
@@ -196,7 +197,7 @@
assert reader.readline() == u""
def test_bug1098990_b(self):
- import codecs, StringIO
+ import codecs, io
self.encoding = 'utf-8'
s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
@@ -205,7 +206,7 @@
s5 = u"againokay.\r\n"
s = (s1+s2+s3+s4+s5).encode(self.encoding)
- stream = StringIO.StringIO(s)
+ stream = io.BytesIO(s)
reader = codecs.getreader(self.encoding)(stream)
assert reader.readline() == s1
assert reader.readline() == s2
@@ -216,11 +217,11 @@
def test_seek_utf16le(self):
# all codecs should be able to encode these
- import codecs, StringIO
+ import codecs, io
encoding = 'utf-16-le'
s = u"%s\n%s\n" % (10*u"abc123", 10*u"def456")
- reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
- for t in xrange(5):
+ reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
+ for t in range(5):
# Test that calling seek resets the internal codec state and buffers
reader.seek(0, 0)
line = reader.readline()
@@ -229,71 +230,75 @@
def test_unicode_internal_encode(self):
import sys
- class U(unicode):
+ class U(str):
pass
enc = U(u"a").encode("unicode_internal")
if sys.maxunicode == 65535: # UCS2 build
if sys.byteorder == "big":
- assert enc == "\x00a"
+ assert enc == b"\x00a"
else:
- assert enc == "a\x00"
+ assert enc == b"a\x00"
elif len(u"\U00010098") == 1:
# UCS4 build on a UCS4 CPython
enc2 = u"\U00010098".encode("unicode_internal")
if sys.byteorder == "big":
- assert enc == "\x00\x00\x00a"
- assert enc2 == "\x00\x01\x00\x98"
+ assert enc == b"\x00\x00\x00a"
+ assert enc2 == b"\x00\x01\x00\x98"
else:
- assert enc == "a\x00\x00\x00"
- assert enc2 == "\x98\x00\x01\x00"
+ assert enc == b"a\x00\x00\x00"
+ assert enc2 == b"\x98\x00\x01\x00"
else:
# UCS4 build on a UCS2 CPython
if sys.byteorder == "big":
- assert enc == "\x00\x00\x00a"
+ assert enc == b"\x00\x00\x00a"
else:
- assert enc == "a\x00\x00\x00"
+ assert enc == b"a\x00\x00\x00"
def test_unicode_internal_decode(self):
import sys
if sys.maxunicode == 65535: # UCS2 build
if sys.byteorder == "big":
- bytes = "\x00a"
+ bytes = b"\x00a"
else:
- bytes = "a\x00"
+ bytes = b"a\x00"
else: # UCS4 build
if sys.byteorder == "big":
- bytes = "\x00\x00\x00a"
- bytes2 = "\x00\x01\x00\x98"
+ bytes = b"\x00\x00\x00a"
+ bytes2 = b"\x00\x01\x00\x98"
else:
- bytes = "a\x00\x00\x00"
- bytes2 = "\x98\x00\x01\x00"
+ bytes = b"a\x00\x00\x00"
+ bytes2 = b"\x98\x00\x01\x00"
assert bytes2.decode("unicode_internal") == u"\U00010098"
assert bytes.decode("unicode_internal") == u"a"
def test_raw_unicode_escape(self):
- assert unicode("\u0663", "raw-unicode-escape") == u"\u0663"
- assert u"\u0663".encode("raw-unicode-escape") == "\u0663"
+ assert str(b"\u0663", "raw-unicode-escape") == u"\u0663"
+ assert u"\u0663".encode("raw-unicode-escape") == b"\u0663"
def test_escape_decode(self):
- test = 'a\n\\b\x00c\td\u2045'.encode('string_escape')
- assert test.decode('string_escape') =='a\n\\b\x00c\td\u2045'
- assert '\\077'.decode('string_escape') == '?'
- assert '\\100'.decode('string_escape') == '@'
- assert '\\253'.decode('string_escape') == chr(0253)
- assert '\\312'.decode('string_escape') == chr(0312)
+ import _codecs
+ test = _codecs.escape_encode(b'a\n\\b\x00c\td\u2045')[0]
+ assert _codecs.escape_decode(test)[0] == b'a\n\\b\x00c\td\u2045'
+ assert _codecs.escape_decode(b'\\077')[0] == b'?'
+ assert _codecs.escape_decode(b'\\100')[0] == b'@'
+ assert _codecs.escape_decode(b'\\253')[0] == bytes([0253])
+ assert _codecs.escape_decode(b'\\312')[0] == bytes([0312])
def test_escape_decode_wrap_around(self):
- assert '\\400'.decode('string_escape') == chr(0)
+ import _codecs
+ assert _codecs.escape_decode(b'\\400')[0] == b'\0'
def test_escape_decode_ignore_invalid(self):
- assert '\\9'.decode('string_escape') == '\\9'
- assert '\\01'.decode('string_escape') == chr(01)
- assert '\\0f'.decode('string_escape') == chr(0) + 'f'
- assert '\\08'.decode('string_escape') == chr(0) + '8'
+ import _codecs
+ assert _codecs.escape_decode(b'\\9')[0] == b'\\9'
+ assert _codecs.escape_decode(b'\\01')[0] == b'\x01'
+ assert _codecs.escape_decode(b'\\0f')[0] == b'\0' + b'f'
+ assert _codecs.escape_decode(b'\\08')[0] == b'\0' + b'8'
def test_escape_encode(self):
- assert '"'.encode('string_escape') == '"'
- assert "'".encode('string_escape') == "\\'"
+ import _codecs
+ assert _codecs.escape_encode(b'"')[0] == b'"'
+ assert _codecs.escape_encode(b"'")[0] == b"\\'"
def test_decode_utf8_different_case(self):
constant = u"a"
@@ -304,35 +309,35 @@
def search_function(encoding):
def f(input, errors="strict"):
return 42
- print encoding
+ print(encoding)
if encoding == 'test.mytestenc':
return (f, f, None, None)
return None
_codecs.register(search_function)
- raises(TypeError, "hello".decode, "test.mytestenc")
+ raises(TypeError, b"hello".decode, "test.mytestenc")
raises(TypeError, u"hello".encode, "test.mytestenc")
def test_cpytest_decode(self):
import codecs
- assert codecs.decode('\xe4\xf6\xfc', 'latin-1') == u'\xe4\xf6\xfc'
+ assert codecs.decode(b'\xe4\xf6\xfc', 'latin-1') == u'\xe4\xf6\xfc'
raises(TypeError, codecs.decode)
- assert codecs.decode('abc') == u'abc'
- raises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
+ assert codecs.decode(b'abc') == u'abc'
+ raises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')
def test_bad_errorhandler_return(self):
import codecs
def baddecodereturn1(exc):
return 42
codecs.register_error("test.baddecodereturn1", baddecodereturn1)
- raises(TypeError, "\xff".decode, "ascii", "test.baddecodereturn1")
- raises(TypeError, "\\".decode, "unicode-escape", "test.baddecodereturn1")
- raises(TypeError, "\\x0".decode, "unicode-escape", "test.baddecodereturn1")
- raises(TypeError, "\\x0y".decode, "unicode-escape", "test.baddecodereturn1")
- raises(TypeError, "\\Uffffeeee".decode, "unicode-escape", "test.baddecodereturn1")
- raises(TypeError, "\\uyyyy".decode, "raw-unicode-escape", "test.baddecodereturn1")
+ raises(TypeError, b"\xff".decode, "ascii", "test.baddecodereturn1")
+ raises(TypeError, b"\\".decode, "unicode-escape", "test.baddecodereturn1")
+ raises(TypeError, b"\\x0".decode, "unicode-escape", "test.baddecodereturn1")
+ raises(TypeError, b"\\x0y".decode, "unicode-escape", "test.baddecodereturn1")
+ raises(TypeError, b"\\Uffffeeee".decode, "unicode-escape", "test.baddecodereturn1")
+ raises(TypeError, b"\\uyyyy".decode, "raw-unicode-escape", "test.baddecodereturn1")
def test_cpy_bug1175396(self):
- import codecs, StringIO
+ import codecs, io
s = [
'<%!--===================================================\r\n',
' BLOG index page: show recent articles,\r\n',
@@ -364,15 +369,15 @@
' log.error("Error loading articles: "+str(x))\r\n',
' self.abort("cannot load articles")\r\n',
]
- stream = StringIO.StringIO("".join(s).encode("utf7"))
- assert "aborrt" not in stream.getvalue()
+ stream = io.BytesIO("".join(s).encode("utf7"))
+ assert b"aborrt" not in stream.getvalue()
reader = codecs.getreader("utf7")(stream)
for (i, line) in enumerate(reader):
assert line == s[i]
def test_array(self):
import _codecs, array
- _codecs.readbuffer_encode(array.array('c', 'spam')) == ('spam', 4)
+ _codecs.readbuffer_encode(array.array('b', b'spam')) == ('spam', 4)
def test_utf8sig(self):
import codecs
@@ -382,28 +387,28 @@
def test_escape_decode_escaped_newline(self):
import _codecs
- s = '\\\n'
+ s = b'\\\n'
decoded = _codecs.unicode_escape_decode(s)[0]
assert decoded == ''
def test_charmap_decode_1(self):
import codecs
- assert codecs.charmap_encode(u'xxx') == ('xxx', 3)
- assert codecs.charmap_encode(u'xxx', 'strict', {ord('x'): 'XX'}) == ('XXXXXX', 3)
+ assert codecs.charmap_encode(u'xxx') == (b'xxx', 3)
+ assert codecs.charmap_encode(u'xxx', 'strict', {ord('x'): b'XX'}) == (b'XXXXXX', 3)
- res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab")
+ res = codecs.charmap_decode(b"\x00\x01\x02", "replace", u"ab")
assert res == (u"ab\ufffd", 3)
- res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe")
+ res = codecs.charmap_decode(b"\x00\x01\x02", "replace", u"ab\ufffe")
assert res == (u'ab\ufffd', 3)
def test_decode_errors(self):
import sys
if sys.maxunicode > 0xffff:
try:
- "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
+ b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
except UnicodeDecodeError, ex:
assert "unicode_internal" == ex.encoding
- assert "\x00\x00\x00\x00\x00\x11\x11\x00" == ex.object
+ assert b"\x00\x00\x00\x00\x00\x11\x11\x00" == ex.object
assert ex.start == 4
assert ex.end == 8
else:
@@ -414,14 +419,14 @@
assert codecs.replace_errors(UnicodeEncodeError(
"ascii", u"\u3042", 0, 1, "ouch")) == (u"?", 1)
assert codecs.replace_errors(UnicodeDecodeError(
- "ascii", "\xff", 0, 1, "ouch")) == (u"\ufffd", 1)
+ "ascii", b"\xff", 0, 1, "ouch")) == (u"\ufffd", 1)
assert codecs.replace_errors(UnicodeTranslateError(
u"\u3042", 0, 1, "ouch")) == (u"\ufffd", 1)
assert codecs.replace_errors(UnicodeEncodeError(
"ascii", u"\u3042\u3042", 0, 2, "ouch")) == (u"??", 2)
assert codecs.replace_errors(UnicodeDecodeError(
- "ascii", "\xff\xff", 0, 2, "ouch")) == (u"\ufffd", 2)
+ "ascii", b"\xff\xff", 0, 2, "ouch")) == (u"\ufffd", 2)
assert codecs.replace_errors(UnicodeTranslateError(
u"\u3042\u3042", 0, 2, "ouch")) == (u"\ufffd\ufffd", 2)
@@ -439,13 +444,13 @@
# A UnicodeDecodeError object without an end attribute
class NoEndUnicodeDecodeError(UnicodeDecodeError):
def __init__(self):
- UnicodeDecodeError.__init__(self, "ascii", "", 0, 1, "bad")
+ UnicodeDecodeError.__init__(self, "ascii", b"", 0, 1, "bad")
del self.end
# A UnicodeDecodeError object with a bad object attribute
class BadObjectUnicodeDecodeError(UnicodeDecodeError):
def __init__(self):
- UnicodeDecodeError.__init__(self, "ascii", "", 0, 1, "bad")
+ UnicodeDecodeError.__init__(self, "ascii", b"", 0, 1, "bad")
self.object = []
# A UnicodeTranslateError object without a start attribute
@@ -477,11 +482,11 @@
# With the correct exception, "replace" returns an "?" or u"\ufffd" replacement
def test_decode_ignore(self):
- assert '\xff'.decode('utf-7', 'ignore') == ''
- assert '\x00'.decode('unicode-internal', 'ignore') == ''
+ assert b'\xff'.decode('utf-7', 'ignore') == ''
+ assert b'\x00'.decode('unicode-internal', 'ignore') == ''
def test_backslahreplace(self):
- assert u'a\xac\u1234\u20ac\u8000'.encode('ascii', 'backslashreplace') == 'a\\xac\u1234\u20ac\u8000'
+ assert u'a\xac\u1234\u20ac\u8000'.encode('ascii', 'backslashreplace') == b'a\\xac\u1234\u20ac\u8000'
def test_surrogateescape(self):
assert b'a\x80b'.decode('utf-8', 'surrogateescape') == 'a\udc80b'
@@ -502,10 +507,10 @@
"test.badhandler"
)
for (enc, bytes) in (
- ("utf-8", "\xff"),
- ("ascii", "\xff"),
- ("utf-7", "+x-"),
- ("unicode-internal", "\x00"),
+ ("utf-8", b"\xff"),
+ ("ascii", b"\xff"),
+ ("utf-7", b"+x-"),
+ ("unicode-internal", b"\x00"),
):
raises(
TypeError,
@@ -518,19 +523,19 @@
import codecs
import sys
try:
- '\x00'.decode('unicode-internal')
+ b'\x00'.decode('unicode-internal')
except UnicodeDecodeError:
pass
else:
raise Exception("DID NOT RAISE")
- res = "\x00\x00\x00\x00\x00".decode("unicode-internal", "replace")
+ res = b"\x00\x00\x00\x00\x00".decode("unicode-internal", "replace")
if sys.maxunicode > 65535:
assert res == u"\u0000\ufffd" # UCS4 build
else:
assert res == u"\x00\x00\ufffd" # UCS2 build
- res = "\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore")
+ res = b"\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore")
if sys.maxunicode > 65535:
assert res == u"\u0000" # UCS4 build
else:
@@ -541,7 +546,7 @@
raise TypeError("don't know how to handle %r" % exc)
return (u"\x01", 1)
codecs.register_error("test.hui", handler_unicodeinternal)
- res = "\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui")
+ res = b"\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui")
if sys.maxunicode > 65535:
assert res == u"\u0000\u0001\u0000" # UCS4 build
else:
@@ -550,31 +555,31 @@
def test_encode_error_bad_handler(self):
import codecs
codecs.register_error("test.bad_handler", lambda e: (repl, 1))
- assert u"xyz".encode("latin-1", "test.bad_handler") == "xyz"
+ assert u"xyz".encode("latin-1", "test.bad_handler") == b"xyz"
repl = u"\u1234"
raises(UnicodeEncodeError, u"\u5678".encode, "latin-1",
"test.bad_handler")
repl = u"\u00E9"
s = u"\u5678".encode("latin-1", "test.bad_handler")
- assert s == '\xe9'
+ assert s == b'\xe9'
def test_charmap_encode(self):
- assert 'xxx'.encode('charmap') == 'xxx'
+ assert 'xxx'.encode('charmap') == b'xxx'
import codecs
raises(TypeError, codecs.charmap_encode, u'\xff', "replace", {0xff: 300})
raises(UnicodeError, codecs.charmap_encode, u"\xff", "replace", {0xff: None})
def test_charmap_encode_replace(self):
- charmap = dict([ (ord(c), 2*c.upper()) for c in "abcdefgh"])
- charmap[ord("?")] = "XYZ"
+ charmap = dict([(c, bytes([c, c]).upper()) for c in b"abcdefgh"])
+ charmap[ord("?")] = b"XYZ"
import codecs
sin = u"abcDEF"
sout = codecs.charmap_encode(sin, "replace", charmap)[0]
- assert sout == "AABBCCXYZXYZXYZ"
+ assert sout == b"AABBCCXYZXYZXYZ"
def test_charmap_decode_2(self):
- assert 'foo'.decode('charmap') == 'foo'
+ assert b'foo'.decode('charmap') == 'foo'
def test_charmap_build(self):
import codecs
@@ -583,25 +588,25 @@
def test_utf7_start_end_in_exception(self):
try:
- '+IC'.decode('utf-7')
+ b'+IC'.decode('utf-7')
except UnicodeDecodeError, exc:
assert exc.start == 0
assert exc.end == 3
def test_utf7_surrogate(self):
- raises(UnicodeDecodeError, '+3ADYAA-'.decode, 'utf-7')
+ raises(UnicodeDecodeError, b'+3ADYAA-'.decode, 'utf-7')
def test_utf_16_encode_decode(self):
import codecs
x = u'123abc'
- assert codecs.getencoder('utf-16')(x) == ('\xff\xfe1\x002\x003\x00a\x00b\x00c\x00', 6)
- assert codecs.getdecoder('utf-16')('\xff\xfe1\x002\x003\x00a\x00b\x00c\x00') == (x, 14)
+ assert codecs.getencoder('utf-16')(x) == (b'\xff\xfe1\x002\x003\x00a\x00b\x00c\x00', 6)
+ assert codecs.getdecoder('utf-16')(b'\xff\xfe1\x002\x003\x00a\x00b\x00c\x00') == (x, 14)
def test_unicode_escape(self):
- assert u'\\'.encode('unicode-escape') == '\\\\'
- assert '\\\\'.decode('unicode-escape') == u'\\'
- assert u'\ud801'.encode('unicode-escape') == '\\ud801'
- assert u'\u0013'.encode('unicode-escape') == '\\x13'
+ assert u'\\'.encode('unicode-escape') == b'\\\\'
+ assert b'\\\\'.decode('unicode-escape') == u'\\'
+ assert u'\ud801'.encode('unicode-escape') == b'\\ud801'
+ assert u'\u0013'.encode('unicode-escape') == b'\\x13'
def test_mbcs(self):
import sys
@@ -611,11 +616,3 @@
assert u'caf\xe9'.encode('mbcs') == 'caf\xe9'
assert u'\u040a'.encode('mbcs') == '?' # some cyrillic letter
assert 'cafx\e9'.decode('mbcs') == u'cafx\e9'
-
- def test_bad_handler_string_result(self):
- import _codecs
- def f(exc):
- return ('foo', exc.end)
- _codecs.register_error("test.test_codecs_not_a_string", f)
- raises(TypeError, u'\u1234'.encode, 'ascii',
- 'test.test_codecs_not_a_string')
diff --git a/pypy/rlib/runicode.py b/pypy/rlib/runicode.py
--- a/pypy/rlib/runicode.py
+++ b/pypy/rlib/runicode.py
@@ -272,11 +272,11 @@
# Encode UCS2 Unicode ordinals
if ch < 0x10000:
# Special case: check for high surrogate
- if 0xD800 <= ch <= 0xDBFF and pos != size:
+ if 0xD800 <= ch <= 0xDFFF and pos != size:
ch2 = ord(s[pos])
# Check for low surrogate and combine the two to
# form a UCS4 value
- if 0xDC00 <= ch2 <= 0xDFFF:
+ if ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF:
ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
pos += 1
_encodeUCS4(result, ch3)
More information about the pypy-commit
mailing list