[pypy-commit] pypy py3k: Unicode fixes in _multibytecodec module
amauryfa
noreply at buildbot.pypy.org
Sat Dec 17 23:01:11 CET 2011
Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: py3k
Changeset: r50643:c13feede4946
Date: 2011-12-17 20:47 +0100
http://bitbucket.org/pypy/pypy/changeset/c13feede4946/
Log: Unicode fixes in _multibytecodec module
diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py
--- a/pypy/module/_multibytecodec/interp_incremental.py
+++ b/pypy/module/_multibytecodec/interp_incremental.py
@@ -48,7 +48,7 @@
c_codecs.pypy_cjk_dec_free(self.decodebuf)
self.decodebuf = lltype.nullptr(c_codecs.DECODEBUF_P.TO)
- @unwrap_spec(object=str, final=bool)
+ @unwrap_spec(object='bufferstr', final=bool)
def decode_w(self, object, final=False):
space = self.space
state = space.fromcache(CodecState)
@@ -114,7 +114,7 @@
pos = c_codecs.pypy_cjk_enc_inbuf_consumed(self.encodebuf)
assert 0 <= pos <= len(object)
self.pending = object[pos:]
- return space.wrap(output)
+ return space.wrapbytes(output)
@unwrap_spec(errors="str_or_None")
diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py
--- a/pypy/module/_multibytecodec/interp_multibytecodec.py
+++ b/pypy/module/_multibytecodec/interp_multibytecodec.py
@@ -12,7 +12,7 @@
self.name = name
self.codec = codec
- @unwrap_spec(input=str, errors="str_or_None")
+ @unwrap_spec(input='bufferstr', errors="str_or_None")
def decode(self, space, input, errors=None):
if errors is None:
errors = 'strict'
@@ -41,7 +41,7 @@
raise wrap_unicodeencodeerror(space, e, input, self.name)
except RuntimeError:
raise wrap_runtimeerror(space)
- return space.newtuple([space.wrap(output),
+ return space.newtuple([space.wrapbytes(output),
space.wrap(len(input))])
@@ -69,7 +69,7 @@
space.w_UnicodeDecodeError,
space.newtuple([
space.wrap(name),
- space.wrap(input),
+ space.wrapbytes(input),
space.wrap(e.start),
space.wrap(e.end),
space.wrap(e.reason)]))
diff --git a/pypy/module/_multibytecodec/test/test_app_codecs.py b/pypy/module/_multibytecodec/test/test_app_codecs.py
--- a/pypy/module/_multibytecodec/test/test_app_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_app_codecs.py
@@ -12,76 +12,76 @@
def test_decode_hz(self):
import _codecs_cn
codec = _codecs_cn.getcodec("hz")
- r = codec.decode("~{abc}")
- assert r == (u'\u5f95\u6cef', 6)
+ r = codec.decode(b"~{abc}")
+ assert r == ('\u5f95\u6cef', 6)
def test_strict_error(self):
import _codecs_cn
codec = _codecs_cn.getcodec("hz")
- r = codec.decode("~{abc}", "strict")
- assert r == (u'\u5f95\u6cef', 6)
- assert type(r[0]) is unicode
+ r = codec.decode(b"~{abc}", "strict")
+ assert r == ('\u5f95\u6cef', 6)
+ assert type(r[0]) is str
def test_decode_hz_error(self):
import _codecs_cn
codec = _codecs_cn.getcodec("hz")
- e = raises(UnicodeDecodeError, codec.decode, "~{}").value
- assert e.args == ('hz', '~{}', 2, 3, 'incomplete multibyte sequence')
+ e = raises(UnicodeDecodeError, codec.decode, b"~{}").value
+ assert e.args == ('hz', b'~{}', 2, 3, 'incomplete multibyte sequence')
assert e.encoding == 'hz'
- assert e.object == '~{}' and type(e.object) is str
+ assert e.object == b'~{}' and type(e.object) is bytes
assert e.start == 2
assert e.end == 3
assert e.reason == "incomplete multibyte sequence"
#
- e = raises(UnicodeDecodeError, codec.decode, "~{xyz}").value
- assert e.args == ('hz', '~{xyz}', 2, 4, 'illegal multibyte sequence')
+ e = raises(UnicodeDecodeError, codec.decode, b"~{xyz}").value
+ assert e.args == ('hz', b'~{xyz}', 2, 4, 'illegal multibyte sequence')
def test_decode_hz_ignore(self):
import _codecs_cn
codec = _codecs_cn.getcodec("hz")
- r = codec.decode("def~{}abc", errors='ignore')
- assert r == (u'def\u5fcf', 9)
- r = codec.decode("def~{}abc", 'ignore')
- assert r == (u'def\u5fcf', 9)
+ r = codec.decode(b"def~{}abc", errors='ignore')
+ assert r == ('def\u5fcf', 9)
+ r = codec.decode(b"def~{}abc", 'ignore')
+ assert r == ('def\u5fcf', 9)
def test_decode_hz_replace(self):
import _codecs_cn
codec = _codecs_cn.getcodec("hz")
- r = codec.decode("def~{}abc", errors='replace')
- assert r == (u'def\ufffd\u5fcf', 9)
- r = codec.decode("def~{}abc", 'replace')
- assert r == (u'def\ufffd\u5fcf', 9)
+ r = codec.decode(b"def~{}abc", errors='replace')
+ assert r == ('def\ufffd\u5fcf', 9)
+ r = codec.decode(b"def~{}abc", 'replace')
+ assert r == ('def\ufffd\u5fcf', 9)
def test_decode_custom_error_handler(self):
import codecs
codecs.register_error("test.decode_custom_error_handler",
- lambda e: (u'\u1234\u5678', e.end))
- u = "abc\xDD".decode("hz", "test.decode_custom_error_handler")
- assert u == u'abc\u1234\u5678'
+ lambda e: ('\u1234\u5678', e.end))
+ u = b"abc\xDD".decode("hz", "test.decode_custom_error_handler")
+ assert u == 'abc\u1234\u5678'
def test_decode_custom_error_handler_overflow(self):
import codecs
import sys
codecs.register_error("test.test_decode_custom_error_handler_overflow",
- lambda e: (u'', sys.maxint + 1))
- raises((IndexError, OverflowError), "abc\xDD".decode, "hz",
+ lambda e: ('', sys.maxint + 1))
+ raises((IndexError, OverflowError), b"abc\xDD".decode, "hz",
"test.test_decode_custom_error_handler_overflow")
def test_encode_hz(self):
import _codecs_cn
codec = _codecs_cn.getcodec("hz")
- r = codec.encode(u'\u5f95\u6cef')
- assert r == ('~{abc}~}', 2)
- assert type(r[0]) is str
+ r = codec.encode('\u5f95\u6cef')
+ assert r == (b'~{abc}~}', 2)
+ assert type(r[0]) is bytes
def test_encode_hz_error(self):
import _codecs_cn
codec = _codecs_cn.getcodec("hz")
- u = u'abc\u1234def'
+ u = 'abc\u1234def'
e = raises(UnicodeEncodeError, codec.encode, u).value
assert e.args == ('hz', u, 3, 4, 'illegal multibyte sequence')
assert e.encoding == 'hz'
- assert e.object == u and type(e.object) is unicode
+ assert e.object == u and type(e.object) is str
assert e.start == 3
assert e.end == 4
assert e.reason == 'illegal multibyte sequence'
@@ -89,20 +89,20 @@
def test_encode_hz_ignore(self):
import _codecs_cn
codec = _codecs_cn.getcodec("hz")
- r = codec.encode(u'abc\u1234def', 'ignore')
- assert r == ('abcdef', 7)
- assert type(r[0]) is str
+ r = codec.encode('abc\u1234def', 'ignore')
+ assert r == (b'abcdef', 7)
+ assert type(r[0]) is bytes
def test_encode_hz_replace(self):
import _codecs_cn
codec = _codecs_cn.getcodec("hz")
- r = codec.encode(u'abc\u1234def', 'replace')
- assert r == ('abc?def', 7)
- assert type(r[0]) is str
+ r = codec.encode('abc\u1234def', 'replace')
+ assert r == (b'abc?def', 7)
+ assert type(r[0]) is bytes
def test_encode_custom_error_handler(self):
import codecs
codecs.register_error("test.multi_bad_handler", lambda e: (repl, 1))
- repl = u"\u2014"
- s = u"\uDDA1".encode("gbk", "test.multi_bad_handler")
- assert s == '\xA1\xAA'
+ repl = "\u2014"
+ s = "\uDDA1".encode("gbk", "test.multi_bad_handler")
+ assert s == b'\xA1\xAA'
diff --git a/pypy/module/_multibytecodec/test/test_app_incremental.py b/pypy/module/_multibytecodec/test/test_app_incremental.py
--- a/pypy/module/_multibytecodec/test/test_app_incremental.py
+++ b/pypy/module/_multibytecodec/test/test_app_incremental.py
@@ -34,86 +34,86 @@
def test_decode_hz(self):
d = self.IncrementalHzDecoder()
- r = d.decode("~{abcd~}")
- assert r == u'\u5f95\u6c85'
- r = d.decode("~{efgh~}")
- assert r == u'\u5f50\u73b7'
- for c, output in zip("!~{abcd~}xyz~{efgh",
- [u'!', # !
- u'', # ~
- u'', # {
- u'', # a
- u'\u5f95', # b
- u'', # c
- u'\u6c85', # d
- u'', # ~
- u'', # }
- u'x', # x
- u'y', # y
- u'z', # z
- u'', # ~
- u'', # {
- u'', # e
- u'\u5f50', # f
- u'', # g
- u'\u73b7', # h
+ r = d.decode(b"~{abcd~}")
+ assert r == '\u5f95\u6c85'
+ r = d.decode(b"~{efgh~}")
+ assert r == '\u5f50\u73b7'
+ for c, output in zip(b"!~{abcd~}xyz~{efgh",
+ ['!', # !
+ '', # ~
+ '', # {
+ '', # a
+ '\u5f95', # b
+ '', # c
+ '\u6c85', # d
+ '', # ~
+ '', # }
+ 'x', # x
+ 'y', # y
+ 'z', # z
+ '', # ~
+ '', # {
+ '', # e
+ '\u5f50', # f
+ '', # g
+ '\u73b7', # h
]):
- r = d.decode(c)
+ r = d.decode(bytes([c]))
assert r == output
def test_decode_hz_final(self):
d = self.IncrementalHzDecoder()
- r = d.decode("~{", True)
- assert r == u''
- raises(UnicodeDecodeError, d.decode, "~", True)
- raises(UnicodeDecodeError, d.decode, "~{a", True)
+ r = d.decode(b"~{", True)
+ assert r == ''
+ raises(UnicodeDecodeError, d.decode, b"~", True)
+ raises(UnicodeDecodeError, d.decode, b"~{a", True)
def test_decode_hz_reset(self):
d = self.IncrementalHzDecoder()
- r = d.decode("ab")
- assert r == u'ab'
- r = d.decode("~{")
- assert r == u''
- r = d.decode("ab")
- assert r == u'\u5f95'
- r = d.decode("ab")
- assert r == u'\u5f95'
+ r = d.decode(b"ab")
+ assert r == 'ab'
+ r = d.decode(b"~{")
+ assert r == ''
+ r = d.decode(b"ab")
+ assert r == '\u5f95'
+ r = d.decode(b"ab")
+ assert r == '\u5f95'
d.reset()
- r = d.decode("ab")
- assert r == u'ab'
+ r = d.decode(b"ab")
+ assert r == 'ab'
def test_decode_hz_error(self):
d = self.IncrementalHzDecoder()
- raises(UnicodeDecodeError, d.decode, "~{abc", True)
+ raises(UnicodeDecodeError, d.decode, b"~{abc", True)
d = self.IncrementalHzDecoder("ignore")
- r = d.decode("~{abc", True)
+ r = d.decode(b"~{abc", True)
assert r == u'\u5f95'
d = self.IncrementalHzDecoder()
d.errors = "replace"
- r = d.decode("~{abc", True)
- assert r == u'\u5f95\ufffd'
+ r = d.decode(b"~{abc", True)
+ assert r == '\u5f95\ufffd'
def test_decode_hz_buffer_grow(self):
d = self.IncrementalHzDecoder()
for i in range(13):
- r = d.decode("a" * (2**i))
- assert r == u"a" * (2**i)
+ r = d.decode(b"a" * (2**i))
+ assert r == "a" * (2**i)
def test_encode_hz(self):
e = self.IncrementalHzEncoder()
r = e.encode("abcd")
- assert r == 'abcd'
- r = e.encode(u"\u5f95\u6c85")
- assert r == '~{abcd~}'
- r = e.encode(u"\u5f50")
- assert r == '~{ef~}'
- r = e.encode(u"\u73b7")
- assert r == '~{gh~}'
+ assert r == b'abcd'
+ r = e.encode("\u5f95\u6c85")
+ assert r == b'~{abcd~}'
+ r = e.encode("\u5f50")
+ assert r == b'~{ef~}'
+ r = e.encode("\u73b7")
+ assert r == b'~{gh~}'
def test_encode_hz_final(self):
e = self.IncrementalHzEncoder()
- r = e.encode(u"xyz\u5f95\u6c85", True)
- assert r == 'xyz~{abcd~}'
+ r = e.encode("xyz\u5f95\u6c85", True)
+ assert r == b'xyz~{abcd~}'
# This is a bit hard to test, because the only way I can see that
# encoders can return MBERR_TOOFEW is with surrogates, which only
# occur with 2-byte unicode characters... We will just have to
@@ -123,41 +123,41 @@
def test_encode_hz_reset(self):
# Same issue as with test_encode_hz_final
e = self.IncrementalHzEncoder()
- r = e.encode(u"xyz\u5f95\u6c85", True)
- assert r == 'xyz~{abcd~}'
+ r = e.encode("xyz\u5f95\u6c85", True)
+ assert r == b'xyz~{abcd~}'
e.reset()
- r = e.encode(u"xyz\u5f95\u6c85")
- assert r == 'xyz~{abcd~}'
+ r = e.encode("xyz\u5f95\u6c85")
+ assert r == b'xyz~{abcd~}'
def test_encode_hz_error(self):
e = self.IncrementalHzEncoder()
- raises(UnicodeEncodeError, e.encode, u"\u4321", True)
+ raises(UnicodeEncodeError, e.encode, "\u4321", True)
e = self.IncrementalHzEncoder("ignore")
- r = e.encode(u"xy\u4321z", True)
- assert r == 'xyz'
+ r = e.encode("xy\u4321z", True)
+ assert r == b'xyz'
e = self.IncrementalHzEncoder()
e.errors = "replace"
- r = e.encode(u"xy\u4321z", True)
- assert r == 'xy?z'
+ r = e.encode("xy\u4321z", True)
+ assert r == b'xy?z'
def test_encode_hz_buffer_grow(self):
e = self.IncrementalHzEncoder()
for i in range(13):
- r = e.encode(u"a" * (2**i))
- assert r == "a" * (2**i)
+ r = e.encode("a" * (2**i))
+ assert r == b"a" * (2**i)
def test_encode_big5hkscs(self):
#e = self.IncrementalBig5hkscsEncoder()
- #r = e.encode(u'\xca', True)
- #assert r == '\x88f'
- #r = e.encode(u'\xca', True)
- #assert r == '\x88f'
- #raises(UnicodeEncodeError, e.encode, u'\u0304', True)
+ #r = e.encode('\xca', True)
+ #assert r == b'\x88f'
+ #r = e.encode('\xca', True)
+ #assert r == b'\x88f'
+ #raises(UnicodeEncodeError, e.encode, '\u0304', True)
#
e = self.IncrementalBig5hkscsEncoder()
- r = e.encode(u'\xca')
- assert r == ''
- r = e.encode(u'\xca')
- assert r == '\x88f'
- r = e.encode(u'\u0304')
- assert r == '\x88b'
+ r = e.encode('\xca')
+ assert r == b''
+ r = e.encode('\xca')
+ assert r == b'\x88f'
+ r = e.encode('\u0304')
+ assert r == b'\x88b'
diff --git a/pypy/module/_multibytecodec/test/test_app_stream.py b/pypy/module/_multibytecodec/test/test_app_stream.py
--- a/pypy/module/_multibytecodec/test/test_app_stream.py
+++ b/pypy/module/_multibytecodec/test/test_app_stream.py
@@ -42,8 +42,8 @@
self.pos += size
return res
#
- r = self.HzStreamReader(FakeFile("!~{abcd~}xyz~{efgh"))
- for expected in u'!\u5f95\u6c85xyz\u5f50\u73b7':
+ r = self.HzStreamReader(FakeFile(b"!~{abcd~}xyz~{efgh"))
+ for expected in '!\u5f95\u6c85xyz\u5f50\u73b7':
c = r.read(1)
assert c == expected
c = r.read(1)
@@ -56,15 +56,15 @@
def read(self):
return self.data
#
- r = self.HzStreamReader(FakeFile("!~{a"), "replace")
+ r = self.HzStreamReader(FakeFile(b"!~{a"), "replace")
c = r.read()
- assert c == u'!\ufffd'
+ assert c == '!\ufffd'
#
- r = self.HzStreamReader(FakeFile("!~{a"))
+ r = self.HzStreamReader(FakeFile(b"!~{a"))
r.errors = "replace"
assert r.errors == "replace"
c = r.read()
- assert c == u'!\ufffd'
+ assert c == '!\ufffd'
def test_writer(self):
class FakeFile:
@@ -74,10 +74,10 @@
self.output.append(data)
#
w = self.HzStreamWriter(FakeFile())
- for input in u'!\u5f95\u6c85xyz\u5f50\u73b7':
+ for input in '!\u5f95\u6c85xyz\u5f50\u73b7':
w.write(input)
- assert w.stream.output == ['!', '~{ab~}', '~{cd~}', 'x', 'y', 'z',
- '~{ef~}', '~{gh~}']
+ assert w.stream.output == [b'!', b'~{ab~}', b'~{cd~}', b'x', b'y', b'z',
+ b'~{ef~}', b'~{gh~}']
def test_no_flush(self):
class FakeFile:
@@ -87,7 +87,7 @@
self.output.append(data)
#
w = self.ShiftJisx0213StreamWriter(FakeFile())
- w.write(u'\u30ce')
- w.write(u'\u304b')
- w.write(u'\u309a')
- assert w.stream.output == ['\x83m', '', '\x82\xf5']
+ w.write('\u30ce')
+ w.write('\u304b')
+ w.write('\u309a')
+ assert w.stream.output == [b'\x83m', b'', b'\x82\xf5']
More information about the pypy-commit mailing list