[pypy-commit] pypy py3k: Lot of fixes in the _codecs module

amauryfa noreply at buildbot.pypy.org
Wed Oct 19 23:11:15 CEST 2011


Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: py3k
Changeset: r48241:e0cf3d8b87a2
Date: 2011-10-19 22:31 +0200
http://bitbucket.org/pypy/pypy/changeset/e0cf3d8b87a2/

Log:	Lot of fixes in the _codecs module

diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -49,7 +49,7 @@
                            "(unicode, int) tuple, not %s")
                 raise operationerrfmt(
                     space.w_TypeError, msg,
-                    space.str_w(space.repr(w_res)))
+                    space.unicode_w(space.repr(w_res)))
             w_replace, w_newpos = space.fixedview(w_res, 2)
             newpos = space.int_w(w_newpos)
             if newpos < 0:
@@ -487,7 +487,7 @@
     make_encoder_wrapper('mbcs_encode')
     make_decoder_wrapper('mbcs_decode')
 
-@unwrap_spec(data=str, errors='str_or_None', byteorder=int)
+@unwrap_spec(data="bufferstr", errors='str_or_None', byteorder=int)
 def utf_16_ex_decode(space, data, errors='strict', byteorder=0, w_final=False):
     if errors is None:
         errors = 'strict'
@@ -507,7 +507,7 @@
     return space.newtuple([space.wrap(res), space.wrap(consumed),
                            space.wrap(byteorder)])
 
-@unwrap_spec(data=str, errors='str_or_None', byteorder=int)
+@unwrap_spec(data="bufferstr", errors='str_or_None', byteorder=int)
 def utf_32_ex_decode(space, data, errors='strict', byteorder=0, w_final=False):
     final = space.is_true(w_final)
     state = space.fromcache(CodecState)
@@ -599,7 +599,7 @@
 
         # Charmap may return a string
         try:
-            x = space.realstr_w(w_ch)
+            x = space.bytes_w(w_ch)
         except OperationError, e:
             if not e.match(space, space.w_TypeError):
                 raise
@@ -626,7 +626,7 @@
         raise OperationError(space.w_TypeError, space.wrap("invalid mapping"))
 
 
-@unwrap_spec(string=str, errors='str_or_None')
+@unwrap_spec(string="bufferstr", errors='str_or_None')
 def charmap_decode(space, string, errors="strict", w_mapping=None):
     if errors is None:
         errors = 'strict'
@@ -658,7 +658,7 @@
     result = runicode.unicode_encode_charmap(
         uni, len(uni), errors,
         state.encode_error_handler, mapping)
-    return space.newtuple([space.wrap(result), space.wrap(len(uni))])
+    return space.newtuple([space.wrapbytes(result), space.wrap(len(uni))])
 
 
 @unwrap_spec(chars=unicode)
@@ -716,7 +716,7 @@
     if space.isinstance_w(w_string, space.w_unicode):
         return space.newtuple([w_string, space.len(w_string)])
 
-    string = space.str_w(w_string)
+    string = space.bytes_w(w_string)
 
     if len(string) == 0:
         return space.newtuple([space.wrap(u''), space.wrap(0)])
@@ -729,21 +729,21 @@
     return space.newtuple([space.wrap(result), space.wrap(consumed)])
 
 # ____________________________________________________________
-# support for the "string escape" codec
+# support for the "string escape" translation
 # This is a bytes-to bytes transformation
 
-@unwrap_spec(data=str, errors='str_or_None')
+@unwrap_spec(data="bufferstr", errors='str_or_None')
 def escape_encode(space, data, errors='strict'):
     from pypy.objspace.std.stringobject import string_escape_encode
     result = string_escape_encode(data, quote="'")
     start = 1
     end = len(result) - 1
     assert end >= 0
-    w_result = space.wrap(result[start:end])
+    w_result = space.wrapbytes(result[start:end])
     return space.newtuple([w_result, space.wrap(len(data))])
 
-@unwrap_spec(data=str, errors='str_or_None')
+@unwrap_spec(data="bufferstr", errors='str_or_None')
 def escape_decode(space, data, errors='strict'):
     from pypy.interpreter.pyparser.parsestring import PyString_DecodeEscape
     result = PyString_DecodeEscape(space, data, None)
-    return space.newtuple([space.wrap(result), space.wrap(len(data))])
+    return space.newtuple([space.wrapbytes(result), space.wrap(len(data))])
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -17,7 +17,7 @@
                          'utf-32', 'utf-32-le', 'utf-32-be',
                          'raw_unicode_escape',
                          'unicode_escape', 'unicode_internal'):
-            assert unicode(u.encode(encoding),encoding) == u
+            assert str(u.encode(encoding),encoding) == u
 
     def test_ucs4(self):
         x = u'\U00100000'
@@ -25,14 +25,14 @@
         assert x == y 
 
     def test_named_unicode(self):
-        assert unicode('\\N{SPACE}','unicode-escape') == u" "
-        raises( UnicodeDecodeError, unicode,'\\N{SPACE','unicode-escape')
-        raises( UnicodeDecodeError, unicode,'\\NSPACE}','unicode-escape')
-        raises( UnicodeDecodeError, unicode,'\\NSPACE','unicode-escape')
-        raises( UnicodeDecodeError, unicode,'\\N','unicode-escape')
-        assert  unicode('\\N{SPACE}\\N{SPACE}','unicode-escape') == u"  " 
-        assert  unicode('\\N{SPACE}a\\N{SPACE}','unicode-escape') == u" a " 
-        assert "\\N{foo}xx".decode("unicode-escape", "ignore") == u"xx"
+        assert str(b'\\N{SPACE}','unicode-escape') == u" "
+        raises( UnicodeDecodeError, str,b'\\N{SPACE','unicode-escape')
+        raises( UnicodeDecodeError, str,b'\\NSPACE}','unicode-escape')
+        raises( UnicodeDecodeError, str,b'\\NSPACE','unicode-escape')
+        raises( UnicodeDecodeError, str,b'\\N','unicode-escape')
+        assert  str(b'\\N{SPACE}\\N{SPACE}','unicode-escape') == u"  " 
+        assert  str(b'\\N{SPACE}a\\N{SPACE}','unicode-escape') == u" a " 
+        assert b"\\N{foo}xx".decode("unicode-escape", "ignore") == u"xx"
         assert 1 <= len(u"\N{CJK UNIFIED IDEOGRAPH-20000}") <= 2
 
     def test_literals(self):
@@ -40,26 +40,26 @@
 
     def test_insecure_pickle(self):
         import pickle
-        insecure = ["abc", "2 + 2", # not quoted
+        insecure = [b"abc", b"2 + 2", # not quoted
                     #"'abc' + 'def'", # not a single quoted string
-                    "'abc", # quote is not closed
-                    "'abc\"", # open quote and close quote don't match
-                    "'abc'   ?", # junk after close quote
-                    "'\\'", # trailing backslash
+                    b"'abc", # quote is not closed
+                    b"'abc\"", # open quote and close quote don't match
+                    b"'abc'   ?", # junk after close quote
+                    b"'\\'", # trailing backslash
                     # some tests of the quoting rules
                     #"'abc\"\''",
                     #"'\\\\a\'\'\'\\\'\\\\\''",
                     ]
         for s in insecure:
-            buf = "S" + s + "\012p0\012."
+            buf = b"S" + s + b"\012p0\012."
             raises (ValueError, pickle.loads, buf)
 
     def test_unicodedecodeerror(self):
         assert str(UnicodeDecodeError(
-            "ascii", "g\xfcrk", 1, 2, "ouch")) == "'ascii' codec can't decode byte 0xfc in position 1: ouch"
+            "ascii", b"g\xfcrk", 1, 2, "ouch")) == "'ascii' codec can't decode byte 0xfc in position 1: ouch"
         
         assert str(UnicodeDecodeError(
-            "ascii", "g\xfcrk", 1, 3, "ouch")) == "'ascii' codec can't decode bytes in position 1-2: ouch"
+            "ascii", b"g\xfcrk", 1, 3, "ouch")) == "'ascii' codec can't decode bytes in position 1-2: ouch"
         
 
     def test_unicodetranslateerror(self):
@@ -73,7 +73,7 @@
         assert str(UnicodeTranslateError(
             u"g\uffffrk", 1, 2, "ouch"))== "can't translate character u'\\uffff' in position 1: ouch"
         
-        if sys.maxunicode > 0xffff and len(unichr(0x10000)) == 1:
+        if sys.maxunicode > 0xffff and len(chr(0x10000)) == 1:
             assert str(UnicodeTranslateError(
                 u"g\U00010000rk", 1, 2, "ouch"))== "can't translate character u'\\U00010000' in position 1: ouch"
             
@@ -96,30 +96,31 @@
        
         assert str(UnicodeEncodeError(
             "ascii", u"\uffffx", 0, 1, "ouch"))=="'ascii' codec can't encode character u'\\uffff' in position 0: ouch"
-        if sys.maxunicode > 0xffff and len(unichr(0x10000)) == 1:
+        if sys.maxunicode > 0xffff and len(chr(0x10000)) == 1:
             assert str(UnicodeEncodeError(
                 "ascii", u"\U00010000x", 0, 1, "ouch")) =="'ascii' codec can't encode character u'\\U00010000' in position 0: ouch"
     
     def test_indexerror(self):
-        test =   "\\"     # trailing backslash
-        raises (ValueError, test.decode,'string-escape')
+        import _codecs
+        test =   b"\\"     # trailing backslash
+        raises (ValueError, _codecs.escape_decode, test)
 
     def test_charmap_decode(self):
         from _codecs import charmap_decode
         import sys
-        assert charmap_decode('', 'strict', 'blablabla') == ('', 0)
-        assert charmap_decode('xxx') == ('xxx', 3)
-        assert charmap_decode('xxx', 'strict', {ord('x'): u'XX'}) == ('XXXXXX', 3)
-        map = tuple([unichr(i) for i in range(256)])
-        assert charmap_decode('xxx\xff', 'strict', map) == (u'xxx\xff', 4)
+        assert charmap_decode(b'', 'strict', 'blablabla') == ('', 0)
+        assert charmap_decode(b'xxx') == ('xxx', 3)
+        assert charmap_decode(b'xxx', 'strict', {ord('x'): u'XX'}) == ('XXXXXX', 3)
+        map = tuple([chr(i) for i in range(256)])
+        assert charmap_decode(b'xxx\xff', 'strict', map) == (u'xxx\xff', 4)
 
         raises(TypeError, charmap_decode, '\xff', "replace",  {0xff: 0x10001})
 
     def test_unicode_escape(self):
         from _codecs import unicode_escape_encode, unicode_escape_decode
         assert unicode_escape_encode(u'abc') == (u'abc'.encode('unicode_escape'), 3)
-        assert unicode_escape_decode('abc') == (u'abc'.decode('unicode_escape'), 3)
-        assert unicode_escape_decode('\\x61\\x62\\x63') == (u'abc', 12)
+        assert unicode_escape_decode(b'abc') == (b'abc'.decode('unicode_escape'), 3)
+        assert unicode_escape_decode(b'\\x61\\x62\\x63') == (u'abc', 12)
 
 class AppTestPartialEvaluation:
 
@@ -144,13 +145,13 @@
                 u"\x00\xff\u07ff\u0800\uffff",
             ]
             
-        buffer = ''
+        buffer = b''
         result = u""
         for (c, partialresult) in zip(u"\x00\xff\u07ff\u0800\uffff".encode(encoding), check_partial):
-            buffer += c
+            buffer += bytes([c])
             res = _codecs.utf_8_decode(buffer,'strict',False)
             if res[1] >0 :
-                buffer = ''
+                buffer = b''
             result += res[0]
             assert result == partialresult
 
@@ -169,26 +170,26 @@
                     u"\x00\xff\u0100",
                     u"\x00\xff\u0100\uffff",
                 ]
-        buffer = ''
+        buffer = b''
         result = u""
         for (c, partialresult) in zip(u"\x00\xff\u0100\uffff".encode(encoding), check_partial):
-            buffer += c
+            buffer += bytes([c])
             res = _codecs.utf_16_decode(buffer,'strict',False)
             if res[1] >0 :
-                buffer = ''
+                buffer = b''
             result += res[0]
             assert result == partialresult
 
     def test_bug1098990_a(self):
 
-        import codecs, StringIO
+        import codecs, io
         self.encoding = 'utf-8'
         s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
         s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
         s3 = u"next line.\r\n"
        
         s = (s1+s2+s3).encode(self.encoding)
-        stream = StringIO.StringIO(s)
+        stream = io.BytesIO(s)
         reader = codecs.getreader(self.encoding)(stream)
         assert reader.readline() == s1
         assert reader.readline() == s2
@@ -196,7 +197,7 @@
         assert reader.readline() == u""
 
     def test_bug1098990_b(self):
-        import codecs, StringIO
+        import codecs, io
         self.encoding = 'utf-8'
         s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
         s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
@@ -205,7 +206,7 @@
         s5 = u"againokay.\r\n"
 
         s = (s1+s2+s3+s4+s5).encode(self.encoding)
-        stream = StringIO.StringIO(s)
+        stream = io.BytesIO(s)
         reader = codecs.getreader(self.encoding)(stream)
         assert reader.readline() == s1
         assert reader.readline() == s2
@@ -216,11 +217,11 @@
     
     def test_seek_utf16le(self):
         # all codecs should be able to encode these
-        import codecs, StringIO
+        import codecs, io
         encoding = 'utf-16-le'
         s = u"%s\n%s\n" % (10*u"abc123", 10*u"def456")
-        reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
-        for t in xrange(5):
+        reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
+        for t in range(5):
             # Test that calling seek resets the internal codec state and buffers
             reader.seek(0, 0)
             line = reader.readline()
@@ -229,71 +230,75 @@
 
     def test_unicode_internal_encode(self):
         import sys
-        class U(unicode):
+        class U(str):
             pass
         enc = U(u"a").encode("unicode_internal")
         if sys.maxunicode == 65535: # UCS2 build
             if sys.byteorder == "big":
-                assert enc == "\x00a"
+                assert enc == b"\x00a"
             else:
-                assert enc == "a\x00"
+                assert enc == b"a\x00"
         elif len(u"\U00010098") == 1:
             # UCS4 build on a UCS4 CPython
             enc2 = u"\U00010098".encode("unicode_internal")
             if sys.byteorder == "big":
-                assert enc == "\x00\x00\x00a"
-                assert enc2 == "\x00\x01\x00\x98"
+                assert enc == b"\x00\x00\x00a"
+                assert enc2 == b"\x00\x01\x00\x98"
             else:
-                assert enc == "a\x00\x00\x00"
-                assert enc2 == "\x98\x00\x01\x00"
+                assert enc == b"a\x00\x00\x00"
+                assert enc2 == b"\x98\x00\x01\x00"
         else:
             # UCS4 build on a UCS2 CPython
             if sys.byteorder == "big":
-                assert enc == "\x00\x00\x00a"
+                assert enc == b"\x00\x00\x00a"
             else:
-                assert enc == "a\x00\x00\x00"
+                assert enc == b"a\x00\x00\x00"
 
     def test_unicode_internal_decode(self):
         import sys
         if sys.maxunicode == 65535: # UCS2 build
             if sys.byteorder == "big":
-                bytes = "\x00a"
+                bytes = b"\x00a"
             else:
-                bytes = "a\x00"
+                bytes = b"a\x00"
         else: # UCS4 build
             if sys.byteorder == "big":
-                bytes = "\x00\x00\x00a"
-                bytes2 = "\x00\x01\x00\x98"
+                bytes = b"\x00\x00\x00a"
+                bytes2 = b"\x00\x01\x00\x98"
             else:
-                bytes = "a\x00\x00\x00"
-                bytes2 = "\x98\x00\x01\x00"
+                bytes = b"a\x00\x00\x00"
+                bytes2 = b"\x98\x00\x01\x00"
             assert bytes2.decode("unicode_internal") == u"\U00010098"
         assert bytes.decode("unicode_internal") == u"a"
 
     def test_raw_unicode_escape(self):
-        assert unicode("\u0663", "raw-unicode-escape") == u"\u0663"
-        assert u"\u0663".encode("raw-unicode-escape") == "\u0663"
+        assert str(b"\u0663", "raw-unicode-escape") == u"\u0663"
+        assert u"\u0663".encode("raw-unicode-escape") == b"\u0663"
 
     def test_escape_decode(self):
-        test = 'a\n\\b\x00c\td\u2045'.encode('string_escape')
-        assert test.decode('string_escape') =='a\n\\b\x00c\td\u2045'
-        assert '\\077'.decode('string_escape') == '?'
-        assert '\\100'.decode('string_escape') == '@'
-        assert '\\253'.decode('string_escape') == chr(0253)
-        assert '\\312'.decode('string_escape') == chr(0312)
+        import _codecs
+        test = _codecs.escape_encode(b'a\n\\b\x00c\td\u2045')[0]
+        assert _codecs.escape_decode(test)[0] == b'a\n\\b\x00c\td\u2045'
+        assert _codecs.escape_decode(b'\\077')[0] == b'?'
+        assert _codecs.escape_decode(b'\\100')[0] == b'@'
+        assert _codecs.escape_decode(b'\\253')[0] == bytes([0253])
+        assert _codecs.escape_decode(b'\\312')[0] == bytes([0312])
 
     def test_escape_decode_wrap_around(self):
-        assert '\\400'.decode('string_escape') == chr(0)
+        import _codecs
+        assert _codecs.escape_decode(b'\\400')[0] == b'\0'
 
     def test_escape_decode_ignore_invalid(self):
-        assert '\\9'.decode('string_escape') == '\\9'
-        assert '\\01'.decode('string_escape') == chr(01)
-        assert '\\0f'.decode('string_escape') == chr(0) + 'f'
-        assert '\\08'.decode('string_escape') == chr(0) + '8'
+        import _codecs
+        assert _codecs.escape_decode(b'\\9')[0] == b'\\9'
+        assert _codecs.escape_decode(b'\\01')[0] == b'\x01'
+        assert _codecs.escape_decode(b'\\0f')[0] == b'\0' + b'f'
+        assert _codecs.escape_decode(b'\\08')[0] == b'\0' + b'8'
 
     def test_escape_encode(self):
-        assert '"'.encode('string_escape') == '"'
-        assert "'".encode('string_escape') == "\\'"
+        import _codecs
+        assert _codecs.escape_encode(b'"')[0] == b'"'
+        assert _codecs.escape_encode(b"'")[0] == b"\\'"
 
     def test_decode_utf8_different_case(self):
         constant = u"a"
@@ -304,35 +309,35 @@
         def search_function(encoding):
             def f(input, errors="strict"):
                 return 42
-            print encoding
+            print(encoding)
             if encoding == 'test.mytestenc':
                 return (f, f, None, None)
             return None
         _codecs.register(search_function)
-        raises(TypeError, "hello".decode, "test.mytestenc")
+        raises(TypeError, b"hello".decode, "test.mytestenc")
         raises(TypeError, u"hello".encode, "test.mytestenc")
 
     def test_cpytest_decode(self):
         import codecs
-        assert codecs.decode('\xe4\xf6\xfc', 'latin-1') == u'\xe4\xf6\xfc'
+        assert codecs.decode(b'\xe4\xf6\xfc', 'latin-1') == u'\xe4\xf6\xfc'
         raises(TypeError, codecs.decode)
-        assert codecs.decode('abc') == u'abc'
-        raises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
+        assert codecs.decode(b'abc') == u'abc'
+        raises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')
 
     def test_bad_errorhandler_return(self):
         import codecs
         def baddecodereturn1(exc):
             return 42
         codecs.register_error("test.baddecodereturn1", baddecodereturn1)
-        raises(TypeError, "\xff".decode, "ascii", "test.baddecodereturn1")
-        raises(TypeError, "\\".decode, "unicode-escape", "test.baddecodereturn1")
-        raises(TypeError, "\\x0".decode, "unicode-escape", "test.baddecodereturn1")
-        raises(TypeError, "\\x0y".decode, "unicode-escape", "test.baddecodereturn1")
-        raises(TypeError, "\\Uffffeeee".decode, "unicode-escape", "test.baddecodereturn1")
-        raises(TypeError, "\\uyyyy".decode, "raw-unicode-escape", "test.baddecodereturn1")
+        raises(TypeError, b"\xff".decode, "ascii", "test.baddecodereturn1")
+        raises(TypeError, b"\\".decode, "unicode-escape", "test.baddecodereturn1")
+        raises(TypeError, b"\\x0".decode, "unicode-escape", "test.baddecodereturn1")
+        raises(TypeError, b"\\x0y".decode, "unicode-escape", "test.baddecodereturn1")
+        raises(TypeError, b"\\Uffffeeee".decode, "unicode-escape", "test.baddecodereturn1")
+        raises(TypeError, b"\\uyyyy".decode, "raw-unicode-escape", "test.baddecodereturn1")
 
     def test_cpy_bug1175396(self):
-        import codecs, StringIO
+        import codecs, io
         s = [
             '<%!--===================================================\r\n',
             '    BLOG index page: show recent articles,\r\n',
@@ -364,15 +369,15 @@
             '        log.error("Error loading articles: "+str(x))\r\n',
             '        self.abort("cannot load articles")\r\n',
         ]
-        stream = StringIO.StringIO("".join(s).encode("utf7"))
-        assert "aborrt" not in stream.getvalue()
+        stream = io.BytesIO("".join(s).encode("utf7"))
+        assert b"aborrt" not in stream.getvalue()
         reader = codecs.getreader("utf7")(stream)
         for (i, line) in enumerate(reader):
             assert line == s[i]
 
     def test_array(self):
         import _codecs, array
-        _codecs.readbuffer_encode(array.array('c', 'spam')) == ('spam', 4)
+        _codecs.readbuffer_encode(array.array('b', b'spam')) == ('spam', 4)
 
     def test_utf8sig(self):
         import codecs
@@ -382,28 +387,28 @@
 
     def test_escape_decode_escaped_newline(self):
         import _codecs
-        s = '\\\n'
+        s = b'\\\n'
         decoded = _codecs.unicode_escape_decode(s)[0]
         assert decoded == ''
 
     def test_charmap_decode_1(self):
         import codecs
-        assert codecs.charmap_encode(u'xxx') == ('xxx', 3)
-        assert codecs.charmap_encode(u'xxx', 'strict', {ord('x'): 'XX'}) == ('XXXXXX', 3)
+        assert codecs.charmap_encode(u'xxx') == (b'xxx', 3)
+        assert codecs.charmap_encode(u'xxx', 'strict', {ord('x'): b'XX'}) == (b'XXXXXX', 3)
 
-        res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab")
+        res = codecs.charmap_decode(b"\x00\x01\x02", "replace", u"ab")
         assert res == (u"ab\ufffd", 3)
-        res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe")
+        res = codecs.charmap_decode(b"\x00\x01\x02", "replace", u"ab\ufffe")
         assert res == (u'ab\ufffd', 3)
 
     def test_decode_errors(self):
         import sys
         if sys.maxunicode > 0xffff:
             try:
-                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
+                b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
             except UnicodeDecodeError, ex:
                 assert "unicode_internal" == ex.encoding
-                assert "\x00\x00\x00\x00\x00\x11\x11\x00" == ex.object
+                assert b"\x00\x00\x00\x00\x00\x11\x11\x00" == ex.object
                 assert ex.start == 4
                 assert ex.end == 8
             else:
@@ -414,14 +419,14 @@
         assert codecs.replace_errors(UnicodeEncodeError(
             "ascii", u"\u3042", 0, 1, "ouch")) == (u"?", 1)
         assert codecs.replace_errors(UnicodeDecodeError(
-            "ascii", "\xff", 0, 1, "ouch")) == (u"\ufffd", 1)
+            "ascii", b"\xff", 0, 1, "ouch")) == (u"\ufffd", 1)
         assert codecs.replace_errors(UnicodeTranslateError(
             u"\u3042", 0, 1, "ouch")) == (u"\ufffd", 1)
 
         assert codecs.replace_errors(UnicodeEncodeError(
             "ascii", u"\u3042\u3042", 0, 2, "ouch")) == (u"??", 2)
         assert codecs.replace_errors(UnicodeDecodeError(
-            "ascii", "\xff\xff", 0, 2, "ouch")) == (u"\ufffd", 2)
+            "ascii", b"\xff\xff", 0, 2, "ouch")) == (u"\ufffd", 2)
         assert codecs.replace_errors(UnicodeTranslateError(
             u"\u3042\u3042", 0, 2, "ouch")) == (u"\ufffd\ufffd", 2)
 
@@ -439,13 +444,13 @@
         # A UnicodeDecodeError object without an end attribute
         class NoEndUnicodeDecodeError(UnicodeDecodeError):
             def __init__(self):
-                UnicodeDecodeError.__init__(self, "ascii", "", 0, 1, "bad")
+                UnicodeDecodeError.__init__(self, "ascii", b"", 0, 1, "bad")
                 del self.end
 
         # A UnicodeDecodeError object with a bad object attribute
         class BadObjectUnicodeDecodeError(UnicodeDecodeError):
             def __init__(self):
-                UnicodeDecodeError.__init__(self, "ascii", "", 0, 1, "bad")
+                UnicodeDecodeError.__init__(self, "ascii", b"", 0, 1, "bad")
                 self.object = []
 
         # A UnicodeTranslateError object without a start attribute
@@ -477,11 +482,11 @@
         # With the correct exception, "replace" returns an "?" or u"\ufffd" replacement
 
     def test_decode_ignore(self):
-        assert '\xff'.decode('utf-7', 'ignore') == ''
-        assert '\x00'.decode('unicode-internal', 'ignore') == ''
+        assert b'\xff'.decode('utf-7', 'ignore') == ''
+        assert b'\x00'.decode('unicode-internal', 'ignore') == ''
 
     def test_backslahreplace(self):
-        assert u'a\xac\u1234\u20ac\u8000'.encode('ascii', 'backslashreplace') == 'a\\xac\u1234\u20ac\u8000'
+        assert u'a\xac\u1234\u20ac\u8000'.encode('ascii', 'backslashreplace') == b'a\\xac\u1234\u20ac\u8000'
 
     def test_surrogateescape(self):
         assert b'a\x80b'.decode('utf-8', 'surrogateescape') == 'a\udc80b'
@@ -502,10 +507,10 @@
                     "test.badhandler"
                 )
             for (enc, bytes) in (
-                ("utf-8", "\xff"),
-                ("ascii", "\xff"),
-                ("utf-7", "+x-"),
-                ("unicode-internal", "\x00"),
+                ("utf-8", b"\xff"),
+                ("ascii", b"\xff"),
+                ("utf-7", b"+x-"),
+                ("unicode-internal", b"\x00"),
             ):
                 raises(
                     TypeError,
@@ -518,19 +523,19 @@
         import codecs
         import sys
         try:
-            '\x00'.decode('unicode-internal')
+            b'\x00'.decode('unicode-internal')
         except UnicodeDecodeError:
             pass
         else:
             raise Exception("DID NOT RAISE")
 
-        res = "\x00\x00\x00\x00\x00".decode("unicode-internal", "replace")
+        res = b"\x00\x00\x00\x00\x00".decode("unicode-internal", "replace")
         if sys.maxunicode > 65535:
             assert res == u"\u0000\ufffd"    # UCS4 build
         else:
             assert res == u"\x00\x00\ufffd"  # UCS2 build
 
-        res = "\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore")
+        res = b"\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore")
         if sys.maxunicode > 65535:
             assert res == u"\u0000"   # UCS4 build
         else:
@@ -541,7 +546,7 @@
                 raise TypeError("don't know how to handle %r" % exc)
             return (u"\x01", 1)
         codecs.register_error("test.hui", handler_unicodeinternal)
-        res = "\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui")
+        res = b"\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui")
         if sys.maxunicode > 65535:
             assert res == u"\u0000\u0001\u0000"   # UCS4 build
         else:
@@ -550,31 +555,31 @@
     def test_encode_error_bad_handler(self):
         import codecs
         codecs.register_error("test.bad_handler", lambda e: (repl, 1))
-        assert u"xyz".encode("latin-1", "test.bad_handler") == "xyz"
+        assert u"xyz".encode("latin-1", "test.bad_handler") == b"xyz"
         repl = u"\u1234"
         raises(UnicodeEncodeError, u"\u5678".encode, "latin-1",
                "test.bad_handler")
         repl = u"\u00E9"
         s = u"\u5678".encode("latin-1", "test.bad_handler")
-        assert s == '\xe9'
+        assert s == b'\xe9'
 
     def test_charmap_encode(self):
-        assert 'xxx'.encode('charmap') == 'xxx'
+        assert 'xxx'.encode('charmap') == b'xxx'
 
         import codecs
         raises(TypeError, codecs.charmap_encode, u'\xff', "replace",  {0xff: 300})
         raises(UnicodeError, codecs.charmap_encode, u"\xff", "replace", {0xff: None})
 
     def test_charmap_encode_replace(self):
-        charmap = dict([ (ord(c), 2*c.upper()) for c in "abcdefgh"])
-        charmap[ord("?")] = "XYZ"
+        charmap = dict([(c, bytes([c, c]).upper()) for c in b"abcdefgh"])
+        charmap[ord("?")] = b"XYZ"
         import codecs
         sin = u"abcDEF"
         sout = codecs.charmap_encode(sin, "replace", charmap)[0]
-        assert sout == "AABBCCXYZXYZXYZ"
+        assert sout == b"AABBCCXYZXYZXYZ"
 
     def test_charmap_decode_2(self):
-        assert 'foo'.decode('charmap') == 'foo'
+        assert b'foo'.decode('charmap') == 'foo'
 
     def test_charmap_build(self):
         import codecs
@@ -583,25 +588,25 @@
 
     def test_utf7_start_end_in_exception(self):
         try:
-            '+IC'.decode('utf-7')
+            b'+IC'.decode('utf-7')
         except UnicodeDecodeError, exc:
             assert exc.start == 0
             assert exc.end == 3
 
     def test_utf7_surrogate(self):
-        raises(UnicodeDecodeError, '+3ADYAA-'.decode, 'utf-7')
+        raises(UnicodeDecodeError, b'+3ADYAA-'.decode, 'utf-7')
 
     def test_utf_16_encode_decode(self):
         import codecs
         x = u'123abc'
-        assert codecs.getencoder('utf-16')(x) == ('\xff\xfe1\x002\x003\x00a\x00b\x00c\x00', 6)
-        assert codecs.getdecoder('utf-16')('\xff\xfe1\x002\x003\x00a\x00b\x00c\x00') == (x, 14)
+        assert codecs.getencoder('utf-16')(x) == (b'\xff\xfe1\x002\x003\x00a\x00b\x00c\x00', 6)
+        assert codecs.getdecoder('utf-16')(b'\xff\xfe1\x002\x003\x00a\x00b\x00c\x00') == (x, 14)
 
     def test_unicode_escape(self):        
-        assert u'\\'.encode('unicode-escape') == '\\\\'
-        assert '\\\\'.decode('unicode-escape') == u'\\'
-        assert u'\ud801'.encode('unicode-escape') == '\\ud801'
-        assert u'\u0013'.encode('unicode-escape') == '\\x13'
+        assert u'\\'.encode('unicode-escape') == b'\\\\'
+        assert b'\\\\'.decode('unicode-escape') == u'\\'
+        assert u'\ud801'.encode('unicode-escape') == b'\\ud801'
+        assert u'\u0013'.encode('unicode-escape') == b'\\x13'
 
     def test_mbcs(self):
         import sys
@@ -611,11 +616,3 @@
         assert u'caf\xe9'.encode('mbcs') == 'caf\xe9'
         assert u'\u040a'.encode('mbcs') == '?' # some cyrillic letter
         assert 'cafx\e9'.decode('mbcs') == u'cafx\e9'
-
-    def test_bad_handler_string_result(self):
-        import _codecs
-        def f(exc):
-            return ('foo', exc.end)
-        _codecs.register_error("test.test_codecs_not_a_string", f)
-        raises(TypeError, u'\u1234'.encode, 'ascii',
-               'test.test_codecs_not_a_string')
diff --git a/pypy/rlib/runicode.py b/pypy/rlib/runicode.py
--- a/pypy/rlib/runicode.py
+++ b/pypy/rlib/runicode.py
@@ -272,11 +272,11 @@
             # Encode UCS2 Unicode ordinals
             if ch < 0x10000:
                 # Special case: check for high surrogate
-                if 0xD800 <= ch <= 0xDBFF and pos != size:
+                if 0xD800 <= ch <= 0xDFFF and pos != size:
                     ch2 = ord(s[pos])
                     # Check for low surrogate and combine the two to
                     # form a UCS4 value
-                    if 0xDC00 <= ch2 <= 0xDFFF:
+                    if ch <= 0xDBFF and 0xDC00 <= ch2 <= 0xDFFF:
                         ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
                         pos += 1
                         _encodeUCS4(result, ch3)


More information about the pypy-commit mailing list