[pypy-svn] r13082 - in pypy/dist/pypy/lib: . test2

Sun Jun 5 22:59:25 CEST 2005

Author: ale
Date: Sun Jun  5 22:59:23 2005
New Revision: 13082

Removed:
   pypy/dist/pypy/lib/test2/test_codeccallbacks.py
Modified:
   pypy/dist/pypy/lib/inprogress__codecs.py
   pypy/dist/pypy/lib/unicodecodec.py
Log:
test_unicode.py passes now. No known failures

Modified: pypy/dist/pypy/lib/inprogress__codecs.py
==============================================================================

--- pypy/dist/pypy/lib/inprogress__codecs.py	(original)
+++ pypy/dist/pypy/lib/inprogress__codecs.py	Sun Jun  5 22:59:23 2005
@@ -114,7 +114,7 @@
     if isinstance(encoding,str):
         decoder = lookup(encoding)[1]
         if decoder and isinstance(errors,str):
-            res = decoder(v,errors)
+            res = decoder(obj,errors)
             if not isinstance(res,tuple) or len(res) != 2:
                 raise TypeError("encoder must return a tuple (object,integer)")
             return res[0]
@@ -227,7 +227,8 @@
         res = ''.join(p)
         return res, len(res)
     else:
-        return ''.join(PyUnicode_FromUnicode(obj,size),size)
+        res = "You can do better than this" # XXX make this right
+        return res,len(res)
 
 def unicode_internal_decode( unistr,errors='strict'):
     """None
@@ -241,7 +242,7 @@
             t = 0
             for j in range(unicode_bytes):
                 t += ord(unistr[i+j])<<(j*8)
-            i += j+1
+            i += unicode_bytes
             p += unichr(t)
         res = u''.join(p)
         return res, len(res)
@@ -283,11 +284,12 @@
 def mbcs_encode( obj,errors='strict'):
     """None
     """
-    return (PyUnicode_EncodeMBCS(
-			       PyUnicode_AS_UNICODE(obj), 
-			       PyUnicode_GET_SIZE(obj),
-			       errors),
-		    PyUnicode_GET_SIZE(obj));
+    pass
+##    return (PyUnicode_EncodeMBCS(
+##			       (obj), 
+##			       len(obj),
+##			       errors),
+##		    len(obj))
     
 
 def ascii_encode( obj,errors='strict'):

Deleted: /pypy/dist/pypy/lib/test2/test_codeccallbacks.py
==============================================================================
--- /pypy/dist/pypy/lib/test2/test_codeccallbacks.py	Sun Jun  5 22:59:23 2005
+++ (empty file)
@@ -1,724 +0,0 @@
-import py
-py.test.skip("this test module doesn't belong here") 
-import autopath
-import test.test_support, unittest
-import sys,  htmlentitydefs, unicodedata
-sys.path.insert(1,r'd:\projects\pypy_co\pypy\lib')
-from pypy.lib import codecs
-sys.modules['codecs'] = codecs
-from pypy.lib import encodings
-reload(encodings)
-reload(codecs)
-assert codecs == encodings.codecs
-sys.modules['encodings'] = encodings
-
-class PosReturn:
-    # this can be used for configurable callbacks
-
-    def __init__(self):
-        self.pos = 0
-
-    def handle(self, exc):
-        oldpos = self.pos
-        realpos = oldpos
-        if realpos<0:
-            realpos = len(exc.object) + realpos
-        # if we don't advance this time, terminate on the next call
-        # otherwise we'd get an endless loop
-        if realpos <= exc.start:
-            self.pos = len(exc.object)
-        return (u"<?>", oldpos)
-
-class CodecCallbackTest(unittest.TestCase):
-
-    def test_xmlcharrefreplace(self):
-        # replace unencodable characters which numeric character entities.
-        # For ascii, latin-1 and charmaps this is completely implemented
-        # in C and should be reasonably fast.
-        s = u"\u30b9\u30d1\u30e2 \xe4nd eggs"
-        self.assertEqual(
-            s.encode("ascii", "xmlcharrefreplace"),
-            "&#12473;&#12497;&#12514; &#228;nd eggs"
-        )
-        self.assertEqual(
-            s.encode("latin-1", "xmlcharrefreplace"),
-            "&#12473;&#12497;&#12514; \xe4nd eggs"
-        )
-
-    def test_xmlcharnamereplace(self):
-        # This time use a named character entity for unencodable
-        # characters, if one is available.
-
-        def xmlcharnamereplace(exc):
-            if not isinstance(exc, UnicodeEncodeError):
-                raise TypeError("don't know how to handle %r" % exc)
-            l = []
-            for c in exc.object[exc.start:exc.end]:
-                try:
-                    l.append(u"&%s;" % htmlentitydefs.codepoint2name[ord(c)])
-                except KeyError:
-                    l.append(u"&#%d;" % ord(c))
-            return (u"".join(l), exc.end)
-
-        codecs.register_error(
-            "test.xmlcharnamereplace", xmlcharnamereplace)
-
-        sin = u"\xab\u211c\xbb = \u2329\u1234\u20ac\u232a"
-        sout = "&laquo;&real;&raquo; = &lang;&#4660;&euro;&rang;"
-        self.assertEqual(codecs.encode(sin,"ascii", "test.xmlcharnamereplace"), sout)
-        sout = "\xab&real;\xbb = &lang;&#4660;&euro;&rang;"
-        self.assertEqual(codecs.encode(sin,"latin-1", "test.xmlcharnamereplace"), sout)
-        sout = "\xab&real;\xbb = &lang;&#4660;\xa4&rang;"
-        self.assertEqual(codecs.encode(sin,"iso-8859-15", "test.xmlcharnamereplace"), sout)
-
-    def test_uninamereplace(self):
-        # We're using the names from the unicode database this time,
-        # and we're doing "syntax highlighting" here, i.e. we include
-        # the replaced text in ANSI escape sequences. For this it is
-        # useful that the error handler is not called for every single
-        # unencodable character, but for a complete sequence of
-        # unencodable characters, otherwise we would output many
-        # unneccessary escape sequences.
-
-        def uninamereplace(exc):
-            if not isinstance(exc, UnicodeEncodeError):
-                raise TypeError("don't know how to handle %r" % exc)
-            l = []
-            for c in exc.object[exc.start:exc.end]:
-                l.append(unicodedata.name(c, u"0x%x" % ord(c)))
-            return (u"\033[1m%s\033[0m" % u", ".join(l), exc.end)
-
-        codecs.register_error(
-            "test.uninamereplace", uninamereplace)
-
-        sin = u"\xac\u1234\u20ac\u8000"
-        sout = "\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
-        self.assertEqual(codecs.encode(sin,"ascii", "test.uninamereplace"), sout)
-
-        sout = "\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
-        tout = codecs.encode(sin,"latin-1", "test.uninamereplace")
-        self.assertEqual(tout, sout)
-        sout = "\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1mCJK UNIFIED IDEOGRAPH-8000\033[0m"
-        self.assertEqual(codecs.encode(sin,"iso-8859-15", "test.uninamereplace"), sout)
-
-    def test_backslashescape(self):
-        # Does the same as the "unicode-escape" encoding, but with different
-        # base encodings.
-        sin = u"a\xac\u1234\u20ac\u8000"
-        if sys.maxunicode > 0xffff:
-            sin += unichr(sys.maxunicode)
-        sout = "a\\xac\\u1234\\u20ac\\u8000"
-        if sys.maxunicode > 0xffff:
-            sout += "\\U%08x" % sys.maxunicode
-        self.assertEqual(codecs.encode(sin,"ascii", "backslashreplace"), sout)
-
-        sout = "a\xac\\u1234\\u20ac\\u8000"
-        if sys.maxunicode > 0xffff:
-            sout += "\\U%08x" % sys.maxunicode
-        self.assertEqual(codecs.encode(sin,"latin-1", "backslashreplace"), sout)
-
-        sout = "a\xac\\u1234\xa4\\u8000"
-        if sys.maxunicode > 0xffff:
-            sout += "\\U%08x" % sys.maxunicode
-        self.assertEqual(codecs.encode(sin,"iso-8859-15", "backslashreplace"), sout)
-
-    def test_relaxedutf8(self):
-        # This is the test for a decoding callback handler,
-        # that relaxes the UTF-8 minimal encoding restriction.
-        # A null byte that is encoded as "\xc0\x80" will be
-        # decoded as a null byte. All other illegal sequences
-        # will be handled strictly.
-        def relaxedutf8(exc):
-            if not isinstance(exc, UnicodeDecodeError):
-                raise TypeError("don't know how to handle %r" % exc)
-            if exc.object[exc.start:exc.end].startswith("\xc0\x80"):
-                return (u"\x00", exc.start+2) # retry after two bytes
-            else:
-                raise exc
-
-        codecs.register_error(
-            "test.relaxedutf8", relaxedutf8)
-
-        sin = "a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80"
-        sout = u"a\x00b\x00c\xfc\x00\x00"
-        self.assertEqual(codecs.decode(sin,"utf-8", "test.relaxedutf8"), sout)
-        sin = "\xc0\x80\xc0\x81"
-        self.assertRaises(UnicodeError, codecs.decode,sin, "utf-8", "test.relaxedutf8")
-
-    def test_charmapencode(self):
-        # For charmap encodings the replacement string will be
-        # mapped through the encoding again. This means, that
-        # to be able to use e.g. the "replace" handler, the
-        # charmap has to have a mapping for "?".
-        charmap = dict([ (ord(c), 2*c.upper()) for c in "abcdefgh"])
-        sin = u"abc"
-        sout = "AABBCC"
-        self.assertEquals(codecs.charmap_encode(sin, "strict", charmap)[0], sout)
-
-        sin = u"abcA"
-        self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap)
-
-        charmap[ord("?")] = "XYZ"
-        sin = u"abcDEF"
-        sout = "AABBCCXYZXYZXYZ"
-        self.assertEquals(codecs.charmap_encode(sin, "replace", charmap)[0], sout)
-
-        charmap[ord("?")] = u"XYZ"
-        self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
-
-        charmap[ord("?")] = u"XYZ"
-        self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
-
-    def test_callbacks(self):
-        def handler1(exc):
-            if not isinstance(exc, UnicodeEncodeError) \
-               and not isinstance(exc, UnicodeDecodeError):
-                raise TypeError("don't know how to handle %r" % exc)
-            l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
-            return (u"[%s]" % u"".join(l), exc.end)
-
-        codecs.register_error("test.handler1", handler1)
-
-        def handler2(exc):
-            if not isinstance(exc, UnicodeDecodeError):
-                raise TypeError("don't know how to handle %r" % exc)
-            l = [u"<%d>" % ord(exc.object[pos]) for pos in xrange(exc.start, exc.end)]
-            return (u"[%s]" % u"".join(l), exc.end+1) # skip one character
-
-        codecs.register_error("test.handler2", handler2)
-
-        s = "\x00\x81\x7f\x80\xff"
-
-        self.assertEqual(
-            codecs.decode(s,"ascii", "test.handler1"),
-            u"\x00[<129>]\x7f[<128>][<255>]"
-        )
-        self.assertEqual(
-            codecs.decode(s,"ascii", "test.handler2"),
-            u"\x00[<129>][<128>]"
-        )
-
-        self.assertEqual(
-            codecs.decode("\\u3042\u3xxx","unicode-escape", "test.handler1"),
-            u"\u3042[<92><117><51><120>]xx"
-        )
-
-        self.assertEqual(
-            codecs.decode("\\u3042\u3xx","unicode-escape", "test.handler1"),
-            u"\u3042[<92><117><51><120><120>]"
-        )
-
-        self.assertEqual(
-            codecs.charmap_decode("abc", "test.handler1", {ord("a"): u"z"})[0],
-            u"z[<98>][<99>]"
-        )
-
-        self.assertEqual(
-            codecs.encode(u"g\xfc\xdfrk","ascii", "test.handler1"),
-            u"g[<252><223>]rk"
-        )
-
-        self.assertEqual(
-            codecs.encode(u"g\xfc\xdf","ascii", "test.handler1"),
-            u"g[<252><223>]"
-        )
-
-    def test_longstrings(self):
-        # test long strings to check for memory overflow problems
-        errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", "backslashreplace"]
-        # register the handlers under different names,
-        # to prevent the codec from recognizing the name
-        for err in errors:
-            codecs.register_error("test." + err, codecs.lookup_error(err))
-        l = 1000
-        errors += [ "test." + err for err in errors ]
-        for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]:
-            for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", "utf-8", "utf-7", "utf-16"):
-                for err in errors:
-                    try:
-                        codecs.encode(uni,enc, err)
-                    except UnicodeError:
-                        pass
-
-    def check_exceptionobjectargs(self, exctype, args, msg):
-        # Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion
-        # check with one missing argument
-        self.assertRaises(TypeError, exctype, *args[:-1])
-        # check with one argument too much
-        self.assertRaises(TypeError, exctype, *(args + ["too much"]))
-        # check with one argument of the wrong type
-        wrongargs = [ "spam", u"eggs", 42, 1.0, None ]
-        for i in xrange(len(args)):
-            for wrongarg in wrongargs:
-                if type(wrongarg) is type(args[i]):
-                    continue
-                # build argument array
-                callargs = []
-                for j in xrange(len(args)):
-                    if i==j:
-                        callargs.append(wrongarg)
-                    else:
-                        callargs.append(args[i])
-                self.assertRaises(TypeError, exctype, *callargs)
-
-        # check with the correct number and type of arguments
-        exc = exctype(*args)
-        self.assertEquals(str(exc), msg)
-
-##    def test_unicodeencodeerror(self):
-##        self.check_exceptionobjectargs(
-##            UnicodeEncodeError,
-##            ["ascii", u"g\xfcrk", 1, 2, "ouch"],
-##            "'ascii' codec can't encode character u'\\xfc' in position 1: ouch"
-##        )
-##        self.check_exceptionobjectargs(
-##            UnicodeEncodeError,
-##            ["ascii", u"g\xfcrk", 1, 4, "ouch"],
-##            "'ascii' codec can't encode characters in position 1-3: ouch"
-##        )
-##        self.check_exceptionobjectargs(
-##            UnicodeEncodeError,
-##            ["ascii", u"\xfcx", 0, 1, "ouch"],
-##            "'ascii' codec can't encode character u'\\xfc' in position 0: ouch"
-##        )
-##        self.check_exceptionobjectargs(
-##            UnicodeEncodeError,
-##            ["ascii", u"\u0100x", 0, 1, "ouch"],
-##            "'ascii' codec can't encode character u'\\u0100' in position 0: ouch"
-##        )
-##        self.check_exceptionobjectargs(
-##            UnicodeEncodeError,
-##            ["ascii", u"\uffffx", 0, 1, "ouch"],
-##            "'ascii' codec can't encode character u'\\uffff' in position 0: ouch"
-##        )
-##        if sys.maxunicode > 0xffff:
-##            self.check_exceptionobjectargs(
-##                UnicodeEncodeError,
-##                ["ascii", u"\U00010000x", 0, 1, "ouch"],
-##                "'ascii' codec can't encode character u'\\U00010000' in position 0: ouch"
-##            )
-##
-##    def test_unicodedecodeerror(self):
-##        self.check_exceptionobjectargs(
-##            UnicodeDecodeError,
-##            ["ascii", "g\xfcrk", 1, 2, "ouch"],
-##            "'ascii' codec can't decode byte 0xfc in position 1: ouch"
-##        )
-##        self.check_exceptionobjectargs(
-##            UnicodeDecodeError,
-##            ["ascii", "g\xfcrk", 1, 3, "ouch"],
-##            "'ascii' codec can't decode bytes in position 1-2: ouch"
-##        )
-##
-##    def test_unicodetranslateerror(self):
-##        self.check_exceptionobjectargs(
-##            UnicodeTranslateError,
-##            [u"g\xfcrk", 1, 2, "ouch"],
-##            "can't translate character u'\\xfc' in position 1: ouch"
-##        )
-##        self.check_exceptionobjectargs(
-##            UnicodeTranslateError,
-##            [u"g\u0100rk", 1, 2, "ouch"],
-##            "can't translate character u'\\u0100' in position 1: ouch"
-##        )
-##        self.check_exceptionobjectargs(
-##            UnicodeTranslateError,
-##            [u"g\uffffrk", 1, 2, "ouch"],
-##            "can't translate character u'\\uffff' in position 1: ouch"
-##        )
-##        if sys.maxunicode > 0xffff:
-##            self.check_exceptionobjectargs(
-##                UnicodeTranslateError,
-##                [u"g\U00010000rk", 1, 2, "ouch"],
-##                "can't translate character u'\\U00010000' in position 1: ouch"
-##            )
-##        self.check_exceptionobjectargs(
-##            UnicodeTranslateError,
-##            [u"g\xfcrk", 1, 3, "ouch"],
-##            "can't translate characters in position 1-2: ouch"
-##        )
-
-    def test_badandgoodstrictexceptions(self):
-        # "strict" complains about a non-exception passed in
-        self.assertRaises(
-            TypeError,
-            codecs.strict_errors,
-            42
-        )
-        # "strict" complains about the wrong exception type
-        self.assertRaises(
-            Exception,
-            codecs.strict_errors,
-            Exception("ouch")
-        )
-
-        # If the correct exception is passed in, "strict" raises it
-        self.assertRaises(
-            UnicodeEncodeError,
-            codecs.strict_errors,
-            UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")
-        )
-
-    def test_badandgoodignoreexceptions(self):
-        # "ignore" complains about a non-exception passed in
-        self.assertRaises(
-           TypeError,
-           codecs.ignore_errors,
-           42
-        )
-        # "ignore" complains about the wrong exception type
-        self.assertRaises(
-           TypeError,
-           codecs.ignore_errors,
-           UnicodeError("ouch")
-        )
-        # If the correct exception is passed in, "ignore" returns an empty replacement
-        self.assertEquals(
-            codecs.ignore_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
-            (u"", 1)
-        )
-        self.assertEquals(
-            codecs.ignore_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")),
-            (u"", 1)
-        )
-        self.assertEquals(
-            codecs.ignore_errors(UnicodeTranslateError(u"\u3042", 0, 1, "ouch")),
-            (u"", 1)
-        )
-
-    def test_badandgoodreplaceexceptions(self):
-        # "replace" complains about a non-exception passed in
-        self.assertRaises(
-           TypeError,
-           codecs.replace_errors,
-           42
-        )
-        # "replace" complains about the wrong exception type
-        self.assertRaises(
-           TypeError,
-           codecs.replace_errors,
-           UnicodeError("ouch")
-        )
-        # With the correct exception, "ignore" returns an empty replacement
-        self.assertEquals(
-            codecs.replace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
-            (u"?", 1)
-        )
-        self.assertEquals(
-            codecs.replace_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")),
-            (u"\ufffd", 1)
-        )
-        self.assertEquals(
-            codecs.replace_errors(UnicodeTranslateError(u"\u3042", 0, 1, "ouch")),
-            (u"\ufffd", 1)
-        )
-
-    def test_badandgoodxmlcharrefreplaceexceptions(self):
-        # "xmlcharrefreplace" complains about a non-exception passed in
-        self.assertRaises(
-           TypeError,
-           codecs.xmlcharrefreplace_errors,
-           42
-        )
-        # "xmlcharrefreplace" complains about the wrong exception types
-        self.assertRaises(
-           TypeError,
-           codecs.xmlcharrefreplace_errors,
-           UnicodeError("ouch")
-        )
-        # "xmlcharrefreplace" can only be used for encoding
-        self.assertRaises(
-            TypeError,
-            codecs.xmlcharrefreplace_errors,
-            UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
-        )
-        self.assertRaises(
-            TypeError,
-            codecs.xmlcharrefreplace_errors,
-            UnicodeTranslateError(u"\u3042", 0, 1, "ouch")
-        )
-        # Use the correct exception
-        self.assertEquals(
-            codecs.xmlcharrefreplace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
-            (u"&#%d;" % 0x3042, 1)
-        )
-
-    def test_badandgoodbackslashreplaceexceptions(self):
-        # "backslashreplace" complains about a non-exception passed in
-        self.assertRaises(
-           TypeError,
-           codecs.backslashreplace_errors,
-           42
-        )
-        # "backslashreplace" complains about the wrong exception types
-        self.assertRaises(
-           TypeError,
-           codecs.backslashreplace_errors,
-           UnicodeError("ouch")
-        )
-        # "backslashreplace" can only be used for encoding
-        self.assertRaises(
-            TypeError,
-            codecs.backslashreplace_errors,
-            UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch")
-        )
-        self.assertRaises(
-            TypeError,
-            codecs.backslashreplace_errors,
-            UnicodeTranslateError(u"\u3042", 0, 1, "ouch")
-        )
-        # Use the correct exception
-        self.assertEquals(
-            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch")),
-            (u"\\u3042", 1)
-        )
-        self.assertEquals(
-            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\x00", 0, 1, "ouch")),
-            (u"\\x00", 1)
-        )
-        self.assertEquals(
-            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\xff", 0, 1, "ouch")),
-            (u"\\xff", 1)
-        )
-        self.assertEquals(
-            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\u0100", 0, 1, "ouch")),
-            (u"\\u0100", 1)
-        )
-        self.assertEquals(
-            codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\uffff", 0, 1, "ouch")),
-            (u"\\uffff", 1)
-        )
-        if sys.maxunicode>0xffff:
-            self.assertEquals(
-                codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\U00010000", 0, 1, "ouch")),
-                (u"\\U00010000", 1)
-            )
-            self.assertEquals(
-                codecs.backslashreplace_errors(UnicodeEncodeError("ascii", u"\U0010ffff", 0, 1, "ouch")),
-                (u"\\U0010ffff", 1)
-            )
-
-    def test_badhandlerresults(self):
-        results = ( 42, u"foo", (1,2,3), (u"foo", 1, 3), (u"foo", None), (u"foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
-        encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
-
-        for res in results:
-            codecs.register_error("test.badhandler", lambda: res)
-            for enc in encs:
-                self.assertRaises(
-                    TypeError,
-                    codecs.encode,
-                    u"\u3042",
-                    enc,
-                    "test.badhandler"
-                )
-            for (enc, bytes) in (
-                ("ascii", "\xff"),
-                ("utf-8", "\xff"),
-                ("utf-7", "+x-")
-            ):
-                self.assertRaises(
-                    TypeError,
-                    codecs.decode,
-                    bytes,
-                    enc,
-                    "test.badhandler"
-                )
-
-    def test_lookup(self):
-        self.assertEquals(codecs.strict_errors, codecs.lookup_error("strict"))
-        self.assertEquals(codecs.ignore_errors, codecs.lookup_error("ignore"))
-        self.assertEquals(codecs.strict_errors, codecs.lookup_error("strict"))
-        self.assertEquals(
-            codecs.xmlcharrefreplace_errors,
-            codecs.lookup_error("xmlcharrefreplace")
-        )
-        self.assertEquals(
-            codecs.backslashreplace_errors,
-            codecs.lookup_error("backslashreplace")
-        )
-
-    def test_unencodablereplacement(self):
-        def unencrepl(exc):
-            if isinstance(exc, UnicodeEncodeError):
-                return (u"\u4242", exc.end)
-            else:
-                raise TypeError("don't know how to handle %r" % exc)
-        codecs.register_error("test.unencreplhandler", unencrepl)
-        for enc in ("ascii", "iso-8859-1", "iso-8859-15"):
-            self.assertRaises(
-                UnicodeEncodeError,
-                codecs.encode,
-                u"\u4242",
-                enc,
-                "test.unencreplhandler"
-            )
-
-    def test_badregistercall(self):
-        # enhance coverage of:
-        # Modules/_codecsmodule.c::register_error()
-        # Python/codecs.c::PyCodec_RegisterError()
-        self.assertRaises(TypeError, codecs.register_error, 42)
-        self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42)
-
-    def test_unknownhandler(self):
-        # enhance coverage of:
-        # Modules/_codecsmodule.c::lookup_error()
-        self.assertRaises(LookupError, codecs.lookup_error, "test.unknown")
-
-    def test_xmlcharrefvalues(self):
-        # enhance coverage of:
-        # Python/codecs.c::PyCodec_XMLCharRefReplaceErrors()
-        # and inline implementations
-        v = (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000)
-        if sys.maxunicode>=100000:
-            v += (100000, 500000, 1000000)
-        s = u"".join([unichr(x) for x in v])
-        codecs.register_error("test.xmlcharrefreplace", codecs.xmlcharrefreplace_errors)
-        for enc in ("ascii", "iso-8859-15"):
-            for err in ("xmlcharrefreplace", "test.xmlcharrefreplace"):
-                codecs.encode(s,enc, err)
-
-    def test_decodehelper(self):
-        # enhance coverage of:
-        # Objects/unicodeobject.c::unicode_decode_call_errorhandler()
-        # and callers
-        self.assertRaises(LookupError, codecs.decode,"\xff", "ascii", "test.unknown")
-
-        def baddecodereturn1(exc):
-            return 42
-        codecs.register_error("test.baddecodereturn1", baddecodereturn1)
-        self.assertRaises(TypeError, codecs.decode, "\xff", "ascii", "test.baddecodereturn1")
-        self.assertRaises(TypeError, codecs.decode, "\\", "unicode-escape", "test.baddecodereturn1")
-        self.assertRaises(TypeError, codecs.decode, "\\x0", "unicode-escape", "test.baddecodereturn1")
-        self.assertRaises(TypeError, codecs.decode, "\\x0y", "unicode-escape", "test.baddecodereturn1")
-        self.assertRaises(TypeError, codecs.decode, "\\Uffffeeee", "unicode-escape", "test.baddecodereturn1")
-        self.assertRaises(TypeError, codecs.decode, "\\uyyyy", "raw-unicode-escape", "test.baddecodereturn1")
-
-        def baddecodereturn2(exc):
-            return (u"?", None)
-        codecs.register_error("test.baddecodereturn2", baddecodereturn2)
-        self.assertRaises(TypeError, codecs.decode, "\xff", "ascii", "test.baddecodereturn2")
-
-        handler = PosReturn()
-        codecs.register_error("test.posreturn", handler.handle)
-
-        # Valid negative position
-        handler.pos = -1
-        self.assertEquals(codecs.decode( "\xff0","ascii", "test.posreturn"), u"<?>0")
-
-        # Valid negative position
-        handler.pos = -2
-        self.assertEquals(codecs.decode("\xff0","ascii", "test.posreturn"), u"<?><?>")
-
-        # Negative position out of bounds
-        handler.pos = -3
-        self.assertRaises(IndexError, codecs.decode,"\xff0", "ascii", "test.posreturn")
-
-        # Valid positive position
-        handler.pos = 1
-        self.assertEquals(codecs.decode("\xff0","ascii", "test.posreturn"), u"<?>0")
-
-        # Largest valid positive position (one beyond end of input
-        handler.pos = 2
-        self.assertEquals(codecs.decode("\xff0","ascii", "test.posreturn"), u"<?>")
-
-        # Invalid positive position
-        handler.pos = 3
-        self.assertRaises(IndexError, codecs.decode,"\xff0", "ascii", "test.posreturn")
-
-        # Restart at the "0"
-        handler.pos = 6
-        self.assertEquals(codecs.decode("\\uyyyy0","raw-unicode-escape", "test.posreturn"), u"<?>0")
-
-        class D(dict):
-            def __getitem__(self, key):
-                raise ValueError
-        self.assertRaises(UnicodeError, codecs.charmap_decode, "\xff", "strict", {0xff: None})
-        self.assertRaises(ValueError, codecs.charmap_decode, "\xff", "strict", D())
-        self.assertRaises(TypeError, codecs.charmap_decode, "\xff", "strict", {0xff: sys.maxunicode+1})
-
-    def test_encodehelper(self):
-        # enhance coverage of:
-        # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
-        # and callers
-        
-        self.assertRaises(LookupError, codecs.decode,u"\xff", "ascii", "test.unknown")
-
-        def badencodereturn1(exc):
-            return 42
-        codecs.register_error("test.badencodereturn1", badencodereturn1)
-        self.assertRaises(TypeError, codecs.decode, u"\xff", "ascii", "test.badencodereturn1")
-
-        def badencodereturn2(exc):
-            return (u"?", None)
-        codecs.register_error("test.badencodereturn2", badencodereturn2)
-        self.assertRaises(TypeError, codecs.decode,u"\xff", "ascii", "test.badencodereturn2")
-
-        handler = PosReturn()
-        codecs.register_error("test.posreturn", handler.handle)
-
-        # Valid negative position
-        handler.pos = -1
-        self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?>0")
-
-        # Valid negative position
-        handler.pos = -2
-        self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?><?>")
-
-        # Negative position out of bounds
-        handler.pos = -3
-        self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn")
-
-        # Valid positive position
-        handler.pos = 1
-        self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?>0")
-
-        # Largest valid positive position (one beyond end of input
-        handler.pos = 2
-        self.assertEquals(u"\xff0".encode("ascii", "test.posreturn"), "<?>")
-
-        # Invalid positive position
-        handler.pos = 3
-        self.assertRaises(IndexError, u"\xff0".encode, "ascii", "test.posreturn")
-
-        handler.pos = 0
-
-        class D(dict):
-            def __getitem__(self, key):
-                raise ValueError
-        for err in ("strict", "replace", "xmlcharrefreplace", "backslashreplace", "test.posreturn"):
-            self.assertRaises(UnicodeError, codecs.charmap_encode, u"\xff", err, {0xff: None})
-            self.assertRaises(ValueError, codecs.charmap_encode, u"\xff", err, D())
-            self.assertRaises(TypeError, codecs.charmap_encode, u"\xff", err, {0xff: 300})
-
-    def test_translatehelper(self):
-        # enhance coverage of:
-        # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
-        # and callers
-        # (Unfortunately the errors argument is not directly accessible
-        # from Python, so we can't test that much)
-        class D(dict):
-            def __getitem__(self, key):
-                raise ValueError
-        self.assertRaises(ValueError, u"\xff".translate, D())
-        self.assertRaises(TypeError, u"\xff".translate, {0xff: sys.maxunicode+1})
-        self.assertRaises(TypeError, u"\xff".translate, {0xff: ()})
-
-    def test_bug828737(self):
-        charmap = {
-            ord("&"): u"&amp;",
-            ord("<"): u"&lt;",
-            ord(">"): u"&gt;",
-            ord('"'): u"&quot;",
-        }
-        
-        for n in (1, 10, 100, 1000):
-            text = u'abc<def>ghi'*n
-            text.translate(charmap)
-
-def test_main():
-    test.test_support.run_unittest(CodecCallbackTest)
-
-if __name__ == "__main__":
-    test_main()

Modified: pypy/dist/pypy/lib/unicodecodec.py
==============================================================================
--- pypy/dist/pypy/lib/unicodecodec.py	(original)
+++ pypy/dist/pypy/lib/unicodecodec.py	Sun Jun  5 22:59:23 2005
@@ -83,7 +83,7 @@
 ##        charvalue += ord(c)
     while (bits >= 6):
         out +=  B64(ch >> (bits-6))
-        bits -= 6; 
+        bits -= 6 
     return out,bits
 
 
@@ -250,21 +250,21 @@
 
 unicode_empty=u''
 
-def PyUnicode_Decode(s,size,encoding,errors):
-
-    if (encoding == None):
-        encoding = PyUnicode_GetDefaultEncoding()
-
-##    /* Shortcuts for common default encodings */
-    decoder = encodings.get(encoding,None)
-    if decoder:
-        return decoder(s,encoding,errors)
-##    /* Decode via the codec registry */
-    buf = buffer(s)
-    result = PyCodec_Decode(buf, encoding, errors);
-    if (not isinstance(result,unicode)):
-        raise UnicodeDecodeError, "decoder did not return an unicode object (type=%.400s)"%type(result)
-    return result
+##def PyUnicode_Decode(s,size,encoding,errors):
+##
+##    if (encoding == None):
+##        encoding = PyUnicode_GetDefaultEncoding()
+##
+####    /* Shortcuts for common default encodings */
+##    decoder = encodings.get(encoding,None)
+##    if decoder:
+##        return decoder(s,encoding,errors)
+####    /* Decode via the codec registry */
+##    buf = buffer(s)
+##    result = PyCodec_Decode(buf, encoding, errors)
+##    if (not isinstance(result,unicode)):
+##        raise UnicodeDecodeError, "decoder did not return an unicode object (type=%.400s)"%type(result)
+##    return result
 
 def unicodeescape_string(s, size, quotes):
 
@@ -336,7 +336,7 @@
             p += '%02x'%ord(ch)
         #/* Copy everything else as-is */
         else:
-            p += ch
+            p += chr(ord(ch))
         pos += 1
     if (quotes):
         p += p[1]
@@ -439,7 +439,7 @@
     	if (len(s)-q<2):
     	    if (consumed):
                 break
-    	    errmsg = "truncated data";
+    	    errmsg = "truncated data"
     	    startinpos = q
     	    endinpos = len(s)
     	    unicode_call_errorhandler()
@@ -455,7 +455,7 @@
     
 	#/* UTF-16 code pair: */
         if (q >= len(s)):
-            errmsg = "unexpected end of data";
+            errmsg = "unexpected end of data"
             startinpos = q-2
             endinpos = len(s)
             unicode_call_errorhandler
@@ -474,12 +474,12 @@
                 continue
 
             else:
-    	        errmsg = "illegal UTF-16 surrogate";
+    	        errmsg = "illegal UTF-16 surrogate"
                 startinpos = q-4
                 endinpos = startinpos+2
                 unicode_call_errorhandler
     	   
-	errmsg = "illegal encoding";
+	errmsg = "illegal encoding"
 	startinpos = q-2
 	endinpos = startinpos+2
 	unicode_call_errorhandler
@@ -570,7 +570,7 @@
 
 def PyUnicode_DecodeUTF8(s, size, errors):
 
-    return PyUnicode_DecodeUTF8Stateful(s, size, errors, None);
+    return PyUnicode_DecodeUTF8Stateful(s, size, errors, None)
 
 ##    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
 ##       illegal prefix.  see RFC 2279 for details */
@@ -732,7 +732,7 @@
                     else:
 ##                /*  compute and append the two surrogates: */
 ##                /*  translate from 10000..10FFFF to 0..FFFF */
-                        c -= 0x10000;
+                        c -= 0x10000
             #/*  high surrogate = top 10 bits added to D800 */
                         p += unichr(0xD800 + (c >> 10))
             #/*  low surrogate = bottom 10 bits added to DC00 */
@@ -741,7 +741,7 @@
         else:
 ##        default:
 ##            /* Other sizes are only needed for UCS-4 */
-            errmsg = "unsupported Unicode code range";
+            errmsg = "unsupported Unicode code range"
     	    startinpos = pos
     	    endinpos = startinpos+n
     	    res = unicode_call_errorhandler(
@@ -910,7 +910,7 @@
                 errmessage = "\\ at end of string"
                 unicode_call_errorhandler(errors,"unicodeescape",errmessage,s,pos-1,size)
             ch = s[pos]
-    
+            pos += 1
     ##        /* \x escapes */
             #if ch == '\n': break;
             if ch == '\\': p += '\\'
@@ -920,50 +920,52 @@
             elif ch == 'f': p += '\014' #/* FF */
             elif ch == 't': p += '\t' 
             elif ch == 'n': p += '\n'
-            elif ch == 'r': p += '\r' 
+            elif ch == 'r': 
+                p += '\r' 
+                
             elif ch == 'v': p += '\013' #break; /* VT */
             elif ch == 'a': p += '\007' # break; /* BEL, not classic C */
     
     ##        /* \OOO (octal) escapes */
             elif ch in [ '0','1', '2', '3','4', '5', '6','7']:
                 x = ord(ch) - ord('0')
-                ch = s[pos+1]
+                ch = s[pos]
                 if ('0' <= ch and ch <= '7'):
                     x = (x<<3) + ord(ch) - ord('0')
-                    ch = s[pos+2]
+                    ch = s[pos+1]
                     if ('0' <= ch and ch <= '7'):
                         x = (x<<3) + ord(ch) - ord('0')
-                        pos += 3
+                        pos += 2
     
                 p += unichr(x)
     ##        /* hex escapes */
     ##        /* \xXX */
             elif ch == 'x':
-                digits = 2;
-                message = "truncated \\xXX escape";
-                x = hexescape(s,pos+1,digits,message,errors)
+                digits = 2
+                message = "truncated \\xXX escape"
+                x = hexescape(s,pos,digits,message,errors)
                 p += x[0]
                 pos = x[1]
     
          #   /* \uXXXX */
             elif ch == 'u':
-                digits = 4;
-                message = "truncated \\uXXXX escape";
-                x = hexescape(s,pos+1,digits,message,errors)
+                digits = 4
+                message = "truncated \\uXXXX escape"
+                x = hexescape(s,pos,digits,message,errors)
                 p += x[0]
                 pos = x[1]
     
           #  /* \UXXXXXXXX */
             elif ch == 'U':
                 digits = 8
-                message = "truncated \\UXXXXXXXX escape";
-                x = hexescape(s,pos+1,digits,message,errors)
+                message = "truncated \\UXXXXXXXX escape"
+                x = hexescape(s,pos,digits,message,errors)
                 p += x[0]
                 pos = x[1]
 ##        /* \N{name} */
             elif ch == 'N':
                 message = "malformed \\N character escape"
-                pos += 1
+                #pos += 1
                 try:
                     import unicodedata
                 except ImportError:
@@ -1138,7 +1140,7 @@
         bs = pos
         while pos < size:
             if (s[pos] != '\\'):
-                break;
+                break
             p += unichr(ord(s[pos]))
             pos += 1