[pypy-svn] r61084 - in pypy/trunk/pypy/module/_codecs: . test

Sun Jan 18 21:42:42 CET 2009

Author: fijal
Date: Sun Jan 18 21:42:40 2009
New Revision: 61084

Modified:
   pypy/trunk/pypy/module/_codecs/app_codecs.py
   pypy/trunk/pypy/module/_codecs/test/test_codecs.py
Log:
Some fight with codecs. More tests pass, but this is a very ugly place.
I'm going to make the tests pass and get out.


Modified: pypy/trunk/pypy/module/_codecs/app_codecs.py
==============================================================================

--- pypy/trunk/pypy/module/_codecs/app_codecs.py	(original)
+++ pypy/trunk/pypy/module/_codecs/app_codecs.py	Sun Jan 18 21:42:40 2009
@@ -1,4 +1,4 @@
-
+# NOT_RPYTHON
 # Note:
 # This *is* now explicitly RPython.
 # Please make sure not to break this.
@@ -118,8 +118,7 @@
         return res, len(res)
 
 def unicode_internal_decode( unistr, errors='strict'):
-    """None
-    """
+    import sys
     if type(unistr) == unicode:
         return unistr, len(unistr)
     else:
@@ -133,7 +132,13 @@
             start = 0
             stop = unicode_bytes
             step = 1
-        while i < len(unistr)-unicode_bytes+1:
+        while i < len(unistr):
+            if len(unistr) - i < unicode_bytes:
+                msg = 'truncated input'
+                next, _ = unicode_call_errorhandler(errors, 'unicode_internal', msg,
+                                                    unistr, i, i + unicode_bytes)
+                p += next
+                break
             t = 0
             h = 0
             for j in range(start, stop, step):
@@ -145,9 +150,10 @@
             except ValueError:
                 startpos = i - unicode_bytes
                 endpos = i
-                raise UnicodeDecodeError('unicode_internal', unistr, startpos,
-                                         endpos,
-                                         "unichr(%s) not in range" % (t,))
+                msg = "unichr(%s) not in range" % (t,)
+                next, _ = unicode_call_errorhandler(errors, 'unicode_internal', msg,
+                                                    unistr, startpos, endpos)
+                p += next
         res = u''.join(p)
         return res, len(unistr)
 
@@ -234,6 +240,14 @@
     res = ''.join(res)
     return res, len(res)
 
+def check_exception(exc):
+    try:
+        delta = exc.end - exc.start
+        if delta < 0 or not isinstance(exc.object, (unicode, str)):
+            raise TypeError("wrong exception")
+    except AttributeError:
+        raise TypeError("wrong exception")
+
 def strict_errors(exc):
     if isinstance(exc, Exception):
         raise exc
@@ -241,6 +255,7 @@
         raise TypeError("codec must pass exception instance")
     
 def ignore_errors(exc):
+    check_exception(exc)
     if isinstance(exc, UnicodeEncodeError):
         return u'', exc.end
     elif isinstance(exc, (UnicodeDecodeError, UnicodeTranslateError)):
@@ -251,6 +266,7 @@
 Py_UNICODE_REPLACEMENT_CHARACTER = u"\ufffd"
 
 def replace_errors(exc):
+    check_exception(exc)
     if isinstance(exc, UnicodeEncodeError):
         return u'?'*(exc.end-exc.start), exc.end
     elif isinstance(exc, (UnicodeTranslateError, UnicodeDecodeError)):
@@ -356,7 +372,7 @@
     return out, bits
 
 def PyUnicode_DecodeUTF7(s, size, errors):
-
+    from _codecs import lookup_error
     starts = s
     errmsg = ""
     inShift = 0
@@ -419,7 +435,8 @@
                     
                 elif SPECIAL(ch, 0, 0) :
                     msg = "unexpected special character"
-                    raise UnicodeDecodeError('utf-7', s, i-1, i, msg)
+                    out, _ = unicode_call_errorhandler(errors, 'utf-7', msg, s, i-1, i)
+                    p += out
                 else:  
                     p +=  ch 
             else:
@@ -440,7 +457,8 @@
         elif (SPECIAL(ch, 0, 0)):
             i += 1
             msg = "unexpected special character"
-            raise UnicodeDecodeError('utf-7', s, i-1, i, msg)
+            out, _ = unicode_call_errorhandler(errors, 'utf-7', msg, s, i-1, i)
+            p += out
         else:
             p +=  ch 
             i += 1
@@ -449,8 +467,8 @@
         #XXX This aint right
         endinpos = size
         msg = "unterminated shift sequence"
-        raise UnicodeDecodeError('utf-7', s, i-1, i, msg)
-        
+        out, _ = unicode_call_errorhandler(errors, 'utf-7', msg, s, i-1, i)
+        p += out
     return p
 
 def PyUnicode_EncodeUTF7(s, size, encodeSetO, encodeWhiteSpace, errors):
@@ -606,7 +624,7 @@
     else:
         exceptionObject = UnicodeEncodeError(encoding, input, startinpos, endinpos, reason)
     res = errorHandler(exceptionObject)
-    if isinstance(res, tuple) and isinstance(res[0], unicode) and isinstance(res[1], int):
+    if isinstance(res, tuple) and len(res) == 2 and isinstance(res[0], unicode) and isinstance(res[1], int):
         newpos = res[1]
         if (newpos < 0):
             newpos = len(input) + newpos

Modified: pypy/trunk/pypy/module/_codecs/test/test_codecs.py
==============================================================================
--- pypy/trunk/pypy/module/_codecs/test/test_codecs.py	(original)
+++ pypy/trunk/pypy/module/_codecs/test/test_codecs.py	Sun Jan 18 21:42:40 2009
@@ -357,6 +357,8 @@
         import codecs
         res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab")
         assert res == (u"ab\ufffd", 3)
+        res = codecs.charmap_decode("\x00\x01\x02", "replace", u"ab\ufffe")
+        assert res == (u'ab\ufffd', 3)
 
     def test_decode_errors(self):
         import sys
@@ -370,3 +372,110 @@
                 assert ex.end == 8
             else:
                 raise Exception("DID NOT RAISE")
+
+    def test_errors(self):
+        import codecs
+        assert (
+            codecs.replace_errors(UnicodeEncodeError("ascii", u"\u3042", 0, 1, "ouch"))) == (
+            (u"?", 1)
+        )
+        assert (
+            codecs.replace_errors(UnicodeDecodeError("ascii", "\xff", 0, 1, "ouch"))) == (
+            (u"\ufffd", 1)
+        )
+        assert (
+            codecs.replace_errors(UnicodeTranslateError(u"\u3042", 0, 1, "ouch"))) == (
+            (u"\ufffd", 1)
+        )
+        class BadStartUnicodeEncodeError(UnicodeEncodeError):
+            def __init__(self):
+                UnicodeEncodeError.__init__(self, "ascii", u"", 0, 1, "bad")
+                self.start = []
+
+        # A UnicodeEncodeError object with a bad object attribute
+        class BadObjectUnicodeEncodeError(UnicodeEncodeError):
+            def __init__(self):
+                UnicodeEncodeError.__init__(self, "ascii", u"", 0, 1, "bad")
+                self.object = []
+
+        # A UnicodeDecodeError object without an end attribute
+        class NoEndUnicodeDecodeError(UnicodeDecodeError):
+            def __init__(self):
+                UnicodeDecodeError.__init__(self, "ascii", "", 0, 1, "bad")
+                del self.end
+
+        # A UnicodeDecodeError object with a bad object attribute
+        class BadObjectUnicodeDecodeError(UnicodeDecodeError):
+            def __init__(self):
+                UnicodeDecodeError.__init__(self, "ascii", "", 0, 1, "bad")
+                self.object = []
+
+        # A UnicodeTranslateError object without a start attribute
+        class NoStartUnicodeTranslateError(UnicodeTranslateError):
+            def __init__(self):
+                UnicodeTranslateError.__init__(self, u"", 0, 1, "bad")
+                del self.start
+
+        # A UnicodeTranslateError object without an end attribute
+        class NoEndUnicodeTranslateError(UnicodeTranslateError):
+            def __init__(self):
+                UnicodeTranslateError.__init__(self,  u"", 0, 1, "bad")
+                del self.end
+
+        # A UnicodeTranslateError object without an object attribute
+        class NoObjectUnicodeTranslateError(UnicodeTranslateError):
+            def __init__(self):
+                UnicodeTranslateError.__init__(self, u"", 0, 1, "bad")
+                del self.object
+
+        import codecs
+        raises(TypeError, codecs.replace_errors, BadObjectUnicodeEncodeError())
+        raises(TypeError, codecs.replace_errors, 42)
+        # "replace" complains about the wrong exception type
+        raises(TypeError, codecs.replace_errors, UnicodeError("ouch"))
+        raises(TypeError, codecs.replace_errors, BadObjectUnicodeEncodeError())
+        raises(TypeError, codecs.replace_errors, BadObjectUnicodeDecodeError()
+        )
+        # With the correct exception, "replace" returns an "?" or u"\ufffd" replacement
+
+    def test_decode_ignore(self):
+        assert '\xff'.decode('utf-7', 'ignore') == ''
+        assert '\x00'.decode('unicode-internal', 'ignore') == ''
+
+    def test_badhandler(self):
+        import codecs
+        results = ( 42, u"foo", (1,2,3), (u"foo", 1, 3), (u"foo", None), (u"foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
+        encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
+
+        for res in results:
+            codecs.register_error("test.badhandler", lambda x: res)
+            for enc in encs:
+                raises(
+                    TypeError,
+                    u"\u3042".encode,
+                    enc,
+                    "test.badhandler"
+                )
+            for (enc, bytes) in (
+                ("utf-8", "\xff"),
+                ("ascii", "\xff"),
+                ("utf-7", "+x-"),
+                ("unicode-internal", "\x00"),
+            ):
+                raises(
+                    TypeError,
+                    bytes.decode,
+                    enc,
+                    "test.badhandler"
+                )
+
+    def test_unicode_internal(self):
+        try:
+            '\x00'.decode('unicode-internal')
+        except UnicodeDecodeError:
+            pass
+        else:
+            raise Exception("DID NOT RAISE")
+
+        res = "\x00\x00\x00\x00\x00".decode("unicode-internal", "replace")
+        assert res == u"\u0000\ufffd"