[pypy-commit] pypy stdlib-2.7.6: fix utf-7 decoder (cpython issue19279)

bdkearns noreply at buildbot.pypy.org
Sun Mar 2 08:19:45 CET 2014


Author: Brian Kearns <bdkearns at gmail.com>
Branch: stdlib-2.7.6
Changeset: r69600:5a23ee926e6c
Date: 2014-03-02 02:19 -0500
http://bitbucket.org/pypy/pypy/changeset/5a23ee926e6c/

Log:	fix utf-7 decoder (cpython issue19279)

diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -591,6 +591,30 @@
     def test_utf7_surrogate(self):
         assert '+3ADYAA-'.decode('utf-7') == u'\udc00\ud800'
 
+    def test_utf7_errors(self):
+        import codecs
+        tests = [
+            ('a\xffb', u'a\ufffdb'),
+            ('a+IK', u'a\ufffd'),
+            ('a+IK-b', u'a\ufffdb'),
+            ('a+IK,b', u'a\ufffdb'),
+            ('a+IKx', u'a\u20ac\ufffd'),
+            ('a+IKx-b', u'a\u20ac\ufffdb'),
+            ('a+IKwgr', u'a\u20ac\ufffd'),
+            ('a+IKwgr-b', u'a\u20ac\ufffdb'),
+            ('a+IKwgr,', u'a\u20ac\ufffd'),
+            ('a+IKwgr,-b', u'a\u20ac\ufffd-b'),
+            ('a+IKwgrB', u'a\u20ac\u20ac\ufffd'),
+            ('a+IKwgrB-b', u'a\u20ac\u20ac\ufffdb'),
+            ('a+/,+IKw-b', u'a\ufffd\u20acb'),
+            ('a+//,+IKw-b', u'a\ufffd\u20acb'),
+            ('a+///,+IKw-b', u'a\uffff\ufffd\u20acb'),
+            ('a+////,+IKw-b', u'a\uffff\ufffd\u20acb'),
+        ]
+        for raw, expected in tests:
+            raises(UnicodeDecodeError, codecs.utf_7_decode, raw, 'strict', True)
+            assert raw.decode('utf-7', 'replace') == expected
+
     def test_utf_16_encode_decode(self):
         import codecs, sys
         x = u'123abc'
@@ -605,7 +629,7 @@
             assert codecs.getdecoder('utf-16')(
                     '\xff\xfe1\x002\x003\x00a\x00b\x00c\x00') == (x, 14)
 
-    def test_unicode_escape(self):        
+    def test_unicode_escape(self):
         assert u'\\'.encode('unicode-escape') == '\\\\'
         assert '\\\\'.decode('unicode-escape') == u'\\'
         assert u'\ud801'.encode('unicode-escape') == '\\ud801'
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -812,6 +812,7 @@
                     outCh = base64buffer >> (base64bits - 16)
                     base64bits -= 16
                     base64buffer &= (1 << base64bits) - 1 # clear high bits
+                    assert outCh <= 0xffff
                     if surrogate:
                         # expecting a second surrogate
                         if outCh >= 0xDC00 and outCh <= 0xDFFFF:
@@ -877,6 +878,8 @@
             else: # begin base64-encoded section
                 inShift = 1
                 shiftOutStartPos = pos - 1
+                base64bits = 0
+                base64buffer = 0
 
         elif _utf7_DECODE_DIRECT(oc): # character decodes at itself
             result.append(unichr(oc))


More information about the pypy-commit mailing list