[pypy-commit] pypy stdlib-2.7.8: (alex, dreid) Completely fix the utf7 decoder this time

Thu Aug 28 18:48:09 CEST 2014

Author: Alex Gaynor <alex.gaynor at gmail.com>
Branch: stdlib-2.7.8
Changeset: r73111:687dd5c34b84
Date: 2014-08-28 09:47 -0700
http://bitbucket.org/pypy/pypy/changeset/687dd5c34b84/

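Log:	(alex, dreid) Completely fix the utf7 decoder this time: when non-final
	input ends inside a base64 shift sequence, truncate the output at the
	output length recorded when the shift began and report the input
	position of the opening '+' as consumed, instead of slicing the output
	by an input position.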

diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -874,13 +874,14 @@
                     result.append(unichr(ord(ch)))
 
         elif ch == '+':
+            startingpos = pos
             pos += 1 # consume '+'
             if pos < size and s[pos] == '-': # '+-' encodes '+'
                 pos += 1
                 result.append(u'+')
             else: # begin base64-encoded section
                 inShift = 1
-                shiftOutStartPos = pos - 1
+                shiftOutStartPos = result.getlength()
                 base64bits = 0
                 base64buffer = 0
 
@@ -888,13 +889,14 @@
             result.append(unichr(oc))
             pos += 1
         else:
+            startingpos = pos
             pos += 1
             msg = "unexpected special character"
             res, pos = errorhandler(errors, 'utf7', msg, s, pos-1, pos)
             result.append(res)
 
     # end of string
-
+    final_length = result.getlength()
     if inShift and final: # in shift sequence, no more to follow
         # if we're in an inconsistent state, that's an error
         if (surrogate or
@@ -904,10 +906,11 @@
             res, pos = errorhandler(errors, 'utf7', msg, s, shiftOutStartPos, pos)
             result.append(res)
     elif inShift:
-        pos = shiftOutStartPos # back off output
+        pos = startingpos
+        final_length = shiftOutStartPos # back off output
 
-    assert pos >= 0
-    return result.build()[:pos], pos
+    assert final_length >= 0
+    return result.build()[:final_length], pos
 
 def unicode_encode_utf_7(s, size, errors, errorhandler=None):
     if size == 0:
diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -232,6 +232,7 @@
         assert decode(s, 5, None) == (u'a+-b', 5)
 
         assert decode((27 * u"\u3042" + "\n").encode('utf7')[:28], 28, None) == (u'', 0)
+        assert decode('+MEI\n+MEIwQjBCMEIwQjBCMEIwQjBCMEIwQjBCMEIwQjBCMEIwQjBCMEIwQjBCMEIwQjBCME', 72, None) == (u'\u3042\n', 5)
 
     def test_utf7_surrogates(self):
         encode = self.getencoder('utf-7')