[pypy-commit] pypy stdlib-2.7.9: charmapdecode: pass all consecutive invalid chars to the error handler.

amauryfa noreply at buildbot.pypy.org
Sat Feb 14 12:39:41 CET 2015


Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: stdlib-2.7.9
Changeset: r75864:3599c828e5f7
Date: 2015-02-14 11:39 +0100
http://bitbucket.org/pypy/pypy/changeset/3599c828e5f7/

Log:	charmapdecode: pass all consecutive invalid chars to the error
	handler. This also ensures that on narrow builds, surrogate pairs are
	not split.

	Should fix test_codeccallbacks on win32.

diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -1114,9 +1114,13 @@
 
         c = mapping.get(ch, '')
         if len(c) == 0:
+            # collect all unencodable chars. Important for narrow builds.
+            collend = pos + 1
+            while collend < size and mapping.get(s[collend], '') == '':
+                collend += 1
             ru, rs, pos = errorhandler(errors, "charmap",
                                        "character maps to <undefined>",
-                                       s, pos, pos + 1)
+                                       s, pos, collend)
             if rs is not None:
                 # py3k only
                 result.append(rs)
diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -118,6 +118,17 @@
         if addstuff:
             assert result.endswith(u"some rest in ascii")
 
+    def test_charmap_encodeerror(self):
+        def errorhandler(errors, enc, msg, t, startingpos,
+                         endingpos):
+            assert t[startingpos:endingpos] == u'\t\n  \r'
+            return None, ' ', endingpos
+        s = u'aa\t\n  \raa'
+        mapping = {u'a': 'a'}
+        r = runicode.unicode_encode_charmap(s, len(s), None, errorhandler,
+                                            mapping=mapping)
+        assert r == 'aa aa'
+
 
 class TestDecoding(UnicodeTests):
     # XXX test bom recognition in utf-16


More information about the pypy-commit mailing list