[pypy-commit] pypy stdlib-2.7.9: charmapdecode: pass all consecutive invalid chars to the error handler.
amauryfa
noreply at buildbot.pypy.org
Sat Feb 14 12:39:41 CET 2015
Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: stdlib-2.7.9
Changeset: r75864:3599c828e5f7
Date: 2015-02-14 11:39 +0100
http://bitbucket.org/pypy/pypy/changeset/3599c828e5f7/
Log: charmapdecode: pass all consecutive invalid chars to the error
handler. This also ensures that on narrow builds, surrogate pairs are
not split.
Should fix test_codeccallbacks on win32.
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -1114,9 +1114,13 @@
c = mapping.get(ch, '')
if len(c) == 0:
+ # collect all unencodable chars. Important for narrow builds.
+ collend = pos + 1
+ while collend < size and mapping.get(s[collend], '') == '':
+ collend += 1
ru, rs, pos = errorhandler(errors, "charmap",
"character maps to <undefined>",
- s, pos, pos + 1)
+ s, pos, collend)
if rs is not None:
# py3k only
result.append(rs)
diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -118,6 +118,17 @@
if addstuff:
assert result.endswith(u"some rest in ascii")
+ def test_charmap_encodeerror(self):
+ def errorhandler(errors, enc, msg, t, startingpos,
+ endingpos):
+ assert t[startingpos:endingpos] == u'\t\n \r'
+ return None, ' ', endingpos
+ s = u'aa\t\n \raa'
+ mapping = {u'a': 'a'}
+ r = runicode.unicode_encode_charmap(s, len(s), None, errorhandler,
+ mapping=mapping)
+ assert r == 'aa aa'
+
class TestDecoding(UnicodeTests):
# XXX test bom recognition in utf-16
More information about the pypy-commit
mailing list