[pypy-commit] pypy stdlib-2.7.3: CPython Issue #13333: The UTF-7 decoder now accepts lone surrogates
amauryfa
noreply at buildbot.pypy.org
Wed Jun 13 08:10:21 CEST 2012
Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: stdlib-2.7.3
Changeset: r55637:e5e22d7d9ac3
Date: 2012-06-13 08:09 +0200
http://bitbucket.org/pypy/pypy/changeset/e5e22d7d9ac3/
Log: CPython Issue #13333: The UTF-7 decoder now accepts lone surrogates
(the encoder already accepts them).
diff --git a/pypy/rlib/runicode.py b/pypy/rlib/runicode.py
--- a/pypy/rlib/runicode.py
+++ b/pypy/rlib/runicode.py
@@ -753,22 +753,14 @@
UNICHR((((surrogate & 0x3FF)<<10) |
(outCh & 0x3FF)) + 0x10000))
surrogate = 0
+ continue
else:
+ result.append(unichr(surrogate))
surrogate = 0
- msg = "second surrogate missing"
- res, pos = errorhandler(errors, 'utf-7',
- msg, s, pos-1, pos)
- result.append(res)
- continue
- elif outCh >= 0xD800 and outCh <= 0xDBFF:
+ # Not done with outCh: fall through to the checks below
+ if outCh >= 0xD800 and outCh <= 0xDBFF:
# first surrogate
surrogate = outCh
- elif outCh >= 0xDC00 and outCh <= 0xDFFF:
- msg = "unexpected second surrogate"
- res, pos = errorhandler(errors, 'utf-7',
- msg, s, pos-1, pos)
- result.append(res)
- continue
else:
result.append(unichr(outCh))
@@ -778,11 +770,8 @@
pos += 1
if surrogate:
- msg = "second surrogate missing at end of shift sequence"
- res, pos = errorhandler(errors, 'utf-7',
- msg, s, pos-1, pos)
- result.append(res)
- continue
+ result.append(unichr(surrogate))
+ surrogate = 0
if base64bits > 0: # left-over bits
if base64bits >= 6:
diff --git a/pypy/rlib/test/test_runicode.py b/pypy/rlib/test/test_runicode.py
--- a/pypy/rlib/test/test_runicode.py
+++ b/pypy/rlib/test/test_runicode.py
@@ -209,12 +209,19 @@
u = u'\U000abcde'
assert encode(u, len(u), None) == '+2m/c3g-'
decode = self.getdecoder('utf-7')
- s = '+3ADYAA-'
- raises(UnicodeError, decode, s, len(s), None)
- def replace_handler(errors, codec, message, input, start, end):
- return u'?', end
- assert decode(s, len(s), None, final=True,
- errorhandler = replace_handler) == (u'??', len(s))
+
+ # Unpaired surrogates are passed through
+ assert encode(u'\uD801', 1, None) == '+2AE-'
+ assert encode(u'\uD801x', 2, None) == '+2AE-x'
+ assert encode(u'\uDC01', 1, None) == '+3AE-'
+ assert encode(u'\uDC01x', 2, None) == '+3AE-x'
+ assert decode('+2AE-', 5, None) == (u'\uD801', 5)
+ assert decode('+2AE-x', 6, None) == (u'\uD801x', 6)
+ assert decode('+3AE-', 5, None) == (u'\uDC01', 5)
+ assert decode('+3AE-x', 6, None) == (u'\uDC01x', 6)
+
+ assert encode(u'\uD801\U000abcde', 2, None) == '+2AHab9ze-'
+ assert decode('+2AHab9ze-', 10, None) == (u'\uD801\U000abcde', 10)
class TestUTF8Decoding(UnicodeTests):
More information about the pypy-commit mailing list