[pypy-svn] r48609 - in pypy/branch/more-unicode-improvements/pypy/rlib: . test
cfbolz at codespeak.net
cfbolz at codespeak.net
Mon Nov 12 19:50:56 CET 2007
Author: cfbolz
Date: Mon Nov 12 19:50:55 2007
New Revision: 48609
Modified:
pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
Log:
grr. The error handling of PyPy's utf-16 decoder is broken. Fix the RPython
version at least.
Modified: pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
==============================================================================
--- pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py (original)
+++ pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py Mon Nov 12 19:50:55 2007
@@ -195,24 +195,24 @@
# byte order setting accordingly. In native mode, the leading BOM
# mark is skipped, in all other modes, it is copied to the output
# stream as-is (giving a ZWNBSP character).
- q = 0
- p = []
+ pos = 0
+ result = []
if byteorder == 'native':
if (size >= 2):
bom = (ord(s[ihi]) << 8) | ord(s[ilo])
if sys.byteorder == 'little':
if (bom == 0xFEFF):
- q += 2
+ pos += 2
bo = -1
elif bom == 0xFFFE:
- q += 2
+ pos += 2
bo = 1
else:
if bom == 0xFEFF:
- q += 2
+ pos += 2
bo = 1
elif bom == 0xFFFE:
- q += 2
+ pos += 2
bo = -1
elif byteorder == 'little':
bo = -1
@@ -231,45 +231,48 @@
ilo = 1
#XXX I think the errors are not correctly handled here
- while (q < len(s)):
+ while (pos < len(s)):
# remaining bytes at the end? (size should be even)
- if len(s) - q < 2:
+ if len(s) - pos < 2:
if not final:
break
- errmsg = "truncated data"
- startinpos = q
- endinpos = len(s)
- errorhandler(errors, 'utf-16', "truncated data",
- s, startinpos, endinpos, True)
- # CPython ignores the remaining input chars if the callback
- # chooses to skip the input. XXX is this sensible?
- ch = (ord(s[q + ihi]) << 8) | ord(s[q + ilo])
- q += 2
+ r, pos = errorhandler(errors, 'utf16', "truncated data",
+ s, pos, len(s), True)
+ result.append(r)
+ if len(s) - pos < 2:
+ break
+ ch = (ord(s[pos + ihi]) << 8) | ord(s[pos + ilo])
+ pos += 2
if (ch < 0xD800 or ch > 0xDFFF):
- p += unichr(ch)
+ result += unichr(ch)
continue
# UTF-16 code pair:
- if (q >= len(s)):
+ if len(s) - pos < 2:
+ if not final:
+ break
errmsg = "unexpected end of data"
- errorhandler(errors, 'utf-16', errmsg, s, q - 2, len(s))
+ r, pos = errorhandler(errors, 'utf16', errmsg, s, pos - 2, len(s))
+ result.append(r)
+ if len(s) - pos < 2:
+ break
elif (0xD800 <= ch and ch <= 0xDBFF):
- ch2 = (ord(s[q+ihi]) << 8) | ord(s[q+ilo])
- q += 2
+ ch2 = (ord(s[pos+ihi]) << 8) | ord(s[pos+ilo])
+ pos += 2
if (0xDC00 <= ch2 and ch2 <= 0xDFFF):
if MAXUNICODE < 65536:
- p += unichr(ch)
- p += unichr(ch2)
+ result += unichr(ch)
+ result += unichr(ch2)
else:
- p += unichr((((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000)
+ result += unichr((((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000)
continue
else:
- errmsg = "illegal UTF-16 surrogate"
- errorhandler(errors, 'utf-16', errmsg, s, q - 4, q - 2)
- errmsg = "illegal encoding"
- startinpos = q-2
- endinpos = startinpos+2
- errorhandler(errors, 'utf-16', errmsg, s, startinpos, endinpos, True)
- return u"".join(p), q, bo
+ r, pos = errorhandler(errors, 'utf16',
+ "illegal UTF-16 surrogate",
+ s, pos - 4, pos - 2)
+ result.append(r)
+ else:
+ assert 0, "unreachable"
+ return u"".join(result), pos, bo
def str_decode_latin1(s, size, errors, final=False,
errorhandler=raise_unicode_exception):
Modified: pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
==============================================================================
--- pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py (original)
+++ pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py Mon Nov 12 19:50:55 2007
@@ -118,6 +118,23 @@
def test_ascii_error(self):
self.checkdecodeerror("abc\xFF\xFF\xFFcde", "ascii", 3, 4)
+ def test_utf16_errors(self):
+ # truncated BOM
+ for s in ["\xff", "\xfe"]:
+ self.checkdecodeerror(s, "utf16", 0, len(s), addstuff=False)
+
+ for s in [
+ # unexpected end of data ascii
+ "\xff\xfeF",
+ # unexpected end of data
+ '\xff\xfe\xc0\xdb\x00', '\xff\xfe\xc0\xdb', '\xff\xfe\xc0',
+ ]:
+ self.checkdecodeerror(s, "utf16", 2, len(s), addstuff=False)
+ for s in [
+ # illegal surrogate
+ "\xff\xfe\xff\xdb\xff\xff",
+ ]:
+ self.checkdecodeerror(s, "utf16", 2, 4, addstuff=False)
class TestEncoding(UnicodeTests):
def test_all_ascii(self):
More information about the Pypy-commit
mailing list