[pypy-svn] r48609 - in pypy/branch/more-unicode-improvements/pypy/rlib: . test

Mon Nov 12 19:50:56 CET 2007

Author: cfbolz
Date: Mon Nov 12 19:50:55 2007
New Revision: 48609

Modified:
   pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
   pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
Log:
grr. The error handling of PyPy's utf-16 decoder is broken. Fix the RPython
version at least.


Modified: pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py
==============================================================================

--- pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py	(original)
+++ pypy/branch/more-unicode-improvements/pypy/rlib/runicode.py	Mon Nov 12 19:50:55 2007
@@ -195,24 +195,24 @@
     #  byte order setting accordingly. In native mode, the leading BOM
     #  mark is skipped, in all other modes, it is copied to the output
     #  stream as-is (giving a ZWNBSP character).
-    q = 0
-    p = []
+    pos = 0
+    result = []
     if byteorder == 'native':
         if (size >= 2):
             bom = (ord(s[ihi]) << 8) | ord(s[ilo])
             if sys.byteorder == 'little':
                 if (bom == 0xFEFF):
-                    q += 2
+                    pos += 2
                     bo = -1
                 elif bom == 0xFFFE:
-                    q += 2
+                    pos += 2
                     bo = 1
             else:
                 if bom == 0xFEFF:
-                    q += 2
+                    pos += 2
                     bo = 1
                 elif bom == 0xFFFE:
-                    q += 2
+                    pos += 2
                     bo = -1
     elif byteorder == 'little':
         bo = -1
@@ -231,45 +231,48 @@
         ilo = 1
 
     #XXX I think the errors are not correctly handled here
-    while (q < len(s)):
+    while (pos < len(s)):
         # remaining bytes at the end? (size should be even)
-        if len(s) - q < 2:
+        if len(s) - pos < 2:
             if not final:
                 break
-            errmsg = "truncated data"
-            startinpos = q
-            endinpos = len(s)
-            errorhandler(errors, 'utf-16', "truncated data",
-                         s, startinpos, endinpos, True)
-            # CPython ignores the remaining input chars if the callback
-            # chooses to skip the input. XXX is this sensible?
-        ch = (ord(s[q + ihi]) << 8) | ord(s[q + ilo])
-        q += 2
+            r, pos = errorhandler(errors, 'utf16', "truncated data",
+                                s, pos, len(s), True)
+            result.append(r)
+            if len(s) - pos < 2:
+                break
+        ch = (ord(s[pos + ihi]) << 8) | ord(s[pos + ilo])
+        pos += 2
         if (ch < 0xD800 or ch > 0xDFFF):
-            p += unichr(ch)
+            result += unichr(ch)
             continue
         # UTF-16 code pair:
-        if (q >= len(s)):
+        if len(s) - pos < 2:
+            if not final:
+                break
             errmsg = "unexpected end of data"
-            errorhandler(errors, 'utf-16', errmsg, s, q - 2, len(s))
+            r, pos = errorhandler(errors, 'utf16', errmsg, s, pos - 2, len(s))
+            result.append(r)
+            if len(s) - pos < 2:
+                break
         elif (0xD800 <= ch and ch <= 0xDBFF):
-            ch2 = (ord(s[q+ihi]) << 8) | ord(s[q+ilo])
-            q += 2
+            ch2 = (ord(s[pos+ihi]) << 8) | ord(s[pos+ilo])
+            pos += 2
             if (0xDC00 <= ch2 and ch2 <= 0xDFFF):
                 if MAXUNICODE < 65536:
-                    p += unichr(ch)
-                    p += unichr(ch2)
+                    result += unichr(ch)
+                    result += unichr(ch2)
                 else:
-                    p += unichr((((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000)
+                    result += unichr((((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000)
                 continue
             else:
-                errmsg = "illegal UTF-16 surrogate"
-                errorhandler(errors, 'utf-16', errmsg, s, q - 4, q - 2)
-        errmsg = "illegal encoding"
-        startinpos = q-2
-        endinpos = startinpos+2
-        errorhandler(errors, 'utf-16', errmsg, s, startinpos, endinpos, True)
-    return u"".join(p), q, bo
+                r, pos = errorhandler(errors, 'utf16',
+                                      "illegal UTF-16 surrogate",
+                                      s, pos - 4, pos - 2)
+                result.append(r)
+        else:
+            assert 0, "unreachable"
+    return u"".join(result), pos, bo
 
 def str_decode_latin1(s, size, errors, final=False,
                       errorhandler=raise_unicode_exception):

Modified: pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py
==============================================================================
--- pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py	(original)
+++ pypy/branch/more-unicode-improvements/pypy/rlib/test/test_runicode.py	Mon Nov 12 19:50:55 2007
@@ -118,6 +118,23 @@
     def test_ascii_error(self):
         self.checkdecodeerror("abc\xFF\xFF\xFFcde", "ascii", 3, 4)
 
+    def test_utf16_errors(self):
+        # truncated BOM
+        for s in ["\xff", "\xfe"]:
+            self.checkdecodeerror(s, "utf16", 0, len(s), addstuff=False)
+
+        for s in [
+                  # unexpected end of data (single trailing ASCII byte)
+                  "\xff\xfeF",
+                  # unexpected end of data
+                  '\xff\xfe\xc0\xdb\x00', '\xff\xfe\xc0\xdb', '\xff\xfe\xc0', 
+                  ]:
+            self.checkdecodeerror(s, "utf16", 2, len(s), addstuff=False)
+        for s in [
+                  # illegal surrogate
+                  "\xff\xfe\xff\xdb\xff\xff",
+                  ]:
+            self.checkdecodeerror(s, "utf16", 2, 4, addstuff=False)
 
 class TestEncoding(UnicodeTests):
     def test_all_ascii(self):