[pypy-svn] pypy default: CPython Issue #8271: during the decoding of an invalid UTF-8 byte sequence,

Tue Feb 8 18:33:01 CET 2011

Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: 
Changeset: r41706:fd91ba2df7bb
Date: 2011-02-08 17:13 +0100
http://bitbucket.org/pypy/pypy/changeset/fd91ba2df7bb/

Log:	CPython Issue #8271: during the decoding of an invalid UTF-8 byte
	sequence, only the start byte and the continuation byte(s) are now
	considered invalid, instead of the number of bytes specified by the
	start byte.

diff --git a/pypy/rlib/test/test_runicode.py b/pypy/rlib/test/test_runicode.py
--- a/pypy/rlib/test/test_runicode.py
+++ b/pypy/rlib/test/test_runicode.py
@@ -133,20 +133,126 @@
         for s in [# unexpected end of data
                   "\xd7", "\xd6", "\xeb\x96", "\xf0\x90\x91"]:
             self.checkdecodeerror(s, "utf-8", 0, len(s), addstuff=False)
-            
-        for s in [# unexpected code byte
-                  "\x81", "\xbf",
-                  # invalid data 2 byte
-                  "\xd7\x50", "\xd6\x06", "\xd6\xD6",
-                  # invalid data 3 byte
-                  "\xeb\x56\x95", "\xeb\x06\x95", "\xeb\xD6\x95",
-                  "\xeb\x96\x55", "\xeb\x96\x05", "\xeb\x96\xD5",
-                  # invalid data 4 byte
-                  "\xf0\x50\x91\x93", "\xf0\x00\x91\x93", "\xf0\xd0\x91\x93", 
-                  "\xf0\x90\x51\x93", "\xf0\x90\x01\x93", "\xf0\x90\xd1\x93", 
-                  "\xf0\x90\x91\x53", "\xf0\x90\x91\x03", "\xf0\x90\x91\xd3", 
-                  ]:
-            self.checkdecodeerror(s, "utf-8", 0, len(s), addstuff=True)
+
+        # unexpected code byte
+        for s in ["\x81", "\xbf"]:
+            self.checkdecodeerror(s, "utf-8", 0, 1, addstuff=True)
+
+        # invalid data 2 byte
+        for s in ["\xd7\x50", "\xd6\x06", "\xd6\xD6"]:
+            self.checkdecodeerror(s, "utf-8", 0, 1, addstuff=True)
+        # invalid data 3 byte
+        for s in ["\xeb\x56\x95", "\xeb\x06\x95", "\xeb\xD6\x95"]:
+            self.checkdecodeerror(s, "utf-8", 0, 1, addstuff=True)
+        for s in ["\xeb\x96\x55", "\xeb\x96\x05", "\xeb\x96\xD5"]:
+            self.checkdecodeerror(s, "utf-8", 0, 2, addstuff=True)
+        # invalid data 4 byte
+        for s in ["\xf0\x50\x91\x93", "\xf0\x00\x91\x93", "\xf0\xd0\x91\x93"]:
+            self.checkdecodeerror(s, "utf-8", 0, 1, addstuff=True)
+        for s in ["\xf0\x90\x51\x93", "\xf0\x90\x01\x93", "\xf0\x90\xd1\x93"]:
+            self.checkdecodeerror(s, "utf-8", 0, 2, addstuff=True)
+        for s in ["\xf0\x90\x91\x53", "\xf0\x90\x91\x03", "\xf0\x90\x91\xd3"]:
+            self.checkdecodeerror(s, "utf-8", 0, 3, addstuff=True)
+
+
+    def test_issue8271(self):
+        # From CPython
+        # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
+        # only the start byte and the continuation byte(s) are now considered
+        # invalid, instead of the number of bytes specified by the start byte.
+        # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
+        # table 3-8, Row 2) for more information about the algorithm used.
+        FFFD = u'\ufffd'
+        sequences = [
+            # invalid start bytes
+            ('\x80', FFFD), # continuation byte
+            ('\x80\x80', FFFD*2), # 2 continuation bytes
+            ('\xc0', FFFD),
+            ('\xc0\xc0', FFFD*2),
+            ('\xc1', FFFD),
+            ('\xc1\xc0', FFFD*2),
+            ('\xc0\xc1', FFFD*2),
+            # with start byte of a 2-byte sequence
+            ('\xc2', FFFD), # only the start byte
+            ('\xc2\xc2', FFFD*2), # 2 start bytes
+            ('\xc2\xc2\xc2', FFFD*3), # 3 start bytes
+            ('\xc2\x41', FFFD+'A'), # invalid continuation byte
+            # with start byte of a 3-byte sequence
+            ('\xe1', FFFD), # only the start byte
+            ('\xe1\xe1', FFFD*2), # 2 start bytes
+            ('\xe1\xe1\xe1', FFFD*3), # 3 start bytes
+            ('\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
+            ('\xe1\x80', FFFD), # only 1 continuation byte
+            ('\xe1\x41', FFFD+'A'), # invalid continuation byte
+            ('\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
+            ('\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
+            ('\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
+            ('\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
+            ('\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
+            # with start byte of a 4-byte sequence
+            ('\xf1', FFFD), # only the start byte
+            ('\xf1\xf1', FFFD*2), # 2 start bytes
+            ('\xf1\xf1\xf1', FFFD*3), # 3 start bytes
+            ('\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
+            ('\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
+            ('\xf1\x80', FFFD), # only 1 continuation byte
+            ('\xf1\x80\x80', FFFD), # only 2 continuation bytes
+            ('\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
+            ('\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 2 invalid
+            ('\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
+            ('\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cb and 1 valid
+            ('\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 valid
+            ('\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 valid
+            ('\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 2 invalid cb and 1 valid
+            ('\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
+            ('\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
+            ('\xf1\xf1\x80\x41', FFFD*2+'A'),
+            ('\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
+            # with invalid start byte of a 4-byte sequence (rfc2279)
+            ('\xf5', FFFD), # only the start byte
+            ('\xf5\xf5', FFFD*2), # 2 start bytes
+            ('\xf5\x80', FFFD*2), # only 1 continuation byte
+            ('\xf5\x80\x80', FFFD*3), # only 2 continuation bytes
+            ('\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
+            ('\xf5\x80\x41', FFFD*2+'A'), #  1 valid cb and 1 invalid
+            ('\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
+            ('\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
+            # with invalid start byte of a 5-byte sequence (rfc2279)
+            ('\xf8', FFFD), # only the start byte
+            ('\xf8\xf8', FFFD*2), # 2 start bytes
+            ('\xf8\x80', FFFD*2), # only one continuation byte
+            ('\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
+            ('\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
+            # with invalid start byte of a 6-byte sequence (rfc2279)
+            ('\xfc', FFFD), # only the start byte
+            ('\xfc\xfc', FFFD*2), # 2 start bytes
+            ('\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
+            ('\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
+            # invalid start byte
+            ('\xfe', FFFD),
+            ('\xfe\x80\x80', FFFD*3),
+            # other sequences
+            ('\xf1\x80\x41\x42\x43', u'\ufffd\x41\x42\x43'),
+            ('\xf1\x80\xff\x42\x43', u'\ufffd\ufffd\x42\x43'),
+            ('\xf1\x80\xc2\x81\x43', u'\ufffd\x81\x43'),
+            ('\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
+             u'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
+        ]
+        def replace_handler(errors, codec, message, input, start, end):
+            return FFFD, end
+        def ignore_handler(errors, codec, message, input, start, end):
+            return u'', end
+        for n, (seq, res) in enumerate(sequences):
+            decoder = self.getdecoder('utf-8')
+            raises(UnicodeDecodeError, decoder, seq, len(seq), None, final=True)
+            assert decoder(seq, len(seq), None, final=True,
+                           errorhandler=replace_handler) == (res, len(seq))
+            assert decoder(seq + 'b', len(seq) + 1, None, final=True,
+                           errorhandler=replace_handler) == (res + u'b',
+                                                             len(seq) + 1)
+            res = res.replace(FFFD, u'')
+            assert decoder(seq, len(seq), None, final=True,
+                           errorhandler=ignore_handler) == (res, len(seq))
 
     def test_ascii_error(self):
         self.checkdecodeerror("abc\xFF\xFF\xFFcde", "ascii", 3, 4)

diff --git a/pypy/rlib/runicode.py b/pypy/rlib/runicode.py
--- a/pypy/rlib/runicode.py
+++ b/pypy/rlib/runicode.py
@@ -59,22 +59,22 @@
 # utf-8
 
 utf8_code_length = [
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 00-0F
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 70-7F
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # B0-BF
+    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-C1 + C2-CF
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
+    4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  # F0-F4 - F5-FF
 ]
 
 def str_decode_utf_8(s, size, errors, final=False,
@@ -99,13 +99,18 @@
             if not final:
                 break
             else:
+                endpos = pos + 1
+                while endpos < size and ord(s[endpos]) & 0xC0 == 0x80:
+                    endpos += 1
                 r, pos = errorhandler(errors, "utf-8",
-                                      "unexpected end of data", s,  pos, size)
+                                      "unexpected end of data",
+                                      s,  pos, endpos)
                 result.append(r)
-                if pos + n > size:
-                    break
+                continue
+
         if n == 0:
-            r, pos = errorhandler(errors, "utf-8", "unexpected code byte",
+            r, pos = errorhandler(errors, "utf-8",
+                                  "invalid start byte",
                                   s,  pos, pos + 1)
             result.append(r)
         elif n == 1:
@@ -118,18 +123,14 @@
             y, six = splitter[5, 3](ordch1)
             assert six == 6
             if two != 2:
-                r, pos = errorhandler(errors, "utf-8", "invalid data",
-                                      s,  pos, pos + 2)
+                r, pos = errorhandler(errors, "utf-8",
+                                      "invalid continuation byte",
+                                      s,  pos, pos + 1)
                 result.append(r)
             else:
                 c = (y << 6) + z
-                if c < 0x80:
-                    r, pos = errorhandler(errors, "utf-8", "illegal encoding",
-                                          s,  pos, pos + 2)
-                    result.append(r)
-                else:
-                    result.append(unichr(c))
-                    pos += n
+                result.append(unichr(c))
+                pos += n
         elif n == 3:
             #  1110xxxx 10yyyyyy 10zzzzzz ====> 00000000 xxxxyyyy yyzzzzzz
             ordch2 = ord(s[pos+1])
@@ -138,24 +139,27 @@
             y, two2 = splitter[6, 2](ordch2)
             x, fourteen = splitter[4, 4](ordch1)
             assert fourteen == 14
-            if two1 != 2 or two2 != 2:
-                r, pos = errorhandler(errors, "utf-8", "invalid data",
-                                      s,  pos, pos + 3)
+            if (two1 != 2 or two2 != 2 or
+                (ordch1 == 0xe0 and ordch2 < 0xa0)
+                # surrogates shouldn't be valid UTF-8!
+                # Uncomment the line below to make them invalid.
+                # or (ordch1 == 0xed and ordch2 > 0x9f)
+                ):
+
+                # if ordch2 first two bits are 1 and 0, then the invalid
+                # continuation byte is ordch3; else ordch2 is invalid.
+                if two2 == 2:
+                    endpos = pos + 2
+                else:
+                    endpos = pos + 1
+                r, pos = errorhandler(errors, "utf-8",
+                                      "invalid continuation byte",
+                                      s,  pos, endpos)
                 result.append(r)
             else:
                 c = (x << 12) + (y << 6) + z
-                # Note: UTF-8 encodings of surrogates are considered
-                # legal UTF-8 sequences;
-                # XXX For wide builds (UCS-4) we should probably try
-                #     to recombine the surrogates into a single code
-                #     unit.
-                if c < 0x0800:
-                    r, pos = errorhandler(errors, "utf-8", "illegal encoding",
-                                          s,  pos, pos + 3)
-                    result.append(r)
-                else:
-                    result.append(unichr(c))
-                    pos += n
+                result.append(unichr(c))
+                pos += n
         elif n == 4:
             # 11110www 10xxxxxx 10yyyyyy 10zzzzzz ====>
             # 000wwwxx xxxxyyyy yyzzzzzz
@@ -167,31 +171,32 @@
             x, two3 = splitter[6, 2](ordch2)
             w, thirty = splitter[3, 5](ordch1)
             assert thirty == 30
-            if two1 != 2 or two2 != 2 or two3 != 2:
-                r, pos = errorhandler(errors, "utf-8", "invalid data",
-                                      s,  pos, pos + 4)
+            if (two1 != 2 or two2 != 2 or two3 != 2 or
+                (ordch1 == 0xf0 and ordch2 < 0x90) or
+                (ordch1 == 0xf4 and ordch2 > 0x8f)):
+                endpos = pos + 1
+                if ordch2 & 0xc0 == 0x80:
+                    endpos += 1
+                    if ordch3 & 0xc0 == 0x80:
+                        endpos += 1
+                r, pos = errorhandler(errors, "utf-8",
+                                      "invalid continuation byte",
+                                      s,  pos, endpos)
                 result.append(r)
             else:
                 c = (w << 18) + (x << 12) + (y << 6) + z
-                # minimum value allowed for 4 byte encoding
-                # maximum value allowed for UTF-16
-                if c < 0x10000 or c > 0x10ffff:
-                    r, pos = errorhandler(errors, "utf-8", "illegal encoding",
-                                          s,  pos, pos + 4)
-                    result.append(r)
+                # convert to UTF-16 if necessary
+                if c <= MAXUNICODE:
+                    result.append(UNICHR(c))
                 else:
-                    # convert to UTF-16 if necessary
-                    if c <= MAXUNICODE:
-                        result.append(UNICHR(c))
-                    else:
-                        # compute and append the two surrogates:
-                        # translate from 10000..10FFFF to 0..FFFF
-                        c -= 0x10000
-                        # high surrogate = top 10 bits added to D800
-                        result.append(unichr(0xD800 + (c >> 10)))
-                        # low surrogate = bottom 10 bits added to DC00
-                        result.append(unichr(0xDC00 + (c & 0x03FF)))
-                    pos += n
+                    # compute and append the two surrogates:
+                    # translate from 10000..10FFFF to 0..FFFF
+                    c -= 0x10000
+                    # high surrogate = top 10 bits added to D800
+                    result.append(unichr(0xD800 + (c >> 10)))
+                    # low surrogate = bottom 10 bits added to DC00
+                    result.append(unichr(0xDC00 + (c & 0x03FF)))
+                pos += n
         else:
             r, pos = errorhandler(errors, "utf-8",
                                   "unsupported Unicode code range",