[pypy-svn] pypy default: CPython Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
amauryfa
commits-noreply at bitbucket.org
Tue Feb 8 18:33:01 CET 2011
Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch:
Changeset: r41706:fd91ba2df7bb
Date: 2011-02-08 17:13 +0100
http://bitbucket.org/pypy/pypy/changeset/fd91ba2df7bb/
Log: CPython Issue #8271: during the decoding of an invalid UTF-8 byte
sequence, only the start byte and the continuation byte(s) are now
considered invalid, instead of the number of bytes specified by the
start byte.
diff --git a/pypy/rlib/test/test_runicode.py b/pypy/rlib/test/test_runicode.py
--- a/pypy/rlib/test/test_runicode.py
+++ b/pypy/rlib/test/test_runicode.py
@@ -133,20 +133,126 @@
for s in [# unexpected end of data
"\xd7", "\xd6", "\xeb\x96", "\xf0\x90\x91"]:
self.checkdecodeerror(s, "utf-8", 0, len(s), addstuff=False)
-
- for s in [# unexpected code byte
- "\x81", "\xbf",
- # invalid data 2 byte
- "\xd7\x50", "\xd6\x06", "\xd6\xD6",
- # invalid data 3 byte
- "\xeb\x56\x95", "\xeb\x06\x95", "\xeb\xD6\x95",
- "\xeb\x96\x55", "\xeb\x96\x05", "\xeb\x96\xD5",
- # invalid data 4 byte
- "\xf0\x50\x91\x93", "\xf0\x00\x91\x93", "\xf0\xd0\x91\x93",
- "\xf0\x90\x51\x93", "\xf0\x90\x01\x93", "\xf0\x90\xd1\x93",
- "\xf0\x90\x91\x53", "\xf0\x90\x91\x03", "\xf0\x90\x91\xd3",
- ]:
- self.checkdecodeerror(s, "utf-8", 0, len(s), addstuff=True)
+
+ # unexpected code byte
+ for s in ["\x81", "\xbf"]:
+ self.checkdecodeerror(s, "utf-8", 0, 1, addstuff=True)
+
+ # invalid data 2 byte
+ for s in ["\xd7\x50", "\xd6\x06", "\xd6\xD6"]:
+ self.checkdecodeerror(s, "utf-8", 0, 1, addstuff=True)
+ # invalid data 3 byte
+ for s in ["\xeb\x56\x95", "\xeb\x06\x95", "\xeb\xD6\x95"]:
+ self.checkdecodeerror(s, "utf-8", 0, 1, addstuff=True)
+ for s in ["\xeb\x96\x55", "\xeb\x96\x05", "\xeb\x96\xD5"]:
+ self.checkdecodeerror(s, "utf-8", 0, 2, addstuff=True)
+ # invalid data 4 byte
+ for s in ["\xf0\x50\x91\x93", "\xf0\x00\x91\x93", "\xf0\xd0\x91\x93"]:
+ self.checkdecodeerror(s, "utf-8", 0, 1, addstuff=True)
+ for s in ["\xf0\x90\x51\x93", "\xf0\x90\x01\x93", "\xf0\x90\xd1\x93"]:
+ self.checkdecodeerror(s, "utf-8", 0, 2, addstuff=True)
+ for s in ["\xf0\x90\x91\x53", "\xf0\x90\x91\x03", "\xf0\x90\x91\xd3"]:
+ self.checkdecodeerror(s, "utf-8", 0, 3, addstuff=True)
+
+
+ def test_issue8271(self):
+ # From CPython
+ # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
+ # only the start byte and the continuation byte(s) are now considered
+ # invalid, instead of the number of bytes specified by the start byte.
+ # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
+ # table 3-8, Row 2) for more information about the algorithm used.
+ FFFD = u'\ufffd'
+ sequences = [
+ # invalid start bytes
+ ('\x80', FFFD), # continuation byte
+ ('\x80\x80', FFFD*2), # 2 continuation bytes
+ ('\xc0', FFFD),
+ ('\xc0\xc0', FFFD*2),
+ ('\xc1', FFFD),
+ ('\xc1\xc0', FFFD*2),
+ ('\xc0\xc1', FFFD*2),
+ # with start byte of a 2-byte sequence
+ ('\xc2', FFFD), # only the start byte
+ ('\xc2\xc2', FFFD*2), # 2 start bytes
+ ('\xc2\xc2\xc2', FFFD*3), # 3 start bytes
+ ('\xc2\x41', FFFD+'A'), # invalid continuation byte
+ # with start byte of a 3-byte sequence
+ ('\xe1', FFFD), # only the start byte
+ ('\xe1\xe1', FFFD*2), # 2 start bytes
+ ('\xe1\xe1\xe1', FFFD*3), # 3 start bytes
+ ('\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
+ ('\xe1\x80', FFFD), # only 1 continuation byte
+ ('\xe1\x41', FFFD+'A'), # invalid continuation byte
+ ('\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
+ ('\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
+ ('\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
+ ('\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
+ ('\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
+ # with start byte of a 4-byte sequence
+ ('\xf1', FFFD), # only the start byte
+ ('\xf1\xf1', FFFD*2), # 2 start bytes
+ ('\xf1\xf1\xf1', FFFD*3), # 3 start bytes
+ ('\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
+ ('\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
+ ('\xf1\x80', FFFD), # only 1 continuation byte
+ ('\xf1\x80\x80', FFFD), # only 2 continuation bytes
+ ('\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
+ ('\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 2 invalid
+ ('\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
+ ('\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cb and 1 valid
+ ('\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 valid
+ ('\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 valid
+ ('\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 2 invalid cb and 1 valid
+ ('\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
+ ('\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
+ ('\xf1\xf1\x80\x41', FFFD*2+'A'),
+ ('\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
+ # with invalid start byte of a 4-byte sequence (rfc2279)
+ ('\xf5', FFFD), # only the start byte
+ ('\xf5\xf5', FFFD*2), # 2 start bytes
+ ('\xf5\x80', FFFD*2), # only 1 continuation byte
+ ('\xf5\x80\x80', FFFD*3), # only 2 continuation bytes
+ ('\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
+ ('\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
+ ('\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
+ ('\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
+ # with invalid start byte of a 5-byte sequence (rfc2279)
+ ('\xf8', FFFD), # only the start byte
+ ('\xf8\xf8', FFFD*2), # 2 start bytes
+ ('\xf8\x80', FFFD*2), # only one continuation byte
+ ('\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
+ ('\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
+ # with invalid start byte of a 6-byte sequence (rfc2279)
+ ('\xfc', FFFD), # only the start byte
+ ('\xfc\xfc', FFFD*2), # 2 start bytes
+ ('\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
+ ('\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
+ # invalid start byte
+ ('\xfe', FFFD),
+ ('\xfe\x80\x80', FFFD*3),
+ # other sequences
+ ('\xf1\x80\x41\x42\x43', u'\ufffd\x41\x42\x43'),
+ ('\xf1\x80\xff\x42\x43', u'\ufffd\ufffd\x42\x43'),
+ ('\xf1\x80\xc2\x81\x43', u'\ufffd\x81\x43'),
+ ('\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
+ u'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
+ ]
+ def replace_handler(errors, codec, message, input, start, end):
+ return FFFD, end
+ def ignore_handler(errors, codec, message, input, start, end):
+ return u'', end
+ for n, (seq, res) in enumerate(sequences):
+ decoder = self.getdecoder('utf-8')
+ raises(UnicodeDecodeError, decoder, seq, len(seq), None, final=True)
+ assert decoder(seq, len(seq), None, final=True,
+ errorhandler=replace_handler) == (res, len(seq))
+ assert decoder(seq + 'b', len(seq) + 1, None, final=True,
+ errorhandler=replace_handler) == (res + u'b',
+ len(seq) + 1)
+ res = res.replace(FFFD, u'')
+ assert decoder(seq, len(seq), None, final=True,
+ errorhandler=ignore_handler) == (res, len(seq))
def test_ascii_error(self):
self.checkdecodeerror("abc\xFF\xFF\xFFcde", "ascii", 3, 4)
diff --git a/pypy/rlib/runicode.py b/pypy/rlib/runicode.py
--- a/pypy/rlib/runicode.py
+++ b/pypy/rlib/runicode.py
@@ -59,22 +59,22 @@
# utf-8
utf8_code_length = [
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 00-0F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 70-7F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80-8F
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # B0-BF
+ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C0-C1 + C2-CF
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D0-DF
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E0-EF
+ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 # F0-F4 - F5-FF
]
def str_decode_utf_8(s, size, errors, final=False,
@@ -99,13 +99,18 @@
if not final:
break
else:
+ endpos = pos + 1
+ while endpos < size and ord(s[endpos]) & 0xC0 == 0x80:
+ endpos += 1
r, pos = errorhandler(errors, "utf-8",
- "unexpected end of data", s, pos, size)
+ "unexpected end of data",
+ s, pos, endpos)
result.append(r)
- if pos + n > size:
- break
+ continue
+
if n == 0:
- r, pos = errorhandler(errors, "utf-8", "unexpected code byte",
+ r, pos = errorhandler(errors, "utf-8",
+ "invalid start byte",
s, pos, pos + 1)
result.append(r)
elif n == 1:
@@ -118,18 +123,14 @@
y, six = splitter[5, 3](ordch1)
assert six == 6
if two != 2:
- r, pos = errorhandler(errors, "utf-8", "invalid data",
- s, pos, pos + 2)
+ r, pos = errorhandler(errors, "utf-8",
+ "invalid continuation byte",
+ s, pos, pos + 1)
result.append(r)
else:
c = (y << 6) + z
- if c < 0x80:
- r, pos = errorhandler(errors, "utf-8", "illegal encoding",
- s, pos, pos + 2)
- result.append(r)
- else:
- result.append(unichr(c))
- pos += n
+ result.append(unichr(c))
+ pos += n
elif n == 3:
# 1110xxxx 10yyyyyy 10zzzzzz ====> 00000000 xxxxyyyy yyzzzzzz
ordch2 = ord(s[pos+1])
@@ -138,24 +139,27 @@
y, two2 = splitter[6, 2](ordch2)
x, fourteen = splitter[4, 4](ordch1)
assert fourteen == 14
- if two1 != 2 or two2 != 2:
- r, pos = errorhandler(errors, "utf-8", "invalid data",
- s, pos, pos + 3)
+ if (two1 != 2 or two2 != 2 or
+ (ordch1 == 0xe0 and ordch2 < 0xa0)
+ # surrogates shouldn't be valid UTF-8!
+ # Uncomment the line below to make them invalid.
+ # or (ordch1 == 0xed and ordch2 > 0x9f)
+ ):
+
+ # if ordch2 first two bits are 1 and 0, then the invalid
+ # continuation byte is ordch3; else ordch2 is invalid.
+ if two2 == 2:
+ endpos = pos + 2
+ else:
+ endpos = pos + 1
+ r, pos = errorhandler(errors, "utf-8",
+ "invalid continuation byte",
+ s, pos, endpos)
result.append(r)
else:
c = (x << 12) + (y << 6) + z
- # Note: UTF-8 encodings of surrogates are considered
- # legal UTF-8 sequences;
- # XXX For wide builds (UCS-4) we should probably try
- # to recombine the surrogates into a single code
- # unit.
- if c < 0x0800:
- r, pos = errorhandler(errors, "utf-8", "illegal encoding",
- s, pos, pos + 3)
- result.append(r)
- else:
- result.append(unichr(c))
- pos += n
+ result.append(unichr(c))
+ pos += n
elif n == 4:
# 11110www 10xxxxxx 10yyyyyy 10zzzzzz ====>
# 000wwwxx xxxxyyyy yyzzzzzz
@@ -167,31 +171,32 @@
x, two3 = splitter[6, 2](ordch2)
w, thirty = splitter[3, 5](ordch1)
assert thirty == 30
- if two1 != 2 or two2 != 2 or two3 != 2:
- r, pos = errorhandler(errors, "utf-8", "invalid data",
- s, pos, pos + 4)
+ if (two1 != 2 or two2 != 2 or two3 != 2 or
+ (ordch1 == 0xf0 and ordch2 < 0x90) or
+ (ordch1 == 0xf4 and ordch2 > 0x8f)):
+ endpos = pos + 1
+ if ordch2 & 0xc0 == 0x80:
+ endpos += 1
+ if ordch3 & 0xc0 == 0x80:
+ endpos += 1
+ r, pos = errorhandler(errors, "utf-8",
+ "invalid continuation byte",
+ s, pos, endpos)
result.append(r)
else:
c = (w << 18) + (x << 12) + (y << 6) + z
- # minimum value allowed for 4 byte encoding
- # maximum value allowed for UTF-16
- if c < 0x10000 or c > 0x10ffff:
- r, pos = errorhandler(errors, "utf-8", "illegal encoding",
- s, pos, pos + 4)
- result.append(r)
+ # convert to UTF-16 if necessary
+ if c <= MAXUNICODE:
+ result.append(UNICHR(c))
else:
- # convert to UTF-16 if necessary
- if c <= MAXUNICODE:
- result.append(UNICHR(c))
- else:
- # compute and append the two surrogates:
- # translate from 10000..10FFFF to 0..FFFF
- c -= 0x10000
- # high surrogate = top 10 bits added to D800
- result.append(unichr(0xD800 + (c >> 10)))
- # low surrogate = bottom 10 bits added to DC00
- result.append(unichr(0xDC00 + (c & 0x03FF)))
- pos += n
+ # compute and append the two surrogates:
+ # translate from 10000..10FFFF to 0..FFFF
+ c -= 0x10000
+ # high surrogate = top 10 bits added to D800
+ result.append(unichr(0xD800 + (c >> 10)))
+ # low surrogate = bottom 10 bits added to DC00
+ result.append(unichr(0xDC00 + (c & 0x03FF)))
+ pos += n
else:
r, pos = errorhandler(errors, "utf-8",
"unsupported Unicode code range",
More information about the Pypy-commit
mailing list