[pypy-commit] pypy default: Rewrite str_decode_utf_8_impl() to produce the exact same error messages
arigo
pypy.commits at gmail.com
Tue Feb 21 13:05:17 EST 2017
Author: Armin Rigo <arigo at tunes.org>
Branch:
Changeset: r90277:5e8ef7ce3887
Date: 2017-02-21 19:04 +0100
http://bitbucket.org/pypy/pypy/changeset/5e8ef7ce3887/
Log: Rewrite str_decode_utf_8_impl() to produce the exact same error
messages as CPython 2.7. (test by fijal)
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -133,23 +133,6 @@
def _invalid_cont_byte(ordch):
return ordch>>6 != 0x2 # 0b10
-_invalid_byte_2_of_2 = _invalid_cont_byte
-_invalid_byte_3_of_3 = _invalid_cont_byte
-_invalid_byte_3_of_4 = _invalid_cont_byte
-_invalid_byte_4_of_4 = _invalid_cont_byte
-
-@enforceargs(allow_surrogates=bool)
-def _invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
- return (ordch2>>6 != 0x2 or # 0b10
- (ordch1 == 0xe0 and ordch2 < 0xa0)
- # surrogates shouldn't be valid UTF-8!
- or (ordch1 == 0xed and ordch2 > 0x9f and not allow_surrogates))
-
-def _invalid_byte_2_of_4(ordch1, ordch2):
- return (ordch2>>6 != 0x2 or # 0b10
- (ordch1 == 0xf0 and ordch2 < 0x90) or
- (ordch1 == 0xf4 and ordch2 > 0x8f))
-
def str_decode_utf_8_impl(s, size, errors, final, errorhandler,
allow_surrogates):
if size == 0:
@@ -170,60 +153,20 @@
if pos + n > size:
if not final:
break
- # argh, this obscure block of code is mostly a copy of
- # what follows :-(
charsleft = size - pos - 1 # either 0, 1, 2
# note: when we get the 'unexpected end of data' we need
# to care about the pos returned; it can be lower than size,
# in case we need to continue running this loop
- if not charsleft:
- # there's only the start byte and nothing else
- r, pos = errorhandler(errors, 'utf8',
- 'unexpected end of data',
- s, pos, pos+1)
- result.append(r)
- continue
- ordch2 = ord(s[pos+1])
- if n == 3:
- # 3-bytes seq with only a continuation byte
- if _invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
- # second byte invalid, take the first and continue
- r, pos = errorhandler(errors, 'utf8',
- 'invalid continuation byte',
- s, pos, pos+1)
- result.append(r)
- continue
- else:
- # second byte valid, but third byte missing
- r, pos = errorhandler(errors, 'utf8',
- 'unexpected end of data',
- s, pos, pos+2)
- result.append(r)
- continue
- elif n == 4:
- # 4-bytes seq with 1 or 2 continuation bytes
- if _invalid_byte_2_of_4(ordch1, ordch2):
- # second byte invalid, take the first and continue
- r, pos = errorhandler(errors, 'utf8',
- 'invalid continuation byte',
- s, pos, pos+1)
- result.append(r)
- continue
- elif charsleft == 2 and _invalid_byte_3_of_4(ord(s[pos+2])):
- # third byte invalid, take the first two and continue
- r, pos = errorhandler(errors, 'utf8',
- 'invalid continuation byte',
- s, pos, pos+2)
- result.append(r)
- continue
- else:
- # there's only 1 or 2 valid cb, but the others are missing
- r, pos = errorhandler(errors, 'utf8',
- 'unexpected end of data',
- s, pos, pos+charsleft+1)
- result.append(r)
- continue
- raise AssertionError("unreachable")
+ endpos = pos + 1
+ if charsleft >= 1 and not _invalid_cont_byte(ord(s[pos+1])):
+ endpos = pos + 2
+ if charsleft >= 2 and not _invalid_cont_byte(ord(s[pos+2])):
+ endpos = pos + 3
+ r, pos = errorhandler(errors, 'utf8',
+ 'unexpected end of data',
+ s, pos, endpos)
+ result.append(r)
+ continue
if n == 0:
r, pos = errorhandler(errors, 'utf8',
@@ -236,7 +179,7 @@
elif n == 2:
ordch2 = ord(s[pos+1])
- if _invalid_byte_2_of_2(ordch2):
+ if _invalid_cont_byte(ordch2):
r, pos = errorhandler(errors, 'utf8',
'invalid continuation byte',
s, pos, pos+1)
@@ -250,41 +193,48 @@
elif n == 3:
ordch2 = ord(s[pos+1])
ordch3 = ord(s[pos+2])
- if _invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
+ if _invalid_cont_byte(ordch2):
r, pos = errorhandler(errors, 'utf8',
'invalid continuation byte',
s, pos, pos+1)
result.append(r)
continue
- elif _invalid_byte_3_of_3(ordch3):
+ elif _invalid_cont_byte(ordch3):
r, pos = errorhandler(errors, 'utf8',
'invalid continuation byte',
s, pos, pos+2)
result.append(r)
continue
# 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
- result.append(unichr(((ordch1 & 0x0F) << 12) + # 0b00001111
- ((ordch2 & 0x3F) << 6) + # 0b00111111
- (ordch3 & 0x3F))) # 0b00111111
+ c = (((ordch1 & 0x0F) << 12) + # 0b00001111
+ ((ordch2 & 0x3F) << 6) + # 0b00111111
+ (ordch3 & 0x3F)) # 0b00111111
+ if c < 2048 or (0xd800 <= c <= 0xdfff and not allow_surrogates):
+ r, pos = errorhandler(errors, 'utf8',
+ 'invalid continuation byte',
+ s, pos, pos+2)
+ result.append(r)
+ continue
+ result.append(unichr(c))
pos += 3
elif n == 4:
ordch2 = ord(s[pos+1])
ordch3 = ord(s[pos+2])
ordch4 = ord(s[pos+3])
- if _invalid_byte_2_of_4(ordch1, ordch2):
+ if _invalid_cont_byte(ordch2):
r, pos = errorhandler(errors, 'utf8',
'invalid continuation byte',
s, pos, pos+1)
result.append(r)
continue
- elif _invalid_byte_3_of_4(ordch3):
+ elif _invalid_cont_byte(ordch3):
r, pos = errorhandler(errors, 'utf8',
'invalid continuation byte',
s, pos, pos+2)
result.append(r)
continue
- elif _invalid_byte_4_of_4(ordch4):
+ elif _invalid_cont_byte(ordch4):
r, pos = errorhandler(errors, 'utf8',
'invalid continuation byte',
s, pos, pos+3)
@@ -295,6 +245,12 @@
((ordch2 & 0x3F) << 12) + # 0b00111111
((ordch3 & 0x3F) << 6) + # 0b00111111
(ordch4 & 0x3F)) # 0b00111111
+ if c <= 65535 or c > 0x10ffff:
+ r, pos = errorhandler(errors, 'utf8',
+ 'invalid continuation byte',
+ s, pos, pos+3)
+ result.append(r)
+ continue
if c <= MAXUNICODE:
result.append(UNICHR(c))
else:
diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -700,6 +700,27 @@
assert decoder(seq, len(seq), 'ignore', final=True
) == (res, len(seq))
+ @settings(max_examples=10000)
+ @given(strategies.binary())
+ def test_str_check_utf8(self, s):
+ try:
+ u = s.decode("utf8")
+ valid = True
+ except UnicodeDecodeError as e:
+ valid = False
+ try:
+ result, length = runicode.str_decode_utf_8(s, len(s), None,
+ errorhandler=None, final=True, allow_surrogates=True)
+ except UnicodeDecodeError as a:
+ assert not valid
+ assert a.start == e.start
+ assert a.end == e.end
+ assert str(a) == str(e)
+ else:
+ assert valid
+ assert result == u
+ assert length == len(s)
+
class TestEncoding(UnicodeTests):
def test_all_ascii(self):
More information about the pypy-commit
mailing list