[pypy-commit] pypy default: Rewrite str_decode_utf_8_impl() to produce the exact same error messages

Tue Feb 21 13:05:17 EST 2017

Author: Armin Rigo <arigo at tunes.org>
Branch: 
Changeset: r90277:5e8ef7ce3887
Date: 2017-02-21 19:04 +0100
http://bitbucket.org/pypy/pypy/changeset/5e8ef7ce3887/

Log:	Rewrite str_decode_utf_8_impl() to produce the exact same error
	messages as CPython 2.7. (test by fijal)

diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -133,23 +133,6 @@
 def _invalid_cont_byte(ordch):
     return ordch>>6 != 0x2    # 0b10
 
-_invalid_byte_2_of_2 = _invalid_cont_byte
-_invalid_byte_3_of_3 = _invalid_cont_byte
-_invalid_byte_3_of_4 = _invalid_cont_byte
-_invalid_byte_4_of_4 = _invalid_cont_byte
-
-@enforceargs(allow_surrogates=bool)
-def _invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
-    return (ordch2>>6 != 0x2 or    # 0b10
-            (ordch1 == 0xe0 and ordch2 < 0xa0)
-            # surrogates shouldn't be valid UTF-8!
-            or (ordch1 == 0xed and ordch2 > 0x9f and not allow_surrogates))
-
-def _invalid_byte_2_of_4(ordch1, ordch2):
-    return (ordch2>>6 != 0x2 or    # 0b10
-            (ordch1 == 0xf0 and ordch2 < 0x90) or
-            (ordch1 == 0xf4 and ordch2 > 0x8f))
-
 def str_decode_utf_8_impl(s, size, errors, final, errorhandler,
                           allow_surrogates):
     if size == 0:
@@ -170,60 +153,20 @@
         if pos + n > size:
             if not final:
                 break
-            # argh, this obscure block of code is mostly a copy of
-            # what follows :-(
             charsleft = size - pos - 1 # either 0, 1, 2
             # note: when we get the 'unexpected end of data' we need
             # to care about the pos returned; it can be lower than size,
             # in case we need to continue running this loop
-            if not charsleft:
-                # there's only the start byte and nothing else
-                r, pos = errorhandler(errors, 'utf8',
-                                      'unexpected end of data',
-                                      s, pos, pos+1)
-                result.append(r)
-                continue
-            ordch2 = ord(s[pos+1])
-            if n == 3:
-                # 3-bytes seq with only a continuation byte
-                if _invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
-                    # second byte invalid, take the first and continue
-                    r, pos = errorhandler(errors, 'utf8',
-                                          'invalid continuation byte',
-                                          s, pos, pos+1)
-                    result.append(r)
-                    continue
-                else:
-                    # second byte valid, but third byte missing
-                    r, pos = errorhandler(errors, 'utf8',
-                                      'unexpected end of data',
-                                      s, pos, pos+2)
-                    result.append(r)
-                    continue
-            elif n == 4:
-                # 4-bytes seq with 1 or 2 continuation bytes
-                if _invalid_byte_2_of_4(ordch1, ordch2):
-                    # second byte invalid, take the first and continue
-                    r, pos = errorhandler(errors, 'utf8',
-                                          'invalid continuation byte',
-                                          s, pos, pos+1)
-                    result.append(r)
-                    continue
-                elif charsleft == 2 and _invalid_byte_3_of_4(ord(s[pos+2])):
-                    # third byte invalid, take the first two and continue
-                    r, pos = errorhandler(errors, 'utf8',
-                                          'invalid continuation byte',
-                                          s, pos, pos+2)
-                    result.append(r)
-                    continue
-                else:
-                    # there's only 1 or 2 valid cb, but the others are missing
-                    r, pos = errorhandler(errors, 'utf8',
-                                      'unexpected end of data',
-                                      s, pos, pos+charsleft+1)
-                    result.append(r)
-                    continue
-            raise AssertionError("unreachable")
+            endpos = pos + 1
+            if charsleft >= 1 and not _invalid_cont_byte(ord(s[pos+1])):
+                endpos = pos + 2
+                if charsleft >= 2 and not _invalid_cont_byte(ord(s[pos+2])):
+                    endpos = pos + 3
+            r, pos = errorhandler(errors, 'utf8',
+                                  'unexpected end of data',
+                                  s, pos, endpos)
+            result.append(r)
+            continue
 
         if n == 0:
             r, pos = errorhandler(errors, 'utf8',
@@ -236,7 +179,7 @@
 
         elif n == 2:
             ordch2 = ord(s[pos+1])
-            if _invalid_byte_2_of_2(ordch2):
+            if _invalid_cont_byte(ordch2):
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+1)
@@ -250,41 +193,48 @@
         elif n == 3:
             ordch2 = ord(s[pos+1])
             ordch3 = ord(s[pos+2])
-            if _invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates):
+            if _invalid_cont_byte(ordch2):
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+1)
                 result.append(r)
                 continue
-            elif _invalid_byte_3_of_3(ordch3):
+            elif _invalid_cont_byte(ordch3):
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+2)
                 result.append(r)
                 continue
             # 1110xxxx 10yyyyyy 10zzzzzz -> 00000000 xxxxyyyy yyzzzzzz
-            result.append(unichr(((ordch1 & 0x0F) << 12) +     # 0b00001111
-                                 ((ordch2 & 0x3F) << 6) +      # 0b00111111
-                                 (ordch3 & 0x3F)))             # 0b00111111
+            c = (((ordch1 & 0x0F) << 12) +     # 0b00001111
+                 ((ordch2 & 0x3F) << 6) +      # 0b00111111
+                 (ordch3 & 0x3F))              # 0b00111111
+            if c < 2048 or (0xd800 <= c <= 0xdfff and not allow_surrogates):
+                r, pos = errorhandler(errors, 'utf8',
+                                      'invalid continuation byte',
+                                      s, pos, pos+2)
+                result.append(r)
+                continue
+            result.append(unichr(c))
             pos += 3
 
         elif n == 4:
             ordch2 = ord(s[pos+1])
             ordch3 = ord(s[pos+2])
             ordch4 = ord(s[pos+3])
-            if _invalid_byte_2_of_4(ordch1, ordch2):
+            if _invalid_cont_byte(ordch2):
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+1)
                 result.append(r)
                 continue
-            elif _invalid_byte_3_of_4(ordch3):
+            elif _invalid_cont_byte(ordch3):
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+2)
                 result.append(r)
                 continue
-            elif _invalid_byte_4_of_4(ordch4):
+            elif _invalid_cont_byte(ordch4):
                 r, pos = errorhandler(errors, 'utf8',
                                       'invalid continuation byte',
                                       s, pos, pos+3)
@@ -295,6 +245,12 @@
                  ((ordch2 & 0x3F) << 12) +      # 0b00111111
                  ((ordch3 & 0x3F) << 6) +       # 0b00111111
                  (ordch4 & 0x3F))               # 0b00111111
+            if c <= 65535 or c > 0x10ffff:
+                r, pos = errorhandler(errors, 'utf8',
+                                      'invalid continuation byte',
+                                      s, pos, pos+3)
+                result.append(r)
+                continue
             if c <= MAXUNICODE:
                 result.append(UNICHR(c))
             else:
diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -700,6 +700,27 @@
             assert decoder(seq, len(seq), 'ignore', final=True
                            ) == (res, len(seq))
 
+    @settings(max_examples=10000)
+    @given(strategies.binary())
+    def test_str_check_utf8(self, s):
+        try:
+            u = s.decode("utf8")
+            valid = True
+        except UnicodeDecodeError as e:
+            valid = False
+        try:
+            result, length = runicode.str_decode_utf_8(s, len(s), None,
+                errorhandler=None, final=True, allow_surrogates=True)
+        except UnicodeDecodeError as a:
+            assert not valid
+            assert a.start == e.start
+            assert a.end == e.end
+            assert str(a) == str(e)
+        else:
+            assert valid
+            assert result == u
+            assert length == len(s)
+
 
 class TestEncoding(UnicodeTests):
     def test_all_ascii(self):