[pypy-commit] pypy unicode-utf8-py3: cpython prefers "invalid continuation byte" over "unexpected end of data"

mattip pypy.commits at gmail.com
Sun Jan 20 12:11:38 EST 2019


Author: Matti Picus <matti.picus at gmail.com>
Branch: unicode-utf8-py3
Changeset: r95679:98c9d750a001
Date: 2019-01-20 12:05 +0200
http://bitbucket.org/pypy/pypy/changeset/98c9d750a001/

Log:	cpython prefers "invalid continuation byte" over "unexpected end of
	data"

diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -395,8 +395,12 @@
                 if not final:
                     pos -= 1
                     break
-                r, pos, rettype = errorhandler(errors, "utf8", "unexpected end of data",
-                    s, pos - 1, pos)
+                if (pos) < end and  rutf8._invalid_byte_2_of_3(ordch1,
+                                                ord(s[pos]), allow_surrogates):
+                    msg = "invalid continuation byte"
+                else:
+                    msg = "unexpected end of data"
+                r, pos, rettype = errorhandler(errors, "utf8", msg, s, pos - 1, pos)
                 res.append(r)
                 suppressing = True
                 continue
@@ -429,8 +433,13 @@
                 if not final:
                     pos -= 1
                     break
-                r, pos, rettype = errorhandler(errors, "utf8", "unexpected end of data",
-                    s, pos - 1, pos)
+                if pos < end and rutf8._invalid_byte_2_of_4(ordch1, s[pos]):
+                    msg = "invalid continuation byte"
+                elif pos + 1 < end and rutf8._invalid_byte_3_of_4(ord(s[pos + 1])):
+                    msg = "invalid continuation byte"
+                else:
+                    msg = "unexpected end of data"
+                r, pos, rettype = errorhandler(errors, "utf8", msg, s, pos - 1, pos)
                 res.append(r)
                 suppressing = True
                 continue
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -544,6 +544,10 @@
         assert codecs.decode(b'abc') == 'abc'
         exc = raises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')
 
+        exc = raises(UnicodeDecodeError, codecs.decode, b'\xe0\x00', 'utf-8')
+        print(dir(exc.value))
+        assert 'invalid continuation byte' in exc.value.reason
+
     def test_bad_errorhandler_return(self):
         import codecs
         def baddecodereturn1(exc):


More information about the pypy-commit mailing list