[pypy-commit] pypy py3.5: Give more precise error messages---still different from CPython, but well, too bad I suppose

Tue Dec 6 09:29:25 EST 2016

Author: Armin Rigo <arigo at tunes.org>
Branch: py3.5
Changeset: r88903:ec42187d7005
Date: 2016-12-06 15:28 +0100
http://bitbucket.org/pypy/pypy/changeset/ec42187d7005/

Log:	Give more precise error messages---still different from CPython, but
	well, too bad I suppose

diff --git a/pypy/interpreter/astcompiler/consts.py b/pypy/interpreter/astcompiler/consts.py
--- a/pypy/interpreter/astcompiler/consts.py
+++ b/pypy/interpreter/astcompiler/consts.py
@@ -32,3 +32,4 @@
 PyCF_ONLY_AST = 0x0400
 PyCF_IGNORE_COOKIE = 0x0800
 PyCF_ACCEPT_NULL_BYTES = 0x10000000   # PyPy only, for compile()
+PyCF_FOUND_ENCODING = 0x20000000      # PyPy only, for pytokenizer
diff --git a/pypy/interpreter/pyparser/pyparse.py b/pypy/interpreter/pyparser/pyparse.py
--- a/pypy/interpreter/pyparser/pyparse.py
+++ b/pypy/interpreter/pyparser/pyparse.py
@@ -108,6 +108,7 @@
         tree is handled here.
         """
         # Detect source encoding.
+        explicit_encoding = False
         enc = None
         if compile_info.flags & consts.PyCF_SOURCE_IS_UTF8:
             enc = 'utf-8'
@@ -119,12 +120,14 @@
             enc = 'utf-8'
             # If an encoding is explicitly given check that it is utf-8.
             decl_enc = _check_for_encoding(bytessrc)
+            explicit_encoding = (decl_enc is not None)
             if decl_enc and decl_enc != "utf-8":
                 raise error.SyntaxError("UTF-8 BOM with %s coding cookie" % decl_enc,
                                         filename=compile_info.filename)
             textsrc = bytessrc
         else:
             enc = _normalize_encoding(_check_for_encoding(bytessrc))
+            explicit_encoding = (enc is not None)
             if enc is None:
                 enc = 'utf-8'
             try:
@@ -145,6 +148,8 @@
                 raise
 
         flags = compile_info.flags
+        if explicit_encoding:
+            flags |= consts.PyCF_FOUND_ENCODING
 
         # The tokenizer is very picky about how it wants its input.
         source_lines = textsrc.splitlines(True)
diff --git a/pypy/interpreter/pyparser/pytokenizer.py b/pypy/interpreter/pyparser/pytokenizer.py
--- a/pypy/interpreter/pyparser/pytokenizer.py
+++ b/pypy/interpreter/pyparser/pytokenizer.py
@@ -56,16 +56,27 @@
         return False
     return True
 
+def bad_utf8(location_msg, line, lnum, pos, token_list, flags):
+    msg = 'Non-UTF-8 code in %s' % location_msg
+    if not (flags & consts.PyCF_FOUND_ENCODING):
+        # this extra part of the message is added only if we found no
+        # explicit encoding
+        msg += (' but no encoding declared; see '
+                'http://python.org/dev/peps/pep-0263/ for details')
+    return TokenError(msg, line, lnum, pos, token_list)
+
+
 def verify_identifier(token):
+    # 1=ok; 0=not an identifier; -1=bad utf-8
     for c in token:
         if ord(c) >= 0x80:
             break
     else:
-        return True
+        return 1
     try:
         u = token.decode('utf-8')
     except UnicodeDecodeError:
-        return False
+        return -1
     from pypy.objspace.std.unicodeobject import _isidentifier
     return _isidentifier(u)
 
@@ -177,8 +188,8 @@
             if line[pos] == '#':
                 # skip full-line comment, but still check that it is valid utf-8
                 if not verify_utf8(line):
-                    raise TokenError("Non-UTF-8 code in comment",
-                                     line, lnum, pos, token_list)
+                    raise bad_utf8("comment",
+                                   line, lnum, pos, token_list, flags)
                 continue
 
             if column == indents[-1]:
@@ -247,8 +258,8 @@
                 elif initial == '#':
                     # skip comment, but still check that it is valid utf-8
                     if not verify_utf8(token):
-                        raise TokenError("Non-UTF-8 code in comment",
-                                         line, lnum, start, token_list)
+                        raise bad_utf8("comment",
+                                       line, lnum, start, token_list, flags)
                     last_comment = token
                 elif token in triple_quoted:
                     endDFA = endDFAs[token]
@@ -280,7 +291,13 @@
                         last_comment = ''
                 elif (initial in namechars or              # ordinary name
                       ord(initial) >= 0x80):               # unicode identifier
-                    if not verify_identifier(token):
+                    valid = verify_identifier(token)
+                    if valid <= 0:
+                        if valid == -1:
+                            raise bad_utf8("identifier", line, lnum, start + 1,
+                                           token_list, flags)
+                        # valid utf-8, but it gives a unicode char that cannot
+                        # be used in identifiers
                         raise TokenError("invalid character in identifier",
                                          line, lnum, start + 1, token_list)
 
diff --git a/pypy/interpreter/test/test_compiler.py b/pypy/interpreter/test/test_compiler.py
--- a/pypy/interpreter/test/test_compiler.py
+++ b/pypy/interpreter/test/test_compiler.py
@@ -954,6 +954,15 @@
         else:
             assert False, "Expected SyntaxError"
 
+    def test_invalid_utf8(self):
+        e = raises(SyntaxError, compile, b'\x80', "dummy", "exec")
+        assert str(e.value).startswith('Non-UTF-8 code')
+        assert 'but no encoding declared' in str(e.value)
+        e = raises(SyntaxError, compile, b'# coding: utf-8\n\x80',
+                   "dummy", "exec")
+        assert str(e.value).startswith('Non-UTF-8 code')
+        assert 'but no encoding declared' not in str(e.value)
+
     def test_invalid_utf8_in_comments_or_strings(self):
         import sys
         compile(b"# coding: latin1\n#\xfd\n", "dummy", "exec")