[pypy-commit] pypy py3.5: Give more precise error messages---still different from CPython, but well, too bad I suppose
arigo
pypy.commits at gmail.com
Tue Dec 6 09:29:25 EST 2016
Author: Armin Rigo <arigo at tunes.org>
Branch: py3.5
Changeset: r88903:ec42187d7005
Date: 2016-12-06 15:28 +0100
http://bitbucket.org/pypy/pypy/changeset/ec42187d7005/
Log: Give more precise error messages---still different from CPython, but
well, too bad I suppose
diff --git a/pypy/interpreter/astcompiler/consts.py b/pypy/interpreter/astcompiler/consts.py
--- a/pypy/interpreter/astcompiler/consts.py
+++ b/pypy/interpreter/astcompiler/consts.py
@@ -32,3 +32,4 @@
PyCF_ONLY_AST = 0x0400
PyCF_IGNORE_COOKIE = 0x0800
PyCF_ACCEPT_NULL_BYTES = 0x10000000 # PyPy only, for compile()
+PyCF_FOUND_ENCODING = 0x20000000 # PyPy only, for pytokenizer
diff --git a/pypy/interpreter/pyparser/pyparse.py b/pypy/interpreter/pyparser/pyparse.py
--- a/pypy/interpreter/pyparser/pyparse.py
+++ b/pypy/interpreter/pyparser/pyparse.py
@@ -108,6 +108,7 @@
tree is handled here.
"""
# Detect source encoding.
+ explicit_encoding = False
enc = None
if compile_info.flags & consts.PyCF_SOURCE_IS_UTF8:
enc = 'utf-8'
@@ -119,12 +120,14 @@
enc = 'utf-8'
# If an encoding is explicitly given check that it is utf-8.
decl_enc = _check_for_encoding(bytessrc)
+ explicit_encoding = (decl_enc is not None)
if decl_enc and decl_enc != "utf-8":
raise error.SyntaxError("UTF-8 BOM with %s coding cookie" % decl_enc,
filename=compile_info.filename)
textsrc = bytessrc
else:
enc = _normalize_encoding(_check_for_encoding(bytessrc))
+ explicit_encoding = (enc is not None)
if enc is None:
enc = 'utf-8'
try:
@@ -145,6 +148,8 @@
raise
flags = compile_info.flags
+ if explicit_encoding:
+ flags |= consts.PyCF_FOUND_ENCODING
# The tokenizer is very picky about how it wants its input.
source_lines = textsrc.splitlines(True)
diff --git a/pypy/interpreter/pyparser/pytokenizer.py b/pypy/interpreter/pyparser/pytokenizer.py
--- a/pypy/interpreter/pyparser/pytokenizer.py
+++ b/pypy/interpreter/pyparser/pytokenizer.py
@@ -56,16 +56,27 @@
return False
return True
+def bad_utf8(location_msg, line, lnum, pos, token_list, flags):
+ msg = 'Non-UTF-8 code in %s' % location_msg
+ if not (flags & consts.PyCF_FOUND_ENCODING):
+ # this extra part of the message is added only if we found no
+ # explicit encoding
+ msg += (' but no encoding declared; see '
+ 'http://python.org/dev/peps/pep-0263/ for details')
+ return TokenError(msg, line, lnum, pos, token_list)
+
+
def verify_identifier(token):
+ # 1=ok; 0=not an identifier; -1=bad utf-8
for c in token:
if ord(c) >= 0x80:
break
else:
- return True
+ return 1
try:
u = token.decode('utf-8')
except UnicodeDecodeError:
- return False
+ return -1
from pypy.objspace.std.unicodeobject import _isidentifier
return _isidentifier(u)
@@ -177,8 +188,8 @@
if line[pos] == '#':
# skip full-line comment, but still check that it is valid utf-8
if not verify_utf8(line):
- raise TokenError("Non-UTF-8 code in comment",
- line, lnum, pos, token_list)
+ raise bad_utf8("comment",
+ line, lnum, pos, token_list, flags)
continue
if column == indents[-1]:
@@ -247,8 +258,8 @@
elif initial == '#':
# skip comment, but still check that it is valid utf-8
if not verify_utf8(token):
- raise TokenError("Non-UTF-8 code in comment",
- line, lnum, start, token_list)
+ raise bad_utf8("comment",
+ line, lnum, start, token_list, flags)
last_comment = token
elif token in triple_quoted:
endDFA = endDFAs[token]
@@ -280,7 +291,13 @@
last_comment = ''
elif (initial in namechars or # ordinary name
ord(initial) >= 0x80): # unicode identifier
- if not verify_identifier(token):
+ valid = verify_identifier(token)
+ if valid <= 0:
+ if valid == -1:
+ raise bad_utf8("identifier", line, lnum, start + 1,
+ token_list, flags)
+ # valid utf-8, but it gives a unicode char that cannot
+ # be used in identifiers
raise TokenError("invalid character in identifier",
line, lnum, start + 1, token_list)
diff --git a/pypy/interpreter/test/test_compiler.py b/pypy/interpreter/test/test_compiler.py
--- a/pypy/interpreter/test/test_compiler.py
+++ b/pypy/interpreter/test/test_compiler.py
@@ -954,6 +954,15 @@
else:
assert False, "Expected SyntaxError"
+ def test_invalid_utf8(self):
+ e = raises(SyntaxError, compile, b'\x80', "dummy", "exec")
+ assert str(e.value).startswith('Non-UTF-8 code')
+ assert 'but no encoding declared' in str(e.value)
+ e = raises(SyntaxError, compile, b'# coding: utf-8\n\x80',
+ "dummy", "exec")
+ assert str(e.value).startswith('Non-UTF-8 code')
+ assert 'but no encoding declared' not in str(e.value)
+
def test_invalid_utf8_in_comments_or_strings(self):
import sys
compile(b"# coding: latin1\n#\xfd\n", "dummy", "exec")
More information about the pypy-commit mailing list