[Python-checkins] cpython: Issue #9319: Include the filename in "Non-UTF8 code ..." syntax error.

victor.stinner python-checkins at python.org
Tue Apr 5 01:48:35 CEST 2011


http://hg.python.org/cpython/rev/7b8d625eb6e4
changeset:   69148:7b8d625eb6e4
user:        Victor Stinner <victor.stinner at haypocalc.com>
date:        Tue Apr 05 01:48:03 2011 +0200
summary:
  Issue #9319: Include the filename in "Non-UTF8 code ..." syntax error.

files:
  Lib/test/test_imp.py |   6 ++++
  Misc/NEWS            |   2 +
  Parser/tokenizer.c   |  41 +++++++++++++++++++++----------
  Parser/tokenizer.h   |   1 -
  Python/import.c      |  10 +++---
  Python/traceback.c   |   6 ++--
  6 files changed, 43 insertions(+), 23 deletions(-)


diff --git a/Lib/test/test_imp.py b/Lib/test/test_imp.py
--- a/Lib/test/test_imp.py
+++ b/Lib/test/test_imp.py
@@ -58,6 +58,12 @@
             with imp.find_module('module_' + mod, self.test_path)[0] as fd:
                 self.assertEqual(fd.encoding, encoding)
 
+        path = [os.path.dirname(__file__)]
+        self.assertRaisesRegex(SyntaxError,
+            r"Non-UTF-8 code starting with '\\xf6'"
+            r" in file .*badsyntax_pep3120.py",
+            imp.find_module, 'badsyntax_pep3120', path)
+
     def test_issue1267(self):
         for mod, encoding, _ in self.test_strings:
             fp, filename, info  = imp.find_module('module_' + mod,
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,8 @@
 Core and Builtins
 -----------------
 
+- Issue #9319: Include the filename in "Non-UTF8 code ..." syntax error.
+
 - Issue #10785: Store the filename as Unicode in the Python parser.
 
 - Issue #11619: _PyImport_LoadDynamicModule() doesn't encode the path to bytes
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1690,17 +1690,18 @@
     return result;
 }
 
-/* Get -*- encoding -*- from a Python file.
+/* Get the encoding of a Python file. Check for the coding cookie and check if
+   the file starts with a BOM.
 
-   PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
-   the first or second line of the file (in which case the encoding
-   should be assumed to be PyUnicode_GetDefaultEncoding()).
+   PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
+   encoding in the first or second line of the file (in which case the encoding
+   should be assumed to be UTF-8).
 
-   The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
-   by the caller.
-*/
+   The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
+   by the caller. */
+
 char *
-PyTokenizer_FindEncoding(int fd)
+PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
 {
     struct tok_state *tok;
     FILE *fp;
@@ -1720,9 +1721,18 @@
         return NULL;
     }
 #ifndef PGEN
-    tok->filename = PyUnicode_FromString("<string>");
-    if (tok->filename == NULL)
-        goto error;
+    if (filename != NULL) {
+        Py_INCREF(filename);
+        tok->filename = filename;
+    }
+    else {
+        tok->filename = PyUnicode_FromString("<string>");
+        if (tok->filename == NULL) {
+            fclose(fp);
+            PyTokenizer_Free(tok);
+            return encoding;
+        }
+    }
 #endif
     while (tok->lineno < 2 && tok->done == E_OK) {
         PyTokenizer_Get(tok, &p_start, &p_end);
@@ -1733,13 +1743,16 @@
         if (encoding)
         strcpy(encoding, tok->encoding);
     }
-#ifndef PGEN
-error:
-#endif
     PyTokenizer_Free(tok);
     return encoding;
 }
 
+char *
+PyTokenizer_FindEncoding(int fd)
+{
+    return PyTokenizer_FindEncodingFilename(fd, NULL);
+}
+
 #ifdef Py_DEBUG
 
 void
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -75,7 +75,6 @@
 extern int PyTokenizer_Get(struct tok_state *, char **, char **);
 extern char * PyTokenizer_RestoreEncoding(struct tok_state* tok,
                                           int len, int *offset);
-extern char * PyTokenizer_FindEncoding(int);
 
 #ifdef __cplusplus
 }
diff --git a/Python/import.c b/Python/import.c
--- a/Python/import.c
+++ b/Python/import.c
@@ -124,12 +124,12 @@
 /* See _PyImport_FixupExtensionObject() below */
 static PyObject *extensions = NULL;
 
+/* Function from Parser/tokenizer.c */
+extern char * PyTokenizer_FindEncodingFilename(int, PyObject *);
+
 /* This table is defined in config.c: */
 extern struct _inittab _PyImport_Inittab[];
 
-/* Method from Parser/tokenizer.c */
-extern char * PyTokenizer_FindEncoding(int);
-
 struct _inittab *PyImport_Inittab = _PyImport_Inittab;
 
 /* these tables define the module suffixes that Python recognizes */
@@ -3540,9 +3540,9 @@
     }
     if (fd != -1) {
         if (strchr(fdp->mode, 'b') == NULL) {
-            /* PyTokenizer_FindEncoding() returns PyMem_MALLOC'ed
+            /* PyTokenizer_FindEncodingFilename() returns PyMem_MALLOC'ed
                memory. */
-            found_encoding = PyTokenizer_FindEncoding(fd);
+            found_encoding = PyTokenizer_FindEncodingFilename(fd, pathobj);
             lseek(fd, 0, 0); /* Reset position */
             if (found_encoding == NULL && PyErr_Occurred()) {
                 Py_XDECREF(pathobj);
diff --git a/Python/traceback.c b/Python/traceback.c
--- a/Python/traceback.c
+++ b/Python/traceback.c
@@ -18,8 +18,8 @@
 #define MAX_FRAME_DEPTH 100
 #define MAX_NTHREADS 100
 
-/* Method from Parser/tokenizer.c */
-extern char * PyTokenizer_FindEncoding(int);
+/* Function from Parser/tokenizer.c */
+extern char * PyTokenizer_FindEncodingFilename(int, PyObject *);
 
 static PyObject *
 tb_dir(PyTracebackObject *self)
@@ -251,7 +251,7 @@
 
     /* use the right encoding to decode the file as unicode */
     fd = PyObject_AsFileDescriptor(binary);
-    found_encoding = PyTokenizer_FindEncoding(fd);
+    found_encoding = PyTokenizer_FindEncodingFilename(fd, filename);
     encoding = (found_encoding != NULL) ? found_encoding : "utf-8";
     lseek(fd, 0, 0); /* Reset position */
     fob = PyObject_CallMethod(io, "TextIOWrapper", "Os", binary, encoding);

-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list