[Python-checkins] [3.10] bpo-46339: Fix crash in the parser when computing error text for multi-line f-strings (GH-30529) (GH-30542)
pablogsal
webhook-mailer at python.org
Thu Jan 20 08:05:28 EST 2022
https://github.com/python/cpython/commit/1fb1f5d8bd084c20f0a5fde547b563c08d103f09
commit: 1fb1f5d8bd084c20f0a5fde547b563c08d103f09
branch: 3.10
author: Miss Islington (bot) <31488909+miss-islington at users.noreply.github.com>
committer: pablogsal <Pablogsal at gmail.com>
date: 2022-01-20T13:05:10Z
summary:
[3.10] bpo-46339: Fix crash in the parser when computing error text for multi-line f-strings (GH-30529) (GH-30542)
* bpo-46339: Fix crash in the parser when computing error text for multi-line f-strings (GH-30529)
Automerge-Triggered-By: GH:pablogsal
(cherry picked from commit cedec19be81e6bd153678bfb28c8e217af8bda58)
Co-authored-by: Pablo Galindo Salgado <Pablogsal at gmail.com>
* Fix interactive mode
Co-authored-by: Pablo Galindo Salgado <Pablogsal at gmail.com>
files:
A Misc/NEWS.d/next/Core and Builtins/2022-01-11-11-50-19.bpo-46339.OVumDZ.rst
D Parser/pegen_errors.c
M Lib/test/test_exceptions.py
M Parser/pegen.c
diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py
index 86b5dccaaed98..b3d1c35274c71 100644
--- a/Lib/test/test_exceptions.py
+++ b/Lib/test/test_exceptions.py
@@ -278,6 +278,12 @@ def baz():
}
\"\"\"
}'''""", 5, 17)
+ check('''f"""
+
+
+ {
+ 6
+ 0="""''', 5, 13)
# Errors thrown by symtable.c
check('x = [(yield i) for i in range(3)]', 1, 7)
diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-01-11-11-50-19.bpo-46339.OVumDZ.rst b/Misc/NEWS.d/next/Core and Builtins/2022-01-11-11-50-19.bpo-46339.OVumDZ.rst
new file mode 100644
index 0000000000000..cd04f060826b2
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2022-01-11-11-50-19.bpo-46339.OVumDZ.rst
@@ -0,0 +1,3 @@
+Fix a crash in the parser when retrieving the error text for multi-line
+f-string expressions that do not start in the first line of the string.
+Patch by Pablo Galindo.
diff --git a/Parser/pegen.c b/Parser/pegen.c
index e507415f6d14c..f9812c0ea8f02 100644
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@@ -436,9 +436,17 @@ get_error_line(Parser *p, Py_ssize_t lineno)
char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
assert(cur_line != NULL);
+ const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;
- for (int i = 0; i < lineno - 1; i++) {
- cur_line = strchr(cur_line, '\n') + 1;
+ Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno;
+
+ for (int i = 0; i < relative_lineno - 1; i++) {
+ char *new_line = strchr(cur_line, '\n') + 1;
+ assert(new_line != NULL && new_line <= buf_end);
+ if (new_line == NULL || new_line > buf_end) {
+ break;
+ }
+ cur_line = new_line;
}
char *next_newline;
diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c
deleted file mode 100644
index 93057d151db38..0000000000000
--- a/Parser/pegen_errors.c
+++ /dev/null
@@ -1,425 +0,0 @@
-#include <Python.h>
-#include <errcode.h>
-
-#include "tokenizer.h"
-#include "pegen.h"
-
-// TOKENIZER ERRORS
-
-void
-_PyPegen_raise_tokenizer_init_error(PyObject *filename)
-{
- if (!(PyErr_ExceptionMatches(PyExc_LookupError)
- || PyErr_ExceptionMatches(PyExc_SyntaxError)
- || PyErr_ExceptionMatches(PyExc_ValueError)
- || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
- return;
- }
- PyObject *errstr = NULL;
- PyObject *tuple = NULL;
- PyObject *type;
- PyObject *value;
- PyObject *tback;
- PyErr_Fetch(&type, &value, &tback);
- errstr = PyObject_Str(value);
- if (!errstr) {
- goto error;
- }
-
- PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
- if (!tmp) {
- goto error;
- }
-
- tuple = PyTuple_Pack(2, errstr, tmp);
- Py_DECREF(tmp);
- if (!value) {
- goto error;
- }
- PyErr_SetObject(PyExc_SyntaxError, tuple);
-
-error:
- Py_XDECREF(type);
- Py_XDECREF(value);
- Py_XDECREF(tback);
- Py_XDECREF(errstr);
- Py_XDECREF(tuple);
-}
-
-static inline void
-raise_unclosed_parentheses_error(Parser *p) {
- int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
- int error_col = p->tok->parencolstack[p->tok->level-1];
- RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
- error_lineno, error_col, error_lineno, -1,
- "'%c' was never closed",
- p->tok->parenstack[p->tok->level-1]);
-}
-
-int
-_Pypegen_tokenizer_error(Parser *p)
-{
- if (PyErr_Occurred()) {
- return -1;
- }
-
- const char *msg = NULL;
- PyObject* errtype = PyExc_SyntaxError;
- Py_ssize_t col_offset = -1;
- switch (p->tok->done) {
- case E_TOKEN:
- msg = "invalid token";
- break;
- case E_EOF:
- if (p->tok->level) {
- raise_unclosed_parentheses_error(p);
- } else {
- RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
- }
- return -1;
- case E_DEDENT:
- RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
- return -1;
- case E_INTR:
- if (!PyErr_Occurred()) {
- PyErr_SetNone(PyExc_KeyboardInterrupt);
- }
- return -1;
- case E_NOMEM:
- PyErr_NoMemory();
- return -1;
- case E_TABSPACE:
- errtype = PyExc_TabError;
- msg = "inconsistent use of tabs and spaces in indentation";
- break;
- case E_TOODEEP:
- errtype = PyExc_IndentationError;
- msg = "too many levels of indentation";
- break;
- case E_LINECONT: {
- col_offset = p->tok->cur - p->tok->buf - 1;
- msg = "unexpected character after line continuation character";
- break;
- }
- default:
- msg = "unknown parsing error";
- }
-
- RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
- col_offset >= 0 ? col_offset : 0,
- p->tok->lineno, -1, msg);
- return -1;
-}
-
-int
-_Pypegen_raise_decode_error(Parser *p)
-{
- assert(PyErr_Occurred());
- const char *errtype = NULL;
- if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
- errtype = "unicode error";
- }
- else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
- errtype = "value error";
- }
- if (errtype) {
- PyObject *type;
- PyObject *value;
- PyObject *tback;
- PyObject *errstr;
- PyErr_Fetch(&type, &value, &tback);
- errstr = PyObject_Str(value);
- if (errstr) {
- RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
- Py_DECREF(errstr);
- }
- else {
- PyErr_Clear();
- RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
- }
- Py_XDECREF(type);
- Py_XDECREF(value);
- Py_XDECREF(tback);
- }
-
- return -1;
-}
-
-static int
-_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
- // Tokenize the whole input to see if there are any tokenization
- // errors such as mistmatching parentheses. These will get priority
- // over generic syntax errors only if the line number of the error is
- // before the one that we had for the generic error.
-
- // We don't want to tokenize to the end for interactive input
- if (p->tok->prompt != NULL) {
- return 0;
- }
-
- PyObject *type, *value, *traceback;
- PyErr_Fetch(&type, &value, &traceback);
-
- Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
- Py_ssize_t current_err_line = current_token->lineno;
-
- int ret = 0;
-
- for (;;) {
- const char *start;
- const char *end;
- switch (_PyTokenizer_Get(p->tok, &start, &end)) {
- case ERRORTOKEN:
- if (p->tok->level != 0) {
- int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
- if (current_err_line > error_lineno) {
- raise_unclosed_parentheses_error(p);
- ret = -1;
- goto exit;
- }
- }
- break;
- case ENDMARKER:
- break;
- default:
- continue;
- }
- break;
- }
-
-
-exit:
- if (PyErr_Occurred()) {
- Py_XDECREF(value);
- Py_XDECREF(type);
- Py_XDECREF(traceback);
- } else {
- PyErr_Restore(type, value, traceback);
- }
- return ret;
-}
-
-// PARSER ERRORS
-
-void *
-_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
-{
- if (p->fill == 0) {
- va_list va;
- va_start(va, errmsg);
- _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
- va_end(va);
- return NULL;
- }
-
- Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
- Py_ssize_t col_offset;
- Py_ssize_t end_col_offset = -1;
- if (t->col_offset == -1) {
- if (p->tok->cur == p->tok->buf) {
- col_offset = 0;
- } else {
- const char* start = p->tok->buf ? p->tok->line_start : p->tok->buf;
- col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
- }
- } else {
- col_offset = t->col_offset + 1;
- }
-
- if (t->end_col_offset != -1) {
- end_col_offset = t->end_col_offset + 1;
- }
-
- va_list va;
- va_start(va, errmsg);
- _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
- va_end(va);
-
- return NULL;
-}
-
-static PyObject *
-get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
-{
- /* If the file descriptor is interactive, the source lines of the current
- * (multi-line) statement are stored in p->tok->interactive_src_start.
- * If not, we're parsing from a string, which means that the whole source
- * is stored in p->tok->str. */
- assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp == stdin);
-
- char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
- assert(cur_line != NULL);
-
- for (int i = 0; i < lineno - 1; i++) {
- cur_line = strchr(cur_line, '\n') + 1;
- }
-
- char *next_newline;
- if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
- next_newline = cur_line + strlen(cur_line);
- }
- return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
-}
-
-void *
-_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
- Py_ssize_t lineno, Py_ssize_t col_offset,
- Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
- const char *errmsg, va_list va)
-{
- PyObject *value = NULL;
- PyObject *errstr = NULL;
- PyObject *error_line = NULL;
- PyObject *tmp = NULL;
- p->error_indicator = 1;
-
- if (end_lineno == CURRENT_POS) {
- end_lineno = p->tok->lineno;
- }
- if (end_col_offset == CURRENT_POS) {
- end_col_offset = p->tok->cur - p->tok->line_start;
- }
-
- if (p->start_rule == Py_fstring_input) {
- const char *fstring_msg = "f-string: ";
- Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg);
-
- char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character
- if (!new_errmsg) {
- return (void *) PyErr_NoMemory();
- }
-
- // Copy both strings into new buffer
- memcpy(new_errmsg, fstring_msg, strlen(fstring_msg));
- memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg));
- new_errmsg[len] = 0;
- errmsg = new_errmsg;
- }
- errstr = PyUnicode_FromFormatV(errmsg, va);
- if (!errstr) {
- goto error;
- }
-
- if (p->tok->fp_interactive) {
- error_line = get_error_line_from_tokenizer_buffers(p, lineno);
- }
- else if (p->start_rule == Py_file_input) {
- error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
- (int) lineno, p->tok->encoding);
- }
-
- if (!error_line) {
- /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
- then we need to find the error line from some other source, because
- p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
- failed or we're parsing from a string or the REPL. There's a third edge case where
- we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
- `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
- does not physically exist */
- assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);
-
- if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
- Py_ssize_t size = p->tok->inp - p->tok->buf;
- error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
- }
- else if (p->tok->fp == NULL || p->tok->fp == stdin) {
- error_line = get_error_line_from_tokenizer_buffers(p, lineno);
- }
- else {
- error_line = PyUnicode_FromStringAndSize("", 0);
- }
- if (!error_line) {
- goto error;
- }
- }
-
- if (p->start_rule == Py_fstring_input) {
- col_offset -= p->starting_col_offset;
- end_col_offset -= p->starting_col_offset;
- }
-
- Py_ssize_t col_number = col_offset;
- Py_ssize_t end_col_number = end_col_offset;
-
- if (p->tok->encoding != NULL) {
- col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
- if (col_number < 0) {
- goto error;
- }
- if (end_col_number > 0) {
- Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number);
- if (end_col_offset < 0) {
- goto error;
- } else {
- end_col_number = end_col_offset;
- }
- }
- }
- tmp = Py_BuildValue("(OiiNii)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
- if (!tmp) {
- goto error;
- }
- value = PyTuple_Pack(2, errstr, tmp);
- Py_DECREF(tmp);
- if (!value) {
- goto error;
- }
- PyErr_SetObject(errtype, value);
-
- Py_DECREF(errstr);
- Py_DECREF(value);
- if (p->start_rule == Py_fstring_input) {
- PyMem_Free((void *)errmsg);
- }
- return NULL;
-
-error:
- Py_XDECREF(errstr);
- Py_XDECREF(error_line);
- if (p->start_rule == Py_fstring_input) {
- PyMem_Free((void *)errmsg);
- }
- return NULL;
-}
-
-void
-_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
- // Existing sintax error
- if (PyErr_Occurred()) {
- // Prioritize tokenizer errors to custom syntax errors raised
- // on the second phase only if the errors come from the parser.
- if (p->tok->done == E_DONE && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
- _PyPegen_tokenize_full_source_to_check_for_errors(p);
- }
- // Propagate the existing syntax error.
- return;
- }
- // Initialization error
- if (p->fill == 0) {
- RAISE_SYNTAX_ERROR("error at start before reading any input");
- }
- // Parser encountered EOF (End of File) unexpectedtly
- if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
- if (p->tok->level) {
- raise_unclosed_parentheses_error(p);
- } else {
- RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
- }
- return;
- }
- // Indentation error in the tokenizer
- if (last_token->type == INDENT || last_token->type == DEDENT) {
- RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
- return;
- }
- // Unknown error (generic case)
-
- // Use the last token we found on the first pass to avoid reporting
- // incorrect locations for generic syntax errors just because we reached
- // further away when trying to find specific syntax errors in the second
- // pass.
- RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
- // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
- // generic SyntaxError we just raised if errors are found.
- _PyPegen_tokenize_full_source_to_check_for_errors(p);
-}
More information about the Python-checkins
mailing list