[Python-checkins] [3.10] bpo-46339: Fix crash in the parser when computing error text for multi-line f-strings (GH-30529) (GH-30542)

Thu Jan 20 08:05:28 EST 2022

https://github.com/python/cpython/commit/1fb1f5d8bd084c20f0a5fde547b563c08d103f09
commit: 1fb1f5d8bd084c20f0a5fde547b563c08d103f09
branch: 3.10
author: Miss Islington (bot) <31488909+miss-islington at users.noreply.github.com>
committer: pablogsal <Pablogsal at gmail.com>
date: 2022-01-20T13:05:10Z
summary:

[3.10] bpo-46339: Fix crash in the parser when computing error text for multi-line f-strings (GH-30529) (GH-30542)

* bpo-46339: Fix crash in the parser when computing error text for multi-line f-strings (GH-30529)

Automerge-Triggered-By: GH:pablogsal
(cherry picked from commit cedec19be81e6bd153678bfb28c8e217af8bda58)

Co-authored-by: Pablo Galindo Salgado <Pablogsal at gmail.com>

* Fix interactive mode

Co-authored-by: Pablo Galindo Salgado <Pablogsal at gmail.com>

files:
A Misc/NEWS.d/next/Core and Builtins/2022-01-11-11-50-19.bpo-46339.OVumDZ.rst
D Parser/pegen_errors.c
M Lib/test/test_exceptions.py
M Parser/pegen.c

diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py
index 86b5dccaaed98..b3d1c35274c71 100644
--- a/Lib/test/test_exceptions.py
+++ b/Lib/test/test_exceptions.py
@@ -278,6 +278,12 @@ def baz():
             }
             \"\"\"
             }'''""", 5, 17)
+        check('''f"""
+
+
+            {
+            6
+            0="""''', 5, 13)
 
         # Errors thrown by symtable.c
         check('x = [(yield i) for i in range(3)]', 1, 7)
diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-01-11-11-50-19.bpo-46339.OVumDZ.rst b/Misc/NEWS.d/next/Core and Builtins/2022-01-11-11-50-19.bpo-46339.OVumDZ.rst
new file mode 100644
index 0000000000000..cd04f060826b2
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2022-01-11-11-50-19.bpo-46339.OVumDZ.rst	
@@ -0,0 +1,3 @@
+Fix a crash in the parser when retrieving the error text for multi-line
+f-string expressions that do not start on the first line of the string.
+Patch by Pablo Galindo.
diff --git a/Parser/pegen.c b/Parser/pegen.c
index e507415f6d14c..f9812c0ea8f02 100644
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@@ -436,9 +436,17 @@ get_error_line(Parser *p, Py_ssize_t lineno)
 
     char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
     assert(cur_line != NULL);
+    const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;
 
-    for (int i = 0; i < lineno - 1; i++) {
-        cur_line = strchr(cur_line, '\n') + 1;
+    Py_ssize_t relative_lineno = p->starting_lineno ? lineno - p->starting_lineno + 1 : lineno;
+
+    for (int i = 0; i < relative_lineno - 1; i++) {
+        char *new_line = strchr(cur_line, '\n') + 1;
+        assert(new_line != NULL && new_line <= buf_end);
+        if (new_line == NULL || new_line > buf_end) {
+            break;
+        }
+        cur_line = new_line;
     }
 
     char *next_newline;
diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c
deleted file mode 100644
index 93057d151db38..0000000000000
--- a/Parser/pegen_errors.c
+++ /dev/null
@@ -1,425 +0,0 @@
-#include <Python.h>
-#include <errcode.h>
-
-#include "tokenizer.h"
-#include "pegen.h"
-
-// TOKENIZER ERRORS
-
-void
-_PyPegen_raise_tokenizer_init_error(PyObject *filename)
-{
-    if (!(PyErr_ExceptionMatches(PyExc_LookupError)
-          || PyErr_ExceptionMatches(PyExc_SyntaxError)
-          || PyErr_ExceptionMatches(PyExc_ValueError)
-          || PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))) {
-        return;
-    }
-    PyObject *errstr = NULL;
-    PyObject *tuple = NULL;
-    PyObject *type;
-    PyObject *value;
-    PyObject *tback;
-    PyErr_Fetch(&type, &value, &tback);
-    errstr = PyObject_Str(value);
-    if (!errstr) {
-        goto error;
-    }
-
-    PyObject *tmp = Py_BuildValue("(OiiO)", filename, 0, -1, Py_None);
-    if (!tmp) {
-        goto error;
-    }
-
-    tuple = PyTuple_Pack(2, errstr, tmp);
-    Py_DECREF(tmp);
-    if (!value) {
-        goto error;
-    }
-    PyErr_SetObject(PyExc_SyntaxError, tuple);
-
-error:
-    Py_XDECREF(type);
-    Py_XDECREF(value);
-    Py_XDECREF(tback);
-    Py_XDECREF(errstr);
-    Py_XDECREF(tuple);
-}
-
-static inline void
-raise_unclosed_parentheses_error(Parser *p) {
-       int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
-       int error_col = p->tok->parencolstack[p->tok->level-1];
-       RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError,
-                                  error_lineno, error_col, error_lineno, -1,
-                                  "'%c' was never closed",
-                                  p->tok->parenstack[p->tok->level-1]);
-}
-
-int
-_Pypegen_tokenizer_error(Parser *p)
-{
-    if (PyErr_Occurred()) {
-        return -1;
-    }
-
-    const char *msg = NULL;
-    PyObject* errtype = PyExc_SyntaxError;
-    Py_ssize_t col_offset = -1;
-    switch (p->tok->done) {
-        case E_TOKEN:
-            msg = "invalid token";
-            break;
-        case E_EOF:
-            if (p->tok->level) {
-                raise_unclosed_parentheses_error(p);
-            } else {
-                RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
-            }
-            return -1;
-        case E_DEDENT:
-            RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level");
-            return -1;
-        case E_INTR:
-            if (!PyErr_Occurred()) {
-                PyErr_SetNone(PyExc_KeyboardInterrupt);
-            }
-            return -1;
-        case E_NOMEM:
-            PyErr_NoMemory();
-            return -1;
-        case E_TABSPACE:
-            errtype = PyExc_TabError;
-            msg = "inconsistent use of tabs and spaces in indentation";
-            break;
-        case E_TOODEEP:
-            errtype = PyExc_IndentationError;
-            msg = "too many levels of indentation";
-            break;
-        case E_LINECONT: {
-            col_offset = p->tok->cur - p->tok->buf - 1;
-            msg = "unexpected character after line continuation character";
-            break;
-        }
-        default:
-            msg = "unknown parsing error";
-    }
-
-    RAISE_ERROR_KNOWN_LOCATION(p, errtype, p->tok->lineno,
-                               col_offset >= 0 ? col_offset : 0,
-                               p->tok->lineno, -1, msg);
-    return -1;
-}
-
-int
-_Pypegen_raise_decode_error(Parser *p)
-{
-    assert(PyErr_Occurred());
-    const char *errtype = NULL;
-    if (PyErr_ExceptionMatches(PyExc_UnicodeError)) {
-        errtype = "unicode error";
-    }
-    else if (PyErr_ExceptionMatches(PyExc_ValueError)) {
-        errtype = "value error";
-    }
-    if (errtype) {
-        PyObject *type;
-        PyObject *value;
-        PyObject *tback;
-        PyObject *errstr;
-        PyErr_Fetch(&type, &value, &tback);
-        errstr = PyObject_Str(value);
-        if (errstr) {
-            RAISE_SYNTAX_ERROR("(%s) %U", errtype, errstr);
-            Py_DECREF(errstr);
-        }
-        else {
-            PyErr_Clear();
-            RAISE_SYNTAX_ERROR("(%s) unknown error", errtype);
-        }
-        Py_XDECREF(type);
-        Py_XDECREF(value);
-        Py_XDECREF(tback);
-    }
-
-    return -1;
-}
-
-static int
-_PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
-    // Tokenize the whole input to see if there are any tokenization
-    // errors such as mistmatching parentheses. These will get priority
-    // over generic syntax errors only if the line number of the error is
-    // before the one that we had for the generic error.
-
-    // We don't want to tokenize to the end for interactive input
-    if (p->tok->prompt != NULL) {
-        return 0;
-    }
-
-    PyObject *type, *value, *traceback;
-    PyErr_Fetch(&type, &value, &traceback);
-
-    Token *current_token = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
-    Py_ssize_t current_err_line = current_token->lineno;
-
-    int ret = 0;
-
-    for (;;) {
-        const char *start;
-        const char *end;
-        switch (_PyTokenizer_Get(p->tok, &start, &end)) {
-            case ERRORTOKEN:
-                if (p->tok->level != 0) {
-                    int error_lineno = p->tok->parenlinenostack[p->tok->level-1];
-                    if (current_err_line > error_lineno) {
-                        raise_unclosed_parentheses_error(p);
-                        ret = -1;
-                        goto exit;
-                    }
-                }
-                break;
-            case ENDMARKER:
-                break;
-            default:
-                continue;
-        }
-        break;
-    }
-
-
-exit:
-    if (PyErr_Occurred()) {
-        Py_XDECREF(value);
-        Py_XDECREF(type);
-        Py_XDECREF(traceback);
-    } else {
-        PyErr_Restore(type, value, traceback);
-    }
-    return ret;
-}
-
-// PARSER ERRORS
-
-void *
-_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
-{
-    if (p->fill == 0) {
-        va_list va;
-        va_start(va, errmsg);
-        _PyPegen_raise_error_known_location(p, errtype, 0, 0, 0, -1, errmsg, va);
-        va_end(va);
-        return NULL;
-    }
-
-    Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
-    Py_ssize_t col_offset;
-    Py_ssize_t end_col_offset = -1;
-    if (t->col_offset == -1) {
-        if (p->tok->cur == p->tok->buf) {
-            col_offset = 0;
-        } else {
-            const char* start = p->tok->buf  ? p->tok->line_start : p->tok->buf;
-            col_offset = Py_SAFE_DOWNCAST(p->tok->cur - start, intptr_t, int);
-        }
-    } else {
-        col_offset = t->col_offset + 1;
-    }
-
-    if (t->end_col_offset != -1) {
-        end_col_offset = t->end_col_offset + 1;
-    }
-
-    va_list va;
-    va_start(va, errmsg);
-    _PyPegen_raise_error_known_location(p, errtype, t->lineno, col_offset, t->end_lineno, end_col_offset, errmsg, va);
-    va_end(va);
-
-    return NULL;
-}
-
-static PyObject *
-get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
-{
-    /* If the file descriptor is interactive, the source lines of the current
-     * (multi-line) statement are stored in p->tok->interactive_src_start.
-     * If not, we're parsing from a string, which means that the whole source
-     * is stored in p->tok->str. */
-    assert((p->tok->fp == NULL && p->tok->str != NULL) || p->tok->fp == stdin);
-
-    char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
-    assert(cur_line != NULL);
-
-    for (int i = 0; i < lineno - 1; i++) {
-        cur_line = strchr(cur_line, '\n') + 1;
-    }
-
-    char *next_newline;
-    if ((next_newline = strchr(cur_line, '\n')) == NULL) { // This is the last line
-        next_newline = cur_line + strlen(cur_line);
-    }
-    return PyUnicode_DecodeUTF8(cur_line, next_newline - cur_line, "replace");
-}
-
-void *
-_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
-                                    Py_ssize_t lineno, Py_ssize_t col_offset,
-                                    Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
-                                    const char *errmsg, va_list va)
-{
-    PyObject *value = NULL;
-    PyObject *errstr = NULL;
-    PyObject *error_line = NULL;
-    PyObject *tmp = NULL;
-    p->error_indicator = 1;
-
-    if (end_lineno == CURRENT_POS) {
-        end_lineno = p->tok->lineno;
-    }
-    if (end_col_offset == CURRENT_POS) {
-        end_col_offset = p->tok->cur - p->tok->line_start;
-    }
-
-    if (p->start_rule == Py_fstring_input) {
-        const char *fstring_msg = "f-string: ";
-        Py_ssize_t len = strlen(fstring_msg) + strlen(errmsg);
-
-        char *new_errmsg = PyMem_Malloc(len + 1); // Lengths of both strings plus NULL character
-        if (!new_errmsg) {
-            return (void *) PyErr_NoMemory();
-        }
-
-        // Copy both strings into new buffer
-        memcpy(new_errmsg, fstring_msg, strlen(fstring_msg));
-        memcpy(new_errmsg + strlen(fstring_msg), errmsg, strlen(errmsg));
-        new_errmsg[len] = 0;
-        errmsg = new_errmsg;
-    }
-    errstr = PyUnicode_FromFormatV(errmsg, va);
-    if (!errstr) {
-        goto error;
-    }
-
-    if (p->tok->fp_interactive) {
-        error_line = get_error_line_from_tokenizer_buffers(p, lineno);
-    }
-    else if (p->start_rule == Py_file_input) {
-        error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
-                                                     (int) lineno, p->tok->encoding);
-    }
-
-    if (!error_line) {
-        /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
-           then we need to find the error line from some other source, because
-           p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
-           failed or we're parsing from a string or the REPL. There's a third edge case where
-           we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
-           `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
-           does not physically exist */
-        assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);
-
-        if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
-            Py_ssize_t size = p->tok->inp - p->tok->buf;
-            error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
-        }
-        else if (p->tok->fp == NULL || p->tok->fp == stdin) {
-            error_line = get_error_line_from_tokenizer_buffers(p, lineno);
-        }
-        else {
-            error_line = PyUnicode_FromStringAndSize("", 0);
-        }
-        if (!error_line) {
-            goto error;
-        }
-    }
-
-    if (p->start_rule == Py_fstring_input) {
-        col_offset -= p->starting_col_offset;
-        end_col_offset -= p->starting_col_offset;
-    }
-
-    Py_ssize_t col_number = col_offset;
-    Py_ssize_t end_col_number = end_col_offset;
-
-    if (p->tok->encoding != NULL) {
-        col_number = _PyPegen_byte_offset_to_character_offset(error_line, col_offset);
-        if (col_number < 0) {
-            goto error;
-        }
-        if (end_col_number > 0) {
-            Py_ssize_t end_col_offset = _PyPegen_byte_offset_to_character_offset(error_line, end_col_number);
-            if (end_col_offset < 0) {
-                goto error;
-            } else {
-                end_col_number = end_col_offset;
-            }
-        }
-    }
-    tmp = Py_BuildValue("(OiiNii)", p->tok->filename, lineno, col_number, error_line, end_lineno, end_col_number);
-    if (!tmp) {
-        goto error;
-    }
-    value = PyTuple_Pack(2, errstr, tmp);
-    Py_DECREF(tmp);
-    if (!value) {
-        goto error;
-    }
-    PyErr_SetObject(errtype, value);
-
-    Py_DECREF(errstr);
-    Py_DECREF(value);
-    if (p->start_rule == Py_fstring_input) {
-        PyMem_Free((void *)errmsg);
-    }
-    return NULL;
-
-error:
-    Py_XDECREF(errstr);
-    Py_XDECREF(error_line);
-    if (p->start_rule == Py_fstring_input) {
-        PyMem_Free((void *)errmsg);
-    }
-    return NULL;
-}
-
-void
-_Pypegen_set_syntax_error(Parser* p, Token* last_token) {
-    // Existing sintax error
-    if (PyErr_Occurred()) {
-        // Prioritize tokenizer errors to custom syntax errors raised
-        // on the second phase only if the errors come from the parser.
-        if (p->tok->done == E_DONE && PyErr_ExceptionMatches(PyExc_SyntaxError)) {
-            _PyPegen_tokenize_full_source_to_check_for_errors(p);
-        }
-        // Propagate the existing syntax error.
-        return;
-    }
-    // Initialization error
-    if (p->fill == 0) {
-        RAISE_SYNTAX_ERROR("error at start before reading any input");
-    }
-    // Parser encountered EOF (End of File) unexpectedtly
-    if (last_token->type == ERRORTOKEN && p->tok->done == E_EOF) {
-        if (p->tok->level) {
-            raise_unclosed_parentheses_error(p);
-        } else {
-            RAISE_SYNTAX_ERROR("unexpected EOF while parsing");
-        }
-        return;
-    }
-    // Indentation error in the tokenizer
-    if (last_token->type == INDENT || last_token->type == DEDENT) {
-        RAISE_INDENTATION_ERROR(last_token->type == INDENT ? "unexpected indent" : "unexpected unindent");
-        return;
-    }
-    // Unknown error (generic case)
-
-    // Use the last token we found on the first pass to avoid reporting
-    // incorrect locations for generic syntax errors just because we reached
-    // further away when trying to find specific syntax errors in the second
-    // pass.
-    RAISE_SYNTAX_ERROR_KNOWN_LOCATION(last_token, "invalid syntax");
-    // _PyPegen_tokenize_full_source_to_check_for_errors will override the existing
-    // generic SyntaxError we just raised if errors are found.
-    _PyPegen_tokenize_full_source_to_check_for_errors(p);
-}