[Python-checkins] bpo-43410: Fix crash in the parser when producing syntax errors when reading from stdin (GH-24763)

Sat Mar 13 22:38:55 EST 2021

https://github.com/python/cpython/commit/cd8dcbc851fcc312722cdb5544c2f25cf46b3f8a
commit: cd8dcbc851fcc312722cdb5544c2f25cf46b3f8a
branch: master
author: Pablo Galindo <Pablogsal at gmail.com>
committer: pablogsal <Pablogsal at gmail.com>
date: 2021-03-14T04:38:40+01:00
summary:

bpo-43410: Fix crash in the parser when producing syntax errors when reading from stdin (GH-24763)

files:
A Misc/NEWS.d/next/Core and Builtins/2021-03-05-17-23-36.bpo-43410.lCzIg0.rst
M Lib/test/test_cmd_line.py
M Parser/pegen.c
M Parser/tokenizer.c
M Parser/tokenizer.h

diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py
index f12dff3202fe3..95ab9d8c13965 100644
--- a/Lib/test/test_cmd_line.py
+++ b/Lib/test/test_cmd_line.py
@@ -816,9 +816,16 @@ def test_sys_flags_not_set(self):
             PYTHONVERBOSE="1",
         )
 
+class SyntaxErrorTests(unittest.TestCase):
+    def test_tokenizer_error_with_stdin(self):
+        proc = subprocess.run([sys.executable, "-"], input = b"(1+2+3",
+                              stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        self.assertNotEqual(proc.returncode, 0)
+        self.assertNotEqual(proc.stderr, None)
+        self.assertIn(b"\nSyntaxError", proc.stderr)
 
 def test_main():
-    support.run_unittest(CmdLineTest, IgnoreEnvironmentTest)
+    support.run_unittest(CmdLineTest, IgnoreEnvironmentTest, SyntaxErrorTests)
     support.reap_children()
 
 if __name__ == "__main__":
diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-03-05-17-23-36.bpo-43410.lCzIg0.rst b/Misc/NEWS.d/next/Core and Builtins/2021-03-05-17-23-36.bpo-43410.lCzIg0.rst
new file mode 100644
index 0000000000000..245bda5ff72dd
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2021-03-05-17-23-36.bpo-43410.lCzIg0.rst	
@@ -0,0 +1,2 @@
+Fix a bug that was causing the parser to crash when emitting syntax errors
+when reading input from stdin. Patch by Pablo Galindo.
diff --git a/Parser/pegen.c b/Parser/pegen.c
index 68f0e329f083d..301199368651d 100644
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@@ -397,7 +397,8 @@ get_error_line(Parser *p, Py_ssize_t lineno)
        are stored in p->tok->stdin_content */
     assert(p->tok->fp == NULL || p->tok->fp == stdin);
 
-    char *cur_line = p->tok->fp == NULL ? p->tok->str : p->tok->stdin_content;
+    char *cur_line = p->tok->fp_interactive ? p->tok->interactive_src_start : p->tok->str;
+
     for (int i = 0; i < lineno - 1; i++) {
         cur_line = strchr(cur_line, '\n') + 1;
     }
@@ -440,7 +441,10 @@ _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
         goto error;
     }
 
-    if (p->start_rule == Py_file_input) {
+    if (p->tok->fp_interactive) {
+        error_line = get_error_line(p, lineno);
+    }
+    else if (p->start_rule == Py_file_input) {
         error_line = PyErr_ProgramTextObject(p->tok->filename, (int) lineno);
     }
 
@@ -1232,7 +1236,7 @@ _PyPegen_run_parser(Parser *p)
         if (p->fill == 0) {
             RAISE_SYNTAX_ERROR("error at start before reading any input");
         }
-       else if (p->tok->done == E_EOF) {
+        else if (p->tok->done == E_EOF) {
             if (p->tok->level) {
                 raise_unclosed_parentheses_error(p);
             } else {
@@ -1287,6 +1291,10 @@ _PyPegen_run_parser_from_file_pointer(FILE *fp, int start_rule, PyObject *filena
         }
         return NULL;
     }
+    if (!tok->fp || ps1 != NULL || ps2 != NULL ||
+        PyUnicode_CompareWithASCIIString(filename_ob, "<stdin>") == 0) {
+        tok->fp_interactive = 1;
+    }
     // This transfers the ownership to the tokenizer
     tok->filename = filename_ob;
     Py_INCREF(filename_ob);
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index d9334aaf148ba..09d8b88cadf35 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -56,6 +56,9 @@ tok_new(void)
     if (tok == NULL)
         return NULL;
     tok->buf = tok->cur = tok->inp = NULL;
+    tok->fp_interactive = 0;
+    tok->interactive_src_start = NULL;
+    tok->interactive_src_end = NULL;
     tok->start = NULL;
     tok->end = NULL;
     tok->done = E_OK;
@@ -80,8 +83,6 @@ tok_new(void)
     tok->decoding_readline = NULL;
     tok->decoding_buffer = NULL;
     tok->type_comments = 0;
-    tok->stdin_content = NULL;
-
     tok->async_hacks = 0;
     tok->async_def = 0;
     tok->async_def_indent = 0;
@@ -323,6 +324,35 @@ check_bom(int get_char(struct tok_state *),
     return 1;
 }
 
+static int tok_concatenate_interactive_new_line(struct tok_state* tok, char* line) {
+    assert(tok->fp_interactive);
+
+    if (!line) {
+        return 0;
+    }
+
+    Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
+    Py_ssize_t line_size = strlen(line);
+    char* new_str = tok->interactive_src_start;
+
+    new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
+    if (!new_str) {
+        if (tok->interactive_src_start) {
+            PyMem_Free(tok->interactive_src_start);
+        }
+        tok->interactive_src_start = NULL;
+        tok->interactive_src_end = NULL;
+        tok->done = E_NOMEM;
+        return -1;
+    }
+    strcpy(new_str + current_size, line);
+
+    tok->interactive_src_start = new_str;
+    tok->interactive_src_end = new_str + current_size + line_size;
+    return 0;
+}
+
+
 /* Read a line of text from TOK into S, using the stream in TOK.
    Return NULL on failure, else S.
 
@@ -552,6 +582,12 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
                 badchar, tok->filename, tok->lineno + 1);
         return error_ret(tok);
     }
+
+    if (tok->fp_interactive &&
+        tok_concatenate_interactive_new_line(tok, line) == -1) {
+        return NULL;
+    }
+
     return line;
 }
 
@@ -807,17 +843,21 @@ PyTokenizer_FromFile(FILE *fp, const char* enc,
 void
 PyTokenizer_Free(struct tok_state *tok)
 {
-    if (tok->encoding != NULL)
+    if (tok->encoding != NULL) {
         PyMem_Free(tok->encoding);
+    }
     Py_XDECREF(tok->decoding_readline);
     Py_XDECREF(tok->decoding_buffer);
     Py_XDECREF(tok->filename);
-    if (tok->fp != NULL && tok->buf != NULL)
+    if (tok->fp != NULL && tok->buf != NULL) {
         PyMem_Free(tok->buf);
-    if (tok->input)
+    }
+    if (tok->input) {
         PyMem_Free(tok->input);
-    if (tok->stdin_content)
-        PyMem_Free(tok->stdin_content);
+    }
+    if (tok->interactive_src_start != NULL) {
+        PyMem_Free(tok->interactive_src_start);
+    }
     PyMem_Free(tok);
 }
 
@@ -858,24 +898,6 @@ tok_nextc(struct tok_state *tok)
                 if (translated == NULL)
                     return EOF;
                 newtok = translated;
-                if (tok->stdin_content == NULL) {
-                    tok->stdin_content = PyMem_Malloc(strlen(translated) + 1);
-                    if (tok->stdin_content == NULL) {
-                        tok->done = E_NOMEM;
-                        return EOF;
-                    }
-                    sprintf(tok->stdin_content, "%s", translated);
-                }
-                else {
-                    char *new_str = PyMem_Malloc(strlen(tok->stdin_content) + strlen(translated) + 1);
-                    if (new_str == NULL) {
-                        tok->done = E_NOMEM;
-                        return EOF;
-                    }
-                    sprintf(new_str, "%s%s", tok->stdin_content, translated);
-                    PyMem_Free(tok->stdin_content);
-                    tok->stdin_content = new_str;
-                }
             }
             if (tok->encoding && newtok && *newtok) {
                 /* Recode to UTF-8 */
@@ -898,6 +920,10 @@ tok_nextc(struct tok_state *tok)
                 strcpy(newtok, buf);
                 Py_DECREF(u);
             }
+            if (tok->fp_interactive &&
+                tok_concatenate_interactive_new_line(tok, newtok) == -1) {
+                return EOF;
+            }
             if (tok->nextprompt != NULL)
                 tok->prompt = tok->nextprompt;
             if (newtok == NULL)
@@ -958,7 +984,7 @@ tok_nextc(struct tok_state *tok)
                 }
                 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
                           tok) == NULL) {
-                    if (!tok->decoding_erred)
+                    if (!tok->decoding_erred && !(tok->done == E_NOMEM))
                         tok->done = E_EOF;
                     done = 1;
                 }
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index 56074b61ae100..111126c67f2d5 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -26,6 +26,9 @@ struct tok_state {
     char *buf;          /* Input buffer, or NULL; malloc'ed if fp != NULL */
     char *cur;          /* Next character in buffer */
     char *inp;          /* End of data in buffer */
+    int fp_interactive; /* Nonzero when input is read interactively (stdin or prompt) */
+    char *interactive_src_start; /* The start of the source parsed so far in interactive mode */
+    char *interactive_src_end; /* The end of the source parsed so far in interactive mode */
     const char *end;    /* End of input buffer if buf != NULL */
     const char *start;  /* Start of current token if not NULL */
     int done;           /* E_OK normally, E_EOF at EOF, otherwise error code */
@@ -37,7 +40,6 @@ struct tok_state {
     int atbol;          /* Nonzero if at begin of new line */
     int pendin;         /* Pending indents (if > 0) or dedents (if < 0) */
     const char *prompt, *nextprompt;          /* For interactive prompting */
-    char *stdin_content;
     int lineno;         /* Current line number */
     int first_lineno;   /* First line of a single line or multi line string
                            expression (cf. issue 16806) */