[Python-checkins] bpo-40688: Use the correct parser in the peg_generator scripts (GH-20235)

Mon May 25 15:52:05 EDT 2020

https://github.com/python/cpython/commit/9645930b5bc1833ef495891d22052d1ba65ab7ea
commit: 9645930b5bc1833ef495891d22052d1ba65ab7ea
branch: master
author: Lysandros Nikolaou <lisandrosnik at gmail.com>
committer: GitHub <noreply at github.com>
date: 2020-05-25T20:51:58+01:00
summary:

bpo-40688: Use the correct parser in the peg_generator scripts (GH-20235)

The scripts in `Tools/peg_generator/scripts` mostly assume that
`ast.parse` and `compile` use the old parser, since this was the
state of things, while we were developing them. They need to be
updated to always use the correct parser. `_peg_parser` is being
extended to support both parsing and compiling with both parsers.

files:
M Modules/_peg_parser.c
M Tools/peg_generator/Makefile
M Tools/peg_generator/scripts/benchmark.py
M Tools/peg_generator/scripts/show_parse.py
M Tools/peg_generator/scripts/test_parse_directory.py
M Tools/peg_generator/scripts/test_pypi_packages.py

diff --git a/Modules/_peg_parser.c b/Modules/_peg_parser.c
index 3b27b2c9cbaa2..b66d5a83a84f6 100644
--- a/Modules/_peg_parser.c
+++ b/Modules/_peg_parser.c
@@ -1,104 +1,133 @@
 #include <Python.h>
 #include "pegen_interface.h"
 
-PyObject *
-_Py_parse_file(PyObject *self, PyObject *args, PyObject *kwds)
+static int
+_mode_str_to_int(char *mode_str)
 {
-    static char *keywords[] = {"file", "mode", NULL};
-    char *filename;
-    char *mode_str = "exec";
-
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|s", keywords, &filename, &mode_str)) {
-        return NULL;
-    }
-
     int mode;
     if (strcmp(mode_str, "exec") == 0) {
         mode = Py_file_input;
     }
+    else if (strcmp(mode_str, "eval") == 0) {
+        mode = Py_eval_input;
+    }
     else if (strcmp(mode_str, "single") == 0) {
         mode = Py_single_input;
     }
     else {
-        return PyErr_Format(PyExc_ValueError, "mode must be either 'exec' or 'single'");
+        mode = -1;
     }
+    return mode;
+}
 
-    PyArena *arena = PyArena_New();
-    if (arena == NULL) {
+static mod_ty
+_run_parser(char *str, char *filename, int mode, PyCompilerFlags *flags, PyArena *arena, int oldparser)
+{
+    mod_ty mod;
+    if (!oldparser) {
+        mod = PyPegen_ASTFromString(str, filename, mode, flags, arena);
+    }
+    else {
+        mod = PyParser_ASTFromString(str, filename, mode, flags, arena);
+    }
+    return mod;
+}
+
+PyObject *
+_Py_compile_string(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    static char *keywords[] = {"string", "filename", "mode", "oldparser", NULL};
+    char *the_string;
+    char *filename = "<string>";
+    char *mode_str = "exec";
+    int oldparser = 0;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|ssp", keywords,
+            &the_string, &filename, &mode_str, &oldparser)) {
         return NULL;
     }
 
+    int mode = _mode_str_to_int(mode_str);
+    if (mode == -1) {
+        return PyErr_Format(PyExc_ValueError, "mode must be either 'exec' or 'eval' or 'single'");
+    }
+
     PyCompilerFlags flags = _PyCompilerFlags_INIT;
-    PyObject *result = NULL;
+    flags.cf_flags = PyCF_IGNORE_COOKIE;
 
-    mod_ty res = PyPegen_ASTFromFilename(filename, mode, &flags, arena);
-    if (res == NULL) {
-        goto error;
+    PyArena *arena = PyArena_New();
+    if (arena == NULL) {
+        return NULL;
+    }
+
+    mod_ty mod = _run_parser(the_string, filename, mode, &flags, arena, oldparser);
+    if (mod == NULL) {
+        PyArena_Free(arena);
+        return NULL;
     }
-    result = PyAST_mod2obj(res);
 
-error:
+    PyObject *filename_ob = PyUnicode_DecodeFSDefault(filename);
+    if (filename_ob == NULL) {
+        PyArena_Free(arena);
+        return NULL;
+    }
+    PyCodeObject *result = PyAST_CompileObject(mod, filename_ob, &flags, -1, arena);
+    Py_XDECREF(filename_ob);
     PyArena_Free(arena);
-    return result;
+    return (PyObject *)result;
 }
 
 PyObject *
 _Py_parse_string(PyObject *self, PyObject *args, PyObject *kwds)
 {
-    static char *keywords[] = {"string", "mode", "oldparser", NULL};
+    static char *keywords[] = {"string", "filename", "mode", "oldparser", NULL};
     char *the_string;
+    char *filename = "<string>";
     char *mode_str = "exec";
     int oldparser = 0;
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|sp", keywords,
-            &the_string, &mode_str, &oldparser)) {
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|ssp", keywords,
+            &the_string, &filename, &mode_str, &oldparser)) {
         return NULL;
     }
 
-    int mode;
-    if (strcmp(mode_str, "exec") == 0) {
-        mode = Py_file_input;
-    }
-    else if (strcmp(mode_str, "eval") == 0) {
-        mode = Py_eval_input;
-    }
-    else if (strcmp(mode_str, "single") == 0) {
-        mode = Py_single_input;
-    }
-    else {
+    int mode = _mode_str_to_int(mode_str);
+    if (mode == -1) {
         return PyErr_Format(PyExc_ValueError, "mode must be either 'exec' or 'eval' or 'single'");
     }
 
+    PyCompilerFlags flags = _PyCompilerFlags_INIT;
+    flags.cf_flags = PyCF_IGNORE_COOKIE;
+
     PyArena *arena = PyArena_New();
     if (arena == NULL) {
         return NULL;
     }
 
-    PyObject *result = NULL;
-
-    PyCompilerFlags flags = _PyCompilerFlags_INIT;
-    flags.cf_flags = PyCF_IGNORE_COOKIE;
-
-    mod_ty res;
-    if (oldparser) {
-        res = PyParser_ASTFromString(the_string, "<string>", mode, &flags, arena);
-    }
-    else {
-        res = PyPegen_ASTFromString(the_string, "<string>", mode, &flags, arena);
-    }
-    if (res == NULL) {
-        goto error;
+    mod_ty mod = _run_parser(the_string, filename, mode, &flags, arena, oldparser);
+    if (mod == NULL) {
+        PyArena_Free(arena);
+        return NULL;
     }
-    result = PyAST_mod2obj(res);
 
-error:
+    PyObject *result = PyAST_mod2obj(mod);
     PyArena_Free(arena);
     return result;
 }
 
 static PyMethodDef ParseMethods[] = {
-    {"parse_file", (PyCFunction)(void (*)(void))_Py_parse_file, METH_VARARGS|METH_KEYWORDS, "Parse a file."},
-    {"parse_string", (PyCFunction)(void (*)(void))_Py_parse_string, METH_VARARGS|METH_KEYWORDS,"Parse a string."},
+    {
+        "parse_string",
+        (PyCFunction)(void (*)(void))_Py_parse_string,
+        METH_VARARGS|METH_KEYWORDS,
+        "Parse a string, return an AST."
+    },
+    {
+        "compile_string",
+        (PyCFunction)(void (*)(void))_Py_compile_string,
+        METH_VARARGS|METH_KEYWORDS,
+        "Compile a string, return a code object."
+    },
     {NULL, NULL, 0, NULL} /* Sentinel */
 };
 
diff --git a/Tools/peg_generator/Makefile b/Tools/peg_generator/Makefile
index 34763b543c23b..e7a190c1bcd13 100644
--- a/Tools/peg_generator/Makefile
+++ b/Tools/peg_generator/Makefile
@@ -69,25 +69,22 @@ stats: peg_extension/parse.c data/xxl.py
 
 time: time_compile
 
-time_compile: venv peg_extension/parse.c data/xxl.py
+time_compile: venv data/xxl.py
 	$(VENVPYTHON) scripts/benchmark.py --parser=pegen --target=xxl compile
 
-time_parse: venv peg_extension/parse.c data/xxl.py
+time_parse: venv data/xxl.py
 	$(VENVPYTHON) scripts/benchmark.py --parser=pegen --target=xxl parse
 
-time_check: venv peg_extension/parse.c data/xxl.py
-	$(VENVPYTHON) scripts/benchmark.py --parser=pegen --target=xxl check
+time_old: time_old_compile
 
-time_stdlib: time_stdlib_compile
-
-time_stdlib_compile: venv peg_extension/parse.c data/xxl.py
+time_old_compile: venv data/xxl.py
 	$(VENVPYTHON) scripts/benchmark.py --parser=cpython --target=xxl compile
 
-time_stdlib_parse: venv peg_extension/parse.c data/xxl.py
+time_old_parse: venv data/xxl.py
 	$(VENVPYTHON) scripts/benchmark.py --parser=cpython --target=xxl parse
 
-test_local:
-	$(PYTHON) scripts/test_parse_directory.py \
+time_peg_dir: venv
+	$(VENVPYTHON) scripts/test_parse_directory.py \
 		--grammar-file $(GRAMMAR) \
 		--tokens-file $(TOKENS) \
 		-d $(TESTDIR) \
@@ -96,8 +93,8 @@ test_local:
 		--exclude "*/failset/**" \
 		--exclude "*/failset/**/*"
 
-test_global: $(CPYTHON)
-	$(PYTHON) scripts/test_parse_directory.py \
+time_stdlib: $(CPYTHON) venv
+	$(VENVPYTHON) scripts/test_parse_directory.py \
 		--grammar-file $(GRAMMAR) \
 		--tokens-file $(TOKENS) \
 		-d $(CPYTHON) \
@@ -113,9 +110,6 @@ mypy: regen-metaparser
 format-python:
 	black pegen scripts
 
-bench: venv
-	$(VENVPYTHON) scripts/benchmark.py --parser=pegen --target=stdlib check
-
 format: format-python
 
 find_max_nesting:
diff --git a/Tools/peg_generator/scripts/benchmark.py b/Tools/peg_generator/scripts/benchmark.py
index 4942b99b6619f..71512c22a355b 100644
--- a/Tools/peg_generator/scripts/benchmark.py
+++ b/Tools/peg_generator/scripts/benchmark.py
@@ -6,6 +6,8 @@
 import os
 from time import time
 
+import _peg_parser
+
 try:
     import memory_profiler
 except ModuleNotFoundError:
@@ -14,8 +16,6 @@
     sys.exit(1)
 
 sys.path.insert(0, os.getcwd())
-from peg_extension import parse
-from pegen.build import build_c_parser_and_generator
 from scripts.test_parse_directory import parse_directory
 
 argparser = argparse.ArgumentParser(
@@ -41,9 +41,6 @@
     "compile", help="Benchmark parsing and compiling to bytecode"
 )
 command_parse = subcommands.add_parser("parse", help="Benchmark parsing and generating an ast.AST")
-command_check = subcommands.add_parser(
-    "check", help="Benchmark parsing and throwing the tree away"
-)
 
 
 def benchmark(func):
@@ -66,22 +63,20 @@ def wrapper(*args):
 @benchmark
 def time_compile(source, parser):
     if parser == "cpython":
-        return compile(source, os.path.join("data", "xxl.py"), "exec")
+        return _peg_parser.compile_string(
+            source,
+            oldparser=True,
+        )
     else:
-        return parse.parse_string(source, mode=2)
+        return _peg_parser.compile_string(source)
 
 
 @benchmark
 def time_parse(source, parser):
     if parser == "cpython":
-        return ast.parse(source, os.path.join("data", "xxl.py"), "exec")
+        return _peg_parser.parse_string(source, oldparser=True)
     else:
-        return parse.parse_string(source, mode=1)
-
-
- at benchmark
-def time_check(source):
-    return parse.parse_string(source, mode=0)
+        return _peg_parser.parse_string(source)
 
 
 def run_benchmark_xxl(subcommand, parser, source):
@@ -89,32 +84,20 @@ def run_benchmark_xxl(subcommand, parser, source):
         time_compile(source, parser)
     elif subcommand == "parse":
         time_parse(source, parser)
-    elif subcommand == "check":
-        time_check(source)
 
 
 def run_benchmark_stdlib(subcommand, parser):
-    modes = {"compile": 2, "parse": 1, "check": 0}
-    extension = None
-    if parser == "pegen":
-        extension = build_c_parser_and_generator(
-            "../../Grammar/python.gram",
-            "../../Grammar/Tokens",
-            "peg_extension/parse.c",
-            compile_extension=True,
-            skip_actions=False,
-        )
     for _ in range(3):
         parse_directory(
             "../../Lib",
             "../../Grammar/python.gram",
+            "../../Grammar/Tokens",
             verbose=False,
             excluded_files=["*/bad*", "*/lib2to3/tests/data/*",],
             skip_actions=False,
             tree_arg=0,
             short=True,
-            extension=extension,
-            mode=modes[subcommand],
+            mode=2 if subcommand == "compile" else 1,
             parser=parser,
         )
 
@@ -127,8 +110,6 @@ def main():
 
     if subcommand is None:
         argparser.error("A benchmark to run is required")
-    if subcommand == "check" and parser == "cpython":
-        argparser.error("Cannot use check target with the CPython parser")
 
     if target == "xxl":
         with open(os.path.join("data", "xxl.py"), "r") as f:
diff --git a/Tools/peg_generator/scripts/show_parse.py b/Tools/peg_generator/scripts/show_parse.py
index 1a0410e1bac8f..1c1996f40f74e 100755
--- a/Tools/peg_generator/scripts/show_parse.py
+++ b/Tools/peg_generator/scripts/show_parse.py
@@ -30,6 +30,8 @@
 import sys
 import tempfile
 
+import _peg_parser
+
 from typing import List
 
 sys.path.insert(0, os.getcwd())
@@ -72,7 +74,7 @@ def diff_trees(a: ast.AST, b: ast.AST, verbose: bool = False) -> List[str]:
 
 
 def show_parse(source: str, verbose: bool = False) -> str:
-    tree = ast.parse(source)
+    tree = _peg_parser.parse_string(source, oldparser=True)
     return format_tree(tree, verbose).rstrip("\n")
 
 
@@ -90,17 +92,11 @@ def main() -> None:
         sep = " "
     program = sep.join(args.program)
     if args.grammar_file:
-        sys.path.insert(0, os.curdir)
-        from pegen.build import build_parser_and_generator
-
-        build_parser_and_generator(args.grammar_file, "peg_parser/parse.c", compile_extension=True)
-        from pegen.parse import parse_string  # type: ignore[import]
-
-        tree = parse_string(program, mode=1)
+        tree = _peg_parser.parse_string(program)
 
         if args.diff:
             a = tree
-            b = ast.parse(program)
+            b = _peg_parser.parse_string(program, oldparser=True)
             diff = diff_trees(a, b, args.verbose)
             if diff:
                 for line in diff:
@@ -111,8 +107,8 @@ def main() -> None:
             print(f"# Parsed using {args.grammar_file}")
             print(format_tree(tree, args.verbose))
     else:
-        tree = ast.parse(program)
-        print("# Parse using ast.parse()")
+        tree = _peg_parser.parse_string(program, oldparser=True)
+        print("# Parse using the old parser")
         print(format_tree(tree, args.verbose))
 
 
diff --git a/Tools/peg_generator/scripts/test_parse_directory.py b/Tools/peg_generator/scripts/test_parse_directory.py
index aef9c74b52881..e88afe1539ce1 100755
--- a/Tools/peg_generator/scripts/test_parse_directory.py
+++ b/Tools/peg_generator/scripts/test_parse_directory.py
@@ -6,13 +6,14 @@
 import sys
 import time
 import traceback
+import tokenize
+import _peg_parser
 from glob import glob
 from pathlib import PurePath
 
 from typing import List, Optional, Any
 
 sys.path.insert(0, os.getcwd())
-from pegen.build import build_c_parser_and_generator
 from pegen.ast_dump import ast_dump
 from pegen.testutil import print_memstats
 from scripts import show_parse
@@ -83,7 +84,7 @@ def compare_trees(
     actual_tree: ast.AST, file: str, verbose: bool, include_attributes: bool = False,
 ) -> int:
     with open(file) as f:
-        expected_tree = ast.parse(f.read())
+        expected_tree = _peg_parser.parse_string(f.read(), oldparser=True)
 
     expected_text = ast_dump(expected_tree, include_attributes=include_attributes)
     actual_text = ast_dump(actual_tree, include_attributes=include_attributes)
@@ -121,7 +122,6 @@ def parse_directory(
     skip_actions: bool,
     tree_arg: int,
     short: bool,
-    extension: Any,
     mode: int,
     parser: str,
 ) -> int:
@@ -137,47 +137,21 @@ def parse_directory(
         if not os.path.exists(grammar_file):
             print(f"The specified grammar file, {grammar_file}, does not exist.", file=sys.stderr)
             return 1
-
-        try:
-            if not extension and parser == "pegen":
-                build_c_parser_and_generator(
-                    grammar_file,
-                    tokens_file,
-                    "peg_extension/parse.c",
-                    compile_extension=True,
-                    skip_actions=skip_actions,
-                )
-        except Exception as err:
-            print(
-                f"{FAIL}The following error occurred when generating the parser. Please check your grammar file.\n{ENDC}",
-                file=sys.stderr,
-            )
-            traceback.print_exception(err.__class__, err, None)
-
-            return 1
-
     else:
         print(
             "A grammar file or a tokens file was not provided - attempting to use existing parser from stdlib...\n"
         )
 
-    if parser == "pegen":
-        try:
-            from peg_extension import parse  # type: ignore
-        except Exception as e:
-            print(
-                "An existing parser was not found. Please run `make` or specify a grammar file with the `-g` flag.",
-                file=sys.stderr,
-            )
-            return 1
+    if tree_arg:
+        assert mode == 1, "Mode should be 1 (parse), when comparing the generated trees"
 
     # For a given directory, traverse files and attempt to parse each one
     # - Output success/failure for each file
     errors = 0
     files = []
     trees = {}  # Trees to compare (after everything else is done)
+    total_seconds = 0
 
-    t0 = time.time()
     for file in sorted(glob(f"{directory}/**/*.py", recursive=True)):
         # Only attempt to parse Python files and files that are not excluded
         should_exclude_file = False
@@ -187,25 +161,31 @@ def parse_directory(
                 break
 
         if not should_exclude_file:
+            with tokenize.open(file) as f:
+                source = f.read()
             try:
-                if tree_arg:
-                    mode = 1
-                if parser == "cpython":
-                    with open(file, "r") as f:
-                        source = f.read()
-                        if mode == 2:
-                            compile(source, file, "exec")
-                        elif mode == 1:
-                            ast.parse(source, file, "exec")
+                t0 = time.time()
+                if mode == 2:
+                    result = _peg_parser.compile_string(
+                        source,
+                        filename=file,
+                        oldparser=parser == "cpython",
+                    )
                 else:
-                    tree = parse.parse_file(file, mode=mode)
+                    result = _peg_parser.parse_string(
+                        source,
+                        filename=file,
+                        oldparser=parser == "cpython"
+                    )
+                t1 = time.time()
+                total_seconds += (t1 - t0)
                 if tree_arg:
-                    trees[file] = tree
+                    trees[file] = result
                 if not short:
                     report_status(succeeded=True, file=file, verbose=verbose)
             except Exception as error:
                 try:
-                    ast.parse(file)
+                    _peg_parser.parse_string(source, mode="exec", oldparser=True)
                 except Exception:
                     if not short:
                         print(f"File {file} cannot be parsed by either pegen or the ast module.")
@@ -217,7 +197,6 @@ def parse_directory(
             files.append(file)
     t1 = time.time()
 
-    total_seconds = t1 - t0
     total_files = len(files)
 
     total_bytes = 0
@@ -238,13 +217,6 @@ def parse_directory(
             f"or {total_bytes / total_seconds :,.0f} bytes/sec.",
         )
 
-    if parser == "pegen":
-        # Dump memo stats to @data.
-        with open("@data", "w") as datafile:
-            for i, count in enumerate(parse.get_memo_stats()):
-                if count:
-                    datafile.write(f"{i:4d} {count:9d}\n")
-
     if short:
         print_memstats()
 
@@ -275,6 +247,7 @@ def main() -> None:
     skip_actions = args.skip_actions
     tree = args.tree
     short = args.short
+    mode = 1 if args.tree else 2
     sys.exit(
         parse_directory(
             directory,
@@ -285,8 +258,7 @@ def main() -> None:
             skip_actions,
             tree,
             short,
-            None,
-            0,
+            mode,
             "pegen",
         )
     )
diff --git a/Tools/peg_generator/scripts/test_pypi_packages.py b/Tools/peg_generator/scripts/test_pypi_packages.py
index 7586b1a21fa6d..98f77785cdd1c 100755
--- a/Tools/peg_generator/scripts/test_pypi_packages.py
+++ b/Tools/peg_generator/scripts/test_pypi_packages.py
@@ -54,7 +54,7 @@ def find_dirname(package_name: str) -> str:
     assert False  # This is to fix mypy, should never be reached
 
 
-def run_tests(dirname: str, tree: int, extension: Any) -> int:
+def run_tests(dirname: str, tree: int) -> int:
     return test_parse_directory.parse_directory(
         dirname,
         HERE / ".." / ".." / ".." / "Grammar" / "python.gram",
@@ -72,7 +72,6 @@ def run_tests(dirname: str, tree: int, extension: Any) -> int:
         skip_actions=False,
         tree_arg=tree,
         short=True,
-        extension=extension,
         mode=1,
         parser="pegen",
     )
@@ -82,13 +81,6 @@ def main() -> None:
     args = argparser.parse_args()
     tree = args.tree
 
-    extension = build.build_c_parser_and_generator(
-        HERE / ".." / ".." / ".." / "Grammar" / "python.gram",
-        HERE / ".." / ".." / ".." / "Grammar" / "Tokens",
-        "peg_extension/parse.c",
-        compile_extension=True,
-    )
-
     for package in get_packages():
         print(f"Extracting files from {package}... ", end="")
         try:
@@ -100,7 +92,7 @@ def main() -> None:
 
         print(f"Trying to parse all python files ... ")
         dirname = find_dirname(package)
-        status = run_tests(dirname, tree, extension)
+        status = run_tests(dirname, tree)
         if status == 0:
             shutil.rmtree(dirname)
         else: