[pypy-svn] r35057 - pypy/dist/pypy/rlib/parsing/test

cfbolz at codespeak.net
Mon Nov 27 21:20:36 CET 2006


Author: cfbolz
Date: Mon Nov 27 21:20:34 2006
New Revision: 35057

Added:
   pypy/dist/pypy/rlib/parsing/test/pygrammar.txt
   pypy/dist/pypy/rlib/parsing/test/test_pythonparse.py
Log:
In an attempt to find some more bugs by trying a more complex grammar:
convert the Python 2.4 grammar to my EBNF format. It requires a few changes,
but it parses Python code :-)
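
For context (not part of the commit): the new grammar is driven the same way
as in test_pythonparse.py below -- parse_ebnf() turns the EBNF text into
rules, a PackratParser is built from them, and the token stream comes from a
small wrapper around CPython's tokenize module (see TestParser.tokenize).
A rough sketch:

    import py
    from pypy.rlib.parsing.ebnfparse import parse_ebnf
    from pypy.rlib.parsing.parsing import PackratParser

    # read the grammar file added by this commit (path relative to the test dir)
    grammar = py.magic.autopath().dirpath().join("pygrammar.txt").read()
    regexs, rules, ToAST = parse_ebnf(grammar)
    # the first rule, file_input, serves as the start symbol
    parser = PackratParser(rules, rules[0].nonterminal)
    # tokens are produced by TestParser.tokenize() below and fed to parser.parse()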


Added: pypy/dist/pypy/rlib/parsing/test/pygrammar.txt
==============================================================================
--- (empty file)
+++ pypy/dist/pypy/rlib/parsing/test/pygrammar.txt	Mon Nov 27 21:20:34 2006
@@ -0,0 +1,136 @@
+# tokens
+
+#IGNORE: "[ \f\t]*|#[^\n]*";
+#NAME: "[a-zA-Z_][a-zA-Z0-9_]*";
+#NUMBER: "(0[xX][0-9a-fA-F]*[lL]?)|(0[0-7]*[lL]?)|([1-9][0-9]*[lL]?)"
+#STRING: "\"[^\\\"\n]*\"";
+#NEWLINE: "\$\$NEWLINE"
+#INDENT: "\$\$INDENT"
+#DEDENT: "\$\$DEDENT"
+
+# Start symbols for the grammar:
+#	single_input is a single interactive statement;
+#	file_input is a module or sequence of commands read from an input file;
+#	eval_input is the input for the eval() and input() functions.
+# NB: compound_stmt in single_input is followed by extra NEWLINE!
+file_input: newline_or_stmt* [EOF];
+single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE;
+newline_or_stmt: NEWLINE | <stmt>;
+eval_input: testlist NEWLINE* EOF;
+
+decorator: "@" dotted_name ( "(" [arglist] ")" )? NEWLINE;
+decorators: decorator+;
+funcdef: decorators? ["def"] NAME parameters [":"] suite;
+parameters: ["("] >varargslist< [")"] | ["("] [")"];
+varargslist: (parameter ",")*  >star_or_starstarargs< |
+             parameter ("," parameter)* ","?;
+parameter: fpdef ("=" test)?;
+star_or_starstarargs:  starargs [","] starstarargs | starargs | starstarargs;
+starargs: ["*"] NAME;
+starstarargs: ["**"] NAME;
+fpdef: <NAME> | "(" fplist ")";
+fplist: fpdef ("," fpdef)* ","?;
+
+stmt: <simple_stmt> | <compound_stmt>;
+simple_stmt: small_stmt ([";"] small_stmt)* [";"]? [NEWLINE];
+small_stmt: <expr_stmt> | <print_stmt>  | <del_stmt> | <pass_stmt> |
+            <flow_stmt> | <import_stmt> | <global_stmt> | <exec_stmt> |
+            <assert_stmt>;
+expr_stmt: testlist augassign testlist | testlist ("=" testlist)*;
+augassign: "+=" | "-=" | "*=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" |
+           ">>=" | "**=" | "//=";
+# For normal assignments, additional restrictions enforced by the interpreter
+
+print_stmt: "print" ">>" test (("," test)+ ","?)? |
+            "print" (test ("," test)* ","?)?;
+del_stmt: "del" exprlist;
+pass_stmt: "pass";
+flow_stmt: <break_stmt> | <continue_stmt> | <return_stmt> | <raise_stmt> |
+           <yield_stmt>;
+break_stmt: "break";
+continue_stmt: "continue";
+return_stmt: ["return"] testlist?;
+yield_stmt: "yield" testlist;
+raise_stmt: "raise" (test ("," test ("," test)?)?)?;
+import_stmt: import_name | import_from;
+import_name: "import" dotted_as_names;
+import_from: "from" dotted_name "import" import_what;
+import_what: "*" | "(" import_as_names ","? ")" | import_as_names;
+
+# the second NAME is the "as" (which is not a keyword in Python)
+import_as_name: NAME (NAME NAME)?;
+dotted_as_name: dotted_name (NAME NAME)?;
+import_as_names: import_as_name ("," import_as_name)*;
+dotted_as_names: dotted_as_name ("," dotted_as_name)*;
+dotted_name: NAME ("." NAME)*;
+global_stmt: "global" NAME ("," NAME)*;
+exec_stmt: "exec" expr ("in" test ("," test)?)?;
+assert_stmt: "assert" test ("," test)?;
+
+compound_stmt: <if_stmt> | <while_stmt> | <for_stmt> | <try_stmt> |
+               <funcdef> | <classdef>;
+if_stmt: "if" test ":" suite ("elif" test ":" suite)* ("else" ":" suite)?;
+while_stmt: ["while"] test [":"] suite (["else" ":"] suite)?;
+for_stmt: "for" exprlist "in" testlist ":" suite ("else" ":" suite)?;
+try_stmt: "try" ":" suite (except_clause ":" suite)+
+          ("else" ":" suite)? | "try" ":" suite "finally" ":" suite;
+
+except_clause: "except" (test ("," test)?)?;
+suite: simple_stmt | [NEWLINE] [INDENT] stmt+ [DEDENT];
+
+test: and_test "or" test | <and_test> | <lambdef>;
+and_test: not_test "and" and_test | <not_test>;
+not_test: "not" not_test | <comparison>;
+comparison: expr >comp_op< comparison | <expr>;
+comp_op: "<" | ">" | "==" | ">=" | "<=" | "<>" | "!=" | "in" | "not" "in" |
+         "is" "not" | "is";
+expr: xor_expr "|" expr | <xor_expr>;
+xor_expr: and_expr "^" xor_expr | <and_expr>;
+and_expr: shift_expr "&" and_expr | <shift_expr>;
+shift_expr: arith_expr "<<" shift_expr |  # was (("<<"|">>") arith_expr)*
+            arith_expr ">>" shift_expr |
+            <arith_expr>;
+arith_expr: term "+" arith_expr |  # was (("+"|"-") term)*
+            term "-" arith_expr |
+            <term>;
+term: factor "*" term |  # was (("*"|"/"|"%"|"//") factor)*
+      factor "/" term |
+      factor "%" term |
+      factor "//" term |
+      <factor>;
+factor: "+" factor | "-" factor | "~" factor | <power>;
+power: atom trailer+ ("**" factor)? | atom "**" factor | <atom>;
+atom: "(" testlist_gexp? ")" | "[" listmaker? "]" |
+      "{" dictmaker? "}" | "`" testlist1 "`" | <NAME> | <NUMBER> | STRING+;
+listmaker: test list_for |
+           test ("," test)* ","?;
+testlist_gexp: test gen_for |
+               test ("," test)* ","?;
+lambdef: "lambda" varargslist? ":" test;
+trailer: "(" ")" | "(" arglist ")" | "[" subscriptlist "]" | "." NAME;
+subscriptlist: subscript ("," subscript)* ","?;
+subscript: "." "." "." | test? ":" test? sliceop? | test;
+sliceop: ":" test?;
+exprlist: expr ("," expr)* ","?;
+testlist: test ("," test)* ","?;
+testlist_safe: test (("," test)+ ","?)?;
+dictmaker: test ":" test ("," test ":" test)* ","?;
+
+classdef: ["class"] NAME (["("] testlist [")"])? [":"] suite;
+
+arglist: (argument [","])* arglist_rest;
+arglist_rest: "*" test ("," "**" test)? | "**" test | argument ","?;
+argument: (test "=")? test gen_for?;
+
+list_iter: list_for | list_if;
+list_for: "for" exprlist "in" testlist_safe list_iter?;
+list_if: "if" test list_iter?;
+
+gen_iter: gen_for | gen_if;
+gen_for: "for" exprlist "in" test gen_iter?;
+gen_if: "if" test gen_iter?;
+
+testlist1: test ("," test)*;
+
+# not used in grammar, but may appear in "node" passed from Parser to Compiler
+encoding_decl: NAME;
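
Not part of the commit, but as a reading aid for the grammar above: the
tree-shaping markers control what ToAST keeps.  My reading (an assumption,
inferred from how they are used here and in the tests below) is that ["x"]
drops the token from the tree, >rule< splices that rule's children into the
parent, and <rule> replaces the node by that single child.  A tiny grammar
in the same style, cut down from test_parse_python_args:

    from pypy.rlib.parsing.ebnfparse import parse_ebnf, make_parse_function

    regexs, rules, ToAST = parse_ebnf("""
    IGNORE: " ";
    NAME: "[a-zA-Z_]*";
    parameters: ["("] >varargslist< [")"] | ["("] [")"];
    varargslist: fpdef ("," fpdef)* ","?;
    fpdef: <NAME> | "(" fplist ")";
    fplist: fpdef ("," fpdef)* ","?;
    """)
    parse = make_parse_function(regexs, rules)
    # (assumption) the parentheses are dropped, varargslist's children are
    # inlined under the parameters node, and an fpdef that is just a NAME
    # collapses to the NAME symbol
    t = parse("(a, (b, c))").visit(ToAST())[0]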

Added: pypy/dist/pypy/rlib/parsing/test/test_pythonparse.py
==============================================================================
--- (empty file)
+++ pypy/dist/pypy/rlib/parsing/test/test_pythonparse.py	Mon Nov 27 21:20:34 2006
@@ -0,0 +1,170 @@
+""" test file to experiment with a an adapted CPython grammar """
+
+import py
+from pypy.rlib.parsing.lexer import Lexer
+from pypy.rlib.parsing.deterministic import LexerError
+from pypy.rlib.parsing.tree import Nonterminal, Symbol, RPythonVisitor
+from pypy.rlib.parsing.parsing import PackratParser, Symbol, ParseError, Rule
+from pypy.rlib.parsing.ebnfparse import parse_ebnf, make_parse_function
+
+grammar = py.magic.autopath().dirpath().join("pygrammar.txt").read()
+
+
+def test_parse_grammar():
+    _, rules, ToAST = parse_ebnf(grammar)
+
+def test_parse_python_args():
+    regexs, rules, ToAST = parse_ebnf("""
+IGNORE: " ";
+NAME: "[a-zA-Z_]*";
+NUMBER: "0|[1-9][0-9]*";
+parameters: ["("] >varargslist< [")"] | ["("] [")"];
+varargslist: (fpdef ("=" test)? ",")* star_or_starstarargs |
+             fpdef ("=" test)? ("," fpdef ("=" test)?)* ","?;
+star_or_starstarargs:  "*" NAME "," "**" NAME | "*" NAME | "**" NAME;
+fpdef: NAME | "(" fplist ")";
+fplist: fpdef ("," fpdef)* ","?;
+test: NUMBER;
+    """)
+    parse = make_parse_function(regexs, rules)
+    t = parse("(a)").visit(ToAST())[0]
+    t = parse("(a,)").visit(ToAST())[0]
+    t = parse("(a,b,c,d)").visit(ToAST())[0]
+    t = parse("(a,b,c,d,)").visit(ToAST())[0]
+    t = parse("((a, b, c),b,c,d,)").visit(ToAST())[0]
+    t = parse("((a, b, c),b,c,d,*args)").visit(ToAST())[0]
+    t = parse("((a, b, c),b,c,d,**kwargs)").visit(ToAST())[0]
+    t = parse("((a, b, c),b,c,d,*args, **args)").visit(ToAST())[0]
+    t = parse("()").visit(ToAST())[0]
+    t = parse("(*args, **args)").visit(ToAST())[0]
+    t = parse("(a=1)").visit(ToAST())[0]
+    t = parse("(a=2,)").visit(ToAST())[0]
+    t = parse("(a,b,c,d=3)").visit(ToAST())[0]
+    t = parse("(a,b,c,d=4,)").visit(ToAST())[0]
+    t = parse("((a, b, c),b,c,(c, d)=1,)").visit(ToAST())[0]
+    t = parse("((a, b, c),b,c,d=1,*args)").visit(ToAST())[0]
+    t = parse("((a, b, c),b,c,d=2,**kwargs)").visit(ToAST())[0]
+    t = parse("((a, b, c),b,c,(c, d)=4,*args, **args)").visit(ToAST())[0]
+    t = parse("(self, a, b, args)").visit(ToAST())[0]
+    
+def test_parse_funcdef():
+    regexs, rules, ToAST = parse_ebnf("""
+IGNORE: " ";
+NAME: "[a-zA-Z_]*";
+NUMBER: "0|[1-9][0-9]*";
+funcdef: "def" NAME parameters ":" suite;
+parameters: ["("] >varargslist< [")"] | ["("] [")"];
+varargslist: (fpdef ("=" test)? ",")* star_or_starstarargs |
+             fpdef ("=" test)? ("," fpdef ("=" test)?)* ","?;
+star_or_starstarargs:  "*" NAME "," "**" NAME | "*" NAME | "**" NAME;
+fpdef: NAME | "(" fplist ")";
+fplist: fpdef ("," fpdef)* ","?;
+test: NUMBER;
+suite: simple_stmt | ["NEWLINE"] ["INDENT"] stmt+ ["DEDENT"];
+simple_stmt: stmt;
+stmt: "pass";
+    """)
+    parse = make_parse_function(regexs, rules)
+    t = parse("def f(a): NEWLINE INDENT pass DEDENT").visit(ToAST())[0]
+
+
+class TestParser(object):
+    def setup_class(cls):
+        from pypy.rlib.parsing.parsing import PackratParser
+        regexs, rules, ToAST = parse_ebnf(grammar)
+        cls.ToAST = ToAST()
+        cls.parser = PackratParser(rules, rules[0].nonterminal)
+        cls.regexs = regexs
+        names, regexs = zip(*regexs)
+        cls.lexer = Lexer(list(regexs), list(names))
+
+    def parse(self, source):
+        tokens = list(self.tokenize(source))
+        s = self.parser.parse(tokens)
+        return s
+
+    def tokenize(self, source):
+        # use tokenize module but rewrite tokens slightly
+        import tokenize, cStringIO
+        pos = 0
+        readline = cStringIO.StringIO(source).readline
+        for token in tokenize.generate_tokens(readline):
+            typ, s, (row, col), _, line = token
+            pos += len(s)
+            typ = tokenize.tok_name[typ]
+            if typ == "ENDMARKER":
+                typ = s = "EOF"
+            elif typ == "NL":
+                continue
+            elif typ == "COMMENT":
+                continue
+            try:
+                tokens = self.lexer.tokenize(s, eof=False)
+                if len(tokens) == 1:
+                    typ, s, _, _, _ = tokens[0]
+                    yield typ, s, pos, row, col
+                    continue
+            except LexerError:
+                pass
+            yield (typ, s, pos, row, col)
+
+
+    def test_simple(self):
+        t = self.parse("""
+def f(x, null=0):
+    if x >= null:
+        return null + x
+    else:
+        pass
+        return null - x
+        """)
+        t = t.visit(self.ToAST)
+        assert len(t) == 1
+        t = t[0]
+
+    def test_class(self):
+        t = self.parse("""
+class A(object):
+    def __init__(self, a, b, *args):
+        self.a = a
+        self.b = b
+        if args:
+            self.len = len(args)
+            self.args = [a, b] + list(args)
+
+    def diagonal(self):
+        return (self.a ** 2 + self.b ** 2) ** 0.5
+        """)
+        t = t.visit(self.ToAST)[0]
+
+    def test_while(self):
+        t = self.parse("""
+def f(x, null=0):
+    i = null
+    result = 0
+    while i < x:
+        result += i
+        i += 1
+        if result % 625 == 13:
+            break
+    else:
+        return result - 15
+    return result
+        """)
+        t = t.visit(self.ToAST)
+        assert len(t) == 1
+        t = t[0]
+
+    def test_comment(self):
+        t = self.parse("""
+def f(x):
+    # this does some fancy stuff
+    return x
+""")
+        t = self.ToAST.transform(t)
+
+    def test_parse_this(self):
+        s = py.magic.autopath().read()
+        s = s[s.index("\nclass"):]
+        t = self.parse(s)
+        t = t.visit(self.ToAST)[0]


