[Python-checkins] r52858 - in sandbox/trunk/2to3: Grammar.pickle Grammar.txt pgen2 pgen2/__init__.py pgen2/__init__.pyc pgen2/astnode.py pgen2/conv.py pgen2/driver.py pgen2/grammar.py pgen2/literals.py pgen2/parse.py pgen2/pgen.py pgen2/python.py pgen2/test.py play.py pynode.py

guido.van.rossum python-checkins at python.org
Wed Nov 29 18:39:11 CET 2006


Author: guido.van.rossum
Date: Wed Nov 29 18:38:40 2006
New Revision: 52858

Added:
   sandbox/trunk/2to3/
   sandbox/trunk/2to3/Grammar.pickle   (contents, props changed)
   sandbox/trunk/2to3/Grammar.txt   (contents, props changed)
   sandbox/trunk/2to3/pgen2/
   sandbox/trunk/2to3/pgen2/__init__.py   (contents, props changed)
   sandbox/trunk/2to3/pgen2/__init__.pyc   (contents, props changed)
   sandbox/trunk/2to3/pgen2/astnode.py   (contents, props changed)
   sandbox/trunk/2to3/pgen2/conv.py   (contents, props changed)
   sandbox/trunk/2to3/pgen2/driver.py   (contents, props changed)
   sandbox/trunk/2to3/pgen2/grammar.py   (contents, props changed)
   sandbox/trunk/2to3/pgen2/literals.py   (contents, props changed)
   sandbox/trunk/2to3/pgen2/parse.py   (contents, props changed)
   sandbox/trunk/2to3/pgen2/pgen.py   (contents, props changed)
   sandbox/trunk/2to3/pgen2/python.py   (contents, props changed)
   sandbox/trunk/2to3/pgen2/test.py   (contents, props changed)
   sandbox/trunk/2to3/play.py   (contents, props changed)
   sandbox/trunk/2to3/pynode.py   (contents, props changed)
Log:
Checkpoint of alternative Python 2.x-to-3.0 conversion tool.
This contains a modified copy of pgen2 which was open-sourced
by Elemental Security through a contributor's agreement with the PSF.


Added: sandbox/trunk/2to3/Grammar.pickle
==============================================================================
Binary file. No diff available.

Added: sandbox/trunk/2to3/Grammar.txt
==============================================================================
--- (empty file)
+++ sandbox/trunk/2to3/Grammar.txt	Wed Nov 29 18:38:40 2006
@@ -0,0 +1,148 @@
+# Grammar for Python
+
+# Note:  Changing the grammar specified in this file will most likely
+#        require corresponding changes in the parser module
+#        (../Modules/parsermodule.c).  If you can't make the changes to
+#        that module yourself, please co-ordinate the required changes
+#        with someone who can; ask around on python-dev for help.  Fred
+#        Drake <fdrake at acm.org> will probably be listening there.
+
+# NOTE WELL: You should also follow all the steps listed in PEP 306,
+# "How to Change Python's Grammar"
+
+# Commands for Kees Blom's railroad program
+#diagram:token NAME
+#diagram:token NUMBER
+#diagram:token STRING
+#diagram:token NEWLINE
+#diagram:token ENDMARKER
+#diagram:token INDENT
+#diagram:output\input python.bla
+#diagram:token DEDENT
+#diagram:output\textwidth 20.04cm\oddsidemargin  0.0cm\evensidemargin 0.0cm
+#diagram:rules
+
+# Start symbols for the grammar:
+#	file_input is a module or sequence of commands read from an input file;
+#	single_input is a single interactive statement;
+#	eval_input is the input for the eval() and input() functions.
+# NB: compound_stmt in single_input is followed by extra NEWLINE!
+file_input: (NEWLINE | stmt)* ENDMARKER
+single_input: NEWLINE | simple_stmt | compound_stmt NEWLINE
+eval_input: testlist NEWLINE* ENDMARKER
+
+decorator: '@' dotted_name [ '(' [arglist] ')' ] NEWLINE
+decorators: decorator+
+funcdef: [decorators] 'def' NAME parameters ':' suite
+parameters: '(' [varargslist] ')'
+varargslist: ((fpdef ['=' test] ',')*
+              ('*' NAME [',' '**' NAME] | '**' NAME) |
+              fpdef ['=' test] (',' fpdef ['=' test])* [','])
+fpdef: NAME | '(' fplist ')'
+fplist: fpdef (',' fpdef)* [',']
+
+stmt: simple_stmt | compound_stmt
+simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE
+small_stmt: (expr_stmt | print_stmt  | del_stmt | pass_stmt | flow_stmt |
+             import_stmt | global_stmt | exec_stmt | assert_stmt)
+expr_stmt: testlist (augassign (yield_expr|testlist) |
+                     ('=' (yield_expr|testlist))*)
+augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' |
+            '<<=' | '>>=' | '**=' | '//=')
+# For normal assignments, additional restrictions enforced by the interpreter
+print_stmt: 'print' ( [ test (',' test)* [','] ] |
+                      '>>' test [ (',' test)+ [','] ] )
+del_stmt: 'del' exprlist
+pass_stmt: 'pass'
+flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt
+break_stmt: 'break'
+continue_stmt: 'continue'
+return_stmt: 'return' [testlist]
+yield_stmt: yield_expr
+raise_stmt: 'raise' [test [',' test [',' test]]]
+import_stmt: import_name | import_from
+import_name: 'import' dotted_as_names
+import_from: ('from' ('.'* dotted_name | '.'+)
+              'import' ('*' | '(' import_as_names ')' | import_as_names))
+import_as_name: NAME ['as' NAME]
+dotted_as_name: dotted_name ['as' NAME]
+import_as_names: import_as_name (',' import_as_name)* [',']
+dotted_as_names: dotted_as_name (',' dotted_as_name)*
+dotted_name: NAME ('.' NAME)*
+global_stmt: 'global' NAME (',' NAME)*
+exec_stmt: 'exec' expr ['in' test [',' test]]
+assert_stmt: 'assert' test [',' test]
+
+compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef
+if_stmt: 'if' test ':' suite ('elif' test ':' suite)* ['else' ':' suite]
+while_stmt: 'while' test ':' suite ['else' ':' suite]
+for_stmt: 'for' exprlist 'in' testlist ':' suite ['else' ':' suite]
+try_stmt: ('try' ':' suite
+           ((except_clause ':' suite)+
+	    ['else' ':' suite]
+	    ['finally' ':' suite] |
+	   'finally' ':' suite))
+with_stmt: 'with' test [ with_var ] ':' suite
+with_var: 'as' expr
+# NB compile.c makes sure that the default except clause is last
+except_clause: 'except' [test [',' test]]
+suite: simple_stmt | NEWLINE INDENT stmt+ DEDENT
+
+# Backward compatibility cruft to support:
+# [ x for x in lambda: True, lambda: False if x() ]
+# even while also allowing:
+# lambda x: 5 if x else 2
+# (But not a mix of the two)
+testlist_safe: old_test [(',' old_test)+ [',']]
+old_test: or_test | old_lambdef
+old_lambdef: 'lambda' [varargslist] ':' old_test
+
+test: or_test ['if' or_test 'else' test] | lambdef
+or_test: and_test ('or' and_test)*
+and_test: not_test ('and' not_test)*
+not_test: 'not' not_test | comparison
+comparison: expr (comp_op expr)*
+comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not'
+expr: xor_expr ('|' xor_expr)*
+xor_expr: and_expr ('^' and_expr)*
+and_expr: shift_expr ('&' shift_expr)*
+shift_expr: arith_expr (('<<'|'>>') arith_expr)*
+arith_expr: term (('+'|'-') term)*
+term: factor (('*'|'/'|'%'|'//') factor)*
+factor: ('+'|'-'|'~') factor | power
+power: atom trailer* ['**' factor]
+atom: ('(' [yield_expr|testlist_gexp] ')' |
+       '[' [listmaker] ']' |
+       '{' [dictmaker] '}' |
+       '`' testlist1 '`' |
+       NAME | NUMBER | STRING+)
+listmaker: test ( list_for | (',' test)* [','] )
+testlist_gexp: test ( gen_for | (',' test)* [','] )
+lambdef: 'lambda' [varargslist] ':' test
+trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME
+subscriptlist: subscript (',' subscript)* [',']
+subscript: '.' '.' '.' | test | [test] ':' [test] [sliceop]
+sliceop: ':' [test]
+exprlist: expr (',' expr)* [',']
+testlist: test (',' test)* [',']
+dictmaker: test ':' test (',' test ':' test)* [',']
+
+classdef: 'class' NAME ['(' [testlist] ')'] ':' suite
+
+arglist: (argument ',')* (argument [',']| '*' test [',' '**' test] | '**' test)
+argument: test [gen_for] | test '=' test  # Really [keyword '='] test
+
+list_iter: list_for | list_if
+list_for: 'for' exprlist 'in' testlist_safe [list_iter]
+list_if: 'if' old_test [list_iter]
+
+gen_iter: gen_for | gen_if
+gen_for: 'for' exprlist 'in' or_test [gen_iter]
+gen_if: 'if' old_test [gen_iter]
+
+testlist1: test (',' test)*
+
+# not used in grammar, but may appear in "node" passed from Parser to Compiler
+encoding_decl: NAME
+
+yield_expr: 'yield' [testlist]
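
Grammar.txt above is the input from which the parse tables cached in
Grammar.pickle are generated; the driver module added further down does
the generation and caching.  A minimal sketch of that round trip,
assuming it is run from the 2to3 sandbox directory with the pgen2
package importable (the snippet is illustrative, not part of the
checkin):

    from pgen2 import driver

    # Builds the tables from Grammar.txt and writes Grammar.pickle if
    # the pickle is missing or older than the text file; otherwise it
    # just loads the pickle.
    g = driver.load_grammar("Grammar.txt")
    print len(g.symbol2number), "nonterminals,", len(g.labels), "labels"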

Added: sandbox/trunk/2to3/pgen2/__init__.py
==============================================================================
--- (empty file)
+++ sandbox/trunk/2to3/pgen2/__init__.py	Wed Nov 29 18:38:40 2006
@@ -0,0 +1,4 @@
+# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
+# Licensed to PSF under a Contributor Agreement.
+
+"""The pgen2 package."""

Added: sandbox/trunk/2to3/pgen2/__init__.pyc
==============================================================================
Binary file. No diff available.

Added: sandbox/trunk/2to3/pgen2/astnode.py
==============================================================================
--- (empty file)
+++ sandbox/trunk/2to3/pgen2/astnode.py	Wed Nov 29 18:38:40 2006
@@ -0,0 +1,434 @@
+# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
+# Licensed to PSF under a Contributor Agreement.
+
+"""Syntax tree node definitions.
+
+There is a class or function corresponding to each terminal and
+nonterminal symbol.
+
+The grammar is pieced together from the docstrings of the
+corresponding classes and functions: text between dollar signs is
+assumed to be the right-hand side of the grammar rule corresponding to
+that class or function.
+
+We use __slots__ to make the parse tree nodes as small as possible.
+
+NOTE: EVERY CLASS MUST HAVE A __slots__ DEFINITION, EVEN IF EMPTY!
+(If not, the instances will get a __dict__, defeating the purpose of
+__slots__ in our case.)
+
+"""
+
+# Python imports
+import token
+import logging
+
+from pgen2 import grammar
+
+logger = logging.getLogger("pgen2.astnode")
+
+class Node(object):
+    """Abstract base class for all nodes.
+
+    This has no attributes except a context slot which holds the line
+    number (or more detailed context info).  In the future this might
+    change to several slots (e.g. filename, lineno, column, or
+    even filename, start_lineno, start_column, end_lineno,
+    end_column).  The context is only referenced by two places: the
+    part of the code that sticks it in, and the part of the code that
+    reports errors.
+
+    In order to reduce the amount of boilerplate code, the context
+    argument is handled by __new__ rather than __init__.  There are
+    also a few subclasses that override __new__ to sometimes avoid
+    constructing an instance.
+
+    """
+
+    __slots__ = ["context"]
+
+    def __new__(cls, context, *rest):
+        assert cls not in (Node, Nonterminal, Terminal, Constant)
+        obj = object.__new__(cls)
+        obj.context = context
+        return obj
+
+    _stretch = False # Set to true to stretch the repr() vertically
+
+    def __repr__(self, repr=repr):
+        stretch = self._stretch
+        r = [self.__class__.__name__]
+        if stretch:
+            r.append("(\n    ")
+        else:
+            r.append("(") # ")" -- crutch for Emacs python-mode :-(
+        cls = self.__class__
+        # Get the nearest non-empty __slots__.  This assumes
+        # *something* has non-empty __slots__ before we reach object
+        # (which has no __slots__).  The class hierarchy guarantees
+        # this.
+        slots = cls.__slots__
+        while not slots:
+            cls = cls.__base__
+            slots = cls.__slots__
+        first = True
+        for name in slots:
+            if name == "context":
+                continue # Skip this
+            if first:
+                first = False
+            else:
+                if stretch:
+                    r.append(",\n    ")
+                else:
+                    r.append(", ")
+            try:
+                value = getattr(self, name)
+            except AttributeError:
+                continue
+            if stretch and isinstance(value, list):
+                rr = map(repr, value)
+                rv = "[" + ",\n ".join(rr) + "]"
+            else:
+                rv = repr(value)
+            if stretch:
+                rv = rv.replace("\n", "\n    ")
+            r.append(rv)
+        r.append(")")
+        return "".join(r)
+
+    def __str__(self):
+        return self.__repr__(repr=str)
+
+class Nonterminal(Node):
+    """Abstract base class for nonterminal symbols.
+
+    Nothing beyond Node.
+
+    """
+
+    __slots__ = []
+
+    _stretch = True
+
+class Terminal(Node):
+    """Abstract base class for terminal symbols.
+
+    Nothing beyond Node.
+
+    """
+
+    __slots__ = []
+
+class Series(Nonterminal):
+    """Abstract base class for nonterminals like stmts: stmt+."""
+
+    __slots__ = []
+
+    def __new__(cls, context, *nodes):
+        assert cls is not Series
+        if len(nodes) == 0:
+            return None
+        elif len(nodes) == 1:
+            return nodes[0]
+        else:
+            obj = Nonterminal.__new__(cls, context)
+            obj.initseries(nodes)
+            return obj
+
+class Constant(Terminal):
+    """Abstract base class for constants (e.g. number or string).
+
+    Attributes:
+
+    repr -- a string giving the token, exactly as read from source
+
+    """
+
+    __slots__ = ["repr"]
+
+    def __init__(self, context, repr):
+        self.repr = repr
+
+    def __str__(self):
+        return self.repr
+
+# Node classes for terminal symbols
+
+class Name(Terminal):
+    """Name (e.g. a variable name or an attribute name).
+
+    Attributes:
+
+    name -- a string giving the name.
+
+    """
+
+    __slots__ = ["name"]
+
+    def __init__(self, context, name):
+        self.name = name
+
+    def __str__(self):
+        return self.name
+
+class Number(Constant):
+    """Numeric constant.
+
+    Nothing beyond Constant.
+
+    """
+
+    __slots__ = []
+
+class String(Constant):
+    """String constant.
+
+    Nothing beyond Constant.
+
+    """
+
+    __slots__ = []
+
+# Example nodes and factory functions for nonterminal symbols
+
+def Program(context, stmts):
+    """Program is a nonterminal with only one non-trivial child.
+
+    Grammar: $ Expression ENDMARKER $
+
+    """
+    return stmts
+
+class Expression(Series):
+    "Grammar: $ BinaryExpression ['?' BinaryExpression ':' BinaryExpression] $"
+
+    __slots__ = ["test", "left", "right"]
+
+    def initseries(self, nodes):
+        self.test, self.left, self.right = nodes
+
+    def __str__(self):
+        return "%s ? %s : %s" % (self.test, self.left, self.right)
+
+class BinaryExpression(Series):
+    "Grammar: $ Operand (Operator Operand)* $"
+
+    __slots__ = ["left", "op", "right"]
+
+    def initseries(self, stuff):
+        # Stuff is a list with alternating operands and operators
+        if len(stuff) == 3:
+            self.left, self.op, self.right = stuff
+            return
+        assert len(stuff) > 1 and len(stuff) % 2 == 1
+        # Find the rightmost lowest-priority operator
+        lowest_i = 1
+        lowest_op = stuff[lowest_i]
+        lowest_pri = lowest_op.priority()
+        for i in range(3, len(stuff), 2):
+            op = stuff[i]
+            pri = op.priority()
+            if pri <= lowest_pri:
+                lowest_i = i
+                lowest_op = op
+                lowest_pri = pri
+        self.left = self.__class__(self.context, *stuff[:lowest_i])
+        self.op = lowest_op
+        self.right = self.__class__(self.context, *stuff[lowest_i+1:])
+
+    def __str__(self):
+        return "(%s %s %s)" % (self.left, self.op, self.right)
+
+def Operand(context, arg):
+    """Operand is a nonterminal with one child.
+
+    Grammar: $ Atom | UnaryExpression $
+
+    """
+    return arg
+
+class UnaryExpression(Nonterminal):
+    "Grammar: $ UnaryOperator Operand $"
+
+    __slots__ = ["op", "arg"]
+
+    def __init__(self, context, op, arg):
+        self.op = op
+        self.arg = arg
+
+    def __str__(self):
+        return "%s%s" % (self.op, self.arg)
+
+def UnaryOperator(context, op):
+    "Grammar: $ 'not' | '-' $"
+    return op
+
+class Operator(Nonterminal):
+    """Operator.
+
+    This has a repr slot and a priority() method.
+
+    The __new__ method implements a sort-of singleton pattern: there's
+    only one instance per repr value.  (Yes, this means the context is
+    not useful.  Therefore we set it to None.)
+
+    Grammar: $ '**' | '*' | '/' | '+' | '-' | '&' | '^' | '|' |
+          ['not'] 'in' | '==' | '<' | '>' | '!=' | '<=' | '>=' |
+          'and' | 'or' $
+
+    """
+
+    __slots__ = ["repr"]
+
+    _stretch = False
+
+    _cache = {}
+
+    def __new__(cls, context, *args):
+        repr = " ".join(args)
+        # For "not in", the argument should be the string "not in"
+        obj = cls._cache.get(repr)
+        if obj is None:
+            obj = Terminal.__new__(cls, None)
+            obj.repr = repr
+            cls._cache[repr] = obj
+        return obj
+
+    def __str__(self):
+        return self.repr
+
+    def priority(self):
+        return self._priorities[self.repr]
+
+    _priorities = {
+        "or":  0,
+        "and": 1,
+        "in":  2,
+        "not in": 2,
+        "<":  2,
+        ">":  2,
+        "==": 2,
+        "<=": 2,
+        ">=": 2,
+        "!=": 2,
+        "|":  3,
+        "^":  4,
+        "&":  5,
+        "+":  6,
+        "-":  6,
+        "*":  7,
+        "/":  7,
+        "**": 8,
+    }
+
+def Atom(context, arg):
+    """ Grammar: $ NAME | STRING | NUMBER | ParenthesizedExpression $
+    """
+    return arg
+
+def ParenthesizedExpression(context, expr):
+    "Grammar: $ '(' Expression ')' $"
+    return expr
+
+
+# Conversion from concrete to abstract syntax trees
+
+def vanish(context, value):
+    return None
+
+token_mapping = {
+    # Tokens that become parse tree nodes
+    # (token.NAME is special-cased in the code)
+    token.NUMBER: Number,
+    token.STRING: String,
+
+    # Tokens that vanish
+    token.DOT: vanish,
+    token.LPAR: vanish,
+    token.RPAR: vanish,
+    token.COLON: vanish,
+    token.COMMA: vanish,
+    token.EQUAL: vanish,
+    token.DEDENT: vanish,
+    token.INDENT: vanish,
+    token.LBRACE: vanish,
+    token.RBRACE: vanish,
+    token.NEWLINE: vanish,
+    token.ENDMARKER: vanish,
+
+    # All other tokens return the token's string value (e.g. "+")
+    }
+
+vanishing_keywords = {
+    # E.g. "def": True,
+    }
+
+def convert(grammar, node):
+    type, value, context, children = node
+    # Is it a non-terminal symbol?
+    if type in grammar.number2symbol:
+        symbol = grammar.number2symbol[type]
+        factory = globals().get(symbol)
+        if factory is None:
+            raise RuntimeError("can't find factory for %s (line %s)" %
+                               (symbol, context))
+        # Debug variation:
+        try:
+            return factory(context, *children)
+        except:
+            logger.debug("%s %s", factory.__name__, "(")
+            for child in children:
+                logger.debug("%s %s", "==>", child)
+            logger.debug(")")
+            logger.debug("# Did you remember to declare a 'context' arg?")
+            raise
+        return factory(context, *children)
+
+    # Must be a terminal symbol.
+    if type == token.NAME:
+        # Name or keyword.  Special-case the snot out of this.
+        if value in grammar.keywords:
+            # Keywords become strings, like operators
+            if value in vanishing_keywords:
+                return None
+            else:
+                return value
+        else:
+            return Name(context, value)
+
+    # Look for a handler in the token_mapping table.
+    assert type in token.tok_name
+    factory = token_mapping.get(type)
+    if factory:
+        return factory(context, value)
+    else:
+        return value
+
+
+# Support code
+
+def generate_grammar(stream):
+    """Extract the grammar rules from this module's doc strings."""
+    import re
+    from types import ModuleType
+    lines = []
+    startline = None
+    for name, obj in globals().items():
+        if hasattr(obj, "__doc__") and not isinstance(obj, ModuleType):
+            m = re.search(r"Grammar:\s*\$([^$]+)\$", obj.__doc__ or "")
+            if m:
+                rule = obj.__name__, " ".join(m.group(1).split())
+                if rule[0] == "Program":
+                    assert not startline
+                    startline = rule
+                else:
+                    lines.append(rule)
+    lines.sort()
+    # The start symbol *must* be the first rule
+    lines.insert(0, startline)
+    for name, rhs in lines:
+        stream.write("%s: %s\n" % (name, rhs))
+
+if __name__ == '__main__':
+    import sys
+    generate_grammar(sys.stdout)
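
The "Grammar: $ ... $" fragments in the docstrings above form a small
expression grammar of their own; generate_grammar() collects them into
pgen's input format, with the Program rule forced to the front as the
start symbol.  A quick way to see the extracted rules, assuming pgen2
is importable (illustrative only):

    import sys
    from pgen2 import astnode

    # Writes one "name: rhs" line per docstring rule to stdout.
    astnode.generate_grammar(sys.stdout)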

Added: sandbox/trunk/2to3/pgen2/conv.py
==============================================================================
--- (empty file)
+++ sandbox/trunk/2to3/pgen2/conv.py	Wed Nov 29 18:38:40 2006
@@ -0,0 +1,256 @@
+# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
+# Licensed to PSF under a Contributor Agreement.
+
+"""Convert graminit.[ch] spit out by pgen to Python code.
+
+Pgen is the Python parser generator.  It is useful to quickly create a
+parser from a grammar file in Python's grammar notation.  But I don't
+want my parsers to be written in C (yet), so I'm translating the
+parsing tables to Python data structures and writing a Python parse
+engine.
+
+Note that the token numbers are constants determined by the standard
+Python tokenizer.  The standard token module defines these numbers and
+their names (the names are not used much).  The token numbers are
+hardcoded into the Python tokenizer and into pgen.  A Python
+implementation of the Python tokenizer is also available, in the
+standard tokenize module.
+
+On the other hand, symbol numbers (representing the grammar's
+non-terminals) are assigned by pgen based on the actual grammar
+input.
+
+Note: this module is pretty much obsolete; the pgen module generates
+equivalent grammar tables directly from the Grammar.txt input file
+without having to invoke the Python pgen C program.
+
+"""
+
+# Python imports
+import re
+import token
+
+from pgen2 import grammar
+
+class Converter(grammar.Grammar):
+    """Grammar subclass that reads classic pgen output files.
+
+    The run() method reads the tables as produced by the pgen parser
+    generator, typically contained in two C files, graminit.h and
+    graminit.c.  The other methods are for internal use only.
+
+    See the base class for more documentation.
+
+    """
+
+    def run(self, graminit_h, graminit_c):
+        """Load the grammar tables from the text files written by pgen."""
+        self.parse_graminit_h(graminit_h)
+        self.parse_graminit_c(graminit_c)
+        self.finish_off()
+
+    def parse_graminit_h(self, filename):
+        """Parse the .h file writen by pgen.  (Internal)
+
+        This file is a sequence of #define statements defining the
+        nonterminals of the grammar as numbers.  We build two tables
+        mapping the numbers to names and back.
+
+        """
+        try:
+            f = open(filename)
+        except IOError, err:
+            print "Can't open %s: %s" % (filename, err)
+            return False
+        self.symbol2number = {}
+        self.number2symbol = {}
+        lineno = 0
+        for line in f:
+            lineno += 1
+            mo = re.match(r"^#define\s+(\w+)\s+(\d+)$", line)
+            if not mo and line.strip():
+                print "%s(%s): can't parse %s" % (filename, lineno,
+                                                  line.strip())
+            else:
+                symbol, number = mo.groups()
+                number = int(number)
+                assert symbol not in self.symbol2number
+                assert number not in self.number2symbol
+                self.symbol2number[symbol] = number
+                self.number2symbol[number] = symbol
+        return True
+
+    def parse_graminit_c(self, filename):
+        """Parse the .c file writen by pgen.  (Internal)
+
+        The file looks as follows.  The first two lines are always this:
+
+        #include "pgenheaders.h"
+        #include "grammar.h"
+
+        After that come four blocks:
+
+        1) one or more state definitions
+        2) a table defining dfas
+        3) a table defining labels
+        4) a struct defining the grammar
+
+        A state definition has the following form:
+        - one or more arc arrays, each of the form:
+          static arc arcs_<n>_<m>[<k>] = {
+                  {<i>, <j>},
+                  ...
+          };
+        - followed by a state array, of the form:
+          static state states_<s>[<t>] = {
+                  {<k>, arcs_<n>_<m>},
+                  ...
+          };
+
+        """
+        try:
+            f = open(filename)
+        except IOError, err:
+            print "Can't open %s: %s" % (filename, err)
+            return False
+        # The code below essentially uses f's iterator-ness!
+        lineno = 0
+
+        # Expect the two #include lines
+        lineno, line = lineno+1, f.next()
+        assert line == '#include "pgenheaders.h"\n', (lineno, line)
+        lineno, line = lineno+1, f.next()
+        assert line == '#include "grammar.h"\n', (lineno, line)
+
+        # Parse the state definitions
+        lineno, line = lineno+1, f.next()
+        allarcs = {}
+        states = []
+        while line.startswith("static arc "):
+            while line.startswith("static arc "):
+                mo = re.match(r"static arc arcs_(\d+)_(\d+)\[(\d+)\] = {$",
+                              line)
+                assert mo, (lineno, line)
+                n, m, k = map(int, mo.groups())
+                arcs = []
+                for _ in range(k):
+                    lineno, line = lineno+1, f.next()
+                    mo = re.match(r"\s+{(\d+), (\d+)},$", line)
+                    assert mo, (lineno, line)
+                    i, j = map(int, mo.groups())
+                    arcs.append((i, j))
+                lineno, line = lineno+1, f.next()
+                assert line == "};\n", (lineno, line)
+                allarcs[(n, m)] = arcs
+                lineno, line = lineno+1, f.next()
+            mo = re.match(r"static state states_(\d+)\[(\d+)\] = {$", line)
+            assert mo, (lineno, line)
+            s, t = map(int, mo.groups())
+            assert s == len(states), (lineno, line)
+            state = []
+            for _ in range(t):
+                lineno, line = lineno+1, f.next()
+                mo = re.match(r"\s+{(\d+), arcs_(\d+)_(\d+)},$", line)
+                assert mo, (lineno, line)
+                k, n, m = map(int, mo.groups())
+                arcs = allarcs[n, m]
+                assert k == len(arcs), (lineno, line)
+                state.append(arcs)
+            states.append(state)
+            lineno, line = lineno+1, f.next()
+            assert line == "};\n", (lineno, line)
+            lineno, line = lineno+1, f.next()
+        self.states = states
+
+        # Parse the dfas
+        dfas = {}
+        mo = re.match(r"static dfa dfas\[(\d+)\] = {$", line)
+        assert mo, (lineno, line)
+        ndfas = int(mo.group(1))
+        for i in range(ndfas):
+            lineno, line = lineno+1, f.next()
+            mo = re.match(r'\s+{(\d+), "(\w+)", (\d+), (\d+), states_(\d+),$',
+                          line)
+            assert mo, (lineno, line)
+            symbol = mo.group(2)
+            number, x, y, z = map(int, mo.group(1, 3, 4, 5))
+            assert self.symbol2number[symbol] == number, (lineno, line)
+            assert self.number2symbol[number] == symbol, (lineno, line)
+            assert x == 0, (lineno, line)
+            state = states[z]
+            assert y == len(state), (lineno, line)
+            lineno, line = lineno+1, f.next()
+            mo = re.match(r'\s+("(?:\\\d\d\d)*")},$', line)
+            assert mo, (lineno, line)
+            first = {}
+            rawbitset = eval(mo.group(1))
+            for i, c in enumerate(rawbitset):
+                byte = ord(c)
+                for j in range(8):
+                    if byte & (1<<j):
+                        first[i*8 + j] = 1
+            dfas[number] = (state, first)
+        lineno, line = lineno+1, f.next()
+        assert line == "};\n", (lineno, line)
+        self.dfas = dfas
+
+        # Parse the labels
+        labels = []
+        lineno, line = lineno+1, f.next()
+        mo = re.match(r"static label labels\[(\d+)\] = {$", line)
+        assert mo, (lineno, line)
+        nlabels = int(mo.group(1))
+        for i in range(nlabels):
+            lineno, line = lineno+1, f.next()
+            mo = re.match(r'\s+{(\d+), (0|"\w+")},$', line)
+            assert mo, (lineno, line)
+            x, y = mo.groups()
+            x = int(x)
+            if y == "0":
+                y = None
+            else:
+                y = eval(y)
+            labels.append((x, y))
+        lineno, line = lineno+1, f.next()
+        assert line == "};\n", (lineno, line)
+        self.labels = labels
+
+        # Parse the grammar struct
+        lineno, line = lineno+1, f.next()
+        assert line == "grammar _PyParser_Grammar = {\n", (lineno, line)
+        lineno, line = lineno+1, f.next()
+        mo = re.match(r"\s+(\d+),$", line)
+        assert mo, (lineno, line)
+        ndfas = int(mo.group(1))
+        assert ndfas == len(self.dfas)
+        lineno, line = lineno+1, f.next()
+        assert line == "\tdfas,\n", (lineno, line)
+        lineno, line = lineno+1, f.next()
+        mo = re.match(r"\s+{(\d+), labels},$", line)
+        assert mo, (lineno, line)
+        nlabels = int(mo.group(1))
+        assert nlabels == len(self.labels), (lineno, line)
+        lineno, line = lineno+1, f.next()
+        mo = re.match(r"\s+(\d+)$", line)
+        assert mo, (lineno, line)
+        start = int(mo.group(1))
+        assert start in self.number2symbol, (lineno, line)
+        self.start = start
+        lineno, line = lineno+1, f.next()
+        assert line == "};\n", (lineno, line)
+        try:
+            lineno, line = lineno+1, f.next()
+        except StopIteration:
+            pass
+        else:
+            assert 0, (lineno, line)
+
+    def finish_off(self):
+        """Create additional useful structures.  (Internal)."""
+        self.keywords = {} # map from keyword strings to arc labels
+        self.tokens = {}   # map from numeric token values to arc labels
+        for ilabel, (type, value) in enumerate(self.labels):
+            if type == token.NAME and value is not None:
+                self.keywords[value] = ilabel
+            elif value is None:
+                self.tokens[type] = ilabel
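
The typical use of this (self-described obsolete) module is to turn the
graminit.h/graminit.c pair written by CPython's pgen into a pickle that
grammar.Grammar.load() can read back.  A sketch, with the input paths
as placeholders for wherever a CPython build left those files:

    from pgen2 import conv

    c = conv.Converter()
    # run() parses both C files and then builds the keywords/tokens
    # maps; dump() is inherited from grammar.Grammar.
    c.run("graminit.h", "graminit.c")
    c.dump("Grammar.pickle")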

Added: sandbox/trunk/2to3/pgen2/driver.py
==============================================================================
--- (empty file)
+++ sandbox/trunk/2to3/pgen2/driver.py	Wed Nov 29 18:38:40 2006
@@ -0,0 +1,105 @@
+# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
+# Licensed to PSF under a Contributor Agreement.
+
+# Modifications:
+# Copyright 2006 Python Software Foundation. All Rights Reserved.
+
+"""Parser driver.
+
+This provides a high-level interface to parse a file into a syntax tree.
+
+"""
+
+__author__ = "Guido van Rossum <guido at python.org>"
+
+__all__ = ["Driver", "load_grammar"]
+
+# Python imports
+import os
+import token
+import logging
+import tokenize
+
+# Pgen imports
+from pgen2 import parse
+from pgen2 import astnode
+from pgen2 import grammar
+
+class Driver(object):
+
+    def __init__(self, grammar, convert=None, logger=None):
+        self.grammar = grammar
+        if logger is None:
+            logger = logging.getLogger()
+        self.logger = logger
+        self.convert = convert
+ 
+    def parse_stream_raw(self, stream, debug=False):
+        """Parse a stream and return the concrete syntax tree."""
+        p = parse.Parser(self.grammar, self.convert)
+        p.setup()
+        t = v = x = None
+        # (t, v, x, y, z) == (type, value, start, end, line)
+        for t, v, x, y, z in tokenize.generate_tokens(stream.readline):
+            if t in (tokenize.COMMENT, tokenize.NL):
+                continue
+            if t == token.OP:
+                t = grammar.opmap[v]
+            if debug:
+                self.logger.debug("%s %r", token.tok_name[t], v)
+            if p.addtoken(t, v, x):
+                if debug:
+                    self.logger.debug("Stop.")
+                break
+        else:
+            # We never broke out -- EOF is too soon (how can this happen???)
+            raise parse.ParseError("incomplete input", t, v, x)
+        return p.rootnode
+
+    def parse_stream(self, stream, debug=False):
+        """Parse a stream and return the syntax tree."""
+        return self.parse_stream_raw(stream, debug)
+
+    def parse_file(self, filename, debug=False):
+        """Parse a file and return the syntax tree."""
+        stream = open(filename)
+        try:
+            return self.parse_stream(stream, debug)
+        finally:
+            stream.close()
+
+    def parse_string(self, text, debug=False):
+        """Parse a string and return the syntax tree."""
+        from StringIO import StringIO
+        stream = StringIO(text)
+        return self.parse_stream(stream, debug)
+
+def load_grammar(gt="Grammar.txt", gp=None,
+                 save=True, force=False, logger=None):
+    """Load the grammar (maybe from a pickle)."""
+    if logger is None:
+        logger = logging.getLogger()
+    if gp is None:
+        head, tail = os.path.splitext(gt)
+        if tail == ".txt":
+            tail = ""
+        gp = head + tail + ".pickle"
+    if force or not _newer(gp, gt):
+        logger.info("Generating grammar tables from %s", gt)
+        from pgen2 import pgen
+        g = pgen.generate_grammar(gt)
+        if save:
+            logger.info("Writing grammar tables to %s", gp)
+            g.dump(gp)
+    else:
+        g = grammar.Grammar()
+        g.load(gp)
+    return g
+
+def _newer(a, b):
+    """Inquire whether file a was written since file b."""
+    if not os.path.exists(a):
+        return False
+    if not os.path.exists(b):
+        return True
+    return os.path.getmtime(a) >= os.path.getmtime(b)
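
Putting load_grammar() and Driver together gives the high-level entry
point used by the rest of the package.  A minimal sketch, run from the
sandbox directory so the default Grammar.txt/Grammar.pickle paths
resolve; without a convert function the tree comes back as the raw
(type, value, context, children) tuples described in parse.py below:

    from pgen2 import driver

    g = driver.load_grammar("Grammar.txt")
    d = driver.Driver(g)
    tree = d.parse_string("x = 1\n")
    # tree[0] is the numeric symbol of the start rule (file_input),
    # tree[3] is its list of child nodes.
    print tree[0], len(tree[3])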

Added: sandbox/trunk/2to3/pgen2/grammar.py
==============================================================================
--- (empty file)
+++ sandbox/trunk/2to3/pgen2/grammar.py	Wed Nov 29 18:38:40 2006
@@ -0,0 +1,167 @@
+# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
+# Licensed to PSF under a Contributor Agreement.
+
+"""This module defines the data structures used to represent a grammar.
+
+These are a bit arcane because they are derived from the data
+structures used by Python's 'pgen' parser generator.
+
+There's also a table here mapping operators to their names in the
+token module; the Python tokenize module reports all operators as the
+fallback token code OP, but the parser needs the actual token code.
+
+"""
+
+# Python imports
+import token, tokenize
+import cPickle as pickle
+
+class Grammar(object):
+    """Pgen parsing tables tables conversion class.
+
+    Once initialized, this class supplies the grammar tables for the
+    parsing engine implemented by parse.py.  The parsing engine
+    accesses the instance variables directly.  The class here does not
+    provide initialization of the tables; several subclasses exist to
+    do this (see the conv and pgen modules).
+
+    The load() method reads the tables from a pickle file, which is
+    much faster than the other ways offered by subclasses.  The pickle
+    file is written by calling dump() (after loading the grammar
+    tables using a subclass).  The report() method prints a readable
+    representation of the tables to stdout, for debugging.
+
+    The instance variables are as follows:
+
+    symbol2number -- a dict mapping symbol names to numbers.  Symbol
+                     numbers are always 256 or higher, to distinguish
+                     them from token numbers, which are between 0 and
+                     255 (inclusive).
+
+    number2symbol -- a dict mapping numbers to symbol names;
+                     these two are each other's inverse.
+
+    states        -- a list of DFAs, where each DFA is a list of
+                     states, each state is a list of arcs, and each
+                     arc is an (i, j) pair where i is a label and j is
+                     a state number.  The DFA number is the index into
+                     this list.  (This name is slightly confusing.)
+                     Final states are represented by a special arc of
+                     the form (0, j) where j is its own state number.
+
+    dfas          -- a dict mapping symbol numbers to (DFA, first)
+                     pairs, where DFA is an item from the states list
+                     above, and first is a set of tokens that can
+                     begin this grammar rule (represented by a dict
+                     whose values are always 1).
+
+    labels        -- a list of (x, y) pairs where x is either a token
+                     number or a symbol number, and y is either None
+                     or a string; the strings are keywords.  The label
+                     number is the index in this list; label numbers
+                     are used to mark state transitions (arcs) in the
+                     DFAs.
+
+    start         -- the number of the grammar's start symbol.
+
+    keywords      -- a dict mapping keyword strings to arc labels.
+
+    tokens        -- a dict mapping token numbers to arc labels.
+
+    """
+
+    def __init__(self):
+        self.symbol2number = {}
+        self.number2symbol = {}
+        self.states = []
+        self.dfas = {}
+        self.labels = [(0, "EMPTY")]
+        self.keywords = {}
+        self.tokens = {}
+        self.symbol2label = {}
+        self.start = 256
+
+    def dump(self, filename):
+        """Dump the grammar tables to a pickle file."""
+        f = open(filename, "wb")
+        pickle.dump(self.__dict__, f, 2)
+        f.close()
+
+    def load(self, filename):
+        """Load the grammar tables from a pickle file."""
+        f = open(filename, "rb")
+        d = pickle.load(f)
+        f.close()
+        self.__dict__.update(d)
+
+    def report(self):
+        """Dump the grammar tables to standard output, for debugging."""
+        from pprint import pprint
+        print "s2n"
+        pprint(self.symbol2number)
+        print "n2s"
+        pprint(self.number2symbol)
+        print "states"
+        pprint(self.states)
+        print "dfas"
+        pprint(self.dfas)
+        print "labels"
+        pprint(self.labels)
+        print "start", self.start
+
+
+# Map from operator to number (since tokenize doesn't do this)
+
+opmap_raw = """
+( LPAR
+) RPAR
+[ LSQB
+] RSQB
+: COLON
+, COMMA
+; SEMI
++ PLUS
+- MINUS
+* STAR
+/ SLASH
+| VBAR
+& AMPER
+< LESS
+> GREATER
+= EQUAL
+. DOT
+% PERCENT
+` BACKQUOTE
+{ LBRACE
+} RBRACE
+@ AT
+== EQEQUAL
+!= NOTEQUAL
+<> NOTEQUAL
+<= LESSEQUAL
+>= GREATEREQUAL
+~ TILDE
+^ CIRCUMFLEX
+<< LEFTSHIFT
+>> RIGHTSHIFT
+** DOUBLESTAR
++= PLUSEQUAL
+-= MINEQUAL
+*= STAREQUAL
+/= SLASHEQUAL
+%= PERCENTEQUAL
+&= AMPEREQUAL
+|= VBAREQUAL
+^= CIRCUMFLEXEQUAL
+<<= LEFTSHIFTEQUAL
+>>= RIGHTSHIFTEQUAL
+**= DOUBLESTAREQUAL
+// DOUBLESLASH
+//= DOUBLESLASHEQUAL
+"""
+
+opmap = {}
+for line in opmap_raw.splitlines():
+    if line:
+        op, name = line.split()
+        opmap[op] = getattr(token, name)
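
Two details are worth illustrating: the opmap table maps operator
strings back to specific token numbers (tokenize reports them all as
the generic OP token), and dump()/load() give a pickle round trip for
the tables.  A sketch, assuming the checked-in Grammar.pickle is in the
current directory:

    import token
    from pgen2 import grammar

    print grammar.opmap["+="] == token.PLUSEQUAL    # True
    print grammar.opmap["//"] == token.DOUBLESLASH  # True

    g = grammar.Grammar()
    g.load("Grammar.pickle")
    print g.number2symbol[g.start]   # name of the grammar's start symbol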

Added: sandbox/trunk/2to3/pgen2/literals.py
==============================================================================
--- (empty file)
+++ sandbox/trunk/2to3/pgen2/literals.py	Wed Nov 29 18:38:40 2006
@@ -0,0 +1,60 @@
+# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
+# Licensed to PSF under a Contributor Agreement.
+
+"""Safely evaluate Python string literals without using eval()."""
+
+import re
+
+simple_escapes = {"a": "\a",
+                  "b": "\b",
+                  "f": "\f",
+                  "n": "\n",
+                  "r": "\r",
+                  "t": "\t",
+                  "v": "\v",
+                  "'": "'",
+                  '"': '"',
+                  "\\": "\\"}
+
+def escape(m):
+    all, tail = m.group(0, 1)
+    assert all.startswith("\\")
+    esc = simple_escapes.get(tail)
+    if esc is not None:
+        return esc
+    if tail.startswith("x"):
+        hexes = tail[1:]
+        if len(hexes) < 2:
+            raise ValueError("invalid hex string escape ('\\%s')" % tail)
+        try:
+            i = int(hexes, 16)
+        except ValueError:
+            raise ValueError("invalid hex string escape ('\\%s')" % tail)
+    else:
+        try:
+            i = int(tail, 8)
+        except ValueError:
+            raise ValueError("invalid octal string escape ('\\%s')" % tail)
+    return chr(i)
+
+def evalString(s):
+    assert s.startswith("'") or s.startswith('"'), repr(s[:1])
+    q = s[0]
+    if s[:3] == q*3:
+        q = q*3
+    assert s.endswith(q), repr(s[-len(q):])
+    assert len(s) >= 2*len(q)
+    s = s[len(q):-len(q)]
+    return re.sub(r"\\(\'|\"|\\|[abfnrtv]|x.{0,2}|[0-7]{1,3})", escape, s)
+
+def test():
+    for i in range(256):
+        c = chr(i)
+        s = repr(c)
+        e = evalString(s)
+        if e != c:
+            print i, c, s, e
+
+
+if __name__ == "__main__":
+    test()
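
evalString() expects the literal exactly as it appears in source,
quotes included, and undoes the escapes itself rather than handing the
string to eval().  For example (illustrative):

    from pgen2 import literals

    # "\t" becomes a tab; "\x41" and "\101" both decode to "A".
    print repr(literals.evalString(r"'a\tb\x41\101'"))   # 'a\tbAA'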

Added: sandbox/trunk/2to3/pgen2/parse.py
==============================================================================
--- (empty file)
+++ sandbox/trunk/2to3/pgen2/parse.py	Wed Nov 29 18:38:40 2006
@@ -0,0 +1,199 @@
+# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
+# Licensed to PSF under a Contributor Agreement.
+
+"""Parser engine for the grammar tables generated by pgen.
+
+The grammar table must be loaded first.
+
+See Parser/parser.c in the Python distribution for additional info on
+how this parsing engine works.
+
+"""
+
+# Python imports
+import token
+
+class ParseError(Exception):
+    """Exception to signal the parser is stuck."""
+
+    def __init__(self, msg, type, value, context):
+        Exception.__init__(self, "%s: type=%r, value=%r, context=%r" %
+                           (msg, type, value, context))
+        self.msg = msg
+        self.type = type
+        self.value = value
+        self.context = context
+
+class Parser(object):
+    """Parser engine.
+
+    The proper usage sequence is:
+
+    p = Parser(grammar, [converter])  # create instance
+    p.setup([start])                  # prepare for parsing
+    <for each input token>:
+        if p.addtoken(...):           # parse a token; may raise ParseError
+            break
+    root = p.rootnode                 # root of abstract syntax tree
+
+    A Parser instance may be reused by calling setup() repeatedly.
+
+    A Parser instance contains state pertaining to the current token
+    sequence, and should not be used concurrently by different threads
+    to parse separate token sequences.
+
+    See driver.py for how to get input tokens by tokenizing a file or
+    string.
+
+    Parsing is complete when addtoken() returns True; the root of the
+    abstract syntax tree can then be retrieved from the rootnode
+    instance variable.  When a syntax error occurs, addtoken() raises
+    the ParseError exception.  There is no error recovery; the parser
+    cannot be used after a syntax error was reported (but it can be
+    reinitialized by calling setup()).
+
+    """
+
+    def __init__(self, grammar, convert=None):
+        """Constructor.
+
+        The grammar argument is a grammar.Grammar instance; see the
+        grammar module for more information.
+
+        The parser is not ready yet for parsing; you must call the
+        setup() method to get it started.
+
+        The optional convert argument is a function mapping concrete
+        syntax tree nodes to abstract syntax tree nodes.  If not
+        given, no conversion is done and the syntax tree produced is
+        the concrete syntax tree.  If given, it must be a function of
+        two arguments, the first being the grammar (a grammar.Grammar
+        instance), and the second being the concrete syntax tree node
+        to be converted.  The syntax tree is converted from the bottom
+        up.
+
+        A concrete syntax tree node is a (type, value, context, nodes)
+        tuple, where type is the node type (a token or symbol number),
+        value is None for symbols and a string for tokens, context is
+        None or an opaque value used for error reporting (typically a
+        (lineno, offset) pair), and nodes is a list of children for
+        symbols, and None for tokens.
+
+        An abstract syntax tree node may be anything; this is entirely
+        up to the converter function.  For example, it can be an
+        instance of a subclass of the astnode.Node class (see the
+        astnode module).
+
+        """
+        self.grammar = grammar
+        self.convert = convert or (lambda grammar, node: node)
+
+    def setup(self, start=None):
+        """Prepare for parsing.
+
+        This *must* be called before starting to parse.
+
+        The optional argument is an alternative start symbol; it
+        defaults to the grammar's start symbol.
+
+        You can use a Parser instance to parse any number of programs;
+        each time you call setup() the parser is reset to an initial
+        state determined by the (implicit or explicit) start symbol.
+
+        """
+        if start is None:
+            start = self.grammar.start
+        # Each stack entry is a tuple: (dfa, state, node).
+        # A node is a tuple: (type, value, context, children),
+        # where children is a list of nodes or None, and context may be None.
+        newnode = (start, None, None, [])
+        stackentry = (self.grammar.dfas[start], 0, newnode)
+        self.stack = [stackentry]
+        self.rootnode = None
+
+    def addtoken(self, type, value, context):
+        """Add a token; return True iff this is the end of the program."""
+        # Map from token to label
+        ilabel = self.classify(type, value, context)
+        # Loop until the token is shifted; may raise exceptions
+        while True:
+            dfa, state, node = self.stack[-1]
+            states, first = dfa
+            arcs = states[state]
+            # Look for a state with this label
+            for i, newstate in arcs:
+                t, v = self.grammar.labels[i]
+                if ilabel == i:
+                    # Look it up in the list of labels
+                    assert t < 256
+                    # Shift a token; we're done with it
+                    self.shift(type, value, newstate, context)
+                    # Pop while we are in an accept-only state
+                    state = newstate
+                    while states[state] == [(0, state)]:
+                        self.pop()
+                        if not self.stack:
+                            # Done parsing!
+                            return True
+                        dfa, state, node = self.stack[-1]
+                        states, first = dfa
+                    # Done with this token
+                    return False
+                elif t >= 256:
+                    # See if it's a symbol and if we're in its first set
+                    itsdfa = self.grammar.dfas[t]
+                    itsstates, itsfirst = itsdfa
+                    if ilabel in itsfirst:
+                        # Push a symbol
+                        self.push(t, self.grammar.dfas[t], newstate, context)
+                        break # To continue the outer while loop
+            else:
+                if (0, state) in arcs:
+                    # An accepting state, pop it and try something else
+                    self.pop()
+                    if not self.stack:
+                        # Done parsing, but another token is input
+                        raise ParseError("too much input",
+                                         type, value, context)
+                else:
+                    # No success finding a transition
+                    raise ParseError("bad input", type, value, context)
+
+    def classify(self, type, value, context):
+        """Turn a token into a label.  (Internal)"""
+        if type == token.NAME:
+            # Check for reserved words
+            ilabel = self.grammar.keywords.get(value)
+            if ilabel is not None:
+                return ilabel
+        ilabel = self.grammar.tokens.get(type)
+        if ilabel is None:
+            raise ParseError("bad token", type, value, context)
+        return ilabel
+
+    def shift(self, type, value, newstate, context):
+        """Shift a token.  (Internal)"""
+        dfa, state, node = self.stack[-1]
+        newnode = (type, value, context, None)
+        newnode = self.convert(self.grammar, newnode)
+        if newnode is not None:
+            node[-1].append(newnode)
+        self.stack[-1] = (dfa, newstate, node)
+
+    def push(self, type, newdfa, newstate, context):
+        """Push a nonterminal.  (Internal)"""
+        dfa, state, node = self.stack[-1]
+        newnode = (type, None, context, [])
+        self.stack[-1] = (dfa, newstate, node)
+        self.stack.append((newdfa, 0, newnode))
+
+    def pop(self):
+        """Pop a nonterminal.  (Internal)"""
+        popdfa, popstate, popnode = self.stack.pop()
+        newnode = self.convert(self.grammar, popnode)
+        if newnode is not None:
+            if self.stack:
+                dfa, state, node = self.stack[-1]
+                node[-1].append(newnode)
+            else:
+                self.rootnode = newnode
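
When a token cannot be shifted, addtoken() raises ParseError with the
offending token and its context attached; the parser instance cannot be
reused until setup() is called again.  A sketch of catching the error
through the driver, assuming the Python grammar tables from above
(illustrative only):

    from pgen2 import driver, parse

    g = driver.load_grammar("Grammar.txt")
    d = driver.Driver(g)
    try:
        d.parse_string("return =\n")
    except parse.ParseError, err:
        # err.type/err.value/err.context identify the bad token;
        # context here is the (lineno, column) pair from tokenize.
        print "ParseError:", err.msg, err.value, err.context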

Added: sandbox/trunk/2to3/pgen2/pgen.py
==============================================================================
--- (empty file)
+++ sandbox/trunk/2to3/pgen2/pgen.py	Wed Nov 29 18:38:40 2006
@@ -0,0 +1,384 @@
+# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
+# Licensed to PSF under a Contributor Agreement.
+
+# Python imports
+import token
+import tokenize
+
+# Pgen imports
+from pgen2 import grammar
+
+class PgenGrammar(grammar.Grammar):
+    pass
+
+class ParserGenerator(object):
+
+    def __init__(self, filename, stream=None):
+        if stream is None:
+            stream = open(filename)
+        self.filename = filename
+        self.stream = stream
+        self.generator = tokenize.generate_tokens(stream.readline)
+        self.gettoken() # Initialize lookahead
+        self.dfas, self.startsymbol = self.parse()
+        self.first = {} # map from symbol name to set of tokens
+        self.addfirstsets()
+
+    def make_grammar(self):
+        c = PgenGrammar()
+        names = self.dfas.keys()
+        names.sort()
+        names.remove(self.startsymbol)
+        names.insert(0, self.startsymbol)
+        for name in names:
+            i = 256 + len(c.symbol2number)
+            c.symbol2number[name] = i
+            c.number2symbol[i] = name
+        for name in names:
+            dfa = self.dfas[name]
+            states = []
+            for state in dfa:
+                arcs = []
+                for label, next in state.arcs.iteritems():
+                    arcs.append((self.make_label(c, label), dfa.index(next)))
+                if state.isfinal:
+                    arcs.append((0, dfa.index(state)))
+                states.append(arcs)
+            c.states.append(states)
+            c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
+        c.start = c.symbol2number[self.startsymbol]
+        return c
+
+    def make_first(self, c, name):
+        rawfirst = self.first[name]
+        first = {}
+        for label in rawfirst:
+            ilabel = self.make_label(c, label)
+            ##assert ilabel not in first # XXX failed on <> ... !=
+            first[ilabel] = 1
+        return first
+
+    def make_label(self, c, label):
+        # XXX Maybe this should be a method on a subclass of converter?
+        ilabel = len(c.labels)
+        if label[0].isalpha():
+            # Either a symbol name or a named token
+            if label in c.symbol2number:
+                # A symbol name (a non-terminal)
+                if label in c.symbol2label:
+                    return c.symbol2label[label]
+                else:
+                    c.labels.append((c.symbol2number[label], None))
+                    c.symbol2label[label] = ilabel
+                    return ilabel
+            else:
+                # A named token (NAME, NUMBER, STRING)
+                itoken = getattr(token, label, None)
+                assert isinstance(itoken, int), label
+                assert itoken in token.tok_name, label
+                if itoken in c.tokens:
+                    return c.tokens[itoken]
+                else:
+                    c.labels.append((itoken, None))
+                    c.tokens[itoken] = ilabel
+                    return ilabel
+        else:
+            # Either a keyword or an operator
+            assert label[0] in ('"', "'"), label
+            value = eval(label)
+            if value[0].isalpha():
+                # A keyword
+                if value in c.keywords:
+                    return c.keywords[value]
+                else:
+                    c.labels.append((token.NAME, value))
+                    c.keywords[value] = ilabel
+                    return ilabel
+            else:
+                # An operator (any non-numeric token)
+                itoken = grammar.opmap[value] # Fails if unknown token
+                if itoken in c.tokens:
+                    return c.tokens[itoken]
+                else:
+                    c.labels.append((itoken, None))
+                    c.tokens[itoken] = ilabel
+                    return ilabel
+
+    def addfirstsets(self):
+        names = self.dfas.keys()
+        names.sort()
+        for name in names:
+            if name not in self.first:
+                self.calcfirst(name)
+            #print name, self.first[name].keys()
+
+    def calcfirst(self, name):
+        dfa = self.dfas[name]
+        self.first[name] = None # dummy to detect left recursion
+        state = dfa[0]
+        totalset = {}
+        overlapcheck = {}
+        for label, next in state.arcs.iteritems():
+            if label in self.dfas:
+                if label in self.first:
+                    fset = self.first[label]
+                    if fset is None:
+                        raise ValueError("recursion for rule %r" % name)
+                else:
+                    self.calcfirst(label)
+                    fset = self.first[label]
+                totalset.update(fset)
+                overlapcheck[label] = fset
+            else:
+                totalset[label] = 1
+                overlapcheck[label] = {label: 1}
+        inverse = {}
+        for label, itsfirst in overlapcheck.iteritems():
+            for symbol in itsfirst:
+                if symbol in inverse:
+                    raise ValueError("rule %s is ambiguous; %s is in the"
+                                     " first sets of %s as well as %s" %
+                                     (name, symbol, label, inverse[symbol]))
+                inverse[symbol] = label
+        self.first[name] = totalset
+
+    def parse(self):
+        dfas = {}
+        startsymbol = None
+        # MSTART: (NEWLINE | RULE)* ENDMARKER
+        while self.type != token.ENDMARKER:
+            while self.type == token.NEWLINE:
+                self.gettoken()
+            # RULE: NAME ':' RHS NEWLINE
+            name = self.expect(token.NAME)
+            self.expect(token.OP, ":")
+            a, z = self.parse_rhs()
+            self.expect(token.NEWLINE)
+            #self.dump_nfa(name, a, z)
+            dfa = self.make_dfa(a, z)
+            #self.dump_dfa(name, dfa)
+            oldlen = len(dfa)
+            self.simplify_dfa(dfa)
+            newlen = len(dfa)
+            dfas[name] = dfa
+            #print name, oldlen, newlen
+            if startsymbol is None:
+                startsymbol = name
+        return dfas, startsymbol
+
+    def make_dfa(self, start, finish):
+        # To turn an NFA into a DFA, we define the states of the DFA
+        # to correspond to *sets* of states of the NFA.  Then do some
+        # state reduction.  Let's represent sets as dicts with 1 for
+        # values.
+        assert isinstance(start, NFAState)
+        assert isinstance(finish, NFAState)
+        def closure(state):
+            base = {}
+            addclosure(state, base)
+            return base
+        def addclosure(state, base):
+            assert isinstance(state, NFAState)
+            if state in base:
+                return
+            base[state] = 1
+            for label, next in state.arcs:
+                if label is None:
+                    addclosure(next, base)
+        states = [DFAState(closure(start), finish)]
+        for state in states: # NB states grows while we're iterating
+            arcs = {}
+            for nfastate in state.nfaset:
+                for label, next in nfastate.arcs:
+                    if label is not None:
+                        addclosure(next, arcs.setdefault(label, {}))
+            for label, nfaset in arcs.iteritems():
+                for st in states:
+                    if st.nfaset == nfaset:
+                        break
+                else:
+                    st = DFAState(nfaset, finish)
+                    states.append(st)
+                state.addarc(st, label)
+        return states # List of DFAState instances; first one is start
+
+    def dump_nfa(self, name, start, finish):
+        print "Dump of NFA for", name
+        todo = [start]
+        for i, state in enumerate(todo):
+            print "  State", i, state is finish and "(final)" or ""
+            for label, next in state.arcs:
+                if next in todo:
+                    j = todo.index(next)
+                else:
+                    j = len(todo)
+                    todo.append(next)
+                if label is None:
+                    print "    -> %d" % j
+                else:
+                    print "    %s -> %d" % (label, j)
+
+    def dump_dfa(self, name, dfa):
+        print "Dump of DFA for", name
+        for i, state in enumerate(dfa):
+            print "  State", i, state.isfinal and "(final)" or ""
+            for label, next in state.arcs.iteritems():
+                print "    %s -> %d" % (label, dfa.index(next))
+
+    def simplify_dfa(self, dfa):
+        # This is not theoretically optimal, but works well enough.
+        # Algorithm: repeatedly look for two states that have the same
+        # set of arcs (same labels pointing to the same nodes) and
+        # unify them, until things stop changing.
+
+        # dfa is a list of DFAState instances
+        changes = True
+        while changes:
+            changes = False
+            for i, state_i in enumerate(dfa):
+                for j in range(i+1, len(dfa)):
+                    state_j = dfa[j]
+                    if state_i == state_j:
+                        #print "  unify", i, j
+                        del dfa[j]
+                        for state in dfa:
+                            state.unifystate(state_j, state_i)
+                        changes = True
+                        break
+
+    def parse_rhs(self):
+        # RHS: ALT ('|' ALT)*
+        a, z = self.parse_alt()
+        if self.value != "|":
+            return a, z
+        else:
+            aa = NFAState()
+            zz = NFAState()
+            aa.addarc(a)
+            z.addarc(zz)
+            while self.value == "|":
+                self.gettoken()
+                a, z = self.parse_alt()
+                aa.addarc(a)
+                z.addarc(zz)
+            return aa, zz
+
+    def parse_alt(self):
+        # ALT: ITEM+
+        a, b = self.parse_item()
+        while (self.value in ("(", "[") or
+               self.type in (token.NAME, token.STRING)):
+            c, d = self.parse_item()
+            b.addarc(c)
+            b = d
+        return a, b
+
+    def parse_item(self):
+        # ITEM: '[' RHS ']' | ATOM ['+' | '*']
+        if self.value == "[":
+            self.gettoken()
+            a, z = self.parse_rhs()
+            self.expect(token.OP, "]")
+            a.addarc(z)
+            return a, z
+        else:
+            a, z = self.parse_atom()
+            value = self.value
+            if value not in ("+", "*"):
+                return a, z
+            self.gettoken()
+            z.addarc(a)
+            if value == "+":
+                return a, z
+            else:
+                return a, a
+
+    def parse_atom(self):
+        # ATOM: '(' RHS ')' | NAME | STRING
+        if self.value == "(":
+            self.gettoken()
+            a, z = self.parse_rhs()
+            self.expect(token.OP, ")")
+            return a, z
+        elif self.type in (token.NAME, token.STRING):
+            a = NFAState()
+            z = NFAState()
+            a.addarc(z, self.value)
+            self.gettoken()
+            return a, z
+        else:
+            self.raise_error("expected (...) or NAME or STRING, got %s/%s",
+                             self.type, self.value)
+
+    def expect(self, type, value=None):
+        if self.type != type or (value is not None and self.value != value):
+            self.raise_error("expected %s/%s, got %s/%s",
+                             type, value, self.type, self.value)
+        value = self.value
+        self.gettoken()
+        return value
+
+    def gettoken(self):
+        tup = self.generator.next()
+        while tup[0] in (tokenize.COMMENT, tokenize.NL):
+            tup = self.generator.next()
+        self.type, self.value, self.begin, self.end, self.line = tup
+        #print token.tok_name[self.type], repr(self.value)
+
+    def raise_error(self, msg, *args):
+        if args:
+            try:
+                msg = msg % args
+            except:
+                msg = " ".join([msg] + map(str, args))
+        raise SyntaxError(msg, (self.filename, self.end[0],
+                                self.end[1], self.line))
+
+class NFAState(object):
+
+    def __init__(self):
+        self.arcs = [] # list of (label, NFAState) pairs
+
+    def addarc(self, next, label=None):
+        assert label is None or isinstance(label, str)
+        assert isinstance(next, NFAState)
+        self.arcs.append((label, next))
+
+class DFAState(object):
+
+    def __init__(self, nfaset, final):
+        assert isinstance(nfaset, dict)
+        assert isinstance(iter(nfaset).next(), NFAState)
+        assert isinstance(final, NFAState)
+        self.nfaset = nfaset
+        self.isfinal = final in nfaset
+        self.arcs = {} # map from label to DFAState
+
+    def addarc(self, next, label):
+        assert isinstance(label, str)
+        assert label not in self.arcs
+        assert isinstance(next, DFAState)
+        self.arcs[label] = next
+
+    def unifystate(self, old, new):
+        for label, next in self.arcs.iteritems():
+            if next is old:
+                self.arcs[label] = new
+
+    def __eq__(self, other):
+        # Equality test -- ignore the nfaset instance variable
+        assert isinstance(other, DFAState)
+        if self.isfinal != other.isfinal:
+            return False
+        # Can't just return self.arcs == other.arcs, because that
+        # would invoke this method recursively, with cycles...
+        if len(self.arcs) != len(other.arcs):
+            return False
+        for label, next in self.arcs.iteritems():
+            if next is not other.arcs.get(label):
+                return False
+        return True
+
+def generate_grammar(filename="Grammar.txt"):
+    p = ParserGenerator(filename)
+    return p.make_grammar()
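
An aside for readers following the parser generator above: make_dfa() is a
textbook subset construction, where each DFA state stands for the set of NFA
states reachable through epsilon (label None) arcs.  The sketch below is not
part of the commit; it redoes the same construction on a tiny hand-written NFA
with plain dicts and sets so the idea is easier to see in isolation.

    # Illustrative sketch only -- the NFA here is made up for the example.
    # NFA: state -> list of (label, next_state); a None label is an epsilon arc.
    NFA = {
        0: [(None, 1), ("a", 2)],
        1: [("b", 2)],
        2: [],
    }

    def closure(states):
        # Epsilon-closure: every state reachable via None-labeled arcs.
        result = set(states)
        stack = list(states)
        while stack:
            state = stack.pop()
            for label, next in NFA[state]:
                if label is None and next not in result:
                    result.add(next)
                    stack.append(next)
        return frozenset(result)

    def subset_construction(start):
        # Each DFA state is a frozenset of NFA states, as in make_dfa() above.
        start_set = closure([start])
        dfa = {start_set: {}}
        todo = [start_set]
        while todo:
            current = todo.pop()
            arcs = {}
            for state in current:
                for label, next in NFA[state]:
                    if label is not None:
                        arcs.setdefault(label, set()).add(next)
            for label, targets in arcs.items():
                target = closure(targets)
                if target not in dfa:
                    dfa[target] = {}
                    todo.append(target)
                dfa[current][label] = target
        return dfa

    print subset_construction(0)
    # {frozenset([0, 1]): {'a': frozenset([2]), 'b': frozenset([2])},
    #  frozenset([2]): {}}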

Added: sandbox/trunk/2to3/pgen2/python.py
==============================================================================
--- (empty file)
+++ sandbox/trunk/2to3/pgen2/python.py	Wed Nov 29 18:38:40 2006
@@ -0,0 +1,9 @@
+#!/usr/bin/python2.5
+# Copyright 2006 Python Software Foundation.  All rights reserved.
+
+"""Full Python grammar"""
+
+from __future__ import with_statement
+
+__author__ = "Guido van Rossum <guido at python.org>"
+

Added: sandbox/trunk/2to3/pgen2/test.py
==============================================================================
--- (empty file)
+++ sandbox/trunk/2to3/pgen2/test.py	Wed Nov 29 18:38:40 2006
@@ -0,0 +1,18 @@
+# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
+# Licensed to PSF under a Contributor Agreement.
+
+def test():
+    import sys
+    sys.path[0] = ".."
+    from pgen2 import astnode, driver
+    f = open("Grammar.txt", "w")
+    try:
+        astnode.generate_grammar(f)
+    finally:
+        f.close()
+    sample = "year<=1989 ? ('Modula-3' + ABC) ** 2 : Python"
+    tree = driver.parse_string(sample, True)
+    print tree
+
+if __name__ == "__main__":
+    test()
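
The Grammar.txt that test.py regenerates comes from grammar rules embedded in
docstrings (astnode.generate_grammar); pynode.py later in this checkin
documents the convention: text between dollar signs after "Grammar:" is taken
as the rule's right-hand side for that class or function.  Here is a minimal
stand-alone sketch of that extraction; it reuses a docstring from pynode.py
and is not part of the commit:

    import re

    class Expression(object):
        "Grammar: $ BinaryExpression ['?' BinaryExpression ':' BinaryExpression] $"

    m = re.search(r"Grammar:\s*\$([^$]+)\$", Expression.__doc__)
    print "%s: %s" % (Expression.__name__, " ".join(m.group(1).split()))
    # -> Expression: BinaryExpression ['?' BinaryExpression ':' BinaryExpression]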

Added: sandbox/trunk/2to3/play.py
==============================================================================
--- (empty file)
+++ sandbox/trunk/2to3/play.py	Wed Nov 29 18:38:40 2006
@@ -0,0 +1,66 @@
+#!/usr/bin/env python2.5
+# Copyright 2006 Python Software Foundation. All Rights Reserved.
+
+"""XXX."""
+
+from __future__ import with_statement
+
+__author__ = "Guido van Rossum <guido at python.org>"
+
+# Python imports
+import os
+import sys
+import logging
+
+import pgen2
+from pgen2 import driver
+
+import pynode
+
+logging.basicConfig(level=logging.INFO)
+
+SAMPLE = """\
+for i in range(10):
+    print i
+class C(object, list):
+    def f(a, b, c=1, *args, **kwds):
+        pass
+"""
+
+def main():
+    gr = driver.load_grammar("Grammar.txt")
+    dr = driver.Driver(gr, convert=pynode.convert)
+
+##     tree = dr.parse_file("pynode.py", debug=True)
+##     print tree
+
+##     for name in sys.modules:
+##         mod = sys.modules[name]
+##         if mod is None or not hasattr(mod, "__file__"):
+##             continue
+##         fn = mod.__file__
+##         if fn.endswith(".pyc"):
+##             fn = fn[:-1]
+##         if not fn.endswith(".py"):
+##             continue
+##         print >>sys.stderr, "Parsing", fn
+##         dr.parse_file(fn, debug=True)
+
+    for dir in reversed(sys.path):
+        try:
+            names = os.listdir(dir)
+        except os.error:
+            continue
+        print >>sys.stderr, "Scanning", dir, "..."
+        for name in names:
+            if not name.endswith(".py"):
+                continue
+            print >>sys.stderr, "Parsing", name
+            try:
+                dr.parse_file(os.path.join(dir, name), debug=True)
+            except pgen2.parse.ParseError, err:
+                print "ParseError:", err
+        
+
+if __name__ == "__main__":
+    main()
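
play.py is the quickest way to see the pieces fit together: load the grammar,
wrap it in a Driver with pynode.convert as the conversion callback, and parse
files.  Stripped of the sys.path walk, the minimal set-up looks like the
sketch below (not part of the commit; it assumes it is run from the 2to3
sandbox directory so Grammar.txt and pynode.py can be found):

    import logging

    from pgen2 import driver
    import pynode

    logging.basicConfig(level=logging.INFO)

    gr = driver.load_grammar("Grammar.txt")          # Grammar.txt sits at the top of the sandbox
    dr = driver.Driver(gr, convert=pynode.convert)   # convert hook builds pynode syntax tree nodes
    tree = dr.parse_file("pynode.py", debug=True)
    print tree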

Added: sandbox/trunk/2to3/pynode.py
==============================================================================
--- (empty file)
+++ sandbox/trunk/2to3/pynode.py	Wed Nov 29 18:38:40 2006
@@ -0,0 +1,698 @@
+# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
+# Licensed to PSF under a Contributor Agreement.
+
+# Modifications:
+# Copyright 2006 Python Software Foundation. All Rights Reserved.
+
+"""Syntax tree node definitions.
+
+There is a class or function corresponding to each terminal and
+nonterminal symbol.
+
+The grammar is pieced together from the docstrings of the
+corresponding classes and functions: text between dollar signs is
+assumed to be the right-hand side of the grammar rule corresponding to
+that class or function.
+
+We use __slots__ to make the parse tree nodes as small as possible.
+
+NOTE: EVERY CLASS MUST HAVE A __slots__ DEFINITION, EVEN IF EMPTY!
+(If not, the instances will get a __dict__, defeating the purpose of
+__slots__ in our case.)
+"""
+
+__author__ = "Guido van Rossum <guido at python.org>"
+
+# Python imports
+import token
+import logging
+
+# Pgen imports
+from pgen2 import grammar
+
+# Custom logger
+logger = logging.getLogger()
+
+class Node(object):
+    """Abstract base class for all nodes.
+
+    This has no attributes except a context slot which holds the line
+    number (or more detailed context info).  In the future this might
+    change to several slots (e.g. filename, lineno, column, or
+    even filename, start_lineno, start_column, end_lineno,
+    end_column).  The context is only referenced in two places: the
+    part of the code that sticks it in, and the part of the code that
+    reports errors.
+
+    In order to reduce the amount of boilerplate code, the context
+    argument is handled by __new__ rather than __init__.  There are
+    also a few subclasses that override __new__ to sometimes avoid
+    constructing an instance.
+
+    """
+
+    __slots__ = ["context"]
+
+    def __new__(cls, context, *rest):
+        assert cls not in (Node, Nonterminal, Terminal, Constant)
+        obj = object.__new__(cls)
+        obj.context = context
+        return obj
+
+    _stretch = False # Set to true to stretch the repr() vertically
+
+    def __repr__(self, repr=repr):
+        stretch = self._stretch
+        r = [self.__class__.__name__]
+        if stretch:
+            r.append("(\n    ")
+        else:
+            r.append("(") # ")" -- crutch for Emacs python-mode :-(
+        cls = self.__class__
+        # Get the nearest non-empty __slots__.  This assumes
+        # *something* has non-empty __slots__ before we reach object
+        # (which has no __slots__).  The class hierarchy guarantees
+        # this.
+        slots = cls.__slots__
+        while not slots:
+            cls = cls.__base__
+            slots = cls.__slots__
+        first = True
+        for name in slots:
+            if name == "context":
+                continue # Skip this
+            if first:
+                first = False
+            else:
+                if stretch:
+                    r.append(",\n    ")
+                else:
+                    r.append(", ")
+            try:
+                value = getattr(self, name)
+            except AttributeError:
+                continue
+            if stretch and isinstance(value, list):
+                rr = map(repr, value)
+                rv = "[" + ",\n ".join(rr) + "]"
+            else:
+                rv = repr(value)
+            if stretch:
+                rv = rv.replace("\n", "\n    ")
+            r.append(rv)
+        r.append(")")
+        return "".join(r)
+
+    def __str__(self):
+        return self.__repr__(repr=str)
+
+class Nonterminal(Node):
+    """Abstract base class for nonterminal symbols.
+
+    Nothing beyond Node.
+
+    """
+
+    __slots__ = []
+
+    _stretch = True
+
+class Terminal(Node):
+    """Abstract base class for terminal symbols.
+
+    Nothing beyond Node.
+
+    """
+
+    __slots__ = []
+
+class Series(Nonterminal):
+    """Abstract base class for nonterminals like stmts: stmt+."""
+
+    __slots__ = []
+
+    def __new__(cls, context, *nodes):
+        assert cls is not Series
+        if len(nodes) == 0:
+            return None
+        elif len(nodes) == 1:
+            return nodes[0]
+        else:
+            obj = Nonterminal.__new__(cls, context)
+            obj.initseries(nodes)
+            return obj
+
+class Constant(Terminal):
+    """Abstract base class for constants (e.g. number or string).
+
+    Attributes:
+
+    repr -- a string giving the token, exactly as read from source
+
+    """
+
+    __slots__ = ["repr"]
+
+    def __init__(self, context, repr):
+        self.repr = repr
+
+    def __str__(self):
+        return self.repr
+
+# Node classes for terminal symbols
+
+class Name(Terminal):
+    """Name (e.g. a variable name or an attribute name).
+
+    Attributes:
+
+    name -- a string giving the name.
+
+    """
+
+    __slots__ = ["name"]
+
+    def __init__(self, context, name):
+        self.name = name
+
+    def __str__(self):
+        return self.name
+
+class Number(Constant):
+    """Numeric constant.
+
+    Nothing beyond Constant.
+
+    """
+
+    __slots__ = []
+
+class String(Constant):
+    """String constant.
+
+    Nothing beyond Constant.
+
+    """
+
+    __slots__ = []
+
+# Nodes and factory functions for Python grammar
+
+class VanishingNonterminal(Nonterminal):
+    __slots__ = []
+    def __new__(cls, context, node):
+        return node
+
+class GenericSeries(Series):
+    __slots__ = ["nodes"]
+    def initseries(self, nodes):
+        self.nodes = nodes
+
+class atom(GenericSeries):
+    __slots__ = []
+
+class power(GenericSeries):
+    __slots__ = []
+
+class factor(GenericSeries):
+    __slots__ = []
+
+class term(GenericSeries):
+    __slots__ = []
+
+class arith_expr(GenericSeries):
+    __slots__ = []
+
+class shift_expr(GenericSeries):
+    __slots__ = []
+
+class and_expr(GenericSeries):
+    __slots__ = []
+
+class xor_expr(GenericSeries):
+    __slots__ = []
+
+class or_expr(GenericSeries):
+    __slots__ = []
+
+class expr(GenericSeries):
+    __slots__ = []
+
+class comparison(GenericSeries):
+    __slots__ = []
+
+class not_test(GenericSeries):
+    __slots__ = []
+
+class and_test(GenericSeries):
+    __slots__ = []
+
+class or_test(GenericSeries):
+    __slots__ = []
+
+class test(GenericSeries):
+    __slots__ = []
+
+class testlist(GenericSeries):
+    __slots__ = []
+
+class expr_stmt(GenericSeries):
+    __slots__ = []
+
+class trailer(GenericSeries):
+    __slots__ = []
+
+class argument(GenericSeries):
+    __slots__ = []
+
+class arglist(GenericSeries):
+    __slots__ = []
+
+class subscript(GenericSeries):
+    __slots__ = []
+
+class subscriptlist(GenericSeries):
+    __slots__ = []
+
+class listmaker(GenericSeries):
+    __slots__ = []
+
+class testlist_gexp(GenericSeries):
+    __slots__ = []
+
+class suite(GenericSeries):
+    __slots__ = []
+
+class if_stmt(GenericSeries):
+    __slots__ = []
+
+class compound_stmt(GenericSeries):
+    __slots__ = []
+
+class parameters(GenericSeries):
+    __slots__ = []
+
+class funcdef(GenericSeries):
+    __slots__ = []
+
+class fpdef(GenericSeries):
+    __slots__ = []
+
+class varargslist(GenericSeries):
+    __slots__ = []
+
+class classdef(GenericSeries):
+    __slots__ = []
+
+class exprlist(GenericSeries):
+    __slots__ = []
+
+class print_stmt(GenericSeries):
+    __slots__ = []
+
+class for_stmt(GenericSeries):
+    __slots__ = []
+
+class dotted_name(GenericSeries):
+    __slots__ = []
+
+class dotted_as_name(GenericSeries):
+    __slots__ = []
+
+class dotted_as_names(GenericSeries):
+    __slots__ = []
+
+class import_as_names(GenericSeries):
+    __slots__ = []
+
+class import_as_name(GenericSeries):
+    __slots__ = []
+
+class import_name(GenericSeries):
+    __slots__ = []
+
+class import_from(GenericSeries):
+    __slots__ = []
+
+class import_stmt(GenericSeries):
+    __slots__ = []
+
+class comp_op(GenericSeries):
+    __slots__ = []
+
+class assert_stmt(GenericSeries):
+    __slots__ = []
+
+class return_stmt(GenericSeries):
+    __slots__ = []
+
+class continue_stmt(GenericSeries):
+    __slots__ = []
+
+class break_stmt(GenericSeries):
+    __slots__ = []
+
+class flow_stmt(GenericSeries):
+    __slots__ = []
+
+class while_stmt(GenericSeries):
+    __slots__ = []
+
+class except_clause(GenericSeries):
+    __slots__ = []
+
+class try_stmt(GenericSeries):
+    __slots__ = []
+
+class dictmaker(GenericSeries):
+    __slots__ = []
+
+class raise_stmt(GenericSeries):
+    __slots__ = []
+
+class del_stmt(GenericSeries):
+    __slots__ = []
+
+class exec_stmt(GenericSeries):
+    __slots__ = []
+
+class augassign(GenericSeries):
+    __slots__ = []
+
+class global_stmt(GenericSeries):
+    __slots__ = []
+
+class fplist(GenericSeries):
+    __slots__ = []
+
+class lambdef(GenericSeries):
+    __slots__ = []
+
+class old_test(GenericSeries):
+    __slots__ = []
+
+class testlist_safe(GenericSeries):
+    __slots__ = []
+
+class list_for(GenericSeries):
+    __slots__ = []
+
+class decorator(GenericSeries):
+    __slots__ = []
+
+class decorators(GenericSeries):
+    __slots__ = []
+
+class yield_expr(GenericSeries):
+    __slots__ = []
+
+class yield_stmt(GenericSeries):
+    __slots__ = []
+
+class list_if(GenericSeries):
+    __slots__ = []
+
+class list_iter(GenericSeries):
+    __slots__ = []
+
+class gen_for(GenericSeries):
+    __slots__ = []
+
+class gen_iter(GenericSeries):
+    __slots__ = []
+
+class gen_if(GenericSeries):
+    __slots__ = []
+
+class with_var(GenericSeries):
+    __slots__ = []
+
+class with_stmt(GenericSeries):
+    __slots__ = []
+
+class sliceop(GenericSeries):
+    __slots__ = []
+
+class testlist1(GenericSeries):
+    __slots__ = []
+
+
+def _transparent(context, node, *rest):
+    assert rest == (), (context, node, rest)
+    return node
+
+pass_stmt = _transparent
+small_stmt = _transparent
+stmt = _transparent
+
+class simple_stmt(GenericSeries):
+    __slots__ = []
+
+class file_input(GenericSeries):
+    __slots__ = []
+
+
+# Example nodes and factory functions for nonterminal symbols
+# XXX: get rid of these
+
+def Program(context, stmts):
+    """Program is a nonterminal with only one non-trivial child.
+
+    Grammar: $ Expression ENDMARKER $
+
+    """
+    return stmts
+
+class Expression(Series):
+    "Grammar: $ BinaryExpression ['?' BinaryExpression ':' BinaryExpression] $"
+
+    __slots__ = ["test", "left", "right"]
+
+    def initseries(self, nodes):
+        self.test, self.left, self.right = nodes
+
+    def __str__(self):
+        return "%s ? %s : %s" % (self.test, self.left, self.right)
+
+class BinaryExpression(Series):
+    "Grammar: $ Operand (Operator Operand)* $"
+
+    __slots__ = ["left", "op", "right"]
+
+    def initseries(self, stuff):
+        # Stuff is a list with alternating operands and operators
+        if len(stuff) == 3:
+            self.left, self.op, self.right = stuff
+            return
+        assert len(stuff) > 1 and len(stuff) % 2 == 1
+        # Find the rightmost lowest-priority operator
+        lowest_i = 1
+        lowest_op = stuff[lowest_i]
+        lowest_pri = lowest_op.priority()
+        for i in range(3, len(stuff), 2):
+            op = stuff[i]
+            pri = op.priority()
+            if pri <= lowest_pri:
+                lowest_i = i
+                lowest_op = op
+                lowest_pri = pri
+        self.left = self.__class__(self.context, *stuff[:lowest_i])
+        self.op = lowest_op
+        self.right = self.__class__(self.context, *stuff[lowest_i+1:])
+
+    def __str__(self):
+        return "(%s %s %s)" % (self.left, self.op, self.right)
+
+def Operand(context, arg):
+    """Operand is a nonterminal with one child.
+
+    Grammar: $ Atom | UnaryExpression $
+
+    """
+    return arg
+
+class UnaryExpression(Nonterminal):
+    "Grammar: $ UnaryOperator Operand $"
+
+    __slots__ = ["op", "arg"]
+
+    def __init__(self, context, op, arg):
+        self.op = op
+        self.arg = arg
+
+    def __str__(self):
+        return "%s%s" % (self.op, self.arg)
+
+def UnaryOperator(context, op):
+    "Grammar: $ 'not' | '-' $"
+    return op
+
+class Operator(Nonterminal):
+    """Operator.
+
+    This has a repr slot and a priority() method.
+
+    The __new__ method implements a sort-of singleton pattern: there's
+    only one instance per repr value.  (Yes, this means the context is
+    not useful.  Therefore we set it to None.)
+
+    Grammar: $ '**' | '*' | '/' | '+' | '-' | '&' | '^' | '|' |
+          ['not'] 'in' | '==' | '<' | '>' | '!=' | '<=' | '>=' |
+          'and' | 'or' $
+
+    """
+
+    __slots__ = ["repr"]
+
+    _stretch = False
+
+    _cache = {}
+
+    def __new__(cls, context, *args):
+        repr = " ".join(args)
+        # For "not in", the argument should be the string "not in"
+        obj = cls._cache.get(repr)
+        if obj is None:
+            obj = Terminal.__new__(cls, None)
+            obj.repr = repr
+            cls._cache[repr] = obj  # remember the instance so it really is a per-repr singleton
+        return obj
+
+    def __str__(self):
+        return self.repr
+
+    def priority(self):
+        return self._priorities[self.repr]
+
+    _priorities = {
+        "or":  0,
+        "and": 1,
+        "in":  2,
+        "not in": 2,
+        "<":  2,
+        ">":  2,
+        "==": 2,
+        "<=": 2,
+        ">=": 2,
+        "!=": 2,
+        "|":  3,
+        "^":  4,
+        "&":  5,
+        "+":  6,
+        "-":  6,
+        "*":  7,
+        "/":  7,
+        "**": 8,
+    }
+
+def Atom(context, arg):
+    """ Grammar: $ NAME | STRING | NUMBER | ParenthesizedExpression $
+    """
+    return arg
+
+def ParenthesizedExpression(context, expr):
+    "Grammar: $ '(' Expression ')' $"
+    return expr
+
+
+# Conversion from concrete to abstract syntax trees
+
+def vanish(context, value):
+    return None
+
+token_mapping = {
+    # Tokens that become parse tree nodes
+    # (token.NAME is special-cased in the code)
+    token.NUMBER: Number,
+    token.STRING: String,
+
+##     # Tokens that vanish
+##     token.DOT: vanish,
+##     token.LPAR: vanish,
+##     token.RPAR: vanish,
+##     token.COLON: vanish,
+##     token.COMMA: vanish,
+##     token.EQUAL: vanish,
+##     token.DEDENT: vanish,
+##     token.INDENT: vanish,
+##     token.LBRACE: vanish,
+##     token.RBRACE: vanish,
+##     token.NEWLINE: vanish,
+    token.ENDMARKER: vanish,
+##     grammar.QUESTIONMARK: vanish,
+
+    # All other tokens return the token's string value (e.g. "+")
+    }
+
+vanishing_keywords = {
+    # E.g. "def": True,
+    }
+
+def convert(grammar, node):
+    type, value, context, children = node
+    # Is it a non-terminal symbol?
+    if type in grammar.number2symbol:
+        symbol = grammar.number2symbol[type]
+        factory = globals().get(symbol)
+        if factory is None:
+            raise RuntimeError("can't find factory for %s (line %s)" %
+                               (symbol, context))
+        # Debug variation:
+        try:
+            return factory(context, *children)
+        except:
+            logger.debug("%s %s", factory.__name__, "(")
+            for child in children:
+                logger.debug("%s %s", "==>", child)
+            logger.debug(")")
+            logger.debug("# Did you remember to declare a 'context' arg?")
+            raise
+        return factory(context, *children)  # unreachable: the debug variation above returns or raises
+
+    # Must be a terminal symbol.
+    if type == token.NAME:
+        # Name or keyword.  Special-case the snot out of this.
+        if value in grammar.keywords:
+            # Keywords become strings, like operators
+            if value in vanishing_keywords:
+                return None
+            else:
+                return value
+        else:
+            return Name(context, value)
+
+    # Look for a handler in the token_mapping table.
+    assert type in token.tok_name
+    factory = token_mapping.get(type)
+    if factory:
+        return factory(context, value)
+    else:
+        return value
+
+
+# Support code
+
+def generate_grammar(stream):
+    """Extract the grammar rules from this module's doc strings."""
+    import re
+    from types import ModuleType
+    lines = []
+    startline = None
+    for name, obj in globals().items():
+        if hasattr(obj, "__doc__") and not isinstance(obj, ModuleType):
+            m = re.search(r"Grammar:\s*\$([^$]+)\$", obj.__doc__ or "")
+            if m:
+                rule = obj.__name__, " ".join(m.group(1).split())
+                if rule[0] == "Program":
+                    assert not startline
+                    startline = rule
+                else:
+                    lines.append(rule)
+    lines.sort()
+    # The start symbol *must* be the first rule
+    lines.insert(0, startline)
+    for name, rhs in lines:
+        stream.write("%s: %s\n" % (name, rhs))
+
+if __name__ == '__main__':
+    import sys
+    generate_grammar(sys.stdout)
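
One piece of pynode.py worth calling out is BinaryExpression.initseries(): it
turns the flat operand/operator sequence the parser delivers into a binary
tree by repeatedly splitting at the rightmost operator with the lowest
priority, which yields the usual precedence and left-associativity.  The
sketch below (not part of the commit) performs the same split on plain
strings so the shape of the result is easy to check by hand:

    # Priorities copied from Operator._priorities above (subset shown).
    PRI = {"or": 0, "and": 1, "<": 2, "+": 6, "-": 6, "*": 7, "/": 7, "**": 8}

    def build(stuff):
        # stuff alternates operands and operators, e.g. ["1", "+", "2", "*", "3"].
        if len(stuff) == 1:
            return stuff[0]
        lowest_i = 1
        for i in range(3, len(stuff), 2):
            if PRI[stuff[i]] <= PRI[stuff[lowest_i]]:
                lowest_i = i
        return (build(stuff[:lowest_i]), stuff[lowest_i], build(stuff[lowest_i+1:]))

    print build(["1", "+", "2", "*", "3", "-", "4"])
    # -> (('1', '+', ('2', '*', '3')), '-', '4')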

