[pypy-svn] r14848 - in pypy/dist/pypy/interpreter: pyparser pyparser/test stablecompiler
ludal at codespeak.net
Thu Jul 21 12:37:33 CEST 2005
Author: ludal
Date: Thu Jul 21 12:37:26 2005
New Revision: 14848
Added:
pypy/dist/pypy/interpreter/pyparser/pysymbol.py
pypy/dist/pypy/interpreter/pyparser/pytoken.py
Modified:
pypy/dist/pypy/interpreter/pyparser/astbuilder.py
pypy/dist/pypy/interpreter/pyparser/ebnflexer.py
pypy/dist/pypy/interpreter/pyparser/ebnfparse.py
pypy/dist/pypy/interpreter/pyparser/grammar.py
pypy/dist/pypy/interpreter/pyparser/pythonlexer.py
pypy/dist/pypy/interpreter/pyparser/pythonparse.py
pypy/dist/pypy/interpreter/pyparser/pythonutil.py
pypy/dist/pypy/interpreter/pyparser/pytokenize.py
pypy/dist/pypy/interpreter/pyparser/syntaxtree.py
pypy/dist/pypy/interpreter/pyparser/test/test_lookahead.py
pypy/dist/pypy/interpreter/pyparser/test/test_pytokenizer.py
pypy/dist/pypy/interpreter/pyparser/tuplebuilder.py
pypy/dist/pypy/interpreter/stablecompiler/transformer.py
Log:
- introduced interpreter.pyparser.pysymbol and interpreter.pyparser.pytoken, which are PyPy versions of
the symbol and token modules of CPython
- reworked the whole parser so that rules and tokens are represented by their integer values instead of by strings;
all tests pass, but this might still break some things...
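To make the rework concrete, here is a minimal sketch (not part of this
commit; names simplified) of the interning scheme behind pysymbol/pytoken:
every rule or token name is mapped once to an integer code, and the builders
then compare codes and look up handlers by code instead of by string name.

    # minimal sketch, assuming a fresh mapping; the real codes start
    # above N_TOKENS (see pysymbol.py below)
    sym_name = {}     # code -> name
    sym_values = {}   # name -> code
    _count = 0

    def add_symbol(name):
        global _count
        if name not in sym_values:
            sym_values[name] = _count
            sym_name[_count] = name
            _count += 1
        return sym_values[name]

    def build_atom(builder, nb):      # stand-in for the real handler
        pass

    atom = add_symbol("atom")         # hypothetical registration
    ASTRULES = {atom: build_atom}     # handlers keyed by integer code
    # a builder then dispatches with ASTRULES.get(rule.codename)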
Modified: pypy/dist/pypy/interpreter/pyparser/astbuilder.py
==============================================================================
--- pypy/dist/pypy/interpreter/pyparser/astbuilder.py (original)
+++ pypy/dist/pypy/interpreter/pyparser/astbuilder.py Thu Jul 21 12:37:26 2005
@@ -2,6 +2,8 @@
from grammar import BaseGrammarBuilder
from pypy.interpreter.astcompiler import ast, consts
+import pypy.interpreter.pyparser.pysymbol as sym
+import pypy.interpreter.pyparser.pytoken as tok
## these tests should be methods of the ast objects
@@ -66,15 +68,15 @@
L = get_atoms( builder, nb )
top = L[0]
if isinstance(top, TokenObject):
- if top.name == "(":
+ if top.name == tok.LPAR:
builder.push( ast.Tuple( L[1:-1], top.line) )
- elif top.name == "[":
+ elif top.name == tok.LSQB:
builder.push( ast.List( L[1:-1], top.line) )
- elif top.name == "{":
+ elif top.name == tok.LBRACE:
builder.push( ast.Dict( L[1:-1], top.line) )
- elif top.name == "NAME":
+ elif top.name == tok.NAME:
builder.push( ast.Name(top.value) )
- elif top.name == "NUMBER":
+ elif top.name == tok.NUMBER:
builder.push( ast.Const(eval(top.value)) )
else:
raise ValueError, "unexpected tokens (%d): %s" % (nb,[ str(i) for i in L] )
@@ -94,11 +96,11 @@
if len(L) == 1:
builder.push( L[0] )
elif len(L) == 2 and isinstance(L[0],TokenObject):
- if L[0].name == "+":
+ if L[0].name == tok.PLUS:
builder.push( ast.UnaryAdd( L[1] ) )
- if L[0].name == "-":
+ if L[0].name == tok.MINUS:
builder.push( ast.UnarySub( L[1] ) )
- if L[0].name == "~":
+ if L[0].name == tok.TILDE:
builder.push( ast.Invert( L[1] ) )
def build_term( builder, nb ):
@@ -108,13 +110,13 @@
for i in range(2,l,2):
right = L[i]
op = L[i-1].name
- if op == "*":
+ if op == tok.STAR:
left = ast.Mul( [ left, right ] )
- elif op == "/":
+ elif op == tok.SLASH:
left = ast.Div( [ left, right ] )
- elif op == "%":
+ elif op == tok.PERCENT:
left = ast.Mod( [ left, right ] )
- elif op == "//":
+ elif op == tok.DOUBLESLASH:
left = ast.FloorDiv( [ left, right ] )
else:
raise ValueError, "unexpected token: %s" % L[i-1]
@@ -127,9 +129,9 @@
for i in range(2,l,2):
right = L[i]
op = L[i-1].name
- if op == "+":
+ if op == tok.PLUS:
left = ast.Add( [ left, right ] )
- elif op == "-":
+ elif op == tok.MINUS:
left = ast.Sub( [ left, right ] )
else:
raise ValueError, "unexpected token: %s" % L[i-1]
@@ -142,9 +144,9 @@
for i in range(2,l,2):
right = L[i]
op = L[i-1].name
- if op == "<<":
+ if op == tok.LEFTSHIFT:
left = ast.LeftShift( [ left, right ] )
- elif op == ">>":
+ elif op == tok.RIGHTSHIFT:
left = ast.RightShift( [ left, right ] )
else:
raise ValueError, "unexpected token: %s" % L[i-1]
@@ -204,7 +206,7 @@
builder.push( ast.Discard( L[0] ) )
return
op = L[1]
- if op.name == '=':
+ if op.name == tok.EQUAL:
nodes = []
for i in range(0,l-2,2):
lvalue = to_lvalue( L[i], consts.OP_ASSIGN )
@@ -246,23 +248,23 @@
ASTRULES = {
# "single_input" : build_single_input,
- "atom" : build_atom,
- "power" : build_power,
- "factor" : build_factor,
- "term" : build_term,
- "arith_expr" : build_arith_expr,
- "shift_expr" : build_shift_expr,
- "and_expr" : build_and_expr,
- "xor_expr" : build_xor_expr,
- "expr" : build_expr,
- "comparison" : build_comparison,
- "and_test" : build_and_test,
- "test" : build_test,
- "testlist" : build_testlist,
- "expr_stmt" : build_expr_stmt,
- "small_stmt" : return_one,
- "simple_stmt" : build_simple_stmt,
- "single_input" : build_single_input,
+ sym.atom : build_atom,
+ sym.power : build_power,
+ sym.factor : build_factor,
+ sym.term : build_term,
+ sym.arith_expr : build_arith_expr,
+ sym.shift_expr : build_shift_expr,
+ sym.and_expr : build_and_expr,
+ sym.xor_expr : build_xor_expr,
+ sym.expr : build_expr,
+ sym.comparison : build_comparison,
+ sym.and_test : build_and_test,
+ sym.test : build_test,
+ sym.testlist : build_testlist,
+ sym.expr_stmt : build_expr_stmt,
+ sym.small_stmt : return_one,
+ sym.simple_stmt : build_simple_stmt,
+ sym.single_input : build_single_input,
}
class RuleObject(ast.Node):
@@ -312,26 +314,26 @@
def alternative( self, rule, source ):
# Do nothing, keep rule on top of the stack
if rule.is_root():
- print "ALT:",rule.name
- F = ASTRULES.get(rule.name)
+ print "ALT:", sym.sym_name[rule.codename], rule.codename
+ F = ASTRULES.get(rule.codename)
if F:
F( self, 1 )
else:
- self.push_rule( rule.name, 1, source )
+ self.push_rule( rule.codename, 1, source )
return True
def sequence(self, rule, source, elts_number):
""" """
if rule.is_root():
- print "SEQ:", rule.name
- F = ASTRULES.get(rule.name)
+ print "SEQ:", sym.sym_name[rule.codename], rule.codename
+ F = ASTRULES.get(rule.codename)
if F:
F( self, elts_number )
else:
- self.push_rule( rule.name, elts_number, source )
+ self.push_rule( rule.codename, elts_number, source )
return True
def token(self, name, value, source):
- print "TOK:", name, value
+ print "TOK:", tok.tok_name[name], name, value
self.push_tok( name, value, source )
return True
Modified: pypy/dist/pypy/interpreter/pyparser/ebnflexer.py
==============================================================================
--- pypy/dist/pypy/interpreter/pyparser/ebnflexer.py (original)
+++ pypy/dist/pypy/interpreter/pyparser/ebnflexer.py Thu Jul 21 12:37:26 2005
@@ -22,11 +22,12 @@
SYMBOL: a rule symbol, usually appearing right of a SYMDEF
tokens: '[', ']', '(' ,')', '*', '+', '|'
"""
- def __init__(self, inpstring ):
+ def __init__(self, inpstring, tokenmap ):
TokenSource.__init__(self)
self.input = inpstring
self.pos = 0
self._peeked = None
+ self.tokmap = tokenmap
def context(self):
"""returns an opaque context object, used to backtrack
@@ -52,6 +53,7 @@
# means backtracking more than one token
# will re-tokenize the stream (but this is the
# grammar lexer so we don't care really!)
+ T = self.tokmap
if self._peeked is not None:
peeked = self._peeked
self._peeked = None
@@ -64,28 +66,28 @@
pos = m.end()
if pos==len(inp):
self.pos = pos
- return Token("EOF", None)
+ return Token(T["EOF"], None)
m = g_skip.match(inp, pos)
m = g_symdef.match(inp,pos)
if m:
tk = m.group(0)
self.pos = m.end()
- return Token('SYMDEF',tk[:-1])
+ return Token(T['SYMDEF'],tk[:-1])
m = g_tok.match(inp,pos)
if m:
tk = m.group(0)
self.pos = m.end()
- return Token(tk,tk)
+ return Token(T[tk],tk)
m = g_string.match(inp,pos)
if m:
tk = m.group(0)
self.pos = m.end()
- return Token('STRING',tk[1:-1])
+ return Token(T['STRING'],tk[1:-1])
m = g_symbol.match(inp,pos)
if m:
tk = m.group(0)
self.pos = m.end()
- return Token('SYMBOL',tk)
+ return Token(T['SYMBOL'],tk)
raise ValueError("Unknown token at pos=%d context='%s'" %
(pos,inp[pos:pos+20]) )
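The grammar-file lexer is now parameterized by a name-to-code mapping instead
of producing string-named tokens. A hedged usage sketch (the codes below are
made up; the real mapping is filled in by g_add_symbol in ebnfparse.py):

    # illustrative only: GrammarSource now takes a token map at construction
    tokmap = {"EOF": 0, "SYMDEF": 1, "STRING": 2, "SYMBOL": 3,
              "|": 4, "[": 5, "]": 6, "(": 7, ")": 8, "*": 9, "+": 10}
    source = GrammarSource("rule: SYMBOL ':' alternative", tokmap)
    tok = source.next()   # Token(tokmap["SYMDEF"], "rule"), then SYMBOLs etc.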
Modified: pypy/dist/pypy/interpreter/pyparser/ebnfparse.py
==============================================================================
--- pypy/dist/pypy/interpreter/pyparser/ebnfparse.py (original)
+++ pypy/dist/pypy/interpreter/pyparser/ebnfparse.py Thu Jul 21 12:37:26 2005
@@ -2,6 +2,8 @@
from grammar import BaseGrammarBuilder, Alternative, Sequence, Token, \
KleenStar, GrammarElement, build_first_sets, EmptyToken
from ebnflexer import GrammarSource
+import pytoken
+import pysymbol
import re
py_name = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*", re.M)
@@ -32,7 +34,7 @@
class NameToken(Token):
"""A token that is not a keyword"""
def __init__(self, keywords=None ):
- Token.__init__(self, "NAME")
+ Token.__init__(self, pytoken.NAME)
self.keywords = keywords
def match(self, source, builder, level=0):
@@ -48,10 +50,10 @@
"""
ctx = source.context()
tk = source.next()
- if tk.name==self.name:
+ if tk.codename==self.codename:
if tk.value not in self.keywords:
- ret = builder.token( tk.name, tk.value, source )
- return self.debug_return( ret, tk.name, tk.value )
+ ret = builder.token( tk.codename, tk.value, source )
+ return self.debug_return( ret, tk.codename, tk.value )
source.restore( ctx )
return 0
@@ -62,7 +64,7 @@
raise RuntimeError("Unexpected token type %r" % other)
if other is EmptyToken:
return False
- if other.name != self.name:
+ if other.codename != self.codename:
return False
if other.value in self.keywords:
return False
@@ -76,20 +78,40 @@
self.terminals = {}
self.current_rule = None
self.current_subrule = 0
- self.tokens = {}
+ self.keywords = []
self.items = []
self.terminals['NAME'] = NameToken()
- def new_name(self):
+ def new_symbol(self):
rule_name = ":%s_%s" % (self.current_rule, self.current_subrule)
self.current_subrule += 1
- return rule_name
+ symval = pysymbol.add_anon_symbol( rule_name )
+ return symval
def new_item(self, itm):
self.items.append(itm)
return itm
-
- def visit_grammar(self, node):
+
+ def visit_syntaxnode( self, node ):
+ """NOT RPYTHON, used only at bootstrap time anyway"""
+ name = sym_map[node.name]
+ visit_meth = getattr(self, "handle_%s" % name, None)
+ if visit_meth:
+ return visit_meth(node)
+ else:
+ print "Unknown handler for %s" %name
+ # helper function for nodes that have only one subnode:
+ if len(node.nodes) == 1:
+ return node.nodes[0].visit(self)
+ raise RuntimeError("Unknown Visitor for %r" % name)
+
+ def visit_tokennode( self, node ):
+ return self.visit_syntaxnode( node )
+
+ def visit_tempsyntaxnode( self, node ):
+ return self.visit_syntaxnode( node )
+
+ def handle_grammar(self, node):
for rule in node.nodes:
rule.visit(self)
# the rules are registered already
@@ -97,33 +119,33 @@
# terminal symbols from non terminals
for r in self.items:
for i,a in enumerate(r.args):
- if a.name in self.rules:
+ if a.codename in self.rules:
assert isinstance(a,Token)
- r.args[i] = self.rules[a.name]
- if a.name in self.terminals:
- del self.terminals[a.name]
+ r.args[i] = self.rules[a.codename]
+ if a.codename in self.terminals:
+ del self.terminals[a.codename]
# XXX .keywords also contains punctuations
- self.terminals['NAME'].keywords = self.tokens.keys()
+ self.terminals['NAME'].keywords = self.keywords
- def visit_rule(self, node):
+ def handle_rule(self, node):
symdef = node.nodes[0].value
self.current_rule = symdef
self.current_subrule = 0
alt = node.nodes[1]
rule = alt.visit(self)
if not isinstance(rule, Token):
- rule.name = symdef
- self.rules[symdef] = rule
+ rule.codename = pysymbol.add_symbol( symdef )
+ self.rules[rule.codename] = rule
- def visit_alternative(self, node):
+ def handle_alternative(self, node):
items = [node.nodes[0].visit(self)]
items += node.nodes[1].visit(self)
- if len(items) == 1 and items[0].name.startswith(':'):
+ if len(items) == 1 and not items[0].is_root():
return items[0]
- alt = Alternative(self.new_name(), items)
+ alt = Alternative(self.new_symbol(), items)
return self.new_item(alt)
- def visit_sequence( self, node ):
+ def handle_sequence( self, node ):
""" """
items = []
for n in node.nodes:
@@ -131,56 +153,64 @@
if len(items)==1:
return items[0]
elif len(items)>1:
- return self.new_item( Sequence( self.new_name(), items) )
+ return self.new_item( Sequence( self.new_symbol(), items) )
raise SyntaxError("Found empty sequence")
- def visit_sequence_cont( self, node ):
+ def handle_sequence_cont( self, node ):
"""Returns a list of sequences (possibly empty)"""
return [n.visit(self) for n in node.nodes]
- def visit_seq_cont_list(self, node):
+ def handle_seq_cont_list(self, node):
return node.nodes[1].visit(self)
- def visit_symbol(self, node):
+ def handle_symbol(self, node):
star_opt = node.nodes[1]
sym = node.nodes[0].value
terminal = self.terminals.get( sym )
if not terminal:
- terminal = Token( sym )
- self.terminals[sym] = terminal
+ tokencode = pytoken.tok_values.get( sym )
+ if tokencode is None:
+ tokencode = pysymbol.add_symbol( sym )
+ terminal = Token( tokencode )
+ else:
+ terminal = Token( tokencode )
+ self.terminals[sym] = terminal
return self.repeat( star_opt, terminal )
- def visit_option( self, node ):
+ def handle_option( self, node ):
rule = node.nodes[1].visit(self)
- return self.new_item( KleenStar( self.new_name(), 0, 1, rule ) )
+ return self.new_item( KleenStar( self.new_symbol(), 0, 1, rule ) )
- def visit_group( self, node ):
+ def handle_group( self, node ):
rule = node.nodes[1].visit(self)
return self.repeat( node.nodes[3], rule )
- def visit_STRING( self, node ):
+ def handle_STRING( self, node ):
value = node.value
- tok = self.tokens.get(value)
- if not tok:
- if py_punct.match( value ):
- tok = Token( value )
- elif py_name.match( value ):
- tok = Token('NAME', value)
- else:
+ tokencode = pytoken.tok_punct.get( value )
+ if tokencode is None:
+ if not py_name.match( value ):
raise SyntaxError("Unknown STRING value ('%s')" % value )
- self.tokens[value] = tok
+ # assume a keyword
+ tok = Token( pytoken.NAME, value )
+ if value not in self.keywords:
+ self.keywords.append( value )
+ else:
+ # punctuation
+ tok = Token( tokencode )
return tok
- def visit_sequence_alt( self, node ):
+ def handle_sequence_alt( self, node ):
res = node.nodes[0].visit(self)
assert isinstance( res, GrammarElement )
return res
def repeat( self, star_opt, myrule ):
+ assert isinstance( myrule, GrammarElement )
if star_opt.nodes:
- rule_name = self.new_name()
+ rule_name = self.new_symbol()
tok = star_opt.nodes[0].nodes[0]
if tok.value == '+':
item = KleenStar(rule_name, _min=1, rule=myrule)
@@ -195,6 +225,22 @@
rules = None
+sym_map = {}
+sym_rmap = {}
+sym_count = 0
+
+def g_add_symbol( name ):
+ global sym_count
+ if name in sym_rmap:
+ return sym_rmap[name]
+ val = sym_count
+ sym_count += 1
+ sym_map[val] = name
+ sym_rmap[name] = val
+ return val
+
+g_add_symbol( 'EOF' )
+
def grammar_grammar():
"""Builds the grammar for the grammar file
@@ -209,36 +255,37 @@
option: '[' alternative ']'
group: '(' alternative ')' star?
"""
- global rules
+ global rules, sym_map
+ S = g_add_symbol
# star: '*' | '+'
- star = Alternative( "star", [Token('*'), Token('+')] )
- star_opt = KleenStar ( "star_opt", 0, 1, rule=star )
+ star = Alternative( S("star"), [Token(S('*')), Token(S('+'))] )
+ star_opt = KleenStar ( S("star_opt"), 0, 1, rule=star )
# rule: SYMBOL ':' alternative
- symbol = Sequence( "symbol", [Token('SYMBOL'), star_opt] )
- symboldef = Token( "SYMDEF" )
- alternative = Sequence( "alternative", [])
- rule = Sequence( "rule", [symboldef, alternative] )
+ symbol = Sequence( S("symbol"), [Token(S('SYMBOL')), star_opt] )
+ symboldef = Token( S("SYMDEF") )
+ alternative = Sequence( S("alternative"), [])
+ rule = Sequence( S("rule"), [symboldef, alternative] )
# grammar: rule+
- grammar = KleenStar( "grammar", _min=1, rule=rule )
+ grammar = KleenStar( S("grammar"), _min=1, rule=rule )
# alternative: sequence ( '|' sequence )*
- sequence = KleenStar( "sequence", 1 )
- seq_cont_list = Sequence( "seq_cont_list", [Token('|'), sequence] )
- sequence_cont = KleenStar( "sequence_cont",0, rule=seq_cont_list )
+ sequence = KleenStar( S("sequence"), 1 )
+ seq_cont_list = Sequence( S("seq_cont_list"), [Token(S('|')), sequence] )
+ sequence_cont = KleenStar( S("sequence_cont"),0, rule=seq_cont_list )
alternative.args = [ sequence, sequence_cont ]
# option: '[' alternative ']'
- option = Sequence( "option", [Token('['), alternative, Token(']')] )
+ option = Sequence( S("option"), [Token(S('[')), alternative, Token(S(']'))] )
# group: '(' alternative ')'
- group = Sequence( "group", [Token('('), alternative, Token(')'), star_opt] )
+ group = Sequence( S("group"), [Token(S('(')), alternative, Token(S(')')), star_opt] )
# sequence: (SYMBOL | STRING | option | group )+
- string = Token('STRING')
- alt = Alternative( "sequence_alt", [symbol, string, option, group] )
+ string = Token(S('STRING'))
+ alt = Alternative( S("sequence_alt"), [symbol, string, option, group] )
sequence.args = [ alt ]
@@ -253,7 +300,7 @@
stream : file-like object representing the grammar to parse
"""
- source = GrammarSource(stream.read())
+ source = GrammarSource(stream.read(), sym_rmap)
rule = grammar_grammar()
builder = BaseGrammarBuilder()
result = rule.match(source, builder)
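The STRING handling above distinguishes punctuation from keywords; roughly
(a sketch, not verbatim from the file, reusing Token and py_name from it):

    # sketch: how a quoted grammar string becomes a Token
    import pytoken
    def string_to_token(value, keywords):
        code = pytoken.tok_punct.get(value)
        if code is not None:
            return Token(code)                  # e.g. '+' -> Token(PLUS)
        if py_name.match(value):
            if value not in keywords:
                keywords.append(value)          # remember it as a keyword
            return Token(pytoken.NAME, value)   # e.g. 'if' -> keyword NAME
        raise SyntaxError("Unknown STRING value ('%s')" % value)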
Modified: pypy/dist/pypy/interpreter/pyparser/grammar.py
==============================================================================
--- pypy/dist/pypy/interpreter/pyparser/grammar.py (original)
+++ pypy/dist/pypy/interpreter/pyparser/grammar.py Thu Jul 21 12:37:26 2005
@@ -11,6 +11,13 @@
DEBUG = 0
USE_LOOKAHEAD = True
+def get_symbol( codename, symbols ):
+ """Helper function to build a token name"""
+ if codename in symbols:
+ return symbols[codename]
+ else:
+ return "["+str(codename)+"]"
+
#### Abstract interface for a lexer/tokenizer
class TokenSource(object):
"""Abstract base class for a source tokenizer"""
@@ -74,16 +81,22 @@
from syntaxtree import SyntaxNode, TempSyntaxNode, TokenNode
-
+#
+# We use the term "root" for grammar rules that are given a name by the
+# grammar. A rule like S -> A B* is mapped as Sequence( SCODE, KleenStar(-3, B) ),
+# so S is a root and the subrule describing B* is not.
+# SCODE is the numerical value for rule "S".
class BaseGrammarBuilder(object):
"""Base/default class for a builder"""
- def __init__(self, rules=None, debug=0):
+ def __init__(self, rules=None, debug=0, symbols={} ):
# a dictionary of grammar rules for debug/reference
self.rules = rules or {}
# This attribute is here for convenience
self.source_encoding = None
self.debug = debug
self.stack = []
+ self.symbols = symbols # mapping from codename to symbols
def context(self):
"""Returns the state of the builder to be restored later"""
@@ -98,7 +111,7 @@
# Do nothing, keep rule on top of the stack
if rule.is_root():
elems = self.stack[-1].expand()
- self.stack[-1] = SyntaxNode(rule.name, source, elems)
+ self.stack[-1] = SyntaxNode(rule.codename, source, elems)
if self.debug:
self.stack[-1].dumpstr()
return True
@@ -114,11 +127,11 @@
node_type = TempSyntaxNode
# replace N elements with 1 element regrouping them
if elts_number >= 1:
- elem = node_type(rule.name, source, items)
+ elem = node_type(rule.codename, source, items)
del self.stack[-elts_number:]
self.stack.append(elem)
elif elts_number == 0:
- self.stack.append(node_type(rule.name, source, []))
+ self.stack.append(node_type(rule.codename, source, []))
if self.debug:
self.stack[-1].dumpstr()
return True
@@ -135,11 +148,14 @@
######################################################################
class GrammarElement(object):
"""Base parser class"""
- def __init__(self, name):
+
+ symbols = {} # dirty trick to provide a symbols mapping while printing (and not putting it in every object)
+
+ def __init__(self, codename):
# the rule name
- self.name = name
+ #assert type(codename)==int
+ self.codename = codename # integer mapping to either a token value or rule symbol value
self.args = []
- self._is_root = False
self.first_set = []
self.first_set_complete = False
# self._processing = False
@@ -148,9 +164,10 @@
def is_root(self):
"""This is a root node of the grammar, that is one that will
be included in the syntax tree"""
- if self.name != ":" and self.name.startswith(":"):
- return False
- return True
+ # code attributed to root grammar rules are >=0
+ if self.codename >=0:
+ return True
+ return False
def match(self, source, builder, level=0):
@@ -177,13 +194,13 @@
if EmptyToken in self.first_set:
ret = builder.sequence(self, source, 0 )
if self._trace:
- self._debug_display(token, level, 'eee')
- return self.debug_return( ret, 0 )
+ self._debug_display(token, level, 'eee', builder.symbols)
+ return self.debug_return( ret, builder.symbols, 0 )
if self._trace:
- self._debug_display(token, level, 'rrr')
+ self._debug_display(token, level, 'rrr', builder.symbols)
return 0
elif self._trace:
- self._debug_display(token, level, '>>>')
+ self._debug_display(token, level, '>>>', builder.symbols)
res = self._match(source, builder, level)
if self._trace:
@@ -192,14 +209,14 @@
prefix = '+++'
else:
prefix = '---'
- self._debug_display(token, level, prefix)
+ self._debug_display(token, level, prefix, builder.symbols)
print ' '*level, prefix, " TEXT ='%s'" % (
source.get_source_text(pos1,pos2))
if res:
print "*" * 50
return res
- def _debug_display(self, token, level, prefix):
+ def _debug_display(self, token, level, prefix, symbols):
"""prints context debug informations"""
prefix = '%s%s' % (' ' * level, prefix)
print prefix, " RULE =", self
@@ -229,24 +246,24 @@
pass
def __str__(self):
- return self.display(0)
+ return self.display(0, GrammarElement.symbols )
def __repr__(self):
- return self.display(0)
+ return self.display(0, GrammarElement.symbols )
- def display(self, level):
+ def display(self, level=0, symbols={}):
"""Helper function used to represent the grammar.
mostly used for debugging the grammar itself"""
return "GrammarElement"
- def debug_return(self, ret, *args ):
+ def debug_return(self, ret, symbols, *args ):
# FIXME: use a wrapper of match() methods instead of debug_return()
# to prevent additional indirection
if ret and DEBUG > 0:
sargs = ",".join( [ str(i) for i in args ] )
print "matched %s (%s): %s" % (self.__class__.__name__,
- sargs, self.display() )
+ sargs, self.display(0, symbols=symbols) )
return ret
@@ -284,12 +301,12 @@
for i in self.args:
assert isinstance( i, GrammarElement )
- def _match(self, source, builder, level=0):
+ def _match(self, source, builder, level=0 ):
"""If any of the rules in self.args matches
returns the object built from the first rules that matches
"""
if DEBUG > 1:
- print "try alt:", self.display()
+ print "try alt:", self.display(level, builder.symbols )
tok = source.peek()
# Here we stop at the first match we should
# try instead to get the longest alternative
@@ -303,17 +320,18 @@
m = rule.match(source, builder, level+1)
if m:
ret = builder.alternative( self, source )
- return self.debug_return( ret )
+ return self.debug_return( ret, builder.symbols )
return 0
- def display(self, level=0):
+ def display(self, level=0, symbols={}):
+ name = get_symbol( self.codename, symbols )
if level == 0:
- name = self.name + " -> "
- elif not self.name.startswith(":"):
- return self.name
+ name = name + " -> "
+ elif self.is_root():
+ return name
else:
name = ""
- items = [ a.display(1) for a in self.args ]
+ items = [ a.display(1,symbols) for a in self.args ]
return name+"(" + "|".join( items ) + ")"
def calc_first_set(self):
@@ -370,7 +388,7 @@
def _match(self, source, builder, level=0):
"""matches all of the symbols in order"""
if DEBUG > 1:
- print "try seq:", self.display()
+ print "try seq:", self.display(level, builder.symbols )
ctx = source.context()
bctx = builder.context()
for rule in self.args:
@@ -382,13 +400,14 @@
builder.restore(bctx)
return 0
ret = builder.sequence(self, source, len(self.args))
- return self.debug_return( ret )
+ return self.debug_return( ret, builder.symbols )
- def display(self, level=0):
+ def display(self, level=0, symbols={}):
+ name = get_symbol( self.codename, symbols )
if level == 0:
- name = self.name + " -> "
- elif not self.name.startswith(":"):
- return self.name
+ name = name + " -> "
+ elif self.is_root():
+ return name
else:
name = ""
items = [a.display(1) for a in self.args]
@@ -454,17 +473,18 @@
builder.restore(bctx)
return 0
ret = builder.sequence(self, source, rules)
- return self.debug_return( ret, rules )
+ return self.debug_return( ret, builder.symbols, rules )
rules += 1
if self.max>0 and rules == self.max:
ret = builder.sequence(self, source, rules)
- return self.debug_return( ret, rules )
+ return self.debug_return( ret, builder.symbols, rules )
- def display(self, level=0):
+ def display(self, level=0, symbols={}):
+ name = get_symbol( self.codename, symbols )
if level==0:
- name = self.name + " -> "
- elif not self.name.startswith(":"):
- return self.name
+ name = name + " -> "
+ elif self.is_root():
+ return name
else:
name = ""
star = "{%d,%d}" % (self.min,self.max)
@@ -494,8 +514,8 @@
class Token(GrammarElement):
"""Represents a Token in a grammar rule (a lexer token)"""
- def __init__( self, name, value = None):
- GrammarElement.__init__( self, name )
+ def __init__( self, codename, value = None):
+ GrammarElement.__init__( self, codename )
self.value = value
self.first_set = [self]
# self.first_set = {self: 1}
@@ -513,23 +533,24 @@
"""
ctx = source.context()
tk = source.next()
- if tk.name == self.name:
+ if tk.codename == self.codename:
if self.value is None:
- ret = builder.token( tk.name, tk.value, source )
- return self.debug_return( ret, tk.name )
+ ret = builder.token( tk.codename, tk.value, source )
+ return self.debug_return( ret, builder.symbols, tk.codename )
elif self.value == tk.value:
- ret = builder.token( tk.name, tk.value, source )
- return self.debug_return( ret, tk.name, tk.value )
+ ret = builder.token( tk.codename, tk.value, source )
+ return self.debug_return( ret, builder.symbols, tk.codename, tk.value )
if DEBUG > 1:
print "tried tok:", self.display()
source.restore( ctx )
return 0
- def display(self, level=0):
+ def display(self, level=0, symbols={}):
+ name = get_symbol( self.codename, symbols )
if self.value is None:
- return "<%s>" % self.name
+ return "<%s>" % name
else:
- return "<%s>=='%s'" % (self.name, self.value)
+ return "<%s>=='%s'" % (name, self.value)
def match_token(self, other):
@@ -544,12 +565,12 @@
raise RuntimeError("Unexpected token type %r" % other)
if other is EmptyToken:
return False
- res = other.name == self.name and self.value in (None, other.value)
+ res = other.codename == self.codename and self.value in (None, other.value)
#print "matching", self, other, res
return res
def __eq__(self, other):
- return self.name == other.name and self.value == other.value
+ return self.codename == other.codename and self.value == other.value
@@ -558,4 +579,5 @@
"""
pass
-EmptyToken = Token(None)
+from pypy.interpreter.pyparser.pytoken import NULLTOKEN
+EmptyToken = Token(NULLTOKEN, None)
Added: pypy/dist/pypy/interpreter/pyparser/pysymbol.py
==============================================================================
--- (empty file)
+++ pypy/dist/pypy/interpreter/pyparser/pysymbol.py Thu Jul 21 12:37:26 2005
@@ -0,0 +1,48 @@
+# replacement for the CPython symbol module
+from pytoken import N_TOKENS
+
+# try to avoid numeric values conflict with tokens
+# it's important for CPython, but I'm not so sure it's still
+# important here
+SYMBOL_START = N_TOKENS+30
+del N_TOKENS
+
+_count = SYMBOL_START
+_anoncount = -10
+
+sym_name = {}
+sym_values = {}
+
+def add_symbol( sym ):
+ global _count
+ assert type(sym)==str
+ if not sym_values.has_key( sym ):
+ val = _count
+ sym_values[sym] = val
+ sym_name[val] = sym
+ globals()[sym] = val
+ _count += 1
+ return val
+ return sym_values[ sym ]
+
+def add_anon_symbol( sym ):
+ global _anoncount
+ assert type(sym)==str
+ if not sym_values.has_key( sym ):
+ val = _anoncount
+ sym_values[sym] = val
+ sym_name[val] = sym
+ _anoncount -= 1
+ return val
+ return sym_values[ sym ]
+
+
+def update_symbols( parser ):
+ """Update the symbol module according to the rules
+ of the given PythonParser instance"""
+ for rule in parser.rules:
+ add_symbol( rule )
+
+# There is no symbol in this module until the grammar is loaded
+# once loaded the grammar parser will fill the mappings with the
+# grammar symbols
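A short usage sketch for the new module (the exact numeric values are
illustrative and depend on N_TOKENS):

    import pysymbol
    code = pysymbol.add_symbol('expr_stmt')      # named rules: codes >= SYMBOL_START
    anon = pysymbol.add_anon_symbol(':expr_1')   # anonymous subrules: negative codes
    assert pysymbol.sym_name[code] == 'expr_stmt'
    assert code > 0 and anon < 0   # GrammarElement.is_root() relies on this sign convention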
Modified: pypy/dist/pypy/interpreter/pyparser/pythonlexer.py
==============================================================================
--- pypy/dist/pypy/interpreter/pyparser/pythonlexer.py (original)
+++ pypy/dist/pypy/interpreter/pyparser/pythonlexer.py Thu Jul 21 12:37:26 2005
@@ -7,6 +7,9 @@
from pypy.interpreter.pyparser.grammar import TokenSource, Token
from pypy.interpreter.pyparser.error import ParseError
+import pytoken
+from pytoken import NEWLINE
+
# Don't import string for that ...
NAMECHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'
NUMCHARS = '0123456789'
@@ -64,14 +67,11 @@
return encoding
################################################################################
-import token as tokenmod
+from pypy.interpreter.pyparser import pytoken
from pytokenize import tabsize, whiteSpaceDFA, triple_quoted, endDFAs, \
single_quoted, pseudoDFA
import automata
-# adopt pytokenize notations / values
-tokenmod.COMMENT = tokenmod.N_TOKENS
-tokenmod.NL = tokenmod.N_TOKENS + 1
class TokenError(ParseError):
"""Raised for lexer errors, e.g. when EOF is found prematurely"""
@@ -128,7 +128,7 @@
endmatch = endDFA.recognize(line)
if -1 != endmatch:
pos = end = endmatch
- tok = token_from_values(tokenmod.STRING, contstr + line[:end])
+ tok = Token(pytoken.STRING, contstr + line[:end])
token_list.append((tok, line, lnum, pos))
last_comment = ''
# token_list.append((STRING, contstr + line[:end],
@@ -136,7 +136,7 @@
contstr, needcont = '', 0
contline = None
elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
- tok = token_from_values(tokenmod.ERRORTOKEN, contstr + line)
+ tok = Token(pytoken.ERRORTOKEN, contstr + line)
token_list.append((tok, line, lnum, pos))
last_comment = ''
# token_list.append((ERRORTOKEN, contstr + line,
@@ -162,14 +162,14 @@
if line[pos] in '#\r\n': # skip comments or blank lines
if line[pos] == '#':
- tok = token_from_values(tokenmod.COMMENT, line[pos:])
+ tok = Token(pytoken.COMMENT, line[pos:])
last_comment = line[pos:]
if lnum <= 2 and encoding is None:
encoding = match_encoding_declaration(last_comment)
if encoding is not None:
encoding = _normalize_encoding(encoding)
else:
- tok = token_from_values(tokenmod.NL, line[pos:])
+ tok = Token(pytoken.NL, line[pos:])
last_comment = ''
# XXX Skip NL and COMMENT Tokens
# token_list.append((tok, line, lnum, pos))
@@ -177,12 +177,12 @@
if column > indents[-1]: # count indents or dedents
indents.append(column)
- tok = token_from_values(tokenmod.INDENT, line[:pos])
+ tok = Token(pytoken.INDENT, line[:pos])
token_list.append((tok, line, lnum, pos))
last_comment = ''
while column < indents[-1]:
indents = indents[:-1]
- tok = token_from_values(tokenmod.DEDENT, '')
+ tok = Token(pytoken.DEDENT, '')
token_list.append((tok, line, lnum, pos))
last_comment = ''
else: # continued statement
@@ -209,22 +209,22 @@
token, initial = line[start:end], line[start]
if initial in numchars or \
(initial == '.' and token != '.'): # ordinary number
- tok = token_from_values(tokenmod.NUMBER, token)
+ tok = Token(pytoken.NUMBER, token)
token_list.append((tok, line, lnum, pos))
last_comment = ''
elif initial in '\r\n':
if parenlev > 0:
- tok = token_from_values(tokenmod.NL, token)
+ tok = Token(pytoken.NL, token)
last_comment = ''
# XXX Skip NL
else:
- tok = token_from_values(tokenmod.NEWLINE, token)
+ tok = Token(pytoken.NEWLINE, token)
# XXX YUCK !
tok.value = last_comment
token_list.append((tok, line, lnum, pos))
last_comment = ''
elif initial == '#':
- tok = token_from_values(tokenmod.COMMENT, token)
+ tok = Token(pytoken.COMMENT, token)
last_comment = token
if lnum <= 2 and encoding is None:
encoding = match_encoding_declaration(last_comment)
@@ -238,7 +238,7 @@
if -1 != endmatch: # all on one line
pos = endmatch
token = line[start:pos]
- tok = token_from_values(tokenmod.STRING, token)
+ tok = Token(pytoken.STRING, token)
token_list.append((tok, line, lnum, pos))
last_comment = ''
else:
@@ -255,12 +255,12 @@
contline = line
break
else: # ordinary string
- tok = token_from_values(tokenmod.STRING, token)
+ tok = Token(pytoken.STRING, token)
token_list.append((tok, line, lnum, pos))
last_comment = ''
# token_list.append((STRING, token, spos, epos, line))
elif initial in namechars: # ordinary name
- tok = token_from_values(tokenmod.NAME, token)
+ tok = Token(pytoken.NAME, token)
token_list.append((tok, line, lnum, pos))
last_comment = ''
elif initial == '\\': # continued stmt
@@ -270,26 +270,29 @@
parenlev = parenlev + 1
elif initial in ')]}':
parenlev = parenlev - 1
- tok = token_from_values(tokenmod.OP, token)
+ if token in pytoken.tok_punct:
+ tok = Token(pytoken.tok_punct[token])
+ else:
+ tok = Token(pytoken.OP, token)
token_list.append((tok, line, lnum, pos))
last_comment = ''
else:
- tok = token_from_values(tokenmod.ERRORTOKEN, line[pos])
+ tok = Token(pytoken.ERRORTOKEN, line[pos])
token_list.append((tok, line, lnum, pos))
last_comment = ''
pos = pos + 1
lnum -= 1
if not (flags & PyCF_DONT_IMPLY_DEDENT):
- if token_list and token_list[-1][0].name != 'NEWLINE':
- token_list.append((Token('NEWLINE', ''), '\n', lnum, 0))
+ if token_list and token_list[-1][0].codename != pytoken.NEWLINE:
+ token_list.append((Token(pytoken.NEWLINE, ''), '\n', lnum, 0))
for indent in indents[1:]: # pop remaining indent levels
- tok = token_from_values(tokenmod.DEDENT, '')
+ tok = Token(pytoken.DEDENT, '')
token_list.append((tok, line, lnum, pos))
- if token_list and token_list[-1][0].name != 'NEWLINE':
- token_list.append((Token('NEWLINE', ''), '\n', lnum, 0))
+ if token_list and token_list[-1][0].codename != pytoken.NEWLINE:
+ token_list.append((Token(pytoken.NEWLINE, ''), '\n', lnum, 0))
- tok = token_from_values(tokenmod.ENDMARKER, '',)
+ tok = Token(pytoken.ENDMARKER, '',)
token_list.append((tok, line, lnum, pos))
return token_list, encoding
@@ -362,18 +365,8 @@
return (self._current_line, self._lineno)
# return 'line %s : %s' % ('XXX', self._current_line)
-NONE_LIST = [tokenmod.ENDMARKER, tokenmod.INDENT, tokenmod.DEDENT]
-NAMED_LIST = [tokenmod.OP]
-
-def token_from_values(tok_type, tok_string):
- """Compatibility layer between both parsers"""
- if tok_type in NONE_LIST:
- return Token(tokenmod.tok_name[tok_type], None)
- if tok_type in NAMED_LIST:
- return Token(tok_string, None)
- if tok_type == tokenmod.NEWLINE:
- return Token('NEWLINE', '') # XXX pending comment ?
- return Token(tokenmod.tok_name[tok_type], tok_string)
+NONE_LIST = [pytoken.ENDMARKER, pytoken.INDENT, pytoken.DEDENT]
+NAMED_LIST = [pytoken.OP]
Source = PythonSource
Modified: pypy/dist/pypy/interpreter/pyparser/pythonparse.py
==============================================================================
--- pypy/dist/pypy/interpreter/pyparser/pythonparse.py (original)
+++ pypy/dist/pypy/interpreter/pyparser/pythonparse.py Thu Jul 21 12:37:26 2005
@@ -8,13 +8,13 @@
from pypy.interpreter.error import OperationError, debug_print
from pypy.interpreter.pyparser.error import ParseError
from pypy.tool.option import Options
-
from pythonlexer import Source
+import pysymbol
import ebnfparse
import sys
import os
import grammar
-import symbol
+
from codeop import PyCF_DONT_IMPLY_DEDENT
class PythonParser(object):
@@ -37,7 +37,8 @@
return self.parse_lines(lines, goal, builder, flags)
def parse_lines(self, lines, goal, builder=None, flags=0):
- target = self.rules[goal]
+ goalnumber = pysymbol.sym_values[goal]
+ target = self.rules[goalnumber]
src = Source(lines, flags)
if builder is None:
@@ -71,44 +72,18 @@
grammar.DEBUG = 0
gram = ebnfparse.parse_grammar( file(fname) )
grammar.DEBUG = level
- return PythonParser(gram)
+ parser = PythonParser( gram )
+ return parser
debug_print( "Loading grammar %s" % PYTHON_GRAMMAR )
PYTHON_PARSER = python_grammar( PYTHON_GRAMMAR )
-_symbols = symbol.sym_name.keys()
-_symbols.sort()
-
-def add_symbol( sym ):
- if not hasattr(symbol, sym):
- nextval = _symbols[-1] + 1
- setattr(symbol, sym, nextval)
- _symbols.append(nextval)
- symbol.sym_name[nextval] = sym
- return nextval
- return 0
-def reload_grammar( version ):
+def reload_grammar(version):
"""helper function to test with pypy different grammars"""
global PYTHON_GRAMMAR, PYTHON_PARSER, PYPY_VERSION
PYTHON_GRAMMAR, PYPY_VERSION = get_grammar_file( version )
debug_print( "Reloading grammar %s" % PYTHON_GRAMMAR )
PYTHON_PARSER = python_grammar( PYTHON_GRAMMAR )
- for rule in PYTHON_PARSER.rules:
- add_symbol( rule )
-
-
-for rule in PYTHON_PARSER.rules:
- add_symbol( rule )
-
-
-SYMBOLS = {}
-# copies the numerical mapping between symbol name and symbol value
-# into SYMBOLS
-for k, v in symbol.sym_name.items():
- SYMBOLS[v] = k
-SYMBOLS['UNKNOWN'] = -1
-
-
def parse_file_input(pyf, gram, builder=None):
"""Parse a python file"""
Modified: pypy/dist/pypy/interpreter/pyparser/pythonutil.py
==============================================================================
--- pypy/dist/pypy/interpreter/pyparser/pythonutil.py (original)
+++ pypy/dist/pypy/interpreter/pyparser/pythonutil.py Thu Jul 21 12:37:26 2005
@@ -1,11 +1,11 @@
__all__ = ["python_parse", "pypy_parse"]
import parser
-import symbol
import pythonparse
from tuplebuilder import TupleBuilder
from pypy.interpreter.pyparser.error import ParseError
+from pypy.interpreter.pyparser import pysymbol
PYTHON_PARSER = pythonparse.PYTHON_PARSER
TARGET_DICT = {
@@ -62,7 +62,7 @@
source_encoding, stack_element = parse_result
nested_tuples = stack_element.as_tuple(lineno)
if source_encoding is not None:
- return (symbol.encoding_decl, nested_tuples, source_encoding)
+ return (pysymbol.encoding_decl, nested_tuples, source_encoding)
else:
return nested_tuples
Added: pypy/dist/pypy/interpreter/pyparser/pytoken.py
==============================================================================
--- (empty file)
+++ pypy/dist/pypy/interpreter/pyparser/pytoken.py Thu Jul 21 12:37:26 2005
@@ -0,0 +1,88 @@
+# A replacement for the token module
+#
+# adds a new map token_values to avoid doing getattr on the module
+# from PyPy RPython
+
+import token
+
+N_TOKENS = token.N_TOKENS
+
+tok_name = {}
+tok_values = {}
+
+def add_token(name, value=None):
+ global N_TOKENS
+ if value is None:
+ value = N_TOKENS
+ N_TOKENS += 1
+ _g = globals()
+ _g[name] = value
+ tok_name[value] = name
+ tok_values[name] = value
+
+# This is used to replace None
+add_token( 'NULLTOKEN', -1 )
+
+for value, name in token.tok_name.items():
+ add_token( name, value )
+
+# Make sure '@' is in the token list
+if "AT" not in tok_values:
+ add_token( "AT" )
+
+add_token( "COMMENT" )
+add_token( "NL" )
+
+# mapping from punctuation strings to the corresponding token values
+tok_punct = {
+ "&" : AMPER,
+ "&=" : AMPEREQUAL,
+ "`" : BACKQUOTE,
+ "^" : CIRCUMFLEX,
+ "^=" : CIRCUMFLEXEQUAL,
+ ":" : COLON,
+ "," : COMMA,
+ "." : DOT,
+ "//" : DOUBLESLASH,
+ "//=" : DOUBLESLASHEQUAL,
+ "**" : DOUBLESTAR,
+ "**=" : DOUBLESTAREQUAL,
+ "==" : EQEQUAL,
+ "=" : EQUAL,
+ ">" : GREATER,
+ ">=" : GREATEREQUAL,
+ "{" : LBRACE,
+ "}" : RBRACE,
+ "<<" : LEFTSHIFT,
+ "<<=" : LEFTSHIFTEQUAL,
+ "<" : LESS,
+ "<=" : LESSEQUAL,
+ "(" : LPAR,
+ "[" : LSQB,
+ "-=" : MINEQUAL,
+ "-" : MINUS,
+ "!=" : NOTEQUAL,
+ "<>" : NOTEQUAL,
+ "%" : PERCENT,
+ "%=" : PERCENTEQUAL,
+ "+" : PLUS,
+ "+=" : PLUSEQUAL,
+ ">>" : RIGHTSHIFT,
+ ">>=" : RIGHTSHIFTEQUAL,
+ ")" : RPAR,
+ "]" : RSQB,
+ ";" : SEMI,
+ "/" : SLASH,
+ "/=" : SLASHEQUAL,
+ "*" : STAR,
+ "*=" : STAREQUAL,
+ "~" : TILDE,
+ "|" : VBAR,
+ "|=" : VBAREQUAL,
+ "@": AT,
+ }
+tok_rpunct = {}
+for string, value in tok_punct.items():
+ tok_rpunct[value] = string
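Typical use of these mappings (illustrative; LPAR and friends are injected
into the module namespace by the add_token loop above):

    import pytoken
    pytoken.tok_punct['(']             # -> pytoken.LPAR, used by the lexer
    pytoken.tok_rpunct[pytoken.LPAR]   # -> '(', used when displaying tokens
    pytoken.add_token('MYTOKEN')       # hypothetical new token, numbered from N_TOKENS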
+
Modified: pypy/dist/pypy/interpreter/pyparser/pytokenize.py
==============================================================================
--- pypy/dist/pypy/interpreter/pyparser/pytokenize.py (original)
+++ pypy/dist/pypy/interpreter/pyparser/pytokenize.py Thu Jul 21 12:37:26 2005
@@ -20,21 +20,7 @@
from __future__ import generators
from pypy.interpreter.pyparser import automata
-# ______________________________________________________________________
-# COPIED:
-import token
-__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
- "generate_tokens", "NL"]
-del x
-N_TOKENS = token.N_TOKENS
-tok_name = token.tok_name
-del token
-
-COMMENT = N_TOKENS
-tok_name[COMMENT] = 'COMMENT'
-NL = N_TOKENS + 1
-tok_name[NL] = 'NL'
-N_TOKENS += 2
+__all__ = [ "tokenize", "generate_tokens", ]
# ______________________________________________________________________
# Automatically generated DFA's (with one or two hand tweeks):
Modified: pypy/dist/pypy/interpreter/pyparser/syntaxtree.py
==============================================================================
--- pypy/dist/pypy/interpreter/pyparser/syntaxtree.py (original)
+++ pypy/dist/pypy/interpreter/pyparser/syntaxtree.py Thu Jul 21 12:37:26 2005
@@ -1,74 +1,6 @@
"""SyntaxTree class definition"""
-import symbol
-import token
-
-# XXX hack: make sure '@' is in the token list
-if not hasattr(token, 'AT'):
- token.AT = token.N_TOKENS + 2 # see pythonlexer.py for why '+2'
- token.tok_name[token.AT] = 'AT'
-
-TOKEN_MAP = {
- "STRING" : token.STRING,
- "NUMBER" : token.NUMBER,
- "NAME" : token.NAME,
- "NEWLINE" : token.NEWLINE,
- "DEDENT" : token.DEDENT,
- "ENDMARKER" : token.ENDMARKER,
- "INDENT" : token.INDENT,
- "NEWLINE" : token.NEWLINE,
- "NT_OFFSET" : token.NT_OFFSET,
- "N_TOKENS" : token.N_TOKENS,
- "OP" : token.OP,
- "?ERRORTOKEN" : token.ERRORTOKEN,
- "&" : token.AMPER,
- "&=" : token.AMPEREQUAL,
- "`" : token.BACKQUOTE,
- "^" : token.CIRCUMFLEX,
- "^=" : token.CIRCUMFLEXEQUAL,
- ":" : token.COLON,
- "," : token.COMMA,
- "." : token.DOT,
- "//" : token.DOUBLESLASH,
- "//=" : token.DOUBLESLASHEQUAL,
- "**" : token.DOUBLESTAR,
- "**=" : token.DOUBLESTAREQUAL,
- "==" : token.EQEQUAL,
- "=" : token.EQUAL,
- ">" : token.GREATER,
- ">=" : token.GREATEREQUAL,
- "{" : token.LBRACE,
- "}" : token.RBRACE,
- "<<" : token.LEFTSHIFT,
- "<<=" : token.LEFTSHIFTEQUAL,
- "<" : token.LESS,
- "<=" : token.LESSEQUAL,
- "(" : token.LPAR,
- "[" : token.LSQB,
- "-=" : token.MINEQUAL,
- "-" : token.MINUS,
- "!=" : token.NOTEQUAL,
- "<>" : token.NOTEQUAL,
- "%" : token.PERCENT,
- "%=" : token.PERCENTEQUAL,
- "+" : token.PLUS,
- "+=" : token.PLUSEQUAL,
- ")" : token.RBRACE,
- ">>" : token.RIGHTSHIFT,
- ">>=" : token.RIGHTSHIFTEQUAL,
- ")" : token.RPAR,
- "]" : token.RSQB,
- ";" : token.SEMI,
- "/" : token.SLASH,
- "/=" : token.SLASHEQUAL,
- "*" : token.STAR,
- "*=" : token.STAREQUAL,
- "~" : token.TILDE,
- "|" : token.VBAR,
- "|=" : token.VBAREQUAL,
- "@": token.AT,
- }
-NT_OFFSET = token.NT_OFFSET
-
+from pypy.interpreter.pyparser.pysymbol import sym_values
+from pypy.interpreter.pyparser.pytoken import tok_values
class SyntaxNode(object):
@@ -103,17 +35,10 @@
return "<node [%s] at 0x%x>" % (self.name, id(self))
def __str__(self):
- return "(%s)" % self.name
+ return "(%s)" % self.name
def visit(self, visitor):
- """NOT RPYTHON, used only at bootstrap time anyway"""
- visit_meth = getattr(visitor, "visit_%s" % self.name, None)
- if visit_meth:
- return visit_meth(self)
- # helper function for nodes that have only one subnode:
- if len(self.nodes) == 1:
- return self.nodes[0].visit(visitor)
- raise RuntimeError("Unknown Visitor for %r" % self.name)
+ return visitor.visit_syntaxnode(self)
def expand(self):
"""expand the syntax node to its content,
@@ -123,7 +48,7 @@
def totuple(self, lineno=False ):
"""returns a tuple representation of the syntax tree"""
- symvalue = SYMBOLS.get( self.name, (0, self.name) )
+ symvalue = sym_values.get( self.name, (0, self.name) )
l = [ symvalue ]
l += [node.totuple(lineno) for node in self.nodes]
return tuple(l)
@@ -135,6 +60,9 @@
"""expand the syntax node to its content"""
return self.nodes
+ def visit(self, visitor):
+ return visitor.visit_tempsyntaxnode(self)
+
class TokenNode(SyntaxNode):
"""A token node"""
def __init__(self, name, source, value):
@@ -155,9 +83,12 @@
else:
return "<%s!>" % (self.name,)
+ def visit(self, visitor):
+ return visitor.visit_tokennode(self)
+
def totuple(self, lineno=False):
"""returns a tuple representation of the syntax tree"""
- num = TOKEN_MAP.get(self.name, -1)
+ num = tok_values.get(self.name, -1)
if num == -1:
print "Unknown", self.name, self.value
if self.value is not None:
Modified: pypy/dist/pypy/interpreter/pyparser/test/test_lookahead.py
==============================================================================
--- pypy/dist/pypy/interpreter/pyparser/test/test_lookahead.py (original)
+++ pypy/dist/pypy/interpreter/pyparser/test/test_lookahead.py Thu Jul 21 12:37:26 2005
@@ -4,33 +4,38 @@
class TestLookAheadBasics:
def setup_method(self, method):
- self.tok1 = Token('t1', 'foo')
- self.tok2 = Token('t2', 'bar')
- self.tok3 = Token('t3', 'foobar')
+ self.count = 0
+ self.tok1 = Token(self.nextid(), 'foo')
+ self.tok2 = Token(self.nextid(), 'bar')
+ self.tok3 = Token(self.nextid(), 'foobar')
self.tokens = [self.tok1, self.tok2, self.tok3]
build_first_sets(self.tokens)
+ def nextid(self):
+ self.count+=1
+ return self.count
+
def test_basic_token(self):
assert self.tok1.first_set == [self.tok1]
def test_basic_alternative(self):
- alt = Alternative('alt', self.tokens)
+ alt = Alternative(self.nextid(), self.tokens)
build_first_sets([alt])
assert alt.first_set == self.tokens
def test_basic_sequence(self):
- seq = Sequence('seq', self.tokens)
+ seq = Sequence(self.nextid(), self.tokens)
build_first_sets([seq])
assert seq.first_set == [self.tokens[0]]
def test_basic_kleenstar(self):
tok1, tok2, tok3 = self.tokens
- kstar = KleenStar('kstar', 1, 3, tok1)
+ kstar = KleenStar(self.nextid(), 1, 3, tok1)
build_first_sets([kstar])
assert kstar.first_set == [tok1]
- kstar = KleenStar('kstar', 0, 3, tok1)
+ kstar = KleenStar(self.nextid(), 0, 3, tok1)
build_first_sets([kstar])
assert kstar.first_set == [tok1, EmptyToken]
@@ -40,9 +45,9 @@
==> S.first_set = [tok1, tok2, EmptyToken]
"""
tok1, tok2, tok3 = self.tokens
- k1 = KleenStar('k1', 0, 2, tok1)
- k2 = KleenStar('k1', 0, 2, tok2)
- seq = Sequence('seq', [k1, k2])
+ k1 = KleenStar(self.nextid(), 0, 2, tok1)
+ k2 = KleenStar(self.nextid(), 0, 2, tok2)
+ seq = Sequence(self.nextid(), [k1, k2])
build_first_sets([k1, k2, seq])
assert seq.first_set == [tok1, tok2, EmptyToken]
@@ -52,33 +57,41 @@
==> S.first_set = [tok1, tok2]
"""
tok1, tok2, tok3 = self.tokens
- k1 = KleenStar('k1', 0, 2, tok1)
- k2 = KleenStar('k1', 1, 2, tok2)
- seq = Sequence('seq', [k1, k2])
+ k1 = KleenStar(self.nextid(), 0, 2, tok1)
+ k2 = KleenStar(self.nextid(), 1, 2, tok2)
+ seq = Sequence(self.nextid(), [k1, k2])
build_first_sets([k1, k2, seq])
assert seq.first_set == [tok1, tok2]
def test_token_comparison():
- assert Token('t1', 'foo') == Token('t1', 'foo')
- assert Token('t1', 'foo') != Token('t2', 'foo')
- assert Token('t2', 'foo') != Token('t1', None)
+ assert Token(1, 'foo') == Token(1, 'foo')
+ assert Token(1, 'foo') != Token(2, 'foo')
+ assert Token(2, 'foo') != Token(2, None)
+
+LOW = 1
+CAP = 2
+R_A = 3
+R_B = 4
+R_C = 5
+R_k1 = 6
+R_k2 = 7
class TestLookAhead:
def setup_method(self, method):
- self.LOW = Token('LOW', 'low')
- self.CAP = Token('CAP' ,'cap')
- self.A = Alternative('A', [])
- k1 = KleenStar('k1', 0, rule=self.LOW)
- k2 = KleenStar('k2', 0, rule=self.CAP)
- self.B = Sequence('B', [k1, self.A])
- self.C = Sequence('C', [k2, self.A])
+ self.LOW = Token(LOW, 'low')
+ self.CAP = Token(CAP ,'cap')
+ self.A = Alternative(R_A, [])
+ k1 = KleenStar(R_k1, 0, rule=self.LOW)
+ k2 = KleenStar(R_k2, 0, rule=self.CAP)
+ self.B = Sequence(R_B, [k1, self.A])
+ self.C = Sequence(R_C, [k2, self.A])
self.A.args = [self.B, self.C]
build_first_sets([self.A, self.B, self.C, self.LOW, self.CAP, k1, k2])
def test_S_first_set(self):
- for s in [Token('LOW', 'low'), EmptyToken, Token('CAP', 'cap')]:
+ for s in [Token(LOW, 'low'), EmptyToken, Token(CAP, 'cap')]:
assert s in self.A.first_set
assert s in self.B.first_set
assert s in self.C.first_set
Modified: pypy/dist/pypy/interpreter/pyparser/test/test_pytokenizer.py
==============================================================================
--- pypy/dist/pypy/interpreter/pyparser/test/test_pytokenizer.py (original)
+++ pypy/dist/pypy/interpreter/pyparser/test/test_pytokenizer.py Thu Jul 21 12:37:26 2005
@@ -1,15 +1,19 @@
from pypy.interpreter.pyparser.pythonlexer import Source, TokenError, \
match_encoding_declaration
-from pypy.interpreter.pyparser.grammar import Token
+from pypy.interpreter.pyparser.grammar import Token, GrammarElement
+from pypy.interpreter.pyparser.pytoken import EQUAL, ENDMARKER, LSQB, MINUS, NAME, NEWLINE, NULLTOKEN, NUMBER, RSQB, STRING
+
+from pypy.interpreter.pyparser.pytoken import tok_name, tok_punct
+GrammarElement.symbols = tok_name
+
def parse_source(source):
"""returns list of parsed tokens"""
lexer = Source(source.splitlines(True))
tokens = []
- last_token = Token(None, None)
- while last_token.name != 'ENDMARKER':
+ last_token = Token(NULLTOKEN, None)
+ while last_token.codename != ENDMARKER:
last_token = lexer.next()
- # tokens.append((last_token, value))
tokens.append(last_token)
return tokens
@@ -45,26 +49,26 @@
s = """['a'
]"""
tokens = parse_source(s)
- assert tokens == [Token('[', None), Token('STRING', "'a'"),
- Token(']', None), Token('NEWLINE', ''),
- Token('ENDMARKER', None)]
+ assert tokens == [Token(LSQB, None), Token(STRING, "'a'"),
+ Token(RSQB, None), Token(NEWLINE, ''),
+ Token(ENDMARKER, '')]
def test_numbers():
"""make sure all kind of numbers are correctly parsed"""
for number in NUMBERS:
- assert parse_source(number)[0] == Token('NUMBER', number)
+ assert parse_source(number)[0] == Token(NUMBER, number)
neg = '-%s' % number
- assert parse_source(neg)[:2] == [Token('-', None),
- Token('NUMBER', number)]
+ assert parse_source(neg)[:2] == [Token(MINUS, None),
+ Token(NUMBER, number)]
for number in BAD_NUMBERS:
- assert parse_source(number)[0] != Token('NUMBER', number)
+ assert parse_source(number)[0] != Token(NUMBER, number)
def test_hex_number():
"""basic parse"""
tokens = parse_source("a = 0x12L")
- assert tokens == [Token('NAME', 'a'), Token('=', None),
- Token('NUMBER', '0x12L'), Token('NEWLINE', ''),
- Token('ENDMARKER', None)]
+ assert tokens == [Token(NAME, 'a'), Token(EQUAL, None),
+ Token(NUMBER, '0x12L'), Token(NEWLINE, ''),
+ Token(ENDMARKER, '')]
def test_punct():
"""make sure each punctuation is correctly parsed"""
@@ -73,7 +77,7 @@
tokens = parse_source(pstr)
except TokenError, error:
tokens = [tok for tok, _, _, _ in error.token_stack]
- assert tokens[0].name == pstr
+ assert tokens[0].codename == tok_punct[pstr]
def test_encoding_declarations_match():
Modified: pypy/dist/pypy/interpreter/pyparser/tuplebuilder.py
==============================================================================
--- pypy/dist/pypy/interpreter/pyparser/tuplebuilder.py (original)
+++ pypy/dist/pypy/interpreter/pyparser/tuplebuilder.py Thu Jul 21 12:37:26 2005
@@ -1,7 +1,6 @@
from grammar import BaseGrammarBuilder
-from syntaxtree import TOKEN_MAP # , NT_OFFSET
-from pythonparse import SYMBOLS
+from pytoken import tok_name, tok_rpunct, NEWLINE, INDENT, DEDENT, ENDMARKER
class StackElement:
"""wraps TupleBuilder's tuples"""
@@ -18,7 +17,7 @@
return self.nodes[0][:-1]
class NonTerminal(StackElement):
- def __init__(self, num, nodes, rulename=None):
+ def __init__(self, num, nodes):
"""rulename should always be None with regular Python grammar"""
self.nodes = nodes
self.num = num
@@ -32,7 +31,7 @@
"""generate a nested tuples from a list of stack elements"""
expanded = []
for element in stack_elements:
- if isinstance(element, NonTerminal) and element.num == -2:
+ if isinstance(element, NonTerminal) and element.num<0:
expanded.extend(element.nodes)
else:
expanded.append(element)
@@ -46,38 +45,19 @@
# This attribute is here for convenience
self.source_encoding = None
self.lineno = lineno
- self._unknown = -10
+ self.tuplestack = []
- def _add_rule(self, rulename):
- SYMBOLS[rulename] = self._unknown
- self._unknown -= 1
-
def alternative(self, rule, source):
# Do nothing, keep rule on top of the stack
if rule.is_root():
nodes = expand_nodes( [self.stack[-1]] )
- if rule.name in SYMBOLS:
- self.stack[-1] = NonTerminal(SYMBOLS[rule.name], nodes)
- else:
- # Using regular CPython's Grammar should not lead here
- # XXX find how self._unknown is meant to be used
- self.stack[-1] = NonTerminal(self._unknown, nodes, rule.name)
- self._add_rule(rule.name)
+ self.stack[-1] = NonTerminal( rule.codename, nodes )
return True
def sequence(self, rule, source, elts_number):
""" """
- if rule.is_root():
- if rule.name in SYMBOLS:
- num = SYMBOLS[rule.name]
- node = [num]
- else:
- num = self._unknown
- node = [num]
- self._add_rule(rule.name)
- else:
- num = -2
- node = [num]
+ num = rule.codename
+ node = [rule.codename]
if elts_number > 0:
sequence_elements = self.stack[-elts_number:]
nodes = expand_nodes( sequence_elements )
@@ -86,13 +66,12 @@
self.stack.append( NonTerminal(num, []) )
return True
- def token(self, name, value, source):
- num = TOKEN_MAP.get(name, -1)
+ def token(self, codename, value, source):
lineno = source.current_lineno()
if value is None:
- if name not in ("NEWLINE", "INDENT", "DEDENT", "ENDMARKER"):
- value = name
+ if codename not in ( NEWLINE, INDENT, DEDENT, ENDMARKER ):
+ value = tok_rpunct.get(codename, "unknown op")
else:
value = ''
- self.stack.append( Terminal(num, value, lineno) )
+ self.stack.append( Terminal(codename, value, lineno) )
return True
Modified: pypy/dist/pypy/interpreter/stablecompiler/transformer.py
==============================================================================
--- pypy/dist/pypy/interpreter/stablecompiler/transformer.py (original)
+++ pypy/dist/pypy/interpreter/stablecompiler/transformer.py Thu Jul 21 12:37:26 2005
@@ -29,8 +29,8 @@
import pypy.interpreter.pyparser.pythonparse
from pypy.interpreter.stablecompiler.ast import *
import parser
-import symbol
-import token
+import pypy.interpreter.pyparser.pysymbol as symbol
+import pypy.interpreter.pyparser.pytoken as token
import sys
class WalkerError(StandardError):