[pypy-svn] r22790 - pypy/branch/ast-experiments/pypy/interpreter/pyparser
ludal at codespeak.net
Sat Jan 28 15:13:55 CET 2006
Author: ludal
Date: Sat Jan 28 15:13:53 2006
New Revision: 22790
Modified:
pypy/branch/ast-experiments/pypy/interpreter/pyparser/ebnfgrammar.py
pypy/branch/ast-experiments/pypy/interpreter/pyparser/ebnflexer.py
pypy/branch/ast-experiments/pypy/interpreter/pyparser/ebnfparse.py
pypy/branch/ast-experiments/pypy/interpreter/pyparser/grammar.py
pypy/branch/ast-experiments/pypy/interpreter/pyparser/pythonlexer.py
pypy/branch/ast-experiments/pypy/interpreter/pyparser/pythonparse.py
Log:
refactor, part II:
move the Parser class into grammar.py (it was forgotten in part I)
and make pythonlexer.py use it
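
For context, the Parser class now owns symbol/token numbering and rule
construction. A minimal usage sketch, assuming the grammar.py internals from
this branch (it mirrors the ebnfgrammar.py changes below):

    from grammar import Parser  # Parser moved into grammar.py by this commit

    p = Parser()
    p.add_token('TOK_STAR', '*')   # register tokens; second arg = literal value
    p.add_token('TOK_ADD', '+')
    # *_n constructors intern a rule *name* into an integer symbol code;
    # the bare constructors take an already-interned integer code.
    star = p.Alternative_n("star", [p.Token_n('TOK_STAR', '*'),
                                    p.Token_n('TOK_ADD', '+')])
    star_opt = p.KleeneStar_n("star_opt", 0, 1, rule=star)
    p.root_rules['star_opt'] = star_opt
    p.build_first_sets()           # fixpoint computation over all rules
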
Modified: pypy/branch/ast-experiments/pypy/interpreter/pyparser/ebnfgrammar.py
==============================================================================
--- pypy/branch/ast-experiments/pypy/interpreter/pyparser/ebnfgrammar.py (original)
+++ pypy/branch/ast-experiments/pypy/interpreter/pyparser/ebnfgrammar.py Sat Jan 28 15:13:53 2006
@@ -2,9 +2,8 @@
# and the symbol mappings
from grammar import BaseGrammarBuilder, Alternative, Sequence, Token, \
- KleeneStar, GrammarElement
+ KleeneStar, GrammarElement, Parser
-from pypy.interpreter.pyparser.parser import Parser
## sym_map = {}
## sym_rmap = {}
@@ -68,34 +67,34 @@
     p.add_token('EOF','EOF')
     # star: '*' | '+'
-    star = p.Alternative( "star", [p.Token('TOK_STAR', '*'), p.Token('TOK_ADD', '+')] )
-    star_opt = p.KleeneStar ( "star_opt", 0, 1, rule=star )
+    star = p.Alternative_n( "star", [p.Token_n('TOK_STAR', '*'), p.Token_n('TOK_ADD', '+')] )
+    star_opt = p.KleeneStar_n( "star_opt", 0, 1, rule=star )
     # rule: SYMBOL ':' alternative
-    symbol = p.Sequence( "symbol", [p.Token('TOK_SYMBOL'), star_opt] )
-    symboldef = p.Token( 'TOK_SYMDEF' )
-    alternative = p.Sequence( "alternative", [])
-    rule = p.Sequence( "rule", [symboldef, alternative] )
+    symbol = p.Sequence_n( "symbol", [p.Token_n('TOK_SYMBOL'), star_opt] )
+    symboldef = p.Token_n( 'TOK_SYMDEF' )
+    alternative = p.Sequence_n( "alternative", [])
+    rule = p.Sequence_n( "rule", [symboldef, alternative] )
     # grammar: rule+
-    grammar = p.KleeneStar( "grammar", _min=1, rule=rule )
+    grammar = p.KleeneStar_n( "grammar", _min=1, rule=rule )
     # alternative: sequence ( '|' sequence )*
-    sequence = p.KleeneStar( "sequence", 1 )
-    seq_cont_list = p.Sequence( "seq_cont_list", [p.Token('TOK_BAR', '|'), sequence] )
-    sequence_cont = p.KleeneStar( "sequence_cont",0, rule=seq_cont_list )
+    sequence = p.KleeneStar_n( "sequence", 1 )
+    seq_cont_list = p.Sequence_n( "seq_cont_list", [p.Token_n('TOK_BAR', '|'), sequence] )
+    sequence_cont = p.KleeneStar_n( "sequence_cont",0, rule=seq_cont_list )
     alternative.args = [ sequence, sequence_cont ]
     # option: '[' alternative ']'
-    option = p.Sequence( "option", [p.Token('TOK_LBRACKET', '['), alternative, p.Token('TOK_RBRACKET', ']')] )
+    option = p.Sequence_n( "option", [p.Token_n('TOK_LBRACKET', '['), alternative, p.Token_n('TOK_RBRACKET', ']')] )
     # group: '(' alternative ')'
-    group = p.Sequence( "group", [p.Token('TOK_LPAR', '('), alternative, p.Token('TOK_RPAR', ')'), star_opt] )
+    group = p.Sequence_n( "group", [p.Token_n('TOK_LPAR', '('), alternative, p.Token_n('TOK_RPAR', ')'), star_opt] )
     # sequence: (SYMBOL | STRING | option | group )+
-    string = p.Token('TOK_STRING')
-    alt = p.Alternative( "sequence_alt", [symbol, string, option, group] )
+    string = p.Token_n('TOK_STRING')
+    alt = p.Alternative_n( "sequence_alt", [symbol, string, option, group] )
     sequence.args = [ alt ]
     p.root_rules['grammar'] = grammar
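
The rename from Alternative/Sequence/KleeneStar/Token to the *_n variants is
the visible part of the new naming layer: *_n interns a string name into an
integer code. A small sketch of the interning behaviour, per the Parser class
added to grammar.py in this commit:

    from grammar import Parser

    p = Parser()
    code1 = p.add_symbol('rule')   # first registration allocates a fresh code
    code2 = p.add_symbol('rule')   # re-registration returns the same code
    assert code1 == code2 == 0
    assert p.sym_name[code1] == 'rule'
    # generated (anonymous) symbols count downwards from -10
    assert p.add_anon_symbol(':rule_0') == -10
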
Modified: pypy/branch/ast-experiments/pypy/interpreter/pyparser/ebnflexer.py
==============================================================================
--- pypy/branch/ast-experiments/pypy/interpreter/pyparser/ebnflexer.py (original)
+++ pypy/branch/ast-experiments/pypy/interpreter/pyparser/ebnflexer.py Sat Jan 28 15:13:53 2006
@@ -125,6 +125,7 @@
# means backtracking more than one token
# will re-tokenize the stream (but this is the
# grammar lexer so we don't care really!)
+ _p = self.parser
if self._peeked is not None:
peeked = self._peeked
self._peeked = None
@@ -135,7 +136,7 @@
end = len(self.input)
pos = self.skip_empty_lines(inp,pos,end)
if pos==end:
- return self.parser.Token( 'EOF', None)
+ return _p.Token( _p.EOF, None)
# at this point nextchar is not a white space nor \n
nextchr = inp[pos]
@@ -147,22 +148,22 @@
self.pos = npos
_endpos = npos - 1
assert _endpos>=0
- return self.parser.Token( 'TOK_STRING', inp[pos+1:_endpos])
+ return _p.Token( _p.TOK_STRING, inp[pos+1:_endpos])
else:
npos = match_symbol( inp, pos, end)
if npos!=pos:
self.pos = npos
if npos!=end and inp[npos]==":":
self.pos += 1
- return self.parser.Token( 'TOK_SYMDEF', inp[pos:npos])
+ return _p.Token( _p.TOK_SYMDEF, inp[pos:npos])
else:
- return self.parser.Token( 'TOK_SYMBOL', inp[pos:npos])
+ return _p.Token( _p.TOK_SYMBOL, inp[pos:npos])
# we still have pos!=end here
chr = inp[pos]
if chr in "[]()*+|":
self.pos = pos+1
- return Token( self.parser, self.parser.tok_values[chr], chr)
+ return _p.Token( _p.tok_values[chr], chr)
self.RaiseError( "Unknown token" )
def peek(self):
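
The lexer now asks its parser for integer token codes (_p.EOF, _p.TOK_STRING,
...) instead of passing strings around. Those attributes are assumed to be set
on the parser during grammar setup, which this diff does not show; a
hypothetical sketch:

    from grammar import Parser

    p = Parser()
    # hypothetical setup: expose each token code as a parser attribute,
    # the way the _p.EOF / _p.TOK_STRING accesses above assume
    for name in ('EOF', 'TOK_STRING', 'TOK_SYMDEF', 'TOK_SYMBOL'):
        setattr(p, name, p.add_token(name))
    tok = p.Token(p.TOK_STRING, 'hello')  # build a token from its integer code
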
Modified: pypy/branch/ast-experiments/pypy/interpreter/pyparser/ebnfparse.py
==============================================================================
--- pypy/branch/ast-experiments/pypy/interpreter/pyparser/ebnfparse.py (original)
+++ pypy/branch/ast-experiments/pypy/interpreter/pyparser/ebnfparse.py Sat Jan 28 15:13:53 2006
@@ -1,12 +1,11 @@
#!/usr/bin/env python
from grammar import BaseGrammarBuilder, Alternative, Sequence, Token
from grammar import GrammarProxy, KleeneStar, GrammarElement, build_first_sets
-from grammar import AbstractBuilder, AbstractContext
+from grammar import AbstractBuilder, AbstractContext, Parser
from ebnflexer import GrammarSource
import ebnfgrammar
from ebnfgrammar import GRAMMAR_GRAMMAR
from syntaxtree import AbstractSyntaxVisitor
-from parser import Parser
ORDA = ord("A")
@@ -117,7 +116,8 @@
name is based on the current grammar rule being parsed"""
rule_name = ":" + self.current_rule_name + "_%d" % self.current_subrule
self.current_subrule += 1
- return rule_name
+ name_id = self.parser.add_anon_symbol( rule_name )
+ return name_id
def new_rule(self, rule):
"""A simple helper method that registers a new rule as 'known'"""
@@ -155,7 +155,7 @@
def get_rule( self, name ):
if name in self.parser.tokens:
- return self.parser.Token( name )
+ return self.parser.Token_n( name )
codename = self.get_symbolcode( name )
if codename in self.parser.root_rules:
return self.parser.root_rules[codename]
@@ -220,7 +220,7 @@
del self.rule_stack[0]
if isinstance(old_rule,Token):
# Wrap a token into an alternative
- old_rule = self.parser.Alternative( self.current_rule_name, [old_rule] )
+ old_rule = self.parser.Alternative( self.current_rule, [old_rule] )
else:
# Make sure we use the codename from the named rule
old_rule.codename = self.current_rule
@@ -275,12 +275,12 @@
if value in self.parser.tok_values:
# punctuation
tokencode = self.parser.tok_values[value]
- tok = Token( self.parser, tokencode, None )
+ tok = self.parser.Token( tokencode, None )
else:
if not is_py_name(value):
raise RuntimeError("Unknown STRING value ('%s')" % value)
# assume a keyword
- tok = Token( self.parser, self.parser.NAME, value)
+ tok = self.parser.Token( self.parser.NAME, value)
if value not in self.keywords:
self.keywords.append(value)
self.rule_stack.append(tok)
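
The last hunk covers the two meanings of a STRING in a grammar file: known
punctuation becomes a token looked up by code in tok_values, anything else is
treated as a keyword, i.e. a NAME token carrying the keyword as its value. A
sketch with a hypothetical minimal token setup:

    from grammar import Parser

    p = Parser()
    p.NAME = p.add_token('NAME')            # hypothetical setup for the sketch
    p.add_token('TOK_COLON', ':')
    colon = p.Token(p.tok_values[':'], None)  # "':'"  -> punctuation, by code
    kw_if = p.Token(p.NAME, 'if')             # "'if'" -> keyword as NAME token
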
Modified: pypy/branch/ast-experiments/pypy/interpreter/pyparser/grammar.py
==============================================================================
--- pypy/branch/ast-experiments/pypy/interpreter/pyparser/grammar.py (original)
+++ pypy/branch/ast-experiments/pypy/interpreter/pyparser/grammar.py Sat Jan 28 15:13:53 2006
@@ -10,12 +10,10 @@
try:
from pypy.interpreter.baseobjspace import Wrappable
from pypy.interpreter.pyparser.pytoken import NULLTOKEN
- from pypy.interpreter.pyparser.parser import Parser
except ImportError:
# allows standalone testing
Wrappable = object
NULLTOKEN = -1 # None
- from parser import Parser
from syntaxtree import SyntaxNode, TempSyntaxNode, TokenNode
@@ -303,10 +301,18 @@
pass
     def __str__(self):
-        return self.display(0)
+        try:
+            return self.display(0)
+        except Exception:
+            import traceback
+            traceback.print_exc()
+            return "<error in display>"

     def __repr__(self):
-        return self.display(0)
+        try:
+            return self.display(0)
+        except Exception:
+            import traceback
+            traceback.print_exc()
+            return "<error in display>"
def display(self, level=0):
"""Helper function used to represent the grammar.
@@ -717,4 +723,137 @@
-
+class Parser(object):
+    def __init__(self):
+        self.sym_name = {}  # mapping symbol code -> symbol name
+        self.symbols = {}   # mapping symbol name -> symbol code
+        self.tokens = { 'NULLTOKEN' : -1 }
+        self.EmptyToken = Token( self, -1, None )
+        self.tok_name = {}
+        self.tok_values = {}
+        self._ann_sym_count = -10
+        self._sym_count = 0
+        self.all_rules = []
+        self.root_rules = {}
+
+    def symbol_repr( self, codename ):
+        if codename in self.tok_name:
+            return self.tok_name[codename]
+        elif codename in self.sym_name:
+            return self.sym_name[codename]
+        return "%d" % codename
+
+    def add_symbol( self, sym ):
+        assert isinstance( sym, str )
+        if not sym in self.symbols:
+            val = self._sym_count
+            self._sym_count += 1
+            self.symbols[sym] = val
+            self.sym_name[val] = sym
+            return val
+        return self.symbols[ sym ]
+
+    def add_anon_symbol( self, sym ):
+        assert isinstance( sym, str )
+        if not sym in self.symbols:
+            val = self._ann_sym_count
+            self._ann_sym_count -= 1
+            self.symbols[sym] = val
+            self.sym_name[val] = sym
+            return val
+        return self.symbols[ sym ]
+
+    def add_token( self, tok, value = None ):
+        assert isinstance( tok, str )
+        if not tok in self.tokens:
+            val = self._sym_count
+            self._sym_count += 1
+            self.tokens[tok] = val
+            self.tok_name[val] = tok
+            if value is not None:
+                self.tok_values[value] = val
+            return val
+        return self.tokens[ tok ]
+
+    def load_symbols( self, symbols ):
+        for _value, _name in symbols.items():
+            if _value < self._ann_sym_count:
+                self._ann_sym_count = _value - 1
+            if _value > self._sym_count:
+                self._sym_count = _value + 1
+            self.symbols[_name] = _value
+            self.sym_name[_value] = _name
+
+    def build_first_sets(self):
+        """builds the real first tokens set for each rule in <rules>
+
+        Because a rule can be recursive (directly or indirectly), the
+        *simplest* algorithm to build each first set is to recompute them
+        until Computation(N) = Computation(N-1), N being the number of rounds.
+        As an example, on Python2.3's grammar, we need 19 cycles to compute
+        full first sets.
+        """
+        rules = self.all_rules
+        changed = True
+        while changed:
+            # loop while one first set is changed
+            changed = False
+            for rule in rules:
+                # For each rule, recompute first set
+                size = len(rule.first_set)
+                rule.calc_first_set()
+                new_size = len(rule.first_set)
+                if new_size != size:
+                    changed = True
+        for r in rules:
+            assert len(r.first_set) > 0, "Error: empty first set for %s" % r
+            r.reorder_rule()
+
+
+    def Alternative( self, name_id, args ):
+        assert isinstance( name_id, int )
+        alt = Alternative( self, name_id, args )
+        self.all_rules.append( alt )
+        return alt
+
+    def Alternative_n(self, name, args ):
+        assert isinstance(name, str)
+        name_id = self.add_symbol( name )
+        return self.Alternative( name_id, args )
+
+    def Sequence( self, name_id, args ):
+        assert isinstance( name_id, int )
+        alt = Sequence( self, name_id, args )
+        self.all_rules.append( alt )
+        return alt
+
+    def Sequence_n(self, name, args ):
+        assert isinstance(name, str)
+        name_id = self.add_symbol( name )
+        return self.Sequence( name_id, args )
+
+    def KleeneStar( self, name_id, _min = 0, _max = -1, rule = None ):
+        assert isinstance( name_id, int )
+        alt = KleeneStar( self, name_id, _min, _max, rule )
+        self.all_rules.append( alt )
+        return alt
+
+    def KleeneStar_n(self, name, _min = 0, _max = -1, rule = None ):
+        assert isinstance(name, str)
+        name_id = self.add_symbol( name )
+        return self.KleeneStar( name_id, _min, _max, rule )
+
+    def Token_n(self, name, value = None ):
+        assert isinstance( name, str)
+        assert value is None or isinstance( value, str)
+        name_id = self.add_token( name, value )
+        return self.Token( name_id, value )
+
+    def Token(self, name_id, value = None ):
+        assert isinstance( name_id, int )
+        assert value is None or isinstance( value, str)
+        tok = Token( self, name_id, value )
+        return tok
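
One subtlety in the new class: load_symbols merges an externally numbered
symbol table and pushes the internal counters past the imported codes, so
later add_symbol/add_anon_symbol calls cannot collide with them. Sketch,
assuming the class exactly as added above:

    from grammar import Parser

    p = Parser()
    p.load_symbols({5: 'file_input', -12: ':anon_rule'})  # code -> name
    assert p.symbols['file_input'] == 5
    assert p._sym_count == 6        # next regular symbol gets a fresh code
    assert p._ann_sym_count == -13  # next anonymous symbol stays below -12
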
Modified: pypy/branch/ast-experiments/pypy/interpreter/pyparser/pythonlexer.py
==============================================================================
--- pypy/branch/ast-experiments/pypy/interpreter/pyparser/pythonlexer.py (original)
+++ pypy/branch/ast-experiments/pypy/interpreter/pyparser/pythonlexer.py Sat Jan 28 15:13:53 2006
@@ -5,7 +5,7 @@
import sys
from codeop import PyCF_DONT_IMPLY_DEDENT
-from pypy.interpreter.pyparser.grammar import TokenSource, Token, AbstractContext
+from pypy.interpreter.pyparser.grammar import TokenSource, Token, AbstractContext, Parser
from pypy.interpreter.pyparser.error import SyntaxError
import pytoken
@@ -51,7 +51,7 @@
################################################################################
from pypy.interpreter.pyparser import pytoken
from pytokenize import tabsize, whiteSpaceDFA, triple_quoted, endDFAs, \
- single_quoted, pseudoDFA
+ single_quoted, pseudoDFA
import automata
@@ -62,7 +62,7 @@
SyntaxError.__init__(self, msg, lineno, offset, line)
self.token_stack = token_stack
-def generate_tokens(lines, flags):
+def generate_tokens( parser, lines, flags):
"""
This is a rewrite of pypy.module.parser.pytokenize.generate_tokens since
the original function is not RPYTHON (uses yield)
@@ -91,6 +91,7 @@
#for line in lines:
# print repr(line)
#print '------------------- flags=%s ---->' % flags
+ assert isinstance( parser, Parser )
token_list = []
lnum = parenlev = continued = 0
namechars = NAMECHARS
@@ -120,7 +121,7 @@
endmatch = endDFA.recognize(line)
if endmatch >= 0:
pos = end = endmatch
- tok = Token(pytoken.STRING, contstr + line[:end])
+ tok = parser.Token(parser.STRING, contstr + line[:end])
token_list.append((tok, line, lnum, pos))
last_comment = ''
# token_list.append((STRING, contstr + line[:end],
@@ -129,7 +130,7 @@
contline = None
elif (needcont and not line.endswith('\\\n') and
not line.endswith('\\\r\n')):
- tok = Token(pytoken.ERRORTOKEN, contstr + line)
+ tok = parser.Token(parser.ERRORTOKEN, contstr + line)
token_list.append((tok, line, lnum, pos))
last_comment = ''
# token_list.append((ERRORTOKEN, contstr + line,
@@ -155,10 +156,10 @@
if line[pos] in '#\r\n': # skip comments or blank lines
if line[pos] == '#':
- tok = Token(pytoken.COMMENT, line[pos:])
+ tok = parser.Token(parser.COMMENT, line[pos:])
last_comment = line[pos:]
else:
- tok = Token(pytoken.NL, line[pos:])
+ tok = parser.Token(parser.NL, line[pos:])
last_comment = ''
# XXX Skip NL and COMMENT Tokens
# token_list.append((tok, line, lnum, pos))
@@ -166,12 +167,12 @@
if column > indents[-1]: # count indents or dedents
indents.append(column)
- tok = Token(pytoken.INDENT, line[:pos])
+ tok = parser.Token(parser.INDENT, line[:pos])
token_list.append((tok, line, lnum, pos))
last_comment = ''
while column < indents[-1]:
indents = indents[:-1]
- tok = Token(pytoken.DEDENT, '')
+ tok = parser.Token(parser.DEDENT, '')
token_list.append((tok, line, lnum, pos))
last_comment = ''
else: # continued statement
@@ -198,22 +199,22 @@
token, initial = line[start:end], line[start]
if initial in numchars or \
(initial == '.' and token != '.'): # ordinary number
- tok = Token(pytoken.NUMBER, token)
+ tok = parser.Token(parser.NUMBER, token)
token_list.append((tok, line, lnum, pos))
last_comment = ''
elif initial in '\r\n':
if parenlev > 0:
- tok = Token(pytoken.NL, token)
+ tok = parser.Token(parser.NL, token)
last_comment = ''
# XXX Skip NL
else:
- tok = Token(pytoken.NEWLINE, token)
+ tok = parser.Token(parser.NEWLINE, token)
# XXX YUCK !
tok.value = last_comment
token_list.append((tok, line, lnum, pos))
last_comment = ''
elif initial == '#':
- tok = Token(pytoken.COMMENT, token)
+ tok = parser.Token(parser.COMMENT, token)
last_comment = token
# XXX Skip # token_list.append((tok, line, lnum, pos))
# token_list.append((COMMENT, token, spos, epos, line))
@@ -223,7 +224,7 @@
if endmatch >= 0: # all on one line
pos = endmatch
token = line[start:pos]
- tok = Token(pytoken.STRING, token)
+ tok = parser.Token(parser.STRING, token)
token_list.append((tok, line, lnum, pos))
last_comment = ''
else:
@@ -240,11 +241,11 @@
contline = line
break
else: # ordinary string
- tok = Token(pytoken.STRING, token)
+ tok = parser.Token(parser.STRING, token)
token_list.append((tok, line, lnum, pos))
last_comment = ''
elif initial in namechars: # ordinary name
- tok = Token(pytoken.NAME, token)
+ tok = parser.Token(parser.NAME, token)
token_list.append((tok, line, lnum, pos))
last_comment = ''
elif initial == '\\': # continued stmt
@@ -258,10 +259,11 @@
if parenlev < 0:
raise TokenError("unmatched '%s'" % initial, line,
(lnum-1, 0), token_list)
- if token in pytoken.tok_punct:
- tok = Token(pytoken.tok_punct[token])
+ if token in parser.tok_values:
+ punct = parser.tok_values[token]
+ tok = parser.Token(punct)
else:
- tok = Token(pytoken.OP, token)
+ tok = parser.Token(parser.OP, token)
token_list.append((tok, line, lnum, pos))
last_comment = ''
else:
@@ -271,7 +273,7 @@
if start<max and line[start] in single_quoted:
raise TokenError("EOL while scanning single-quoted string", line,
(lnum, start), token_list)
- tok = Token(pytoken.ERRORTOKEN, line[pos])
+ tok = parser.Token(parser.ERRORTOKEN, line[pos])
token_list.append((tok, line, lnum, pos))
last_comment = ''
pos = pos + 1
@@ -279,14 +281,14 @@
lnum -= 1
if not (flags & PyCF_DONT_IMPLY_DEDENT):
if token_list and token_list[-1][0].codename != pytoken.NEWLINE:
- token_list.append((Token(pytoken.NEWLINE, ''), '\n', lnum, 0))
+ token_list.append((parser.Token(parser.NEWLINE, ''), '\n', lnum, 0))
for indent in indents[1:]: # pop remaining indent levels
- tok = Token(pytoken.DEDENT, '')
+ tok = parser.Token(parser.DEDENT, '')
token_list.append((tok, line, lnum, pos))
#if token_list and token_list[-1][0].codename != pytoken.NEWLINE:
- token_list.append((Token(pytoken.NEWLINE, ''), '\n', lnum, 0))
+ token_list.append((parser.Token(parser.NEWLINE, ''), '\n', lnum, 0))
- tok = Token(pytoken.ENDMARKER, '',)
+ tok = parser.Token(parser.ENDMARKER, '',)
token_list.append((tok, line, lnum, pos))
#for t in token_list:
# print '%20s %-25s %d' % (pytoken.tok_name.get(t[0].codename, '?'), t[0], t[-2])
@@ -300,9 +302,10 @@
class PythonSource(TokenSource):
"""This source uses Jonathan's tokenizer"""
- def __init__(self, strings, flags=0):
+ def __init__(self, parser, strings, flags=0):
# TokenSource.__init__(self)
- tokens = generate_tokens(strings, flags)
+ #self.parser = parser
+ tokens = generate_tokens( parser, strings, flags)
self.token_stack = tokens
self._current_line = '' # the current line (as a string)
self._lineno = -1
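
generate_tokens (and hence PythonSource) now needs a parser so that every
Token is built with that parser's codes, and punctuation is resolved through
parser.tok_values instead of the module-level pytoken.tok_punct table. A
hypothetical calling sketch; it assumes the parser instance has already been
given the Python token attributes (parser.STRING, parser.NEWLINE, ...) by the
pytoken setup, which is not part of this diff:

    from pythonparse import PythonParser
    from pythonlexer import PythonSource

    p = PythonParser()  # assumed: token/symbol tables populated elsewhere
    src = PythonSource(p, ["x = 1\n"], flags=0)
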
Modified: pypy/branch/ast-experiments/pypy/interpreter/pyparser/pythonparse.py
==============================================================================
--- pypy/branch/ast-experiments/pypy/interpreter/pyparser/pythonparse.py (original)
+++ pypy/branch/ast-experiments/pypy/interpreter/pyparser/pythonparse.py Sat Jan 28 15:13:53 2006
@@ -17,7 +17,7 @@
import pypy.interpreter.pyparser.pytoken as pytoken
import pypy.interpreter.pyparser.ebnfparse as ebnfparse
import pypy.interpreter.pyparser.grammar as grammar
-from pypy.interpreter.pyparser.parser import Parser
+
try:
from pypy.interpreter.pyparser import symbol
except ImportError:
@@ -26,10 +26,10 @@
from codeop import PyCF_DONT_IMPLY_DEDENT
-class PythonParser(Parser):
+class PythonParser(grammar.Parser):
"""Wrapper class for python grammar"""
def __init__(self):
- Parser.__init__(self)
+ grammar.Parser.__init__(self)
def parse_source(self, textsrc, goal, builder, flags=0):
"""Parse a python source according to goal"""
@@ -50,9 +50,9 @@
return self.parse_lines(lines, goal, builder, flags)
def parse_lines(self, lines, goal, builder, flags=0):
- goalnumber = pysymbol._cpython_symbols.sym_values[goal]
+ goalnumber = self.symbols[goal]
target = self.root_rules[goalnumber]
- src = Source(lines, flags)
+ src = Source(self, lines, flags)
result = target.match(src, builder)
if not result:
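
The goal lookup shows the payoff of the refactoring: each parser owns its
symbol table, so no shared pysymbol module is needed. A sketch, with
'file_input' as a hypothetical goal name:

    p = PythonParser()
    # assumed: grammar loading fills p.symbols and p.root_rules
    goalnumber = p.symbols['file_input']  # was pysymbol._cpython_symbols.sym_values[goal]
    target = p.root_rules[goalnumber]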