[pypy-svn] r54598 - in pypy/dist/pypy/interpreter/pyparser: . test
arigo at codespeak.net
Fri May 9 17:38:31 CEST 2008
Author: arigo
Date: Fri May 9 17:38:28 2008
New Revision: 54598
Modified:
pypy/dist/pypy/interpreter/pyparser/ebnfparse.py
pypy/dist/pypy/interpreter/pyparser/grammar.py
pypy/dist/pypy/interpreter/pyparser/test/test_lookahead.py
pypy/dist/pypy/interpreter/pyparser/test/test_pytokenizer.py
Log:
Remove the linear scan in GrammarElement.match_first_set,
which was measured to consume over 6% of the total
execution time of "pypy-c translate.py somesmalltarget".
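
The change in a nutshell, as a minimal standalone sketch (the Tok class
and the sample tokens below are hypothetical stand-ins for the parser's
real Token objects, not the actual classes):

    # Before: every match attempt linearly scanned first_set.
    def match_first_set_scan(first_set, other):
        for tk in first_set:
            if (tk.isKeyword == other.isKeyword and
                tk.codename == other.codename and
                tk.value in (None, other.value)):
                return True
        return False

    # After: precompute a two-level structure once, then match in O(1).
    # cache[isKeyword] maps codename -> None ("any value matches") or
    # the set of accepted values.
    def optimize_first_set(first_set):
        cache = [{}, {}]          # indexed by isKeyword (False=0, True=1)
        for tk in first_set:
            values = cache[tk.isKeyword].get(tk.codename, set())
            if tk.value is None or values is None:
                cache[tk.isKeyword][tk.codename] = None   # match any value
            else:
                values.add(tk.value)
                cache[tk.isKeyword][tk.codename] = values
        return cache

    def match_first_set_cached(cache, other):
        values = cache[other.isKeyword].get(other.codename, set())
        return values is None or other.value in values

    class Tok:                    # hypothetical stand-in for Token
        def __init__(self, isKeyword, codename, value=None):
            self.isKeyword = isKeyword
            self.codename = codename
            self.value = value

    cache = optimize_first_set([Tok(False, 'NAME'),         # any NAME
                                Tok(True, 'NAME', 'if')])   # keyword 'if'
    assert match_first_set_cached(cache, Tok(False, 'NAME', 'x'))
    assert match_first_set_cached(cache, Tok(True, 'NAME', 'if'))
    assert not match_first_set_cached(cache, Tok(True, 'NAME', 'while'))

The real patch below uses dicts mapping to None instead of sets
(first_set itself also becomes such a dict), presumably because RPython
lacked set support at the time, and it hoists the EmptyToken membership
test into a precomputed boolean flag (emptytoken_in_first_set).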
Modified: pypy/dist/pypy/interpreter/pyparser/ebnfparse.py
==============================================================================
--- pypy/dist/pypy/interpreter/pyparser/ebnfparse.py (original)
+++ pypy/dist/pypy/interpreter/pyparser/ebnfparse.py Fri May 9 17:38:28 2008
@@ -29,6 +29,8 @@
class NameToken(Token):
"""A token that is not a keyword"""
+ isKeyword = False
+
def __init__(self, parser):
Token.__init__(self, parser, parser.tokens['NAME'])
@@ -56,8 +58,7 @@
def match_token(self, builder, other):
- """special case of match token for tokens which are really keywords
- """
+ # Historical stuff. Might be useful for debugging.
if not isinstance(other, Token):
raise RuntimeError("Unexpected token type")
if other is self.parser.EmptyToken:
Modified: pypy/dist/pypy/interpreter/pyparser/grammar.py
==============================================================================
--- pypy/dist/pypy/interpreter/pyparser/grammar.py (original)
+++ pypy/dist/pypy/interpreter/pyparser/grammar.py Fri May 9 17:38:28 2008
@@ -192,7 +192,8 @@
# integer mapping to either a token value or rule symbol value
self.codename = codename
self.args = []
- self.first_set = []
+ self.first_set = {}
+ self.emptytoken_in_first_set = False
self._trace = False
def is_root(self):
@@ -223,9 +224,9 @@
token = source.peek()
if self._trace:
pos1 = source.get_pos()
- in_first_set = self.match_first_set(builder, token)
- if not in_first_set: # and not EmptyToken in self.first_set:
- if self.parser.EmptyToken in self.first_set:
+ in_first_set = self.match_first_set(token)
+ if not in_first_set:
+ if self.emptytoken_in_first_set:
ret = builder.sequence(self, source, 0 )
if self._trace:
self._debug_display(token, level, 'eee' )
@@ -315,17 +316,51 @@
# XXX: first_set could probably be implemented with sets
return []
- def match_first_set(self, builder, other):
- """matching is not equality:
- token('NAME','x') matches token('NAME',None)
- """
+ def optimize_first_set(self):
+ """Precompute a data structure that optimizes match_first_set().
+ The first_set attribute should no longer be needed after this.
+ """
+ self.emptytoken_in_first_set = self.parser.EmptyToken in self.first_set
+ # see match_first_set() for the way this _match_cache is supposed
+ # to be used
+ self._match_cache = [GrammarElement._EMPTY_CODENAME_SET, # share empty
+ GrammarElement._EMPTY_CODENAME_SET] # dicts
for tk in self.first_set:
- if tk.match_token(builder, other):
- return True
- return False
+ if tk is not self.parser.EmptyToken:
+ cache = self._match_cache[tk.isKeyword]
+ if not cache:
+ cache = self._match_cache[tk.isKeyword] = {} # new dict
+ if tk.value is None:
+ cache[tk.codename] = None # match any value
+ else:
+ values = cache.setdefault(tk.codename, {})
+ if values is None:
+ pass # already seen another tk matching any value
+ else:
+ values[tk.value] = None # add tk.value to the set
+
+ _EMPTY_CODENAME_SET = {}
+ _EMPTY_VALUES_SET = {}
- def in_first_set(self, other):
- return other in self.first_set
+ def match_first_set(self, other):
+ """matching is not equality:
+ token('NAME','x') matches token('NAME',None).
+
+ More precisely, for a match, we need to find a tk in self.first_set
+ for which all the following is true:
+ - other is not EmptyToken
+ - other.isKeyword == tk.isKeyword
+ - other.codename == tk.codename
+ - other.value == tk.value or tk.value is None
+ """
+ try:
+ cachelist = self._match_cache
+ except AttributeError:
+ return True # not computed yet
+ cache = cachelist[other.isKeyword]
+ values = cache.get(other.codename, GrammarElement._EMPTY_VALUES_SET)
+ return (values is None or # 'None' means 'matches anything'
+ other.value in values) # otherwise, ok only if in the set
def reorder_rule(self):
"""Called after the computation of first set to allow rules to be
@@ -377,7 +412,7 @@
# to see if this solve our problems with infinite recursion
for rule in self.args:
if USE_LOOKAHEAD:
- if not rule.match_first_set(builder, tok) and self.parser.EmptyToken not in rule.first_set:
+ if not rule.match_first_set(tok) and not rule.emptytoken_in_first_set:
if self._trace:
print "Skipping impossible rule: %s" % (rule,)
continue
@@ -406,9 +441,7 @@
# do this to avoid problems on indirect recursive rules
for rule in self.args:
for t in rule.first_set:
- if t not in self.first_set:
- self.first_set.append(t)
- # self.first_set[t] = 1
+ self.first_set[t] = None
def reorder_rule(self):
# take the opportunity to reorder rules in alternatives
@@ -508,14 +541,10 @@
if not rule.first_set:
break
if self.parser.EmptyToken in self.first_set:
- self.first_set.remove( self.parser.EmptyToken )
-
- # del self.first_set[self.parser.EmptyToken]
+ del self.first_set[self.parser.EmptyToken]
# while we're in this loop, keep aggregating possible tokens
for t in rule.first_set:
- if t not in self.first_set:
- self.first_set.append(t)
- # self.first_set[t] = 1
+ self.first_set[t] = None
if self.parser.EmptyToken not in rule.first_set:
break
@@ -545,8 +574,7 @@
self.max = _max
self.star = "x"
if self.min == 0:
- self.first_set.append( self.parser.EmptyToken )
- # self.first_set[self.parser.EmptyToken] = 1
+ self.first_set[self.parser.EmptyToken] = None
def _match(self, source, builder, level=0):
"""matches a number of times self.args[0]. the number must be
@@ -607,11 +635,9 @@
LAH(S) = LAH(A)
"""
rule = self.args[0]
- self.first_set = rule.first_set[:]
- # self.first_set = dict(rule.first_set)
- if self.min == 0 and self.parser.EmptyToken not in self.first_set:
- self.first_set.append(self.parser.EmptyToken)
- # self.first_set[self.parser.EmptyToken] = 1
+ self.first_set = rule.first_set.copy()
+ if self.min == 0:
+ self.first_set[self.parser.EmptyToken] = None
def validate( self, syntax_node ):
"""validate a syntax tree/subtree from this grammar node"""
@@ -634,8 +660,7 @@
def __init__(self, parser, codename, value=None):
GrammarElement.__init__(self, parser, codename)
self.value = value
- self.first_set = [self]
- # self.first_set = {self: 1}
+ self.first_set = {self: None}
def match(self, source, builder, level=0):
"""Matches a token.
@@ -675,13 +700,7 @@
return "<%s>=='%s'" % (name, self.value)
def match_token(self, builder, other):
- """convenience '==' implementation, this is *not* a *real* equality test
- a Token instance can be compared to:
- - another Token instance in which case all fields (name and value)
- must be equal
- - a tuple, such as those yielded by the Python lexer, in which case
- the comparison algorithm is similar to the one in match()
- """
+ # Historical stuff. Might be useful for debugging.
if not isinstance(other, Token):
raise RuntimeError("Unexpected token type")
if other is self.parser.EmptyToken:
@@ -689,8 +708,15 @@
res = other.isKeyword and other.codename == self.codename and self.value in [None, other.value]
return res
- def __eq__(self, other):
+ #def __eq__(self, other):
+ # XXX disabled to avoid strange differences between Python and RPython.
+ # XXX (moreover, only implementing __eq__ without __ne__ and __hash__
+ # XXX is a bit fragile)
+ # return self.codename == other.codename and self.value == other.value
+
+ def eq(self, other):
return self.codename == other.codename and self.value == other.value
+ # XXX probably also "and self.isKeyword == other.isKeyword"
def calc_first_set(self):
"""computes the list of possible next tokens
@@ -801,6 +827,8 @@
for r in rules:
assert len(r.first_set) > 0, "Error: Empty firstset for %s" % r
r.reorder_rule()
+ for r in rules:
+ r.optimize_first_set()
def build_alternative( self, name_id, args ):
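
A quick illustration of the fragility that the XXX comment above warns
about when __eq__ is implemented without __ne__ and __hash__ (Python 2
semantics, as used by this code; T is a hypothetical example class):

    class T(object):
        def __init__(self, v):
            self.v = v
        def __eq__(self, other):
            return self.v == other.v

    a, b = T(1), T(1)
    assert a == b       # __eq__ says they are equal...
    assert a != b       # ...but Python 2 does not derive != from ==,
                        # so != falls back to default comparison and
                        # also succeeds for two distinct objects
    d = {a: 'x'}
    assert b not in d   # hash() still defaults to id(), so objects
                        # that compare equal land in different buckets

Renaming __eq__ to an explicit eq() method, as this patch does,
sidesteps all of that and keeps tokens usable as dict keys with plain
identity semantics, which is exactly what the new dict-based first_set
relies on.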
Modified: pypy/dist/pypy/interpreter/pyparser/test/test_lookahead.py
==============================================================================
--- pypy/dist/pypy/interpreter/pyparser/test/test_lookahead.py (original)
+++ pypy/dist/pypy/interpreter/pyparser/test/test_lookahead.py Fri May 9 17:38:28 2008
@@ -12,59 +12,62 @@
self.parser.build_first_sets()
def test_basic_token(self):
- assert self.tok1.first_set == [self.tok1]
+ assert self.tok1.first_set == {self.tok1: None}
def test_basic_alternative(self):
alt = self.parser.Alternative_n("a1t", self.tokens)
self.parser.build_first_sets()
- assert alt.first_set == self.tokens
+ assert alt.first_set == dict.fromkeys(self.tokens)
def test_basic_sequence(self):
seq = self.parser.Sequence_n("seq", self.tokens)
self.parser.build_first_sets()
- assert seq.first_set == [self.tokens[0]]
+ assert seq.first_set == {self.tokens[0]: None}
def test_basic_kleenstar(self):
tok1, tok2, tok3 = self.tokens
kstar1 = self.parser.KleeneStar_n("k", 1, 3, tok1)
kstar2 = self.parser.KleeneStar_n("k2", 0, 3, tok1)
self.parser.build_first_sets()
- assert kstar1.first_set == [tok1]
- assert kstar2.first_set == [tok1, self.parser.EmptyToken]
+ assert kstar1.first_set == {tok1: None}
+ assert kstar2.first_set == {tok1: None,
+ self.parser.EmptyToken: None}
def test_maybe_empty_sequence(self):
"""S -> tok1{0,2} tok2{0,2}
- ==> S.first_set = [tok1, tok2, EmptyToken]
+ ==> S.first_set = {tok1, tok2, EmptyToken}
"""
tok1, tok2, tok3 = self.tokens
k1 = self.parser.KleeneStar_n( "k1", 0, 2, tok1)
k2 = self.parser.KleeneStar_n("k2", 0, 2, tok2)
seq = self.parser.Sequence_n( "seq", [k1, k2])
self.parser.build_first_sets()
- assert seq.first_set == [tok1, tok2, self.parser.EmptyToken]
+ assert seq.first_set == {tok1: None,
+ tok2: None,
+ self.parser.EmptyToken: None}
def test_not_empty_sequence(self):
"""S -> tok1{0,2} tok2{1,2}
- ==> S.first_set = [tok1, tok2]
+ ==> S.first_set = {tok1, tok2}
"""
tok1, tok2, tok3 = self.tokens
k1 = self.parser.KleeneStar_n("k1", 0, 2, tok1)
k2 = self.parser.KleeneStar_n("k2", 1, 2, tok2)
seq = self.parser.Sequence_n("seq", [k1, k2])
self.parser.build_first_sets()
- assert seq.first_set == [tok1, tok2]
+ assert seq.first_set == {tok1: None, tok2: None}
def test_token_comparison(self):
tok1 = self.parser.Token_n( "tok1", "foo" )
tok1b = self.parser.Token_n( "tok1", "foo" )
tok2 = self.parser.Token_n( "tok2", "foo" )
tok3 = self.parser.Token_n( "tok2", None )
- assert tok1 == tok1b
- assert tok1 != tok2
- assert tok2 != tok3
+ assert tok1.eq(tok1b)
+ assert not tok1.eq(tok2)
+ assert not tok2.eq(tok3)
@@ -86,7 +89,10 @@
p = self.parser
LOW = p.tokens['LOW']
CAP = p.tokens['CAP']
- for s in [Token(p, LOW, 'low'), p.EmptyToken, Token(p, CAP, 'cap')]:
- assert s in self.A.first_set
- assert s in self.B.first_set
- assert s in self.C.first_set
+ assert self.A.emptytoken_in_first_set
+ assert self.B.emptytoken_in_first_set
+ assert self.C.emptytoken_in_first_set
+ for s in [Token(p, LOW, 'low'), Token(p, CAP, 'cap')]:
+ assert self.A.match_first_set(s)
+ assert self.B.match_first_set(s)
+ assert self.C.match_first_set(s)
Modified: pypy/dist/pypy/interpreter/pyparser/test/test_pytokenizer.py
==============================================================================
--- pypy/dist/pypy/interpreter/pyparser/test/test_pytokenizer.py (original)
+++ pypy/dist/pypy/interpreter/pyparser/test/test_pytokenizer.py Fri May 9 17:38:28 2008
@@ -53,29 +53,38 @@
'j', '0xg', '0xj', '0xJ',
]
+def listeq(lst1, lst2):
+ if len(lst1) != len(lst2):
+ return False
+ for tk1, tk2 in zip(lst1, lst2):
+ if not tk1.eq(tk2):
+ return False
+ return True
+
def test_several_lines_list():
"""tests list definition on several lines"""
s = """['a'
]"""
tokens = parse_source(s)
- assert tokens[:4] == [Token(P, LSQB, None), Token(P, STRING, "'a'"),
- Token(P, RSQB, None), Token(P, NEWLINE, '')]
+ assert listeq(tokens[:4], [Token(P, LSQB, None), Token(P, STRING, "'a'"),
+ Token(P, RSQB, None), Token(P, NEWLINE, '')])
def test_numbers():
"""make sure all kind of numbers are correctly parsed"""
for number in NUMBERS:
- assert parse_source(number)[0] == Token(P, NUMBER, number)
+ assert parse_source(number)[0].eq(Token(P, NUMBER, number))
neg = '-%s' % number
- assert parse_source(neg)[:2] == [Token(P, MINUS, None),
- Token(P, NUMBER, number)]
+ assert listeq(parse_source(neg)[:2], [Token(P, MINUS, None),
+ Token(P, NUMBER, number)])
for number in BAD_NUMBERS:
- assert parse_source(number)[0] != Token(P, NUMBER, number)
+ assert not parse_source(number)[0].eq(Token(P, NUMBER, number))
def test_hex_number():
"""basic pasrse"""
tokens = parse_source("a = 0x12L")
- assert tokens[:4] == [Token(P, NAME, 'a'), Token(P, EQUAL, None),
- Token(P, NUMBER, '0x12L'), Token(P, NEWLINE, '')]
+ assert listeq(tokens[:4], [Token(P, NAME, 'a'), Token(P, EQUAL, None),
+ Token(P, NUMBER, '0x12L'),
+ Token(P, NEWLINE, '')])
def test_punct():
"""make sure each punctuation is correctly parsed"""