[pypy-commit] pypy default: merge pyparser-improvements-3
cfbolz
pypy.commits at gmail.com
Wed Jun 6 09:13:32 EDT 2018
Author: Carl Friedrich Bolz-Tereick <cfbolz at gmx.de>
Branch:
Changeset: r94730:e85e93d7927e
Date: 2018-06-06 15:11 +0200
http://bitbucket.org/pypy/pypy/changeset/e85e93d7927e/
Log: merge pyparser-improvements-3
some small refactorings in interpreter/pyparser and module/parser
diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst
--- a/pypy/doc/whatsnew-head.rst
+++ b/pypy/doc/whatsnew-head.rst
@@ -27,3 +27,8 @@
The reverse-debugger branch has been merged. For more information, see
https://bitbucket.org/pypy/revdb
+
+
+.. branch: pyparser-improvements-3
+
+Small refactorings in the Python parser.
diff --git a/pypy/interpreter/pyparser/future.py b/pypy/interpreter/pyparser/future.py
--- a/pypy/interpreter/pyparser/future.py
+++ b/pypy/interpreter/pyparser/future.py
@@ -43,7 +43,7 @@
self.tok = self.tokens[index]
def skip(self, n):
- if self.tok[0] == n:
+ if self.tok.token_type == n:
self.next()
return True
else:
@@ -51,7 +51,7 @@
def skip_name(self, name):
from pypy.interpreter.pyparser import pygram
- if self.tok[0] == pygram.tokens.NAME and self.tok[1] == name:
+ if self.tok.token_type == pygram.tokens.NAME and self.tok.value == name:
self.next()
return True
else:
@@ -59,8 +59,8 @@
def next_feature_name(self):
from pypy.interpreter.pyparser import pygram
- if self.tok[0] == pygram.tokens.NAME:
- name = self.tok[1]
+ if self.tok.token_type == pygram.tokens.NAME:
+ name = self.tok.value
self.next()
if self.skip_name("as"):
self.skip(pygram.tokens.NAME)
@@ -101,7 +101,7 @@
# somewhere inside the last __future__ import statement
# (at the start would be fine too, but it's easier to grab a
# random position inside)
- last_position = (it.tok[2], it.tok[3])
+ last_position = (it.tok.lineno, it.tok.column)
result |= future_flags.get_compiler_feature(it.next_feature_name())
while it.skip(pygram.tokens.COMMA):
result |= future_flags.get_compiler_feature(it.next_feature_name())
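The future.py hunks above swap positional tuple indexing (tok[0] .. tok[3]) for
named attribute access on the Token class that parser.py introduces below. A
minimal sketch of the correspondence; the numeric id 1 merely stands in for
pygram.tokens.NAME:

    from pypy.interpreter.pyparser.parser import Token

    tok = Token(1, "print", 1, 0, "print\n")
    tok.token_type   # was tok[0]
    tok.value        # was tok[1]
    tok.lineno       # was tok[2]
    tok.column       # was tok[3]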
diff --git a/pypy/interpreter/pyparser/parser.py b/pypy/interpreter/pyparser/parser.py
--- a/pypy/interpreter/pyparser/parser.py
+++ b/pypy/interpreter/pyparser/parser.py
@@ -34,6 +34,18 @@
new.token_ids = self.token_ids
return new
+
+ def classify(self, token):
+ """Find the label for a token."""
+ if token.token_type == self.KEYWORD_TOKEN:
+ label_index = self.keyword_ids.get(token.value, -1)
+ if label_index != -1:
+ return label_index
+ label_index = self.token_ids.get(token.token_type, -1)
+ if label_index == -1:
+ raise ParseError("invalid token", token)
+ return label_index
+
def _freeze_(self):
# Remove some attributes not used in parsing.
try:
@@ -66,6 +78,33 @@
b[pos] |= bit
return str(b)
+
+class Token(object):
+ def __init__(self, token_type, value, lineno, column, line):
+ self.token_type = token_type
+ self.value = value
+ self.lineno = lineno
+ # 0-based offset
+ self.column = column
+ self.line = line
+
+ def __repr__(self):
+ return "Token(%s, %s)" % (self.token_type, self.value)
+
+ def __eq__(self, other):
+ # for tests
+ return (
+ self.token_type == other.token_type and
+ self.value == other.value and
+ self.lineno == other.lineno and
+ self.column == other.column and
+ self.line == other.line
+ )
+
+ def __ne__(self, other):
+ return not self == other
+
+
class Node(object):
__slots__ = ("type", )
@@ -106,6 +145,11 @@
self.lineno = lineno
self.column = column
+ @staticmethod
+ def fromtoken(token):
+ return Terminal(
+ token.token_type, token.value, token.lineno, token.column)
+
def __repr__(self):
return "Terminal(type=%s, value=%r)" % (self.type, self.value)
@@ -194,20 +238,14 @@
class ParseError(Exception):
- def __init__(self, msg, token_type, value, lineno, column, line,
- expected=-1, expected_str=None):
+ def __init__(self, msg, token, expected=-1, expected_str=None):
self.msg = msg
- self.token_type = token_type
- self.value = value
- self.lineno = lineno
- # this is a 0-based index
- self.column = column
- self.line = line
+ self.token = token
self.expected = expected
self.expected_str = expected_str
def __str__(self):
- return "ParserError(%s, %r)" % (self.token_type, self.value)
+ return "ParserError(%s)" % (self.token, )
class StackEntry(object):
@@ -250,8 +288,8 @@
self.root = None
self.stack = StackEntry(None, self.grammar.dfas[start - 256], 0)
- def add_token(self, token_type, value, lineno, column, line):
- label_index = self.classify(token_type, value, lineno, column, line)
+ def add_token(self, token):
+ label_index = self.grammar.classify(token)
sym_id = 0 # for the annotator
while True:
dfa = self.stack.dfa
@@ -262,7 +300,7 @@
sym_id = self.grammar.labels[i]
if label_index == i:
# We matched a non-terminal.
- self.shift(next_state, token_type, value, lineno, column)
+ self.shift(next_state, token)
state = states[next_state]
# While the only possible action is to accept, pop nodes off
# the stack.
@@ -279,8 +317,7 @@
sub_node_dfa = self.grammar.dfas[sym_id - 256]
# Check if this token can start a child node.
if sub_node_dfa.could_match_token(label_index):
- self.push(sub_node_dfa, next_state, sym_id, lineno,
- column)
+ self.push(sub_node_dfa, next_state, sym_id)
break
else:
# We failed to find any arcs to another state, so unless this
@@ -288,8 +325,7 @@
if is_accepting:
self.pop()
if self.stack is None:
- raise ParseError("too much input", token_type, value,
- lineno, column, line)
+ raise ParseError("too much input", token)
else:
# If only one possible input would satisfy, attach it to the
# error.
@@ -300,28 +336,16 @@
else:
expected = -1
expected_str = None
- raise ParseError("bad input", token_type, value, lineno,
- column, line, expected, expected_str)
+ raise ParseError("bad input", token, expected, expected_str)
- def classify(self, token_type, value, lineno, column, line):
- """Find the label for a token."""
- if token_type == self.grammar.KEYWORD_TOKEN:
- label_index = self.grammar.keyword_ids.get(value, -1)
- if label_index != -1:
- return label_index
- label_index = self.grammar.token_ids.get(token_type, -1)
- if label_index == -1:
- raise ParseError("invalid token", token_type, value, lineno, column,
- line)
- return label_index
- def shift(self, next_state, token_type, value, lineno, column):
+ def shift(self, next_state, token):
"""Shift a non-terminal and prepare for the next state."""
- new_node = Terminal(token_type, value, lineno, column)
+ new_node = Terminal.fromtoken(token)
self.stack.node_append_child(new_node)
self.stack.state = next_state
- def push(self, next_dfa, next_state, node_type, lineno, column):
+ def push(self, next_dfa, next_state, node_type):
"""Push a terminal and adjust the current state."""
self.stack.state = next_state
self.stack = self.stack.push(next_dfa, 0)
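Two things move in parser.py: classify() migrates from the Parser onto the
Grammar (it only ever consulted grammar tables), and the five loose token
arguments collapse into a single Token object, with Terminal.fromtoken()
building the parse-tree leaf. An illustrative sketch, assuming `grammar` is a
fully initialized Grammar instance and `tokens` is pygram.tokens:

    tok = Token(tokens.NAME, "x", 1, 0, "x = 1\n")
    label_index = grammar.classify(tok)  # keyword_ids/token_ids lookup;
                                         # raises ParseError("invalid token", tok)
    leaf = Terminal.fromtoken(tok)       # Terminal(type=NAME, value='x')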
diff --git a/pypy/interpreter/pyparser/pyparse.py b/pypy/interpreter/pyparser/pyparse.py
--- a/pypy/interpreter/pyparser/pyparse.py
+++ b/pypy/interpreter/pyparser/pyparse.py
@@ -147,37 +147,37 @@
flags &= ~consts.PyCF_DONT_IMPLY_DEDENT
self.prepare(_targets[compile_info.mode])
- tp = 0
try:
try:
# Note: we no longer pass the CO_FUTURE_* to the tokenizer,
# which is expected to work independently of them. It's
# certainly the case for all futures in Python <= 2.7.
tokens = pytokenizer.generate_tokens(source_lines, flags)
-
- newflags, last_future_import = (
- future.add_future_flags(self.future_flags, tokens))
- compile_info.last_future_import = last_future_import
- compile_info.flags |= newflags
-
- self.grammar = pygram.choose_grammar(
- print_function=compile_info.flags & consts.CO_FUTURE_PRINT_FUNCTION,
- revdb=self.space.config.translation.reverse_debugger)
-
- for tp, value, lineno, column, line in tokens:
- if self.add_token(tp, value, lineno, column, line):
- break
except error.TokenError as e:
e.filename = compile_info.filename
raise
except error.TokenIndentationError as e:
e.filename = compile_info.filename
raise
+
+ newflags, last_future_import = (
+ future.add_future_flags(self.future_flags, tokens))
+ compile_info.last_future_import = last_future_import
+ compile_info.flags |= newflags
+
+ self.grammar = pygram.choose_grammar(
+ print_function=compile_info.flags & consts.CO_FUTURE_PRINT_FUNCTION,
+ revdb=self.space.config.translation.reverse_debugger)
+
+ try:
+ for token in tokens:
+ if self.add_token(token):
+ break
except parser.ParseError as e:
# Catch parse errors, pretty them up and reraise them as a
# SyntaxError.
new_err = error.IndentationError
- if tp == pygram.tokens.INDENT:
+ if token.token_type == pygram.tokens.INDENT:
msg = "unexpected indent"
elif e.expected == pygram.tokens.INDENT:
msg = "expected an indented block"
@@ -189,7 +189,7 @@
# parser.ParseError(...).column is 0-based, but the offsets in the
# exceptions in the error module are 1-based, hence the '+ 1'
- raise new_err(msg, e.lineno, e.column + 1, e.line,
+ raise new_err(msg, e.token.lineno, e.token.column + 1, e.token.line,
compile_info.filename)
else:
tree = self.root
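The pyparse.py restructuring narrows each try block to the phase whose errors
it translates: TokenError/TokenIndentationError handling now wraps only
tokenization, and ParseError handling only the token loop, with future-import
scanning and grammar selection in between. The `tp = 0` sentinel becomes
unnecessary because the for loop binds `token` before add_token() can raise,
so the except clause may read token.token_type directly. A condensed
paraphrase of the resulting control flow (not the literal source):

    try:
        tokens = pytokenizer.generate_tokens(source_lines, flags)
    except error.TokenError as e:       # tokenizer errors only
        e.filename = compile_info.filename
        raise
    # scan __future__ imports and pick the grammar: tokenizing is done,
    # parsing has not started yet
    try:
        for token in tokens:            # `token` is bound here, so the
            if self.add_token(token):   # except clause below can safely
                break                   # inspect token.token_type
    except parser.ParseError as e:
        ...                             # positions now come from e.token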
diff --git a/pypy/interpreter/pyparser/pytokenize.py b/pypy/interpreter/pyparser/pytokenize.py
--- a/pypy/interpreter/pyparser/pytokenize.py
+++ b/pypy/interpreter/pyparser/pytokenize.py
@@ -1,9 +1,6 @@
# ______________________________________________________________________
"""Module pytokenize
-THIS FILE WAS COPIED FROM pypy/module/parser/pytokenize.py AND ADAPTED
-TO BE ANNOTABLE (Mainly made lists homogeneous)
-
This is a modified version of Ka-Ping Yee's tokenize module found in the
Python standard library.
@@ -12,7 +9,6 @@
expressions have been replaced with hand built DFA's using the
basil.util.automata module.
-$Id: pytokenize.py,v 1.3 2003/10/03 16:31:53 jriehl Exp $
"""
# ______________________________________________________________________
@@ -65,22 +61,3 @@
single_quoted[t] = t
tabsize = 8
-
-# PYPY MODIFICATION: removed TokenError class as it's not needed here
-
-# PYPY MODIFICATION: removed StopTokenizing class as it's not needed here
-
-# PYPY MODIFICATION: removed printtoken() as it's not needed here
-
-# PYPY MODIFICATION: removed tokenize() as it's not needed here
-
-# PYPY MODIFICATION: removed tokenize_loop() as it's not needed here
-
-# PYPY MODIFICATION: removed generate_tokens() as it was copied / modified
-# in pythonlexer.py
-
-# PYPY MODIFICATION: removed main() as it's not needed here
-
-# ______________________________________________________________________
-# End of pytokenize.py
-
diff --git a/pypy/interpreter/pyparser/pytokenizer.py b/pypy/interpreter/pyparser/pytokenizer.py
--- a/pypy/interpreter/pyparser/pytokenizer.py
+++ b/pypy/interpreter/pyparser/pytokenizer.py
@@ -1,4 +1,5 @@
from pypy.interpreter.pyparser import automata
+from pypy.interpreter.pyparser.parser import Token
from pypy.interpreter.pyparser.pygram import tokens
from pypy.interpreter.pyparser.pytoken import python_opmap
from pypy.interpreter.pyparser.error import TokenError, TokenIndentationError
@@ -103,7 +104,7 @@
endmatch = endDFA.recognize(line)
if endmatch >= 0:
pos = end = endmatch
- tok = (tokens.STRING, contstr + line[:end], strstart[0],
+ tok = Token(tokens.STRING, contstr + line[:end], strstart[0],
strstart[1], line)
token_list.append(tok)
last_comment = ''
@@ -111,7 +112,7 @@
contline = None
elif (needcont and not line.endswith('\\\n') and
not line.endswith('\\\r\n')):
- tok = (tokens.ERRORTOKEN, contstr + line, strstart[0],
+ tok = Token(tokens.ERRORTOKEN, contstr + line, strstart[0],
strstart[1], line)
token_list.append(tok)
last_comment = ''
@@ -140,11 +141,11 @@
if column > indents[-1]: # count indents or dedents
indents.append(column)
- token_list.append((tokens.INDENT, line[:pos], lnum, 0, line))
+ token_list.append(Token(tokens.INDENT, line[:pos], lnum, 0, line))
last_comment = ''
while column < indents[-1]:
indents.pop()
- token_list.append((tokens.DEDENT, '', lnum, pos, line))
+ token_list.append(Token(tokens.DEDENT, '', lnum, pos, line))
last_comment = ''
if column != indents[-1]:
err = "unindent does not match any outer indentation level"
@@ -177,11 +178,11 @@
token, initial = line[start:end], line[start]
if initial in numchars or \
(initial == '.' and token != '.'): # ordinary number
- token_list.append((tokens.NUMBER, token, lnum, start, line))
+ token_list.append(Token(tokens.NUMBER, token, lnum, start, line))
last_comment = ''
elif initial in '\r\n':
if not parenstack:
- tok = (tokens.NEWLINE, last_comment, lnum, start, line)
+ tok = Token(tokens.NEWLINE, last_comment, lnum, start, line)
token_list.append(tok)
last_comment = ''
elif initial == '#':
@@ -193,7 +194,7 @@
if endmatch >= 0: # all on one line
pos = endmatch
token = line[start:pos]
- tok = (tokens.STRING, token, lnum, start, line)
+ tok = Token(tokens.STRING, token, lnum, start, line)
token_list.append(tok)
last_comment = ''
else:
@@ -212,16 +213,16 @@
contline = line
break
else: # ordinary string
- tok = (tokens.STRING, token, lnum, start, line)
+ tok = Token(tokens.STRING, token, lnum, start, line)
token_list.append(tok)
last_comment = ''
elif initial in namechars: # ordinary name
- token_list.append((tokens.NAME, token, lnum, start, line))
+ token_list.append(Token(tokens.NAME, token, lnum, start, line))
last_comment = ''
elif initial == '\\': # continued stmt
continued = 1
elif initial == '$':
- token_list.append((tokens.REVDBMETAVAR, token,
+ token_list.append(Token(tokens.REVDBMETAVAR, token,
lnum, start, line))
last_comment = ''
else:
@@ -246,7 +247,7 @@
punct = python_opmap[token]
else:
punct = tokens.OP
- token_list.append((punct, token, lnum, start, line))
+ token_list.append(Token(punct, token, lnum, start, line))
last_comment = ''
else:
start = whiteSpaceDFA.recognize(line, pos)
@@ -255,22 +256,22 @@
if start<max and line[start] in single_quoted:
raise TokenError("end of line (EOL) while scanning string literal",
line, lnum, start+1, token_list)
- tok = (tokens.ERRORTOKEN, line[pos], lnum, pos, line)
+ tok = Token(tokens.ERRORTOKEN, line[pos], lnum, pos, line)
token_list.append(tok)
last_comment = ''
pos = pos + 1
lnum -= 1
if not (flags & consts.PyCF_DONT_IMPLY_DEDENT):
- if token_list and token_list[-1][0] != tokens.NEWLINE:
- tok = (tokens.NEWLINE, '', lnum, 0, '\n')
+ if token_list and token_list[-1].token_type != tokens.NEWLINE:
+ tok = Token(tokens.NEWLINE, '', lnum, 0, '\n')
token_list.append(tok)
for indent in indents[1:]: # pop remaining indent levels
- token_list.append((tokens.DEDENT, '', lnum, pos, line))
- tok = (tokens.NEWLINE, '', lnum, 0, '\n')
+ token_list.append(Token(tokens.DEDENT, '', lnum, pos, line))
+ tok = Token(tokens.NEWLINE, '', lnum, 0, '\n')
token_list.append(tok)
- token_list.append((tokens.ENDMARKER, '', lnum, pos, line))
+ token_list.append(Token(tokens.ENDMARKER, '', lnum, pos, line))
return token_list
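Every tuple append in generate_tokens() changes mechanically, since the Token
constructor keeps the old field order (token_type, value, lineno, column,
line); only the final NEWLINE check additionally switches to attribute
access. The pattern, side by side:

    token_list.append((tokens.NAME, token, lnum, start, line))       # before
    token_list.append(Token(tokens.NAME, token, lnum, start, line))  # after

    token_list[-1][0] != tokens.NEWLINE            # before
    token_list[-1].token_type != tokens.NEWLINE    # after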
diff --git a/pypy/interpreter/pyparser/test/test_automata.py b/pypy/interpreter/pyparser/test/test_automata.py
--- a/pypy/interpreter/pyparser/test/test_automata.py
+++ b/pypy/interpreter/pyparser/test/test_automata.py
@@ -1,4 +1,4 @@
-from pypy.interpreter.pyparser.automata import DFA, DEFAULT
+from pypy.interpreter.pyparser.automata import DFA, NonGreedyDFA, DEFAULT
def test_states():
d = DFA([{"\x00": 1}, {"\x01": 0}], [False, True])
@@ -10,3 +10,20 @@
assert d.states == "\x01\x00"
assert d.defaults == "\xff\x00"
assert d.max_char == 1
+
+def test_recognize():
+ d = DFA([{"a": 1}, {"b": 0}], [False, True])
+ assert d.recognize("ababab") == 5
+ assert d.recognize("c") == -1
+
+ d = DFA([{"a": 1}, {DEFAULT: 0}], [False, True])
+ assert d.recognize("a,a?ab") == 5
+ assert d.recognize("c") == -1
+
+ d = NonGreedyDFA([{"a": 1}, {"b": 0}], [False, True])
+ assert d.recognize("ababab") == 1
+ assert d.recognize("c") == -1
+
+ d = NonGreedyDFA([{"a": 1}, {DEFAULT: 0}], [False, True])
+ assert d.recognize("a,a?ab") == 1
+ assert d.recognize("c") == -1
diff --git a/pypy/interpreter/pyparser/test/test_parser.py b/pypy/interpreter/pyparser/test/test_parser.py
--- a/pypy/interpreter/pyparser/test/test_parser.py
+++ b/pypy/interpreter/pyparser/test/test_parser.py
@@ -20,7 +20,7 @@
rl = StringIO.StringIO(input + "\n").readline
gen = tokenize.generate_tokens(rl)
for tp, value, begin, end, line in gen:
- if self.add_token(tp, value, begin[0], begin[1], line):
+ if self.add_token(parser.Token(tp, value, begin[0], begin[1], line)):
py.test.raises(StopIteration, gen.next)
return self.root
diff --git a/pypy/interpreter/pyparser/test/test_pytokenizer.py b/pypy/interpreter/pyparser/test/test_pytokenizer.py
--- a/pypy/interpreter/pyparser/test/test_pytokenizer.py
+++ b/pypy/interpreter/pyparser/test/test_pytokenizer.py
@@ -1,5 +1,6 @@
import pytest
from pypy.interpreter.pyparser import pytokenizer
+from pypy.interpreter.pyparser.parser import Token
from pypy.interpreter.pyparser.pygram import tokens
from pypy.interpreter.pyparser.error import TokenError
@@ -22,12 +23,12 @@
line = "a+1"
tks = tokenize(line)
assert tks == [
- (tokens.NAME, 'a', 1, 0, line),
- (tokens.PLUS, '+', 1, 1, line),
- (tokens.NUMBER, '1', 1, 2, line),
- (tokens.NEWLINE, '', 2, 0, '\n'),
- (tokens.NEWLINE, '', 2, 0, '\n'),
- (tokens.ENDMARKER, '', 2, 0, ''),
+ Token(tokens.NAME, 'a', 1, 0, line),
+ Token(tokens.PLUS, '+', 1, 1, line),
+ Token(tokens.NUMBER, '1', 1, 2, line),
+ Token(tokens.NEWLINE, '', 2, 0, '\n'),
+ Token(tokens.NEWLINE, '', 2, 0, '\n'),
+ Token(tokens.ENDMARKER, '', 2, 0, ''),
]
def test_error_parenthesis(self):
diff --git a/pypy/module/parser/pyparser.py b/pypy/module/parser/pyparser.py
--- a/pypy/module/parser/pyparser.py
+++ b/pypy/module/parser/pyparser.py
@@ -133,10 +133,9 @@
space.newtext(message))
-def get_node_type(space, w_tuple):
+def get_node_type(space, tup_w, w_tuple):
try:
- w_type = space.getitem(w_tuple, space.newint(0))
- return space.int_w(w_type)
+ return space.int_w(tup_w[0])
except OperationError:
raise parser_error(space, w_tuple, "Illegal component tuple.")
@@ -145,44 +144,47 @@
self.lineno = 0
def build_node_tree(space, w_tuple):
- type = get_node_type(space, w_tuple)
+ tup_w = space.unpackiterable(w_tuple)
+ if len(tup_w) == 0:
+ raise parser_error(space, w_tuple, "tuple too short")
+
+ type = get_node_type(space, tup_w, w_tuple)
node_state = NodeState()
if 0 <= type < 256:
# The tuple is simple, but it doesn't start with a start symbol.
# Raise an exception now and be done with it.
raise parser_error(space, w_tuple,
"Illegal syntax-tree; cannot start with terminal symbol.")
+ return build_node_children(space, type, tup_w, node_state)
+
+def build_node_children(space, type, tup_w, node_state):
node = pyparse.parser.Nonterminal(type)
- build_node_children(space, w_tuple, node, node_state)
- return node
-
-def build_node_children(space, w_tuple, node, node_state):
- for w_elem in space.unpackiterable(w_tuple)[1:]:
- type = get_node_type(space, w_elem)
+ for i in range(1, len(tup_w)):
+ w_elem = tup_w[i]
+ subtup_w = space.unpackiterable(w_elem)
+ type = get_node_type(space, subtup_w, w_elem)
if type < 256: # Terminal node
- length = space.len_w(w_elem)
+ length = len(subtup_w)
if length == 2:
- _, w_obj = space.unpackiterable(w_elem, 2)
+ _, w_obj = subtup_w
elif length == 3:
- _, w_obj, w_lineno = space.unpackiterable(w_elem, 3)
+ _, w_obj, w_lineno = subtup_w
else:
raise parse_error(
space, "terminal nodes must have 2 or 3 entries")
strn = space.text_w(w_obj)
child = pyparse.parser.Terminal(type, strn, node_state.lineno, 0)
else:
- child = pyparse.parser.Nonterminal(type)
+ child = build_node_children(space, type, subtup_w, node_state)
node.append_child(child)
- if type >= 256: # Nonterminal node
- build_node_children(space, w_elem, child, node_state)
- elif type == pyparse.pygram.tokens.NEWLINE:
+ if type == pyparse.pygram.tokens.NEWLINE:
node_state.lineno += 1
+ return node
-def validate_node(space, tree):
+def validate_node(space, tree, parser):
assert tree.type >= 256
type = tree.type - 256
- parser = pyparse.PythonParser(space)
if type >= len(parser.grammar.dfas):
raise parse_error(space, "Unrecognized node type %d." % type)
dfa = parser.grammar.dfas[type]
@@ -195,7 +197,7 @@
if label == ch.type:
# The child is acceptable; validate it recursively
if ch.type >= 256:
- validate_node(space, ch)
+ validate_node(space, ch, parser)
# Update the state, and move on to the next child.
arcs, is_accepting = dfa.states[next_state]
break
@@ -209,5 +211,6 @@
def tuple2st(space, w_sequence):
# Convert the tree to the internal form before checking it
tree = build_node_tree(space, w_sequence)
- validate_node(space, tree)
+ parser = pyparse.PythonParser(space)
+ validate_node(space, tree, parser)
return W_STType(tree, 'eval')
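The module/parser changes bundle two independent cleanups:
build_node_tree()/build_node_children() now unpack each wrapped tuple once up
front and pass the resulting list along, where the old code mixed
space.getitem, space.len_w and repeated space.unpackiterable calls per
element; and validate_node() takes the parser as an argument instead of
constructing one per recursive call. The second point in miniature:

    # before: one PythonParser built for every nonterminal visited
    def validate_node(space, tree):
        parser = pyparse.PythonParser(space)   # rebuilt on each recursion
        ...

    # after: built once by the caller, threaded down the recursion
    parser = pyparse.PythonParser(space)
    validate_node(space, tree, parser)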