[pypy-commit] pypy pyparser-improvements-3: use Token class in pytokenizer too

cfbolz pypy.commits at gmail.com
Sat Apr 14 05:21:22 EDT 2018


Author: Carl Friedrich Bolz-Tereick <cfbolz at gmx.de>
Branch: pyparser-improvements-3
Changeset: r94317:d965ef8c054e
Date: 2018-04-14 11:06 +0200
http://bitbucket.org/pypy/pypy/changeset/d965ef8c054e/

Log:	use Token class in pytokenizer too
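
For readers of the archive, a minimal sketch of the token shape this changeset moves to; the names and the Token constructor signature are taken from the diff below, and running it assumes a PyPy source checkout on this branch:

    from pypy.interpreter.pyparser.parser import Token
    from pypy.interpreter.pyparser.pygram import tokens

    # Before this change: the tokenizer emitted plain 5-tuples and callers
    # (e.g. future.py) indexed into them positionally.
    old_tok = (tokens.NAME, 'a', 1, 0, "a+1")
    assert old_tok[0] == tokens.NAME and old_tok[1] == 'a'

    # After this change: the tokenizer constructs parser.Token objects, so
    # the same fields are read through named attributes instead of indices.
    new_tok = Token(tokens.NAME, 'a', 1, 0, "a+1")
    assert new_tok.token_type == tokens.NAME and new_tok.value == 'a'
    assert (new_tok.lineno, new_tok.column, new_tok.line) == (1, 0, "a+1")

The __eq__/__ne__ methods added to Token below exist mainly so the tokenizer tests can compare whole token lists against expected Token instances directly.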

diff --git a/pypy/interpreter/pyparser/future.py b/pypy/interpreter/pyparser/future.py
--- a/pypy/interpreter/pyparser/future.py
+++ b/pypy/interpreter/pyparser/future.py
@@ -43,7 +43,7 @@
         self.tok = self.tokens[index]
 
     def skip(self, n):
-        if self.tok[0] == n:
+        if self.tok.token_type == n:
             self.next()
             return True
         else:
@@ -51,7 +51,7 @@
 
     def skip_name(self, name):
         from pypy.interpreter.pyparser import pygram
-        if self.tok[0] == pygram.tokens.NAME and self.tok[1] == name:
+        if self.tok.token_type == pygram.tokens.NAME and self.tok.value == name:
             self.next()
             return True
         else:
@@ -59,8 +59,8 @@
 
     def next_feature_name(self):
         from pypy.interpreter.pyparser import pygram
-        if self.tok[0] == pygram.tokens.NAME:
-            name = self.tok[1]
+        if self.tok.token_type == pygram.tokens.NAME:
+            name = self.tok.value
             self.next()
             if self.skip_name("as"):
                 self.skip(pygram.tokens.NAME)
@@ -101,7 +101,7 @@
         # somewhere inside the last __future__ import statement
         # (at the start would be fine too, but it's easier to grab a
         # random position inside)
-        last_position = (it.tok[2], it.tok[3])
+        last_position = (it.tok.lineno, it.tok.column)
         result |= future_flags.get_compiler_feature(it.next_feature_name())
         while it.skip(pygram.tokens.COMMA):
             result |= future_flags.get_compiler_feature(it.next_feature_name())
diff --git a/pypy/interpreter/pyparser/parser.py b/pypy/interpreter/pyparser/parser.py
--- a/pypy/interpreter/pyparser/parser.py
+++ b/pypy/interpreter/pyparser/parser.py
@@ -78,6 +78,19 @@
     def __repr__(self):
         return "Token(%s, %s)" % (self.token_type, self.value)
 
+    def __eq__(self, other):
+        # for tests
+        return (
+            self.token_type == other.token_type and
+            self.value == other.value and
+            self.lineno == other.lineno and
+            self.column == other.column and
+            self.line == other.line
+        )
+
+    def __ne__(self, other):
+        return not self == other
+
 
 class Node(object):
 
diff --git a/pypy/interpreter/pyparser/pyparse.py b/pypy/interpreter/pyparser/pyparse.py
--- a/pypy/interpreter/pyparser/pyparse.py
+++ b/pypy/interpreter/pyparser/pyparse.py
@@ -147,7 +147,6 @@
             flags &= ~consts.PyCF_DONT_IMPLY_DEDENT
 
         self.prepare(_targets[compile_info.mode])
-        tp = 0
         try:
             try:
                 # Note: we no longer pass the CO_FUTURE_* to the tokenizer,
@@ -165,8 +164,8 @@
                 else:
                     self.grammar = pygram.python_grammar
 
-                for tp, value, lineno, column, line in tokens:
-                    if self.add_token(parser.Token(tp, value, lineno, column, line)):
+                for token in tokens:
+                    if self.add_token(token):
                         break
             except error.TokenError as e:
                 e.filename = compile_info.filename
@@ -178,7 +177,7 @@
                 # Catch parse errors, pretty them up and reraise them as a
                 # SyntaxError.
                 new_err = error.IndentationError
-                if tp == pygram.tokens.INDENT:
+                if token.token_type == pygram.tokens.INDENT:
                     msg = "unexpected indent"
                 elif e.expected == pygram.tokens.INDENT:
                     msg = "expected an indented block"
diff --git a/pypy/interpreter/pyparser/pytokenizer.py b/pypy/interpreter/pyparser/pytokenizer.py
--- a/pypy/interpreter/pyparser/pytokenizer.py
+++ b/pypy/interpreter/pyparser/pytokenizer.py
@@ -1,4 +1,5 @@
 from pypy.interpreter.pyparser import automata
+from pypy.interpreter.pyparser.parser import Token
 from pypy.interpreter.pyparser.pygram import tokens
 from pypy.interpreter.pyparser.pytoken import python_opmap
 from pypy.interpreter.pyparser.error import TokenError, TokenIndentationError
@@ -103,7 +104,7 @@
             endmatch = endDFA.recognize(line)
             if endmatch >= 0:
                 pos = end = endmatch
-                tok = (tokens.STRING, contstr + line[:end], strstart[0],
+                tok = Token(tokens.STRING, contstr + line[:end], strstart[0],
                        strstart[1], line)
                 token_list.append(tok)
                 last_comment = ''
@@ -111,7 +112,7 @@
                 contline = None
             elif (needcont and not line.endswith('\\\n') and
                                not line.endswith('\\\r\n')):
-                tok = (tokens.ERRORTOKEN, contstr + line, strstart[0],
+                tok = Token(tokens.ERRORTOKEN, contstr + line, strstart[0],
                        strstart[1], line)
                 token_list.append(tok)
                 last_comment = ''
@@ -140,11 +141,11 @@
 
             if column > indents[-1]:           # count indents or dedents
                 indents.append(column)
-                token_list.append((tokens.INDENT, line[:pos], lnum, 0, line))
+                token_list.append(Token(tokens.INDENT, line[:pos], lnum, 0, line))
                 last_comment = ''
             while column < indents[-1]:
                 indents.pop()
-                token_list.append((tokens.DEDENT, '', lnum, pos, line))
+                token_list.append(Token(tokens.DEDENT, '', lnum, pos, line))
                 last_comment = ''
             if column != indents[-1]:
                 err = "unindent does not match any outer indentation level"
@@ -177,11 +178,11 @@
                 token, initial = line[start:end], line[start]
                 if initial in numchars or \
                    (initial == '.' and token != '.'):      # ordinary number
-                    token_list.append((tokens.NUMBER, token, lnum, start, line))
+                    token_list.append(Token(tokens.NUMBER, token, lnum, start, line))
                     last_comment = ''
                 elif initial in '\r\n':
                     if not parenstack:
-                        tok = (tokens.NEWLINE, last_comment, lnum, start, line)
+                        tok = Token(tokens.NEWLINE, last_comment, lnum, start, line)
                         token_list.append(tok)
                     last_comment = ''
                 elif initial == '#':
@@ -193,7 +194,7 @@
                     if endmatch >= 0:                     # all on one line
                         pos = endmatch
                         token = line[start:pos]
-                        tok = (tokens.STRING, token, lnum, start, line)
+                        tok = Token(tokens.STRING, token, lnum, start, line)
                         token_list.append(tok)
                         last_comment = ''
                     else:
@@ -212,11 +213,11 @@
                         contline = line
                         break
                     else:                                  # ordinary string
-                        tok = (tokens.STRING, token, lnum, start, line)
+                        tok = Token(tokens.STRING, token, lnum, start, line)
                         token_list.append(tok)
                         last_comment = ''
                 elif initial in namechars:                 # ordinary name
-                    token_list.append((tokens.NAME, token, lnum, start, line))
+                    token_list.append(Token(tokens.NAME, token, lnum, start, line))
                     last_comment = ''
                 elif initial == '\\':                      # continued stmt
                     continued = 1
@@ -242,7 +243,7 @@
                         punct = python_opmap[token]
                     else:
                         punct = tokens.OP
-                    token_list.append((punct, token, lnum, start, line))
+                    token_list.append(Token(punct, token, lnum, start, line))
                     last_comment = ''
             else:
                 start = whiteSpaceDFA.recognize(line, pos)
@@ -251,22 +252,22 @@
                 if start<max and line[start] in single_quoted:
                     raise TokenError("end of line (EOL) while scanning string literal",
                              line, lnum, start+1, token_list)
-                tok = (tokens.ERRORTOKEN, line[pos], lnum, pos, line)
+                tok = Token(tokens.ERRORTOKEN, line[pos], lnum, pos, line)
                 token_list.append(tok)
                 last_comment = ''
                 pos = pos + 1
 
     lnum -= 1
     if not (flags & consts.PyCF_DONT_IMPLY_DEDENT):
-        if token_list and token_list[-1][0] != tokens.NEWLINE:
-            tok = (tokens.NEWLINE, '', lnum, 0, '\n')
+        if token_list and token_list[-1].token_type != tokens.NEWLINE:
+            tok = Token(tokens.NEWLINE, '', lnum, 0, '\n')
             token_list.append(tok)
         for indent in indents[1:]:                # pop remaining indent levels
-            token_list.append((tokens.DEDENT, '', lnum, pos, line))
-    tok = (tokens.NEWLINE, '', lnum, 0, '\n')
+            token_list.append(Token(tokens.DEDENT, '', lnum, pos, line))
+    tok = Token(tokens.NEWLINE, '', lnum, 0, '\n')
     token_list.append(tok)
 
-    token_list.append((tokens.ENDMARKER, '', lnum, pos, line))
+    token_list.append(Token(tokens.ENDMARKER, '', lnum, pos, line))
     return token_list
 
 
diff --git a/pypy/interpreter/pyparser/test/test_pytokenizer.py b/pypy/interpreter/pyparser/test/test_pytokenizer.py
--- a/pypy/interpreter/pyparser/test/test_pytokenizer.py
+++ b/pypy/interpreter/pyparser/test/test_pytokenizer.py
@@ -1,5 +1,6 @@
 import pytest
 from pypy.interpreter.pyparser import pytokenizer
+from pypy.interpreter.pyparser.parser import Token
 from pypy.interpreter.pyparser.pygram import tokens
 from pypy.interpreter.pyparser.error import TokenError
 
@@ -22,12 +23,12 @@
         line = "a+1"
         tks = tokenize(line)
         assert tks == [
-            (tokens.NAME, 'a', 1, 0, line),
-            (tokens.PLUS, '+', 1, 1, line),
-            (tokens.NUMBER, '1', 1, 2, line),
-            (tokens.NEWLINE, '', 2, 0, '\n'),
-            (tokens.NEWLINE, '', 2, 0, '\n'),
-            (tokens.ENDMARKER, '', 2, 0, ''),
+            Token(tokens.NAME, 'a', 1, 0, line),
+            Token(tokens.PLUS, '+', 1, 1, line),
+            Token(tokens.NUMBER, '1', 1, 2, line),
+            Token(tokens.NEWLINE, '', 2, 0, '\n'),
+            Token(tokens.NEWLINE, '', 2, 0, '\n'),
+            Token(tokens.ENDMARKER, '', 2, 0, ''),
             ]
 
     def test_error_parenthesis(self):

