[pypy-svn] r13190 - pypy/branch/pycompiler/module/recparser

adim at codespeak.net
Wed Jun 8 18:46:49 CEST 2005


Author: adim
Date: Wed Jun  8 18:46:48 2005
New Revision: 13190

Modified:
   pypy/branch/pycompiler/module/recparser/pythonlexer.py
Log:
added a PythonSource which uses Jonathan's tokenizer

Modified: pypy/branch/pycompiler/module/recparser/pythonlexer.py
==============================================================================
--- pypy/branch/pycompiler/module/recparser/pythonlexer.py	(original)
+++ pypy/branch/pycompiler/module/recparser/pythonlexer.py	Wed Jun  8 18:46:48 2005
@@ -82,6 +82,7 @@
         self.stack_pos = 0
         self.comment = ''
         self.encoding = None
+
         
     def current_line(self):
         return self._current_line
@@ -289,138 +290,328 @@
             lineno = self.line
         return 'line %s : %s' % (lineno, self._lines[lineno-1])
 
-    ## ONLY refactor ideas ###########################################
-##     def _mynext(self):
-##         """returns the next token from source"""
-##         inp = self.input
-##         pos = self.pos
-##         input_length = len(inp)
-##         if pos >= input_length:
-##             return self.end_of_file()
-##         # Beginning of line
-##         if self.atbol:
-##             self.linestart = pos
-##             col = 0
-##             m = py_ws.match(inp, pos)
-##             pos = m.end()
-##             col = pos - self.linestart
-##             self.atbol = False
-##             # skip blanklines
-##             m = py_comment.match(inp, pos)
-##             if m:
-##                 self.pos = m.end() + 1
-##                 self.line += 1
-##                 self.atbol = True
-##                 return self._next()
-##             # the current block is more indented than the previous one
-##             if col > self.indentstack[-1]:
-##                 self.indentstack.append(col)
-##                 return "INDENT", None
-##             # the current block is less indented than the previous one
-##             while col < self.indentstack[-1]:
-##                 self.pendin += 1
-##                 self.indentstack.pop(-1)
-##             if col != self.indentstack[-1]:
-##                 raise SyntaxError("Indentation Error")
-##         if self.pendin > 0:
-##             self.pendin -= 1
-##             return "DEDENT", None
-##         m = py_skip.match(inp, pos)
-##         if m.group(0)[-1:] == '\n':
-##             self.line += 1
-##         pos = m.end() # always match
-##         if pos >= input_length:
-##             return self.end_of_file()
-##         self.pos = pos
-
-##         c = inp[pos]
-##         chain = (self._check_string, self._check_name, self._check_number,
-##                  self._check_newline, self._check_backslash, self._check_punct)
-##         for check_meth in chain:
-##             token_val_pair = check_meth(c, pos)
-##             if token_val_pair is not None:
-##                 return token_val_pair
+
+################################################################################
+class StringAsFile(object):
+    """XXX: Is StringIO RPython ?"""
+
+    def __init__(self, inpstring):
+        self.lines = inpstring.splitlines(True)
+        self.lineno = 0
+
+    def readline(self):
+        if self.lineno < len(self.lines):
+            line = self.lines[self.lineno]
+            self.lineno += 1
+            return line
+        return ''
+
+
+import token as tokenmod
+from pypy.module.parser.pytokenize import generate_tokens, tabsize, \
+     whiteSpaceDFA, triple_quoted, endDFAs, single_quoted, pseudoDFA
+
+# adopt pytokenize notations / values
+tokenmod.COMMENT = tokenmod.N_TOKENS 
+tokenmod.NL = tokenmod.N_TOKENS + 1
+
+class TokenError(Exception):
+    """Raised when EOF is found prematuerly"""
+    def __init__(self, msg, strstart, token_stack):
+        Exception.__init__(self, msg)
+        self.strstart = strstart
+        self.token_stack = token_stack
+    
+
+def generate_tokens(readline):
+    """
+    This is a rewrite of pypy.module.parser.pytokenize.generate_tokens, since
+    the original function is not RPython (it uses yield).
+    It was also slightly modified to generate Token instances instead
+    of the original 5-tuples.
+
+    Original docstring ::
+    
+        The generate_tokens() generator requires one argument, readline, which
+        must be a callable object which provides the same interface as the
+        readline() method of built-in file objects. Each call to the function
+        should return one line of input as a string.
+
+        The generator produces 5-tuples with these members: the token type; the
+        token string; a 2-tuple (srow, scol) of ints specifying the row and
+        column where the token begins in the source; a 2-tuple (erow, ecol) of
+        ints specifying the row and column where the token ends in the source;
+        and the line on which the token was found. The line passed is the
+        logical line; continuation lines are included.
+    """
+    token_list = []
+    lnum = parenlev = continued = 0
+    namechars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'
+    numchars = '0123456789'
+    contstr, needcont = '', 0
+    contline = None
+    indents = [0]
+    last_comment = ''
+    encoding = None
+    strstart = (0, 0)
+
+    while 1:                                   # loop over lines in stream
+        line = readline()
+        lnum = lnum + 1
+        pos, max = 0, len(line)
+
+        if contstr:                            # continued string
+            if not line:
+                raise TokenError("EOF in multi-line string", strstart, token_list)
+            endmatch = endDFA.recognize(line)
+            if -1 != endmatch:
+                pos = end = endmatch
+                tok = token_from_values(tokenmod.STRING, contstr + line[:end])
+                token_list.append((tok, line))
+                last_comment = ''
+                # token_list.append((STRING, contstr + line[:end],
+                #                    strstart, (lnum, end), contline + line))
+                contstr, needcont = '', 0
+                contline = None
+            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
+                tok = token_from_values(tokenmod.ERRORTOKEN, contstr + line)
+                token_list.append((tok, line))
+                last_comment = ''
+                # token_list.append((ERRORTOKEN, contstr + line,
+                #                    strstart, (lnum, len(line)), contline))
+                contstr = ''
+                contline = None
+                continue
+            else:
+                contstr = contstr + line
+                contline = contline + line
+                continue
+
+        elif parenlev == 0 and not continued:  # new statement
+            if not line: break
+            column = 0
+            while pos < max:                   # measure leading whitespace
+                if line[pos] == ' ': column = column + 1
+                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
+                elif line[pos] == '\f': column = 0
+                else: break
+                pos = pos + 1
+            if pos == max: break
+
+            if line[pos] in '#\r\n':           # skip comments or blank lines
+                if line[pos] == '#':
+                    tok = token_from_values(tokenmod.COMMENT, line[pos:])
+                    last_comment = line[pos:]
+                    if lnum <= 2:
+                        m_enc = py_encoding.search(last_comment)
+                        if m_enc is not None:
+                            encoding = _normalize_encoding(m_enc.group(1))
+                else:
+                    tok = token_from_values(tokenmod.NL, line[pos:])
+                    last_comment = ''
+                # XXX Skip NL and COMMENT Tokens   # token_list.append((tok, line))
+                # token_list.append(((NL, COMMENT)[line[pos] == '#'], line[pos:],
+                #                    (lnum, pos), (lnum, len(line)), line))
+                continue
+
+            if column > indents[-1]:           # count indents or dedents
+                indents.append(column)
+                tok = token_from_values(tokenmod.INDENT, line[:pos])
+                token_list.append((tok, line))
+                last_comment = ''
+                # token_list.append((INDENT, line[:pos], (lnum, 0), (lnum, pos), line))
+            while column < indents[-1]:
+                indents = indents[:-1]
+                tok = token_from_values(tokenmod.DEDENT, '')
+                token_list.append((tok, line))
+                last_comment = ''
+                # token_list.append((DEDENT, '', (lnum, pos), (lnum, pos), line))
+
+        else:                                  # continued statement
+            if not line:
+                raise TokenError("EOF in multi-line statement", (lnum, 0), token_list)
+            continued = 0
+
+        while pos < max:
+            pseudomatch = pseudoDFA.recognize(line, pos)
+            if -1 != pseudomatch:                            # scan for tokens
+                # JDR: Modified
+                start = whiteSpaceDFA.recognize(line, pos)
+                if -1 == start:
+                    start = pos
+                end = pseudomatch
+
+                spos, epos, pos = (lnum, start), (lnum, end), end
+                token, initial = line[start:end], line[start]
+
+                if initial in numchars or \
+                   (initial == '.' and token != '.'):      # ordinary number
+                    tok = token_from_values(tokenmod.NUMBER, token)
+                    token_list.append((tok, line))
+                    last_comment = ''
+                    # token_list.append((NUMBER, token, spos, epos, line))
+                elif initial in '\r\n':
+                    if parenlev > 0:
+                        tok = token_from_values(tokenmod.NL, token)
+                        last_comment = ''
+                        # XXX Skip NL
+                    else:
+                        tok = token_from_values(tokenmod.NEWLINE, token)
+                        # XXX YUCK !
+                        tok.value = last_comment
+                        token_list.append((tok, line))
+                        last_comment = ''
+                    # token_list.append((parenlev > 0 and NL or NEWLINE, token, spos, epos, line))
+                elif initial == '#':
+                    tok = token_from_values(tokenmod.COMMENT, token)
+                    last_comment = token
+                    if lnum <= 2:
+                        m_enc = py_encoding.search(last_comment)
+                        if m_enc is not None:
+                            encoding = _normalize_encoding(m_enc.group(1))
+                    # XXX Skip # token_list.append((tok, line))
+                    # token_list.append((COMMENT, token, spos, epos, line))
+                elif token in triple_quoted:
+                    endDFA = endDFAs[token]
+                    endmatch = endDFA.recognize(line, pos)
+                    if -1 != endmatch:                     # all on one line
+                        pos = endmatch
+                        token = line[start:pos]
+                        tok = token_from_values(tokenmod.STRING, token)
+                        token_list.append((tok, line))
+                        last_comment = ''
+                        # token_list.append((STRING, token, spos, (lnum, pos), line))
+                    else:
+                        strstart = (lnum, start)           # multiple lines
+                        contstr = line[start:]
+                        contline = line
+                        break
+                elif initial in single_quoted or \
+                    token[:2] in single_quoted or \
+                    token[:3] in single_quoted:
+                    if token[-1] == '\n':                  # continued string
+                        strstart = (lnum, start)
+                        endDFA = (endDFAs[initial] or endDFAs[token[1]] or
+                                   endDFAs[token[2]])
+                        contstr, needcont = line[start:], 1
+                        contline = line
+                        break
+                    else:                                  # ordinary string
+                        tok = token_from_values(tokenmod.STRING, token)
+                        token_list.append((tok, line))
+                        last_comment = ''
+                        # token_list.append((STRING, token, spos, epos, line))
+                elif initial in namechars:                 # ordinary name
+                    tok = token_from_values(tokenmod.NAME, token)
+                    token_list.append((tok, line))
+                    last_comment = ''
+                    # token_list.append((NAME, token, spos, epos, line))
+                elif initial == '\\':                      # continued stmt
+                    continued = 1
+                else:
+                    if initial in '([{': parenlev = parenlev + 1
+                    elif initial in ')]}': parenlev = parenlev - 1
+                    tok = token_from_values(tokenmod.OP, token)
+                    token_list.append((tok, line)) 
+                    last_comment = ''
+                    # token_list.append((OP, token, spos, epos, line))
+            else:
+                tok = token_from_values(tokenmod.ERRORTOKEN, line[pos])
+                token_list.append((tok, line))
+                last_comment = ''
+                # token_list.append((ERRORTOKEN, line[pos],
+                #                    (lnum, pos), (lnum, pos+1), line))
+                pos = pos + 1
+
+    last_comment = ''
+    for indent in indents[1:]:                 # pop remaining indent levels
+        tok = token_from_values(tokenmod.DEDENT, '')
+        token_list.append((tok, line))
+        # token_list.append((DEDENT, '', (lnum, 0), (lnum, 0), ''))
+
+    ## <XXX> adim
+    token_list.append((Token('NEWLINE', ''), line))
+    ## </XXX>
+    tok = token_from_values(tokenmod.ENDMARKER, '',)
+    token_list.append((tok, line))
+    # token_list.append((ENDMARKER, '', (lnum, 0), (lnum, 0), ''))
+    return token_list, encoding
+
+
+class PythonSource2(TokenSource):
+    """This source uses Jonathan's tokenizer"""
+    def __init__(self, inpstring):
+        TokenSource.__init__(self)
+        tokens, encoding = generate_tokens(StringAsFile(inpstring).readline)
+        self.token_stack = tokens
+        self.encoding = encoding
+        self._current_line = '' # the current line (as a string)
+        self.stack_pos = 0
+
+    def next(self):
+        if self.stack_pos >= len(self.token_stack):
+            raise StopIteration("Remove me")
+        tok, line = self.token_stack[self.stack_pos]
+        self.stack_pos += 1
+        self._current_line = line
+        return tok
+
+    def current_line(self):
+        return self._current_line
+
+    def context(self):
+        return self.stack_pos
+
+    def restore(self, ctx):
+        self.stack_pos = ctx
+
+    def peek(self):
+        """returns next token without consuming it"""
+        ctx = self.context()
+        token = self.next()
+        self.restore(ctx)
+        return token
+
+    #### methods below have to be translated 
+    def offset(self, ctx=None):
+        if ctx is None:
+            return self.stack_pos
+        else:
+            assert type(ctx)==int
+            return ctx
+
+    def get_pos(self):
+        if self.stack_pos >= len(self.stack):
+            return self.pos
+        else:
+            token, line, pos = self.stack[self.stack_pos]
+            return pos
+
+    def get_source_text(self, pos0, pos1 ):
+        return self.input[pos0:pos1]
         
+    def debug(self):
+        """return context for debug information"""
+        return 'line %s : %s' % ('XXX', self._current_line)
 
-##     def _check_string(self, c, pos):
-##         inp = self.input
-##         input_length = len(inp)
-##         # STRING
-##         if c in ('r', 'R'):
-##             if pos < input_length-1 and inp[pos+1] in ("'",'"'):
-##                 return self.next_string(raw=1)
-##         elif c in ('u','U'):
-##             if pos < input_length - 1:
-##                 if inp[pos+1] in ("r", 'R'):
-##                     if pos<input_length-2 and inp[pos+2] in ("'",'"'):
-##                         return self.next_string(raw = 1, uni = 1)
-##                 elif inp[pos+1] in ( "'", '"' ):
-##                     return self.next_string(uni = 1)
-##         elif c in ( '"', "'" ):
-##             return self.next_string()
-##         return None
-
-##     def _check_name(self, c, pos):
-##         inp = self.input
-##         # NAME
-##         m = py_name.match(inp, pos)
-##         if m:
-##             self.pos = m.end()
-##             val = m.group(0)
-##             if py_keywords.match(val):
-##                 return val, None
-##             return "NAME", val
-##         return None
-
-##     def _check_number(self, c, pos):
-##         inp = self.input
-##         # NUMBER
-##         m = py_number.match(inp, pos)
-##         if m:
-##             self.pos = m.end()
-##             return "NUMBER", m.group(0)
-##         return None
-
-##     def _check_newline(self, c, pos):
-##         # NEWLINE
-##         if c == '\n':
-##             self.pos += 1
-##             self.line += 1
-##             if self.level > 0:
-##                 return self._next()
-##             else:
-##                 self.atbol = True
-##                 return "NEWLINE", None
-##         return None
-            
-##     def _check_backslash(self, c, pos):
-##         inp = self.input
-##         input_length = len(inp)
-##         if c == '\\':
-##             if pos < input_length-1 and inp[pos+1] == '\n':
-##                 self.pos += 2
-##                 return self._next()
-##         return None
-
-##     def _check_punct(self, c, pos):
-##         inp = self.input
-##         input_length = len(inp)
-##         m = py_punct.match(inp, pos)
-##         if m:
-##             punct = m.group(0)
-##             if punct in ( '(', '{' ):
-##                 self.level += 1
-##             if punct in ( ')', '}' ):
-##                 self.level -= 1
-##             self.pos = m.end()
-##             return punct, None
-##         raise SyntaxError("Unrecognized token '%s'" % inp[pos:pos+20] )
+NONE_LIST = [tokenmod.ENDMARKER, tokenmod.INDENT, tokenmod.DEDENT,]
+NAMED_LIST = [tokenmod.OP, ]
 
+def token_from_values(tok_type, tok_string):
+    """XXX Compatibility layer between both parsers"""
+    if tok_type in NONE_LIST:
+        return Token(tokenmod.tok_name[tok_type], None)
+    if tok_type in NAMED_LIST:
+        return Token(tok_string, None)
+    if tok_type == tokenmod.NEWLINE:
+        return Token('NEWLINE', '') # XXX pending comment ?
+    return Token(tokenmod.tok_name[tok_type], tok_string)
 
+Source = PythonSource2
 
 def tokenize_file(filename):
     f = file(filename).read()
-    src = PythonSource(f)
+    src = Source(f)
     token = src.next()
     while token!=("ENDMARKER",None) and token!=(None,None):
         print token
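For reference, the overall pattern in this commit -- wrapping a source string in a
readline() callable and collecting the whole token stream into a list instead of
yielding tokens one by one -- can be sketched with the standard library tokenizer
alone. This is only an illustration, not part of the commit: it uses the stdlib
tokenize/StringIO modules rather than pypy.module.parser.pytokenize, and the helper
name tokenize_string is made up.

from StringIO import StringIO
import token as tokenmod
import tokenize

def tokenize_string(source):
    # hypothetical helper, analogous to PythonSource2 reading from StringAsFile
    readline = StringIO(source).readline      # plays the role of StringAsFile.readline
    token_list = []
    # materialize the generator into a flat list of (name, string) pairs
    for tok_type, tok_string, start, end, line in tokenize.generate_tokens(readline):
        token_list.append((tokenmod.tok_name.get(tok_type, str(tok_type)), tok_string))
    return token_list

for pair in tokenize_string("x = 1\nif x:\n    print x\n"):
    print pair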


