[pypy-svn] r13190 - pypy/branch/pycompiler/module/recparser
adim at codespeak.net
Wed Jun 8 18:46:49 CEST 2005
Author: adim
Date: Wed Jun 8 18:46:48 2005
New Revision: 13190
Modified:
pypy/branch/pycompiler/module/recparser/pythonlexer.py
Log:
added a PythonSource which uses Jonathan's tokenizer
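
The new source can be driven exactly like the old one; a minimal sketch
(illustrative, mirroring the tokenize_file() helper at the end of the file):

    src = PythonSource2("a = 1\n")
    token = src.next()
    while token != ("ENDMARKER", None) and token != (None, None):
        print token
        token = src.next()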
Modified: pypy/branch/pycompiler/module/recparser/pythonlexer.py
==============================================================================
--- pypy/branch/pycompiler/module/recparser/pythonlexer.py (original)
+++ pypy/branch/pycompiler/module/recparser/pythonlexer.py Wed Jun 8 18:46:48 2005
@@ -82,6 +82,7 @@
         self.stack_pos = 0
         self.comment = ''
         self.encoding = None
+
     def current_line(self):
         return self._current_line
@@ -289,138 +290,328 @@
         lineno = self.line
         return 'line %s : %s' % (lineno, self._lines[lineno-1])
-    ## ONLY refactor ideas ###########################################
-##     def _mynext(self):
-##         """returns the next token from source"""
-##         inp = self.input
-##         pos = self.pos
-##         input_length = len(inp)
-##         if pos >= input_length:
-##             return self.end_of_file()
-##         # Beginning of line
-##         if self.atbol:
-##             self.linestart = pos
-##             col = 0
-##             m = py_ws.match(inp, pos)
-##             pos = m.end()
-##             col = pos - self.linestart
-##             self.atbol = False
-##             # skip blanklines
-##             m = py_comment.match(inp, pos)
-##             if m:
-##                 self.pos = m.end() + 1
-##                 self.line += 1
-##                 self.atbol = True
-##                 return self._next()
-##             # the current block is more indented than the previous one
-##             if col > self.indentstack[-1]:
-##                 self.indentstack.append(col)
-##                 return "INDENT", None
-##             # the current block is less indented than the previous one
-##             while col < self.indentstack[-1]:
-##                 self.pendin += 1
-##                 self.indentstack.pop(-1)
-##             if col != self.indentstack[-1]:
-##                 raise SyntaxError("Indentation Error")
-##         if self.pendin > 0:
-##             self.pendin -= 1
-##             return "DEDENT", None
-##         m = py_skip.match(inp, pos)
-##         if m.group(0)[-1:] == '\n':
-##             self.line += 1
-##         pos = m.end() # always match
-##         if pos >= input_length:
-##             return self.end_of_file()
-##         self.pos = pos
-
-##         c = inp[pos]
-##         chain = (self._check_string, self._check_name, self._check_number,
-##                  self._check_newline, self._check_backslash, self._check_punct)
-##         for check_meth in chain:
-##             token_val_pair = check_meth(c, pos)
-##             if token_val_pair is not None:
-##                 return token_val_pair
+
+################################################################################
+class StringAsFile(object):
+    """XXX: Is StringIO RPython?"""
+
+    def __init__(self, inpstring):
+        self.lines = inpstring.splitlines(True)
+        self.lineno = 0
+
+    def readline(self):
+        if self.lineno < len(self.lines):
+            line = self.lines[self.lineno]
+            self.lineno += 1
+            return line
+        return ''
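+
+# Illustrative behaviour (example, not in the original commit):
+# splitlines(True) keeps the line endings, so readline() mimics
+# file.readline() and returns '' once the input is exhausted:
+#   f = StringAsFile("a = 1\nb = 2\n")
+#   f.readline()   # -> 'a = 1\n'
+#   f.readline()   # -> 'b = 2\n'
+#   f.readline()   # -> ''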
+
+
+import token as tokenmod
+from pypy.module.parser.pytokenize import generate_tokens, tabsize, \
+     whiteSpaceDFA, triple_quoted, endDFAs, single_quoted, pseudoDFA
+
+# adopt pytokenize notations / values
+tokenmod.COMMENT = tokenmod.N_TOKENS
+tokenmod.NL = tokenmod.N_TOKENS + 1
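+# (in CPython these two extra codes are defined by the tokenize module,
+# not by token itself, hence the manual registration above)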
+
+class TokenError(Exception):
+ """Raised when EOF is found prematuerly"""
+ def __init__(self, msg, strstart, token_stack):
+ Exception.__init__(self, msg)
+ self.strstart = strstart
+ self.token_stack = token_stack
+
+
+def generate_tokens(readline):
+    """
+    This is a rewrite of pypy.module.parser.pytokenize.generate_tokens since
+    the original function is not RPython (it uses yield).
+    It was also slightly modified to generate Token instances instead
+    of the original 5-tuples.
+
+    Original docstring ::
+
+        The generate_tokens() generator requires one argument, readline, which
+        must be a callable object which provides the same interface as the
+        readline() method of built-in file objects. Each call to the function
+        should return one line of input as a string.
+
+        The generator produces 5-tuples with these members: the token type; the
+        token string; a 2-tuple (srow, scol) of ints specifying the row and
+        column where the token begins in the source; a 2-tuple (erow, ecol) of
+        ints specifying the row and column where the token ends in the source;
+        and the line on which the token was found. The line passed is the
+        logical line; continuation lines are included.
+    """
+    token_list = []
+    lnum = parenlev = continued = 0
+    namechars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'
+    numchars = '0123456789'
+    contstr, needcont = '', 0
+    contline = None
+    indents = [0]
+    last_comment = ''
+    encoding = None
+    strstart = (0, 0)
+
+    while 1:  # loop over lines in stream
+        line = readline()
+        lnum = lnum + 1
+        pos, max = 0, len(line)
+
+        if contstr:  # continued string
+            if not line:
+                raise TokenError("EOF in multi-line string", strstart, token_list)
+            endmatch = endDFA.recognize(line)
+            if -1 != endmatch:
+                pos = end = endmatch
+                tok = token_from_values(tokenmod.STRING, contstr + line[:end])
+                token_list.append((tok, line))
+                last_comment = ''
+                # token_list.append((STRING, contstr + line[:end],
+                #                    strstart, (lnum, end), contline + line))
+                contstr, needcont = '', 0
+                contline = None
+            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
+                tok = token_from_values(tokenmod.ERRORTOKEN, contstr + line)
+                token_list.append((tok, line))
+                last_comment = ''
+                # token_list.append((ERRORTOKEN, contstr + line,
+                #                    strstart, (lnum, len(line)), contline))
+                contstr = ''
+                contline = None
+                continue
+            else:
+                contstr = contstr + line
+                contline = contline + line
+                continue
+
+        elif parenlev == 0 and not continued:  # new statement
+            if not line: break
+            column = 0
+            while pos < max:  # measure leading whitespace
+                if line[pos] == ' ': column = column + 1
+                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
+                elif line[pos] == '\f': column = 0
+                else: break
+                pos = pos + 1
+            if pos == max: break
+
+            if line[pos] in '#\r\n':  # skip comments or blank lines
+                if line[pos] == '#':
+                    tok = token_from_values(tokenmod.COMMENT, line[pos:])
+                    last_comment = line[pos:]
+                    if lnum <= 2:
+                        m_enc = py_encoding.search(last_comment)
+                        if m_enc is not None:
+                            encoding = _normalize_encoding(m_enc.group(1))
+                else:
+                    tok = token_from_values(tokenmod.NL, line[pos:])
+                    last_comment = ''
+                # XXX Skip NL and COMMENT Tokens # token_list.append((tok, line))
+                # token_list.append(((NL, COMMENT)[line[pos] == '#'], line[pos:],
+                #                    (lnum, pos), (lnum, len(line)), line))
+                continue
+
+            if column > indents[-1]:  # count indents or dedents
+                indents.append(column)
+                tok = token_from_values(tokenmod.INDENT, line[:pos])
+                token_list.append((tok, line))
+                last_comment = ''
+                # token_list.append((INDENT, line[:pos], (lnum, 0), (lnum, pos), line))
+            while column < indents[-1]:
+                indents = indents[:-1]
+                tok = token_from_values(tokenmod.DEDENT, '')
+                token_list.append((tok, line))
+                last_comment = ''
+                # token_list.append((DEDENT, '', (lnum, pos), (lnum, pos), line))
+
+        else:  # continued statement
+            if not line:
+                raise TokenError("EOF in multi-line statement", (lnum, 0), token_list)
+            continued = 0
+
+        while pos < max:
+            pseudomatch = pseudoDFA.recognize(line, pos)
+            if -1 != pseudomatch:  # scan for tokens
+                # JDR: Modified
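+                # the pseudo DFA matches optional whitespace followed by one
+                # token; whiteSpaceDFA reports where the whitespace ends, so
+                # line[start:end] below is the bare token text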
+                start = whiteSpaceDFA.recognize(line, pos)
+                if -1 == start:
+                    start = pos
+                end = pseudomatch
+
+                spos, epos, pos = (lnum, start), (lnum, end), end
+                token, initial = line[start:end], line[start]
+
+                if initial in numchars or \
+                   (initial == '.' and token != '.'):  # ordinary number
+                    tok = token_from_values(tokenmod.NUMBER, token)
+                    token_list.append((tok, line))
+                    last_comment = ''
+                    # token_list.append((NUMBER, token, spos, epos, line))
+                elif initial in '\r\n':
+                    if parenlev > 0:
+                        tok = token_from_values(tokenmod.NL, token)
+                        last_comment = ''
+                        # XXX Skip NL
+                    else:
+                        tok = token_from_values(tokenmod.NEWLINE, token)
+                        # XXX YUCK !
+                        tok.value = last_comment
+                        token_list.append((tok, line))
+                        last_comment = ''
+                    # token_list.append((parenlev > 0 and NL or NEWLINE, token, spos, epos, line))
+                elif initial == '#':
+                    tok = token_from_values(tokenmod.COMMENT, token)
+                    last_comment = token
+                    if lnum <= 2:
+                        m_enc = py_encoding.search(last_comment)
+                        if m_enc is not None:
+                            encoding = _normalize_encoding(m_enc.group(1))
+                    # XXX Skip # token_list.append((tok, line))
+                    # token_list.append((COMMENT, token, spos, epos, line))
+                elif token in triple_quoted:
+                    endDFA = endDFAs[token]
+                    endmatch = endDFA.recognize(line, pos)
+                    if -1 != endmatch:  # all on one line
+                        pos = endmatch
+                        token = line[start:pos]
+                        tok = token_from_values(tokenmod.STRING, token)
+                        token_list.append((tok, line))
+                        last_comment = ''
+                        # token_list.append((STRING, token, spos, (lnum, pos), line))
+                    else:
+                        strstart = (lnum, start)  # multiple lines
+                        contstr = line[start:]
+                        contline = line
+                        break
+                elif initial in single_quoted or \
+                     token[:2] in single_quoted or \
+                     token[:3] in single_quoted:
+                    if token[-1] == '\n':  # continued string
+                        strstart = (lnum, start)
+                        endDFA = (endDFAs[initial] or endDFAs[token[1]] or
+                                  endDFAs[token[2]])
+                        contstr, needcont = line[start:], 1
+                        contline = line
+                        break
+                    else:  # ordinary string
+                        tok = token_from_values(tokenmod.STRING, token)
+                        token_list.append((tok, line))
+                        last_comment = ''
+                        # token_list.append((STRING, token, spos, epos, line))
+                elif initial in namechars:  # ordinary name
+                    tok = token_from_values(tokenmod.NAME, token)
+                    token_list.append((tok, line))
+                    last_comment = ''
+                    # token_list.append((NAME, token, spos, epos, line))
+                elif initial == '\\':  # continued stmt
+                    continued = 1
+                else:
+                    if initial in '([{': parenlev = parenlev + 1
+                    elif initial in ')]}': parenlev = parenlev - 1
+                    tok = token_from_values(tokenmod.OP, token)
+                    token_list.append((tok, line))
+                    last_comment = ''
+                    # token_list.append((OP, token, spos, epos, line))
+            else:
+                tok = token_from_values(tokenmod.ERRORTOKEN, line[pos])
+                token_list.append((tok, line))
+                last_comment = ''
+                # token_list.append((ERRORTOKEN, line[pos],
+                #                    (lnum, pos), (lnum, pos+1), line))
+                pos = pos + 1
+
+    last_comment = ''
+    for indent in indents[1:]:  # pop remaining indent levels
+        tok = token_from_values(tokenmod.DEDENT, '')
+        token_list.append((tok, line))
+        # token_list.append((DEDENT, '', (lnum, 0), (lnum, 0), ''))
+
+    ## <XXX> adim
+    token_list.append((Token('NEWLINE', ''), line))
+    ## </XXX>
+    tok = token_from_values(tokenmod.ENDMARKER, '')
+    token_list.append((tok, line))
+    # token_list.append((ENDMARKER, '', (lnum, 0), (lnum, 0), ''))
+    return token_list, encoding
+
+
+class PythonSource2(TokenSource):
+    """This source uses Jonathan's tokenizer"""
+    def __init__(self, inpstring):
+        TokenSource.__init__(self)
+        tokens, encoding = generate_tokens(StringAsFile(inpstring).readline)
+        self.token_stack = tokens
+        self.encoding = encoding
+        self._current_line = ''  # the current line (as a string)
+        self.stack_pos = 0
+
+    def next(self):
+        if self.stack_pos >= len(self.token_stack):
+            raise StopIteration("Remove me")
+        tok, line = self.token_stack[self.stack_pos]
+        self.stack_pos += 1
+        self._current_line = line
+        return tok
+
+    def current_line(self):
+        return self._current_line
+
+    def context(self):
+        return self.stack_pos
+
+    def restore(self, ctx):
+        self.stack_pos = ctx
+
+    def peek(self):
+        """returns next token without consuming it"""
+        ctx = self.context()
+        token = self.next()
+        self.restore(ctx)
+        return token
+
+    #### methods below have to be translated
+    def offset(self, ctx=None):
+        if ctx is None:
+            return self.stack_pos
+        else:
+            assert type(ctx) == int
+            return ctx
+
+    def get_pos(self):
+        if self.stack_pos >= len(self.stack):
+            return self.pos
+        else:
+            token, line, pos = self.stack[self.stack_pos]
+            return pos
+
+    def get_source_text(self, pos0, pos1):
+        return self.input[pos0:pos1]
+
+    def debug(self):
+        """return context for debug information"""
+        return 'line %s : %s' % ('XXX', self._current_line)
-##     def _check_string(self, c, pos):
-##         inp = self.input
-##         input_length = len(inp)
-##         # STRING
-##         if c in ('r', 'R'):
-##             if pos < input_length-1 and inp[pos+1] in ("'",'"'):
-##                 return self.next_string(raw=1)
-##         elif c in ('u','U'):
-##             if pos < input_length - 1:
-##                 if inp[pos+1] in ("r", 'R'):
-##                     if pos<input_length-2 and inp[pos+2] in ("'",'"'):
-##                         return self.next_string(raw = 1, uni = 1)
-##                 elif inp[pos+1] in ( "'", '"' ):
-##                     return self.next_string(uni = 1)
-##         elif c in ( '"', "'" ):
-##             return self.next_string()
-##         return None
-
-##     def _check_name(self, c, pos):
-##         inp = self.input
-##         # NAME
-##         m = py_name.match(inp, pos)
-##         if m:
-##             self.pos = m.end()
-##             val = m.group(0)
-##             if py_keywords.match(val):
-##                 return val, None
-##             return "NAME", val
-##         return None
-
-##     def _check_number(self, c, pos):
-##         inp = self.input
-##         # NUMBER
-##         m = py_number.match(inp, pos)
-##         if m:
-##             self.pos = m.end()
-##             return "NUMBER", m.group(0)
-##         return None
-
-##     def _check_newline(self, c, pos):
-##         # NEWLINE
-##         if c == '\n':
-##             self.pos += 1
-##             self.line += 1
-##             if self.level > 0:
-##                 return self._next()
-##             else:
-##                 self.atbol = True
-##                 return "NEWLINE", None
-##         return None
-
-##     def _check_backslash(self, c, pos):
-##         inp = self.input
-##         input_length = len(inp)
-##         if c == '\\':
-##             if pos < input_length-1 and inp[pos+1] == '\n':
-##                 self.pos += 2
-##                 return self._next()
-##         return None
-
-##     def _check_punct(self, c, pos):
-##         inp = self.input
-##         input_length = len(inp)
-##         m = py_punct.match(inp, pos)
-##         if m:
-##             punct = m.group(0)
-##             if punct in ( '(', '{' ):
-##                 self.level += 1
-##             if punct in ( ')', '}' ):
-##                 self.level -= 1
-##             self.pos = m.end()
-##             return punct, None
-##         raise SyntaxError("Unrecognized token '%s'" % inp[pos:pos+20])
+NONE_LIST = [tokenmod.ENDMARKER, tokenmod.INDENT, tokenmod.DEDENT,]
+NAMED_LIST = [tokenmod.OP, ]
+def token_from_values(tok_type, tok_string):
+    """XXX Compatibility layer between both parsers"""
+    if tok_type in NONE_LIST:
+        return Token(tokenmod.tok_name[tok_type], None)
+    if tok_type in NAMED_LIST:
+        return Token(tok_string, None)
+    if tok_type == tokenmod.NEWLINE:
+        return Token('NEWLINE', '')  # XXX pending comment ?
+    return Token(tokenmod.tok_name[tok_type], tok_string)
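+# Illustrative mapping (example, not in the original commit; assumes the
+# Token class used throughout this file):
+#   token_from_values(tokenmod.OP, '(')    -> Token('(', None)
+#   token_from_values(tokenmod.NAME, 'x')  -> Token('NAME', 'x')
+#   token_from_values(tokenmod.INDENT, '') -> Token('INDENT', None)
+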
+Source = PythonSource2
 def tokenize_file(filename):
     f = file(filename).read()
-    src = PythonSource(f)
+    src = Source(f)
     token = src.next()
     while token!=("ENDMARKER",None) and token!=(None,None):
         print token
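
For the record, the context()/restore()/peek() trio is what the recursive
descent parser uses to backtrack; a minimal sketch of how it behaves with
the new source (illustrative, not part of the diff):

    src = Source("a = 1\n")
    ctx = src.context()          # remember the current stack position
    first = src.next()           # consume one token
    src.restore(ctx)             # rewind to the saved position
    assert src.peek() == first   # peek() is next() followed by restore()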