My first Python program -- a lexer

Thomas Mlynarczyk thomas at mlynarczyk-webdesign.de
Sat Nov 8 15:55:51 EST 2008


Hello,

I have started to write a lexer in Python -- my first attempt to do something 
useful with Python (rather than just trying out snippets from tutorials). It 
is not complete yet, but I would like some feedback -- I'm a Python 
newbie, and it seems that with Python there is always a simpler and 
better way to do things than you first think.

### Begin ###

import re

class Lexer(object):
    def __init__( self, source, tokens ):
        # Normalise all line endings (\r\n, \r, \n) to plain \n.
        self.source = re.sub( r"\r\n|\r|\n", "\n", source )
        self.tokens = tokens
        self.offset = 0
        self.result = []
        self.line   = 1
        self._compile()
        self._tokenize()

    def _compile( self ):
        # Replace each regex string in the dict with its compiled pattern
        # (note: this modifies the caller's dict in place).
        for name, regex in self.tokens.iteritems():
            self.tokens[name] = re.compile( regex, re.M )

    def _tokenize( self ):
        while self.offset < len( self.source ):
            # Try every token pattern at the current offset; the first one
            # that matches wins (dict order decides which is tried first).
            for name, regex in self.tokens.iteritems():
                match = regex.match( self.source, self.offset )
                if not match: continue
                self.offset += len( match.group(0) )
                self.result.append( ( name, match, self.line ) )
                self.line += match.group(0).count( "\n" )
                break
            else:
                # No pattern matched at this offset.
                raise Exception(
                    'Syntax error in source at offset %s' % self.offset )

    def __str__( self ):
        return "\n".join(
            [ "[L:%s]\t[O:%s]\t[%s]\t'%s'" %
              ( line, match.start(), name, match.group(0) )
              for name, match, line in self.result ] )

# Test Example

source = r"""
     Name: "Thomas", # just a comment
     Age: 37
"""

tokens = {
     'T_IDENTIFIER' : r'[A-Za-z_][A-Za-z0-9_]*',
     'T_NUMBER'     : r'[+-]?\d+',
     'T_STRING'     : r'"(?:\\.|[^\\"])*"',
     'T_OPERATOR'   : r'[=:,;]',
     'T_NEWLINE'    : r'\n',
     'T_LWSP'       : r'[ \t]+',
     'T_COMMENT'    : r'(?:\#|//).*$' }

print Lexer( source, tokens )

### End ###
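
Just to sketch how I picture consuming the token list afterwards -- the
`skip` tuple and the output format are only made-up examples, and this
would run in place of the final print above (since _compile() replaces
the dict values with compiled patterns in place, the same tokens dict
cannot simply be fed to a second Lexer):

lexer = Lexer( source, tokens )
skip  = ( 'T_LWSP', 'T_NEWLINE', 'T_COMMENT' )  # layout tokens I would ignore
for name, match, line in lexer.result:
    if name in skip: continue
    print "line %d: %-12s %r" % ( line, name, match.group(0) )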


Greetings,
Thomas

-- 
Just because many people are wrong does not make them right!
(Coluche)


