Looking for very simple general purpose tokenizer

Tue Jan 20 05:46:59 EST 2004

Thank you all for your very useful comments. Below I have included my
source. Could you comment on whether there is a more elegant way of
implementing the continuation character &?

With the RE implementation I have noticed that the position of the '*' in
spclist is very delicate. This order works, but other orders throw
exceptions. Is this correct or is it a bug? Lastly, is there more
documentation and examples for the shlex module? Ideally I would like to
see a full scale example of how this module should be used to parse.

Maarten   

import re
import shlex
import StringIO

def splitf90(source):
   """Tokenize Fortran 90 source with shlex and group the tokens by line.

   The lexer treats '!' as the comment character and only blank, tab and
   carriage return as whitespace.  Newline is left out of the whitespace
   set, presumably so that it is delivered as a token for processTokens
   to split lines on.

   Returns the result of processTokens applied to the lexer.

   NOTE(review): shlex appears to consume the rest of the line, including
   its newline, when it meets a comment character -- confirm that a
   trailing '!' comment does not merge a line with the following one.
   """
   lexer = shlex.shlex(StringIO.StringIO(source))
   lexer.whitespace = " \t\r"
   lexer.commenters = "!"
   return processTokens(lexer)

def splitf90_re(source):
   """Tokenize Fortran 90 source with a regular expression.

   Every special character (operator, bracket, separator, blank or
   newline) becomes a token of its own, and each maximal run of any other
   characters becomes a single token.  The raw tokens are handed to
   processTokens, whose result is returned.

   '!' is emitted as an ordinary token here, so comment text is not
   removed by this function.
   """
   # A single string of literal characters replaces the list of
   # hand-escaped fragments.  That list lacked a comma between the ')' and
   # '>' entries, so the two were concatenated into one alternative and a
   # lone ')' or '>' matched nothing and disappeared from the output.
   specials = '*+-/=[]()><&;,:! \n'

   # re.escape makes every character literal inside [...].  Left
   # unescaped, the '-' between '+' and '/' formed a character range that
   # also covered ',' and '.', which silently dropped '.' from the input;
   # other orderings of the list produced invalid ranges and raised.
   charclass = re.escape(specials)

   pat = '[' + charclass + ']|[^' + charclass + ']+'
   rawtokens = re.findall(pat, source)
   return processTokens(rawtokens)

def processTokens(rawtokens):
   """Group a flat stream of tokens into a list of logical lines.

   rawtokens is any iterable of token strings.  The tokens are
   interpreted as follows:

   * ' ' and '' are ignored;
   * ';' is a statement separator and is treated like a newline;
   * a newline whose preceding token is '&' is a line continuation and
     does not end the current line;
   * every '&' is removed from the output;
   * any other newline ends the current line.

   Returns a list of non-empty token lists, one per logical line.  Empty
   lines are never emitted.
   """
   final = []
   curline = []
   prevtoken = None
   for token in rawtokens:
      if token == ' ':
         # Blanks are skipped without touching prevtoken, so "& " followed
         # by a newline is still recognised as a continuation.
         continue
      if token == ';':
         token = '\n'
      if token == '\n':
         if prevtoken == '&':
            # Continuation: swallow this newline only.  Resetting
            # prevtoken lets a following blank line end the statement.
            prevtoken = None
            continue
         if curline:
            final.append(curline)
            curline = []
      elif token != '' and token != '&':
         curline.append(token)
      prevtoken = token

   # Flush the last line when the input does not end with a newline;
   # without this its tokens would be lost.
   if curline:
      final.append(curline)

   return final

# Example session: feed one small Fortran 90 module, containing '&'
# continuation lines and '!' comments, to both tokenizers and print the
# resulting lists of lines.
src = """
MODULE modsize
implicit none

integer, parameter::  &
    Nx =  256, &
    Ny =  256, &
    Nz =  256, &
    nt =    1, &  ! nr of (passive) scalars
    Np =   16     ! nr of processors, should match mpirun -np .. command

END MODULE
"""
# shlex-based tokenizer first, then the regular-expression one
print(splitf90(src))
print(splitf90_re(src))

Output:
[['MODULE', 'modsize'], ['implicit', 'none'], ['integer', ',', 'parameter',
':', ':', 'Nx', '=', '256', ',', 'Ny', '=', '256', ',', 'Nz', '=', '256',
',', 'nt', '=', '1', ',', 'Np', '=', '16'], ['END', 'MODULE']]

[['MODULE', 'modsize'], ['implicit', 'none'], ['integer', ',', 'parameter',
':', ':', 'Nx', '=', '256', ',', 'Ny', '=', '256', ',', 'Nz', '=', '256',
',', 'nt', '=', '1', ',', '!', 'nr', 'of', '(', 'passive', 'scalars'],
['Np', '=', '16', '!', 'nr', 'of', 'processors', ',', 'should', 'match',
'mpirun', '-', 'np', 'command'], ['END', 'MODULE']]

-- 
===================================================================
Maarten van Reeuwijk                        Heat and Fluid Sciences
PhD student                             dept. of Multiscale Physics
www.ws.tn.tudelft.nl                 Delft University of Technology