[Python-Dev] A standard lexer?

Fredrik Lundh <effbot@telia.com>
Sun, 2 Jul 2000 18:13:58 +0200


paul wrote:
> As an aside: I would be pumped about getting a generic lexer into the
> Python distribution.

how about this quick and dirty proposal:

- add a new primitive to SRE: (?P#n), where n is a small integer.
  this primitive sets the match object's "index" variable to n when
  the engine stumbles upon it.

- given a list of "phrases", combine them into a single regular
  expression like this:

        (?:phrase1(?P#1))|(?:phrase2(?P#2))|...

- apply match repeatedly to the input string.  for each match,
  use the index attribute to figure out what phrase we matched.

see below for a slightly larger example.

</F>

import sre

class Scanner:
    """Lexer built from a lexicon of (phrase, action) pairs.

    All phrases are combined into a single regular expression of the form
    ``(?:phrase0)(?P#0)|(?:phrase1)(?P#1)|...``.  The ``(?P#n)`` marker is
    the SRE primitive proposed in this post: it sets the match object's
    ``index`` attribute to ``n``, which tells the scanner which phrase
    matched.

    An action may be:
      * a callable, invoked as ``action(scanner, token)``; its return value
        is added to the result unless it is None,
      * None, meaning the matched text is skipped,
      * any other object, which is added to the result as-is.
    """

    def __init__(self, lexicon):
        """Compile *lexicon*, a sequence of (phrase, action) pairs."""
        self.lexicon = lexicon
        # Tag each alternative with its position in the lexicon so scan()
        # can look the action up again through m.index.
        p = [
            "(?:%s)(?P#%d)" % (phrase, n)
            for n, (phrase, action) in enumerate(lexicon)
        ]
        self.scanner = sre.compile("|".join(p))

    def scan(self, string):
        """Tokenize *string* from its start.

        Returns a ``(result, tail)`` tuple: *result* is the list of values
        produced by the actions, *tail* is the part of *string* that was not
        consumed (empty when the whole string was scanned).
        """
        result = []
        append = result.append
        match = self.scanner.match
        i = 0
        while True:
            m = match(string, i)
            if not m:
                break
            j = m.end()
            if i == j:
                # A zero-width match makes no progress; stop instead of
                # looping forever.
                break
            action = self.lexicon[m.index][1]
            if callable(action):
                # Expose the current match object (not the pattern's match
                # method) so the action can inspect it via scanner.match.
                self.match = m
                action = action(self, m.group())
            if action is not None:
                append(action)
            i = j
        return result, string[i:]

def s_ident(scanner, token):
    """Scanner action for identifiers: hand back the matched text unchanged.

    *scanner* is accepted to fit the action signature and is not used.
    """
    identifier = token
    return identifier

def s_operator(scanner, token):
    """Scanner action for operators: return the token prefixed with "operator".

    *scanner* is accepted to fit the action signature and is not used.
    """
    labelled = "operator%s" % token
    return labelled

def s_float(scanner, token):
    """Scanner action for float literals: convert the matched text to a float.

    *scanner* is accepted to fit the action signature and is not used.
    """
    value = float(token)
    return value

def s_int(scanner, token):
    """Scanner action for integer literals: convert the matched text to an int.

    *scanner* is accepted to fit the action signature and is not used.
    """
    value = int(token)
    return value

# Example lexicon.  The float phrase is listed before the integer phrase so
# that "312.50" is scanned as one float rather than as an int followed by
# unmatched text.  Whitespace maps to None and is therefore skipped.
scanner = Scanner([
    (r"[a-zA-Z_]\w*", s_ident),
    (r"\d+\.\d*", s_float),
    (r"\d+", s_int),
    (r"=|\+|-|\*|/", s_operator),
    (r"\s+", None),
    ])

tokens, tail = scanner.scan("sum = 3*foo + 312.50 + bar")

# Single-argument parenthesised print gives the same output as the print
# statement and is also valid under Python 3.
print(tokens)

if tail:
    # A non-empty tail is the input the scanner could not match.
    print("syntax error at %s" % tail)

## prints:
## ['sum', 'operator=', 3, 'operator*', 'foo', 'operator+',
## 312.5, 'operator+', 'bar']