Regex Speed

garrickp at gmail.com garrickp at gmail.com
Fri Feb 23 18:15:48 EST 2007


On Feb 21, 10:34 am, garri... at gmail.com wrote:
> On Feb 20, 6:14 pm, Pop User <popu... at christest2.dc.k12us.com> wrote:
> >http://swtch.com/~rsc/regexp/regexp1.html

Going back a bit on a tangent, the author of this citation states that
any regex can be expressed as a  DFA machine. However, while
investigating this more I appear to have found one example of a regex
which breaks this assumption.

"ab+c|abd"

Am I correct? Can you think of a deterministic method of computing
this expression? It would be easier with a NFA machine, but given that
the Python method of computing RE's involves pre-compiling a re
object, optimizing the matching engine would make the most sense to
me.

Here's what I have so far:

class State(object):
    def __init__(self):
        self.nextState = {}
        self.nextStateKeys = []
        self.prevState = None
        self.isMatchState = True
    def setNextState(self, chars, iNextState):
        self.nextState[chars] = iNextState
        self.nextStateKeys = self.nextState.keys()
        self.isMatchState = False
    def setPrevState(self, iPrevState):
        self.prevState = iPrevState
    def moveToNextState(self, testChar):
        if testChar in self.nextStateKeys:
            return self.nextState[testChar]
        else:
            return None

class CompiledRegex(object):
    def __init__(self, startState):
        self.startState = startState
    def match(self, matchStr):
        match_set = []
        currentStates = [self.startState]
        nextStates = [self.startState]
        for character in matchStr:
            for state in currentStates:
                nextState = state.moveToNextState(character)
                if nextState is not None:
                    nextStates.append(nextState)
                    if nextState.isMatchState:
                        print "Match!"
                        return
            currentStates = nextStates
            nextStates = [self.startState]
        print "No Match!"

def compile(regexStr):
    startState = State()
    currentState = startState
    backRefState = None
    lastChar = ""
    for character in regexStr:
        if character == "+":
            currentState.setNextState(lastChar, currentState)
        elif character == "|":
            currentState = startState
        elif character == "?":
            backRefState = currentState.prevState
        elif character == "(":
            # Implement "("
            pass
        elif character == ")":
            # Implement ")"
            pass
        elif character == "*":
            currentState = currentState.prevState
            currentState.setNextState(lastChar, currentState)
        else:
            testRepeatState = currentState.moveToNextState(character)
            if testRepeatState is None:
                newState = State()
                newState.setPrevState(currentState)
                currentState.setNextState(character, newState)
                if backRefState is not None:
                    backRefState.setNextState(character, newState)
                    backRefState = None
                currentState = newState
            else:
                currentState = testRepeatState
        lastChar = character
    return CompiledRegex(startState)

>>> a = compile("ab+c")
>>> a.match("abc")
Match!
>>> a.match("abbc")
Match!
>>> a.match("ac")
No Match!
>>> a = compile("ab+c|abd")
>>> a.match("abc")
Match!
>>> a.match("abbc")
Match!
>>> a.match("ac")
No Match!
>>> a.match("abd")
Match!
>>> a.match("abbd")
Match!
>>>




More information about the Python-list mailing list