Regex Speed
garrickp at gmail.com
garrickp at gmail.com
Fri Feb 23 18:15:48 EST 2007
On Feb 21, 10:34 am, garri... at gmail.com wrote:
> On Feb 20, 6:14 pm, Pop User <popu... at christest2.dc.k12us.com> wrote:
> >http://swtch.com/~rsc/regexp/regexp1.html
Going back a bit on a tangent, the author of this citation states that
any regex can be expressed as a DFA machine. However, while
investigating this more I appear to have found one example of a regex
which breaks this assumption.
"ab+c|abd"
Am I correct? Can you think of a deterministic method of computing
this expression? It would be easier with a NFA machine, but given that
the Python method of computing RE's involves pre-compiling a re
object, optimizing the matching engine would make the most sense to
me.
Here's what I have so far:
class State(object):
def __init__(self):
self.nextState = {}
self.nextStateKeys = []
self.prevState = None
self.isMatchState = True
def setNextState(self, chars, iNextState):
self.nextState[chars] = iNextState
self.nextStateKeys = self.nextState.keys()
self.isMatchState = False
def setPrevState(self, iPrevState):
self.prevState = iPrevState
def moveToNextState(self, testChar):
if testChar in self.nextStateKeys:
return self.nextState[testChar]
else:
return None
class CompiledRegex(object):
def __init__(self, startState):
self.startState = startState
def match(self, matchStr):
match_set = []
currentStates = [self.startState]
nextStates = [self.startState]
for character in matchStr:
for state in currentStates:
nextState = state.moveToNextState(character)
if nextState is not None:
nextStates.append(nextState)
if nextState.isMatchState:
print "Match!"
return
currentStates = nextStates
nextStates = [self.startState]
print "No Match!"
def compile(regexStr):
startState = State()
currentState = startState
backRefState = None
lastChar = ""
for character in regexStr:
if character == "+":
currentState.setNextState(lastChar, currentState)
elif character == "|":
currentState = startState
elif character == "?":
backRefState = currentState.prevState
elif character == "(":
# Implement "("
pass
elif character == ")":
# Implement ")"
pass
elif character == "*":
currentState = currentState.prevState
currentState.setNextState(lastChar, currentState)
else:
testRepeatState = currentState.moveToNextState(character)
if testRepeatState is None:
newState = State()
newState.setPrevState(currentState)
currentState.setNextState(character, newState)
if backRefState is not None:
backRefState.setNextState(character, newState)
backRefState = None
currentState = newState
else:
currentState = testRepeatState
lastChar = character
return CompiledRegex(startState)
>>> a = compile("ab+c")
>>> a.match("abc")
Match!
>>> a.match("abbc")
Match!
>>> a.match("ac")
No Match!
>>> a = compile("ab+c|abd")
>>> a.match("abc")
Match!
>>> a.match("abbc")
Match!
>>> a.match("ac")
No Match!
>>> a.match("abd")
Match!
>>> a.match("abbd")
Match!
>>>
More information about the Python-list
mailing list