Code that ought to run fast, but can't due to Python limitations.
Stefan Behnel
stefan_ml at behnel.de
Sun Jul 5 07:52:07 EDT 2009
John Nagle wrote:
> Here's some actual code, from "tokenizer.py". This is called once
> for each character in an HTML document, when in "data" state (outside
> a tag). It's straightforward code, but look at all those
> dictionary lookups.
>
> def dataState(self):
> data = self.stream.char()
>
> # Keep a charbuffer to handle the escapeFlag
> if self.contentModelFlag in\
> (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]):
> if len(self.lastFourChars) == 4:
> self.lastFourChars.pop(0)
> self.lastFourChars.append(data)
>
> # The rest of the logic
> if data == "&" and self.contentModelFlag in\
> (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]) and
> not\
> self.escapeFlag:
> self.state = self.states["entityData"]
> elif data == "-" and self.contentModelFlag in\
> (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and
> not\
> self.escapeFlag and "".join(self.lastFourChars) == "<!--":
> self.escapeFlag = True
> self.tokenQueue.append({"type": "Characters", "data":data})
> elif (data == "<" and (self.contentModelFlag ==
> contentModelFlags["PCDATA"]
> or (self.contentModelFlag in
> (contentModelFlags["CDATA"],
> contentModelFlags["RCDATA"]) and
> self.escapeFlag == False))):
> self.state = self.states["tagOpen"]
> elif data == ">" and self.contentModelFlag in\
> (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
> self.escapeFlag and "".join(self.lastFourChars)[1:] == "-->":
> self.escapeFlag = False
> self.tokenQueue.append({"type": "Characters", "data":data})
> elif data == EOF:
> # Tokenization ends.
> return False
> elif data in spaceCharacters:
> # Directly after emitting a token you switch back to the "data
> # state". At that point spaceCharacters are important so they are
> # emitted separately.
> self.tokenQueue.append({"type": "SpaceCharacters", "data":
> data + self.stream.charsUntil(spaceCharacters, True)})
> # No need to update lastFourChars here, since the first space will
> # have already broken any <!-- or --> sequences
> else:
> chars = self.stream.charsUntil(("&", "<", ">", "-"))
> self.tokenQueue.append({"type": "Characters", "data":
> data + chars})
> self.lastFourChars += chars[-4:]
> self.lastFourChars = self.lastFourChars[-4:]
> return True
Giving this some more thought, what I'd also try is to split the huge
if-elif-else block like this:
    if data in string_with_all_special_characters:
        if data == '&' ...:
            ...
    else:
        ...
So there are three things to improve:
- eliminate common subexpressions which you know are constant
- split the large conditional sequence as shown above
- use separate dataState() methods when inside and outside of CDATA/RCDATA
blocks and (maybe) escaped blocks
Stefan
More information about the Python-list
mailing list