Faster way of Parsing and Rendering?

Jane Austine janeaustine50 at hotmail.com
Sat May 24 14:39:27 EDT 2003


Hello

I posted a question on the performance of reST a few days ago and I'm
experimenting with several candidates. One of them is the almighty
mxTextTools. Since handling it directly seemed a hard job, I landed on
SimpleParse.

However, when I compared my SimpleParse version with the general RE
approach used in MoinMoin, the result was surprising: the regex version
was much (about 10x) faster.

Is there anything you can see that would improve the performance of the
following code? (I'm almost 100% sure mine isn't the most efficient.) Or
should I go for the regex?

In addition to that, there are some questions concerning some patterns
which I couldn't easily do with ebnf.

Thanks in advance.

---------------------------------------

import time,re

ebnf = r'''
>body<:=(markup/plain)+
>markup<:=wikiname/email/table_open/table_close/rule/heading/emph/italic
plain:=-markup*
wikiname:=([A-Z],(([A-Z]+,[a-z])/([a-z]+,[A-Z])),[A-Za-z0-9]*)
email:=[-a-zA-Z0-9._+]+,"@",[-a-zA-Z0-9.]+
italic := "\'\'",?-"\'"
emph:="\'\'\'"
table_open:="{{|" #inside table, body should come recursively, but it
doesn't work here. How can we make it recursive, like "{{|",body,"|}}"
(which doesn't work)
table_close:="|}}"
rule:="-","-","-","-"+
heading:= ("="+,-[=\n]+,"="+) #the num of first "="s must equal to the
second
'''

# Sample wiki text that both the SimpleParse parser and the regex renderer
# are run against.
# NOTE(review): "hello at myworld.com" looks like mailing-list-archive
# obfuscation of "hello@myworld.com". Both email rules in this file require
# a literal "@", so as written the email branch is never exercised -- confirm
# against the original post.
testData = """
abc DEF ghi ''hello'' FrontPage and hello at myworld.com
{{|
''italic'' and '''emphasis'''
|}}
Wiki test
"""
from simpleparse.parser import Parser

class BasicMethodSource:
    """Collect rendered output fragments from parser match callbacks.

    Every ``_m_<name>`` method appends the rendering for one grammar
    production to ``self.results``; joining ``results`` yields the rendered
    text.  The methods are presumably looked up by SimpleParse by production
    name (``_m_`` prefix) -- confirm against the SimpleParse documentation.

    All callbacks share the same positional signature:

        taglist  -- not used by any callback here
        text     -- the full string being parsed
        l, r     -- start/end offsets; ``text[l:r]`` is the matched span
        subtags  -- not used by any callback here

    Note: ``results`` is only initialised in ``__init__``; it keeps growing
    if the same instance is used for several parses.
    """

    def __init__(self):
        # Rendered fragments, in the order the callbacks were invoked.
        self.results = []

    def _m_table_open(self, taglist, text, l, r, subtags):
        """Emit the opening tag for a table marker."""
        self.results.append("<something>")

    def _m_emph(self, taglist, text, l, r, subtags):
        """Emit the emphasis tag."""
        self.results.append("<em>")

    def _m_italic(self, taglist, text, l, r, subtags):
        """Emit the italic tag."""
        self.results.append("<il>")

    def _m_table_close(self, taglist, text, l, r, subtags):
        """Emit the closing tag for a table marker."""
        self.results.append("</something>")

    def _m_plain(self, taglist, text, l, r, subtags):
        """Pass the matched plain text through unchanged."""
        self.results.append(text[l:r])

    def _m_wikiname(self, taglist, text, l, r, subtags):
        """Wrap the matched wiki name in a link."""
        self.results.append("<a href>%s</a>" % text[l:r])

    def _m_rule(self, taglist, text, l, r, subtags):
        """Emit a horizontal rule."""
        self.results.append("<rule/>")

    def _m_heading(self, taglist, text, l, r, subtags):
        """Wrap the matched heading (including its '=' markers) in a head tag."""
        self.results.append("<head>%s</head>" % text[l:r])

    def _m_email(self, tl, text, l, r, sub):
        """Wrap the matched e-mail address in a link."""
        self.results.append("<a href>%s</a>" % text[l:r])



# Callback object whose ``results`` list receives the rendered fragments,
# and the SimpleParse parser built from the grammar with "body" as the
# root production.
source = BasicMethodSource()
parser = Parser(ebnf, "body")

class ReParser:
    """Render wiki markup matched by a regex made of named groups.

    ``distributor`` is designed to be passed as the replacement callable to
    ``pattern.sub``: for a match produced by the named group ``foo`` it
    returns ``self._foo(matched_text)``.
    """

    def distributor(self, match):
        """Return the replacement text for ``match``.

        Args:
            match: a ``re`` match object whose pattern uses named groups,
                one per kind of markup.

        Returns:
            Whatever the ``_<group name>`` handler returns for the text of
            the named group that matched, or ``None`` when no named group
            took part in the match.
        """
        # For an alternation of named groups, ``lastgroup`` names the
        # alternative that matched, so no scan over all groups is needed.
        name = match.lastgroup
        if name is None:
            # ``lastgroup`` is None when the last group to close was
            # unnamed (e.g. a named group wrapped in plain parentheses);
            # fall back to scanning.  ``items()`` works on Python 2 and 3.
            for key, value in match.groupdict().items():
                if value is not None:
                    name = key
                    break
            else:
                return None
        return getattr(self, "_" + name)(match.group(name))

    def _table_open(self, hit):
        """Opening tag for a table marker."""
        return "<something>"

    def _table_close(self, hit):
        """Closing tag for a table marker."""
        return "</something>"

    def _emph(self, hit):
        """Emphasis tag."""
        return "<em>"

    def _italic(self, hit):
        """Italic tag."""
        return "<il>"

    def _wikiname(self, hit):
        """Link wrapping the matched wiki name."""
        return "<a href>%s</a>" % hit

    def _rule(self, hit):
        """Horizontal rule."""
        return "<rule/>"

    def _heading(self, hit):
        """Head tag wrapping the matched heading text."""
        return "<head>%s</head>" % hit

    def _email(self, hit):
        """Link wrapping the matched e-mail address."""
        return "<a href>%s</a>" % hit

# Compiled verbose-mode pattern: one named group per kind of markup.  The
# group names match the ReParser handler names without the leading "_".
rep = re.compile(
    r"""(?P<table_open>{{\|)|
      (?P<table_close>\|}})|
      (?P<emph>''')|
      (?P<italic>'')|
      (?P<wikiname>\b[A-Z]([A-Z]+[a-z]|[a-z]+[A-Z])[A-Za-z0-9]*\b)|
      (?P<rule>-{4,})|
      (?P<heading>(?:=+)(\s.*\s)(?:=+))|
      (?P<email>[-\w._+]+\@[\w.-]+)
   """,
    re.VERBOSE,
)

def testReParse():
    """Render ``testData`` with the regex approach.

    Every match of the module-level pattern ``rep`` is replaced by the value
    a fresh ``ReParser``'s ``distributor`` returns for it; unmatched text is
    kept as is.  Returns the resulting string.
    """
    return rep.sub(ReParser().distributor, testData)


def tagpprint(tag, i=0, text=None):
    """Print a tree of ``(name, start, end, children)`` tuples, one per line.

    Args:
        tag: iterable of 4-tuples ``(name, start, end, children)``, where
            ``children`` is a list of further such tuples or a non-list
            value (e.g. ``None``) for leaves.
        i: nesting depth; each level is indented by one tab.
        text: string that ``start``/``end`` index into.  Defaults to the
            module-level ``testData``.
    """
    if text is None:
        text = testData
    # The original Python 2 ``print "\t"*i, name, ...`` statement wrote a
    # single space before the name at depth 0 (empty indent) and nothing
    # after a tab indent; keep the output byte-identical.
    indent = "\t" * i or " "
    for name, start, end, children in tag:
        fields = (name, start, end, ":", repr(text[start:end]), children)
        # A single pre-built string prints identically on Python 2 and 3.
        print(indent + " ".join([str(field) for field in fields]))
        # isinstance also accepts list subclasses, unlike type(x)==type([]).
        if isinstance(children, list):
            tagpprint(children, i + 1, text)
    
# Script entry point: time one SimpleParse run and one regex run over
# testData, check that both render the same output, then dump the parse tree.
# NOTE(review): this is Python 2 code (print statements); time.clock() was
# removed in Python 3.8, so a port needs a different timer.
if __name__ =="__main__":
    # SimpleParse run: the callbacks on `source` collect the rendered
    # fragments, which are joined into the final string.
    s=time.clock()
    _,r,rlen= parser.parse(testData,processor=source)
    res1=''.join(source.results)
    print "sp",time.clock()-s

    # Regex run over the same input.
    s=time.clock()
    res2= testReParse()
    print "re",time.clock()-s

    # Both renderers must agree; the length check runs first so a mismatch
    # reports the two lengths.
    assert len(res1)==len(res2),"%d %d"%(len(res1),len(res2))
    assert res1==res2
    print "done"

    # Parse once more without a processor and pretty-print the result tree.
    _,r,rlen= parser.parse(testData)
    tagpprint(r)




More information about the Python-list mailing list