scanf style parsing

Mon Oct 1 05:06:14 EDT 2001

aahz at panix.com (Aahz Maruch) wrote in news:9p84ea$cd7$1 at panix3.panix.com:

>>Wouldn't you be happier with this?:
>>
>>   extensions = ['.htm', '.html', '.shtm', '.shtml', '.phtm', '.phtml',
>>                 '.cgi', '.php', '.php2', '.php3', '.php4', '.pl']
>>   ext = os.path.splitext(filename)[1]
>>   if ext in extensions: ...
>>
>>which has the arguable advantage of matching what your description says
>>instead of what your original code does. 
> 
> Well, if you're going to do that, extensions should be a dict for real
> speed.  ;-)

Not until you have profiled the code and determined that this particular 
lookup is a bottleneck, and that your proposed improvement actually has 
some benefit. Get the code right first, then get it fast.

I knocked up a quick test program (attached at the end of the post) to do 
just that and the results are perhaps surprising:
testdict:      1.290, each loop 161.2uS match 6000, nomatch 2000
testlist:      1.304, each loop 163.0uS match 6000, nomatch 2000
testregex:     0.071, each loop   8.9uS match 6000, nomatch 2000
testlist1:     0.162, each loop  20.2uS match 6000, nomatch 2000

The first time around, the dictionary lookup actually came out slower than
the list: I had to hoist the extensions.has_key lookup out of the loop to
make it faster. The real killer, though, is the call to splitext. Replacing
it with a home-rolled version (testlist1), which doesn't work correctly in
all cases but does a good enough job for this situation, brings the time
down significantly, but the regex still wins.

The moral is, never jump to conclusions over speed in Python.

---- test.py ----
import time
import re

# Extensions appended to every base name.  The first twelve are the "web"
# extensions the benchmarks look for; the last four ('.py', '.txt', '.exe'
# and the empty string) are the ones that must not match.
TESTEXT = (
    '.htm', '.html', '.shtm', '.shtml', '.phtm', '.phtml',
    '.cgi', '.php', '.php2', '.php3', '.php4', '.pl',
    '.py', '.txt', '.exe', '',
)

# Base names: bare names, Unix paths, and one Windows path whose directory
# component contains a dot.
TESTBASE = (
    'foo',
    'bar',
    '/usr/local/bin/foo',
    '/usr/local/bin/bar',
    'd:\\Program Files\\Silly.dir\\fred',
)

# Every base name combined with every extension, grouped by base name
# (5 * 16 = 80 file names).
TESTFILES = [filename + ext for filename in TESTBASE for ext in TESTEXT]

# Number of passes each benchmark makes over TESTFILES.
LOOPS = 100

def testregex():
    """Benchmark extension matching with a single precompiled regex.

    Makes LOOPS passes over TESTFILES, counting the names the pattern
    matches and those it does not, then hands the elapsed time and both
    counts to results() under the label "testregex".
    """
    # timeit.default_timer selects the most precise clock for the platform.
    # time.clock() is coarse on Unix and no longer exists in Python 3.8+.
    from timeit import default_timer as timer

    match, nomatch = 0, 0
    # Anchored at the end of the name: .htm/.html with an optional leading
    # 'p' or 's', .cgi, .php with an optional single digit, or .pl.
    re_web_files = re.compile(r'\.([ps]?html?|cgi|php[\d]?|pl)$')
    start = timer()
    for _ in range(LOOPS):
        for filename in TESTFILES:
            if re_web_files.search(filename):
                match += 1
            else:
                nomatch += 1
    stend = timer()
    results("testregex", start, stend, match, nomatch)

def testlist():
    """Benchmark os.path.splitext plus a membership test against a list.

    Makes LOOPS passes over TESTFILES, splitting the extension off each
    name with os.path.splitext and checking it against a list of web
    extensions, then hands the elapsed time and the match/nomatch counts
    to results() under the label "testlist".
    """
    # timeit.default_timer selects the most precise clock for the platform.
    # time.clock() is coarse on Unix and no longer exists in Python 3.8+.
    from timeit import default_timer as timer
    from os.path import splitext

    match, nomatch = 0, 0
    # Deliberately a list: the list lookup is what this benchmark measures.
    extensions = ['.htm', '.html', '.shtm', '.shtml', '.phtm',
                  '.phtml', '.cgi', '.php', '.php2', '.php3', '.php4',
                  '.pl']
    start = timer()
    for _ in range(LOOPS):
        for filename in TESTFILES:
            ext = splitext(filename)[1]
            if ext in extensions:
                match += 1
            else:
                nomatch += 1
    stend = timer()
    results("testlist", start, stend, match, nomatch)

def testlist1():
    """Benchmark a simplified splitext plus a membership test against a list.

    Same as the list benchmark, except that os.path.splitext is replaced by
    a minimal local version based on str.rfind.  Makes LOOPS passes over
    TESTFILES and hands the elapsed time and the match/nomatch counts to
    results() under the label "testlist1".
    """
    # timeit.default_timer selects the most precise clock for the platform.
    # time.clock() is coarse on Unix and no longer exists in Python 3.8+.
    from timeit import default_timer as timer

    match, nomatch = 0, 0
    extensions = ['.htm', '.html', '.shtm', '.shtml', '.phtm',
                  '.phtml', '.cgi', '.php', '.php2', '.php3', '.php4',
                  '.pl']

    def splitext(filename):
        '''Split filename at its last dot into (root, extension).

        This doesn't always correctly split the extension (a dot in a
        directory name is treated as the start of the extension), but
        within the requirements of this test it will work, as it will
        give nomatch in the cases where it fails.'''
        pos = filename.rfind('.')
        if pos >= 0:
            return (filename[:pos], filename[pos:])
        return (filename, '')

    start = timer()
    for _ in range(LOOPS):
        for filename in TESTFILES:
            ext = splitext(filename)[1]
            if ext in extensions:
                match += 1
            else:
                nomatch += 1
    stend = timer()

    results("testlist1", start, stend, match, nomatch)

def testdict():
    """Benchmark os.path.splitext plus a membership test against a dict.

    Makes LOOPS passes over TESTFILES, splitting the extension off each
    name with os.path.splitext and looking it up in a dict keyed by web
    extension, then hands the elapsed time and the match/nomatch counts
    to results() under the label "testdict".
    """
    # timeit.default_timer selects the most precise clock for the platform.
    # time.clock() is coarse on Unix and no longer exists in Python 3.8+.
    from timeit import default_timer as timer
    from os.path import splitext

    match, nomatch = 0, 0
    # Only the keys matter; the values are placeholders.
    extensions = {'.htm': 0, '.html': 0, '.shtm': 0, '.shtml': 0,
                  '.phtm': 0, '.phtml': 0, '.cgi': 0, '.php': 0,
                  '.php2': 0, '.php3': 0, '.php4': 0, '.pl': 0}
    # Bind the lookup method once, outside the timed loop, so the loop does
    # not repeat the attribute lookup.  __contains__ performs the same key
    # test as dict.has_key, which was removed in Python 3.
    matches = extensions.__contains__
    start = timer()
    for _ in range(LOOPS):
        for filename in TESTFILES:
            ext = splitext(filename)[1]
            if matches(ext):
                match += 1
            else:
                nomatch += 1
    stend = timer()

    results("testdict", start, stend, match, nomatch)

def results(fn, start, stend, match, nomatch):
    """Print one line summarising a benchmark run.

    fn      -- benchmark name; printed left-justified, followed by a colon.
    start   -- timer reading taken before the run, in seconds.
    stend   -- timer reading taken after the run, in seconds.
    match   -- number of file names that matched.
    nomatch -- number of file names that did not match.

    The "each loop" figure is the elapsed time divided by match + nomatch
    (that is, per file name tested), expressed in microseconds.
    """
    elapsed = stend - start
    total = match + nomatch
    # With nothing tested (e.g. LOOPS set to 0) report 0.0 instead of
    # raising ZeroDivisionError.
    if total:
        per_test = elapsed * 1000000 / total
    else:
        per_test = 0.0
    # A single parenthesised argument prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print("%-12s %7.3f, each loop %5.1fuS match %d, nomatch %d"
          % (fn + ':', elapsed, per_test, match, nomatch))

if __name__ == '__main__':
    # Run each benchmark once, in the same order as the results quoted in
    # the post.
    for benchmark in (testdict, testlist, testregex, testlist1):
        benchmark()
---- end of test.py ----
-- 
Duncan Booth                                             duncan at rcp.co.uk
int month(char *p){return(124864/((p[0]+p[1]-p[2]&0x1f)+1)%12)["\5\x8\3"
"\6\7\xb\1\x9\xa\2\0\4"];} // Who said my code was obscure?