scanf style parsing
Duncan Booth
duncan at NOSPAMrcp.co.uk
Mon Oct 1 05:06:14 EDT 2001
aahz at panix.com (Aahz Maruch) wrote in news:9p84ea$cd7$1 at panix3.panix.com:
>>Wouldn't you be happier with this?:
>>
>> extensions = ['.htm', '.html', '.shtm', '.shtml', '.phtm', '.phtml',
>>               '.cgi', '.php', '.php2', 'php3', '.php4', '.pl']
>> ext = os.path.splitext(filename)[1]
>> if ext in extensions: ...
>>
>>which has the arguable advantage of matching what your description says
>>instead of what your original code does.
>
> Well, if you're going to do that, extensions should be a dict for real
> speed. ;-)
Not until you have profiled the code and determined that this particular
lookup is a bottleneck, and that your proposed improvement actually has
some benefit. Get the code right first, then get it fast.
I knocked up a quick test program (attached at the end of the post) to do
just that and the results are perhaps surprising:
testdict: 1.290, each loop 161.2uS match 6000, nomatch 2000
testlist: 1.304, each loop 163.0uS match 6000, nomatch 2000
testregex: 0.071, each loop 8.9uS match 6000, nomatch 2000
testlist1: 0.162, each loop 20.2uS match 6000, nomatch 2000
The first time around the dictionary lookup actually came out slower than
the list: I had to optimise the extensions.has_key out of the loop to make
it faster. The real killer, though, is the call to splitext: replacing it
with a home-rolled version (which doesn't split correctly in all cases, but
does a good enough job for this situation) brings the time down
significantly. Even so, the regex still wins.
The moral is, never jump to conclusions over speed in Python.
---- test.py ----
import time
import re
# Extensions appended to every base name: the twelve web-page extensions the
# benchmarks look for, then four that none of them should match (including
# the empty extension).
TESTEXT = ('.htm', '.html', '.shtm', '.shtml', '.phtm',
           '.phtml', '.cgi', '.php', '.php2', '.php3', '.php4', '.pl',
           '.py', '.txt', '.exe', '')

# Base names: bare names, POSIX-style paths, and one Windows-style path with
# a space and a dot inside a directory component.
TESTBASE = ('foo', 'bar', '/usr/local/bin/foo', '/usr/local/bin/bar',
            'd:\\Program Files\\Silly.dir\\fred')

# Every base/extension combination, grouped by base name.
TESTFILES = [filename + ext for filename in TESTBASE for ext in TESTEXT]

# Number of passes each benchmark makes over TESTFILES.
LOOPS = 100
def testregex():
    '''Time extension matching done with one precompiled regular expression.

    Makes LOOPS passes over TESTFILES, counting the names the pattern
    matches and the names it does not, then passes the two timer readings
    and both counts to results() under the label "testregex".
    '''
    # time.clock() was removed in Python 3.8; timeit.default_timer is
    # available on both Python 2 and Python 3.
    from timeit import default_timer

    match, nomatch = 0, 0
    # Compiled once, outside the timed region.
    re_web_files = re.compile(r'\.([ps]?html?|cgi|php[\d]?|pl)$')
    start = default_timer()
    for i in range(LOOPS):
        for filename in TESTFILES:
            m = re_web_files.search(filename)
            if m:
                match += 1
            else:
                nomatch += 1
    stend = default_timer()
    results("testregex", start, stend, match, nomatch)
def testlist():
    '''Time extension matching done with os.path.splitext and a list lookup.

    Makes LOOPS passes over TESTFILES, splitting each name with
    os.path.splitext and testing the extension for membership in a list,
    then passes the two timer readings and both counts to results() under
    the label "testlist".
    '''
    from os.path import splitext
    # time.clock() was removed in Python 3.8; timeit.default_timer is
    # available on both Python 2 and Python 3.
    from timeit import default_timer

    match, nomatch = 0, 0
    # Deliberately a list: list membership is what this benchmark measures.
    extensions = ['.htm', '.html', '.shtm', '.shtml', '.phtm',
                  '.phtml', '.cgi', '.php', '.php2', '.php3', '.php4', '.pl']
    start = default_timer()
    for i in range(LOOPS):
        for filename in TESTFILES:
            ext = splitext(filename)[1]
            if ext in extensions:
                match += 1
            else:
                nomatch += 1
    stend = default_timer()
    results("testlist", start, stend, match, nomatch)
def testlist1():
    '''Time extension matching done with a home-rolled splitext and a list.

    Same loop as the os.path.splitext/list benchmark, but the extension is
    obtained from a local helper built on str.rfind. Passes the two timer
    readings and both counts to results() under the label "testlist1".
    '''
    # time.clock() was removed in Python 3.8; timeit.default_timer is
    # available on both Python 2 and Python 3.
    from timeit import default_timer

    match, nomatch = 0, 0
    # Deliberately a list: list membership is what this benchmark measures.
    extensions = ['.htm', '.html', '.shtm', '.shtml', '.phtm',
                  '.phtml', '.cgi', '.php', '.php2', '.php3', '.php4', '.pl']

    def splitext(filename):
        '''Split filename at its last '.' into (root, extension).

        This doesn't always correctly split the extension (a dot in a
        directory component is treated like any other), but within the
        requirements of this test it will work, as it will give nomatch
        in the cases where it fails.
        '''
        pos = filename.rfind('.')
        if pos >= 0:
            return (filename[:pos], filename[pos:])
        return (filename, '')

    start = default_timer()
    for i in range(LOOPS):
        for filename in TESTFILES:
            ext = splitext(filename)[1]
            if ext in extensions:
                match += 1
            else:
                nomatch += 1
    stend = default_timer()
    results("testlist1", start, stend, match, nomatch)
def testdict():
    '''Time extension matching done with os.path.splitext and a dict lookup.

    Makes LOOPS passes over TESTFILES, splitting each name with
    os.path.splitext and testing the extension against the keys of a dict,
    then passes the two timer readings and both counts to results() under
    the label "testdict".
    '''
    from os.path import splitext
    # time.clock() was removed in Python 3.8; timeit.default_timer is
    # available on both Python 2 and Python 3.
    from timeit import default_timer

    match, nomatch = 0, 0
    # Only the keys matter; the values are placeholders.
    extensions = {'.htm': 0, '.html': 0, '.shtm': 0, '.shtml': 0, '.phtm': 0,
                  '.phtml': 0, '.cgi': 0, '.php': 0, '.php2': 0, '.php3': 0,
                  '.php4': 0, '.pl': 0}
    # dict.has_key() no longer exists on Python 3. The bound __contains__
    # method performs the same key test and keeps the attribute lookup
    # hoisted out of the timed loop.
    matches = extensions.__contains__
    start = default_timer()
    for i in range(LOOPS):
        for filename in TESTFILES:
            ext = splitext(filename)[1]
            if matches(ext):
                match += 1
            else:
                nomatch += 1
    stend = default_timer()
    results("testdict", start, stend, match, nomatch)
def results(fn, start, stend, match, nomatch):
    '''Print a one-line report for a finished benchmark.

    fn      -- benchmark label; a ':' is appended before printing
    start   -- timer reading taken before the timed loops
    stend   -- timer reading taken after the timed loops
    match   -- number of filenames counted as matching
    nomatch -- number of filenames counted as not matching

    The line shows the elapsed time (stend - start) and that time in
    microseconds divided by the number of filenames tested
    (match + nomatch), followed by the two counts.
    '''
    elapsed = stend - start
    tested = match + nomatch
    # Report 0.0 instead of raising ZeroDivisionError when nothing was tested.
    if tested:
        per_test = elapsed * 1000000 / tested
    else:
        per_test = 0.0
    # One parenthesised value keeps this valid both as a Python 2 print
    # statement and as a Python 3 print() call; the output is the same
    # single line the original two print statements produced.
    print("%-12s %7.3f, each loop %5.1fuS match %d, nomatch %d"
          % (fn + ':', elapsed, per_test, match, nomatch))
if __name__ == '__main__':
    # Run each benchmark once; this order is the order of the report lines.
    for benchmark in (testdict, testlist, testregex, testlist1):
        benchmark()
---- end of test.py ----
--
Duncan Booth duncan at rcp.co.uk
int month(char *p){return(124864/((p[0]+p[1]-p[2]&0x1f)+1)%12)["\5\x8\3"
"\6\7\xb\1\x9\xa\2\0\4"];} // Who said my code was obscure?
More information about the Python-list
mailing list