prefix search on a large file
js
ebgssth at gmail.com
Thu Oct 12 10:13:27 EDT 2006
Thank you for the quick reply.
Here're the exact code I executed. (including your code)
#!/usr/bin/env python
from pprint import pprint as pp
data = [
'foo bar baz',
'foo bar',
'foo',
'food',
'food', # duplicate
'xyzzy',
'plugh',
'xyzzy and plugh are magic',
'zzzz',
]
data_sorted_by_len = sorted(data, key=len)
data_sorted_by_asc = sorted(data)
print "OP trial 1 input"; pp(data)
def prefixdel_stupidly(alist):
for line in alist[:]:
unneeded = [no for no, line2 in enumerate(alist[1:]) if
line2.startswith(line) and line != line2]
adjust=1
for i in unneeded:
del alist[i+adjust]
adjust -= 1
return alist
def prefixdel_recursively(alist):
if len(alist) < 2:
return alist
unneeded = [no for no, line in enumerate(alist[1:]) if
line.startswith(alist[0])]
adjust=1
for i in unneeded:
del alist[i+adjust]
adjust -= 1
return [alist[0]] + prefixdel_recursively(alist[1:])
def prefixdel_by_john(alist):
olist = []
for i in xrange(len(alist)-1, 0, -1):
if alist[i].startswith(alist[i-1]):
continue
olist.append(alist[i])
olist.append(alist[0])
return olist
if __name__ == '__main__':
from timeit import Timer
print sorted(prefixdel_stupidly(data_sorted_by_len[:]))
print sorted(prefixdel_recursively(data_sorted_by_len[:]))
print sorted(prefixdel_by_john(data_sorted_by_asc[:]))
t = Timer("prefixdel_stupidly(data_sorted_by_len)", "from __main__
import prefixdel_stupidly, data_sorted_by_len")
print t.timeit(number=100000)
t = Timer("prefixdel_recursively(data_sorted_by_len)", "from
__main__ import prefixdel_recursively, data_sorted_by_len")
print t.timeit(number=100000)
t = Timer("prefixdel_by_john(data_sorted_by_asc)", "from __main__
import prefixdel_by_john, data_sorted_by_asc")
print t.timeit(number=100000)
The output is the following:
$ python2.5 myprefixdel.py
OP trial 1 input
['foo bar baz',
'foo bar',
'foo',
'food',
'food',
'xyzzy',
'plugh',
'xyzzy and plugh are magic',
'zzzz']
['foo', 'plugh', 'xyzzy', 'zzzz']
['foo', 'plugh', 'xyzzy', 'zzzz']
['foo', 'food', 'plugh', 'xyzzy', 'zzzz']
1.17837095261
1.21182584763
0.737352132797
Your code is much faster than ones I wrote
but the result is a little bit different.('food' shoud be removed
because 'foo' is in the list)
As you pointed out, list[:] must be a big evil but without duplicating the list,
the problem become much harder to solve to me.
Any practical solution, anyone?
More information about the Python-list
mailing list