prefix search on a large file

Thu Oct 12 10:13:27 EDT 2006

Thank you for the quick reply.

Here're the exact code I executed. (including your code)

#!/usr/bin/env python
from pprint import pprint as pp

data = [
   'foo bar baz',
   'foo bar',
   'foo',
   'food',
   'food', # duplicate
   'xyzzy',
   'plugh',
   'xyzzy and plugh are magic',
   'zzzz',
   ]

data_sorted_by_len = sorted(data, key=len)
data_sorted_by_asc = sorted(data)
print "OP trial 1 input"; pp(data)

def prefixdel_stupidly(alist):
    for line in alist[:]:
	unneeded = [no for no, line2 in enumerate(alist[1:]) if
line2.startswith(line) and line != line2]
	adjust=1
	for i in unneeded:
	    del alist[i+adjust]
	    adjust -= 1
    return alist

def prefixdel_recursively(alist):
    if len(alist) < 2:
	return alist

    unneeded = [no for no, line in enumerate(alist[1:]) if
line.startswith(alist[0])]
    adjust=1
    for i in unneeded:
        del alist[i+adjust]
	adjust -= 1

    return [alist[0]] + prefixdel_recursively(alist[1:])

def prefixdel_by_john(alist):
    olist = []
    for i in xrange(len(alist)-1, 0, -1):
	if alist[i].startswith(alist[i-1]):
	    continue
	olist.append(alist[i])
    olist.append(alist[0])
    return olist

if __name__ == '__main__':
    from timeit import Timer
    print sorted(prefixdel_stupidly(data_sorted_by_len[:]))
    print sorted(prefixdel_recursively(data_sorted_by_len[:]))
    print sorted(prefixdel_by_john(data_sorted_by_asc[:]))

    t = Timer("prefixdel_stupidly(data_sorted_by_len)", "from __main__
import prefixdel_stupidly, data_sorted_by_len")
    print t.timeit(number=100000)
    t = Timer("prefixdel_recursively(data_sorted_by_len)", "from
__main__ import prefixdel_recursively, data_sorted_by_len")
    print t.timeit(number=100000)
    t = Timer("prefixdel_by_john(data_sorted_by_asc)", "from __main__
import prefixdel_by_john, data_sorted_by_asc")
    print t.timeit(number=100000)

The output is the following:

$ python2.5 myprefixdel.py
OP trial 1 input
['foo bar baz',
 'foo bar',
 'foo',
 'food',
 'food',
 'xyzzy',
 'plugh',
 'xyzzy and plugh are magic',
 'zzzz']
['foo', 'plugh', 'xyzzy', 'zzzz']
['foo', 'plugh', 'xyzzy', 'zzzz']
['foo', 'food', 'plugh', 'xyzzy', 'zzzz']
1.17837095261
1.21182584763
0.737352132797

Your code is much faster than ones I wrote
but the result is a little bit different.('food' shoud be removed
because 'foo' is in the list)

As you pointed out, list[:] must be a big evil but without duplicating the list,
the problem become much harder to solve to me.

Any practical solution, anyone?