are elements of a list in sequence in list b
bearophileHUGS at lycos.com
bearophileHUGS at lycos.com
Sat Feb 9 08:44:27 EST 2008
Steven D'Aprano:
> What you're trying to do is the equivalent of substring searching. There
> are simple algorithms that do this, and there are fast algorithms, but
> the simple ones aren't fast and the fast ones aren't simple.
> However, for your use case, the simple algorithm is probably fast enough:
>
> def is_sublist(b, a):
> """Return True if b is a sub-list of a, otherwise False"""
> # Get the simple cases out of the way first, for speed.
> if len(a) < len(b): return False
> elif a == b: return True
> elif not b: return False # Or maybe True, if you prefer.
> lb = len(b)
> for i in xrange(len(a)-lb+1):
> if a[i:i+lb] == b:
> return True
> return False
I think a compromise can be found, this seem to work, and fast enough
(Psyco helps a lot of this kind of code):
def issubseq(sub, items):
"""issubseq(sub, items): return true if the sequence 'sub' is a
contiguous
subsequence of the 'items' sequence.
>>> issubseq()
Traceback (most recent call last):
...
TypeError: issubseq() takes exactly 2 arguments (0 given)
>>> issubseq("abc")
Traceback (most recent call last):
...
TypeError: issubseq() takes exactly 2 arguments (1 given)
>>> issubseq(1, [1, 2, 3])
Traceback (most recent call last):
...
TypeError: object of type 'int' has no len()
>>> isi = lambda s,i: int(issubseq(s,i))
>>> isi([], [])
1
>>> isi("a", "")
0
>>> isi("", "a")
1
>>> isi("", "aaa")
1
>>> isi("a", "a")
1
>>> isi("ab", "bab")
1
>>> isi("ab", "bab")
1
>>> isi("ba", "bbb")
0
>>> isi("bab", "ab")
0
>>> isi(("a", "b"), ("a","a","b"))
1
>>> isi(("a", "b"), ("a","a","c"))
0
>>> isi([1,2,1], [3,5, 1,2,4, 1,2,1, 6])
1
>>> isi([1,2,1], [3,5, 1,2,4, 1,2,3, 6])
0
>>> l = [1] * 100 + [1,2,3] + [4] * 100
>>> isi([1,2,3], l)
1
>>> l = [1] * 100 + [1,2,4] + [5] * 100
>>> isi([1,2,3], l)
0
"""
len_sub = len(sub)
len_items = len(items)
if len_sub == 0:
return True
if len_sub > len_items:
return False
if len_items == 0:
return False
if sub == items:
return True
table = [0] * (len_sub + 1)
# building prefix-function
m = 0
for i in xrange(1, len_sub):
while m > 0 and sub[m] != sub[i]:
m = table[m - 1]
if sub[m] == sub[i]:
m += 1
table[i] = m
# searching
m, i = 0, 0
for x in items:
while m > 0 and sub[m] != x:
m = table[m - 1]
if sub[m] == x:
m += 1
if m == len_sub:
return True
i += 1
return False
def _is_sublist(sub, items):
"""Return True if sub is a sub-list of items, otherwise False.
>>> _is_sublist()
Traceback (most recent call last):
...
TypeError: _is_sublist() takes exactly 2 arguments (0 given)
>>> _is_sublist("abc")
Traceback (most recent call last):
...
TypeError: _is_sublist() takes exactly 2 arguments (1 given)
>>> _is_sublist(1, [1, 2, 3])
Traceback (most recent call last):
...
TypeError: object of type 'int' has no len()
>>> isi = lambda s,i: int(_is_sublist(s,i))
>>> isi([], [])
1
>>> isi("a", "")
0
>>> isi("", "a")
1
>>> isi("", "aaa")
1
>>> isi("a", "a")
1
>>> isi("ab", "bab")
1
>>> isi("ab", "bab")
1
>>> isi("ba", "bbb")
0
>>> isi("bab", "ab")
0
>>> isi(("a", "b"), ("a","a","b"))
1
>>> isi(("a", "b"), ("a","a","c"))
0
>>> isi([1,2,1], [3,5, 1,2,4, 1,2,1, 6])
1
>>> isi([1,2,1], [3,5, 1,2,4, 1,2,3, 6])
0
>>> l = [1] * 100 + [1,2,3] + [4] * 100
>>> isi([1,2,3], l)
1
>>> l = [1] * 100 + [1,2,4] + [5] * 100
>>> isi([1,2,3], l)
0
"""
# By Steven D'Aprano, modified
len_sub = len(sub)
len_items = len(items)
if len_sub == 0:
return True
if len_sub > len_items:
return False
if len_items == 0:
return False
if sub == items:
return True
for i in xrange(len_items - len_sub + 1):
if items[i : i+len_sub] == sub:
return True
return False
if __name__ == "__main__":
import doctest
doctest.testmod()
print "Doctests finished.\n"
def subsequence_benchmark(with_psyco=False):
from random import randrange
from time import clock
N = 100000
NBIG = 1000
small_sub = [1, 2, 3]
big_sub = [randrange(10) for _ in xrange(NBIG)]
small_present = [1] * N + small_sub + [N] * N
small_absent1 = [1] * N + small_sub[:-1] + [N] * N
small_absent2 = small_sub[:-1] * N
big_present = [1] * N + big_sub + [N] * N
big_absent = [1] * N + big_sub[:-1] + [N] * N
test_cases = [(small_sub, small_present), (small_sub,
small_absent1),
(small_sub, small_absent2),
(big_sub, big_present), (big_sub, big_absent)]
algos = [issubseq, _is_sublist]
if with_psyco:
import psyco
for algo in algos:
psyco.bind(algo)
print "With psyco"
else:
print "Without psyco"
print "N, NBIG =", N, NBIG
for sub, seq in test_cases:
print "len(sub),len(seq)= %d %d" % (len(sub), len(seq))
result = []
for finder in algos:
t0 = clock()
r = finder(sub, seq)
t1 = clock()
result.append(r)
print " %s: %s (time: %.02f)" %(finder.func_name, r,
round(t1-t0,2))
assert(all(r==True for r in result) or all(r==False for r
in result))
#subsequence_benchmark(with_psyco=True)
There are ways to use less memory or to go faster, but they may be
overkill:
http://www.icu-project.org/docs/papers/efficient_text_searching_in_java.html
Bye,
bearophile
More information about the Python-list
mailing list