are elements of a list in sequence in list b

Sat Feb 9 08:44:27 EST 2008

Steven D'Aprano:
> What you're trying to do is the equivalent of substring searching. There
> are simple algorithms that do this, and there are fast algorithms, but
> the simple ones aren't fast and the fast ones aren't simple.
> However, for your use case, the simple algorithm is probably fast enough:
>
> def is_sublist(b, a):
>     """Return True if b is a sub-list of a, otherwise False"""
>     # Get the simple cases out of the way first, for speed.
>     if len(a) < len(b): return False
>     elif a == b: return True
>     elif not b: return False  # Or maybe True, if you prefer.
>     lb = len(b)
>     for i in xrange(len(a)-lb+1):
>         if a[i:i+lb] == b:
>             return True
>     return False

I think a compromise can be found, this seem to work, and fast enough
(Psyco helps a lot of this kind of code):

def issubseq(sub, items):
    """issubseq(sub, items): return true if the sequence 'sub' is a
contiguous
    subsequence of the 'items' sequence.

    >>> issubseq()
    Traceback (most recent call last):
      ...
    TypeError: issubseq() takes exactly 2 arguments (0 given)
    >>> issubseq("abc")
    Traceback (most recent call last):
      ...
    TypeError: issubseq() takes exactly 2 arguments (1 given)
    >>> issubseq(1, [1, 2, 3])
    Traceback (most recent call last):
      ...
    TypeError: object of type 'int' has no len()
    >>> isi = lambda s,i: int(issubseq(s,i))
    >>> isi([], [])
    1
    >>> isi("a", "")
    0
    >>> isi("", "a")
    1
    >>> isi("", "aaa")
    1
    >>> isi("a", "a")
    1
    >>> isi("ab", "bab")
    1
    >>> isi("ab", "bab")
    1
    >>> isi("ba", "bbb")
    0
    >>> isi("bab", "ab")
    0
    >>> isi(("a", "b"), ("a","a","b"))
    1
    >>> isi(("a", "b"), ("a","a","c"))
    0
    >>> isi([1,2,1], [3,5, 1,2,4, 1,2,1, 6])
    1
    >>> isi([1,2,1], [3,5, 1,2,4, 1,2,3, 6])
    0
    >>> l = [1] * 100 + [1,2,3] + [4] * 100
    >>> isi([1,2,3], l)
    1
    >>> l = [1] * 100 + [1,2,4] + [5] * 100
    >>> isi([1,2,3], l)
    0
    """
    len_sub = len(sub)
    len_items = len(items)
    if len_sub == 0:
        return True
    if len_sub > len_items:
        return False
    if len_items == 0:
        return False
    if sub == items:
        return True

    table = [0] * (len_sub + 1)

    # building prefix-function
    m = 0
    for i in xrange(1, len_sub):
        while m > 0 and sub[m] != sub[i]:
            m = table[m - 1]
        if sub[m] == sub[i]:
            m += 1
        table[i] = m

    # searching
    m, i = 0, 0
    for x in items:
        while m > 0 and sub[m] != x:
            m = table[m - 1]
        if sub[m] == x:
            m += 1
            if m == len_sub:
                return True
        i += 1

    return False

def _is_sublist(sub, items):
    """Return True if sub is a sub-list of items, otherwise False.

    >>> _is_sublist()
    Traceback (most recent call last):
      ...
    TypeError: _is_sublist() takes exactly 2 arguments (0 given)
    >>> _is_sublist("abc")
    Traceback (most recent call last):
      ...
    TypeError: _is_sublist() takes exactly 2 arguments (1 given)
    >>> _is_sublist(1, [1, 2, 3])
    Traceback (most recent call last):
      ...
    TypeError: object of type 'int' has no len()
    >>> isi = lambda s,i: int(_is_sublist(s,i))
    >>> isi([], [])
    1
    >>> isi("a", "")
    0
    >>> isi("", "a")
    1
    >>> isi("", "aaa")
    1
    >>> isi("a", "a")
    1
    >>> isi("ab", "bab")
    1
    >>> isi("ab", "bab")
    1
    >>> isi("ba", "bbb")
    0
    >>> isi("bab", "ab")
    0
    >>> isi(("a", "b"), ("a","a","b"))
    1
    >>> isi(("a", "b"), ("a","a","c"))
    0
    >>> isi([1,2,1], [3,5, 1,2,4, 1,2,1, 6])
    1
    >>> isi([1,2,1], [3,5, 1,2,4, 1,2,3, 6])
    0
    >>> l = [1] * 100 + [1,2,3] + [4] * 100
    >>> isi([1,2,3], l)
    1
    >>> l = [1] * 100 + [1,2,4] + [5] * 100
    >>> isi([1,2,3], l)
    0
    """
    # By Steven D'Aprano, modified
    len_sub = len(sub)
    len_items = len(items)
    if len_sub == 0:
        return True
    if len_sub > len_items:
        return False
    if len_items == 0:
        return False
    if sub == items:
        return True

    for i in xrange(len_items - len_sub + 1):
        if items[i : i+len_sub] == sub:
            return True
    return False

if __name__ == "__main__":
    import doctest
    doctest.testmod()
    print "Doctests finished.\n"

    def subsequence_benchmark(with_psyco=False):
        from random import randrange
        from time import clock

        N = 100000
        NBIG = 1000

        small_sub = [1, 2, 3]
        big_sub = [randrange(10) for _ in xrange(NBIG)]

        small_present = [1] * N + small_sub + [N] * N
        small_absent1 = [1] * N + small_sub[:-1] + [N] * N
        small_absent2 = small_sub[:-1] * N
        big_present = [1] * N + big_sub + [N] * N
        big_absent = [1] * N + big_sub[:-1] + [N] * N

        test_cases = [(small_sub, small_present), (small_sub,
small_absent1),
                      (small_sub, small_absent2),
                      (big_sub, big_present), (big_sub, big_absent)]
        algos = [issubseq, _is_sublist]

        if with_psyco:
            import psyco
            for algo in algos:
                psyco.bind(algo)
            print "With psyco"
        else:
            print "Without psyco"

        print "N, NBIG =", N, NBIG
        for sub, seq in test_cases:
            print "len(sub),len(seq)= %d  %d" % (len(sub), len(seq))
            result = []
            for finder in algos:
                t0 = clock()
                r = finder(sub, seq)
                t1 = clock()
                result.append(r)
                print "  %s: %s (time: %.02f)" %(finder.func_name, r,
round(t1-t0,2))
            assert(all(r==True for r in result) or all(r==False for r
in result))

    #subsequence_benchmark(with_psyco=True)

There are ways to use less memory or to go faster, but they may be
overkill:
http://www.icu-project.org/docs/papers/efficient_text_searching_in_java.html

Bye,
bearophile