searching substrings with interpositions

Andrew Dalke dalke at dalkescientific.com
Tue May 24 17:13:27 EDT 2005


Claudio Grondi wrote:
> Note: code below is intended to help to clarify things only,
> so that a bunch of examples can be tested.
> If you need bugfree production quality code, maybe
> someone else can provide it.

Still not tested enough to ensure that it's bug free, but more
concise.  Here's one the implements the algorithm directly and
another that uses a regexp.  The latter should be the preferred
approach.  My intent was that the algorithm implements the given
pattern so they should given identical results.

# Doing the work ourselves
def find_spread_substring(query, target, limit = None):
    stack = []
    ti = qi = 0
    Nq = len(query)
    Nt = len(target)
    delta = 0

    while ti < Nt:
        # We have a match
        if query[qi] == target[ti]:
            stack.append( (qi, ti, delta) )
            qi = qi + 1
            if qi == Nq:
                return [ti for (qi, ti, delta) in stack]
            ti = ti + 1
            delta = 0
        else:
            # No match
            while 1:
                # If we have a partial match, check if we've
                # gone over the limit.
                if stack:
                    delta = delta + 1
                if limit is not None and delta > limit:
                    # backtrack, treating it as an invalid match
                    # (so retry this 'else:' block)
                    qi, ti, delta = stack.pop()
                    continue
                # No backtracking needed
                break
            # Advance to check the next character in the target
            ti = ti + 1

    # Failure
    return None

# Using regular expressions
import re
def find_spread_substring2(query, target, limit = None):
    if limit is None:
        template = "(%s).*?"
    else:
        template = "(%%s).{,%d}?" % (limit,)
    terms = [template % c for c in query]
    pattern = "".join(terms)

    pat = re.compile(pattern)
    m = pat.search(target)
    if not m:
        return None
    return [m.start(i) for i in range(1, len(query)+1)]
    

def test():
    for (q, t, limit, is_valid) in (
        ("1010", "10001001", None, True),
        ("1010", "100011", None, False),
        ("1010", "100010", 3, True),
        ("1010", "100010", 1, True),
        ("1010", "1000010", 1, False),
        ("1010", "01000010", 2, True),
        ("1010", "01000010", 1, False),
        ("1010", "0100000", None, False),

        ):
        result = find_spread_substring(q, t, limit)
        result2 = find_spread_substring2(q, t, limit)
        if result != result2:
            raise AssertionError( (result, result2) )

        if result is not None:
            if limit is not None:
                # check that it's a proper subset
                for (x, y) in zip(result[:-1], result[1:]):
                    # +1 because 'limit' is the maximum gap size
                    if (y-x) > limit+1:
                        raise AssertionError((q, t, limit, result, x, y))
            s = "".join([t[i] for i in result])
            if s != q:
                raise AssertionError((q, t, limit, result, s))
        
        if result is None and not is_valid:
            pass
        elif result is not None and is_valid:
            pass
        else:
            raise AssertionError( (q, t, limit, is_valid, result) )

if __name__ == "__main__":
    test()
    print "All tests passed."
    

				Andrew
				dalke at dalkescientific.com




More information about the Python-list mailing list