searching substrings with interpositions
Andrew Dalke
dalke at dalkescientific.com
Tue May 24 17:13:27 EDT 2005
Claudio Grondi wrote:
> Note: code below is intended to help to clarify things only,
> so that a bunch of examples can be tested.
> If you need bugfree production quality code, maybe
> someone else can provide it.
Still not tested enough to ensure that it's bug free, but more
concise. Here's one the implements the algorithm directly and
another that uses a regexp. The latter should be the preferred
approach. My intent was that the algorithm implements the given
pattern so they should given identical results.
# Doing the work ourselves
def find_spread_substring(query, target, limit = None):
stack = []
ti = qi = 0
Nq = len(query)
Nt = len(target)
delta = 0
while ti < Nt:
# We have a match
if query[qi] == target[ti]:
stack.append( (qi, ti, delta) )
qi = qi + 1
if qi == Nq:
return [ti for (qi, ti, delta) in stack]
ti = ti + 1
delta = 0
else:
# No match
while 1:
# If we have a partial match, check if we've
# gone over the limit.
if stack:
delta = delta + 1
if limit is not None and delta > limit:
# backtrack, treating it as an invalid match
# (so retry this 'else:' block)
qi, ti, delta = stack.pop()
continue
# No backtracking needed
break
# Advance to check the next character in the target
ti = ti + 1
# Failure
return None
# Using regular expressions
import re
def find_spread_substring2(query, target, limit = None):
if limit is None:
template = "(%s).*?"
else:
template = "(%%s).{,%d}?" % (limit,)
terms = [template % c for c in query]
pattern = "".join(terms)
pat = re.compile(pattern)
m = pat.search(target)
if not m:
return None
return [m.start(i) for i in range(1, len(query)+1)]
def test():
for (q, t, limit, is_valid) in (
("1010", "10001001", None, True),
("1010", "100011", None, False),
("1010", "100010", 3, True),
("1010", "100010", 1, True),
("1010", "1000010", 1, False),
("1010", "01000010", 2, True),
("1010", "01000010", 1, False),
("1010", "0100000", None, False),
):
result = find_spread_substring(q, t, limit)
result2 = find_spread_substring2(q, t, limit)
if result != result2:
raise AssertionError( (result, result2) )
if result is not None:
if limit is not None:
# check that it's a proper subset
for (x, y) in zip(result[:-1], result[1:]):
# +1 because 'limit' is the maximum gap size
if (y-x) > limit+1:
raise AssertionError((q, t, limit, result, x, y))
s = "".join([t[i] for i in result])
if s != q:
raise AssertionError((q, t, limit, result, s))
if result is None and not is_valid:
pass
elif result is not None and is_valid:
pass
else:
raise AssertionError( (q, t, limit, is_valid, result) )
if __name__ == "__main__":
test()
print "All tests passed."
Andrew
dalke at dalkescientific.com
More information about the Python-list
mailing list