searching substrings with interpositions

Claudio Grondi claudio.grondi at freenet.de
Tue May 24 14:19:29 EDT 2005


<borges2003xx at yahoo.it> schrieb im Newsbeitrag
news:1116939594.958040.14110 at z14g2000cwz.googlegroups.com...
> thanx everyone, is what i need.
> As Claudio argues, it's a standard problem of DNA sequence
> comparison.
> The next step of my job is to set limits on the length of the interposed
> sequences (if someone can help me with this I'll appreciate it a lot)
> thanx everyone.
> giorgio
>
Note: the code below is only intended to help clarify things,
so that a bunch of examples can be tested.
If you need bug-free, production-quality code, maybe
someone else can provide it.

I have introduced two additional parameters to the function.
If  intMaxLenOfGap == 0  the gap size doesn't matter.
lstStartEndOfRangeOfBwithOccurenceOfA returns, in its
elements 0 and 1, the begin and end of the range of strB
in which strA was found.

Hope this does what you meant by
  "make limits of lenght of interposed sequences",
does it?

Claudio
P.S. Here the code:

def _lstFindRangeOfAinB(strA, strB, intMaxLenOfGap):
  """Return [start, end] of the first gapped occurrence of strA in strB.

  strA must be non-empty.  An occurrence is a choice of increasing
  positions in strB spelling out strA where, if intMaxLenOfGap > 0, at
  most intMaxLenOfGap characters are skipped between two consecutive
  matched characters.  start is the smallest possible position of the
  first character; end is one past the smallest possible position of the
  last character for that start.  Returns None if there is no occurrence.
  """
  intLenB = len(strB)
  blnGapLimited = intMaxLenOfGap > 0
  for intStart in range(intLenB):
    if strB[intStart] != strA[0]:
      continue
    # lstReachable[i] is True if the prefix of strA handled so far can
    # end exactly at position i of strB (starting at intStart).
    lstReachable = [False] * intLenB
    lstReachable[intStart] = True
    for chrA in strA[1:]:
      lstNextReachable = [False] * intLenB
      blnAnyReachable = False
      intNearestPrev = -1  # closest reachable position left of indxToB
      for indxToB in range(intLenB):
        if intNearestPrev >= 0 and strB[indxToB] == chrA:
          # The nearest previous end gives the smallest possible gap.
          intGapLen = indxToB - intNearestPrev - 1
          if not blnGapLimited or intGapLen <= intMaxLenOfGap:
            lstNextReachable[indxToB] = True
            blnAnyReachable = True
        if lstReachable[indxToB]:
          intNearestPrev = indxToB
      lstReachable = lstNextReachable
      if not blnAnyReachable:
        break
    if True in lstReachable:
      return [intStart, lstReachable.index(True) + 1]
    if not blnGapLimited:
      # Without a gap limit a later start can only match less of strB,
      # so there is no point in trying further start positions.
      break
  return None


def blnFindCharSequenceAevenIfSpreadOverEntireStringB(strA, strB,
intMaxLenOfGap = 0, lstStartEndOfRangeOfBwithOccurenceOfA = None):
  """Report whether the characters of strA occur, in order, within strB.

  The characters of strA may be spread over strB with other characters
  in between.  A message describing the outcome is printed.

  Parameters:
    strA -- the character sequence to look for.  An empty strA is
            reported as not found.
    strB -- the string to search in.
    intMaxLenOfGap -- maximum number of characters allowed between two
            consecutive matched characters; values <= 0 mean the gap
            size doesn't matter.
    lstStartEndOfRangeOfBwithOccurenceOfA -- optional list used as an
            output value.  It is emptied on entry and, if strA is found,
            receives two elements: the begin and the (exclusive) end of
            the range of strB in which strA was found.

  Returns True if strA was found, False otherwise.

  All placements of strA that respect the gap limit are considered, not
  only the one obtained by always taking the next matching character.
  """
  lstResult = lstStartEndOfRangeOfBwithOccurenceOfA
  if lstResult is not None:
    del lstResult[:]

  lstRange = None
  if strA:
    lstRange = _lstFindRangeOfAinB(strA, strB, intMaxLenOfGap)

  if lstRange is not None:
    if len(strA) == 1:
      print("sequence '%s' found in '%s'"%(strA, strB))
    else:
      print("sequence '%s' found in '%s' at range(%i, %i)"%(strA, strB,
        lstRange[0], lstRange[1]))
    if lstResult is not None:
      lstResult.extend(lstRange)
    return True

  # The hint about the gap is only printed when strA really would be
  # found with the gap limit lifted.
  blnGapWasTheProblem = False
  if strA and intMaxLenOfGap > 0:
    blnGapWasTheProblem = _lstFindRangeOfAinB(strA, strB, 0) is not None
  if blnGapWasTheProblem:
    print("sequence '%s' not in '%s' (maybe allowed gap of %i chars was too small?)"%(strA, strB, intMaxLenOfGap))
  else:
    print("sequence '%s' not in '%s'"%(strA, strB))
  return False
#:def

# Demo: run the search on a few examples with different gap limits.
print("")

# No gap limit.
lstStartEndOfRangeOfB = []
strA = "0101"
strB = "000011110100"
blnFindCharSequenceAevenIfSpreadOverEntireStringB(strA, strB)

# Gap limited to 2 characters.
lstStartEndOfRangeOfB = []
strA = "0101"
strB = "000011110100"
blnFindCharSequenceAevenIfSpreadOverEntireStringB(strA, strB, 2)

# From here on the list is passed in to receive the found range.
strA = "010101"
strB = "000011110100"
blnFindCharSequenceAevenIfSpreadOverEntireStringB(strA, strB, 6,
lstStartEndOfRangeOfB)

strA = "010101"
strB = "00001111010000001"
blnFindCharSequenceAevenIfSpreadOverEntireStringB(strA, strB, 4,
lstStartEndOfRangeOfB)

strA = "010101"
strB = "00001111010000001"
blnFindCharSequenceAevenIfSpreadOverEntireStringB(strA, strB, 5,
lstStartEndOfRangeOfB)
print("")
print("usage of lstStartEndOfRangeOfB parameter passed to function for use as return value:")
# Only index the list if a range was actually stored in it.
if len(lstStartEndOfRangeOfB) >= 2:
  print("sequence '%s' was found in '%s' at range(%i, %i)"%(strA, strB,
    lstStartEndOfRangeOfB[0], lstStartEndOfRangeOfB[1]))





More information about the Python-list mailing list