Checking if string starts with list element

Darrell Gallion darrell at dorb.com
Mon Aug 14 21:09:32 EDT 2000


I've got to stop reading this group. It's sooo distracting :)
This is all very untested.

--Darrell

import string, re, time

def findallParse(patList, source, start=0, stop=0):
    """Scan source with every pattern and report which group matched where.

    Assumes each pattern contains groups.

    patList -- sequence of compiled regular expression objects
    source  -- the string to scan
    start   -- index at which every pattern begins scanning
    stop    -- index at which scanning ends; 0 (the default) means the end
               of source, and larger values are clamped to len(source)

    Returns a list of [(groupNumber, start, end),...] with one entry per
    match, giving the lowest-numbered group that took part in the match
    and that group's span.  The entry is None when no group took part.
    Entries are ordered by pattern (in patList order), then by position.
    """
    # The scan limit does not depend on the pattern, so work it out once.
    if stop == 0:
        end = len(source)
    else:
        end = min(stop, len(source))

    results = []
    append = results.append
    for pattern in patList:
        # Only the documented search()/span() API is used, so this does not
        # depend on the internals of any particular regex engine.
        search = pattern.search
        groupRange = None
        pos = start
        while pos <= end:
            m = search(source, pos, end)
            if m is None:
                break
            i, j = m.span()
            if groupRange is None:
                # The group count is fixed per pattern; compute it once.
                groupRange = range(1, len(m.groups()) + 1)
            gr = None
            for x in groupRange:
                a, b = m.span(x)
                if a != -1:
                    gr = (x, a, b)
                    break
            append(gr)
            # Resume after the match.  An empty match must be stepped over
            # explicitly, otherwise the next search finds it again.
            if j > i:
                pos = j
            else:
                pos = j + 1
    return results


def groupsOf(l, sz):
    """
    Factor l in to lists of max size sz

    l  -- a sliceable sequence
    sz -- maximum length of each group; must be at least 1

    Returns a list of consecutive slices of l.  Every slice holds sz items
    except possibly the last, which holds the remainder.  No empty slice is
    produced, so an empty l gives [].  Raises ValueError when sz is less
    than 1.
    """
    if sz < 1:
        raise ValueError("sz must be at least 1")

    res=[]
    # Stepping by sz visits the first index of each group and stops before
    # len(l), so no trailing empty group is produced when len(l) is an
    # exact multiple of sz.
    for first in range(0, len(l), sz):
        res.append(l[first:first+sz])
    return res


words=string.split(
"""Hope this isn't full of holes
Some text-->
The solution is to use Python's raw string notation for regular expression
patterns; backslashes are not handled in any special way in a string literal
prefixed with "r". So r"\n" is a two-character string containing "\" and
"n", while "\n" is a one-character string containing a newline. Usually
patterns will be expressed in Python code using this raw string notation.
"""
)
    # Get rid of dups
dict={}
for i in words:
    dict[i]=i

words=dict.keys()
print "len(words to find)=",len(words)
    # re has a limit on the number of groups, Don't know what it is, go with
50
wordGroups=groupsOf(words, 50)

    # Build a list of patterns
patList=[]
openPat="(^"
closePat=")"
for i in wordGroups:
#    print "(?m)%s%s%s"%(openPat,string.join(i,")|(^"),closePat)

patList.append(re.compile("(?m)%s%s%s"%(openPat,string.join(i,")|(^"),closeP
at)))

input="""
Hope may not be enough.
Work is full
Holes have been known to flow.
isn't this fun
"""
input=input*1000
print "len(input)=",len(input)

lines=string.split(input,'\n')

t1=time.time()

hits=0
for line in lines:
    for word in words:
        if string.find(line[:len(word)],word) != -1:
            hits=hits+1
#            print line,'-----', word
            break

print 'String Hits:', hits
print time.time()-t1

t1=time.time()
res=findallParse(patList,input)
print 'Re Hits:',len(res)
print time.time()-t1

""" Output
>test.py
len(words to find)= 55
len(input)= 84000
String Hits: 2000
3.03199994564
Re Hits: 3000
0.453000068665
"""























































More information about the Python-list mailing list