Checking if string starts with list element
Darrell Gallion
darrell at dorb.com
Mon Aug 14 21:09:32 EDT 2000
I've got to stop reading this group. It's sooo distracting :)
This is all very untested.
--Darrell
import string, re, time
def findallParse(patList, source, start=0, stop=0):
    """Scan ``source`` with every pattern and report which group matched.

    Each compiled pattern in ``patList`` is searched repeatedly over the
    window ``[start, end)`` of ``source``; every match contributes one entry
    to the result.  Only the public pattern/match API is used (``search``
    with ``pos``/``endpos``, ``Match.span`` and ``Pattern.groups``).

    Args:
        patList: iterable of compiled regular expressions, normally built
            from alternatives that are each wrapped in a capturing group.
        source: the string to scan.
        start: index at which every pattern starts scanning.
        stop: index at which scanning ends; ``0`` means "to the end of
            ``source``".  Values past the end are clamped to ``len(source)``.

    Returns:
        A flat list with one entry per match, ordered pattern by pattern and
        then by position.  Each entry is ``(groupNumber, start, end)`` for
        the lowest-numbered group that took part in the match, or ``None``
        when no group did (for example when the pattern has no groups).
    """
    # The scan window does not depend on the pattern, so compute it once.
    if stop == 0:
        end = len(source)
    else:
        end = min(stop, len(source))

    results = []
    append = results.append
    for pattern in patList:
        search = pattern.search
        groupRange = range(1, pattern.groups + 1)
        pos = start
        while pos <= end:
            found = search(source, pos, end)
            if found is None:
                break
            gr = None
            for x in groupRange:
                a, b = found.span(x)
                # span() reports (-1, -1) for a group that did not take part.
                if a != -1:
                    gr = (x, a, b)
                    break
            append(gr)
            # Always move forward by at least one character so that an
            # empty match cannot make the loop spin on the same position.
            pos = max(found.end(), pos + 1)
    return results
def groupsOf(l, sz):
    """Factor ``l`` into consecutive slices of at most ``sz`` items.

    Args:
        l: any sliceable sequence (list, tuple, string, ...).
        sz: maximum number of items per slice; must be at least 1.

    Returns:
        A list of slices of ``l`` in their original order.  Every slice is
        non-empty: a sequence whose length is an exact multiple of ``sz``
        does not get a trailing empty slice, and an empty sequence yields
        ``[]``.

    Raises:
        ValueError: if ``sz`` is smaller than 1.
    """
    if sz < 1:
        raise ValueError("sz must be a positive integer")
    # Stepping the start index avoids any division, so this behaves the same
    # whether "/" is integer or true division.
    return [l[first:first + sz] for first in range(0, len(l), sz)]
# Sample vocabulary: every whitespace-separated token of the text below.
# NOTE: the literal is intentionally left exactly as posted.  It is not a raw
# string, so each "\n" inside it is a real newline and each \" is a plain
# double quote by the time the text is split.
sampleText = """Hope this isn't full of holes
Some text-->
The solution is to use Python's raw string notation for regular expression
patterns; backslashes are not handled in any special way in a string literal
prefixed with "r". So r"\n" is a two-character string containing "\" and
"n", while "\n" is a one-character string containing a newline. Usually
patterns will be expressed in Python code using this raw string notation.
"""

# Get rid of dups.  A real list is kept (rather than a keys view) so the
# result can be sliced into batches later on.
words = list(dict.fromkeys(sampleText.split()))
print("len(words to find)= %d" % len(words))
# re has had a limit on the number of groups in one pattern (the original
# author did not know the exact value; reportedly 100 before Python 3.5 --
# confirm before raising the batch size), so go with batches of 50 words.
wordGroups = groupsOf(words, 50)

# Build a list of patterns: one multi-line pattern per batch, with one
# line-anchored capturing group per word, i.e. "(?m)(^w1)|(^w2)|...".
patList = []
openPat = "(^"
closePat = ")"
for group in wordGroups:
    if not group:
        # An empty batch would compile to "(^)", which matches at the start
        # of every line and would inflate the hit count.
        continue
    # The words are literal text, not regex syntax: escape them so that
    # characters such as "." only match themselves.
    alternatives = ")|(^".join([re.escape(word) for word in group])
    patList.append(re.compile("(?m)%s%s%s" % (openPat, alternatives, closePat)))
# Benchmark text: a blank line plus four short lines, repeated 1000 times.
text = """
Hope may not be enough.
Work is full
Holes have been known to flow.
isn't this fun
"""
text = text * 1000
print("len(input)= %d" % len(text))
lines = text.split('\n')

# Approach 1: plain string prefix test, trying every word on every line.
t1 = time.time()
hits = 0
for line in lines:
    for word in words:
        if line.startswith(word):
            hits = hits + 1
            # print line,'-----', word
            break  # each line is counted at most once
print('String Hits: %d' % hits)
print(time.time() - t1)

# Approach 2: the batched, line-anchored regular expressions.
t1 = time.time()
res = findallParse(patList, text)
print('Re Hits: %d' % len(res))
print(time.time() - t1)

# Recorded output from the original post, kept verbatim below.  The regex
# count can exceed the string count: the text is scanned once per pattern, so
# a line whose prefix matches words in two different batches is counted
# twice, whereas the loop above stops at the first matching word.
""" Output
>test.py
len(words to find)= 55
len(input)= 84000
String Hits: 2000
3.03199994564
Re Hits: 3000
0.453000068665
"""
More information about the Python-list
mailing list