[Python-bugs-list] PRIVATE: Bug in re module (PR#36)
chenna@embl-heidelberg.de
chenna@embl-heidelberg.de
Fri, 23 Jul 1999 06:51:45 -0400 (EDT)
Full_Name: Chenna Ramu
Version: 1.5.1
OS: OSF1
Submission from: shag.embl-heidelberg.de (192.54.41.195)
Hello, I get the following error:
Stack overflow: pid 14731, proc emparse1.py, addr 0x11fdfffd8, pc 0x120068cd8
Segmentation fault
when I run the following script. The problem is that the re module is unable
to search for a pattern that is too large, but this is a requirement for my
application in biology. I enclose the source code below.
Please email me the solution as soon as possible.
Thanks
Ramu
___________________________
#!/usr/pub/bin/python1.5
#
#
#
# (C) Chenna Ramu, EMBL, Heidelberg, Germany
#
# parser for biological databases
#
import string
import sys
import re
# Table mapping a field name to the regular expression that extracts that
# field from the text of one flat-file record.
#
# Every pattern is anchored with '^', so it only works line-by-line when it is
# compiled with re.MULTILINE (embl.Init does this).  The simple entries match
# a run of one or more consecutive lines that start with the two-letter line
# code, a space, and at least one further character.
#
# NOTE(review): the literal spaces inside these patterns may have been
# collapsed by the mail archive this file came from -- confirm against the
# real line layout of the database before relying on them.
parserDict = {
'id' : r'((^ID [^\n]+\n)+)' ,
'ac' : r'((^AC [^\n]+\n)+)' ,
'dt' : r'((^DT [^\n]+\n)+)' ,
'de' : r'((^DE [^\n]+\n)+)' ,
'gn' : r'((^GN [^\n]+\n)+)' ,
'os' : r'((^OS [^\n]+\n)+)' ,
'oc' : r'((^OC [^\n]+\n)+)' ,
# 'ref' is built from adjacent raw strings (implicit concatenation).  It
# matches one or more repetitions of: one RN line, one or more RP lines, an
# optional RX line, one or more RA lines, zero or more RT lines and one or
# more RL lines, in that order.
'ref' : r'(('
r'(^RN [^\n]+\n)'
r'((^RP [^\n]+\n)+)'
r'((^RX [^\n]+\n)?)'
r'((^RA [^\n]+\n)+)'
r'((^RT [^\n]+\n)*)'
r'((^RL [^\n]+\n)+)'
r')+)',
'cc' : r'((^CC [^\n]+\n)+)' ,
'dr' : r'((^DR [^\n]+\n)+)' ,
'kw' : r'((^KW [^\n]+\n)+)' ,
'ft' : r'((^FT [^\n]+\n)+)' ,
# 'sq' matches one SQ header line followed by one or more lines that begin
# with a space.  Presumably this repeated group, applied to the very long
# input built by test1(), is what triggers the reported stack overflow --
# TODO confirm.
'sq' : r'(^SQ [^\n]+\n)' \
r'((^ [^\n]+\n)+)'
}
# Sketch of an alternative link table (anchor template plus the pattern that
# finds the value to link).  It is not referenced by the code shown here.
_hrefLink = {'embl':['<A href=%s>%s</A>','^DR ([^;]+)']} #should be like this
# Database name -> HTML anchor template.  Href() fills the three %s slots
# with (database name, entry id, entry id), in that order.
hrefLink = {'EMBL':"<A href=http://www/wgetz?-e+[%s-id:%s]>%s</a>",
'MIM':"<A href=http://www/wgetz?-e+[%s-id:%s]>%s</a>"}
# Pattern for the start of a DR line: group 1 is the leading 'DR ' prefix,
# 'dbase' is the text up to the first ';' and 'id' is the text up to the next
# ';'.  Because of the '^' anchor it has to be compiled with re.M / re.MULTILINE
# to match lines after the first (test() does this).
em_rep = r'(^DR )(?P<dbase>[^;]+); (?P<id>[^;]+)'
class embl:
    """Extract named fields from a text record using a table of regexes.

    The table maps a field name to a regular-expression string.  After
    Parse() has run, the text matched for each field is available both as an
    attribute of the same name and through Field(); fields that did not
    match are None.

    Attributes:
        parserDict: the field-name -> pattern-string table in use.
        reDict: field-name -> compiled pattern (compiled with re.MULTILINE).
        fields: list of the field names in parserDict.
    """

    def __init__(self, parserDict=None):
        """Create a parser; if *parserDict* is non-empty it is installed.

        An omitted, None or empty table leaves the parser unconfigured
        (Parse() will then only print a message).
        """
        self.parserDict = {}
        self.reDict = {}  # keep the compiled re's
        self.fields = []
        if parserDict:
            self.Init(parserDict)

    def Init(self, parserDict):
        """Install *parserDict* and compile every pattern in it.

        Each field name becomes an attribute initialised to None.  Raises
        re.error if a pattern is not a valid regular expression.
        """
        self.parserDict = parserDict
        self.fields = list(parserDict.keys())
        # Start from an empty cache so that patterns from a previously
        # installed table cannot linger.
        self.reDict = {}
        for name in self.fields:
            setattr(self, name, None)
            self.reDict[name] = re.compile(parserDict[name], re.MULTILINE)

    def Parse(self, str):
        """Search *str* once per field and store the matched text.

        For every configured field the first match of its pattern (the whole
        match, group 0) is stored in the attribute named after the field; a
        field with no match is reset to None so that results from an earlier
        call do not leak into this one.  If no table has been installed,
        prints "No parser specified" and returns without doing anything.
        """
        if not self.parserDict:
            print("No parser specified")
            return
        # Iterate over this instance's own fields, not the module-level
        # table: the two can differ.
        for name in self.fields:
            mat = self.reDict[name].search(str)
            if mat:
                setattr(self, name, mat.group())
            else:
                setattr(self, name, None)

    def Field(self, name):
        """Return the stored value of field *name*, or None if it is unknown."""
        return getattr(self, name, None)

    def PrintFields(self):
        """Print every field name (prefixed with '==> ') and its value."""
        for name in self.fields:
            # Same output as the two-item statement `print "==> ", name`.
            print("==>  %s" % (name,))
            print(getattr(self, name))

    def ReParse(self, str, retToken, pat):
        """Search *str* with pattern *pat* and return one group of the match.

        str: string to parse, retToken: group name or number to return,
        pat: pattern string (compiled without flags).  Returns None when the
        pattern does not match.
        """
        m = re.compile(pat).search(str)
        if m:
            return m.group(retToken)
        return None
def Href(match):
    """Return the replacement text for one match of the DR-line pattern.

    *match* must provide group 1 and the named groups 'dbase' and 'id'.
    When hrefLink has a template for the database name, the entry id is
    wrapped in that template; otherwise the id is emitted unchanged.  In
    both cases the result starts with group 1, the database name and '; '.
    """
    database = match.group('dbase')
    entry = match.group('id')
    lead = match.group(1) + database + '; '
    template = hrefLink.get(database)
    if not template:
        return lead + entry
    # The template takes the database name once and the entry id twice.
    return lead + template % (database, entry, entry)
def test(fileName):
    """Parse the record stored in *fileName* and print what was extracted.

    Prints every field, the list of field names, the sequence block and the
    sequence length taken from its SQ line, the reference block, the DR
    block before and after its entries are turned into links by Href, and
    finally the 'id', 'dr' and (non-existent) 'mm' fields.  Fields that were
    not found are printed as None.

    Raises IOError/OSError if the file cannot be opened or read.
    """
    # Kept from the original: only needed for the author's optional
    # seqFormat module, whose use was commented out.
    sys.path.insert(0, '/home/chenna/py')
    e = embl(parserDict)
    f = open(fileName, 'r')
    try:
        # Read the file in one go; no dependency on the string module, so
        # nothing here can be shadowed by a function-local import.
        a = f.read()
    finally:
        f.close()
    e.Parse(a)
    e.PrintFields()
    print(' the fields are  %s' % (e.fields,))
    sq = e.Field('sq')
    seqLen = None
    if sq is not None:
        # The length is the first number after the second word of the SQ
        # line; only the start of the block needs to be searched.
        seqLen = e.ReParse(sq[0:50], 'seqLen',
                           r'^SQ [^ ]+ *(?P<seqLen>(\d+))')
    print(sq)
    print("length of the sequence  %s" % (seqLen,))
    print(e.Field('ref'))
    dr = e.Field('dr')
    print(dr)
    if dr is not None:
        p = re.compile(em_rep, re.M)
        dr = p.sub(Href, dr)
    print(dr)
    print(e.Field('id'))
    print(e.Field('dr'))
    print(e.Field('mm'))
    return
def test1(dumm=None):
    """Reproduce the reported crash with a large synthetic sequence record.

    Builds one SQ header line followed by 17364 identical sequence lines,
    parses the result with the module-level pattern table and prints every
    field.  *dumm* is accepted but not used.
    """
    # The header is a single line; it must not be split across physical
    # source lines inside the quotes.
    header = 'SQ Sequence 1041931 BP; 8972 A; 5950 C; 6264 G; 8224 T; 0 other;\n'
    seq_line = ' ' + 'tcagtcagtg ' * 6 + '\n'
    tmp = [header] + [seq_line] * 17364
    # Single-space separator, the same as the default of string.join(tmp).
    a = ' '.join(tmp)
    e = embl(parserDict)
    e.Parse(a)
    e.PrintFields()
if __name__ == '__main__':
    # Forward the optional first command-line argument to test1().  The
    # argument count is checked explicitly so that an error raised inside
    # test1() is reported once instead of being swallowed and followed by a
    # second full run.
    if len(sys.argv) > 1:
        test1(sys.argv[1])
    else:
        test1()