sgmllib parser keeps old tag data?

MRAB google at mrabarnett.plus.com
Fri Feb 13 09:31:40 EST 2009


Berend van Berkum wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
> 
> 
> Hi everyone,
> 
> I read the source and made numerous tests, but SGMLParser keeps returning *tag* data
> from previous parser instances. I'm totally confused as to why. The content data it
> returns is ok.
> 
> E.g.::
> 
>     sp = MyParser()
>     sp.feed('<test><t />Test</test>')
>     print sp.content, sp.markup
>     sp.close()
> 
>     sp = MyParser()
>     sp.feed('<xml>\n</xml>\r\n')
>     print sp.content, sp.markup
>     sp.close()
> 
> gives::
> 
>     ('Test', [{'t': ({}, (0, 0))}, {'test': ({}, (0, 4))}]) 
>     ('\n\r\n', [{'t': ({}, (0, 0))}, {'test': ({}, (0, 4))}, {'xml': ({}, (0, 1))}])
> 
> It keeps the tags from the previous session, while i'm sure the stack etc.
> should be clean..
> 
> Any ideas?
> 
> 
> regards, Berend
> 
> - ----
> 
> import sgmllib
> 
> 
> class MyParser(sgmllib.SGMLParser):
> 
> 	content = ''		
> 	markup = []
> 	span_stack = []
> 
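These are in the _class_ itself, so they will be shared by all its
instances. You should do something like this instead (and, because this
overrides SGMLParser's own __init__, also call
sgmllib.SGMLParser.__init__(self) at the start of it so that the parser's
internal state is still set up):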

	def __init__(self):
		self.content = ''
		self.markup = []
		self.span_stack = []

> 	def handle_data(self, data):
> 		self.content += data
> 
> 	def unknown_starttag(self, tag, attr):
> 		stack = { tag: ( dict(attr), ( len(self.content), ) ) }
> 		self.span_stack.append(stack)
> 
> 	def unknown_endtag(self, tag):
> 		prev_tag, ( attr, ( offset, ) ) = self.span_stack.pop().items()[0]
> 
> 		if tag:
> 			# close all tags on stack until it finds a matching end tag
> 			# XXX: need to return to LEVEL, not same tag name
> 			while tag != prev_tag:
> 				span = { prev_tag: ( attr, ( offset, 0 ) ) }
> 				self.markup.append( span )
> 
> 				prev_tag, ( attr, ( offset, ) ) = self.span_stack.pop().items()[0]
> 
> 		length = len( self.content ) - offset
> 		span = { tag: ( attr, ( offset, length ) ) }
> 		self.markup.append( span )
> 
> 	def do_unknown_tag(self, tag, attr):
> 		assert not tag and not attr, "do_unknown_tag %s, %s" % (tag, attr)
> 
> 	def close(self):
> 		sgmllib.SGMLParser.close(self)
> 		self.content = ''
> 		self.markup = []
> 		self.span_stack = []						
> 
> 
> def parse_data(data):
> 	sp = MyParser()
> 	sp.feed(data)
> 	r = sp.content, sp.markup
> 	sp.close()
> 	return r
> 
> print parse_data('<test><t />Test</test>')
> print parse_data('<xml>\n</xml>\r\n')
> print parse_data('<sgml><s>Test 3</s></sgml>')
> 




More information about the Python-list mailing list