[XML-SIG] new code for xml.dom.utils
Jeff.Johnson@icn.siemens.com
Jeff.Johnson@icn.siemens.com
Thu, 17 Dec 1998 12:54:00 -0500
I threw together this class and thought it might be a good candidate for
the xml.dom.utils module. It makes it really easy to get a DOM tree from a
file. I made it a class even though it could just as easily be a bunch of
functions, but as a class it might be subclassed for some neat things I
can't think of right now.
The following subclass would allow an HTML or XML file to be pretty printed
with a single line of code (a pretty silly example but it's just an
example):
class DomDumper(DomHelper):
    """Example subclass: parse *filename* and immediately dump its DOM tree."""

    def __init__(self, filename):
        # DomHelper.__init__ reads the file and stores the result in self.dom.
        DomHelper.__init__(self, filename)
        self.dom.dump()


d = DomDumper(sys.argv[1])
Here's the file:
import sys, string, os
from xml.dom import core
from xml.dom import html_builder
from xml.sax import saxexts
from xml.dom.sax_builder import SaxBuilder
class DomHelper:
    """Helper that builds a DOM document from an XML, HTML, or SGML file.

    Instantiate with a filename to parse it immediately (the result is
    stored in ``self.dom``), or with no argument and call ``readFile`` /
    ``readStream`` explicitly.
    """

    def __init__(self, filename=None):
        """Remember *filename* and, if one was given, parse it into self.dom."""
        self.filename = filename
        # Always define the attribute so that code inspecting ``dom`` on a
        # helper created without a filename gets None, not AttributeError.
        self.dom = None
        if filename is not None:
            self.dom = self.readFile(filename)

    def readFile(self, filename):
        """Given an XML, HTML, or SGML filename with appropriate file
        extension, return the DOM document.

        Returns None when the extension is not recognised (see
        ``getFileType`` and ``readStream``).
        """
        kind = self.getFileType(filename)
        stream = open(filename, 'r')
        try:
            return self.readStream(stream, kind)
        finally:
            # Close the file even if the parser raises.
            stream.close()

    def readStream(self, stream, type='XML'):
        """Parse an open *stream* according to *type* and return the DOM.

        *type* is one of 'XML', 'HTML', or 'SGML'; any other value yields
        None.
        """
        if type == 'XML':
            dom = self.readXml(stream)
        elif type == 'HTML':
            dom = self.readHtml(stream)
        elif type == 'SGML':
            dom = self.readSgml(stream)
        else:
            dom = None
        return dom

    def readXml(self, stream, parserName=None):
        """Parse XML from *stream* with a SAX parser and return the document.

        parserName could be 'pyexpat', 'sgmlop', etc.
        """
        p = saxexts.make_parser(parserName)
        dh = SaxBuilder()
        p.setDocumentHandler(dh)
        p.feed(stream.read())
        doc = dh.document
        p.close()
        return doc

    def readHtml(self, stream):
        """Parse HTML from *stream* with HtmlBuilder and return the document."""
        b = html_builder.HtmlBuilder()
        b.feed(stream.read())
        b.close()
        doc = b.document
        # There was some bug that prevents the builder from
        # freeing itself (maybe it has already been fixed?).
        # The next two lines break its references to the DOM
        # tree so that it can be freed.
        b.document = None
        b.current_element = None
        return doc

    def readSgml(self, stream=None):
        """Placeholder for SGML support; prints a notice and returns None.

        *stream* is accepted (and ignored) because ``readStream`` passes
        the stream to every reader; without it the SGML branch raised
        TypeError.
        """
        # Don't know much about this part. This could call SX to
        # convert the SGML to XML, then read it in. That's what I
        # do for some SGML files I need to convert. Any suggestions?
        print("This is not implemented.")
        return None

    def getFileType(self, filename):
        """Given a filename, figure out if the file contains XML, HTML,
        or SGML.

        For now, use the file extension (compared case-insensitively) to
        make the determination.  Returns 'HTML', 'SGML', 'XML', or '' when
        the extension is not recognised.
        """
        ext = os.path.splitext(filename)[1].lower()
        if ext in ('.htm', '.html'):
            return 'HTML'
        if ext in ('.sgm', '.sgml'):
            return 'SGML'
        if ext == '.xml':
            return 'XML'
        return ''  # should this return None instead?
if __name__ == '__main__':
    # Script entry point: parse the file named on the command line and
    # dump the resulting DOM tree.
    if len(sys.argv) == 2:
        d = DomHelper()
        dom = d.readFile(sys.argv[1])
        if dom is None:
            # readFile() returns None when no reader produced a document
            # (e.g. an unrecognised file extension); report that instead
            # of failing with AttributeError on dom.dump().
            sys.stderr.write("Could not build a DOM tree from %s\n"
                             % sys.argv[1])
        else:
            dom.dump()
    else:
        print("Usage: python %s <?ML filename>" % sys.argv[0])