[XML-SIG] new code for xml.dom.utils

Thu, 17 Dec 1998 12:54:00 -0500

I threw together this class and thought it might be a good candidate for
the xml.dom.utils module.  It makes it really easy to get a DOM tree from a
file.  I made it a class even though it could just as easily be a bunch of
functions; as a class, it can be subclassed for some neat things I
can't think of right now.

The following subclass would allow an HTML or XML file to be pretty printed
with a single line of code (a pretty silly example but it's just an
example):

class DomDumper(DomHelper):
     """Example subclass: parse the given file and immediately dump its DOM."""

     def __init__(self,filename):
          # Let the base class parse the file into self.dom, then print it.
          DomHelper.__init__(self,filename)
          self.dom.dump()

d = DomDumper(sys.argv[1])

Here's the file:

import sys, string, os
from xml.dom import core
from xml.dom import html_builder
from xml.sax import saxexts
from xml.dom.sax_builder import SaxBuilder

class DomHelper:
     """Convenience helper for building a DOM document from a file or stream.

     The kind of markup is guessed from the file extension (see
     getFileType) and the matching reader method is used.  When a filename
     is passed to the constructor, the resulting document is stored in
     self.dom; otherwise self.dom is None.
     """

     def __init__(self,filename=None):
          """Remember filename and, if one was given, read it into self.dom."""
          self.filename = filename
          # Always define the attribute so that a helper created without a
          # filename does not raise AttributeError when self.dom is read.
          self.dom = None
          if filename is not None:
               self.dom = self.readFile(filename)

     def readFile(self,filename):
          """Given an XML, HTML, or SGML filename with an appropriate file
          extension, return the DOM document.

          Returns whatever readStream returns for the detected file type,
          which is None when the extension is not recognised.
          """
          kind = self.getFileType(filename)
          stream = open(filename,'r')
          # try/finally so the file is closed even if the parser raises.
          try:
               return self.readStream(stream,kind)
          finally:
               stream.close()

     def readStream(self,stream,type='XML'):
          """Read a document from an open stream.

          type selects the reader: 'XML', 'HTML' or 'SGML'.  Any other
          value yields None.
          """
          if type == 'XML':
               return self.readXml(stream)
          if type == 'HTML':
               return self.readHtml(stream)
          if type == 'SGML':
               return self.readSgml(stream)
          return None

     def readXml(self,stream,parserName=None):
          """Read XML from stream with a SAX parser and return the document.

          parserName is handed to saxexts.make_parser; it could be
          'pyexpat', 'sgmlop', etc.
          """
          parser = saxexts.make_parser(parserName)
          builder = SaxBuilder()
          parser.setDocumentHandler(builder)
          parser.feed(stream.read())
          doc = builder.document
          parser.close()
          return doc

     def readHtml(self,stream):
          """Read HTML from stream with HtmlBuilder and return the document."""
          builder = html_builder.HtmlBuilder()
          builder.feed(stream.read())
          builder.close()
          doc = builder.document
          # There was some bug that prevents the builder from
          # freeing itself (maybe it has already been fixed?).
          # The next two lines break its references to the DOM
          # tree so that it can be freed.
          builder.document = None
          builder.current_element = None
          return doc

     def readSgml(self,stream=None):
          """Placeholder for SGML support; reports that and returns None.

          stream is accepted (and ignored) so that readStream can call this
          method the same way it calls readXml and readHtml.
          """
          # Don't know much about this part.  This could call SX to
          # convert the SGML to XML, then read it in.  That's what I
          # do for some SGML files I need to convert.  Any suggestions?
          print("This is not implemented.")
          return None

     def getFileType(self,filename):
          """Given a filename, figure out if the file contains XML, HTML,
          or SGML.

          For now, use the file extension (compared case-insensitively) to
          make the determination.  Returns 'XML', 'HTML' or 'SGML', or ''
          when the extension is not recognised.
          """
          ext = os.path.splitext(filename)[1].lower()

          if ext in ('.htm','.html'):
               return 'HTML'
          if ext in ('.sgm','.sgml'):
               return 'SGML'
          if ext == '.xml':
               return 'XML'
          return '' # should this return None instead?

if __name__ == '__main__':
     # Command-line use: parse the single file named on the command line
     # and dump the resulting DOM tree.
     if len(sys.argv) == 2:
          d = DomHelper()
          dom = d.readFile(sys.argv[1])
          if dom is None:
               # readStream yields None when it has no reader for the file
               # type (e.g. an unrecognised extension), so there is nothing
               # to dump.
               print("Could not build a DOM tree from %s" % sys.argv[1])
          else:
               dom.dump()
     else:
          print("Usage: python %s <?ML filename>" % sys.argv[0])