[XML-SIG] SAX prettyprinter V2 and SGMLOP

Christian Tismer tismer@appliedbiometrics.com
Fri, 22 Jan 1999 21:27:58 +0100


This is a multi-part message in MIME format.
--------------F46600A1D3B1D2BC0AA2B68F
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit

Hi again,

the appended version of Indenter.py can use sgmlop to format
large XML files. It then processes a few megabytes in a few seconds.

sgmlop does not report ignorableWhitespace, so I handle that myself,
by buffering character data and stripping whitespace before writing.

BTW - is sgmlop deprecated?
It still has some flaws, like not allowing "_" in tagnames.
Is Fredrik no longer supporting it, or what is the current
preferred fast parser for all platforms?

ciao - chris

-- 
Christian Tismer             :^)   <mailto:tismer@appliedbiometrics.com>
Applied Biometrics GmbH      :     Have a break! Take a ride on Python's
Kaiserin-Augusta-Allee 101   :    *Starship* http://starship.skyport.net
10553 Berlin                 :     PGP key -> http://pgp.ai.mit.edu/
     we're tired of banana software - shipped green, ripens at home
--------------F46600A1D3B1D2BC0AA2B68F
Content-Type: text/plain; charset=us-ascii; name="indenter.py"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline; filename="indenter.py"


# pretty printer for SAX
# CT990122
# based upon the saxutils.Canonizer code

# V.0.2 support for sgmlop which doesn't give ignorableWhitespace info

from xml.sax import saxexts, saxlib, saxutils

import string, sys

class Indenter(saxlib.HandlerBase):
    """A SAX document handler that produces indented XML output.

    Every start tag is written on a new line, indented by ``indent``
    spaces per nesting level.  Character data is collected in a buffer
    and written, stripped of leading and trailing whitespace, just
    before the next tag.  Because whitespace is removed this way, the
    handler does not depend on the parser calling ignorableWhitespace().
    """

    def __init__(self, writer=sys.stdout, indent=2):
        """Set up the handler.

        writer -- object whose write(string) method receives the output
        indent -- number of spaces added per nesting level
        """
        self.writer = writer
        self.indent = indent
        self.elem_level = 0     # current element nesting depth
        # Level of the element opened by the latest startElement(); reset
        # to -1 as soon as that element has been closed.  endElement()
        # uses it to decide whether the end tag stays on the same line.
        self.last_level = -1
        self.buffer = ""        # character data waiting to be written

    def _padding(self):
        "Return the run of spaces for the current nesting level."
        return " " * (self.indent * self.elem_level)

    def processingInstruction(self, target, remainder):
        "Write the processing instruction followed by a newline."
        self.writer.write("<?" + target + " " + remainder + "?>\n")

    def startElement(self, name, amap):
        """Write the start tag on a new, indented line.

        Pending character data is flushed first.  Attributes are written
        sorted by name, with their values escaped for use inside double
        quotes.
        """
        self.write_buffer()
        self.writer.write("\n" + self._padding() + "<" + name)

        # Copy the keys before sorting so this works whether keys()
        # returns a list or some other iterable.
        a_names = list(amap.keys())
        a_names.sort()

        for a_name in a_names:
            self.writer.write(" " + a_name + "=\"")
            self.write_data(amap[a_name], 1)
            self.writer.write("\"")
        self.writer.write(">")
        self.last_level = self.elem_level
        self.elem_level = self.elem_level + 1

    def endElement(self, name):
        """Write the end tag, flushing pending character data first.

        If the element being closed is the one most recently opened (no
        child element was closed in between), the end tag follows on the
        same line; otherwise it goes on a new line at the element's own
        indentation.
        """
        self.write_buffer()
        self.elem_level = self.elem_level - 1
        if self.last_level < self.elem_level:
            self.writer.write("\n" + self._padding() + "</" + name + ">")
        else:
            self.writer.write("</" + name + ">")
            self.last_level = -1

    def ignorableWhitespace(self, data, start_ix, length):
        "Drop ignorable whitespace; the indenter supplies its own layout."
        pass

    def characters(self, data, start_ix, length):
        """Buffer the character data data[start_ix:start_ix+length].

        Text reported while no element is open (elem_level 0) is dropped.
        """
        if self.elem_level > 0:
            self.put_buffer(data[start_ix:start_ix + length])

    def put_buffer(self, txt):
        "Append txt to the pending character data."
        self.buffer = self.buffer + txt

    def write_buffer(self):
        """Write the pending character data, if any, and clear the buffer.

        Leading and trailing whitespace is stripped before writing.
        """
        if self.buffer:
            self.write_data(self.buffer.strip())
            self.buffer = ""

    def write_data(self, data, quotes=0):
        """Write data to the writer with XML special characters escaped.

        data   -- the text to write
        quotes -- if true, also escape double quotes (for attribute values)
        """
        # "&" has to be replaced first, otherwise the ampersands of the
        # entities inserted below would be escaped a second time.
        data = data.replace("&", "&amp;")
        data = data.replace("<", "&lt;")
        if quotes:
            data = data.replace("\"", "&quot;")
        data = data.replace(">", "&gt;")
        self.writer.write(data)

    def endDocument(self):
        """Flush pending character data and end the output with a newline.

        The writer is not closed here; that is left to whoever supplied it.
        """
        self.write_buffer()
        self.writer.write("\n")


"""
Example to format a DOM (assumes `import cStringIO` and a DOM object named `dom`):

>>> i=Indenter()
>>> p=saxexts.make_parser()
>>> p.setErrorHandler(saxutils.ErrorPrinter())
>>> p.setDocumentHandler(i)
>>> p.parseFile(cStringIO.StringIO(dom.toxml()))

Example to format a file to a file, with sgmlop as parser:

>>> f=open(r'd:\tmp\test.xml',"w")
>>> i=Indenter(f)
>>> p=saxexts.make_parser("xml.sax.drivers.drv_sgmlop")
>>> p.setErrorHandler(saxutils.ErrorPrinter())
>>> p.setDocumentHandler(i)
>>> p.parseFile(r"h:\pns\projekte\srz\roteli\birgit\sgml\praep.sgm.umgebrochen.xml")
>>> f.close()
"""

--------------F46600A1D3B1D2BC0AA2B68F--