SAX ContentHandler / nbsp ?

Robert Roy rjroy at takingcontrol.com
Wed Jan 10 18:34:32 EST 2001


On Wed, 10 Jan 2001 15:48:44 GMT, gregory_wilson at my-deja.com wrote:

>I'm trying to build a simple filter for some legacy
>HTML by deriving from xml.sax.ContentHandler.
>Right now, it dies when it hits the "&nbsp;"
>entity.  What is the simplest way to handle
>this?  Do I need an explicit DTD?  Can I register
>entities with the parser w/out a DTD? Can I build
>an error handler that'll trap unknown entities
>and tell the parser "never mind, it's OK"?
>
>Thanks,
>Greg
>
>
>Sent via Deja.com
>http://www.deja.com/
HTML is SGML, not XML, so your efforts are doomed to failure: most
HTML is not written in a manner that would be considered well-formed
XML. Use sgmllib instead. There is an extension called sgmlop, included
in the XML-SIG distribution, that speeds up sgmllib tremendously.


## here is a code sample that changes the background color of td's
## and passes all entity refs through unchanged

from sgmllib import SGMLParser
import string

class MySGMLParser(SGMLParser):
    def __init__(self, verbose=0, outfile=None):
       if not hasattr(outfile, 'write'):
           raise "outfile must have attribute write"
       self.outfile = outfile
       SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        self.outfile.write(data)

    def handle_comment(self, data):
        self.outfile.write('<!--%s-->' % data)
        
    def unknown_starttag(self, tag, attrs):
        if not attrs:
            self.outfile.write('<' + tag + '>')
        else:
            self.outfile.write('<' + tag)
            for attr in attrs:
                self.outfile.write(' %s="%s"' % attr)
            self.outfile.write('>')

    def unknown_endtag(self, tag):
        self.outfile.write('</%s>' % tag)

    def unknown_entityref(self, ref):
        self.outfile.write('&%s;' % ref)
    # so known refs do not get translated
    handle_entityref = unknown_entityref

    def unknown_charref(self, ref):
        self.outfile.write('&#%s;' % ref)
    # so known refs do not get translated
    handle_charref = unknown_charref

    def close(self):
        SGMLParser.close(self)

    ## put tag handlers here, 
    ## for my sample code I took the  www.python.org homepage and
    ## changed the bgcolor of the wrapper tables 
    ## define start and end tag handlers as start_TAGNAME, end_TAGNAME

    def start_td(self, attrs):
        if not attrs:
            self.outfile.write('<td>')
        else:
            self.outfile.write('<td')
            for name, val in attrs:
                if string.lower(name) == 'bgcolor': 
                    self.outfile.write(' %s="%s"' % (name, '#ffcc99'))
                else:
                    self.outfile.write(' %s="%s"' % (name, val))
            self.outfile.write('>')

    def end_td(self):
        self.outfile.write('</td>')



if __name__ == "__main__":
    import sys
    if len(sys.argv) != 3:
        print "usage: python changeattr.py infile, outfile"
        raise SystemExit
    infile = sys.argv[1]
    outfile = sys.argv[2]
    ofp = open(outfile, 'w')
    # this is a one shot parser
    p = MySGMLParser(outfile=ofp)
    p.feed(open(infile).read())
    p.close()
    ofp.close()






More information about the Python-list mailing list