HTMLParser and write
Stephen Ferg
steve at ferg.org
Fri Mar 5 15:04:55 EST 2004
You're right. The example is REALLY feeble. Maybe this will help:
"""HTMLParserDemoProgram
Use HTMLParser to read in an HTML file and write it out again.
This will put all tag and attribute names into lowercase.
"""
"""
REVISION HISTORY
2 2004-01-05 added handle_pi and improved attribute processing
"""
from HTMLParser import HTMLParser
class CustomizedParser(HTMLParser):
    """HTMLParser subclass that writes each parsed construct back to a file.

    Every handler re-serialises the piece of markup it receives and writes
    it to the output file opened by setOutfileName().  Tag and attribute
    names come out in lowercase.

    Typical use: setOutfileName(), feed() (then close() to flush buffered
    input), and finally closeOutfile().
    """

    def setOutfileName(self, argOutfileName):
        """Open argOutfileName for writing and remember it as the output file.

        Must be called before any data is fed to the parser, because every
        handler writes to self.Outfile.
        """
        self.OutfileName = argOutfileName
        self.Outfile = open(self.OutfileName, "w")

    def closeOutfile(self):
        """Close the output file opened by setOutfileName()."""
        self.Outfile.close()

    def write(self, argString):
        """Write argString to the output file."""
        self.Outfile.write(argString)

    def _formatAttrs(self, argAttrs):
        """Return argAttrs serialised as the attribute part of a tag.

        argAttrs is a list of (attribute_name, attribute_value) tuples.
        The result is either empty or starts with a space, so it can be
        appended directly after the tag name.
        """
        parts = []
        for key, value in argAttrs:
            if value is None:
                # The parser reports a valueless attribute such as
                # <input disabled> with a value of None; emit the bare name
                # rather than the literal text disabled="None".
                parts.append(" %s" % key)
            else:
                # The parser hands over attribute values with entity
                # references already replaced, so re-escape the characters
                # that would otherwise break a double-quoted value.
                # "&" must be replaced first.
                escaped = value.replace("&", "&amp;").replace('"', "&quot;")
                parts.append(' %s="%s"' % (key, escaped))
        return "".join(parts)

    def handle_starttag(self, argTag, argAttrs):
        """Write an opening tag.

        argAttrs is a list of tuples.
        Each tuple is a pair of (attribute_name, attribute_value)
        """
        self.write("<%s%s>" % (argTag, self._formatAttrs(argAttrs)))

    def handle_startendtag(self, argTag, argAttrs):
        """Write a self-closing tag such as <br/>.

        argAttrs is a list of tuples.
        Each tuple is a pair of (attribute_name, attribute_value)
        """
        self.write("<%s%s/>" % (argTag, self._formatAttrs(argAttrs)))

    def handle_endtag(self, argTag):
        """Write a closing tag."""
        self.write("</%s>" % argTag)

    def handle_data(self, argString):
        """Write text content unchanged."""
        self.write(argString)

    def handle_charref(self, argString):
        """Write a numeric character reference back as &#...;"""
        self.write("&#%s;" % argString)

    def handle_entityref(self, argString):
        """Write a named entity reference back as &name;"""
        self.write("&%s;" % argString)

    def handle_comment(self, argString):
        """Write a comment, restoring its <!-- --> delimiters."""
        self.write("<!--%s-->" % argString)

    def handle_decl(self, argString):
        """Write a declaration (e.g. a doctype), restoring its <! > delimiters."""
        self.write("<!%s>" % argString)

    def handle_pi(self, argString):
        """Write a processing instruction, restoring its <? > delimiters."""
        self.write("<?%s>" % argString)
def main(myInfileName, myOutfileName):
    """Parse the HTML file myInfileName and write it back out to myOutfileName.

    The input is read in one go and fed through a CustomizedParser, which
    writes each parsed construct to the output file.  Both files are closed
    even if parsing raises.
    """
    myInfile = open(myInfileName, "r")
    try:
        myParser = CustomizedParser()
        myParser.setOutfileName(myOutfileName)
        try:
            myParser.feed(myInfile.read())
            # feed() may hold back trailing input while waiting for more
            # data; close() forces the parser to process what is buffered
            # so the end of the document is not lost.
            myParser.close()
        finally:
            myParser.closeOutfile()
    finally:
        myInfile.close()
def dq(s):
    """Return the string s wrapped in a pair of double-quote characters."""
    quote = '"'
    return quote + s + quote
if __name__ == "__main__":
    # Demo entry point: round-trips one hard-coded sample file.
    # The parenthesised print form gives the same output under Python 2's
    # print statement and also parses under Python 3.
    print("Starting HTMLParserDemoProgram")
    # Raw strings keep the Windows backslashes literal; the non-raw form
    # only worked because "\j" and "\s" are not recognised escapes.
    main(r"c:\junk\slide01.html", r"c:\junk\slide01a.html")
    print("Ending HTMLParserDemoProgram")
More information about the Python-list
mailing list