how to remove HTML attributes from web pages (HTML parsing)

Wed Jun 26 13:14:10 EDT 2002

sanjay2kind at yahoo.com (sanjay) wrote in 
news:63170f57.0206260345.2b61b7ec at posting.google.com:

> Hi,
> 
>  New to python and doing one application. i would like to  web page
> content after removing specific html tag, attribute etc. 

Below you will find some sample code that uses htmllib.
I hope this helps.
- Matthias

-----------

import htmllib, formatter, StringIO, urllib

class HTMLStripper(htmllib.HTMLParser):
    """HTML parser that renders the fed document as plain text.

    Output is routed through a formatter.AbstractFormatter backed by a
    formatter.DumbWriter, which writes into an in-memory StringIO buffer.
    Call gettext() to retrieve what has been written so far.
    """

    def __init__(self):
        # The writer is bound to this buffer, so create the buffer first.
        self.bodytext = StringIO.StringIO()
        text_formatter = formatter.AbstractFormatter(
            formatter.DumbWriter(self.bodytext))
        htmllib.HTMLParser.__init__(self, text_formatter)

    def anchor_end(self):
        """Finish the current anchor, passing only an empty string as data.

        Overrides the base-class hook; per the htmllib documentation the
        default implementation adds a textual footnote marker to the output,
        which this version leaves out.
        """
        if not self.anchor:
            return
        self.handle_data('')
        self.anchor = None

    def gettext(self):
        """Return everything written to the text buffer so far."""
        text = self.bodytext.getvalue()
        return text

f = urllib.urlopen('http://www.python.org/').read()       
st = HTMLStripper()
st.feed(f)
st.close()
print st.gettext()
print st.anchorlist

-----------