How to remove HTML attributes from web pages (HTML parsing)
Matthias Huening
mhuening at zedat.fu-berlin.de
Wed Jun 26 13:14:10 EDT 2002
sanjay2kind at yahoo.com (sanjay) wrote in
news:63170f57.0206260345.2b61b7ec at posting.google.com:
> Hi,
>
> New to Python and working on an application. I would like to get the web
> page content after removing specific HTML tags, attributes, etc.
Below you will find some sample code that uses htmllib.
Hope this helps.
- Matthias
-----------
import htmllib, formatter, StringIO, urllib
class HTMLStripper(htmllib.HTMLParser):
def __init__(self):
self.bodytext = StringIO.StringIO()
writer = formatter.DumbWriter(self.bodytext)
htmllib.HTMLParser.__init__(self,
formatter.AbstractFormatter(writer))
def anchor_end(self):
if self.anchor:
self.handle_data('')
self.anchor = None
def gettext(self):
return self.bodytext.getvalue()
f = urllib.urlopen('http://www.python.org/').read()
st = HTMLStripper()
st.feed(f)
st.close()
print st.gettext()
print st.anchorlist
-----------
More information about the Python-list
mailing list