Extract links from html-page
Fredrik Lundh
effbot at telia.com
Thu May 11 11:10:50 EDT 2000
thomas at cintra.no wrote:
> I'd like to extract just the links in a web-page or html-document. It
> would be nice if relative links like '<a
> href="/source/test_pyt.tgz">Logo</a>' in a page at
> http://www.test.site.org became
> "http://www.test.site.org/source/test_pyt.tgz" in the end too, but ...
from the eff-bot archives:
#
# extract anchors from an HTML document
#
# fredrik lundh, may 1999
#
# fredrik at pythonware.com
# http://www.pythonware.com
#
import htmllib
import formatter
import string
import urllib, urlparse
class myParser(htmllib.HTMLParser):
def __init__(self, base):
htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
self.anchors = []
self.base = base
def anchor_bgn(self, href, name, type):
self.save_bgn()
if self.base:
self.anchor = urlparse.urljoin(self.base, href)
else:
self.anchor = href
def anchor_end(self):
text = string.strip(self.save_end())
if self.anchor and text:
self.anchors.append((self.anchor, text))
if __name__ == '__main__':
URL = "http://www.pythonware.com"
f = urllib.urlopen(URL)
p = myParser(URL)
p.feed(f.read())
p.close()
print "anchors =", p.anchors
print "title =", p.title
</F>
<!-- (the eff-bot guide to) the standard python library:
http://www.pythonware.com/people/fredrik/librarybook.htm
-->
More information about the Python-list
mailing list