Extract links from html-page

Thu May 11 11:10:50 EDT 2000

thomas at cintra.no wrote:
> I'd like to extract just the links in a web-page or html-document. It
> would be nice if relative links like '<a
> href="/source/test_pyt.tgz">Logo</a>' in a page at
> http://www.test.site.org became
> "http://www.test.site.org/source/test_pyt.tgz" in the end too, but ...

from the eff-bot archives:

#
# extract anchors from an HTML document
#
# fredrik lundh, may 1999
#
# fredrik at pythonware.com
# http://www.pythonware.com
#

import htmllib
import formatter
import string
import urllib, urlparse

class myParser(htmllib.HTMLParser):

    def __init__(self, base):
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
        self.anchors = []
        self.base = base

    def anchor_bgn(self, href, name, type):
        self.save_bgn()
        if self.base:
            self.anchor = urlparse.urljoin(self.base, href)
        else:
            self.anchor = href

    def anchor_end(self):
        text = string.strip(self.save_end())
        if self.anchor and text:
            self.anchors.append((self.anchor, text))

if __name__ == '__main__':

    URL = "http://www.pythonware.com"

    f = urllib.urlopen(URL)

    p = myParser(URL)
    p.feed(f.read())
    p.close()

    print "anchors =", p.anchors
    print "title =", p.title

</F>

<!-- (the eff-bot guide to) the standard python library:
http://www.pythonware.com/people/fredrik/librarybook.htm
-->