Parsing a HTML file for links?

Fredrik Lundh fredrik at pythonware.com
Wed May 5 05:03:33 EDT 1999


Zigron <zigron at jps.net> wrote:
>     I've never used the HTMLParser class(or SGML?), or the formatter thing,
> et al, and they confuse me a little.
> 
>     What I want to do is go through a HTML file, and spit out a
> dictionary based on the links, and title of the file. I want a dictionary,
> I guess, of like,
> {"text-between-anchor-tags":["Destination1","DestinationN.."]}
> 
> The dictionary has a list in it because the same text might have more than
> one destination... I can't figure out how to get this to work :) Anyone
> have any ideas?

well, I still had Matthew's script in my editor,
so here's a snippet which does exactly what
you want...

# extract anchors from an HTML document

import htmllib
import formatter
import string

class myParser(htmllib.HTMLParser):

    def __init__(self, verbose=0):
        self.anchors = {}
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter(), verbose)

    def anchor_bgn(self, href, name, type):
        self.save_bgn()
        self.anchor = href

    def anchor_end(self):
        text = string.strip(self.save_end())
        if self.anchor and text:
            self.anchors[text] = self.anchors.get(text, []) + [self.anchor]

if __name__ == '__main__':

    f = open("test.htm")

    p = myParser()
    p.feed(f.read())
    p.close()

    print "anchors =", p.anchors
    print "title =", p.title





More information about the Python-list mailing list