How to extract title of links

prasad prasad83.a at gmail.com
Tue Apr 26 09:32:15 EDT 2005


import htmllib, formatter, urllib
class x(htmllib.HTMLParser):
    inanchor = True # indicates whether we are inside anchor element
    def dump(self, tag, attrs):
        #print tag,
        for a, v in attrs:
            if a in ['a', 'src', 'href']:
                print v,


        print
    #def do_img(self, attrs):
    #    self.dump('img', attrs)
    def start_a(self, attrs):
        self.dump('a', attrs)
        self.inanchor = True # yes now we are in anchor element


    def handle_data(self,data):
        if self.inanchor:
            print data # lets us print the anchor element inner data
            self.inanchor = False # we handled the anchor element data
            # this is not a nice way, self.inanchor should be set false

            # when </a> is reached. try in end_a(self) ...


    #def start_form(self, attrs):
    #    self.dump('form', attrs)

# Fetch the page and push it through the parser; results go to stdout.
y = x(formatter.NullFormatter())
page = urllib.urlopen('http://www.aquabase.org/fish/dump.php3')
try:
    y.feed(page.read())
finally:
    # Always release the connection, even if reading or parsing fails.
    page.close()
y.close()




More information about the Python-list mailing list