Extracting data from HTML

Geoff Gerrietts geoff at gerrietts.net
Fri May 31 15:54:09 EDT 2002


Quoting Hazel (lailian98 at hotmail.com):
> how do I write a program that
> will extract info from an HTML page and print
> out a list of TV programmes, their times, and durations,
> using urllib?

You might check into htmllib -- it's got some basic parser structures
in there that can help you parse through the HTML.

You might check out http://www.python9.org/p9-zadka.ppt, which goes
over some of that.
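
To make that concrete, here's a bare-bones sketch of the kind of parser
you'd write. The URL is made up, and I'm only guessing (since I haven't
seen the page) that each programme name, time, and duration sits in its
own table cell -- adjust the tag handlers to match the real markup:

import urllib, htmllib, formatter

class ListingParser(htmllib.HTMLParser):
    """ collect the text of every table cell on the page """
    def __init__(self):
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
        self.cells = []

    def start_td(self, attrs):
        # buffer character data until this cell closes
        self.save_bgn()

    def end_td(self):
        self.cells.append(self.save_end().strip())

page = urllib.urlopen("http://www.example.com/tv-listings.html").read()
parser = ListingParser()
parser.feed(page)
parser.close()
for cell in parser.cells:
    # with luck, each programme, time, and duration comes out as its
    # own cell; print them all and see what the page really holds
    print cell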

And at the end of this message, I've affixed some (very sloppy, not
very good) Python code that I pounded out the other day to (more or
less) strip markup from a page, so you can see how I went about
prototyping a solution to a (somewhat) similar problem.

-- 
Geoff Gerrietts <geoff at gerrietts dot net>     http://www.gerrietts.net/
 "Politics, as a practice, whatever its professions, has always been the 
          systematic organization of hatreds." --Henry Adams


#!/usr/local/bin/python -i

import htmllib, formatter, string, urllib

class DataStorage:
    """ DataStorage
        helper class for the parser. effectively implements a string that
        changes in-place.
    """
    def __init__(self, weight=2):
        self.data = ""
        self.count = 0
        self.weight = weight

    def __add__(self, other):
        """ __add__
            the __add__ routine just appends. clean it later.
        """
        self.data = self.data + str(other)
        return self

    def purge(self):
        # repeat the buffered text 'weight' times, so that weighted
        # sections (headings, emphasis) count for more in the extract
        dat = [self.data] * self.weight
        self.data = ""
        return string.join(dat)


class HTMLMunger(htmllib.HTMLParser):
    # relative weights given to title, heading, and emphasized text
    # when building the extract
    TITLE_WT = 5
    HEADING_WT = 3
    EMPH_WT = 2
    
    def __init__(self):
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
        self.plaindata = DataStorage()
        self.storagestack = []

    def start_body(self, attrs):
        # once the <body> starts, accumulate character data into plaindata
        self.savedata = self.plaindata

    def push_storage(self, stor):
        # divert character data into a weighted buffer for the current tag
        self.storagestack.append(self.savedata)
        self.savedata = stor

    def pop_storage(self):
        # restore the previous buffer and re-feed the weighted text into it
        dat = self.savedata.purge()
        self.savedata = self.storagestack.pop()
        self.handle_data(dat)

    def start_h1(self, attrs):
        self.push_storage(DataStorage(self.HEADING_WT))
    start_h2 = start_h3 = start_h4 = start_h5 = start_h6 = start_h1

    def end_h1(self):
        self.pop_storage()
    end_h2 = end_h3 = end_h4 = end_h5 = end_h6 = end_h1

    def start_i(self, attrs):
        self.push_storage(DataStorage(self.EMPH_WT))
    start_b = start_i

    def end_i(self):
        self.pop_storage()
    end_b = end_i

    def anchor_end(self):
        # prevent the link number from showing up
        self.anchor = None

    def extract(self):
        # weight the title heavily, then tack on the accumulated body text
        dat = string.join(([self.title or ""] * self.TITLE_WT)
                          + [self.plaindata.data])
        return dat


class TextMunger:
    """ TextMunger
        trivial handler for text/plain documents; just accumulates the text.
    """
    
    def __init__(self):
        self.data = ''

    def feed(self, data):
        self.data = self.data + data

    def extract(self):
        return self.data

class DocFetcherException(Exception):
    pass


class DocFetcher:
    # map MIME content types to the handler class that can digest them
    handlers = {
        'text/html':    HTMLMunger,
        'text/plain':   TextMunger,
    }
    
    def get_url(self, url):
        url_obj = urllib.urlopen(url)
        # Content-Type may carry a "; charset=..." suffix; keep only the type
        ct = url_obj.info()['Content-Type'].split(';')[0].strip()
        h = self.handlers.get(ct)
        if not h:
            raise DocFetcherException("no handler for [%s] type [%s]" % (url, ct))
        dp = h()
        dp.feed(url_obj.read())
        return dp.extract()

        



if __name__ == '__main__':
    pm = HTMLMunger()

    print "Retrieving"
    dat = urllib.urlopen("http://www.yahoo.com/").read()

    print "Parsing"
    pm.feed(dat)
    pm.close()

    print "Plain data: ", len(pm.plaindata.data)
    print "Emph. data: ", len(pm.emphdata.data)
    print "Head. data: ", len(pm.headerdata.data)

    print "pm.plaindata.data"




