threading with time limit

Marc Tardif intmktg at Gloria.CAM.ORG
Mon Jun 5 16:25:26 EDT 2000


How can x threads be started at once and abandoned y seconds later,
processing only the threads that completed successfully?

For example, consider a situation where all the links from multiple URLs
must be retrieved at the same time. By setting a time limit, slow sites
don't hold up the whole process, and appropriate error messages are
returned stating which sites weren't completed within y seconds.
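
In other words, something like this, where run_with_limit is just a
name I've made up and each thread is assumed to be safe to abandon
(Python can't actually kill a running thread):

import time
import threading

def run_with_limit(threads, limit):
    # Start everything at once.
    for t in threads:
        t.start()
    # Wait at most `limit` seconds in total, shrinking each join()
    # timeout as time passes.
    deadline = time.time() + limit
    for t in threads:
        t.join(max(0, deadline - time.time()))
    # Anything still alive missed the deadline; it keeps running in
    # the background, but we stop waiting and can report an error.
    done, late = [], []
    for t in threads:
        if t.isAlive():
            late.append(t)
        else:
            done.append(t)
    return done, late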

My current implementation (below) defines a list of links to parse and
creates a thread for each link. If the threads haven't finished within
roughly 2.5 seconds (five half-second polls), processing starts even if
a URL hasn't been retrieved.

Problems are:
- no error messages are reported for threads still running when the
time limit expires
- results are collected from every thread, even those that never finished
- should the time limit be enforced in main() or in Page.run()?
- the time limit applies to URL retrieval AND parsing, whereas it
should probably apply to the former only (one idea is sketched below)

Any suggestions to solve any of the above problems would be much
appreciated.
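
On the last point, one idea would be for each thread to do nothing but
the network fetch, leaving the parsing to the main thread after the
deadline, so that the limit covers retrieval only. Another untested
sketch, where Fetcher and parse are hypothetical stand-ins for the
Page class and readlinks method below:

import urllib
import threading

class Fetcher(threading.Thread):
    # Like Page below, but run() only retrieves the text, so the
    # time limit covers network I/O and nothing else.
    def __init__(self, url):
        threading.Thread.__init__(self)
        self.url = url
        self.text = None        # stays None until the fetch succeeds

    def run(self):
        try:
            f = urllib.urlopen(self.url)
            self.text = f.read()
            f.close()
        except IOError:
            pass                # fetch failed; self.text stays None

# Back in the main thread, after the deadline:
#     for t in fetchers:
#         if t.isAlive() or t.text is None:
#             print 'error: %s not completed in time' % t.url
#         else:
#             linklist = linklist + parse(t.url, t.text)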

--- getlinks.py ---
#!/usr/local/bin/python

import time
import string
import threading

import urllib
import urlparse
import sgmllib

links = ['http://www.python.org', 'http://www.cam.org']

def main():

    threadlist = []
    for link in links:
        thread = Page(link)
        threadlist.append(thread)

    for thread in threadlist:
        thread.start()

    # Poll for up to 2.5 seconds (5 x 0.5s), stopping early once the
    # main thread is the only one left alive.
    for i in range(5):
        if threading.activeCount() == 1:
            break
        time.sleep(0.5)

    # Collect results; a thread that hasn't finished yet simply
    # contributes an empty list, with no error reported.
    linklist = []
    for thread in threadlist:
        tlinks = thread.getlinks()
        for tlink in tlinks:
            linklist.append(tlink)

    print linklist

class Page(threading.Thread):

    def __init__(self, url):
        self.url = url
        self.links = []
        threading.Thread.__init__(self)

    def run(self):
        text = self.readhtml()
        if text:
            self.links = self.readlinks(text)

    def readhtml(self):
        # urlopen raises IOError on failure, so catch it and return
        # None rather than letting the thread die with a traceback.
        try:
            f = urllib.urlopen(self.url)
        except IOError:
            return None
        text = f.read()
        f.close()
        return text

    def readlinks(self, text):
        parser = MyHTMLParser()
        parser.feed(text)
        parser.close()
        rawlinks = parser.getlinks()
        base = urlparse.urljoin(self.url, parser.getbase() or "")
        links = []
        for rawlink in rawlinks:
            # Strip any fragment, then resolve relative to the base URL.
            t = urlparse.urlparse(rawlink)
            t = t[:-1] + ('',)
            rawlink = urlparse.urlunparse(t)
            link = urlparse.urljoin(base, rawlink)
            links.append(link)
        return links

    def getlinks(self):
        return self.links

class MyHTMLParser(sgmllib.SGMLParser):

    def __init__(self):
        self.base = None
        self.links = {}         # dict used as a set, to drop duplicates
        sgmllib.SGMLParser.__init__(self)

    def start_a(self, attributes):
        self.link_attr(attributes, 'href')

    def end_a(self): pass

    def do_area(self, attributes):
        self.link_attr(attributes, 'href')

    def do_img(self, attributes):
        self.link_attr(attributes, 'src', 'lowsrc')

    def do_frame(self, attributes):
        self.link_attr(attributes, 'src')

    def link_attr(self, attributes, *args):
        # Record any of the named attributes ('href', 'src', ...)
        # that is present and non-blank.
        for name, value in attributes:
            if name in args:
                if value: value = string.strip(value)
                if value: self.links[value] = None

    def do_base(self, attributes):
        for name, value in attributes:
            if name == 'href':
                if value: value = string.strip(value)
                if value: self.base = value

    def getlinks(self):
        return self.links.keys()

    def getbase(self):
        return self.base


if __name__ == '__main__':
    main()



