threading with time limit
Marc Tardif
intmktg at Gloria.CAM.ORG
Mon Jun 5 16:25:26 EDT 2000
How can x threads be started at once and stopped y seconds later,
processing only the successfully returned threads?
For example, consider a situation where all the links from multiple URL's
must be retrieved at the same time. By setting a time limit, slow sites
don't slow down the whole process and appropriate error messages are
returned stating which sites weren't completed in y seconds.
My current implementation (below), defines a list of links to parse and
creates a thread for each link. If the threads don't finish within 2
seconds, processing starts even if the URL hasn't been retrieved.
Problems are:
- no error messages are returned for active threads killed after 2 seconds
- processing occurs on every thread, even if they were killed
- should the time limit occur in main() or in Page.run()?
- the time limit is applied to URL retrieval AND processing, whereas it
should probably apply to the former only
Any suggestions to solve any of the above problems would be much
appreciated.
--- getlinks.py ---
#!/usr/local/bin/python
links = ['http://www.python.org', 'http://www.cam.org']
import time
import string
import threading
import urllib
import urlparse
import sgmllib
def main():
threadlist = []
for link in links:
thread = Page(link)
threadlist.append(thread)
for thread in threadlist:
thread.start()
for i in range(5):
if threading.activeCount() == 1:
break;
time.sleep(0.5)
linklist = []
for thread in threadlist:
tlinks = thread.getlinks()
for tlink in tlinks:
linklist.append(tlink)
print linklist
class Page(threading.Thread):
def __init__(self, url):
self.url = url
self.links = []
threading.Thread.__init__(self)
def run(self):
text = self.readhtml()
if text:
self.links = self.readlinks(text)
def readhtml(self):
text = None
f = urllib.urlopen(self.url)
if f:
text = f.read()
f.close()
return text
def readlinks(self, text):
parser = MyHTMLParser()
parser.feed(text)
parser.close()
rawlinks = parser.getlinks()
base = urlparse.urljoin(self.url, parser.getbase() or "")
links = []
for rawlink in rawlinks:
t = urlparse.urlparse(rawlink)
t = t[:-1] + ('',)
rawlink = urlparse.urlunparse(t)
link = urlparse.urljoin(base, rawlink)
links.append(link)
return links
def getlinks(self):
return self.links
class MyHTMLParser(sgmllib.SGMLParser):
    """SGML parser that collects the unique link targets of a document.

    Records href/src attributes from <a>, <area>, <img> and <frame>
    tags, plus the document's <base href=...> if present.
    """

    def __init__(self):
        self.base = None        # value of <base href=...>, if seen
        self.links = {}         # dict used as a set of unique raw links
        sgmllib.SGMLParser.__init__(self)

    def start_a(self, attributes):
        self.link_attr(attributes, 'href')

    def end_a(self):
        pass

    def do_area(self, attributes):
        self.link_attr(attributes, 'href')

    def do_img(self, attributes):
        self.link_attr(attributes, 'src', 'lowsrc')

    def do_frame(self, attributes):
        self.link_attr(attributes, 'src')

    def link_attr(self, attributes, *args):
        # Remember every non-empty value of the attributes named in args.
        for name, value in attributes:
            if name not in args:
                continue
            if value:
                value = string.strip(value)
                if value:
                    self.links[value] = None

    def do_base(self, attributes):
        for name, value in attributes:
            if name != 'href':
                continue
            if value:
                value = string.strip(value)
                if value:
                    self.base = value

    def getlinks(self):
        return self.links.keys()

    def getbase(self):
        return self.base
# Run the demo fetch when executed as a script.
if __name__ == '__main__':
    main()
More information about the Python-list
mailing list