Help on thread pool

Alex metallourlante at gmail.com
Sat May 17 07:08:56 EDT 2008


Hi all.

In order to understand the concept of a thread pool in Python, I'm
working on a simple single-site web crawler.
I would like the program to stop once the thread pool has downloaded
all the internal links from a web site, but right now it keeps
waiting forever even when there are no more links to download.
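
Roughly, the shutdown behaviour I'm after looks like the sketch below
(just a minimal example with made-up names, not my actual crawler):
daemon workers drain a Queue, and the main thread blocks on join()
and exits once every queued item has been marked with task_done().

from threading import Thread
from Queue import Queue

def worker(q):
    while True:
        item = q.get()        # blocks until an item is available
        print "processing", item
        q.task_done()         # tell the queue this item is finished

q = Queue()
for i in range(5):
    t = Thread(target=worker, args=(q,))
    t.setDaemon(True)         # daemon threads die when the main thread exits
    t.start()

for item in range(20):
    q.put(item)

q.join()                      # returns once all items are task_done()
print "all done"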

Here's my code. I appreciate any comments; I'm programming just for
fun and learning ;-)

Thanks in advance.

from BeautifulSoup import BeautifulSoup
import urllib
from pprint import pprint
import string
from urlparse import urlparse
import sys
from threading import Thread
import time
from Queue import Queue

#dirty hack: set default encoding to utf-8
reload(sys)
sys.setdefaultencoding('utf-8')

opener = urllib.FancyURLopener({})

class Crawler:

    def __init__(self):
        """
        Constructor
        """
        self.missed = 0
        self.url_list = []
        self.urls_queue = Queue()
        self.num_threads = 5

        self._create_threads()

    def get_internal_links(self,url):
        """
        Get all internal links from a web page and feed the queue
        """
        # keep url local so concurrent workers don't overwrite each other's state
        url_netloc = urlparse(url).netloc
        print "Downloading... ", url
        time.sleep(5)
        try:
            p = opener.open(url)
            #print p.info()
        except IOError:
            print "error connecting to ", url
            print "wait..."
            time.sleep(5)
            print "retry..."
            try:
                p = urllib.urlopen(url)
            except IOError:
                self.missed = self.missed + 1
                return None

        html = p.read()
        soup = BeautifulSoup(html)
        anchors = soup.findAll('a')
        links = [ str(anchor['href']) for anchor in anchors]
        internal_links = [link for link in links
                          if urlparse(link).netloc == url_netloc]

        for link in internal_links:
            if link not in self.url_list and link != url:
                self.url_list.append(link)
                self.urls_queue.put(link)
        print "Queue size: ", self.urls_queue.qsize()
        print "List size: ", str(len(self.url_list))
        print "Errors: ", str(self.missed)
        self._queue_consumer()


    def _queue_consumer(self):
        """
        Consume the queue
        """
        while True:
            url = self.urls_queue.get()
            print 'Next url: ', url
            self.get_internal_links(url)
            self.urls_queue.task_done()


    def _create_threads(self):
        """
        Set up some threads to fetch pages
        """
        for i in range(self.num_threads):
            worker = Thread(target=self._queue_consumer, args=())
            worker.setDaemon(True)
            worker.start()

#-----------------------------------------------------------------------------
#

if __name__ == '__main__':

    c = Crawler()
    c.get_internal_links('http://www.thinkpragmatic.net/')




