help on threading in python... newbie.. a lot of questions..

eugene kim eugene1977 at hotmail.com
Tue Oct 15 10:41:02 EDT 2002


i appended the entire code at the bottom..

this program reads urls from a database,
creates a thread for each url to be fetched via urllib,
gets the anchorlist and forms valid complete links (which start with http://).
each thread creates a databasehandler object to insert the links into the
database

db.cursor() and a dictionary (urlDictionary) are global variables,
so i tried to protect them with mutex.acquire()/release()

what's really confusing is: "is a class variable (self.*) shared among threads?"
if i wrap "creating the databasehandler object and calling its method" in a
mutex, there's no error,
but if i change it to a class variable, errors start popping up..

additional questions..
1) urlopen sometimes fails to grab pages (which are accessible by browser).
i guess it's because urlopen gives up after too short a time.. can i give it
more time to try?
2) is my approach acceptable in terms of general structure (threading.. etc),
or should i rewrite most of it..--;


#!/usr/local/bin/python
import sys
sys.path.append('/usr/local/lib/python2.2/site-packages')

import threading, thread
import formatter
import htmllib
import urllib
import string
from pyPgSQL import PgSQL

# One PostgreSQL connection for the whole program.  DatabaseWrite also calls
# db.cursor() on this same connection from the worker threads.
db=PgSQL.connect(user='postgres',database='bloglog')
# Cursor used by the top-level script (initial select and final quantity
# update).
cursor = db.cursor()
# w = formatter.DumbWriter() # plain text


# Shared by all threads: maps an absolute link to [url_id, counter].  The
# counter is written to url.quantity by the top-level script at the end.
urlDictionary = {}

def url_check(link):
    """Tell whether an anchor target is worth recording.

    A link is rejected when it is exactly one character long or when it
    starts with "mailto:" or "javascript:"; everything else (including the
    empty string) is accepted.
    """
    is_single_char = len(link) == 1
    is_mail = link.startswith("mailto:")
    is_script = link.startswith("javascript:")
    return not is_single_char and not is_mail and not is_script
        
def url_convert(blogUrl, link):
    """Turn an anchor target found on a blog page into a complete URL.

    blogUrl -- address of the page the link was found on
    link    -- raw href value taken from the page

    Links that already start with "http://" or "https://" are returned
    unchanged.  Anything else is appended to blogUrl (with exactly one "/"
    in between) after dropping a leading "./", "." or "/".

    Prefix tests use startswith()/endswith() instead of indexing, so an
    empty link or an empty blogUrl no longer raises IndexError.

    NOTE: this is plain concatenation, not RFC-style resolution; "/x" and
    "../x" are resolved against blogUrl itself, as before.
    """
    if link.startswith("http://") or link.startswith("https://"):
        return link

    if not blogUrl.endswith("/"):
        blogUrl = blogUrl + "/"

    if link.startswith("./"):
        return blogUrl + link[2:]
    if link.startswith(".") or link.startswith("/"):
        return blogUrl + link[1:]
    return blogUrl + link

class FetchUrlThread(threading.Thread):
    def __init__(self, blogUrl, blog_id):
        threading.Thread.__init__(self)
        self.blogUrl = blogUrl
        self.blog_id = blog_id
    def run(self):
        try:
            self.file = urllib.urlopen(self.blogUrl)
        except IOError:
            print self.blogUrl, " couldn't be reached"
        else:
            self.f = formatter.NullFormatter()
            self.p = htmllib.HTMLParser(self.f)
            (self.p).feed((self.file).read())
            (self.p).close()
            (self.file).close()
            (self.p).anchorlist

#             mutex.acquire()
            self.t2 = DatabaseWrite(self.p.anchorlist, self.blogUrl, 
self.blog_id)
            self.t2.dataWrite()
#             mutex.release()

mutex = thread.allocate_lock()
mutex2 = thread.allocate_lock()
mutex3 = thread.allocate_lock()
mutex4 = thread.allocate_lock()
class DatabaseWrite:
    def __init__(self, anchorlist, blogUrl, blog_id):
        self.anchorlist = anchorlist
        self.blogUrl = blogUrl
        self.blog_id = blog_id
        mutex4.acquire()
        self.cursor = db.cursor()
        mutex4.release()
    def dataWrite(self):
        for self.link in self.anchorlist:
#             print link
            if(url_check(self.link)):
                self.link = url_convert(self.blogUrl, self.link)
                
                print self.link

                if(urlDictionary.has_key(self.link)):
                    mutex2.acquire()
                    urlDictionary[self.link][1] = 
urlDictionary[self.link][1] + 1
                    mutex2.release()
                else:

                    self.sqlclause = 'select nextval(\'url_id_seq\')'
                    self.cursor.execute(self.sqlclause)
                    self.result = (self.cursor).fetchone()
                    self.url_id = str((self.result)[0])
                    self.sqlclause = 'insert into url ( url_id, url, 
quantity) values (%s, %s, %s)'

                    (self.cursor).execute(self.sqlclause, ( self.url_id, 
self.link, 1))
                    self.sqlclause = 'insert into blog_url_map (blog_id, 
url_id) values (%s, %s)'
                    (self.cursor).execute(self.sqlclause, ( self.blog_id, 
self.url_id))
                    mutex3.acquire()
                    urlDictionary[self.link] = [self.url_id, 0]
                    mutex3.release()

# Driver: start one FetchUrlThread per unvisited blog, wait for all of them,
# then write the collected counters back and commit.
sqlselect = 'select url, blog_id from blog where visited_p = \'f\''
cursor.execute(sqlselect)
result2 = cursor.fetchall()

threads = []
for row in result2:

    blogUrl = str(row[0])
    blog_id = str(row[1])

    t = FetchUrlThread(blogUrl, blog_id)
    t.start()
    threads.append(t)

# Wait for every worker before reading urlDictionary.  Without the join the
# update loop and the commit below ran while the threads were still fetching
# and inserting, so their work was missed.
for t in threads:
    t.join()

sqlclause = 'update url set quantity = %s where url_id = %s'
for link in urlDictionary.keys():
    cursor.execute(sqlclause, (urlDictionary[link][1],
                               urlDictionary[link][0]))

db.commit()








More information about the Python-list mailing list