help on threading in python... newbie.. a lot of questions..
eugene kim
eugene1977 at hotmail.com
Tue Oct 15 10:41:02 EDT 2002
I appended the entire code at the bottom.
This program reads URLs from a database,
creates a thread for each URL to be fetched via urllib,
gets the anchor list and forms valid, complete links (which start with http://).
Each thread creates a database-handler object to insert the links into the
database.
db.cursor() and a dictionary (urlDictionary) are global variables,
so I tried to protect them with mutex.acquire()/release().
What's really confusing is: "is a class variable (self.*) shared among threads?"
If I lock "creating the database-handler object and calling its method" with
a mutex, there's no error,
but if I change it to a class variable, errors start popping up.
Additional questions:
1) urlopen sometimes fails to grab pages (which are accessible by browser).
I guess it's because urlopen only tries for too short a time. Can I give it
more time to try?
2) Is my approach acceptable in terms of general structure (threading, etc.),
or should I rewrite most of it?
#!/usr/local/bin/python
import sys
sys.path.append('/usr/local/lib/python2.2/site-packages')
import threading, thread
import formatter
import htmllib
import urllib
import string
from pyPgSQL import PgSQL
# Shared bookkeeping for every worker thread: maps an absolute URL to a
# two-item list [url_id, count] (entries are created by DatabaseWrite).
urlDictionary = {}

# A single PostgreSQL connection for the whole run, plus the cursor the
# main script uses for its own SELECT / UPDATE statements.
db = PgSQL.connect(
    database='bloglog',
    user='postgres',
)
cursor = db.cursor()
def url_check(link):
    """Tell whether *link* (an href taken from a page) should be recorded.

    Rejected are:
      * empty hrefs and one-character hrefs (e.g. "#" or "/"),
      * "mailto:" links,
      * "javascript:" links.

    Returns a true value for every other link, a false value otherwise.
    """
    # "<= 1" rather than "== 1": an empty href used to pass this check and
    # then blew up in url_convert(), which indexes link[0].
    return not (len(link) <= 1
                or link.startswith("mailto:")
                or link.startswith("javascript:"))
def url_convert(blogUrl, link):
    """Turn *link*, found on the page at *blogUrl*, into an absolute URL.

    blogUrl -- URL of the page the link was found on; treated as a
               directory, i.e. a trailing "/" is added when missing.
    link    -- the raw href value.

    Links that already start with "http://" or "https://" are returned
    unchanged.  Anything else is appended to blogUrl after stripping a
    leading "./", "." or "/".  Note that ".." segments are not resolved and
    a leading "/" is taken relative to blogUrl, not to the site root.
    """
    # https:// links are absolute too; previously they were glued onto
    # blogUrl as if they were relative paths.
    if link.startswith("http://") or link.startswith("https://"):
        return link

    # endswith() instead of blogUrl[-1] so an empty base does not raise.
    if not blogUrl.endswith("/"):
        blogUrl = blogUrl + "/"

    if link.startswith("./"):
        return blogUrl + link[2:]
    # startswith() instead of link[0] so an empty link does not raise.
    if link.startswith(".") or link.startswith("/"):
        return blogUrl + link[1:]
    return blogUrl + link
class FetchUrlThread(threading.Thread):
    """Worker thread: download one blog page and store the links it contains.

    blogUrl -- URL of the page to fetch.
    blog_id -- identifier of the blog, passed through to DatabaseWrite.
    """

    def __init__(self, blogUrl, blog_id):
        threading.Thread.__init__(self)
        self.blogUrl = blogUrl
        self.blog_id = blog_id

    def run(self):
        """Fetch the page, parse its anchors and hand them to DatabaseWrite.

        A page that cannot be opened (IOError) is reported on stdout and
        skipped; nothing is written for it.
        """
        try:
            page = urllib.urlopen(self.blogUrl)
        except IOError:
            # Same output as the former `print url, " couldn't be reached"`.
            print("%s  couldn't be reached" % self.blogUrl)
            return

        # Everything below is per-run scratch data, so it lives in local
        # variables rather than on the thread object.
        parser = htmllib.HTMLParser(formatter.NullFormatter())
        try:
            parser.feed(page.read())
            parser.close()
        finally:
            # Release the connection even when reading or parsing fails.
            page.close()

        # No lock is taken here; DatabaseWrite does its own locking around
        # the shared structures it touches.
        writer = DatabaseWrite(parser.anchorlist, self.blogUrl, self.blog_id)
        writer.dataWrite()
# Module-level locks shared by all worker threads.
mutex = threading.Lock()
mutex2 = threading.Lock()
mutex3 = threading.Lock()
mutex4 = threading.Lock()
class DatabaseWrite:
    """Store the links found on one blog page.

    anchorlist -- raw href values collected from the page.
    blogUrl    -- URL of the page, used to make relative links absolute.
    blog_id    -- identifier written to blog_url_map for new links.

    The global connection `db` and the global `urlDictionary` are shared by
    every thread, so all access to either of them happens while holding the
    single global lock `mutex`.
    """

    def __init__(self, anchorlist, blogUrl, blog_id):
        self.anchorlist = anchorlist
        self.blogUrl = blogUrl
        self.blog_id = blog_id
        mutex.acquire()
        try:
            self.cursor = db.cursor()
        finally:
            mutex.release()

    def dataWrite(self):
        """Record every acceptable link of the page.

        Links rejected by url_check() are skipped.  Each remaining link is
        made absolute with url_convert(), echoed to stdout and recorded.
        """
        for link in self.anchorlist:
            if not url_check(link):
                continue
            link = url_convert(self.blogUrl, link)
            print(link)
            # One lock around the whole "look up, then insert or count"
            # step: checking the dictionary outside the lock let two
            # threads both decide a link was new and insert it twice.
            # NOTE(review): this also serializes use of the shared
            # connection, assuming the driver does not allow threads to
            # share one connection -- confirm against its `threadsafety`.
            mutex.acquire()
            try:
                self._record(link)
            finally:
                mutex.release()

    def _record(self, link):
        """Count one occurrence of *link*; caller must hold `mutex`."""
        if link in urlDictionary:
            urlDictionary[link][1] = urlDictionary[link][1] + 1
            # NOTE(review): no blog_url_map row is added when the link was
            # first seen on another blog -- confirm this is intended.
            return

        self.cursor.execute("select nextval('url_id_seq')")
        url_id = str(self.cursor.fetchone()[0])
        self.cursor.execute(
            'insert into url ( url_id, url, quantity) values (%s, %s, %s)',
            (url_id, link, 1))
        self.cursor.execute(
            'insert into blog_url_map (blog_id, url_id) values (%s, %s)',
            (self.blog_id, url_id))
        # Start at 1 to match the quantity just inserted; starting at 0
        # made the final "update url set quantity" write one too few.
        urlDictionary[link] = [url_id, 1]
# Main script: start one FetchUrlThread per unvisited blog, wait for all of
# them, then write the collected per-URL counts back and commit.
sqlselect = "select url, blog_id from blog where visited_p = 'f'"
cursor.execute(sqlselect)
result2 = cursor.fetchall()

workers = []
for row in result2:
    blogUrl = str(row[0])
    blog_id = str(row[1])
    t = FetchUrlThread(blogUrl, blog_id)
    t.start()
    workers.append(t)

# Wait for every worker before touching urlDictionary: without the join the
# update loop and the commit below ran while workers were still inserting.
for t in workers:
    t.join()

sqlclause = 'update url set quantity = %s where url_id = %s'
for link in urlDictionary.keys():
    cursor.execute(sqlclause, (urlDictionary[link][1],
                               urlDictionary[link][0]))
db.commit()
More information about the Python-list
mailing list