code debugging

golu bhardwajjayesh7 at gmail.com
Sun Jul 26 02:23:11 EDT 2009


here is a code which crawls links sent to it. theres some problem with
the retrieve_url function ,plz help me out in debugging the fuction
retrive_url. This function retrives pages and saves them in file
#TODO:Visited dict grows in size it needs to be handled smartly
#Moreover server program needs to be in sync with the client eg.
Myrobot
#Take care of tag - 'if modified since',repeated links,hash links
#This is the client side of the distributed crawling framework
#It gets the list of urls to be crawled
#Then crawls the urls and stores the pages in a temporary archive
#which is then transferred to the server or grey_matter
import httplib
import os
import sys
import urlparse
import urllib2
import urllib
import zipfile
import threading

from socket import *
PAGE_DIR="C:/users/jayesh/
pages/"                                          # directory where the
web pages are stored temporarily
 
# before transfer to the grey_matter
visited=
{}                                                                 # a
dict to remember visited urls
ROBOT_COUNT=4


def fget():
    """ This function retrieves the zipped file
     containing the list of urls from the grey_matter and
     saves them in a local file 'list.txt'. """

    httplib.HTTPConnection.debuglevel=1
    request=urllib2.Request('http://192.168.153.57/list.zip')
#Requesting the zipped file
    request.add_header('Accept-encoding','gzip')           #containing
the list of urls
    opener=urllib2.build_opener()
    flag=1
    s='Waiting for server'
    while flag==1:
     try:
      op=opener.open(request)
      flag=0
     except:
         s=s+'*'
         print s
    f=open('list.zip',"wb")
    f.write(op.read())
    f.close()
    z=zipfile.ZipFile('list.zip')
    p=z.namelist()
    g=open('list.txt',"wb")
    g.write(z.read(p[0]))
    g.close()
    print 'got zipped file'

def compress():
    """ This function compresses the crawled pages and stores them in
        a single compressed file ready to be sent to the
grey_matter."""

    zfile=zipfile.ZipFile('C:/xampp/htdocs/pages.zip',mode='w')
    for fil in os.listdir(PAGE_DIR):
        full=os.path.join(PAGE_DIR,fil)
        zfile.write(full,fil)
        os.remove(full)
    os.rmdir(PAGE_DIR)             #Removing the directory after
transfer to grey_matter


x=0
class robot(threading.Thread):
    """ The main robot class which does the crawling of listed
    urls it recieves from the grey matter. It uses 3 threads which
    crawl the listed urls synchronously."""

    def __init__(self,urllist,urllistlock,dblock):
        threading.Thread.__init__(self)
        self.urllist=urllist
        self.urllistlock=urllistlock
        self.dblock=dblock

    def popurl(self):
        """ This method pops out urls from the urls file one by one
        and sends them for retrieval."""

        self.urllistlock.acquire(1)
        if(len(self.urllist)<1):
            Nexturl=None
        else:
            Nexturl=self.urllist[0]
            if Nexturl[-1]=='\n':Nexturl=Nexturl[:-1]
            del self.urllist[0]
        self.urllistlock.release()
        return Nexturl

    def retrieve_url(self,url):
        """ The main method of the robot class and is called
        run method to retrieve the given urls from the web."""
        global x
        if url is not None:

         try:
            if visited.has_key(url): return
            pieces=urlparse.urlparse(url)
            filepath=pieces[2]
            if filepath != '':
             filepath=filepath[1:]
             filename=filepath.split("/")[-1]
            else:
              filename=x+'.htm'
              x+=1

            path=os.path.join(PAGE_DIR,filename)
            url=urlparse.urlunparse(pieces)
            p=url.rfind('#')                   #temporary
            if p!=-1:
                url=url[:p]

            visited[url]=1
            m=urllib2.urlopen(url)

            fopen=open(path,'wb')

            fopen.seek(0)
            fopen.write(url+'|')

            fopen.write(m.read())
            fopen.close()
            print url ,'retrieved'

         except IOError:
            print url
            print "ERROR:OOPS! THE URL CAN'T BE RETRIEVED"

        return

    def run(self):
        while(1):
            url=self.popurl()
            if url is None:
                break
            try:
             self.retrieve_url(url)
            except:sys.exit()

if __name__=='__main__':

  s=socket(AF_INET,SOCK_STREAM)
  s.bind(('',444))
  s.listen(5)
  q,v=s.accept()
  count=1
  print 'Connecting...'
  while 1:
    print 'Phase: %s' %(count)
    message=q.recv(3)

    if(message!='yes'):continue
    print 'Connected'
    count=count+1
    fget()             # Calling the fget method to get the url list
from
                       # grey_matter(server).
    try:
     os.mkdir(PAGE_DIR)
    except: print 'Cant make dir'
    try:
     f=open('list.txt','r')
     urllist=f.readlines()
     f.close()
    except:
        print 'Error opening urls file'
        sys.exit()
    print 'startting threads'
    urllistlock=threading.Lock()
    dblock=threading.Lock()
    botlist=[]
    for X in range(0,ROBOT_COUNT):
         newbot=robot(urllist,urllistlock,dblock)
         newbot.setName('X')
         botlist.append(newbot)
         newbot.start()

    for X in range(0,ROBOT_COUNT):
         botlist[X].join()

    compress()
    try:
     q.send('yes')
    except:
        print 'socket disconnected'
        print sys.exit()





More information about the Python-list mailing list