Improving the web page download code.

mukesh tiwari mukeshtiwari.iiitm at gmail.com
Tue Aug 27 15:41:10 EDT 2013


Hello All,
I am doing web programming for the first time in Python, so I am looking for suggestions. I wrote the code below to download the titles of web pages while using as few resources (server time, data downloaded) as possible, and it should still be reasonably fast. Initially I used BeautifulSoup for parsing, but the person who is going to use this code asked me not to use it and to use regular expressions instead (the reason being that BeautifulSoup is not fast enough?). Also, initially I was downloading the whole page, but in the end I restricted the read to the first 30000 characters, which is enough to get the title of almost all pages; a rough sketch of how I might cut the download even further follows this paragraph.

Right now I can see only two shortcomings in this code. The first is that when I kill it with SIGINT (Ctrl-C) it dies instantly; I could modify it to process all the elements already in the queue and then die, along the lines of the second sketch below. The second is that there is one IO call per iteration in the downloadurl function (maybe I can use an async IO call, but I am not sure).

I don't have much web programming experience, so I am looking for suggestions to make the code more robust, and also for suggestions on writing more idiomatic Python. top-1m.csv is the file downloaded from Alexa[1].
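
Here is the sketch for reading less data per page. It is untested, and the function name fetchtitle as well as the chunk sizes are just placeholders for illustration; the idea is to read the response a little at a time and stop as soon as the closing </title> tag appears:

import re, urllib2

TITLE_RE = re.compile( '<title[^>]*>(.*?)</title>', re.IGNORECASE | re.DOTALL )

def fetchtitle( url, chunksize = 2048, maxbytes = 30000 ):
	# Read the response in small chunks and stop as soon as the title is
	# found, so most pages transfer far less than 30000 bytes.
	conn = urllib2.urlopen( url, timeout = 10 )
	data = ''
	try:
		while len( data ) < maxbytes:
			chunk = conn.read( chunksize )
			if not chunk:
				break
			data += chunk
			match = TITLE_RE.search( data )
			if match is not None:
				return match.group(1)
	finally:
		conn.close()
	return None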

-Mukesh Tiwari

[1]http://www.alexa.com/topsites. 
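
For the SIGINT shortcoming, this is roughly the shape of the fix I have in mind (untested sketch, trimmed down to the queue handling; readurls here is just a stand-in for the existing chunked csv reading). On Ctrl-C the handler only sets a flag; the producer stops feeding the queue and puts one None sentinel per worker, and each worker exits once it sees the sentinel:

import Queue, signal, threading

NUM_WORKERS = 5

class Downloader( object ):

	def __init__( self ):
		self.q = Queue.Queue( 200 )
		self.stopping = threading.Event()

	def readurls( self ):
		# Placeholder for the existing chunked top-1m.csv reader.
		with open( 'top-1m.csv' ) as f:
			for line in f:
				yield 'http://www.' + line.split( ',' )[1].strip()

	def createurl( self ):
		for url in self.readurls():
			if self.stopping.is_set():
				break
			self.q.put( url )
		# One sentinel per worker so every thread wakes up and exits.
		for _ in xrange( NUM_WORKERS ):
			self.q.put( None )

	def downloadurl( self ):
		while True:
			url = self.q.get()
			if url is None:
				break          # queue drained, exit cleanly
			# ... fetch the page and write the title as before ...

	def handleexception( self, signum, frame ):
		# Only flag the producer; the workers finish whatever is already queued.
		self.stopping.set()

The full code follows.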


import urllib2, os, socket, Queue, thread, signal, sys, re


class Downloader( object ):

	def __init__( self ):
		# Bounded queue so the url producer cannot run too far ahead of the workers.
		self.q = Queue.Queue( 200 )
		# Number of 1024 byte chunks of top-1m.csv processed so far.
		self.count = 0


	def downloadurl( self ):
		# Compile the title regex once instead of on every iteration; [^>]* keeps
		# the match inside the <title ...> tag and re.DOTALL allows multi-line titles.
		regex = re.compile( '<title[^>]*>(.*?)</title>', re.IGNORECASE | re.DOTALL )
		# Open the output file in append mode and write one line per result
		# ( improvement: think of buffering and writing in chunks ).
		with open( 'titleoutput.dat', 'a+' ) as file:
			while True:
				try:
					url = self.q.get()
					# Read only the first 30000 bytes; enough to find the title
					# on almost all pages.
					data = urllib2.urlopen( url, timeout = 10 ).read( 30000 )
					title = regex.search( data )
					if title is None:
						print ''.join( [ url, '  no title found' ] )
						continue
					file.write( ''.join( [ url, ', ', title.group(1), '\n' ] ) )
				except urllib2.URLError as e:
					# HTTPError is a subclass of URLError, so this catches both.
					print ''.join( [ url, '  ', str( e ) ] )
				except Exception as e:
					print ''.join( [ url, '  ', str( e ) ] )
			# The with block calls file.close() automatically.
				

	def createurl( self ):
		# Check if the resume file exists; if not, create it with a default of 0 chunks read.
		if os.path.exists( 'bytesread.dat' ):
			with open( 'bytesread.dat', 'r' ) as f:
				self.count = int( f.readline() )
		else:
			with open( 'bytesread.dat', 'w' ) as f:
				f.write( '0\n' )

		# Reading the csv in 1024 byte chunks is fast, but resuming on a chunk
		# boundary can skip part of a line ( worth it, because reading stays very fast ).
		with open( 'top-1m.csv', 'r' ) as file:
			prefix = ''
			file.seek( self.count * 1024 )
			# Seeking lands in the middle of a line, so discard up to the next newline.
			if self.count: file.readline()
			for chunk in iter( lambda: file.read( 1024 ), '' ):
				lines = chunk.split( '\n' )
				# The first piece continues the line left over from the previous chunk.
				lines[0] = ''.join( [ prefix, lines[0] ] )
				# Each complete line looks like "rank,domain"; queue the domain as a url.
				for line in lines[:-1]:
					self.q.put( ''.join( [ 'http://www.', line.split( ',' )[1] ] ) )
				prefix = lines[-1]
				self.count += 1

			
	# Save the resume offset and exit ( a graceful, queue-draining exit would go here ).
	def handleexception( self, signum, frame ):
		with open( 'bytesread.dat', 'w' ) as file:
			print ''.join( [ 'Number of 1024 byte chunks read ( probably unfinished ) ', str( self.count ) ] )
			file.write( ''.join( [ str( self.count ), '\n' ] ) )
		sys.exit(0)

if __name__ == '__main__':
	u = Downloader()
	signal.signal( signal.SIGINT, u.handleexception )
	thread.start_new_thread( u.createurl, () )
	for i in xrange( 5 ):
		thread.start_new_thread( u.downloadurl, () )
	# Keep the main thread alive so it can receive SIGINT ( note: this spins a cpu core ).
	while True: pass
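
On the idiomatic side, I also wonder whether the __main__ block would be cleaner with the higher-level threading module instead of thread.start_new_thread, sleeping in the main thread rather than spinning in while True: pass. Something like this untested sketch, reusing the Downloader class above:

import signal, threading, time

if __name__ == '__main__':
	u = Downloader()
	signal.signal( signal.SIGINT, u.handleexception )

	producer = threading.Thread( target = u.createurl )
	producer.daemon = True
	producer.start()

	for _ in xrange( 5 ):
		worker = threading.Thread( target = u.downloadurl )
		worker.daemon = True
		worker.start()

	# Sleeping keeps the main thread alive for the signal handler without
	# burning a full cpu core the way a bare busy loop does.
	while True:
		time.sleep( 1 )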
			



