[Tutor] finding title tag

Samir Patel sampatel@cs.rmit.edu.au
Mon, 15 Oct 2001 16:05:42 +1000 (EST)


Hi,
I have written a program which takes a URL as a command-line argument
and finds the links present in that web page; this continues recursively
to a depth of 3 or more.
I store these links in a global list. Now my problem is that I want to
find out whether there is a title attribute present in each link, e.g.

<a href= "this is url" title = "i want this">or this</a>

If there is a title attribute, then I want to store its value alongside
the corresponding link.

The following is the code which I have written, and I want to extend it.

import re
import sys
import urllib
from formatter import NullFormatter
from htmllib import HTMLParser
from string import find,lower
from urlparse import urljoin,urlparse



def findlinks(url):
    """Fetch the page at *url* and return the list of href targets found.

    Returns an empty list when the page cannot be fetched.
    """
    try:
        fp = urllib.urlopen(url)
    except IOError:
        # Could not fetch the page -- treat it as having no links.
        return []

    try:
        parser = HTMLParser(NullFormatter())
        parser.feed(fp.read())
        parser.close()          # flush any data still buffered in the parser
    finally:
        fp.close()              # always release the network connection

    return parser.anchorlist    # every href the parser saw on the page

def scanpage(ser_name,url, depth, maxindent):


   linklist = findlinks(url)

   if maxindent == None:

	maxindent = depth

   indent = " " * (3*(maxindent - depth))  # this will let us
progressively indent printouts from deeper levels

   print "%s   -> %s" % (indent, url)

   if depth > 0:

       for link in linklist:

	   pars = urlparse(link)

	   if pars[0] == 'https':
	       print 'Secure site !! Cannot open ....skipped...'
	       continue

	   if find(lower(link),'mailto:') != -1:   # check if the link is
a mailto tag
	       continue
	   if link[:4] != 'http' and link[0] == '/' :

	     link = urljoin(url,link)

	   if link[:4] != 'http'and link[0] != '/' :
	     link = urljoin(url,link)

	   global_list.append(link)
	   serve = get_server(link)

	   if select == '-L':
	       if serve == ser_name or serve == None:
		   scanpage(ser_name,link, depth - 1, maxindent)
#function is again called with individual url
	   else:
	       scanpage(ser_name,link, depth - 1, maxindent)

def get_server(input_url):
    """Return the server (host) part of *input_url*.

    E.g. 'http://www.rmit.edu.au/index.html' -> 'www.rmit.edu.au'.
    Returns None when the URL contains no '//' scheme separator.
    """
    try:
        # 'http://host/path' -> ['http:', 'host/path'] -> 'host'
        after_scheme = re.split('//', input_url)[1]
        return re.split('/', after_scheme)[0]
    except IndexError:
        # No '//' in the URL -- cannot determine a server name.
        return None
try:
   select = sys.argv[1]
except IndexError:
   print "Error!! Please give a valid argument"
   sys.exit()

get_depth = None
global_list = []
ser_name = None

if len(sys.argv) == 2:

   parsed = urlparse(select)

   if parsed[0] != 'http':
         print '---> Invalid URL !! URL should start with http://'
#checking if the url entered is a valid one
	 sys.exit()

   scanpage(parsed[1],select,3,None)

elif select == '-d' or select == '-L':
   if select == '-d':

      try:
         get_depth = sys.argv[2]
         input_url = sys.argv[3]
      except IndexError:
         print "Error!! Please give a valid argument"
         sys.exit()

      url_parsed = urlparse(input_url)

      if url_parsed[0] != 'http':
         print '---> Invalid URL !! URL should start with http://'
#checking if the url entered is a valid one
	 sys.exit()

      print "You have selected the depth option\n"
      scanpage(url_parsed[1],input_url,int(get_depth),None)


   elif select == '-L':

      try:
         input_url = sys.argv[2]
      except IndexError:
         print "Error!! Please give a valid argument"
         sys.exit()

      url_parsed = urlparse(input_url)
      if url_parsed[0] != 'http':
         print '---> Invalid URL !! URL should start with http://'
#checking if the url entered is a valid one
	 sys.exit()

      print "You have selected Local access option\n"
      scanpage(url_parsed[1],input_url,3,None)

else:
    print "Error !! Invalid argument"
    sys.exit()
#print "global list",global_list



#thanking you

#samir