[Tutor] finding title tag
Samir Patel
sampatel@cs.rmit.edu.au
Mon, 15 Oct 2001 16:05:42 +1000 (EST)
hi,
I have written a program that takes a URL as a command-line argument,
finds the links present in that web page, and repeats this for every link
found, down to a depth of 3 or more.
I store these links in a global list. My problem is that I want to
find out whether a title attribute is present in each link, e.g.
<a href= "this is url" title = "i want this">or this</a>
If there is a title attribute, I want to store its value alongside the
corresponding link.
The following is the code I have written so far, and I want to extend
it:
import re
import sys
import urllib
from formatter import NullFormatter
from htmllib import HTMLParser
from string import find,lower
from urlparse import urljoin,urlparse
def findlinks(url):
    """Fetch *url* and return the link targets found in its HTML.

    The page is parsed with htmllib's HTMLParser; the value returned is the
    parser's ``anchorlist`` attribute (the href values of the anchors it
    saw, in document order).

    Returns an empty list if the page cannot be opened (IOError from
    ``urllib.urlopen``).
    """
    try:
        fp = urllib.urlopen(url)
    except IOError:
        # Best-effort crawl: an unreachable page simply contributes no links.
        return []

    # Always release the connection, even if reading fails; the crawler
    # opens many pages recursively and would otherwise leak sockets.
    try:
        html = fp.read()
    finally:
        fp.close()

    parser = HTMLParser(NullFormatter())
    parser.feed(html)
    # Tell the parser the input is complete so buffered data is processed.
    parser.close()
    return parser.anchorlist
def scanpage(ser_name, url, depth, maxindent):
    """Print *url* and recursively crawl the links found on that page.

    ser_name  -- host name of the start page.  When the module-level
                 ``select`` option is '-L', a link is followed only if
                 get_server() reports this host (or None) for it.
    url       -- the page to report and, if depth allows, to scan.
    depth     -- remaining levels to descend; links are followed only
                 while this is greater than zero.
    maxindent -- depth of the outermost call, used to compute how far to
                 indent the printout; pass None on the first call.

    Side effects: prints one line per visited URL, and appends every link
    that passes the https/mailto filters to the module-level
    ``global_list`` (whether or not it is subsequently followed).
    """
    if maxindent is None:
        maxindent = depth
    # Progressively indent printouts from deeper levels (3 spaces each).
    indent = " " * (3 * (maxindent - depth))
    print("%s -> %s" % (indent, url))

    if depth <= 0:
        # Leaf level: the page's links would never be used, so do not
        # download it at all.
        return

    for link in findlinks(url):
        if urlparse(link)[0] == 'https':
            print('Secure site !! Cannot open ....skipped...')
            continue
        if link.lower().find('mailto:') != -1:
            # mailto links are not pages; skip them.
            continue
        if link[:4] != 'http':
            # Relative link (rooted or not): resolve it against the page
            # it came from.  Slicing keeps this safe for an empty href,
            # which urljoin resolves to the page itself.
            link = urljoin(url, link)
        global_list.append(link)

        serve = get_server(link)
        if select == '-L' and not (serve == ser_name or serve is None):
            # Local-only mode: do not leave the starting server.
            continue
        # Recurse into the individual link, one level deeper.
        scanpage(ser_name, link, depth - 1, maxindent)
def get_server(input_url):
    """Return the server part of *input_url*, or None if there is none.

    The server is the text between the first '//' and the next '/'
    (so it includes any ':port' suffix).  None is returned when the URL
    contains no '//' or when *input_url* is not a string.
    """
    try:
        after_scheme = re.split('//', input_url)[1]
    except (IndexError, TypeError):
        # IndexError: no '//' in the URL; TypeError: not a string (e.g. None).
        # Anything else (KeyboardInterrupt, ...) is deliberately not swallowed.
        return None
    # re.split always yields at least one element, so [0] cannot fail.
    return re.split('/', after_scheme)[0]
# ---------------------------------------------------------------------------
# Command-line driver.
#
#   script URL            crawl URL to the default depth of 3
#   script -d DEPTH URL   crawl URL to DEPTH levels
#   script -L URL         crawl URL to depth 3, staying on the start server
#
# ``select`` and ``global_list`` must stay module-level names: scanpage()
# reads the former and appends to the latter.
# ---------------------------------------------------------------------------

def _usage_error(message):
    """Print *message* and terminate with a non-zero exit status."""
    print(message)
    sys.exit(1)


def _crawl(start_url, depth, banner=None):
    """Validate the arguments, optionally print *banner*, then crawl.

    start_url -- must use the 'http' scheme, otherwise the script exits.
    depth     -- int or numeric string; a non-integer value exits with the
                 generic argument error instead of a traceback.
    banner    -- text printed just before crawling starts, if given.
    """
    parts = urlparse(start_url)
    if parts[0] != 'http':
        # Checking that the URL entered is a valid one.
        _usage_error('---> Invalid URL !! URL should start with http://')
    try:
        depth = int(depth)
    except ValueError:
        _usage_error("Error!! Please give a valid argument")
    if banner is not None:
        print(banner)
    scanpage(parts[1], start_url, depth, None)


try:
    select = sys.argv[1]
except IndexError:
    _usage_error("Error!! Please give a valid argument")

get_depth = None
global_list = []
ser_name = None

if len(sys.argv) == 2:
    # A single argument is taken to be the URL itself.
    _crawl(select, 3)
elif select == '-d':
    try:
        get_depth = sys.argv[2]
        input_url = sys.argv[3]
    except IndexError:
        _usage_error("Error!! Please give a valid argument")
    _crawl(input_url, get_depth, "You have selected the depth option\n")
elif select == '-L':
    # sys.argv[2] is guaranteed to exist here: the one-argument case was
    # handled above, so at least two arguments were supplied.
    input_url = sys.argv[2]
    _crawl(input_url, 3, "You have selected Local access option\n")
else:
    _usage_error("Error !! Invalid argument")
#print "global list",global_list
#thanking you
#samir