[Tutor] Here is a completed script for everyone's perusal.

montana sarmxiii@knology.net
Tue, 10 Sep 2002 13:32:50 -0500


Hi Everyone-

I've attached a script to this email for everyone to check out. Please 
feel free to make comments or suggestions. This script is a basic news 
retriever from the FoxNews site. It downloads the articles into easily 
readable html that I then transfer over to my Zaurus for reading.  The 
script is as follows:

#!/usr/bin/env python
# A simple Python script that downloads new stories from FoxNews
# Sean Armstrong, 06 September 2002

import re
import urllib, urllister
import sys
import os
import string
import time

class Html:
	"""Find story links on a news front page and download the stories.

	address  -- host (and optional path) of the front page, without the
	            "http://" prefix.
	news_dir -- directory that holds the link snapshot (*.fox) and the
	            downloaded story pages.
	"""

	def __init__(self, address, news_dir="/Users/montana/News/Fox/"):
		self.address = address
		self.news_dir = news_dir

	#connection and parsing
	def connect(self):
		"""Fetch the front page and return the ``urls`` collected by URLLister.

		Presumably a list of href strings -- confirm against urllister.
		"""
		sock = urllib.urlopen("http://" + self.address)
		try:
			page = sock.read()
		finally:
			# Release the connection even if the read fails.
			sock.close()
		parser = urllister.URLLister()
		parser.feed(page)
		parser.close()
		return parser.urls

	#search main doc for story links
	def linkSearch(self):
		"""Return the story links found on the front page, space separated.

		Every call fetches the front page again via connect().
		"""
		link = r'''\/story\/0,\d+,6\d+,00\.html'''
		return " ".join(re.findall(link, " ".join(self.connect())))

	#compare old source file with new
	def compare(self):
		"""Compare the current story links with the saved snapshot.

		If a saved ``*.fox`` snapshot holds exactly the current links, a
		message is printed and the script ends via sys.exit().  Otherwise
		(including the first run, when no snapshot exists) a new snapshot
		is written and the old ones are removed.

		Raises SystemExit when there is nothing to update.
		"""
		# Fetch once, so the value compared is the value written.
		current = self.linkSearch()
		old_names = [n for n in os.listdir(self.news_dir) if n[-3:] == "fox"]
		for name in old_names:
			newsin = open(os.path.join(self.news_dir, name), "r")
			try:
				previous = newsin.read()
			finally:
				newsin.close()
			if previous == current:
				# print(...) with one string behaves the same as the
				# print statement.
				print("Nothing to update. Bye.")
				sys.exit()

		print("News is being updated ...")
		t = time.strftime("%j_%H%M%S_%Y", time.localtime())
		new_name = "news" + t + ".fox"
		newsout = open(os.path.join(self.news_dir, new_name), "w")
		try:
			newsout.write(current)
		finally:
			newsout.close()
		for name in old_names:
			# A snapshot written within the same second shares the new
			# file's name; do not delete what was just written.
			if name != new_name:
				os.remove(os.path.join(self.news_dir, name))

	#download desired html links
	def download(self):
		"""Download every story link into news_dir.

		Each file is named after the link with its leading "/story/"
		(7 characters) removed.
		"""
		for link in self.linkSearch().split():
			sock = urllib.urlopen("http://www.foxnews.com" + link)
			try:
				page = sock.read()
			finally:
				sock.close()
			# Opening with "wb" creates the file, so no "touch" is needed.
			output = open(os.path.join(self.news_dir, link[7:]), "wb")
			try:
				output.write(page)
			finally:
				output.close()

			
class StoryExtractor:
	"""Pull the headline and the story text out of a saved story page.

	html -- path of the downloaded HTML file to read.
	"""

	def __init__(self, html):
		self.html = html

	def _read(self):
		"""Return the whole content of the story file."""
		f = open(self.html, "r")
		try:
			return f.read()
		finally:
			# Close the file even if the read fails.
			f.close()

	#search and crop headers from html
	def cropHeader(self):
		"""Return the headline section, including its marker comments.

		Matches are joined with a space; "" is returned when the page
		has no headline markers.  The pattern does not span lines.
		"""
		headline = "%s.+%s" % ("<!--Headline-->", "<!--/Headline-->")
		return " ".join(re.findall(headline, self._read()))

	#search and crop text from html
	def crop(self):
		"""Return the story section with every <TABLE>...</TABLE> removed.

		The result keeps the Storytext marker comments; each removed
		table is replaced by a single space.  "" is returned when the
		page has no story markers.
		"""
		story = "(?s)%s.+%s" % ("<!--Storytext-->", "<!--/Storytext-->")
		newhtml = " ".join(re.findall(story, self._read()))
		# Non-greedy on purpose: a greedy ".+" would run from the first
		# <TABLE to the last </TABLE> and delete the story text between
		# separate tables.  (A nested table leaves its outer remainder
		# behind, which is preferable to losing story text.)
		middle = "(?s)%s.+?%s" % ("<TABLE", "</TABLE>")
		return re.sub(middle, " ", newhtml)

	
if __name__ == "__main__":
	# Refresh the link snapshot, then fetch the raw story pages.
	news_dir = "/Users/montana/News/Fox/"
	start = Html("www.foxnews.com")
	start.compare()
	start.download()

	# Raw story pages sitting in the news directory; these are converted
	# below and deleted at the end.
	raw_pages = [name for name in os.listdir(news_dir) if name[-5:] == ".html"]

	# Each run gets its own time-stamped output directory.
	t = time.strftime("%j_%H%M%S_%Y", time.localtime())
	savedir = news_dir + t + "/"
	os.mkdir(savedir)

	headera = '''<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
         			"http://www.w3.org/TR/1999/REC-html401-19991224/loose.dtd">
					<html lang="en">
					<head>
					<meta http-equiv="content-type" content="text/html; charset=iso-8859-1">
				<title>'''
	headerb = '''</title>
					<meta name="generator" content="BBEdit 6.5.2">
					</head>
					<body>'''
	footer = '''</body>
				</html>'''

	# Write one trimmed page per story and remember its index entry.
	title_pattern = "%s(.+)%s" % ("<!--Headline-->", "<!--/Headline-->")
	entries = []
	count = 0
	for item in raw_pages:
		count += 1
		story = StoryExtractor(news_dir + item)
		headline = story.cropHeader()
		body = story.crop()
		newstory = (headera + "Story" + headerb + "<h3><u>" + headline +
			"</u></h3>" + body + footer)
		story_name = "story" + str(count) + ".html"
		outfile = open(savedir + story_name, "w")
		try:
			outfile.write(newstory)
		finally:
			outfile.close()
		# Link text is the headline without its marker comments; fall back
		# to the file name when the page had no headline markers.
		titles = re.findall(title_pattern, headline)
		if titles:
			entries.append((story_name, titles[0]))
		else:
			entries.append((story_name, story_name))

	# Build the index page linking to every story written above.
	f = open(savedir + "index" + t + ".html", "w")
	try:
		f.write(headera + "FoxNews" + headerb)
		for story_name, title in entries:
			f.write('''<a href="''' + story_name + '''">''' + title + "</a><br><br>")
		f.write(footer)
	finally:
		f.close()

	# Remove the raw downloads now that the trimmed copies exist.
	for item in raw_pages:
		os.remove(news_dir + item)

Let me know of any bugs you come across please.

Thanks.
SA

"I can do everything on my Mac I used to do on my PC, plus alot more 
..."
--Me