Request for tips on my first python script.

Lex Hider lexhider at internode.on.net
Thu Sep 7 21:28:08 EDT 2006


Hi,
Apologies if this is against etiquette. I've just got my first Python app up
and running: a podcast aggregator that depends on feedparser. I've really
only learnt enough Python to get it working.

Any tips on the code quality and use of Python would be appreciated; I've got
a feeling the overall structure is up the creek. It's approx. 220 LOC.
file: GodCast.py

Cheers,
Lex.

#!/usr/bin/python
# GodCast: podcast aggregator!
# depends on wget & lynx
# * one of the main features of GodCast is its light use of bandwidth:
#   many podcatchers re-fetch every feed on every run, whereas GodCast
#   caches each feed and checks it at most maxChecksPerDay times a day

# http://www.faqts.com/knowledge_base/view.phtml/aid/422/fid/17
# TODO: not found log
# TODO:
# config file
# opml feed list?
# pygtk/pyqt/qtkde gui?

# possible flags: --test (print what would be done, but don't actually do it)

import re, feedparser, os, sys, shutil, time, getopt
import urllib2
import urllib
import md5

boz = ""
HOME = os.path.expanduser("~")
# user configurable
#maxChecksPerDay = 8
#maxChecksPerDay = 12
maxChecksPerDay = 24
myTemp = '/tmp'
#podDir = os.path.join(HOME, 'Audio/Podcasts')
podDir = os.path.join(HOME, 'Podcasts')
# end user configurable
downDir = os.path.join(myTemp, 'Podcasts')
dotDir = os.path.join(HOME, '.aGodCast')
logFile = os.path.join(dotDir, 'log') #list of downloaded urls
cacheDir = os.path.join(dotDir, 'cache')
ignoreNotFound = False # if true, add files not found to log
# list of feeds, ignore lines not beginning ^http
feedList = os.path.join(dotDir, 'feeds.txt')
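# a sketch for the "config file" TODO above (hypothetical file name and
# section/option names, using the ConfigParser in the Python 2 stdlib):
#   import ConfigParser
#   cfg = ConfigParser.ConfigParser()
#   cfg.read(os.path.join(dotDir, 'config'))
#   if cfg.has_option('godcast', 'maxChecksPerDay'):
#       maxChecksPerDay = cfg.getint('godcast', 'maxChecksPerDay')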


def exitFunc():
    #f.close()
    #log.close()
    if boz:
        print boz


def makeDirs(*dirs):
    for d in dirs: # 'd' rather than 'dir', which would shadow the builtin
        if not os.path.exists(d):
            os.makedirs(d)


# render is used because feeds use a lot of html, not just plain text.
def render(html):
    if html:
        html = re.sub('"', '\\"', html.encode('utf8'))
        #command = 'echo "' + html + '" | w3m -dump -T text/html'
        #command = 'echo "' + html + '" | html2text'
        command = 'echo "' + html + '" | lynx -dump -stdin -force_html'
        os.system(command)
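

# an alternative sketch (untested, hypothetical name): pipe the HTML straight
# to lynx via subprocess (Python 2.4+) instead of building a shell string,
# sidestepping the quoting/injection issues of the echo pipeline above
def renderViaPipe(html):
    if html:
        import subprocess # local import keeps the sketch self-contained
        p = subprocess.Popen(['lynx', '-dump', '-stdin', '-force_html'],
                             stdin=subprocess.PIPE)
        p.communicate(html.encode('utf8'))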


def localMD5(url):
    # 'name' rather than 'hash', which would shadow the builtin
    name = md5.new(url).hexdigest() + '.xml' # unique name from url
    return os.path.join(cacheDir, name)


def cache(url):
    maxAge = 60 * 60 * 24 / maxChecksPerDay # seconds; don't shadow max()
    myfile = localMD5(url)
    if os.path.isfile(myfile):
        elapsed = int(time.time()) - os.path.getmtime(myfile)
        if elapsed <= maxAge:
            return
    print "FETCHING:", url + ' ...'
    urllib.urlretrieve(url, myfile)
    # handle half finish?
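

# one answer to the "half finish" question above (a hypothetical helper,
# assuming POSIX rename semantics): fetch to a temporary name and rename
# only on success, so a partial download never poses as a finished file;
# cache() could call this in place of the bare urlretrieve
def fetchAtomically(url, dest):
    tmp = dest + '.part'
    urllib.urlretrieve(url, tmp)
    os.rename(tmp, dest) # atomic when tmp and dest share a filesystem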


def updateCache(feeds):
    l = []
    print "updating local xml cache..."
    for feed in file(feeds, "r").read().split('\n'):
        if not re.match('^http://', feed): # feedList ignores anything but urls
            continue
        # TODO: handle whitespace, strip trailing
        cache(feed)
        l.append([localMD5(feed), feed])
    print "cache up to date"
    return l
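

# a hypothetical helper for the whitespace TODO above: strip each line so
# trailing spaces or a stray CR can't hide an otherwise valid feed URL;
# updateCache's loop could iterate over readFeedURLs(feeds) instead
def readFeedURLs(feeds):
    return [line.strip() for line in file(feeds)
            if line.strip().startswith('http://')]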


def geturl(url):
    # returns the final (post-redirect) URL on success, the HTTP error code
    # as an int on HTTPError, or 0 on any other failure
    try:
        redir = urllib2.urlopen(url).geturl()
    except urllib2.HTTPError, e:
        if e.code != 404:
            print url
            print "geturl HTTPError:", e.code
        return e.code
    except urllib2.URLError, e:
        # (110, 'Connection timed out')
        print e.reason
        #print "geturl URLError:", e.code
    else:
        return redir
    return 0
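

# a sketch of a simpler contract (hypothetical variant): always return a
# (url, errorcode) pair, so callers test one slot instead of switching on
# the return type the way downloadQueue does below
def geturlOrError(url):
    try:
        return urllib2.urlopen(url).geturl(), None
    except urllib2.HTTPError, e:
        return None, e.code
    except urllib2.URLError, e:
        print e.reason
        return None, None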


def htmlTitle(mainTitle, subTitle):
    s = '<HR>'
    s += '<H2>' + mainTitle + '</H2>'
    s += '<H3>' + subTitle + '</H3>'
    return s


def downloadPod(url, dest):
    kb = 2
    success = 0
    command = 'wget --continue -O "' + dest + '" "' + url + '"'
    status = os.system(command)
    if status == success:
        return True
    else:
        print "\nWGET:", status
        if status == kb:
            pass
            #raise KeyboardInterrupt
        return False
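
    # note on the status check above (the POSIX detail): os.system returns
    # the raw wait() status, so wget's exit code n arrives as n << 8, while
    # a bare 2 usually means the child died on SIGINT (ctrl+c), matching the
    # kb guess; to inspect it:
    #   exit_code = status >> 8   # wget's own exit code
    #   signal_no = status & 0x7f # nonzero if the child died on a signal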


def downloadQueue(q, latest):
    for x in range(latest):
        for [feedTitle, castList] in q:
            if x >= len(castList): # this feed has fewer than x+1 casts
                continue
            cast = castList[x]
            if cast is None:
                continue
            url = cast.enclosures[0]['href']
            redirect = geturl(url) # TRAFFIC
            if type(redirect) != int: #success
                render(htmlTitle(feedTitle + ": #" + str(x+1), cast.title))
                render(cast.description)

                podFile = os.path.basename(redirect).split('?')[0]
                permDir = os.path.join(podDir, feedTitle)
                permFile = os.path.join(permDir, podFile)
                tempDir = os.path.join(downDir, feedTitle)
                tempFile = os.path.join(tempDir, podFile)
                if not os.path.isfile(permFile):
                    makeDirs(tempDir, permDir)
                    if downloadPod(redirect, tempFile): # TRAFFIC
                        shutil.move(tempFile, permFile)
                        log(url)
                    else:
                        print "EXITING"
                        sys.exit(2)
                else:
                    render("<BR>*** ON HARD-DRIVE ***")
                    log(url)
            elif redirect == 404:
                print 'NOT FOUND:', url
                if ignoreNotFound:
                    print '\tWILL NO LONGER ATTEMPT TO DOWNLOAD\n'
                    log(url)
            else:
                sys.exit(2)


def log(url):
    file(logFile, 'a').write(url + "\n")


def main(args):
    global ignoreNotFound, boz # assigned below; without this they'd be locals
    sys.exitfunc = exitFunc
    makeDirs(dotDir, podDir, downDir, cacheDir)
    # create the log file if it doesn't exist; may be a better solution?
    file(logFile, 'a').close()
    latest = 13 # get the first 'latest' casts from each feed

    try:
        opts, args = getopt.getopt(sys.argv[1:], "l:", ["latest=", "notfound"])
    except getopt.GetoptError:
        #usage()
        sys.exit(2)
    
    for opt, arg in opts:
        if opt in ("-l", "--latest"):
            latest = int(arg)
        elif opt == "--notfound": # 'in ("--notfound")' was a substring test
            ignoreNotFound = True # add notfound files to log
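    # example invocations (given the flags above):
    #   python GodCast.py --latest 5   # queue only the 5 newest casts
    #   python GodCast.py --notfound   # log 404s so they aren't retried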
    
    Q = []
    for [xmlFile, url] in updateCache(feedList):
        output = ""
        xml = feedparser.parse(xmlFile)
        if xml.channel.has_key('title'): #skip dodgy feeds
            itemQ = []
            for item in xml['items'][:latest]:
                if item.has_key('enclosures'):
                    podURL = item.enclosures[0]['href']
                    # check if url is already in the log
                    if file(logFile, 'r').read().find(podURL) < 0:
                        itemQ.append(item)
                        output += htmlTitle(xml.channel.title, item.title)
                        output += item.description 
                    else:
                        itemQ.append(None)
            Q.append([xml.channel.title, itemQ])
        else:
            print "DODGY FEED:", url
            if xml.bozo:
                boz += "BOZO: " + xml.bozo_exception.getMessage() + "\t" + url
            sys.exit(2) # alternative: time.sleep(1); continue (allows ctrl+c)
        render(output)
    
    if Q: # Q is never None here, so test that the queue is non-empty
        render('<HR><H1>DOWNLOADING QUEUE</H1><HR>')
        downloadQueue(Q, latest)

######################################################
if __name__=="__main__":
    main(sys.argv)


