Request for tips on my first python script.

Fri Sep 8 06:54:09 EDT 2006

Lex Hider wrote:
> Hi,
> Apologies if this is against etiquette. I've just got my first python app up 
> and running. It is a podcast aggregator depending on feedparser. I've really 
> only learnt enough to get this up and running.
> 
> Any tips on the code quality and use of python would be appreciated. I've got 
> a feeling the overall structure is up the creek.
> approx 220 LOC.
> file: GodCast.py
> 
> #!/usr/bin/python
(snip)
> # TODO: not found log
http://docs.python.org/lib/module-logging.html

> # TODO:
> # config file
http://docs.python.org/lib/module-ConfigParser.html

> # opml feed list?
> # pygtk/pyqt/qtkde gui?
> 
> # possible flags: test, print but don't actual do anything

http://docs.python.org/lib/module-optparse.html

> import re, feedparser, os, sys, shutil, time, getopt
> import urllib2
> import urllib
> import md5
> 
> boz = ""
> HOME = os.path.expanduser("~")
> # user configurable
> #maxChecksPerDay = 8
> #maxChecksPerDay = 12
> maxChecksPerDay = 24
> myTemp = '/tmp'

http://docs.python.org/lib/module-tempfile.html

BTW, note that the most common naming convention in Python is
all_lower_with_underscore (except for ClasseName).

> #podDir = os.path.join(HOME, 'Audio/Podcasts')
> podDir = os.path.join(HOME, 'Podcasts')
> # end user configurable
> downDir = os.path.join(myTemp, 'Podcasts')
> dotDir = os.path.join(HOME, '.aGodCast')
> logFile = os.path.join(dotDir, 'log') #list of downloaded urls
> cacheDir = os.path.join(dotDir, 'cache')
> ignoreNotFound = False # if true, add files not found to log
> # list of feeds, ignore lines not beginning ^http
> feedList = os.path.join(dotDir, 'feeds.txt')
> 
> 
> def exitFunc():
>     #f.close()
>     #log.close()
>     if boz:
>         print boz
> 
> 
> def makeDirs(*dirs):
>     for dir in dirs:
>         if not os.path.exists(dir):
>             os.makedirs(dir)
> 
> 
> # render is used because feeds use a lot of html, not just plain text.
> def render(html):
>     if html:
>         html = re.sub('"', '\\"', html.encode('utf8'))
>         #command = 'echo "' + html + '" | w3m -dump -T text/html'
>         #command = 'echo "' + html + '" | html2text'
>         command = 'echo "' + html + '" | lynx -dump -stdin -force_html'

another way :
command = 'echo "%s" | lynx -dump -stdin -force_html' % html

>         os.system(command)
> 
> 
> def localMD5(url):
>     hash = md5.new(url).hexdigest() + '.xml' #unique name from url
>     return os.path.join(cacheDir, hash)
> 
> 
> def cache(url):
>     max  = 60 * 60 * 24 / maxChecksPerDay #seconds
>     myfile = localMD5(url)
>     if os.path.isfile(myfile):
>         elapsed = int(time.time()) - os.path.getmtime(myfile)
>         if elapsed <= max:
>             return
>     print "FETCHING:", url + ' ...'

Note that stdout is usually meant for normal program outputs (so one can
pipe programs). Error reporting and verbosities should go to stderr

>     urllib.urlretrieve(url, myfile)
>     # handle half finish?
> 
> 
> def updateCache(feeds):
>     l = []
>     print "updating local xml cache..."
>     for feed in file(feeds, "r").read().split('\n'):
>         if not re.match('^http://', feed): # feedList ignores anything but 
> urls
>             continue
>         # TODO: handle whitespace, strip trailing
>         cache(feed)
>         l.append([localMD5(feed), feed])
>     print "cache up to date"
>     return l
> 
> 
> def geturl(url):
>     try:
>         redir =  urllib2.urlopen(url).geturl()
>     except urllib2.HTTPError, e:
>         if e.code != 404:
>             print url
>             print "geturl HTTPError:", e.code
>         return e.code
>     except urllib2.URLError, e:
>         # (110, 'Connection timed out')
>         print e.reason
>         #print "geturl URLError:", e.code
>     else:
>         return redir
>     return 0

I'm afraid you didn't get the point of exception handling - it's meant
to free function results from error code. Your above code totally
defeats this - as is obvious when reading the calling code.

>             redirect = geturl(url) # TRAFFIC
>             if type(redirect) != int: #success
                 do_something_useful_here()
>             elif redirect == 404:
>                 print 'NOT FOUND:', url
>                 if ignoreNotFound:
>                     print '\tWILL NO LONGER ATTEMPT TO DOWNLOAD\n'
>                     log(url)
>             else:
>                 sys.exit(2)

may I suggest a rewrite :

  try:
    redirect =  urllib2.urlopen(url).geturl()
  except urllib2.HTTPError, e:
    if e.code == 404:
      print 'NOT FOUND:', url
      if ignoreNotFound:
        print '\tWILL NO LONGER ATTEMPT TO DOWNLOAD\n'
        log(url)
    else:
      print "geturl HTTPError %s on url %s" % (e.code, url)
      raise
  except urllib2.URLError, e:
    print "geturl URLError %s on url %s" % (e.reason, url)
 else:
   do_something_useful_here()

> def htmlTitle(mainTitle, subTitle):
>     s = '<HR>'
>     s += '<H2>' + mainTitle + '</H2>'
>     s += '<H3>' + subTitle + '</H3>'
>     return s

html_title_template = """
<hr>
<h2>%(title)s</h2>
<h3>%(subtitle)s</h3>
"""

def html_title(title, subtitle):
  return html_title_template % dict(title=title, subtitle=subtitle)

> 
> def downloadPod(url, dest):
>     kb = 2
>     success = 0
>     command = 'wget --continue -O "' + dest + '" "' + url + '"'
command = 'wget --continue -0 "%s" "%s"' % (dest, url)
>     status  =  os.system(command)
>     if status == success:
>         return True
>     else:
>         print "\nWGET:", status
>         if status == kb:
>             pass
>             #raise KeyboardInterrupt
>         return False
> 
> 
> def downloadQueue(q, latest):
>     for x in range(latest):
>         for [feedTitle, castList] in q:
          for feedTitle, castList in q:
>             if not len(castList) > x:

double negations are harder to grasp:
              if len(castList) <= x

>                 continue

I'm not sure to get what you're doing here....

>             cast = castList[x]
>             if cast is None:
>                 continue
>             url = cast.enclosures[0]['href']
>             redirect = geturl(url) # TRAFFIC
>             if type(redirect) != int: #success
>                 render(htmlTitle(feedTitle + ": #" + str(x+1), cast.title))
>                 render(cast.description)

Mixing logic and UI is usually a very bad idea IMHO.

>                 podFile = os.path.basename(redirect).split('?')[0]
>                 permDir = os.path.join(podDir, feedTitle)
>                 permFile = os.path.join(permDir, podFile)
>                 tempDir = os.path.join(downDir, feedTitle)
>                 tempFile =  os.path.join(tempDir, podFile)
>                 if not os.path.isfile(permFile):
>                     makeDirs(tempDir, permDir)
>                     if downloadPod(redirect, tempFile): # TRAFFIC
>                         shutil.move(tempFile, permFile)
>                         log(url)
>                     else:
>                         print "EXITING"
>                         sys.exit(2)

This will make this function hard to use in a different context (like ie
a mod_apache handler, a GUI, etc... You'd better raise some specific
exception and let the caller handle it.

(snip)

> 
> def log(url):
>     file(logFile, 'a').write(url + "\n")

> 
> def main(args):
>     sys.exitfunc = exitFunc
>     makeDirs(dotDir, podDir, downDir, cacheDir)
>     #make file if doesn't exist, may be better solution?
yes : using the logging module

>     X = file(logFile, 'a')
>     latest = 13 #get the first x casts for each feed

>     try:
>         opts, args = getopt.getopt(sys.argv[1:], "l:", 
> ["latest=", "notfound"])
>     except getopt.GetoptError:          
>         sys.exit(2)                     
>         #usage()                         
>     
>     for opt, arg in opts:
>         if opt in ("-l", "--latest"):
>             latest = int(arg)
>         elif opt in ("--notfound"):
>             ignoreNotFound = True #add notfound files to log 

OMG. optparse is far better than this.

(snip)

My overall feeling is that in most function, you're mixing too much
different concerns. Each function should do *one* thing (and do it
well). The most obvious problem here is about mixing application logic
and UI. Application logic (what your app is effectively doing) should be
as independant as possible of UI concerns. If possible, you should be
able to reuse most of the application logic (except for the main() func,
which in your case is the command-line UI) as a module in other
applications. Now there are time when the application code needs to
notify the UI. The good news is that Python makes it easy to makes this
generic, using callback functions or objects provided by the UI and
called when appropriate by the application logic. The key here is to
come up with a clear, well-defined interface between logic code and UI code.

A last point : avoid globals whenever possible. Either pass values as
arguments to functions or use a kind of config object if there are too
much configuration stuff.

My 2 cents
-- 
bruno desthuilliers
python -c "print '@'.join(['.'.join([w[::-1] for w in p.split('.')]) for
p in 'onurb at xiludom.gro'.split('@')])"