speed problems

Hans-Peter Jansen hpj at urpla.net
Thu Jun 10 14:27:22 EDT 2004


Hi Axel & Pythoneers,

I played around with your scripts, and my winner got a bit longer than 
usual. I hope the important part didn't suffer too much, but thanks 
to a cheat, this one is faster than your original Perl script, even with 
profiling and annotations enabled! Talking 'bout the latter: shamelessly
stolen from a Zope check-in by our master master, because I couldn't get 
hotshot to produce useful per-line statistics out of the box. 
http://mail.zope.org/pipermail/zope-cvs/2002-May/001035.html

Well, although I have to admit that Perl seems faster on this specific 
task (since the grep cheat would work for Perl too), I would never 
consider such a move. Just try to do this with Perl:

---8<--- [virstat.py] ---8<---
#!/usr/bin/python

import os
import re

# Logfiles to scan, in this order.  dovirstat() skips entries that do not
# exist as regular files.
maillogs = [
            "mail",
            "mail-20040600.gz",
            "mail-20040604.gz",
            "mail-20040607.gz",
            "mail-20040610.gz"
           ]

# Commands that decompress a file to stdout; the commented-out lines are
# alternative spellings of the same tools.
#gzip = "/usr/bin/gzip -dc"
#bzip2 = "/usr/bin/bzip2 -dc"
gzip = "/usr/bin/zcat"
bzip2 = "/usr/bin/bzcat"
virstat = {}    # virus name -> number of hits, filled by dovirstat()
total = 0       # number of hits over all names, updated by dovirstat()
doprof = 1      # true: run dovirstat() under hotshot and annotate the source
# Raw string: "\(" and "\)" are regex escapes, not string escapes, and a
# plain literal only works because Python passes unknown escapes through.
pat = re.compile(r"INFECTED \((.*)\)")

def dovirstat():
  """Count the virus names reported in the mail logs.

  Walks ``maillogs`` in order and silently skips every entry that is not a
  regular file.  Each line matching ``pat`` contributes one hit per
  ", "-separated name found inside the parentheses: per-name counters are
  accumulated in the module-level ``virstat`` dict, the overall number of
  hits in the module-level ``total``.

  Files ending in ``.gz`` / ``.bz2`` are read through the ``gzip`` /
  ``bzip2`` command; everything else is opened directly.
  """
  global virstat, total
  for logfile in maillogs:
    if not os.path.isfile(logfile):
      # missing logfiles are skipped without a message
      continue

    # is it compressed?
    if logfile.endswith('.gz'):
      unpack = gzip
    elif logfile.endswith('.bz2'):
      unpack = bzip2
    else:
      unpack = None

    if unpack is None:
      # uncompressed
      lfd = open(logfile, "r")
    else:
      #XXX: cheating -- grep pre-filters the stream, so only candidate
      # lines ever reach the Python loop.
      # A read-only os.popen() is enough here: os.popen2() additionally
      # returned a pipe to the child's stdin that was never used or closed.
      lfd = os.popen("%s %s | grep INFECTED" % (unpack, logfile))

    try:
      # hot loop
      for line in lfd:
        mo = pat.search(line)
        if mo:
          for vnam in mo.group(1).split(", "):
            virstat[vnam] = virstat.get(vnam, 0) + 1
            total += 1
    finally:
      # release the file / pipe even if reading or matching fails
      lfd.close()

def load_line_info(log):
    """Accumulate profiler time deltas per source location.

    ``log`` yields ``(what, place, tdelta)`` triples.  Entries whose
    ``tdelta`` is not positive are ignored.  Every other entry books its
    delta (and one hit) onto the place of the *preceding* accepted entry,
    so the very first accepted delta is stored under the key ``None``.

    Returns a dict mapping place -> ``(accumulated time, number of hits)``.
    """
    totals = {}
    previous = None
    for _event, location, elapsed in log:
        if not elapsed > 0:
            continue
        spent, hits = totals.get(previous, (0, 0))
        totals[previous] = (elapsed + spent, hits + 1)
        # the next accepted delta is charged to this location
        previous = location
    return totals

def basename(path, cache={}):
    """Return the last component of ``path``, memoising the answer.

    ``cache`` maps already seen paths to their result.  The default dict is
    created once and deliberately shared by all calls, so each distinct
    path is split only the first time it is seen.
    """
    if path not in cache:
        _head, tail = os.path.split(path)
        cache[path] = tail
    return cache[path]

def print_results(results):
    for info, place in results:
        if not place:
            print 'Bad unpack:', info, place
            continue
        filename, line, funcname = place
        print '%8d %8d' % info, basename(filename), line

def annotate_results(results):
    """Group per-line statistics by file and print each annotated file.

    ``results`` holds ``(stats, place)`` pairs, where ``stats`` unpacks as
    ``(time, hits)`` and ``place`` as ``(file, line, func)``; pairs with a
    false ``place`` are ignored.  Files are handled in sorted order, and
    only files that exist on disk are passed to annotate(), together with
    their sorted ``(line, hits, time)`` tuples.
    """
    per_file = {}
    for stats, place in results:
        if place:
            elapsed, count = stats
            path, lineno, _func = place
            per_file.setdefault(path, []).append((lineno, count, elapsed))

    for path in sorted(per_file):
        if not os.path.exists(path):
            continue
        entries = per_file[path]
        entries.sort()
        annotate(path, entries)

def annotate(file, lines):
    print "-" * 60
    print file
    print "-" * 60
    f = open(file)
    i = 1
    match = lines[0][0]
    for line in f:
        if match == i:
            print "%6d %8d " % lines[0][1:], line,
            del lines[0]
            if lines:
                match = lines[0][0]
            else:
                match = None
        else:
            print " " * 16, line,
        i += 1
    print

# Collect the statistics, either under the hotshot profiler (recording
# line events into "virstat.prof") or with a plain call.
if doprof:
  import hotshot
  prof = hotshot.Profile("virstat.prof", lineevents=1)
  prof.runcall(dovirstat)
  prof.close()
else:
  dovirstat()

vlist = virstat.keys()
vlist.sort()
for vname in vlist:
  p = (virstat[vname] / float(total)) * 100
  print "%-30s  %5.2f%%" % (vname, p)
print

# After a profiled run: read the profile back, keep the entries whose file
# name is 'virstat.py' and print the annotated source.
if doprof:
  from hotshot.log import LogReader

  log = LogReader("virstat.prof")
  byline = load_line_info(log)
  results = []
  for k, v in byline.items():
    # the key None (time booked before the first location) is dropped here
    if k and k[0] == 'virstat.py':
      results.append((v, k))
  results.sort()
  #print_results(results)
  annotate_results(results)

--->8---

Python programming is not only an easy way to get necessary work done;
at its best, it combines art and science in an aesthetic manner. 

Pete



More information about the Python-list mailing list