[Spambayes-checkins] spambayes/testtools mksets.py,NONE,1.1 incremental.TODO.txt,1.2,1.3 mkgraph.py,1.1,1.2

T. Alexander Popiel popiel at users.sourceforge.net
Fri Feb 28 21:13:10 EST 2003


Update of /cvsroot/spambayes/spambayes/testtools
In directory sc8-pr-cvs1:/tmp/cvs-serv24670

Modified Files:
	incremental.TODO.txt mkgraph.py 
Added Files:
	mksets.py 
Log Message:
Actually include mksets.py.  Doh.

Give mkgraph.py a few options, so it spits out either counts
or error rates, but not both.  Also, it can do n-day averages
instead of cumulative.



--- NEW FILE: mksets.py ---
#! /usr/bin/env python

### Redistribute messages among the classic Data/{Ham,Spam}/Set*
### directories based on the desired number of sets, groups, and
### messages per group per set.  Messages beyond those limits are
### moved into a "reservoir" directory alongside the sets.

"""Usage: %(program)s [OPTIONS] ...

Where OPTIONS is one or more of:
    -h
        show usage and exit
    -s num
        random number seed
    -n num
        number of sets
    -g num
        number of groups
    -m num
        number of messages per {ham,spam}*group*set
"""

import getopt
import sys
import os
import os.path
import glob
import shutil
import random

# Name the script was invoked as; interpolated into the usage text.
program = sys.argv[0]

# Verbosity flag.  Not read by any of the code visible in this script.
loud = True

# Roots of the two corpora that get redistributed.
hamdir, spamdir = "Data/Ham", "Data/Spam"

# Number of Set<k> directories to deal messages into (-n).
nsets = 5

# Optional limits (-g / -m); None means "no limit".
ngroups = nmess = None

def usage(code, msg=''):
    """Write an optional message plus the usage text to stderr, then exit.

    `code` is handed to sys.exit().  When `msg` is non-empty it is written
    first, followed by a blank line.  The usage text is the module
    docstring with the module globals interpolated into it.
    """
    err = sys.stderr
    if msg:
        # "%s" stringifies msg, which may be an exception instance.
        err.write("%s\n" % (msg,))
        err.write("\n")
    err.write("%s\n" % (__doc__ % globals(),))
    sys.exit(code)

def bybasename(a, b):
    """Comparison function ordering two paths by their group prefix.

    The group prefix is the part of each path's basename that precedes
    the first "-".  Returns the cmp() of the two prefixes.
    """
    key_a = os.path.basename(a).split("-", 2)[0]
    key_b = os.path.basename(b).split("-", 2)[0]
    return cmp(key_a, key_b)

def distribute(dir):
    """Deal the messages under `dir` into Set1..Set<nsets>.

    Every file matching dir/*/* is collected, shuffled, and then sorted
    with bybasename(), i.e. by group: the part of the basename before the
    first "-".  Files are then moved round-robin into dir/Set<k>.  A file
    is moved into dir/reservoir instead when it belongs to a group past
    the first `ngroups` groups, or when it is past the first
    nmess * nsets messages of its group.  Existing dir/Set* directories
    that are not among the target sets are removed with os.rmdir() after
    the files have been moved.

    Reads the module-level settings nsets, ngroups and nmess; a value of
    None for ngroups or nmess disables that limit.  Progress is written
    to stdout on a single, carriage-return-terminated line.
    """
    files = glob.glob(os.path.join(dir, "*", "*"))
    random.shuffle(files)
    files.sort(bybasename)

    # Existing Set* directories that are not target sets stay in `trash`
    # and are removed at the end.
    trash = glob.glob(os.path.join(dir, "Set*"))
    for setnum in range(1, nsets + 1):
        name = os.path.join(dir, "Set%d" % setnum)
        # Only create what is missing; a real failure (e.g. permissions)
        # is reported here rather than surfacing later in os.rename().
        if not os.path.isdir(name):
            os.makedirs(name)
        if name in trash:
            trash.remove(name)
    reservoir = os.path.join(dir, "reservoir")
    if not os.path.isdir(reservoir):
        os.makedirs(reservoir)

    oldgroup = None     # None never equals a real group prefix
    cgroups = 0         # number of distinct groups seen so far
    cmess = 0           # number of messages seen in the current group
    cset = 1            # next set to receive a message
    for f in files:
        base = os.path.basename(f)
        # Group on the basename prefix -- the same key bybasename() sorts
        # on.  Using the full path would make files of one group that
        # live in different subdirectories count as separate groups.
        newgroup = base.split("-", 2)[0]
        if newgroup != oldgroup:
            oldgroup = newgroup
            cgroups += 1
            cmess = 0
        cmess += 1

        too_many_groups = ngroups is not None and cgroups > ngroups
        too_many_messages = nmess is not None and cmess > (nmess * nsets)
        if too_many_groups or too_many_messages:
            newname = os.path.join(reservoir, base)
        else:
            newname = os.path.join(dir, "Set%d" % cset, base)
            cset = (cset % nsets) + 1

        sys.stdout.write("%-78s\r" % ("Moving %s to %s" % (f, newname)))
        sys.stdout.flush()
        if f != newname:
            os.rename(f, newname)

    for f in trash:
        os.rmdir(f)

def main():
    """Main program; parse options and go.

    Recognized options: -h (show usage and exit), -s (random seed),
    -n (number of sets), -g (number of groups), -m (messages per group
    per set).  The numeric options update the module-level settings that
    distribute() reads.  Malformed options, non-integer values, a set
    count below 1, or any positional argument cause usage() to exit with
    status 2.  Otherwise the ham and spam directories are redistributed.
    """

    global nsets
    global ngroups
    global nmess

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hs:n:g:m:')
    except getopt.error, msg:
        usage(2, msg)

    for opt, arg in opts:
        if opt == '-h':
            usage(0)
        # Every remaining option takes an integer argument.
        try:
            value = int(arg)
        except ValueError:
            usage(2, "Option %s requires an integer argument" % opt)
        if opt == '-s':
            random.seed(value)
        elif opt == '-n':
            nsets = value
        elif opt == '-g':
            ngroups = value
        elif opt == '-m':
            nmess = value

    # Checked regardless of whether any options were given, so stray
    # arguments are never silently ignored.
    if args:
        usage(2, "Positional arguments not allowed")

    # distribute() cycles through the sets modulo nsets.
    if nsets < 1:
        usage(2, "Number of sets must be at least 1")

    distribute(hamdir)
    distribute(spamdir)
    # Terminate the carriage-return-only progress line.
    print


if __name__ == "__main__":
    main()

Index: incremental.TODO.txt
===================================================================
RCS file: /cvsroot/spambayes/spambayes/testtools/incremental.TODO.txt,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** incremental.TODO.txt	28 Feb 2003 17:35:28 -0000	1.2
--- incremental.TODO.txt	1 Mar 2003 05:13:07 -0000	1.3
***************
*** 20,23 ****
  
  graphing:
! separate files for each graph
! cumulative vs span totals
--- 20,23 ----
  
  graphing:
! separate files for each graph -- DONE
! cumulative vs span totals -- DONE

Index: mkgraph.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/testtools/mkgraph.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** mkgraph.py	28 Feb 2003 00:02:45 -0000	1.1
--- mkgraph.py	1 Mar 2003 05:13:08 -0000	1.2
***************
*** 1,3 ****
--- 1,7 ----
  import sys
+ import getopt
+ 
+ report = "error"
+ span = None
  
  set = ""
***************
*** 13,17 ****
--- 17,33 ----
  nspam_unsure = []
  
+ def line(vals):
+     global span
+     for k in range(0, len(vals)):
+         n = vals[k]
+         if span and k - span >= 0:
+             n -= vals[k - span]
+         print '%d %d' % (k, n)
+     print
+ 
+ 
  def outputset():
+     global report
+     global span
      global set
      global nham_tested
***************
*** 29,88 ****
          return
  
!     print '$ Data=Curve2d name="Set %s Cumulative"' % set
!     print '% linetype=1 linelabel="ham_tested" markertype=0 linecolor=0'
!     for k in range(0, len(nham_tested)):
!         print '%d %d' % (k, nham_tested[k])
!     print
!     print '% linetype=1 linelabel="ham_trained" markertype=0 linecolor=1'
!     for k in range(0, len(nham_trained)):
!         print '%d %d' % (k, nham_trained[k])
!     print
!     print '% linetype=1 linelabel="ham_right" markertype=0 linecolor=2'
!     for k in range(0, len(nham_right)):
!         print '%d %d' % (k, nham_right[k])
!     print
!     print '% linetype=1 linelabel="ham_wrong" markertype=0 linecolor=3'
!     for k in range(0, len(nham_wrong)):
!         print '%d %d' % (k, nham_wrong[k])
!     print
!     print '% linetype=1 linelabel="ham_unsure" markertype=0 linecolor=4'
!     for k in range(0, len(nham_unsure)):
!         print '%d %d' % (k, nham_unsure[k])
!     print
!     print '% linetype=1 linelabel="spam_tested" markertype=0 linecolor=5'
!     for k in range(0, len(nspam_tested)):
!         print '%d %d' % (k, nspam_tested[k])
!     print
!     print '% linetype=1 linelabel="spam_trained" markertype=0 linecolor=6'
!     for k in range(0, len(nspam_trained)):
!         print '%d %d' % (k, nspam_trained[k])
!     print
!     print '% linetype=1 linelabel="spam_right" markertype=0 linecolor=7'
!     for k in range(0, len(nspam_right)):
!         print '%d %d' % (k, nspam_right[k])
!     print
!     print '% linetype=1 linelabel="spam_wrong" markertype=0 linecolor=8'
!     for k in range(0, len(nspam_wrong)):
!         print '%d %d' % (k, nspam_wrong[k])
!     print
!     print '% linetype=1 linelabel="spam_unsure" markertype=0 linecolor=9'
!     for k in range(0, len(nspam_unsure)):
!         print '%d %d' % (k, nspam_unsure[k])
!     print
!     
!     print '$ Data=Curve2d name="Set %s Cumulative Error Rates"' % set
!     print '% linetype=1 linelabel="fp" markertype=0 linecolor=0'
!     for k in range(0, len(nham_wrong)):
!         print '%d %f' % (k, (nham_wrong[k] * 1.0 / (nham_tested[k] or 1)))
!     print
!     print '% linetype=1 linelabel="fn" markertype=0 linecolor=1'
!     for k in range(0, len(nspam_wrong)):
!         print '%d %f' % (k, (nspam_wrong[k] * 1.0 / (nspam_tested[k] or 1)))
!     print
!     print '% linetype=1 linelabel="fn" markertype=0 linecolor=2'
!     for k in range(0, len(nspam_unsure)):
!         print '%d %f' % (k, ((nspam_unsure[k] + nham_unsure[k]) * 1.0 /
!                              ((nspam_tested[k] + nham_tested[k]) or 1)))
!     print
  
      set = ""
--- 45,109 ----
          return
  
!     if span:
!         title = "%d-Day Average" % span
!     else:
!         title = "Cumulative"
! 
!     if report == "counts":
!         print '$ Data=Curve2d name="%s Counts"' % (title)
!         print '% linetype=1 linelabel="ham_tested" markertype=0 linecolor=0'
!         line(nham_tested)
!         print '% linetype=1 linelabel="ham_trained" markertype=0 linecolor=1'
!         line(nham_trained)
!         print '% linetype=1 linelabel="ham_right" markertype=0 linecolor=2'
!         line(nham_right)
!         print '% linetype=1 linelabel="ham_wrong" markertype=0 linecolor=3'
!         line(nham_wrong)
!         print '% linetype=1 linelabel="ham_unsure" markertype=0 linecolor=4'
!         line(nham_unsure)
!         print '% linetype=1 linelabel="spam_tested" markertype=0 linecolor=5'
!         line(nspam_tested)
!         print '% linetype=1 linelabel="spam_trained" markertype=0 linecolor=6'
!         line(nspam_trained)
!         print '% linetype=1 linelabel="spam_right" markertype=0 linecolor=7'
!         line(nspam_right)
!         print '% linetype=1 linelabel="spam_wrong" markertype=0 linecolor=8'
!         line(nspam_wrong)
!         print '% linetype=1 linelabel="spam_unsure" markertype=0 linecolor=9'
!         line(nspam_unsure)
!    
!     if report == "error": 
!         print '$ Data=Curve2d'
!         print '% toplabel="%s Error Rates"' % (title)
!         print '% ymax=5'
!         print '% xlabel="Days"'
!         print '% ylabel="Percent"'
!         print '% linetype=1 linelabel="fp" markertype=0 linecolor=0'
!         for k in range(0, len(nham_wrong)):
!             n = nham_wrong[k]
!             d = nham_tested[k]
!             if span and k - span >= 0:
!                 n -= nham_wrong[k - span]
!                 d -= nham_tested[k - span]
!             print '%d %f' % (k, (n * 100.0 / (d or 1)))
!         print
!         print '% linetype=1 linelabel="fn" markertype=0 linecolor=1'
!         for k in range(0, len(nspam_wrong)):
!             n = nspam_wrong[k]
!             d = nspam_tested[k]
!             if span and k - span >= 0:
!                 n -= nspam_wrong[k - span]
!                 d -= nspam_tested[k - span]
!             print '%d %f' % (k, (n * 100.0 / (d or 1)))
!         print
!         print '% linetype=1 linelabel="unsure" markertype=0 linecolor=2'
!         for k in range(0, len(nspam_unsure)):
!             n = nham_unsure[k] + nspam_unsure[k]
!             d = nham_tested[k] + nspam_tested[k]
!             if span and k - span >= 0:
!                 n -= nham_unsure[k - span] + nspam_unsure[k - span]
!                 d -= nham_tested[k - span] + nspam_tested[k - span]
!             print '%d %f' % (k, (n * 100.0 / (d or 1)))
!         print
  
      set = ""
***************
*** 99,102 ****
--- 120,125 ----
  
  def main():
+     global report
+     global span
      global set
      global nham_tested
***************
*** 111,114 ****
--- 134,148 ----
      global nspam_unsure
  
+     opts, args = getopt.getopt(sys.argv[1:], 's:r:')
+     for opt, arg in opts:
+         if opt == '-s':
+             span = int(arg)
+         if opt == '-r':
+             report = arg
+ 
+     if report not in ("error", "counts"):
+         print >> sys.stderr, "Unrecognized report type"
+         sys.exit(1)
+ 
      while 1:
          line = sys.stdin.readline()
***************
*** 117,121 ****
          if line.endswith("\n"):
              line = line[:-1]
-         print "# " + line
          if line.startswith("Set "):
              outputset()
--- 151,154 ----





More information about the Spambayes-checkins mailing list