[Spambayes-checkins]
spambayes/testtools mksets.py,NONE,1.1 incremental.TODO.txt,1.2,1.3
mkgraph.py,1.1,1.2
T. Alexander Popiel
popiel at users.sourceforge.net
Fri Feb 28 21:13:10 EST 2003
Update of /cvsroot/spambayes/spambayes/testtools
In directory sc8-pr-cvs1:/tmp/cvs-serv24670
Modified Files:
incremental.TODO.txt mkgraph.py
Added Files:
mksets.py
Log Message:
Actually include mksets.py. Doh.
Give mkgraph.py a few options, so it spits out either counts
or error rates, but not both. Also, it can do n-day averages
instead of cumulative.
--- NEW FILE: mksets.py ---
#! /usr/bin/env python
### Redistribute messages among the classic Data/*/Set* directories
### based on the desired set count, the desired number of groups, and
### the desired number of messages per group per set.  Messages beyond
### those limits are moved into a "reservoir" directory next to the
### Set* directories.
"""Usage: %(program)s [OPTIONS] ...
Where OPTIONS is one or more of:
-h
show usage and exit
-s num
random number seed
-n num
number of sets
-g num
number of groups
-m num
number of messages per {ham,spam}*group*set
"""
import getopt
import sys
import os
import os.path
import glob
import shutil
import random
# Script name as invoked; substituted for %(program)s in the usage text.
program = sys.argv[0]
# Verbosity flag.  main() declares it global, but nothing in the visible
# code reads or changes it.
loud = True
# Directory trees that main() hands to distribute(), one call each.
hamdir = "Data/Ham"
spamdir = "Data/Spam"
# Number of Set* directories messages are dealt into (option -n).
nsets = 5
# Limit on the number of groups kept (option -g); None means no limit.
ngroups = None
# Messages kept per group per set (option -m); None means no limit.
nmess = None
def usage(code, msg=''):
    """Write the usage text to stderr and terminate the process.

    code is handed to sys.exit() as the exit status.  When msg is
    non-empty it is written first, followed by a blank line, so the
    complaint appears above the option summary.  The summary itself is
    the module docstring with names such as %(program)s filled in from
    the module globals.
    """
    err = sys.stderr
    if msg:
        # "%s" applies str(), so msg may be an exception object as well
        # as a plain string.
        err.write("%s\n\n" % (msg,))
    err.write("%s\n" % (__doc__ % globals(),))
    sys.exit(code)
def bybasename(a, b):
    """Comparison function ordering two paths by their group prefix.

    The key for each path is the part of its basename that precedes the
    first "-"; directory components play no role.  Returns -1, 0 or 1
    depending on whether a's key sorts before, equal to, or after b's.
    """
    key_a = os.path.basename(a).split("-", 2)[0]
    key_b = os.path.basename(b).split("-", 2)[0]
    # Sign of the comparison: exactly one of the two tests can be true.
    return (key_a > key_b) - (key_a < key_b)
def _ensure_dir(path):
    """Create path (and any missing parents) unless it already exists.

    An OSError from os.makedirs() is ignored only when path is a
    directory afterwards; any other failure is re-raised so it is not
    silently hidden.
    """
    try:
        os.makedirs(path)
    except OSError:
        if not os.path.isdir(path):
            raise


def distribute(dir):
    """Redistribute the files found under dir among Set1..Set<nsets>.

    Every entry matching dir/*/* is considered.  The entries are
    shuffled and then sorted with bybasename(), i.e. by group, where the
    group of a file is the part of its basename before the first "-".
    Files are then dealt round-robin into dir/Set1 .. dir/Set<nsets>,
    subject to two limits taken from module globals:

      * ngroups: when not None, files of any group after the first
        ngroups groups go to dir/reservoir instead.
      * nmess: when not None, files of a group beyond the first
        nmess * nsets go to dir/reservoir instead.

    Set* directories that existed beforehand but are not one of
    Set1..Set<nsets> are removed with os.rmdir() after the moves.  A
    progress message ending in a carriage return is written to stdout
    for every file, so each message overwrites the previous one.
    """
    files = glob.glob(os.path.join(dir, "*", "*"))
    # Shuffle before grouping so the order inside a group is randomised
    # (assuming the sort keeps equal-keyed entries in their given order).
    random.shuffle(files)
    files.sort(bybasename)

    # Start with every existing Set* directory; the ones that are still
    # wanted are struck off below and the rest is removed at the end.
    trash = glob.glob(os.path.join(dir, "Set*"))
    for setnum in range(1, nsets + 1):
        name = os.path.join(dir, "Set%d" % setnum)
        _ensure_dir(name)
        if name in trash:
            trash.remove(name)
    _ensure_dir(os.path.join(dir, "reservoir"))

    oldgroup = None     # no group seen yet
    cgroups = 0         # number of distinct groups encountered so far
    cmess = 0           # number of files seen in the current group
    cset = 1            # Set number that receives the next kept file
    for f in files:
        base = os.path.basename(f)
        # Derive the group from the basename, the same key bybasename()
        # sorts on.  Using the full path would make files of one group
        # that currently sit in different directories look like
        # different groups and miscount both limits.
        newgroup = base.split("-", 2)[0]
        if newgroup != oldgroup:
            oldgroup = newgroup
            cgroups = cgroups + 1
            cmess = 0
        cmess = cmess + 1
        if ((ngroups is not None and cgroups > ngroups) or
            (nmess is not None and cmess > (nmess * nsets))):
            newname = os.path.join(dir, "reservoir", base)
        else:
            newname = os.path.join(dir, "Set%d" % cset, base)
            # Advance round-robin through 1..nsets.
            cset = (cset % nsets) + 1
        sys.stdout.write("%-78s\r" % ("Moving %s to %s" % (f, newname)))
        sys.stdout.flush()
        if f != newname:
            os.rename(f, newname)

    for f in trash:
        os.rmdir(f)
def _intarg(opt, arg, minimum=None):
    """Return the value of option opt as an int, or exit through usage().

    opt is the option flag (used only in the error text) and arg its
    raw string value.  A value that is not an integer, or that is below
    minimum when minimum is given, ends the program via usage(2, ...).
    """
    try:
        value = int(arg)
    except ValueError:
        usage(2, "Option %s requires an integer, got %r" % (opt, arg))
    if minimum is not None and value < minimum:
        usage(2, "Option %s must be at least %d" % (opt, minimum))
    return value


def main():
    """Main program; parse options and go.

    Recognised options (see the module docstring): -h prints the usage
    text and exits, -s seeds the random module, -n/-g/-m set the module
    globals nsets/ngroups/nmess.  Positional arguments are rejected.
    Afterwards distribute() is run on hamdir and then on spamdir.
    """
    global nsets
    global ngroups
    global nmess

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hs:n:g:m:')
    except getopt.error:
        # Fetch the exception via sys.exc_info() so the clause does not
        # depend on version-specific "except" binding syntax.
        usage(2, str(sys.exc_info()[1]))

    for opt, arg in opts:
        if opt == '-h':
            usage(0)
        elif opt == '-s':
            random.seed(_intarg(opt, arg))
        elif opt == '-n':
            # distribute() cycles with "cset % nsets", so zero sets
            # would end in a ZeroDivisionError there.
            nsets = _intarg(opt, arg, 1)
        elif opt == '-g':
            ngroups = _intarg(opt, arg, 0)
        elif opt == '-m':
            nmess = _intarg(opt, arg, 0)

    if args:
        usage(2, "Positional arguments not allowed")

    distribute(hamdir)
    distribute(spamdir)
    # Finish the progress line that distribute() keeps overwriting.
    sys.stdout.write("\n")


if __name__ == "__main__":
    main()
Index: incremental.TODO.txt
===================================================================
RCS file: /cvsroot/spambayes/spambayes/testtools/incremental.TODO.txt,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** incremental.TODO.txt 28 Feb 2003 17:35:28 -0000 1.2
--- incremental.TODO.txt 1 Mar 2003 05:13:07 -0000 1.3
***************
*** 20,23 ****
graphing:
! separate files for each graph
! cumulative vs span totals
--- 20,23 ----
graphing:
! separate files for each graph -- DONE
! cumulative vs span totals -- DONE
Index: mkgraph.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/testtools/mkgraph.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** mkgraph.py 28 Feb 2003 00:02:45 -0000 1.1
--- mkgraph.py 1 Mar 2003 05:13:08 -0000 1.2
***************
*** 1,3 ****
--- 1,7 ----
import sys
+ import getopt
+
+ report = "error"
+ span = None
set = ""
***************
*** 13,17 ****
--- 17,33 ----
nspam_unsure = []
+ def line(vals):
+ global span
+ for k in range(0, len(vals)):
+ n = vals[k]
+ if span and k - span >= 0:
+ n -= vals[k - span]
+ print '%d %d' % (k, n)
+ print
+
+
def outputset():
+ global report
+ global span
global set
global nham_tested
***************
*** 29,88 ****
return
! print '$ Data=Curve2d name="Set %s Cumulative"' % set
! print '% linetype=1 linelabel="ham_tested" markertype=0 linecolor=0'
! for k in range(0, len(nham_tested)):
! print '%d %d' % (k, nham_tested[k])
! print
! print '% linetype=1 linelabel="ham_trained" markertype=0 linecolor=1'
! for k in range(0, len(nham_trained)):
! print '%d %d' % (k, nham_trained[k])
! print
! print '% linetype=1 linelabel="ham_right" markertype=0 linecolor=2'
! for k in range(0, len(nham_right)):
! print '%d %d' % (k, nham_right[k])
! print
! print '% linetype=1 linelabel="ham_wrong" markertype=0 linecolor=3'
! for k in range(0, len(nham_wrong)):
! print '%d %d' % (k, nham_wrong[k])
! print
! print '% linetype=1 linelabel="ham_unsure" markertype=0 linecolor=4'
! for k in range(0, len(nham_unsure)):
! print '%d %d' % (k, nham_unsure[k])
! print
! print '% linetype=1 linelabel="spam_tested" markertype=0 linecolor=5'
! for k in range(0, len(nspam_tested)):
! print '%d %d' % (k, nspam_tested[k])
! print
! print '% linetype=1 linelabel="spam_trained" markertype=0 linecolor=6'
! for k in range(0, len(nspam_trained)):
! print '%d %d' % (k, nspam_trained[k])
! print
! print '% linetype=1 linelabel="spam_right" markertype=0 linecolor=7'
! for k in range(0, len(nspam_right)):
! print '%d %d' % (k, nspam_right[k])
! print
! print '% linetype=1 linelabel="spam_wrong" markertype=0 linecolor=8'
! for k in range(0, len(nspam_wrong)):
! print '%d %d' % (k, nspam_wrong[k])
! print
! print '% linetype=1 linelabel="spam_unsure" markertype=0 linecolor=9'
! for k in range(0, len(nspam_unsure)):
! print '%d %d' % (k, nspam_unsure[k])
! print
!
! print '$ Data=Curve2d name="Set %s Cumulative Error Rates"' % set
! print '% linetype=1 linelabel="fp" markertype=0 linecolor=0'
! for k in range(0, len(nham_wrong)):
! print '%d %f' % (k, (nham_wrong[k] * 1.0 / (nham_tested[k] or 1)))
! print
! print '% linetype=1 linelabel="fn" markertype=0 linecolor=1'
! for k in range(0, len(nspam_wrong)):
! print '%d %f' % (k, (nspam_wrong[k] * 1.0 / (nspam_tested[k] or 1)))
! print
! print '% linetype=1 linelabel="fn" markertype=0 linecolor=2'
! for k in range(0, len(nspam_unsure)):
! print '%d %f' % (k, ((nspam_unsure[k] + nham_unsure[k]) * 1.0 /
! ((nspam_tested[k] + nham_tested[k]) or 1)))
! print
set = ""
--- 45,109 ----
return
! if span:
! title = "%d-Day Average" % span
! else:
! title = "Cumulative"
!
! if report == "counts":
! print '$ Data=Curve2d name="%s Counts"' % (title)
! print '% linetype=1 linelabel="ham_tested" markertype=0 linecolor=0'
! line(nham_tested)
! print '% linetype=1 linelabel="ham_trained" markertype=0 linecolor=1'
! line(nham_trained)
! print '% linetype=1 linelabel="ham_right" markertype=0 linecolor=2'
! line(nham_right)
! print '% linetype=1 linelabel="ham_wrong" markertype=0 linecolor=3'
! line(nham_wrong)
! print '% linetype=1 linelabel="ham_unsure" markertype=0 linecolor=4'
! line(nham_unsure)
! print '% linetype=1 linelabel="spam_tested" markertype=0 linecolor=5'
! line(nspam_tested)
! print '% linetype=1 linelabel="spam_trained" markertype=0 linecolor=6'
! line(nspam_trained)
! print '% linetype=1 linelabel="spam_right" markertype=0 linecolor=7'
! line(nspam_right)
! print '% linetype=1 linelabel="spam_wrong" markertype=0 linecolor=8'
! line(nspam_wrong)
! print '% linetype=1 linelabel="spam_unsure" markertype=0 linecolor=9'
! line(nspam_unsure)
!
! if report == "error":
! print '$ Data=Curve2d'
! print '% toplabel="%s Error Rates"' % (title)
! print '% ymax=5'
! print '% xlabel="Days"'
! print '% ylabel="Percent"'
! print '% linetype=1 linelabel="fp" markertype=0 linecolor=0'
! for k in range(0, len(nham_wrong)):
! n = nham_wrong[k]
! d = nham_tested[k]
! if span and k - span >= 0:
! n -= nham_wrong[k - span]
! d -= nham_tested[k - span]
! print '%d %f' % (k, (n * 100.0 / (d or 1)))
! print
! print '% linetype=1 linelabel="fn" markertype=0 linecolor=1'
! for k in range(0, len(nspam_wrong)):
! n = nspam_wrong[k]
! d = nspam_tested[k]
! if span and k - span >= 0:
! n -= nspam_wrong[k - span]
! d -= nspam_tested[k - span]
! print '%d %f' % (k, (n * 100.0 / (d or 1)))
! print
! print '% linetype=1 linelabel="unsure" markertype=0 linecolor=2'
! for k in range(0, len(nspam_unsure)):
! n = nham_unsure[k] + nspam_unsure[k]
! d = nham_tested[k] + nspam_tested[k]
! if span and k - span >= 0:
! n -= nham_unsure[k - span] + nspam_unsure[k - span]
! d -= nham_tested[k - span] + nspam_tested[k - span]
! print '%d %f' % (k, (n * 100.0 / (d or 1)))
! print
set = ""
***************
*** 99,102 ****
--- 120,125 ----
def main():
+ global report
+ global span
global set
global nham_tested
***************
*** 111,114 ****
--- 134,148 ----
global nspam_unsure
+ opts, args = getopt.getopt(sys.argv[1:], 's:r:')
+ for opt, arg in opts:
+ if opt == '-s':
+ span = int(arg)
+ if opt == '-r':
+ report = arg
+
+ if report not in ("error", "counts"):
+ print >> sys.stderr, "Unrecognized report type"
+ sys.exit(1)
+
while 1:
line = sys.stdin.readline()
***************
*** 117,121 ****
if line.endswith("\n"):
line = line[:-1]
- print "# " + line
if line.startswith("Set "):
outputset()
--- 151,154 ----
More information about the Spambayes-checkins
mailing list