[Spambayes-checkins] spambayes clgen.py,NONE,1.1 README.txt,1.30,1.31
Tim Peters
tim_one@users.sourceforge.net
Fri, 04 Oct 2002 19:53:46 -0700
Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv30600
Modified Files:
README.txt
Added Files:
clgen.py
Log Message:
A test driver only for use with one of the speculative central-limit
schemes. Its purpose is to generate a binary pickle containing
internal information about every prediction made. This will go
away someday.
XXX Still need tools to analyze this data.
--- NEW FILE: clgen.py ---
#! /usr/bin/env python
# A test driver using "the standard" test directory structure, producing
# info about the internals of the central-limit schemes.
"""Usage: %(program)s [options] -n nsets -t int,int,...,int
Scores for all predictions are saved at the end to binary pickle clim.pik.
This contains two lists of tuples, the first list with a tuple for every
ham predicted, the second list with a tuple for every spam predicted. Each
tuple has these values:
tag the msg identifier
is_spam True if msg came from a spam Set, False if from a ham Set
zham the msg zscore relative to the population ham
zspam the msg zscore relative to the population spam
hmean the raw mean ham score
smean the raw mean spam score
n the number of clues used to judge this msg
Note that hmean and smean are the same under use_central_limit; they're
very likely to differ under use_central_limit2.
Where:
-h
Show usage and exit.
-n int
Number of Set directories (Data/Spam/Set1, ... and Data/Ham/Set1, ...).
This is required.
-t int,int,...,int
Build a classifier training on these Set directories.
This is used to predict against the remaining Set directories.
This is required.
If you only want to use some of the messages in each set:
--ham-keep int
The maximum number of msgs to use from each Ham set. The msgs are
chosen randomly. See also the -s option.
--spam-keep int
The maximum number of msgs to use from each Spam set. The msgs are
chosen randomly. See also the -s option.
-s int
A seed for the random number generator. Has no effect unless
at least one of {--ham-keep, --spam-keep} is specified. If -s
isn't specified, the seed is taken from current time.
In addition, an attempt is made to merge bayescustomize.ini into the options.
If that exists, it can be used to change the settings in Options.options.
"""
from __future__ import generators
import sys
from heapq import heapreplace
from sets import Set
import cPickle as pickle
from Options import options
import TestDriver
from TestDriver import printmsg
import msgs
from Histogram import Hist
# Name of the binary pickle file that drive() writes the two score lists to.
fname = 'clim.pik'
# Script name; substituted for %(program)s when usage() formats __doc__.
program = sys.argv[0]
def usage(code, msg=''):
    """Write the usage text to stderr and terminate with status `code`.

    code -- exit status handed to sys.exit().
    msg  -- optional complaint; when non-empty it is written first,
            followed by a blank line, ahead of the usage text.

    The usage text is the module docstring with %(name)s fields filled
    in from the module globals.
    """
    err = sys.stderr
    if msg:
        # msg may be an exception instance, so let %s stringify it.
        err.write('%s\n' % (msg,))
        err.write('\n')
    err.write(__doc__ % globals())
    err.write('\n')
    sys.exit(code)
class MyDriver(TestDriver.Driver):
def __init__(self):
TestDriver.Driver.__init__(self)
# tuples of (msg.tag, is_spam, zham, zspam, hmean, smean, n)
self.all_ham = []
self.all_spam = []
def test(self, ham, spam):
c = self.classifier
t = self.tester
local_ham_hist = Hist()
local_spam_hist = Hist()
# clues start with these:
# extra = [('*zham*', zham),
# ('*zspam*', zspam),
# ('*hmean*', hmean), # raw mean as ham
# ('*smean*', smean), # raw mean as spam
# ('*n*', n),
#
# For use_central_limit, hmean and smean have the same value.
def new_ham(msg, prob, getclues=c.spamprob):
local_ham_hist.add(prob * 100.0)
prob, clues = getclues(msg, True)
stuff = tuple([val for tag, val in clues[:5]])
self.all_ham.append((msg.tag, False) + stuff)
def new_spam(msg, prob, getclues=c.spamprob):
local_spam_hist.add(prob * 100.0)
prob, clues = getclues(msg, True)
stuff = tuple([val for tag, val in clues[:5]])
self.all_spam.append((msg.tag, True) + stuff)
t.reset_test_results()
print "-> Predicting", ham, "&", spam, "..."
t.predict(spam, True, new_spam)
t.predict(ham, False, new_ham)
print "-> <stat> tested", t.nham_tested, "hams &", t.nspam_tested, \
"spams against", c.nham, "hams &", c.nspam, "spams"
print "-> <stat> false positive %:", t.false_positive_rate()
print "-> <stat> false negative %:", t.false_negative_rate()
newfpos = Set(t.false_positives()) - self.falsepos
self.falsepos |= newfpos
print "-> <stat> %d new false positives" % len(newfpos)
if newfpos:
print " new fp:", [e.tag for e in newfpos]
if not options.show_false_positives:
newfpos = ()
for e in newfpos:
print '*' * 78
prob, clues = c.spamprob(e, True)
printmsg(e, prob, clues)
newfneg = Set(t.false_negatives()) - self.falseneg
self.falseneg |= newfneg
print "-> <stat> %d new false negatives" % len(newfneg)
if newfneg:
print " new fn:", [e.tag for e in newfneg]
if not options.show_false_negatives:
newfneg = ()
for e in newfneg:
print '*' * 78
prob, clues = c.spamprob(e, True)
printmsg(e, prob, clues)
if options.show_best_discriminators > 0:
print
print " best discriminators:"
stats = [(-1, None)] * options.show_best_discriminators
smallest_killcount = -1
for w, r in c.wordinfo.iteritems():
if r.killcount > smallest_killcount:
heapreplace(stats, (r.killcount, w))
smallest_killcount = stats[0][0]
stats.sort()
for count, w in stats:
if count < 0:
continue
r = c.wordinfo[w]
print " %r %d %g" % (w, r.killcount, r.spamprob)
self.trained_ham_hist = local_ham_hist
self.trained_spam_hist = local_spam_hist
def ints_to_string(x):
    """Return the items of `x` as a brace-wrapped, comma-separated string.

    For example, [1, 2, 3] becomes '{1,2,3}' and an empty sequence
    becomes '{}'.
    """
    pieces = [str(item) for item in x]
    return '{%s}' % ','.join(pieces)
def drive(nsets, trainon, predicton):
print options.display()
spamdirs = [options.spam_directories % i for i in range(1, nsets+1)]
hamdirs = [options.ham_directories % i for i in range(1, nsets+1)]
train_hamdirs = [hamdirs[i-1] for i in trainon]
train_spamdirs = [spamdirs[i-1] for i in trainon]
predict_hamdirs = [hamdirs[i-1] for i in predicton]
predict_spamdirs = [spamdirs[i-1] for i in predicton]
trainints = ints_to_string(trainon)
predictints = ints_to_string(predicton)
d = MyDriver()
hamroot = options.ham_directories[:-2] # lose trailing %d
spamroot = options.spam_directories[:-2]
d.train(msgs.HamStream(hamroot + trainints, train_hamdirs),
msgs.SpamStream(spamroot + trainints, train_spamdirs))
c = d.classifier
print '-> <stat> population hammean', c.hammean, 'hamvar', c.hamvar
print '-> <stat> population spammean', c.spammean, 'spamvar', c.spamvar
d.test(msgs.HamStream(hamroot + predictints, predict_hamdirs),
msgs.SpamStream(spamroot + predictints, predict_spamdirs))
d.finishtest()
d.alldone()
print "Saving all score data to pickle", fname
f = file(fname, 'wb')
pickle.dump(d.all_ham, f, 1)
pickle.dump(d.all_spam, f, 1)
f.close()
def main():
import getopt
try:
opts, args = getopt.getopt(sys.argv[1:], 'hn:s:t:',
['ham-keep=', 'spam-keep='])
except getopt.error, msg:
usage(1, msg)
nsets = seed = hamkeep = spamkeep = trainon = None
for opt, arg in opts:
if opt == '-h':
usage(0)
elif opt == '-n':
nsets = int(arg)
elif opt == '-s':
seed = int(arg)
elif opt == '-t':
trainon = Set(map(int, arg.split(',')))
elif opt == '--ham-keep':
hamkeep = int(arg)
elif opt == '--spam-keep':
spamkeep = int(arg)
if args:
usage(1, "Positional arguments not supported")
if nsets is None:
usage(1, "-n is required")
if not trainon:
usage(1, "-t is required")
predicton = list(Set(range(1, nsets+1)) - trainon)
trainon = list(trainon)
predicton.sort()
trainon.sort()
msgs.setparms(hamkeep, spamkeep, seed)
drive(nsets, trainon, predicton)
# Run the driver only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Index: README.txt
===================================================================
RCS file: /cvsroot/spambayes/spambayes/README.txt,v
retrieving revision 1.30
retrieving revision 1.31
diff -C2 -d -r1.30 -r1.31
*** README.txt 28 Sep 2002 18:50:51 -0000 1.30
--- README.txt 5 Oct 2002 02:53:43 -0000 1.31
***************
*** 173,176 ****
--- 173,186 ----
+ Experimental Files
+ ==================
+ clgen.py
+ A test driver only for use with one of the speculative central-limit
+ schemes. Its purpose is to generate a binary pickle containing
+ internal information about every prediction made. This will go
+ away someday.
+ XXX Still need tools to analyze this data.
+
+
Standard Test Data Setup
========================