[Spambayes-checkins] spambayes timcv.py,1.10,1.11 msgs.py,1.4,1.5
Anthony Baxter
anthonybaxter@users.sourceforge.net
Fri Nov 1 04:10:52 2002
Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv7003
Modified Files:
timcv.py msgs.py
Log Message:
Added support for specifying different numbers of ham and spam messages
for training and for testing. The old options --ham-keep and --spam-keep
(or --ham/--spam) still work as before. New options --HamTest, --SpamTest,
--HamTrain and --SpamTrain have been added to timcv.py.
Note that msgs.setparms _tries_ to do the right thing if it's called in
the old 3-arg form, but I might not have captured all the possible
twistedness. As far as I can tell, only timcv.py and timtest.py
actually call it.
Index: timcv.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/timcv.py,v
retrieving revision 1.10
retrieving revision 1.11
diff -C2 -d -r1.10 -r1.11
*** timcv.py 10 Oct 2002 04:55:15 -0000 1.10
--- timcv.py 1 Nov 2002 04:10:50 -0000 1.11
***************
*** 14,24 ****
If you only want to use some of the messages in each set,
--ham-keep int
! The maximum number of msgs to use from each Ham set. The msgs are
! chosen randomly. See also the -s option.
--spam-keep int
! The maximum number of msgs to use from each Spam set. The msgs are
! chosen randomly. See also the -s option.
-s int
--- 14,40 ----
If you only want to use some of the messages in each set,
+ --HamTrain int
+ The maximum number of msgs to use from each Ham set for training.
+ The msgs are chosen randomly. See also the -s option.
+
+ --SpamTrain int
+ The maximum number of msgs to use from each Spam set for training.
+ The msgs are chosen randomly. See also the -s option.
+
+ --HamTest int
+ The maximum number of msgs to use from each Ham set for testing.
+ The msgs are chosen randomly. See also the -s option.
+
+ --SpamTest int
+ The maximum number of msgs to use from each Spam set for testing.
+ The msgs are chosen randomly. See also the -s option.
+
--ham-keep int
! The maximum number of msgs to use from each Ham set for testing
! and training. The msgs are chosen randomly. See also the -s option.
--spam-keep int
! The maximum number of msgs to use from each Spam set for testing
! and training. The msgs are chosen randomly. See also the -s option.
-s int
***************
*** 57,62 ****
d = TestDriver.Driver()
# Train it on all sets except the first.
! d.train(msgs.HamStream("%s-%d" % (hamdirs[1], nsets), hamdirs[1:]),
! msgs.SpamStream("%s-%d" % (spamdirs[1], nsets), spamdirs[1:]))
# Now run nsets times, predicting pair i against all except pair i.
--- 73,80 ----
d = TestDriver.Driver()
# Train it on all sets except the first.
! d.train(msgs.HamStream("%s-%d" % (hamdirs[1], nsets),
! hamdirs[1:], train=1),
! msgs.SpamStream("%s-%d" % (spamdirs[1], nsets),
! spamdirs[1:], train=1))
# Now run nsets times, predicting pair i against all except pair i.
***************
*** 64,69 ****
h = hamdirs[i]
s = spamdirs[i]
! hamstream = msgs.HamStream(h, [h])
! spamstream = msgs.SpamStream(s, [s])
if i > 0:
--- 82,87 ----
h = hamdirs[i]
s = spamdirs[i]
! hamstream = msgs.HamStream(h, [h], train=0)
! spamstream = msgs.SpamStream(s, [s], train=0)
if i > 0:
***************
*** 80,84 ****
del s2[i]
! d.train(msgs.HamStream(hname, h2), msgs.SpamStream(sname, s2))
else:
--- 98,103 ----
del s2[i]
! d.train(msgs.HamStream(hname, h2, train=1),
! msgs.SpamStream(sname, s2, train=1))
else:
***************
*** 101,109 ****
try:
opts, args = getopt.getopt(sys.argv[1:], 'hn:s:',
! ['ham-keep=', 'spam-keep='])
except getopt.error, msg:
usage(1, msg)
! nsets = seed = hamkeep = spamkeep = None
for opt, arg in opts:
if opt == '-h':
--- 120,131 ----
try:
opts, args = getopt.getopt(sys.argv[1:], 'hn:s:',
! ['HamTrain=', 'SpamTrain=',
! 'HamTest=', 'SpamTest=',
! 'ham-keep=', 'spam-keep='])
except getopt.error, msg:
usage(1, msg)
! nsets = seed = hamtrain = spamtrain = None
! hamtest = spamtest = hamkeep = spamkeep = None
for opt, arg in opts:
if opt == '-h':
***************
*** 113,116 ****
--- 135,146 ----
elif opt == '-s':
seed = int(arg)
+ elif opt == '--HamTest':
+ hamtest = int(arg)
+ elif opt == '--SpamTest':
+ spamtest = int(arg)
+ elif opt == '--HamTrain':
+ hamtrain = int(arg)
+ elif opt == '--SpamTrain':
+ spamtrain = int(arg)
elif opt == '--ham-keep':
hamkeep = int(arg)
***************
*** 123,127 ****
usage(1, "-n is required")
! msgs.setparms(hamkeep, spamkeep, seed)
drive(nsets)
--- 153,160 ----
usage(1, "-n is required")
! if hamkeep is not None:
! msgs.setparms(hamkeep, spamkeep, seed=seed)
! else:
! msgs.setparms(hamtrain, spamtrain, hamtest, spamtest, seed)
drive(nsets)
Index: msgs.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/msgs.py,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** msgs.py 25 Sep 2002 20:07:06 -0000 1.4
--- msgs.py 1 Nov 2002 04:10:50 -0000 1.5
***************
*** 6,11 ****
from tokenizer import tokenize
! HAMKEEP = None
! SPAMKEEP = None
SEED = random.randrange(2000000000)
--- 6,13 ----
from tokenizer import tokenize
! HAMTEST = None
! SPAMTEST = None
! HAMTRAIN = None
! SPAMTRAIN = None
SEED = random.randrange(2000000000)
***************
*** 68,83 ****
class HamStream(MsgStream):
! def __init__(self, tag, directories):
! MsgStream.__init__(self, tag, directories, HAMKEEP)
class SpamStream(MsgStream):
! def __init__(self, tag, directories):
! MsgStream.__init__(self, tag, directories, SPAMKEEP)
! def setparms(hamkeep, spamkeep, seed=None):
! """Set HAMKEEP and SPAMKEEP. If seed is not None, also set SEED."""
! global HAMKEEP, SPAMKEEP, SEED
! HAMKEEP, SPAMKEEP = hamkeep, spamkeep
if seed is not None:
SEED = seed
--- 70,103 ----
class HamStream(MsgStream):
! def __init__(self, tag, directories, train=0):
! if train:
! MsgStream.__init__(self, tag, directories, HAMTRAIN)
! else:
! MsgStream.__init__(self, tag, directories, HAMTEST)
class SpamStream(MsgStream):
! def __init__(self, tag, directories, train=0):
! if train:
! MsgStream.__init__(self, tag, directories, SPAMTRAIN)
! else:
! MsgStream.__init__(self, tag, directories, SPAMTEST)
! def setparms(hamtrain, spamtrain, hamtest=None, spamtest=None, seed=None):
! """Set HAMTEST/TRAIN and SPAMTEST/TRAIN.
! If seed is not None, also set SEED.
! If (ham|spam)test are not set, set to the same as the (ham|spam)train
! numbers (backwards compat option).
! """
! global HAMTEST, SPAMTEST, HAMTRAIN, SPAMTRAIN, SEED
! HAMTRAIN, SPAMTRAIN = hamtrain, spamtrain
! if hamtest is None:
! HAMTEST = HAMTRAIN
! else:
! HAMTEST = hamtest
! if spamtest is None:
! SPAMTEST = SPAMTRAIN
! else:
! SPAMTEST = spamtest
if seed is not None:
SEED = seed