[Spambayes-checkins] spambayes timcv.py,1.10,1.11 msgs.py,1.4,1.5

Anthony Baxter anthonybaxter@users.sourceforge.net
Fri Nov 1 04:10:52 2002


Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv7003

Modified Files:
	timcv.py msgs.py 
Log Message:
Added support for specifying different numbers of messages for training
and for testing, separately for ham and spam. The old options --ham-keep and
--spam-keep (or --ham/--spam) still work as before. New options --HamTest,
--SpamTest, --HamTrain and --SpamTrain have been added to timcv.py.

Note that msgs.setparms _tries_ to do the right thing if it's called in
the old 3-arg form, but I might not have captured all the possible
twistedness. As far as I can tell, only timcv.py and timtest.py
actually call it.


Index: timcv.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/timcv.py,v
retrieving revision 1.10
retrieving revision 1.11
diff -C2 -d -r1.10 -r1.11
*** timcv.py	10 Oct 2002 04:55:15 -0000	1.10
--- timcv.py	1 Nov 2002 04:10:50 -0000	1.11
***************
*** 14,24 ****
  If you only want to use some of the messages in each set,
  
      --ham-keep int
!         The maximum number of msgs to use from each Ham set.  The msgs are
!         chosen randomly.  See also the -s option.
  
      --spam-keep int
!         The maximum number of msgs to use from each Spam set.  The msgs are
!         chosen randomly.  See also the -s option.
  
      -s int
--- 14,40 ----
  If you only want to use some of the messages in each set,
  
+     --HamTrain int
+         The maximum number of msgs to use from each Ham set for training.  
+         The msgs are chosen randomly.  See also the -s option.
+ 
+     --SpamTrain int
+         The maximum number of msgs to use from each Spam set for training.
+         The msgs are chosen randomly.  See also the -s option.
+ 
+     --HamTest int
+         The maximum number of msgs to use from each Ham set for testing.  
+         The msgs are chosen randomly.  See also the -s option.
+ 
+     --SpamTest int
+         The maximum number of msgs to use from each Spam set for testing.
+         The msgs are chosen randomly.  See also the -s option.
+ 
      --ham-keep int
!         The maximum number of msgs to use from each Ham set for testing
!         and training. The msgs are chosen randomly.  See also the -s option.
  
      --spam-keep int
!         The maximum number of msgs to use from each Spam set for testing
!         and training. The msgs are chosen randomly.  See also the -s option.
  
      -s int
***************
*** 57,62 ****
      d = TestDriver.Driver()
      # Train it on all sets except the first.
!     d.train(msgs.HamStream("%s-%d" % (hamdirs[1], nsets), hamdirs[1:]),
!             msgs.SpamStream("%s-%d" % (spamdirs[1], nsets), spamdirs[1:]))
  
      # Now run nsets times, predicting pair i against all except pair i.
--- 73,80 ----
      d = TestDriver.Driver()
      # Train it on all sets except the first.
!     d.train(msgs.HamStream("%s-%d" % (hamdirs[1], nsets), 
!                             hamdirs[1:], train=1),
!             msgs.SpamStream("%s-%d" % (spamdirs[1], nsets), 
!                             spamdirs[1:], train=1))
  
      # Now run nsets times, predicting pair i against all except pair i.
***************
*** 64,69 ****
          h = hamdirs[i]
          s = spamdirs[i]
!         hamstream = msgs.HamStream(h, [h])
!         spamstream = msgs.SpamStream(s, [s])
  
          if i > 0:
--- 82,87 ----
          h = hamdirs[i]
          s = spamdirs[i]
!         hamstream = msgs.HamStream(h, [h], train=0)
!         spamstream = msgs.SpamStream(s, [s], train=0)
  
          if i > 0:
***************
*** 80,84 ****
                  del s2[i]
  
!                 d.train(msgs.HamStream(hname, h2), msgs.SpamStream(sname, s2))
  
              else:
--- 98,103 ----
                  del s2[i]
  
!                 d.train(msgs.HamStream(hname, h2, train=1), 
!                         msgs.SpamStream(sname, s2, train=1))
  
              else:
***************
*** 101,109 ****
      try:
          opts, args = getopt.getopt(sys.argv[1:], 'hn:s:',
!                                    ['ham-keep=', 'spam-keep='])
      except getopt.error, msg:
          usage(1, msg)
  
!     nsets = seed = hamkeep = spamkeep = None
      for opt, arg in opts:
          if opt == '-h':
--- 120,131 ----
      try:
          opts, args = getopt.getopt(sys.argv[1:], 'hn:s:',
!                                    ['HamTrain=', 'SpamTrain=',
!                                    'HamTest=', 'SpamTest=',
!                                    'ham-keep=', 'spam-keep='])
      except getopt.error, msg:
          usage(1, msg)
  
!     nsets = seed = hamtrain = spamtrain = None
!     hamtest = spamtest = hamkeep = spamkeep = None
      for opt, arg in opts:
          if opt == '-h':
***************
*** 113,116 ****
--- 135,146 ----
          elif opt == '-s':
              seed = int(arg)
+         elif opt == '--HamTest':
+             hamtest = int(arg)
+         elif opt == '--SpamTest':
+             spamtest = int(arg)
+         elif opt == '--HamTrain':
+             hamtrain = int(arg)
+         elif opt == '--SpamTrain':
+             spamtrain = int(arg)
          elif opt == '--ham-keep':
              hamkeep = int(arg)
***************
*** 123,127 ****
          usage(1, "-n is required")
  
!     msgs.setparms(hamkeep, spamkeep, seed)
      drive(nsets)
  
--- 153,160 ----
          usage(1, "-n is required")
  
!     if hamkeep is not None:
!         msgs.setparms(hamkeep, spamkeep, seed=seed)
!     else:
!         msgs.setparms(hamtrain, spamtrain, hamtest, spamtest, seed)
      drive(nsets)
  

Index: msgs.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/msgs.py,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** msgs.py	25 Sep 2002 20:07:06 -0000	1.4
--- msgs.py	1 Nov 2002 04:10:50 -0000	1.5
***************
*** 6,11 ****
  from tokenizer import tokenize
  
! HAMKEEP  = None
! SPAMKEEP = None
  SEED = random.randrange(2000000000)
  
--- 6,13 ----
  from tokenizer import tokenize
  
! HAMTEST  = None
! SPAMTEST = None
! HAMTRAIN  = None
! SPAMTRAIN = None
  SEED = random.randrange(2000000000)
  
***************
*** 68,83 ****
  
  class HamStream(MsgStream):
!     def __init__(self, tag, directories):
!         MsgStream.__init__(self, tag, directories, HAMKEEP)
  
  class SpamStream(MsgStream):
!     def __init__(self, tag, directories):
!         MsgStream.__init__(self, tag, directories, SPAMKEEP)
  
! def setparms(hamkeep, spamkeep, seed=None):
!     """Set HAMKEEP and SPAMKEEP.  If seed is not None, also set SEED."""
  
!     global HAMKEEP, SPAMKEEP, SEED
!     HAMKEEP, SPAMKEEP = hamkeep, spamkeep
      if seed is not None:
          SEED = seed
--- 70,103 ----
  
  class HamStream(MsgStream):
!     def __init__(self, tag, directories, train=0):
!         if train:
!             MsgStream.__init__(self, tag, directories, HAMTRAIN)
!         else:
!             MsgStream.__init__(self, tag, directories, HAMTEST)
  
  class SpamStream(MsgStream):
!     def __init__(self, tag, directories, train=0):
!         if train:
!             MsgStream.__init__(self, tag, directories, SPAMTRAIN)
!         else:
!             MsgStream.__init__(self, tag, directories, SPAMTEST)
  
! def setparms(hamtrain, spamtrain, hamtest=None, spamtest=None, seed=None):
!     """Set HAMTEST/TRAIN and SPAMTEST/TRAIN.  
!        If seed is not None, also set SEED.
!        If (ham|spam)test are not set, set to the same as the (ham|spam)train
!        numbers (backwards compat option).
!     """
  
!     global HAMTEST, SPAMTEST, HAMTRAIN, SPAMTRAIN, SEED
!     HAMTRAIN, SPAMTRAIN = hamtrain, spamtrain
!     if hamtest is None:
!         HAMTEST = HAMTRAIN
!     else:
!         HAMTEST = hamtest
!     if spamtest is None:
!         SPAMTEST = SPAMTRAIN
!     else:
!         SPAMTEST = spamtest
      if seed is not None:
          SEED = seed