[Spambayes-checkins] spambayes TestDriver.py,1.1,1.2 Tester.py,1.2,1.3

Tim Peters tim_one@users.sourceforge.net
Fri, 13 Sep 2002 10:49:06 -0700


Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv30444

Modified Files:
	TestDriver.py Tester.py 
Log Message:
A little closer to N-fold cross validation.

Removed the Tester nham and nspam attributes.  If used properly, they
should have exactly the same values as the classifier's attributes of
the same names.  Duplicating the info just created more chances to
screw up.

Changed when classifier pickles are saved, from immediately after
training to Driver.finishtest().  This way meaningful killcounts
are pickled.  Since WordInfo.spamprob is almost never 0.5 anymore,
it would be nice to have another gimmick for pruning junk from the
database that doesn't rely on months going by to see which records
remain unused.  It *may* work well to prune away WordInfo records
that never survived into spamprob()'s nbest list during testing.  That's
speculation and needs to be verified via testing, and I don't expect to
get to that in the near future.  Note that testing this would require
splitting the data in a different way since, by construction, a word
with killcount=0 had no effect whatsoever on any outcome during
predictions.

A very quick check suggested that about half the words in a database
do have killcount 0.  I'm surprised it's not a lot more than that, so
maybe I did something wrong -- or maybe that's really how things are.


Index: TestDriver.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/TestDriver.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** TestDriver.py	13 Sep 2002 16:26:58 -0000	1.1
--- TestDriver.py	13 Sep 2002 17:49:02 -0000	1.2
***************
*** 12,15 ****
--- 12,17 ----
  #             test(ham, spam)
  #         # Display stats against all runs on this classifier variant.
+ #         # This also saves the trained classifer, if desired (option
+ #         # save_trained_pickles).
  #         finishtest()
  # # Display stats against all runs.
***************
*** 86,123 ****
          self.global_ham_hist = Hist(options.nbuckets)
          self.global_spam_hist = Hist(options.nbuckets)
!         self.ntimes_train_called = 0
  
      def train(self, ham, spam):
!         self.classifier = classifier.GrahamBayes()
!         t = self.tester = Tester.Test(self.classifier)
  
          print "Training on", ham, "&", spam, "...",
          t.train(ham, spam)
!         print t.nham, "hams &", t.nspam, "spams"
!         self.orig_nham = t.nham
!         self.orig_nspam = t.nspam
  
          self.trained_ham_hist = Hist(options.nbuckets)
          self.trained_spam_hist = Hist(options.nbuckets)
  
-         self.ntimes_train_called += 1
-         if options.save_trained_pickles:
-             fname = "%s%d.pik" % (options.pickle_basename,
-                                   self.ntimes_train_called)
-             print "    saving pickle to", fname
-             fp = file(fname, 'wb')
-             pickle.dump(self.classifier, fp, 1)
-             fp.close()
- 
      def forget(self, ham, spam):
!         c = self.classifier
!         t = self.tester
!         nham, nspam = self.orig_nham, self.orig_nspam
!         t.set_classifier(c.copy(), nham, nspam)
  
          print "Forgetting", ham, "&", spam, "...",
!         t.untrain(ham, spam)
!         print nham - t.nham, "hams &", nspam - t.nspam, "spams"
  
          self.trained_ham_hist = Hist(options.nbuckets)
          self.trained_spam_hist = Hist(options.nbuckets)
--- 88,118 ----
          self.global_ham_hist = Hist(options.nbuckets)
          self.global_spam_hist = Hist(options.nbuckets)
!         self.ntimes_finishtest_called = 0
  
      def train(self, ham, spam):
!         c = self.classifier = classifier.GrahamBayes()
!         t = self.tester = Tester.Test(c)
  
          print "Training on", ham, "&", spam, "...",
          t.train(ham, spam)
!         print c.nham, "hams &", c.nspam, "spams"
  
          self.trained_ham_hist = Hist(options.nbuckets)
          self.trained_spam_hist = Hist(options.nbuckets)
  
      def forget(self, ham, spam):
!         import copy
  
          print "Forgetting", ham, "&", spam, "...",
!         c = self.classifier
!         nham, nspam = c.nham, c.nspam
!         c = copy.deepcopy(c)
!         t.set_classifier(c)
! 
!         self.tester.untrain(ham, spam)
!         print nham - c.nham, "hams &", nspam - c.nspam, "spams"
  
+         self.global_ham_hist += self.trained_ham_hist
+         self.global_spam_hist += self.trained_spam_hist
          self.trained_ham_hist = Hist(options.nbuckets)
          self.trained_spam_hist = Hist(options.nbuckets)
***************
*** 129,132 ****
--- 124,136 ----
          self.global_ham_hist += self.trained_ham_hist
          self.global_spam_hist += self.trained_spam_hist
+ 
+         self.ntimes_finishtest_called += 1
+         if options.save_trained_pickles:
+             fname = "%s%d.pik" % (options.pickle_basename,
+                                   self.ntimes_finishtest_called)
+             print "    saving pickle to", fname
+             fp = file(fname, 'wb')
+             pickle.dump(self.classifier, fp, 1)
+             fp.close()
  
      def alldone(self):

Index: Tester.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Tester.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** Tester.py	13 Sep 2002 16:26:58 -0000	1.2
--- Tester.py	13 Sep 2002 17:49:02 -0000	1.3
***************
*** 20,33 ****
  
      def __init__(self, classifier):
!         self.set_classifier(classifier, 0, 0)
          self.reset_test_results()
  
!     # Tell the tester which classifier to use, and how many ham and spam it's
!     # been trained on.
!     def set_classifier(self, classifier, nham, nspam):
          self.classifier = classifier
-         # The number of ham and spam instances in the training data.
-         self.nham = nham
-         self.nspam = nspam
  
      def reset_test_results(self):
--- 20,29 ----
  
      def __init__(self, classifier):
!         self.set_classifier(classifier)
          self.reset_test_results()
  
!     # Tell the tester which classifier to use.
!     def set_classifier(self, classifier):
          self.classifier = classifier
  
      def reset_test_results(self):
***************
*** 53,61 ****
              for example in hamstream:
                  learn(example, False, False)
-                 self.nham += 1
          if spamstream is not None:
              for example in spamstream:
                  learn(example, True, False)
-                 self.nspam += 1
          self.classifier.update_probabilities()
  
--- 49,55 ----
***************
*** 68,76 ****
              for example in hamstream:
                  unlearn(example, False, False)
-                 self.nham -= 1
          if spamstream is not None:
              for example in spamstream:
                  unlearn(example, True, False)
-                 self.nspam -= 1
          self.classifier.update_probabilities()
  
--- 62,68 ----