[Spambayes-checkins] spambayes Options.py,1.48,1.49 TestDriver.py,1.23,1.24 Tester.py,1.5,1.6

Tim Peters tim_one@users.sourceforge.net
Wed, 16 Oct 2002 23:23:16 -0700


Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv1588

Modified Files:
	Options.py TestDriver.py Tester.py 
Log Message:
Adapted from a patch by T. Alexander Popiel, teaching Tester and
TestDriver about a middle ground.  Note that there's a new option
ham_cutoff!  The range ham_cutoff:spam_cutoff defines the middle ground.

Also repaired Tester's doctest, which started failing when we stopped
counting words multiple times per msg in training.
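
(Not part of the checkin, just orientation for the diff below: the new
decision procedure is two comparisons.  A minimal sketch; the
score_to_label name and the 0.30/0.70 cutoffs are illustrative only,
the boundary behavior matches the new Tester.predict code.)

    # Sketch: a msg is ham if its score is in 0:ham_cutoff, unsure in
    # ham_cutoff:spam_cutoff, and spam in spam_cutoff: (slice sense).
    def score_to_label(score, ham_cutoff=0.30, spam_cutoff=0.70):
        assert ham_cutoff <= spam_cutoff
        if score < ham_cutoff:
            return 'ham'
        elif score < spam_cutoff:
            return 'unsure'
        else:
            return 'spam'

    print score_to_label(0.15)   # ham
    print score_to_label(0.50)   # unsure
    print score_to_label(0.95)   # spam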


Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Options.py,v
retrieving revision 1.48
retrieving revision 1.49
diff -C2 -d -r1.48 -r1.49
*** Options.py	14 Oct 2002 17:13:47 -0000	1.48
--- Options.py	17 Oct 2002 06:23:13 -0000	1.49
***************
*** 103,109 ****
  # These control various displays in class TestDriver.Driver, and Tester.Test.
  
! # A message is considered spam iff it scores greater than spam_cutoff.
! # This is corpus-dependent, and values into the .600's have been known
  # to work best on some data.
  spam_cutoff: 0.560
  
--- 103,120 ----
  # These control various displays in class TestDriver.Driver, and Tester.Test.
  
! # spam_cutoff and ham_cutoff are used in Python slice sense:
! #    A msg is considered    ham if its score is in 0:ham_cutoff
! #    A msg is considered unsure if its score is in ham_cutoff:spam_cutoff
! #    A msg is considered   spam if its score is in spam_cutoff:
! #
! # So it's unsure iff  ham_cutoff <= score < spam_cutoff.
! # For a binary classifier, make ham_cutoff == spam_cutoff.
! # ham_cutoff > spam_cutoff doesn't make sense.
! #
! # The defaults are for the all-default Robinson scheme, which makes a
! # binary decision with no middle ground.  The precise value that works
! # best is corpus-dependent, and values into the .600's have been known
  # to work best on some data.
+ ham_cutoff:  0.560
  spam_cutoff: 0.560
  
***************
*** 147,150 ****
--- 158,162 ----
  show_false_positives: True
  show_false_negatives: False
+ show_unsure: False
  
  # Near the end of Driver.test(), you can get a listing of the 'best
***************
*** 312,315 ****
--- 324,328 ----
                     'show_false_positives': boolean_cracker,
                     'show_false_negatives': boolean_cracker,
+                    'show_unsure': boolean_cracker,
                     'show_histograms': boolean_cracker,
                     'show_best_discriminators': int_cracker,
***************
*** 318,321 ****
--- 331,335 ----
                     'pickle_basename': string_cracker,
                     'show_charlimit': int_cracker,
+                    'ham_cutoff': float_cracker,
                     'spam_cutoff': float_cracker,
                     'spam_directories': string_cracker,
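
Since ham_cutoff is cracked as a float, just like spam_cutoff, it can be
overridden the same way (the Tester doctest below pins both to 0.5 to
force a binary decision).  An illustrative override; the 0.30/0.70
values here are made up:

    from Options import options

    # Spread the cutoffs apart to open a middle ground: scores in
    # [0.30, 0.70) now come back unsure.
    options.ham_cutoff = 0.30
    options.spam_cutoff = 0.70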

Index: TestDriver.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/TestDriver.py,v
retrieving revision 1.23
retrieving revision 1.24
diff -C2 -d -r1.23 -r1.24
*** TestDriver.py	14 Oct 2002 18:04:56 -0000	1.23
--- TestDriver.py	17 Oct 2002 06:23:13 -0000	1.24
***************
*** 129,132 ****
--- 129,133 ----
          self.falsepos = Set()
          self.falseneg = Set()
+         self.unsure = Set()
          self.global_ham_hist = Hist()
          self.global_spam_hist = Hist()
***************
*** 187,190 ****
--- 188,196 ----
          if options.show_histograms:
              printhist("all runs:", self.global_ham_hist, self.global_spam_hist)
+         
+         print "-> <stat> cost for all runs: $%.2f" % (
+                len(self.falsepos) * options.best_cutoff_fp_weight +
+                len(self.falseneg) * options.best_cutoff_fn_weight +
+                len(self.unsure) * options.best_cutoff_unsure_weight)
  
          if options.save_histogram_pickles:
***************
*** 230,233 ****
--- 236,245 ----
          print "-> <stat> false positive %:", t.false_positive_rate()
          print "-> <stat> false negative %:", t.false_negative_rate()
+         print "-> <stat> unsure %:", t.unsure_rate()
+         print "-> <stat> cost: $%.2f" % (
+                t.nham_wrong * options.best_cutoff_fp_weight +
+                t.nspam_wrong * options.best_cutoff_fn_weight +
+                (t.nham_unsure + t.nspam_unsure) *
+                options.best_cutoff_unsure_weight)
  
          newfpos = Set(t.false_positives()) - self.falsepos
***************
*** 251,254 ****
--- 263,278 ----
              newfneg = ()
          for e in newfneg:
+             print '*' * 78
+             prob, clues = c.spamprob(e, True)
+             printmsg(e, prob, clues)
+ 
+         newunsure = Set(t.unsures()) - self.unsure
+         self.unsure |= newunsure
+         print "-> <stat> %d new unsure" % len(newunsure)
+         if newunsure:
+             print "    new unsure:", [e.tag for e in newunsure]
+         if not options.show_unsure:
+             newunsure = ()
+         for e in newunsure:
              print '*' * 78
              prob, clues = c.spamprob(e, True)
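
The new cost lines weight the three buckets separately, so a middle
ground can trade false positives and false negatives for (presumably
cheaper) unsures.  A worked sketch of the arithmetic; the counts and
weights below are invented, and the real weights come from the
best_cutoff_*_weight options:

    # Sketch of the cost formula printed above (invented numbers).
    n_fp, n_fn, n_unsure = 1, 2, 10
    fp_weight, fn_weight, unsure_weight = 10.0, 1.0, 0.20
    cost = (n_fp * fp_weight +
            n_fn * fn_weight +
            n_unsure * unsure_weight)
    print "-> <stat> cost: $%.2f" % cost    # -> <stat> cost: $14.00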

Index: Tester.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Tester.py,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** Tester.py	27 Sep 2002 21:18:18 -0000	1.5
--- Tester.py	17 Oct 2002 06:23:13 -0000	1.6
***************
*** 36,45 ****
--- 36,48 ----
          self.nham_right = 0
          self.nham_wrong = 0
+         self.nham_unsure = 0
          self.nspam_right = 0
          self.nspam_wrong = 0
+         self.nspam_unsure = 0
  
          # Lists of bad predictions.
          self.ham_wrong_examples = []    # False positives:  ham called spam.
          self.spam_wrong_examples = []   # False negatives:  spam called ham.
+         self.unsure_examples = []       # ham and spam in middle ground
  
      # Train the classifier on streams of ham and spam.  Updates probabilities
***************
*** 85,107 ****
              if callback:
                  callback(example, prob)
!             is_spam_guessed = prob > options.spam_cutoff
!             correct = is_spam_guessed == is_spam
              if is_spam:
                  self.nspam_tested += 1
!                 if correct:
                      self.nspam_right += 1
!                 else:
                      self.nspam_wrong += 1
                      self.spam_wrong_examples.append(example)
              else:
                  self.nham_tested += 1
!                 if correct:
                      self.nham_right += 1
!                 else:
                      self.nham_wrong += 1
                      self.ham_wrong_examples.append(example)
  
!         assert self.nham_right + self.nham_wrong == self.nham_tested
!         assert self.nspam_right + self.nspam_wrong == self.nspam_tested
  
      def false_positive_rate(self):
--- 88,118 ----
              if callback:
                  callback(example, prob)
!             is_ham_guessed  = prob <  options.ham_cutoff
!             is_spam_guessed = prob >= options.spam_cutoff
              if is_spam:
                  self.nspam_tested += 1
!                 if is_spam_guessed:
                      self.nspam_right += 1
!                 elif is_ham_guessed:
                      self.nspam_wrong += 1
                      self.spam_wrong_examples.append(example)
+                 else:
+                     self.nspam_unsure += 1
+                     self.unsure_examples.append(example)
              else:
                  self.nham_tested += 1
!                 if is_ham_guessed:
                      self.nham_right += 1
!                 elif is_spam_guessed:
                      self.nham_wrong += 1
                      self.ham_wrong_examples.append(example)
+                 else:
+                     self.nham_unsure += 1
+                     self.unsure_examples.append(example)
  
!         assert (self.nham_right + self.nham_wrong + self.nham_unsure ==
!                 self.nham_tested)
!         assert (self.nspam_right + self.nspam_wrong + self.nspam_unsure ==
!                 self.nspam_tested)
  
      def false_positive_rate(self):
***************
*** 113,116 ****
--- 124,131 ----
          return self.nspam_wrong * 1e2 / self.nspam_tested
  
+     def unsure_rate(self):
+         return ((self.nham_unsure + self.nspam_unsure) * 1e2 /
+                 (self.nham_tested + self.nspam_tested))
+ 
      def false_positives(self):
          return self.ham_wrong_examples
***************
*** 119,122 ****
--- 134,139 ----
          return self.spam_wrong_examples
  
+     def unsures(self):
+         return self.unsure_examples
  
  class _Example:
***************
*** 129,146 ****
  _easy_test = """
      >>> from classifier import Bayes
  
!     >>> good1 = _Example('', ['a', 'b', 'c'] * 10)
!     >>> good2 = _Example('', ['a', 'b'] * 10)
!     >>> bad1 = _Example('', ['d'] * 10)
  
      >>> t = Test(Bayes())
      >>> t.train([good1, good2], [bad1])
      >>> t.predict([_Example('goodham', ['a', 'b']),
!     ...            _Example('badham', ['d'])
      ...           ], False)
!     >>> t.predict([_Example('goodspam', ['d', 'd']),
!     ...            _Example('badspam1', ['c']),
!     ...            _Example('badspam2', ['a'] * 15 + ['d'] * 1000),
!     ...            _Example('badspam3', ['d', 'a', 'b', 'c'])
      ...           ], True)
  
--- 146,165 ----
  _easy_test = """
      >>> from classifier import Bayes
+     >>> from Options import options
+     >>> options.ham_cutoff = options.spam_cutoff = 0.5
  
!     >>> good1 = _Example('', ['a', 'b', 'c'])
!     >>> good2 = _Example('', ['a', 'b'])
!     >>> bad1 = _Example('', ['c', 'd'])
  
      >>> t = Test(Bayes())
      >>> t.train([good1, good2], [bad1])
      >>> t.predict([_Example('goodham', ['a', 'b']),
!     ...            _Example('badham', ['d'])    # FP
      ...           ], False)
!     >>> t.predict([_Example('goodspam', ['d']),
!     ...            _Example('badspam1', ['a']), # FN
!     ...            _Example('badspam2', ['a', 'b']),    # FN
!     ...            _Example('badspam3', ['d', 'a', 'b'])    # FN
      ...           ], True)
  
***************
*** 162,165 ****
--- 181,189 ----
      >>> [e.name for e in t.false_negatives()]
      ['badspam1', 'badspam2', 'badspam3']
+ 
+     >>> [e.name for e in t.unsures()]
+     []
+     >>> t.unsure_rate()
+     0.0
  """