[Spambayes-checkins]
spambayes Options.py,1.48,1.49 TestDriver.py,1.23,1.24 Tester.py,1.5,1.6
Tim Peters
tim_one@users.sourceforge.net
Wed, 16 Oct 2002 23:23:16 -0700
Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv1588
Modified Files:
Options.py TestDriver.py Tester.py
Log Message:
Adapted from a patch by T. Alexander Popiel, teaching Tester and
TestDriver about a middle ground (an "unsure" verdict between ham and
spam). Note that there's a new option, ham_cutoff! The range
ham_cutoff:spam_cutoff defines the middle ground.
Also repaired Tester's doctest, which started failing when we stopped
counting words multiple times per msg in training.
Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Options.py,v
retrieving revision 1.48
retrieving revision 1.49
diff -C2 -d -r1.48 -r1.49
*** Options.py 14 Oct 2002 17:13:47 -0000 1.48
--- Options.py 17 Oct 2002 06:23:13 -0000 1.49
***************
*** 103,109 ****
# These control various displays in class TestDriver.Driver, and Tester.Test.
! # A message is considered spam iff it scores greater than spam_cutoff.
! # This is corpus-dependent, and values into the .600's have been known
# to work best on some data.
spam_cutoff: 0.560
--- 103,120 ----
# These control various displays in class TestDriver.Driver, and Tester.Test.
! # spam_cutoff and ham_cutoff are used in Python slice sense:
! # A msg is considered ham if its score is in 0:ham_cutoff
! # A msg is considered unsure if its score is in ham_cutoff:spam_cutoff
! # A msg is considered spam if its score is in spam_cutoff:
! #
! # So it's unsure iff ham_cutoff <= score < spam_cutoff.
! # For a binary classifier, make ham_cutoff == spam_cutoff.
! # ham_cutoff > spam_cutoff doesn't make sense.
! #
! # The defaults are for the all-default Robinson scheme, which makes a
! # binary decision with no middle ground. The precise value that works
! # best is corpus-dependent, and values into the .600's have been known
# to work best on some data.
+ ham_cutoff: 0.560
spam_cutoff: 0.560
***************
*** 147,150 ****
--- 158,162 ----
show_false_positives: True
show_false_negatives: False
+ show_unsure: False
# Near the end of Driver.test(), you can get a listing of the 'best
***************
*** 312,315 ****
--- 324,328 ----
'show_false_positives': boolean_cracker,
'show_false_negatives': boolean_cracker,
+ 'show_unsure': boolean_cracker,
'show_histograms': boolean_cracker,
'show_best_discriminators': int_cracker,
***************
*** 318,321 ****
--- 331,335 ----
'pickle_basename': string_cracker,
'show_charlimit': int_cracker,
+ 'ham_cutoff': float_cracker,
'spam_cutoff': float_cracker,
'spam_directories': string_cracker,
Index: TestDriver.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/TestDriver.py,v
retrieving revision 1.23
retrieving revision 1.24
diff -C2 -d -r1.23 -r1.24
*** TestDriver.py 14 Oct 2002 18:04:56 -0000 1.23
--- TestDriver.py 17 Oct 2002 06:23:13 -0000 1.24
***************
*** 129,132 ****
--- 129,133 ----
self.falsepos = Set()
self.falseneg = Set()
+ self.unsure = Set()
self.global_ham_hist = Hist()
self.global_spam_hist = Hist()
***************
*** 187,190 ****
--- 188,196 ----
if options.show_histograms:
printhist("all runs:", self.global_ham_hist, self.global_spam_hist)
+
+ print "-> <stat> cost for all runs: $%.2f" % (
+ len(self.falsepos) * options.best_cutoff_fp_weight +
+ len(self.falseneg) * options.best_cutoff_fn_weight +
+ len(self.unsure) * options.best_cutoff_unsure_weight)
if options.save_histogram_pickles:
***************
*** 230,233 ****
--- 236,245 ----
print "-> <stat> false positive %:", t.false_positive_rate()
print "-> <stat> false negative %:", t.false_negative_rate()
+ print "-> <stat> unsure %:", t.unsure_rate()
+ print "-> <stat> cost: $%.2f" % (
+ t.nham_wrong * options.best_cutoff_fp_weight +
+ t.nspam_wrong * options.best_cutoff_fn_weight +
+ (t.nham_unsure + t.nspam_unsure) *
+ options.best_cutoff_unsure_weight)
newfpos = Set(t.false_positives()) - self.falsepos
***************
*** 251,254 ****
--- 263,278 ----
newfneg = ()
for e in newfneg:
+ print '*' * 78
+ prob, clues = c.spamprob(e, True)
+ printmsg(e, prob, clues)
+
+ newunsure = Set(t.unsures()) - self.unsure
+ self.unsure |= newunsure
+ print "-> <stat> %d new unsure" % len(newunsure)
+ if newunsure:
+ print " new unsure:", [e.tag for e in newunsure]
+ if not options.show_unsure:
+ newunsure = ()
+ for e in newunsure:
print '*' * 78
prob, clues = c.spamprob(e, True)
Index: Tester.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Tester.py,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** Tester.py 27 Sep 2002 21:18:18 -0000 1.5
--- Tester.py 17 Oct 2002 06:23:13 -0000 1.6
***************
*** 36,45 ****
--- 36,48 ----
self.nham_right = 0
self.nham_wrong = 0
+ self.nham_unsure = 0;
self.nspam_right = 0
self.nspam_wrong = 0
+ self.nspam_unsure = 0;
# Lists of bad predictions.
self.ham_wrong_examples = [] # False positives: ham called spam.
self.spam_wrong_examples = [] # False negatives: spam called ham.
+ self.unsure_examples = [] # ham and spam in middle ground
# Train the classifier on streams of ham and spam. Updates probabilities
***************
*** 85,107 ****
if callback:
callback(example, prob)
! is_spam_guessed = prob > options.spam_cutoff
! correct = is_spam_guessed == is_spam
if is_spam:
self.nspam_tested += 1
! if correct:
self.nspam_right += 1
! else:
self.nspam_wrong += 1
self.spam_wrong_examples.append(example)
else:
self.nham_tested += 1
! if correct:
self.nham_right += 1
! else:
self.nham_wrong += 1
self.ham_wrong_examples.append(example)
! assert self.nham_right + self.nham_wrong == self.nham_tested
! assert self.nspam_right + self.nspam_wrong == self.nspam_tested
def false_positive_rate(self):
--- 88,118 ----
if callback:
callback(example, prob)
! is_ham_guessed = prob < options.ham_cutoff
! is_spam_guessed = prob >= options.spam_cutoff
if is_spam:
self.nspam_tested += 1
! if is_spam_guessed:
self.nspam_right += 1
! elif is_ham_guessed:
self.nspam_wrong += 1
self.spam_wrong_examples.append(example)
+ else:
+ self.nspam_unsure += 1
+ self.unsure_examples.append(example)
else:
self.nham_tested += 1
! if is_ham_guessed:
self.nham_right += 1
! elif is_spam_guessed:
self.nham_wrong += 1
self.ham_wrong_examples.append(example)
+ else:
+ self.nham_unsure += 1
+ self.unsure_examples.append(example)
! assert (self.nham_right + self.nham_wrong + self.nham_unsure ==
! self.nham_tested)
! assert (self.nspam_right + self.nspam_wrong + self.nspam_unsure ==
! self.nspam_tested)
def false_positive_rate(self):
***************
*** 113,116 ****
--- 124,131 ----
return self.nspam_wrong * 1e2 / self.nspam_tested
+ def unsure_rate(self):
+ return ((self.nham_unsure + self.nspam_unsure) * 1e2 /
+ (self.nham_tested + self.nspam_tested))
+
def false_positives(self):
return self.ham_wrong_examples
***************
*** 119,122 ****
--- 134,139 ----
return self.spam_wrong_examples
+ def unsures(self):
+ return self.unsure_examples
class _Example:
***************
*** 129,146 ****
_easy_test = """
>>> from classifier import Bayes
! >>> good1 = _Example('', ['a', 'b', 'c'] * 10)
! >>> good2 = _Example('', ['a', 'b'] * 10)
! >>> bad1 = _Example('', ['d'] * 10)
>>> t = Test(Bayes())
>>> t.train([good1, good2], [bad1])
>>> t.predict([_Example('goodham', ['a', 'b']),
! ... _Example('badham', ['d'])
... ], False)
! >>> t.predict([_Example('goodspam', ['d', 'd']),
! ... _Example('badspam1', ['c']),
! ... _Example('badspam2', ['a'] * 15 + ['d'] * 1000),
! ... _Example('badspam3', ['d', 'a', 'b', 'c'])
... ], True)
--- 146,165 ----
_easy_test = """
>>> from classifier import Bayes
+ >>> from Options import options
+ >>> options.ham_cutoff = options.spam_cutoff = 0.5
! >>> good1 = _Example('', ['a', 'b', 'c'])
! >>> good2 = _Example('', ['a', 'b'])
! >>> bad1 = _Example('', ['c', 'd'])
>>> t = Test(Bayes())
>>> t.train([good1, good2], [bad1])
>>> t.predict([_Example('goodham', ['a', 'b']),
! ... _Example('badham', ['d']) # FP
... ], False)
! >>> t.predict([_Example('goodspam', ['d']),
! ... _Example('badspam1', ['a']), # FN
! ... _Example('badspam2', ['a', 'b']), # FN
! ... _Example('badspam3', ['d', 'a', 'b']) # FN
... ], True)
***************
*** 162,165 ****
--- 181,189 ----
>>> [e.name for e in t.false_negatives()]
['badspam1', 'badspam2', 'badspam3']
+
+ >>> [e.name for e in t.unsures()]
+ []
+ >>> t.unsure_rate()
+ 0.0
"""