[Python-checkins] python/nondist/sandbox/spambayes GBayes.py,1.5,1.6
gvanrossum@users.sourceforge.net
gvanrossum@users.sourceforge.net
Tue, 20 Aug 2002 11:22:52 -0700
Update of /cvsroot/python/python/nondist/sandbox/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv2396
Modified Files:
GBayes.py
Log Message:
Some minor cleanup:
- Move the identifying comment to the top, clarify it a bit, and add
author info.
- There's no reason for _time and _heapreplace to be hidden names;
change these back to time and heapreplace.
- Rename main1() to _test() and main2() to main(); when main() sees
there are no options or arguments, it runs _test().
- Replace the list comprehension in clearjunk() with a plain for loop.
- Bind wordinfo.get to a local variable in _add_msg().
Index: GBayes.py
===================================================================
RCS file: /cvsroot/python/python/nondist/sandbox/spambayes/GBayes.py,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** GBayes.py 20 Aug 2002 05:16:48 -0000 1.5
--- GBayes.py 20 Aug 2002 18:22:50 -0000 1.6
***************
*** 1,4 ****
--- 1,14 ----
#!/usr/bin/env python
+ # This is an implementation of the Bayes-like spam classifier sketched
+ # by Paul Graham at <http://www.paulgraham.com/spam.html>. We say
+ # "Bayes-like" because there are many ad hoc deviations from a
+ # "normal" Bayesian classifier.
+ #
+ # Tim Peters wrote the algorithmic part of the code.
+ #
+ # Barry Warsaw added integration infrastructure like command line
+ # options and a pickled database.
+
"""Usage: %(program)s [options]
***************
*** 25,34 ****
-o file
with -m, output all messages, with marks, to file
"""
import sys
import getopt
! import time as _time
! from heapq import heapreplace as _heapreplace
import cPickle as pickle
import mailbox
--- 35,46 ----
-o file
with -m, output all messages, with marks, to file
+
+ When called without any options or arguments, a short self-test is run.
"""
import sys
import getopt
! import time
! from heapq import heapreplace
import cPickle as pickle
import mailbox
***************
*** 38,44 ****
program = sys.argv[0]
- # This is an implementation of the Bayes-like spam classifier sketched at
- # <http://www.paulgraham.com/spam.html>. I say "Bayes-like" because there
- # are many ad hoc deviations from a "normal" Bayesian classifier.
HAMBIAS = 2.0
SPAMBIAS = 1.0
--- 50,53 ----
***************
*** 56,59 ****
--- 65,70 ----
# But the code snippet considers words that appear at least five times.
# This implementation follows the code rather than the explanation.
+ # (In addition, the count compared is after multiplying it with the
+ # appropriate bias factor.)
MINCOUNT = 5.0
***************
*** 122,126 ****
wordinfoget = self.wordinfo.get
! now = _time.time()
# A priority queue to remember the MAX_DISCRIMINATORS best
--- 133,137 ----
wordinfoget = self.wordinfo.get
! now = time.time()
# A priority queue to remember the MAX_DISCRIMINATORS best
***************
*** 140,144 ****
distance = abs(prob - 0.5)
if distance > smallest_best:
! _heapreplace(nbest, (distance, prob, word, record))
smallest_best = nbest[0][0]
--- 151,155 ----
distance = abs(prob - 0.5)
if distance > smallest_best:
! heapreplace(nbest, (distance, prob, word, record))
smallest_best = nbest[0][0]
***************
*** 234,245 ****
wordinfo = self.wordinfo
mincount = float(mincount)
! tonuke = [w for w, r in wordinfo.iteritems()
! if r.modtime < oldesttime and
! SPAMBIAS*r.spamcount + HAMBIAS*r.hamcount < mincount]
! for word in tonuke:
! if self.DEBUG:
! r = wordinfo[word]
! print "clearjunk removing word %r: %r" % (word, r)
! del wordinfo[word]
def _add_msg(self, wordstream, is_spam):
--- 245,254 ----
wordinfo = self.wordinfo
mincount = float(mincount)
! for w, r in wordinfo.iteritems():
! if (r.modtime < oldesttime and
! SPAMBIAS*r.spamcount + HAMBIAS*r.hamcount < mincount):
! if self.DEBUG:
! print "clearjunk removing word %r: %r" % (w, r)
! del wordinfo[w]
def _add_msg(self, wordstream, is_spam):
***************
*** 253,259 ****
wordinfo = self.wordinfo
! now = _time.time()
for word in wordstream:
! record = wordinfo.get(word)
if record is None:
record = wordinfo[word] = WordInfo(now)
--- 262,269 ----
wordinfo = self.wordinfo
! wordinfoget = wordinfo.get
! now = time.time()
for word in wordstream:
! record = wordinfoget(word)
if record is None:
record = wordinfo[word] = WordInfo(now)
***************
*** 511,515 ****
""" #'
! def main1():
b = GrahamBayes()
b.learn(tokenize(spam1), True)
--- 521,525 ----
""" #'
! def _test():
b = GrahamBayes()
b.learn(tokenize(spam1), True)
***************
*** 529,533 ****
! def main2():
try:
opts, args = getopt.getopt(sys.argv[1:], 'hg:s:u:p:c:m:o:')
--- 539,543 ----
! def main():
try:
opts, args = getopt.getopt(sys.argv[1:], 'hg:s:u:p:c:m:o:')
***************
*** 535,538 ****
--- 545,553 ----
usage(1, msg)
+ if not opts and not args:
+ # Called without options or arguments, run the self-test
+ _test()
+ return
+
threshold = count = good = spam = unknown = pck = mark = output = None
for opt, arg in opts:
***************
*** 668,670 ****
if __name__ == '__main__':
! main2()
--- 683,685 ----
if __name__ == '__main__':
! main()