[Python-checkins] python/nondist/sandbox/spambayes GBayes.py,1.5,1.6

gvanrossum@users.sourceforge.net gvanrossum@users.sourceforge.net
Tue, 20 Aug 2002 11:22:52 -0700


Update of /cvsroot/python/python/nondist/sandbox/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv2396

Modified Files:
	GBayes.py 
Log Message:
Some minor cleanup:

- Move the identifying comment to the top, clarify it a bit, and add
  author info.

- There's no reason for _time and _heapreplace to be hidden names;
  change these back to time and heapreplace.

- Rename main1() to _test() and main2() to main(); when main() sees
  there are no options or arguments, it runs _test().

- Get rid of the list comprehension in clearjunk().

- Bind wordinfo.get to a local variable in _add_msg().


Index: GBayes.py
===================================================================
RCS file: /cvsroot/python/python/nondist/sandbox/spambayes/GBayes.py,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** GBayes.py	20 Aug 2002 05:16:48 -0000	1.5
--- GBayes.py	20 Aug 2002 18:22:50 -0000	1.6
***************
*** 1,4 ****
--- 1,14 ----
  #!/usr/bin/env python
  
+ # This is an implementation of the Bayes-like spam classifier sketched
+ # by Paul Graham at <http://www.paulgraham.com/spam.html>.  We say
+ # "Bayes-like" because there are many ad hoc deviations from a
+ # "normal" Bayesian classifier.
+ #
+ # Tim Peters wrote the algorithmic part of the code.
+ #
+ # Barry Warsaw added integration infrastructure like command line
+ # options and a pickled database.
+ 
  """Usage: %(program)s [options]
  
***************
*** 25,34 ****
      -o file
          with -m, output all messages, with marks, to file
  """
  
  import sys
  import getopt
! import time as _time
! from heapq import heapreplace as _heapreplace
  import cPickle as pickle
  import mailbox
--- 35,46 ----
      -o file
          with -m, output all messages, with marks, to file
+ 
+ When called without any options or arguments, a short self-test is run.
  """
  
  import sys
  import getopt
! import time
! from heapq import heapreplace
  import cPickle as pickle
  import mailbox
***************
*** 38,44 ****
  program = sys.argv[0]
  
- # This is an implementation of the Bayes-like spam classifier sketched at
- # <http://www.paulgraham.com/spam.html>.  I say "Bayes-like" because there
- # are many ad hoc deviations from a "normal" Bayesian classifier.
  HAMBIAS  = 2.0
  SPAMBIAS = 1.0
--- 50,53 ----
***************
*** 56,59 ****
--- 65,70 ----
  # But the code snippet considers words that appear at least five times.
  # This implementation follows the code rather than the explanation.
+ # (In addition, the count compared is after multiplying it with the
+ # appropriate bias factor.)
  MINCOUNT = 5.0
  
***************
*** 122,126 ****
  
          wordinfoget = self.wordinfo.get
!         now = _time.time()
  
          # A priority queue to remember the MAX_DISCRIMINATORS best
--- 133,137 ----
  
          wordinfoget = self.wordinfo.get
!         now = time.time()
  
          # A priority queue to remember the MAX_DISCRIMINATORS best
***************
*** 140,144 ****
              distance = abs(prob - 0.5)
              if distance > smallest_best:
!                 _heapreplace(nbest, (distance, prob, word, record))
                  smallest_best = nbest[0][0]
  
--- 151,155 ----
              distance = abs(prob - 0.5)
              if distance > smallest_best:
!                 heapreplace(nbest, (distance, prob, word, record))
                  smallest_best = nbest[0][0]
  
***************
*** 234,245 ****
          wordinfo = self.wordinfo
          mincount = float(mincount)
!         tonuke = [w for w, r in wordinfo.iteritems()
!                     if r.modtime < oldesttime and
!                        SPAMBIAS*r.spamcount + HAMBIAS*r.hamcount < mincount]
!         for word in tonuke:
!             if self.DEBUG:
!                 r = wordinfo[word]
!                 print "clearjunk removing word %r: %r" % (word, r)
!             del wordinfo[word]
  
      def _add_msg(self, wordstream, is_spam):
--- 245,254 ----
          wordinfo = self.wordinfo
          mincount = float(mincount)
!         for w, r in wordinfo.iteritems():
!             if (r.modtime < oldesttime and
!                 SPAMBIAS*r.spamcount + HAMBIAS*r.hamcount < mincount):
!                 if self.DEBUG:
!                     print "clearjunk removing word %r: %r" % (w, r)
!             del wordinfo[w]
  
      def _add_msg(self, wordstream, is_spam):
***************
*** 253,259 ****
  
          wordinfo = self.wordinfo
!         now = _time.time()
          for word in wordstream:
!             record = wordinfo.get(word)
              if record is None:
                  record = wordinfo[word] = WordInfo(now)
--- 262,269 ----
  
          wordinfo = self.wordinfo
!         wordinfoget = wordinfo.get
!         now = time.time()
          for word in wordstream:
!             record = wordinfoget(word)
              if record is None:
                  record = wordinfo[word] = WordInfo(now)
***************
*** 511,515 ****
  """ #'
  
! def main1():
      b = GrahamBayes()
      b.learn(tokenize(spam1), True)
--- 521,525 ----
  """ #'
  
! def _test():
      b = GrahamBayes()
      b.learn(tokenize(spam1), True)
***************
*** 529,533 ****
  
  
! def main2():
      try:
          opts, args = getopt.getopt(sys.argv[1:], 'hg:s:u:p:c:m:o:')
--- 539,543 ----
  
  
! def main():
      try:
          opts, args = getopt.getopt(sys.argv[1:], 'hg:s:u:p:c:m:o:')
***************
*** 535,538 ****
--- 545,553 ----
          usage(1, msg)
  
+     if not opts and not args:
+         # Called without options or arguments, run the self-test
+         _test()
+         return
+ 
      threshold = count = good = spam = unknown = pck = mark = output = None
      for opt, arg in opts:
***************
*** 668,670 ****
  
  if __name__ == '__main__':
!     main2()
--- 683,685 ----
  
  if __name__ == '__main__':
!     main()