[Spambayes-checkins] spambayes neilfilter.py,1.2,1.3

Neil Schemenauer nascheme@users.sourceforge.net
Mon, 30 Sep 2002 18:33:46 -0700


Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv28286

Modified Files:
	neilfilter.py 
Log Message:
Update to work with the "Robinson" classifier.  Reuse the code from the
classifier module by using a Cdb wrapper class that creates the WordInfo
objects.


Index: neilfilter.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/neilfilter.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** neilfilter.py	20 Sep 2002 03:14:42 -0000	1.2
--- neilfilter.py	1 Oct 2002 01:33:44 -0000	1.3
***************
*** 10,19 ****
  import socket
  import email
- from heapq import heapreplace
- from sets import Set
  import cdb
  from tokenizer import tokenize
! from classifier import MIN_SPAMPROB, MAX_SPAMPROB, UNKNOWN_SPAMPROB, \
!     MAX_DISCRIMINATORS
  
  program = sys.argv[0] # For usage(); referenced by docstring above
--- 10,16 ----
  import socket
  import email
  import cdb
  from tokenizer import tokenize
! import classifier
  
  program = sys.argv[0] # For usage(); referenced by docstring above
***************
*** 21,99 ****
  BLOCK_SIZE = 10000
  SIZE_LIMIT = 5000000 # messages larger are not analyzed
! SPAM_THRESHOLD = 0.9
! 
! def spamprob(wordprobs, wordstream):
!     """Return best-guess probability that wordstream is spam.
! 
!     wordprobs is a CDB of word probabilities
! 
!     wordstream is an iterable object producing words.
!     The return value is a float in [0.0, 1.0].
!     """
! 
!     # A priority queue to remember the MAX_DISCRIMINATORS best
!     # probabilities, where "best" means largest distance from 0.5.
!     # The tuples are (distance, prob, word).
!     nbest = [(-1.0, None, None)] * MAX_DISCRIMINATORS
!     smallest_best = -1.0
! 
!     mins = []   # all words w/ prob MIN_SPAMPROB
!     maxs = []   # all words w/ prob MAX_SPAMPROB
!     # Counting a unique word multiple times hurts, although counting one
!     # at most two times had some benefit whan UNKNOWN_SPAMPROB was 0.2.
!     # When that got boosted to 0.5, counting more than once became
!     # counterproductive.
!     for word in Set(wordstream):
!         prob = float(wordprobs.get(word, UNKNOWN_SPAMPROB))
!         distance = abs(prob - 0.5)
!         if prob == MIN_SPAMPROB:
!             mins.append((distance, prob, word))
!         elif prob == MAX_SPAMPROB:
!             maxs.append((distance, prob, word))
!         elif distance > smallest_best:
!             # Subtle:  we didn't use ">" instead of ">=" just to save
!             # calls to heapreplace().  The real intent is that if
!             # there are many equally strong indicators throughout the
!             # message, we want to favor the ones that appear earliest:
!             # it's expected that spam headers will often have smoking
!             # guns, and, even when not, spam has to grab your attention
!             # early (& note that when spammers generate large blocks of
!             # random gibberish to throw off exact-match filters, it's
!             # always at the end of the msg -- if they put it at the
!             # start, *nobody* would read the msg).
!             heapreplace(nbest, (distance, prob, word))
!             smallest_best = nbest[0][0]
! 
!     # Compute the probability.  Note:  This is what Graham's code did,
!     # but it's dubious for reasons explained in great detail on Python-
!     # Dev:  it's missing P(spam) and P(not-spam) adjustments that
!     # straightforward Bayesian analysis says should be here.  It's
!     # unclear how much it matters, though, as the omissions here seem
!     # to tend in part to cancel out distortions introduced earlier by
!     # HAMBIAS.  Experiments will decide the issue.
  
!     # First cancel out competing extreme clues (see comment block at
!     # MAX_DISCRIMINATORS declaration -- this is a twist on Graham).
!     if mins or maxs:
!         if len(mins) < len(maxs):
!             shorter, longer = mins, maxs
          else:
!             shorter, longer = maxs, mins
!         tokeep = min(len(longer) - len(shorter), MAX_DISCRIMINATORS)
!         # They're all good clues, but we're only going to feed the tokeep
!         # initial clues from the longer list into the probability
!         # computation.
!         for x in longer[:tokeep]:
!             heapreplace(nbest, x)
! 
!     prob_product = inverse_prob_product = 1.0
!     for distance, prob, word in nbest:
!         if prob is None:    # it's one of the dummies nbest started with
!             continue
!         prob_product *= prob
!         inverse_prob_product *= 1.0 - prob
! 
!     prob = prob_product / (prob_product + inverse_prob_product)
!     return prob
  
  def maketmp(dir):
--- 18,37 ----
  BLOCK_SIZE = 10000
  SIZE_LIMIT = 5000000 # messages larger are not analyzed
! SPAM_CUTOFF = 0.57
  
! class CdbWrapper(cdb.Cdb):
!     def get(self, key, default=None,
!             cdb_get=cdb.Cdb.get,
!             WordInfo=classifier.WordInfo):
!         prob = cdb_get(self, key, default)
!         if prob is None:
!             return None
          else:
!             return WordInfo(0, float(prob))
!     
! class CdbBayes(classifier.Bayes):
!     def __init__(self, cdbfile):
!         classifier.Bayes.__init__(self)
!         self.wordinfo = CdbWrapper(cdbfile)
  
  def maketmp(dir):
***************
*** 156,165 ****
              msg = email.message_from_string(msgdata)
              del msgdata
!             wordprobs = cdb.Cdb(open(wordprobfilename, 'rb'))
!             prob = spamprob(wordprobs, tokenize(msg))
          else:
              prob = 0.0
  
!         if prob > SPAM_THRESHOLD:
              os.rename(pathname, "%s/new/%s" % (spamdir, filename))
          else:
--- 94,103 ----
              msg = email.message_from_string(msgdata)
              del msgdata
!             bayes = CdbBayes(open(wordprobfilename, 'rb'))
!             prob = bayes.spamprob(tokenize(msg))
          else:
              prob = 0.0
  
!         if prob > SPAM_CUTOFF:
              os.rename(pathname, "%s/new/%s" % (spamdir, filename))
          else: