[Spambayes-checkins] spambayes neilfilter.py,1.2,1.3
Neil Schemenauer
nascheme@users.sourceforge.net
Mon, 30 Sep 2002 18:33:46 -0700
Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv28286
Modified Files:
neilfilter.py
Log Message:
Update to work with the "Robinson" classifier. Reuse the code from the
classifier module by using a Cdb wrapper class that creates the WordInfo
objects.
Index: neilfilter.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/neilfilter.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** neilfilter.py 20 Sep 2002 03:14:42 -0000 1.2
--- neilfilter.py 1 Oct 2002 01:33:44 -0000 1.3
***************
*** 10,19 ****
import socket
import email
- from heapq import heapreplace
- from sets import Set
import cdb
from tokenizer import tokenize
! from classifier import MIN_SPAMPROB, MAX_SPAMPROB, UNKNOWN_SPAMPROB, \
! MAX_DISCRIMINATORS
program = sys.argv[0] # For usage(); referenced by docstring above
--- 10,16 ----
import socket
import email
import cdb
from tokenizer import tokenize
! import classifier
program = sys.argv[0] # For usage(); referenced by docstring above
***************
*** 21,99 ****
BLOCK_SIZE = 10000
SIZE_LIMIT = 5000000 # messages larger are not analyzed
! SPAM_THRESHOLD = 0.9
!
! def spamprob(wordprobs, wordstream):
! """Return best-guess probability that wordstream is spam.
!
! wordprobs is a CDB of word probabilities
!
! wordstream is an iterable object producing words.
! The return value is a float in [0.0, 1.0].
! """
!
! # A priority queue to remember the MAX_DISCRIMINATORS best
! # probabilities, where "best" means largest distance from 0.5.
! # The tuples are (distance, prob, word).
! nbest = [(-1.0, None, None)] * MAX_DISCRIMINATORS
! smallest_best = -1.0
!
! mins = [] # all words w/ prob MIN_SPAMPROB
! maxs = [] # all words w/ prob MAX_SPAMPROB
! # Counting a unique word multiple times hurts, although counting one
! # at most two times had some benefit whan UNKNOWN_SPAMPROB was 0.2.
! # When that got boosted to 0.5, counting more than once became
! # counterproductive.
! for word in Set(wordstream):
! prob = float(wordprobs.get(word, UNKNOWN_SPAMPROB))
! distance = abs(prob - 0.5)
! if prob == MIN_SPAMPROB:
! mins.append((distance, prob, word))
! elif prob == MAX_SPAMPROB:
! maxs.append((distance, prob, word))
! elif distance > smallest_best:
! # Subtle: we didn't use ">" instead of ">=" just to save
! # calls to heapreplace(). The real intent is that if
! # there are many equally strong indicators throughout the
! # message, we want to favor the ones that appear earliest:
! # it's expected that spam headers will often have smoking
! # guns, and, even when not, spam has to grab your attention
! # early (& note that when spammers generate large blocks of
! # random gibberish to throw off exact-match filters, it's
! # always at the end of the msg -- if they put it at the
! # start, *nobody* would read the msg).
! heapreplace(nbest, (distance, prob, word))
! smallest_best = nbest[0][0]
!
! # Compute the probability. Note: This is what Graham's code did,
! # but it's dubious for reasons explained in great detail on Python-
! # Dev: it's missing P(spam) and P(not-spam) adjustments that
! # straightforward Bayesian analysis says should be here. It's
! # unclear how much it matters, though, as the omissions here seem
! # to tend in part to cancel out distortions introduced earlier by
! # HAMBIAS. Experiments will decide the issue.
! # First cancel out competing extreme clues (see comment block at
! # MAX_DISCRIMINATORS declaration -- this is a twist on Graham).
! if mins or maxs:
! if len(mins) < len(maxs):
! shorter, longer = mins, maxs
else:
! shorter, longer = maxs, mins
! tokeep = min(len(longer) - len(shorter), MAX_DISCRIMINATORS)
! # They're all good clues, but we're only going to feed the tokeep
! # initial clues from the longer list into the probability
! # computation.
! for x in longer[:tokeep]:
! heapreplace(nbest, x)
!
! prob_product = inverse_prob_product = 1.0
! for distance, prob, word in nbest:
! if prob is None: # it's one of the dummies nbest started with
! continue
! prob_product *= prob
! inverse_prob_product *= 1.0 - prob
!
! prob = prob_product / (prob_product + inverse_prob_product)
! return prob
def maketmp(dir):
--- 18,37 ----
BLOCK_SIZE = 10000
SIZE_LIMIT = 5000000 # messages larger are not analyzed
! SPAM_CUTOFF = 0.57
! class CdbWrapper(cdb.Cdb):
! def get(self, key, default=None,
! cdb_get=cdb.Cdb.get,
! WordInfo=classifier.WordInfo):
! prob = cdb_get(self, key, default)
! if prob is None:
! return None
else:
! return WordInfo(0, float(prob))
!
! class CdbBayes(classifier.Bayes):
! def __init__(self, cdbfile):
! classifier.Bayes.__init__(self)
! self.wordinfo = CdbWrapper(cdbfile)
def maketmp(dir):
***************
*** 156,165 ****
msg = email.message_from_string(msgdata)
del msgdata
! wordprobs = cdb.Cdb(open(wordprobfilename, 'rb'))
! prob = spamprob(wordprobs, tokenize(msg))
else:
prob = 0.0
! if prob > SPAM_THRESHOLD:
os.rename(pathname, "%s/new/%s" % (spamdir, filename))
else:
--- 94,103 ----
msg = email.message_from_string(msgdata)
del msgdata
! bayes = CdbBayes(open(wordprobfilename, 'rb'))
! prob = bayes.spamprob(tokenize(msg))
else:
prob = 0.0
! if prob > SPAM_CUTOFF:
os.rename(pathname, "%s/new/%s" % (spamdir, filename))
else: