[Spambayes-checkins] spambayes classifier.py,1.53.2.7,1.53.2.8
Tim Stone
timstone4@users.sourceforge.net
Fri Nov 22 23:50:21 2002
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv15530
Modified Files:
Tag: hammie-playground
classifier.py
Log Message:
Corrected the caching of probability calculation results, which in the
previous version was quite flawed.
Index: classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/classifier.py,v
retrieving revision 1.53.2.7
retrieving revision 1.53.2.8
diff -C2 -d -r1.53.2.7 -r1.53.2.8
*** classifier.py 22 Nov 2002 16:33:19 -0000 1.53.2.7
--- classifier.py 22 Nov 2002 23:50:18 -0000 1.53.2.8
***************
*** 48,52 ****
PICKLE_VERSION = 1
- probcache = {}
class MetaInfo(object):
--- 48,51 ----
***************
*** 56,60 ****
has a revision, incremented every time nham or nspam is adjusted to
invalidate any cached probabilities.
!
"""
def __init__(self):
--- 55,59 ----
has a revision, incremented every time nham or nspam is adjusted to
invalidate any cached probabilities.
!
"""
def __init__(self):
***************
*** 89,94 ****
nspam = property(get_nspam, set_nspam)
!
!
class WordInfo(object):
--- 88,93 ----
nspam = property(get_nspam, set_nspam)
!
!
class WordInfo(object):
***************
*** 115,119 ****
def _update_probability(self, meta):
"""Compute and store p(word) = prob(msg is spam | msg contains word).
!
This is the Graham calculation, but stripped of biases, and
stripped of clamping into 0.01 thru 0.99. The Bayesian
--- 114,118 ----
def _update_probability(self, meta):
"""Compute and store p(word) = prob(msg is spam | msg contains word).
!
This is the Graham calculation, but stripped of biases, and
stripped of clamping into 0.01 thru 0.99. The Bayesian
***************
*** 133,150 ****
assert self.spamcount <= nspam
spamratio = self.spamcount / nspam
-
- self.revision = meta.revision
-
- # do a cache lookaside here, to possibly save a bunch of calculations
- try:
- self.spamprob = probcache[hamratio][spamratio]
- return True
- except KeyError:
- pass
- except TypeError:
- probcache[hamratio] = {}
prob = spamratio / (hamratio + spamratio)
!
if options.experimental_ham_spam_imbalance_adjustment:
spam2ham = min(nspam / nham, 1.0)
--- 132,138 ----
assert self.spamcount <= nspam
spamratio = self.spamcount / nspam
prob = spamratio / (hamratio + spamratio)
!
if options.experimental_ham_spam_imbalance_adjustment:
spam2ham = min(nspam / nham, 1.0)
***************
*** 194,216 ****
prob = (StimesX + n * prob) / (S + n)
! # populate the cache, so this calculation won't have to be done again
! try:
! probcache[hamratio][spamratio] = prob
! except KeyError:
! probcache[hamratio] = {}
! probcache[hamratio][spamratio] = prob
!
! # the following code is meaningless to me, maybe a performance hack?
! # if so, it's been nullified by the cache, so simply set self.spamprob
! # and return True
!
! #if self.spamprob != prob:
! # self.spamprob = prob
! # return True
! #else:
! # return False
!
! self.spamprob = prob
! return True
def probability(self, meta):
--- 182,192 ----
prob = (StimesX + n * prob) / (S + n)
! self.revision = meta.revision
!
! if self.spamprob != prob:
! self.spamprob = prob
! return True
! else:
! return False
def probability(self, meta):
***************
*** 239,242 ****
--- 215,219 ----
self.wordinfo = {}
self.meta = MetaInfo()
+ self.probcache = {}
def __getstate__(self):
***************
*** 435,441 ****
important thing is that the probabilities get updated before
calling spamprob() again.
!
"""
self._add_msg(wordstream, is_spam)
--- 412,419 ----
important thing is that the probabilities get updated before
calling spamprob() again.
!
"""
+ self.probcache = {} # nuke the prob cache
self._add_msg(wordstream, is_spam)
***************
*** 445,449 ****
Pass the same arguments you passed to learn().
"""
!
self._remove_msg(wordstream, is_spam)
--- 423,427 ----
Pass the same arguments you passed to learn().
"""
! self.probcache = {} # nuke the prob cache
self._remove_msg(wordstream, is_spam)
***************
*** 504,508 ****
else:
record.hamcount += 1
!
# Needed to tell a persistent DB that the content changed.
wordinfo[word] = record
--- 482,486 ----
else:
record.hamcount += 1
!
# Needed to tell a persistent DB that the content changed.
wordinfo[word] = record
***************
*** 550,554 ****
prob = unknown
else:
! prob = record.probability(self.meta)
distance = abs(prob - 0.5)
if distance >= mindist:
--- 528,532 ----
prob = unknown
else:
! prob = self.probability(record)
distance = abs(prob - 0.5)
if distance >= mindist:
***************
*** 560,563 ****
--- 538,565 ----
# Return (prob, word, record).
return [t[1:] for t in clues]
+
+ def probability(self, word):
+ """Look up words (spamcount, hamcount) in the prob cache"""
+
+ # Dictionary of dictionaries is used here for efficiency
+
+ h = word.hamcount
+ s = word.spamcount
+
+ try:
+ return self.probcache[h][s]
+ except (KeyError, TypeError):
+ pass
+
+ # populate the cache, so this calculation won't have to be done again
+ try:
+ self.probcache[h]
+ except KeyError:
+ self.probcache[h] = {}
+
+ word.probability(self.meta)
+ self.probcache[h][s] = word.spamprob
+
+ return word.spamprob
Bayes = Classifier
More information about the Spambayes-checkins
mailing list