[Spambayes-checkins] spambayes classifier.py,1.53.2.6,1.53.2.7
Tim Stone
timstone4@users.sourceforge.net
Fri Nov 22 16:33:21 2002
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv400
Modified Files:
Tag: hammie-playground
classifier.py
Log Message:
Added probability calculation result caching. No benchmark is available to see
how much, if any, performance gain is achieved, but it seems like it could
be significant, particularly when training large corpora or with long-running
processes.
Index: classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/classifier.py,v
retrieving revision 1.53.2.6
retrieving revision 1.53.2.7
diff -C2 -d -r1.53.2.6 -r1.53.2.7
*** classifier.py 22 Nov 2002 00:12:35 -0000 1.53.2.6
--- classifier.py 22 Nov 2002 16:33:19 -0000 1.53.2.7
***************
*** 48,51 ****
--- 48,52 ----
PICKLE_VERSION = 1
+ probcache = {}
class MetaInfo(object):
***************
*** 127,130 ****
--- 128,150 ----
nspam = float(meta.nspam or 1)
+ assert self.hamcount <= nham
+ hamratio = self.hamcount / nham
+
+ assert self.spamcount <= nspam
+ spamratio = self.spamcount / nspam
+
+ self.revision = meta.revision
+
+ # do a cache lookaside here, to possibly save a bunch of calculations
+ try:
+ self.spamprob = probcache[hamratio][spamratio]
+ return True
+ except KeyError:
+ pass
+ except TypeError:
+ probcache[hamratio] = {}
+
+ prob = spamratio / (hamratio + spamratio)
+
if options.experimental_ham_spam_imbalance_adjustment:
spam2ham = min(nspam / nham, 1.0)
***************
*** 136,146 ****
StimesX = S * options.unknown_word_prob
- assert self.hamcount <= nham
- hamratio = self.hamcount / nham
-
- assert self.spamcount <= nspam
- spamratio = self.spamcount / nspam
-
- prob = spamratio / (hamratio + spamratio)
# Now do Robinson's Bayesian adjustment.
--- 156,159 ----
***************
*** 181,190 ****
prob = (StimesX + n * prob) / (S + n)
! self.revision = meta.revision
! if self.spamprob != prob:
! self.spamprob = prob
! return True
! else:
! return False
def probability(self, meta):
--- 194,216 ----
prob = (StimesX + n * prob) / (S + n)
! # populate the cache, so this calculation won't have to be done again
! try:
! probcache[hamratio][spamratio] = prob
! except KeyError:
! probcache[hamratio] = {}
! probcache[hamratio][spamratio] = prob
!
! # the following code is meaningless to me, maybe a performance hack?
! # if so, it's been nullified by the cache, so simply set self.spamprob
! # and return True
!
! #if self.spamprob != prob:
! # self.spamprob = prob
! # return True
! #else:
! # return False
!
! self.spamprob = prob
! return True
def probability(self, meta):
More information about the Spambayes-checkins
mailing list