[Spambayes-checkins] spambayes classifier.py,1.53.2.7,1.53.2.8

Fri Nov 22 23:50:21 2002

Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv15530

Modified Files:
      Tag: hammie-playground
	classifier.py 
Log Message:
Corrected the caching of probability calculation results, which was
quite flawed in the previous version.

Index: classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/classifier.py,v
retrieving revision 1.53.2.7
retrieving revision 1.53.2.8
diff -C2 -d -r1.53.2.7 -r1.53.2.8
*** classifier.py	22 Nov 2002 16:33:19 -0000	1.53.2.7
--- classifier.py	22 Nov 2002 23:50:18 -0000	1.53.2.8
***************
*** 48,52 ****
  
  PICKLE_VERSION = 1
- probcache = {}
  
  class MetaInfo(object):
--- 48,51 ----
***************
*** 56,60 ****
      has a revision, incremented every time nham or nspam is adjusted to
      invalidate any cached probabilities.
!     
      """
      def __init__(self):
--- 55,59 ----
      has a revision, incremented every time nham or nspam is adjusted to
      invalidate any cached probabilities.
! 
      """
      def __init__(self):
***************
*** 89,94 ****
      nspam = property(get_nspam, set_nspam)
  
!         
!     
  
  class WordInfo(object):
--- 88,93 ----
      nspam = property(get_nspam, set_nspam)
  
! 
! 
  
  class WordInfo(object):
***************
*** 115,119 ****
      def _update_probability(self, meta):
          """Compute and store p(word) = prob(msg is spam | msg contains word).
!         
          This is the Graham calculation, but stripped of biases, and
          stripped of clamping into 0.01 thru 0.99.  The Bayesian
--- 114,118 ----
      def _update_probability(self, meta):
          """Compute and store p(word) = prob(msg is spam | msg contains word).
! 
          This is the Graham calculation, but stripped of biases, and
          stripped of clamping into 0.01 thru 0.99.  The Bayesian
***************
*** 133,150 ****
          assert self.spamcount <= nspam
          spamratio = self.spamcount / nspam
-         
-         self.revision = meta.revision
-         
-         # do a cache lookaside here, to possibly save a bunch of calculations
-         try:
-             self.spamprob = probcache[hamratio][spamratio]
-             return True
-         except KeyError:
-             pass
-         except TypeError:
-             probcache[hamratio] = {}
  
          prob = spamratio / (hamratio + spamratio)
!         
          if options.experimental_ham_spam_imbalance_adjustment:
              spam2ham = min(nspam / nham, 1.0)
--- 132,138 ----
          assert self.spamcount <= nspam
          spamratio = self.spamcount / nspam
  
          prob = spamratio / (hamratio + spamratio)
! 
          if options.experimental_ham_spam_imbalance_adjustment:
              spam2ham = min(nspam / nham, 1.0)
***************
*** 194,216 ****
          prob = (StimesX + n * prob) / (S + n)
  
!         # populate the cache, so this calculation won't have to be done again
!         try:
!             probcache[hamratio][spamratio] = prob
!         except KeyError:
!             probcache[hamratio] = {}
!             probcache[hamratio][spamratio] = prob
!         
!         # the following code is meaningless to me, maybe a performance hack?
!         # if so, it's been nullified by the cache, so simply set self.spamprob
!         # and return True
!         
!         #if self.spamprob != prob:
!         #    self.spamprob = prob
!         #    return True
!         #else:
!         #    return False
!         
!         self.spamprob = prob
!         return True
  
      def probability(self, meta):
--- 182,192 ----
          prob = (StimesX + n * prob) / (S + n)
  
!         self.revision = meta.revision
! 
!         if self.spamprob != prob:
!             self.spamprob = prob
!             return True
!         else:
!             return False
  
      def probability(self, meta):
***************
*** 239,242 ****
--- 215,219 ----
          self.wordinfo = {}
          self.meta = MetaInfo()
+         self.probcache = {}
  
      def __getstate__(self):
***************
*** 435,441 ****
          important thing is that the probabilities get updated before
          calling spamprob() again.
!         
          """
  
          self._add_msg(wordstream, is_spam)
  
--- 412,419 ----
          important thing is that the probabilities get updated before
          calling spamprob() again.
! 
          """
  
+         self.probcache = {}    # nuke the prob cache
          self._add_msg(wordstream, is_spam)
  
***************
*** 445,449 ****
          Pass the same arguments you passed to learn().
          """
! 
          self._remove_msg(wordstream, is_spam)
  
--- 423,427 ----
          Pass the same arguments you passed to learn().
          """
!         self.probcache = {}    # nuke the prob cache
          self._remove_msg(wordstream, is_spam)
  
***************
*** 504,508 ****
              else:
                  record.hamcount += 1
!                 
              # Needed to tell a persistent DB that the content changed.
              wordinfo[word] = record
--- 482,486 ----
              else:
                  record.hamcount += 1
! 
              # Needed to tell a persistent DB that the content changed.
              wordinfo[word] = record
***************
*** 550,554 ****
                  prob = unknown
              else:
!                 prob = record.probability(self.meta)
              distance = abs(prob - 0.5)
              if distance >= mindist:
--- 528,532 ----
                  prob = unknown
              else:
!                 prob = self.probability(record)
              distance = abs(prob - 0.5)
              if distance >= mindist:
***************
*** 560,563 ****
--- 538,565 ----
          # Return (prob, word, record).
          return [t[1:] for t in clues]
+ 
+     def probability(self, word):
+         """Look up words (spamcount, hamcount) in the prob cache"""
+ 
+         # Dictionary of dictionaries is used here for efficiency
+ 
+         h = word.hamcount
+         s = word.spamcount
+ 
+         try:
+             return self.probcache[h][s]
+         except (KeyError, TypeError):
+             pass
+ 
+         # populate the cache, so this calculation won't have to be done again
+         try:
+             self.probcache[h]
+         except KeyError:
+             self.probcache[h] = {}
+ 
+         word.probability(self.meta)
+         self.probcache[h][s] = word.spamprob
+ 
+         return word.spamprob
  
  Bayes = Classifier