[Spambayes-checkins] spambayes/spambayes Stats.py, 1.19, 1.20 message.py, 1.75, 1.76

Kenny Pitt kpitt at users.sourceforge.net
Thu Dec 28 16:36:46 CET 2006


Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv30845/spambayes

Modified Files:
	Stats.py message.py 
Log Message:
Recalculating the persistent statistics from the message info db each session
can be very time-consuming when the db gets large, so cache the persistent
statistics in the message info db instead.  If the message info db doesn't yet
contain the persistent statistics, there will be a one-time startup hit to
recalculate them; after that, startup will be significantly faster.

Index: Stats.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/Stats.py,v
retrieving revision 1.19
retrieving revision 1.20
diff -C2 -d -r1.19 -r1.20
*** Stats.py	20 Dec 2006 22:51:18 -0000	1.19
--- Stats.py	28 Dec 2006 15:36:44 -0000	1.20
***************
*** 43,47 ****
  import types
  
! from spambayes.message import STATS_START_KEY
  from spambayes.message import database_type, open_storage, Message
  
--- 43,47 ----
  import types
  
! from spambayes.message import STATS_START_KEY, STATS_STORAGE_KEY
  from spambayes.message import database_type, open_storage, Message
  
***************
*** 59,63 ****
          # Initialize persistent stats.
          self.from_date = self.messageinfo_db.get_statistics_start_date()
!         self.persistentStatsCalculated = False
  
      def Reset(self):
--- 59,63 ----
          # Initialize persistent stats.
          self.from_date = self.messageinfo_db.get_statistics_start_date()
!         self.LoadPersistentStats()
  
      def Reset(self):
***************
*** 76,88 ****
--- 76,96 ----
              self.from_date = time.time()
              self.messageinfo_db.set_statistics_start_date(self.from_date)
+             self.messageinfo_db.set_persistent_statistics(self.totals)
  
      def RecordClassification(self, score):
          """Record that a message has been classified this session."""
+         totals = self.totals
          if score >= self.options["Categorization", "spam_cutoff"]:
              self.num_spam += 1
+             totals["num_spam"] += 1
          elif score >= self.options["Categorization", "ham_cutoff"]:
              self.num_unsure += 1
+             totals["num_unsure"] += 1
          else:
              self.num_ham += 1
+             totals["num_ham"] += 1
+         # We have to record the updated totals every time or else the
+         # persistent statistics will get out of sync.
+         self.messageinfo_db.set_persistent_statistics(self.totals)
  
      def RecordTraining(self, as_ham, old_score=None, old_class=None):
***************
*** 97,102 ****
--- 105,112 ----
          # XXX Why, oh why, does this function have as_ham, when every
          # XXX other function has isSpam???
+         totals = self.totals
          if as_ham:
              self.num_trained_ham += 1
+             totals["num_trained_ham"] += 1
              # If we are recovering an item that is in the "spam" threshold,
              # then record it as a "false positive"
***************
*** 104,111 ****
--- 114,124 ----
                 old_score > self.options["Categorization", "spam_cutoff"]:
                  self.num_trained_ham_fp += 1
+                 totals["num_trained_ham_fp"] += 1
              elif old_class == self.options["Headers", "header_spam_string"]:
                  self.num_trained_ham_fp += 1
+                 totals["num_trained_ham_fp"] += 1
          else:
              self.num_trained_spam += 1
+             totals["num_trained_spam"] += 1
              # If we are deleting as Spam an item that was in our "good"
              # range, then record it as a false negative.
***************
*** 113,118 ****
--- 126,147 ----
                 old_score < self.options["Categorization", "ham_cutoff"]:
                  self.num_trained_spam_fn += 1
+                 totals["num_trained_spam_fn"] += 1
              elif old_class == self.options["Headers", "header_ham_string"]:
                  self.num_trained_spam_fn += 1
+                 totals["num_trained_spam_fn"] += 1
+         # We have to record the updated totals every time or else the
+         # persistent statistics will get out of sync.
+         self.messageinfo_db.set_persistent_statistics(self.totals)
+ 
+     def LoadPersistentStats(self):
+         """Load the persistent statistics from the messageinfo db.
+         
+         If the persistent statistics have not yet been stored in the db
+         then we need to recalculate them by iterating through all the
+         messages.  This will result in a one-time performance hit, but
+         will greatly improve the startup time in the future."""
+         self.totals = self.messageinfo_db.get_persistent_statistics()
+         if self.totals is None:
+             self.CalculatePersistentStats()
  
      def CalculatePersistentStats(self):
***************
*** 122,134 ****
          adding up the various information.  This could get quite time
          consuming if the messageinfo database gets very large, so
!         some consideration should perhaps be made about what to do
!         then.
          """
          self.ResetTotal()
          totals = self.totals
          for msg_id in self.messageinfo_db.keys():
!             # Skip the date key.
              if msg_id == STATS_START_KEY:
                  continue
              m = Message(msg_id)
              self.messageinfo_db.load_msg(m)
--- 151,166 ----
          adding up the various information.  This could get quite time
          consuming if the messageinfo database gets very large, so
!         it should only be done if the statistics start date is reset
!         to an arbitrary point in the past.
          """
          self.ResetTotal()
          totals = self.totals
          for msg_id in self.messageinfo_db.keys():
!             # Skip the date and persistent statistics keys.
              if msg_id == STATS_START_KEY:
                  continue
+             if msg_id == STATS_STORAGE_KEY:
+                 continue
+ 
              m = Message(msg_id)
              self.messageinfo_db.load_msg(m)
***************
*** 168,198 ****
                      totals["num_trained_spam"] += 1
  
!         # If we have already accumulated any session statistics then we need
!         # to subtract those from the totals to prevent double-counting.
!         totals["num_ham"] -= self.num_ham
!         totals["num_spam"] -= self.num_spam
!         totals["num_unsure"] -= self.num_unsure
!         totals["num_trained_ham"] -= self.num_trained_ham
!         totals["num_trained_ham_fp"] -= self.num_trained_ham_fp
!         totals["num_trained_spam"] -= self.num_trained_spam
!         totals["num_trained_spam_fn"] -= self.num_trained_spam_fn
! 
!     def _CombineSessionAndTotal(self):
!         totals = self.totals
!         data = {}
!         data["num_ham"] = self.num_ham + totals["num_ham"]
!         data["num_spam"] = self.num_spam + totals["num_spam"]
!         data["num_unsure"] = self.num_unsure + totals["num_unsure"]
!         data["num_seen"] = data["num_ham"] + data["num_spam"] + \
!                            data["num_unsure"]
!         data["num_trained_ham"] = self.num_trained_ham + \
!                                   totals["num_trained_ham"]
!         data["num_trained_ham_fp"] = self.num_trained_ham_fp + \
!                                      totals["num_trained_ham_fp"]
!         data["num_trained_spam"] = self.num_trained_spam + \
!                                    totals["num_trained_spam"]
!         data["num_trained_spam_fn"] = self.num_trained_spam_fn + \
!                                       totals["num_trained_spam_fn"]
!         return data
  
      def _CalculateAdditional(self, data):
--- 200,204 ----
                      totals["num_trained_spam"] += 1
  
!         self.messageinfo_db.set_persistent_statistics(totals)
  
      def _CalculateAdditional(self, data):
***************
*** 322,329 ****
          push = chunks.append
  
          if session_only:
-             data = {}
-             data["num_seen"] = self.num_ham + self.num_spam + \
-                                self.num_unsure
              data["num_ham"] = self.num_ham
              data["num_spam"] = self.num_spam
--- 328,333 ----
          push = chunks.append
  
+         data = {}
          if session_only:
              data["num_ham"] = self.num_ham
              data["num_spam"] = self.num_spam
***************
*** 334,342 ****
              data["num_trained_spam_fn"] = self.num_trained_spam_fn
          else:
!             if not self.persistentStatsCalculated:
!                 # Load persistent stats.
!                 self.CalculatePersistentStats()
!                 self.persistentStatsCalculated = True
!             data = self._CombineSessionAndTotal()
  
          push(_("Messages classified: %d") % (data["num_seen"],))
--- 338,347 ----
              data["num_trained_spam_fn"] = self.num_trained_spam_fn
          else:
!             for stat in ["num_ham", "num_spam", "num_unsure",
!                          "num_trained_spam", "num_trained_spam_fn",
!                          "num_trained_ham", "num_trained_ham_fp",]:
!                 data[stat] = self.totals[stat]
!         data["num_seen"] = data["num_ham"] + data["num_spam"] + \
!                            data["num_unsure"]
  
          push(_("Messages classified: %d") % (data["num_seen"],))

Index: message.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/message.py,v
retrieving revision 1.75
retrieving revision 1.76
diff -C2 -d -r1.75 -r1.76
*** message.py	7 Apr 2006 02:33:15 -0000	1.75
--- message.py	28 Dec 2006 15:36:44 -0000	1.76
***************
*** 116,119 ****
--- 116,120 ----
  
  STATS_START_KEY = "Statistics start date"
+ STATS_STORAGE_KEY = "Persistent statistics"
  PERSISTENT_HAM_STRING = 'h'
  PERSISTENT_SPAM_STRING = 's'
***************
*** 137,140 ****
--- 138,151 ----
          self.store()
  
+     def get_persistent_statistics(self):
+         if self.db.has_key(STATS_STORAGE_KEY):
+             return self.db[STATS_STORAGE_KEY]
+         else:
+             return None
+             
+     def set_persistent_statistics(self, stats):
+         self.db[STATS_STORAGE_KEY] = stats
+         self.store()
+ 
      def __getstate__(self):
          return self.db
***************
*** 402,406 ****
  
          if id == STATS_START_KEY:
!             raise ValueError, "MsgId must not be" + STATS_START_KEY
  
          self.id = id
--- 413,420 ----
  
          if id == STATS_START_KEY:
!             raise ValueError, "MsgId must not be " + STATS_START_KEY
! 
!         if id == STATS_STORAGE_KEY:
!             raise ValueError, "MsgId must not be " + STATS_STORAGE_KEY
  
          self.id = id



More information about the Spambayes-checkins mailing list