[Spambayes-checkins] spambayes/spambayes message.py, 1.39, 1.40 storage.py, 1.35, 1.36

Tony Meyer anadelonbrin at users.sourceforge.net
Wed Oct 8 00:04:37 EDT 2003


Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv11152/spambayes

Modified Files:
	message.py storage.py 
Log Message:
Improvements to the messageinfo db:
 * If the (stats) db uses a pickle, then use a pickle for the messageinfo as well.
 * Close the db when we are no longer using it (the Python docs say that __del__
   isn't guaranteed to be called here, but this is better than nothing).  Something
   needs to explicitly close it, really.
 * I'm not convinced (someone hit me if I'm wrong) that we were ever sync()ing
   the db, so make sure that we are.
 * For safety, allow setId() to be called if an id is already set, as long as it's the
   same id.  (So setId("one"); setId("one") is ok).
 * The pop3proxy didn't actually store the training status of messages (again
   this is odd, so hit me if I'm wrong), so fix that, too.

It would be fantastic if these fixed some of the corruption problems :)  Please
try at your convenience (if you had any problems) and let -dev know.  Using
a pickle surely must get rid of the problems.

Index: message.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/message.py,v
retrieving revision 1.39
retrieving revision 1.40
diff -C2 -d -r1.39 -r1.40
*** message.py	2 Oct 2003 05:50:43 -0000	1.39
--- message.py	8 Oct 2003 04:04:35 -0000	1.40
***************
*** 94,98 ****
--- 94,100 ----
  import sys
  import types
+ import errno
  import shelve
+ import pickle
  
  import email
***************
*** 108,129 ****
  CRLF_RE = re.compile(r'\r\n|\r|\n')
  
! class MessageInfoDB:
!     def __init__(self, db_name, mode='c'):
!         self.mode = mode
          self.db_name = db_name
-         try:
-             self.dbm = dbmstorage.open(self.db_name, self.mode)
-             self.db = shelve.Shelf(self.dbm)
-         except dbmstorage.error:
-             # This probably means that we don't have a dbm module
-             # available.  Print out a warning, and continue on
-             # (not persisting any of this data).
-             if options["globals", "verbose"]:
-                 print "Warning: no dbm modules available for MessageInfoDB"
-             self.dbm = self.db = None
- 
-     def store(self):
-         if self.db is not None:
-             self.db.sync()
  
      def _getState(self, msg):
--- 110,116 ----
  CRLF_RE = re.compile(r'\r\n|\r|\n')
  
! class MessageInfoBase(object):
!     def __init__(self, db_name):
          self.db_name = db_name
  
      def _getState(self, msg):
***************
*** 137,144 ****
--- 124,193 ----
          if self.db is not None:
              self.db[msg.getId()] = (msg.c, msg.t)
+             self.store()
  
      def _delState(self, msg):
          if self.db is not None:
              del self.db[msg.getId()]
+             self.store()
+ 
+ class MessageInfoPickle(MessageInfoBase):
+     def __init__(self, db_name, pickle_type=1):
+         MessageInfoBase.__init__(self, db_name)
+         self.mode = pickle_type
+         self.load()
+ 
+     def load(self):
+         try:
+             fp = open(self.db_name, 'rb')
+         except IOError, e:
+             if e.errno == errno.ENOENT:
+                 # New pickle
+                 self.db = {}
+             else:
+                 raise
+         else:
+             self.db = pickle.load(fp)
+             fp.close()
+ 
+     def close(self):
+         # we keep no resources open - nothing to do
+         pass
+ 
+     def store(self):
+         fp = open(self.db_name, 'wb')
+         pickle.dump(self.db, fp, self.mode)
+         fp.close()
+ 
+ class MessageInfoDB(MessageInfoBase):
+     def __init__(self, db_name, mode='c'):
+         MessageInfoBase.__init__(self, db_name)
+         self.mode = mode
+         self.load()
+ 
+     def load(self):        
+         try:
+             self.dbm = dbmstorage.open(self.db_name, self.mode)
+             self.db = shelve.Shelf(self.dbm)
+         except dbmstorage.error:
+             # This probably means that we don't have a dbm module
+             # available.  Print out a warning, and continue on
+             # (not persisting any of this data).
+             if options["globals", "verbose"]:
+                 print "Warning: no dbm modules available for MessageInfoDB"
+             self.dbm = self.db = None
+ 
+     def __del__(self):
+         self.close()
+ 
+     def close(self):        
+         # Close our underlying database.  Better not assume all databases
+         # have close functions!
+         def noop(): pass
+         getattr(self.db, "close", noop)()
+         getattr(self.dbm, "close", noop)()
+ 
+     def store(self):
+         if self.db is not None:
+             self.db.sync()
  
  # This should come from a Mark Hammond idea of a master db
***************
*** 148,152 ****
  message_info_db_name = options["Storage", "messageinfo_storage_file"]
  message_info_db_name = os.path.expanduser(message_info_db_name)
! msginfoDB = MessageInfoDB(message_info_db_name)
  
  class Message(email.Message.Message):
--- 197,204 ----
  message_info_db_name = options["Storage", "messageinfo_storage_file"]
  message_info_db_name = os.path.expanduser(message_info_db_name)
! if options["Storage", "persistent_use_database"]:
!     msginfoDB = MessageInfoDB(message_info_db_name)
! else:
!     msginfoDB = MessageInfoPickle(message_info_db_name)
  
  class Message(email.Message.Message):
***************
*** 182,186 ****
  
      def setId(self, id):
!         if self.id:
              raise ValueError, "MsgId has already been set, cannot be changed"
  
--- 234,238 ----
  
      def setId(self, id):
!         if self.id and self.id != id:
              raise ValueError, "MsgId has already been set, cannot be changed"
  

Index: storage.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/storage.py,v
retrieving revision 1.35
retrieving revision 1.36
diff -C2 -d -r1.35 -r1.36
*** storage.py	24 Sep 2003 05:28:53 -0000	1.35
--- storage.py	8 Oct 2003 04:04:35 -0000	1.36
***************
*** 577,581 ****
      def onAddMessage(self, message):
          '''A message is being added to an observed corpus.'''
- 
          self.train(message)
  
--- 577,580 ----
***************
*** 588,595 ****
          self.bayes.learn(message.tokenize(), self.is_spam)
  #                         self.updateprobs)
  
      def onRemoveMessage(self, message):
          '''A message is being removed from an observed corpus.'''
- 
          self.untrain(message)
  
--- 587,595 ----
          self.bayes.learn(message.tokenize(), self.is_spam)
  #                         self.updateprobs)
+         message.setId(message.key())
+         message.RememberTrained(self.is_spam)
  
      def onRemoveMessage(self, message):
          '''A message is being removed from an observed corpus.'''
          self.untrain(message)
  
***************
*** 604,611 ****
          # can raise ValueError if database is fouled.  If this is the case,
          # then retraining is the only recovery option.
  
      def trainAll(self, corpus):
          '''Train all the messages in the corpus'''
- 
          for msg in corpus:
              self.train(msg)
--- 604,611 ----
          # can raise ValueError if database is fouled.  If this is the case,
          # then retraining is the only recovery option.
+         message.RememberTrained(None)
  
      def trainAll(self, corpus):
          '''Train all the messages in the corpus'''
          for msg in corpus:
              self.train(msg)
***************
*** 613,617 ****
      def untrainAll(self, corpus):
          '''Untrain all the messages in the corpus'''
- 
          for msg in corpus:
              self.untrain(msg)
--- 613,616 ----
***************
*** 620,627 ****
  class SpamTrainer(Trainer):
      '''Trainer for spam'''
- 
      def __init__(self, bayes, updateprobs=NO_UPDATEPROBS):
          '''Constructor'''
- 
          Trainer.__init__(self, bayes, True, updateprobs)
  
--- 619,624 ----
***************
*** 629,636 ****
  class HamTrainer(Trainer):
      '''Trainer for ham'''
- 
      def __init__(self, bayes, updateprobs=NO_UPDATEPROBS):
          '''Constructor'''
- 
          Trainer.__init__(self, bayes, False, updateprobs)
  
--- 626,631 ----





More information about the Spambayes-checkins mailing list