[Spambayes-checkins] spambayes/Outlook2000 manager.py,1.41,1.42

Mark Hammond mhammond at users.sourceforge.net
Mon Feb 3 18:19:48 EST 2003


Update of /cvsroot/spambayes/spambayes/Outlook2000
In directory sc8-pr-cvs1:/tmp/cvs-serv11936

Modified Files:
	manager.py 
Log Message:
If a new bsddb or bsddb3 module is available, use it instead of a pickle.  If one is available on your system, you will need to do a full retrain.

Index: manager.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v
retrieving revision 1.41
retrieving revision 1.42
diff -C2 -d -r1.41 -r1.42
*** manager.py	14 Jan 2003 05:38:19 -0000	1.41
--- manager.py	4 Feb 2003 02:19:46 -0000	1.42
***************
*** 4,7 ****
--- 4,8 ----
  import os
  import sys
+ import errno
  
  import win32com.client
***************
*** 34,37 ****
--- 35,51 ----
      this_filename = os.path.abspath(sys.argv[0])
  
+ # See if we can use the new bsddb module. (The old one is unreliable
+ # on Windows, so we don't use that)
+ try:
+     from bsddb import db # This name doesn't exist in the old one.
+     use_db = True
+ except ImportError:
+     # See if the explicit bsddb3 module exists.
+     try:
+         import bsddb3
+         use_db = True
+     except ImportError:
+         use_db = False
+ 
  # This is a little bit of a hack <wink>.  We are generally in a child directory
  # of the bayes code.  To help installation, we handle the fact that this may
***************
*** 40,44 ****
  # spambayes code before setting that envar, our .ini file may have no effect).
  def import_core_spambayes_stuff(ini_filename):
!     global bayes_classifier, bayes_tokenize
  
      os.environ["BAYESCUSTOMIZE"] = ini_filename
--- 54,58 ----
  # spambayes code before setting that envar, our .ini file may have no effect).
  def import_core_spambayes_stuff(ini_filename):
!     global bayes_classifier, bayes_tokenize, bayes_storage
  
      os.environ["BAYESCUSTOMIZE"] = ini_filename
***************
*** 52,61 ****
--- 66,137 ----
      from spambayes import classifier
      from spambayes.tokenizer import tokenize
+     from spambayes import storage
      bayes_classifier = classifier
      bayes_tokenize = tokenize
+     bayes_storage = storage
  
  class ManagerError(Exception):
      pass
  
+ # Base class for our "storage manager" - we choose between the pickle
+ # and DB versions at runtime.  As our bayes uses spambayes.storage,
+ # our base class can share common bayes loading code.
+ class BasicStorageManager:
+     db_extension = None # for pychecker - overwritten by subclass
+     def __init__(self, bayes_base_name, mdb_base_name):
+         self.bayes_filename = bayes_base_name + self.db_extension
+         self.mdb_filename = mdb_base_name + self.db_extension
+     def new_bayes(self):
+         # Just delete the file and do an "open"
+         try:
+             os.unlink(self.bayes_filename)
+         except IOError, e:
+             if e.errno != errno.ENOENT: raise
+         return self.open_bayes()
+     def store_bayes(self, bayes):
+         bayes.store()
+     def open_bayes(self):
+         raise NotImplementedError
+ 
+ class PickleStorageManager(BasicStorageManager):
+     db_extension = ".pck"
+     def open_bayes(self):
+         return bayes_storage.PickledClassifier(self.bayes_filename)
+     def close_bayes(self, bayes):
+         pass
+     def open_mdb(self):
+         return cPickle.load(open(self.mdb_filename, 'rb'))
+     def new_mdb(self):
+         return {}
+     def store_mdb(self, mdb):
+         cPickle.dump(mdb, open(self.mdb_filename,"wb"), 1)
+     def close_mdb(self, mdb):
+         pass
+         
+ class DBStorageManager(BasicStorageManager):
+     db_extension = ".db"
+     def open_bayes(self):
+         return bayes_storage.DBDictClassifier(self.bayes_filename)
+     def close_bayes(self, bayes):
+         bayes.db.close()
+         bayes.dbm.close()
+     def open_mdb(self):
+         try:
+             import bsddb
+         except ImportError:
+             import bsddb3 as bsddb
+         return bsddb.hashopen(self.mdb_filename)
+     def new_mdb(self):
+         try:
+             os.unlink(self.mdb_filename)
+         except IOError, e:
+             if e.errno != errno.ENOENT: raise
+         return self.open_mdb()
+     def store_mdb(self, mdb):
+         mdb.sync()
+     def close_mdb(self, mdb):
+         mdb.close()
+ 
+ # Our main "bayes manager"    
  class BayesManager:
      def __init__(self, config_base="default", outlook=None, verbose=1):
***************
*** 68,76 ****
          config_base = os.path.abspath(config_base)
          self.ini_filename = config_base + "_bayes_customize.ini"
-         self.bayes_filename = config_base + "_bayes_database.pck"
-         self.message_db_filename = config_base + "_message_database.pck"
          self.config_filename = config_base + "_configuration.pck"
  
!         # First read the configuration file.
          self.config = self.LoadConfig()
  
--- 144,150 ----
          config_base = os.path.abspath(config_base)
          self.ini_filename = config_base + "_bayes_customize.ini"
          self.config_filename = config_base + "_configuration.pck"
  
!         # Read the configuration file.
          self.config = self.LoadConfig()
  
***************
*** 78,81 ****
--- 152,163 ----
  
          import_core_spambayes_stuff(self.ini_filename)
+ 
+         bayes_base = config_base + "_bayes_database"
+         mdb_base = config_base + "_message_database"
+         # determine which db manager to use, and create it.
+         ManagerClass = [PickleStorageManager, DBStorageManager][use_db]
+         self.db_manager = ManagerClass(bayes_base, mdb_base)
+ 
+         self.bayes = self.message_db = None
          self.LoadBayes()
          self.message_store = msgstore.MAPIMsgStore(outlook)
***************
*** 115,119 ****
          # (which really is OK!)
          assert self.outlook is not None, "I need outlook :("
-         ol = self.outlook
          msgstore_folder = self.message_store.GetFolder(folder_id)
          folder = msgstore_folder.GetOutlookItem()
--- 197,200 ----
***************
*** 161,174 ****
  
      def LoadBayes(self):
          if not os.path.exists(self.ini_filename):
              raise ManagerError("The file '%s' must exist before the "
                                 "database '%s' can be opened or created" % (
!                                self.ini_filename, self.bayes_filename))
          bayes = message_db = None
          try:
!             bayes = cPickle.load(open(self.bayes_filename, 'rb'))
!             print "Loaded bayes database from '%s'" % (self.bayes_filename,)
!         except IOError:
!             pass # ignore file-not-found
          except:
              print "Failed to load bayes database"
--- 242,256 ----
  
      def LoadBayes(self):
+         import time
+         start = time.clock()
          if not os.path.exists(self.ini_filename):
              raise ManagerError("The file '%s' must exist before the "
                                 "database '%s' can be opened or created" % (
!                                self.ini_filename, self.db_manager.bayes_filename))
          bayes = message_db = None
          try:
!             # file-not-found handled gracefully by storage.
!             bayes = self.db_manager.open_bayes()
!             print "Loaded bayes database from '%s'" % (self.db_manager.bayes_filename,)
          except:
              print "Failed to load bayes database"
***************
*** 176,181 ****
              traceback.print_exc()
          try:
!             message_db = cPickle.load(open(self.message_db_filename, 'rb'))
!             print "Loaded message database from '%s'" % (self.message_db_filename,)
          except IOError:
              pass
--- 258,263 ----
              traceback.print_exc()
          try:
!             message_db = self.db_manager.open_mdb()
!             print "Loaded message database from '%s'" % (self.db_manager.mdb_filename,)
          except IOError:
              pass
***************
*** 185,188 ****
--- 267,272 ----
              traceback.print_exc()
          if bayes is None or message_db is None:
+             self.bayes = bayes
+             self.message_db = message_db
              print "Either bayes database or message database is missing - creating new"
              self.InitNewBayes()
***************
*** 193,203 ****
                     "%d spam and %d good messages" % (bayes.nspam, bayes.nham))
          if len(message_db) != bayes.nham + bayes.nspam:
!             print "*** - message database only has %d messages - bayes has %d - something is screwey" % \
                      (len(message_db), bayes.nham + bayes.nspam)
          self.bayes = bayes
          self.message_db = message_db
          self.bayes_dirty = False
  
      def LoadConfig(self):
          try:
              f = open(self.config_filename, 'rb')
--- 277,290 ----
                     "%d spam and %d good messages" % (bayes.nspam, bayes.nham))
          if len(message_db) != bayes.nham + bayes.nspam:
!             print "*** - message database has %d messages - bayes has %d - something is screwey" % \
                      (len(message_db), bayes.nham + bayes.nspam)
          self.bayes = bayes
          self.message_db = message_db
          self.bayes_dirty = False
+         if self.verbose:
+             print "Loaded databases in %gms" % ((time.clock()-start)*1000)
  
      def LoadConfig(self):
+         # Our 'config' file always uses a pickle
          try:
              f = open(self.config_filename, 'rb')
***************
*** 228,233 ****
  
      def InitNewBayes(self):
!         self.bayes = bayes_classifier.Bayes()
!         self.message_db = {} # OK, so its not quite a DB yet <wink>
          self.bayes_dirty = True
  
--- 315,324 ----
  
      def InitNewBayes(self):
!         if self.bayes is not None:
!             self.db_manager.close_bayes(self.bayes)
!         if self.message_db is not None:
!             self.db_manager.close_mdb(self.message_db)
!         self.bayes = self.db_manager.new_bayes()
!         self.message_db = self.db_manager.new_mdb()
          self.bayes_dirty = True
  
***************
*** 243,251 ****
              print "Saving bayes database with %d spam and %d good messages" %\
                     (bayes.nspam, bayes.nham)
!             print " ->", self.bayes_filename
!         cPickle.dump(bayes, open(self.bayes_filename,"wb"), 1)
          if self.verbose:
!             print " ->", self.message_db_filename
!         cPickle.dump(self.message_db, open(self.message_db_filename,"wb"), 1)
          self.bayes_dirty = False
  
--- 334,342 ----
              print "Saving bayes database with %d spam and %d good messages" %\
                     (bayes.nspam, bayes.nham)
!             print " ->", self.db_manager.bayes_filename
!         self.db_manager.store_bayes(self.bayes)
          if self.verbose:
!             print " ->", self.db_manager.mdb_filename
!         self.db_manager.store_mdb(self.message_db)
          self.bayes_dirty = False
  





More information about the Spambayes-checkins mailing list