[Spambayes-checkins] spambayes classifier.py,1.60,1.61 hammie.py,1.43,1.44 storage.py,1.2,1.3 dbdict.py,1.4,NONE

Neale Pickett npickett@users.sourceforge.net
Wed Nov 27 22:38:00 2002


Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv31393

Modified Files:
	classifier.py hammie.py storage.py 
Removed Files:
	dbdict.py 
Log Message:
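* Caching dbdict implementation: dbdict.py is replaced by a
  shelve-backed DBDictClassifier that caches word records in memory.
  PICKLE_VERSION is bumped from 4 to 5, so you'll have to retrain your
  databases again (sorry).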


Index: classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/classifier.py,v
retrieving revision 1.60
retrieving revision 1.61
diff -C2 -d -r1.60 -r1.61
*** classifier.py	26 Nov 2002 20:22:05 -0000	1.60
--- classifier.py	27 Nov 2002 22:37:55 -0000	1.61
***************
*** 47,75 ****
  LN2 = math.log(2)       # used frequently by chi-combining
  
! PICKLE_VERSION = 4
! 
! class MetaInfo(object):
!     """Information about the corpora.
! 
!     Contains nham and nspam, used for calculating probabilities.
! 
!     """
!     def __init__(self):
!         self.__setstate__((PICKLE_VERSION, 0, 0))
! 
!     def __repr__(self):
!         return "MetaInfo%r" % repr((self._nspam,
!                                     self._nham,
!                                     self.revision))
! 
!     def __getstate__(self):
!         return (PICKLE_VERSION, self.nspam, self.nham)
! 
!     def __setstate__(self, t):
!         if t[0] != PICKLE_VERSION:
!             raise ValueError("Can't unpickle -- version %s unknown" % t[0])
!         self.nspam, self.nham = t[1:]
!         self.revision = 0
! 
  
  class WordInfo(object):
--- 47,51 ----
  LN2 = math.log(2)       # used frequently by chi-combining
  
! PICKLE_VERSION = 5
  
  class WordInfo(object):
***************
*** 109,138 ****
      def __init__(self):
          self.wordinfo = {}
-         self.meta = MetaInfo()
          self.probcache = {}
  
      def __getstate__(self):
!         return PICKLE_VERSION, self.wordinfo, self.meta
  
      def __setstate__(self, t):
          if t[0] != PICKLE_VERSION:
              raise ValueError("Can't unpickle -- version %s unknown" % t[0])
!         self.wordinfo, self.meta = t[1:]
          self.probcache = {}
  
-     # Slacker's way out--pass calls to nham/nspam up to the meta class
- 
-     def get_nham(self):
-         return self.meta.nham
-     def set_nham(self, val):
-         self.meta.nham = val
-     nham = property(get_nham, set_nham)
- 
-     def get_nspam(self):
-         return self.meta.nspam
-     def set_nspam(self, val):
-         self.meta.nspam = val
-     nspam = property(get_nspam, set_nspam)
- 
      # spamprob() implementations.  One of the following is aliased to
      # spamprob, depending on option settings.
--- 85,100 ----
      def __init__(self):
          self.wordinfo = {}
          self.probcache = {}
+         self.nspam = self.nham = 0
  
      def __getstate__(self):
!         return (PICKLE_VERSION, self.wordinfo, self.nspam, self.nham)
  
      def __setstate__(self, t):
          if t[0] != PICKLE_VERSION:
              raise ValueError("Can't unpickle -- version %s unknown" % t[0])
!         (self.wordinfo, self.nspam, self.nham) = t[1:]
          self.probcache = {}
  
      # spamprob() implementations.  One of the following is aliased to
      # spamprob, depending on option settings.
***************
*** 331,336 ****
              pass
  
!         nham = float(self.meta.nham or 1)
!         nspam = float(self.meta.nspam or 1)
  
          assert hamcount <= nham
--- 293,298 ----
              pass
  
!         nham = float(self.nham or 1)
!         nspam = float(self.nspam or 1)
  
          assert hamcount <= nham
***************
*** 420,431 ****
          self.probcache = {}    # nuke the prob cache
          if is_spam:
!             self.meta.nspam += 1
          else:
!             self.meta.nham += 1
  
-         wordinfo = self.wordinfo
-         wordinfoget = wordinfo.get
          for word in Set(wordstream):
!             record = wordinfoget(word)
              if record is None:
                  record = self.WordInfoClass()
--- 382,391 ----
          self.probcache = {}    # nuke the prob cache
          if is_spam:
!             self.nspam += 1
          else:
!             self.nham += 1
  
          for word in Set(wordstream):
!             record = self._wordinfoget(word)
              if record is None:
                  record = self.WordInfoClass()
***************
*** 436,441 ****
                  record.hamcount += 1
  
!             # Needed to tell a persistent DB that the content changed.
!             wordinfo[word] = record
  
  
--- 396,400 ----
                  record.hamcount += 1
  
!             self._wordinfoset(word, record)
  
  
***************
*** 443,458 ****
          self.probcache = {}    # nuke the prob cache
          if is_spam:
!             if self.meta.nspam <= 0:
                  raise ValueError("spam count would go negative!")
!             self.meta.nspam -= 1
          else:
!             if self.meta.nham <= 0:
                  raise ValueError("non-spam count would go negative!")
!             self.meta.nham -= -1
  
-         wordinfo = self.wordinfo
-         wordinfoget = wordinfo.get
          for word in Set(wordstream):
!             record = wordinfoget(word)
              if record is not None:
                  if is_spam:
--- 402,415 ----
          self.probcache = {}    # nuke the prob cache
          if is_spam:
!             if self.nspam <= 0:
                  raise ValueError("spam count would go negative!")
!             self.nspam -= 1
          else:
!             if self.nham <= 0:
                  raise ValueError("non-spam count would go negative!")
!             self.nham -= -1
  
          for word in Set(wordstream):
!             record = self._wordinfoget(word)
              if record is not None:
                  if is_spam:
***************
*** 463,471 ****
                          record.hamcount -= 1
                  if record.hamcount == 0 == record.spamcount:
!                     del wordinfo[word]
                  else:
!                     # Needed to tell a persistent DB that the content
!                     # changed.
!                     wordinfo[word] = record
  
      def _getclues(self, wordstream):
--- 420,426 ----
                          record.hamcount -= 1
                  if record.hamcount == 0 == record.spamcount:
!                     self._wordinfodel(word)
                  else:
!                     self._wordinfoset(word, record)
  
      def _getclues(self, wordstream):
***************
*** 476,482 ****
          pushclue = clues.append
  
-         wordinfoget = self.wordinfo.get
          for word in Set(wordstream):
!             record = wordinfoget(word)
              if record is None:
                  prob = unknown
--- 431,436 ----
          pushclue = clues.append
  
          for word in Set(wordstream):
!             record = self._wordinfoget(word)
              if record is None:
                  prob = unknown
***************
*** 492,495 ****
--- 446,459 ----
          # Return (prob, word, record).
          return [t[1:] for t in clues]
+ 
+     def _wordinfoget(self, word):
+         return self.wordinfo.get(word)
+ 
+     def _wordinfoset(self, word, record):
+         self.wordinfo[word] = record
+ 
+     def _wordinfodel(self, word):
+         del self.wordinfo[word]
+         
  
  

Index: hammie.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/hammie.py,v
retrieving revision 1.43
retrieving revision 1.44
diff -C2 -d -r1.43 -r1.44
*** hammie.py	25 Nov 2002 20:49:17 -0000	1.43
--- hammie.py	27 Nov 2002 22:37:56 -0000	1.44
***************
*** 2,6 ****
  
  
- import dbdict
  import mboxutils
  import storage
--- 2,5 ----
***************
*** 45,49 ****
                           for word, prob in clues
                           if (word[0] == '*' or
!                              prob <= SHOWCLUE or prob >= 1.0 - SHOWCLUE)])
  
      def score(self, msg, evidence=False):
--- 44,49 ----
                           for word, prob in clues
                           if (word[0] == '*' or
!                              prob <= options.clue_mailheader_cutoff or
!                              prob >= 1.0 - options.clue_mailheader_cutoff)])
  
      def score(self, msg, evidence=False):

Index: storage.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/storage.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** storage.py	26 Nov 2002 00:43:51 -0000	1.2
--- storage.py	27 Nov 2002 22:37:56 -0000	1.3
***************
*** 5,9 ****
  Classes:
      PickledClassifier - Classifier that uses a pickle db
!     DBDictClassifier - Classifier that uses a DBDict db
      Trainer - Classifier training observer
      SpamTrainer - Trainer for spam
--- 5,9 ----
  Classes:
      PickledClassifier - Classifier that uses a pickle db
!     DBDictClassifier - Classifier that uses a DBM db
      Trainer - Classifier training observer
      SpamTrainer - Trainer for spam
***************
*** 18,23 ****
      databases.
  
!     DBDictClassifier is a Classifier class that uses a DBDict
!     datastore.
  
      Trainer is concrete class that observes a Corpus and trains a
--- 18,23 ----
      databases.
  
!     DBDictClassifier is a Classifier class that uses a database
!     store.
  
      Trainer is concrete class that observes a Corpus and trains a
***************
*** 50,55 ****
  from Options import options
  import cPickle as pickle
- import dbdict
  import errno
  
  PICKLE_TYPE = 1
--- 50,55 ----
  from Options import options
  import cPickle as pickle
  import errno
+ import shelve
  
  PICKLE_TYPE = 1
***************
*** 84,91 ****
              fp.close()
  
          if tempbayes:
              self.wordinfo = tempbayes.wordinfo
!             self.meta.nham = tempbayes.get_nham()
!             self.meta.nspam = tempbayes.get_nspam()
  
              if options.verbose:
--- 84,92 ----
              fp.close()
  
+         # XXX: why not self.__setstate__(tempbayes.__getstate__())?
          if tempbayes:
              self.wordinfo = tempbayes.wordinfo
!             self.nham = tempbayes.nham
!             self.nspam = tempbayes.nspam
  
              if options.verbose:
***************
*** 97,102 ****
                  print self.db_name,'is a new pickle'
              self.wordinfo = {}
!             self.meta.nham = 0
!             self.meta.nspam = 0
  
      def store(self):
--- 98,103 ----
                  print self.db_name,'is a new pickle'
              self.wordinfo = {}
!             self.nham = 0
!             self.nspam = 0
  
      def store(self):
***************
*** 110,124 ****
          fp.close()
  
-     def __getstate__(self):
-         return PICKLE_TYPE, self.wordinfo, self.meta
- 
-     def __setstate__(self, t):
-         if t[0] != PICKLE_TYPE:
-             raise ValueError("Can't unpickle -- version %s unknown" % t[0])
-         self.wordinfo, self.meta = t[1:]
- 
  
  class DBDictClassifier(classifier.Classifier):
!     '''Classifier object persisted in a WIDict'''
  
      def __init__(self, db_name, mode='c'):
--- 111,117 ----
          fp.close()
  
  
  class DBDictClassifier(classifier.Classifier):
!     '''Classifier object persisted in a caching database'''
  
      def __init__(self, db_name, mode='c'):
***************
*** 126,129 ****
--- 119,123 ----
  
          classifier.Classifier.__init__(self)
+         self.wordcache = {}
          self.statekey = "saved state"
          self.mode = mode
***************
*** 132,157 ****
  
      def load(self):
!         '''Load state from WIDict'''
  
          if options.verbose:
!             print 'Loading state from',self.db_name,'WIDict'
  
!         self.wordinfo = dbdict.DBDict(self.db_name, self.mode,
!                              classifier.WordInfo,iterskip=[self.statekey])
  
!         if self.wordinfo.has_key(self.statekey):
!             (nham, nspam) = self.wordinfo[self.statekey]
!             self.set_nham(nham)
!             self.set_nspam(nspam)
  
              if options.verbose:
!                 print '%s is an existing DBDict, with %d ham and %d spam' \
!                       % (self.db_name, self.nham, self.nspam)
          else:
!             # new dbdict
              if options.verbose:
!                 print self.db_name,'is a new DBDict'
!             self.set_nham(0)
!             self.set_nspam(0)
  
      def store(self):
--- 126,152 ----
  
      def load(self):
!         '''Load state from database'''
  
          if options.verbose:
!             print 'Loading state from',self.db_name,'database'
  
!         self.db = shelve.DbfilenameShelf(self.db_name, self.mode)
  
!         if self.db.has_key(self.statekey):
!             t = self.db[self.statekey]
!             if t[0] != classifier.PICKLE_VERSION:
!                 raise ValueError("Can't unpickle -- version %s unknown" % t[0])
!             (self.nspam, self.nham) = t[1:]
  
              if options.verbose:
!                 print '%s is an existing database, with %d spam and %d ham' \
!                       % (self.db_name, self.nspam, self.nham)
          else:
!             # new database
              if options.verbose:
!                 print self.db_name,'is a new database'
!             self.nspam = 0
!             self.nham = 0
!         self.wordinfo = {}
  
      def store(self):
***************
*** 159,166 ****
  
          if options.verbose:
!             print 'Persisting',self.db_name,'state in WIDict'
  
!         self.wordinfo[self.statekey] = (self.get_nham(), self.get_nspam())
!         self.wordinfo.sync()
  
  
--- 154,186 ----
  
          if options.verbose:
!             print 'Persisting',self.db_name,'state in database'
  
!         for key, val in self.wordinfo.iteritems():
!             if val == None:
!                 del self.wordinfo[key]
!                 try:
!                     del self.db[key]
!                 except KeyError:
!                     pass
!             else:
!                 self.db[key] = val.__getstate__()
!         self.db[self.statekey] = (classifier.PICKLE_VERSION,
!                                   self.nspam, self.nham)
!         self.db.sync()
! 
!     def _wordinfoget(self, word):
!         ret = self.wordinfo.get(word)
!         if not ret:
!             r = self.db.get(word)
!             if r:
!                 ret = self.WordInfoClass()
!                 ret.__setstate__(r)
!                 self.wordinfo[word] = ret
!         return ret
! 
!     # _wordinfoset is the same
! 
!     def _wordinfodel(self, word):
!         self.wordinfo[word] = None
  
  

--- dbdict.py DELETED ---





More information about the Spambayes-checkins mailing list