[Spambayes-checkins] spambayes storage.py,NONE,1.1
FileCorpus.py,1.3,1.4
classifier.py,1.55,1.56 hammie.py,1.41,1.42 hammiebulk.py,1.2,1.3
pop3proxy.py,1.19,1.20 Persistent.py,1.2,NONE
Neale Pickett
npickett@users.sourceforge.net
Mon Nov 25 06:22:28 2002
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv1046
Modified Files:
FileCorpus.py classifier.py hammie.py hammiebulk.py
pop3proxy.py
Added Files:
storage.py
Removed Files:
Persistent.py
Log Message:
* renamed Persistent.py to storage.py
* removed PersistentClassifier class, moved classify() method to
classifier.Classifier class.
* This cvs commit has a lot of class ;)
--- NEW FILE: storage.py ---
#! /usr/bin/env python
'''storage.py - Spambayes database management framework.
Classes:
PickledClassifier - Classifier that uses a pickle db
DBDictClassifier - Classifier that uses a DBDict db
Trainer - Classifier training observer
SpamTrainer - Trainer for spam
HamTrainer - Trainer for ham
Abstract:
The *Classifier classes are subclasses of Classifier (classifier.Classifier)
that add automatic state store/restore functionality to the Classifier class.
PickledClassifier is a Classifier class that uses a cPickle
datastore. This database is relatively small, but slower than other
databases.
DBDictClassifier is a Classifier class that uses a DBDict
datastore.
Trainer is a concrete class that observes a Corpus and trains a
Classifier object based upon movement of messages between corpora. When
an add message notification is received, the trainer trains the
database with the message, as spam or ham as appropriate given the
type of trainer (spam or ham). When a remove message notification
is received, the trainer untrains the database as appropriate.
SpamTrainer and HamTrainer are convenience subclasses of Trainer, that
initialize as the appropriate type of Trainer
To Do:
o ZODBClassifier
o Would Trainer.trainall really want to train with the whole corpus,
or just a random subset?
o Suggestions?
'''
# This module is part of the spambayes project, which is Copyright 2002
# The Python Software Foundation and is covered by the Python Software
# Foundation license.
__author__ = "Tim Stone <tim@fourstonesExpressions.com>"
__credits__ = "Richie Hindle, Tim Peters, Neale Pickett, \
all the spambayes contributors."
import classifier
from Options import options
import cPickle as pickle
import dbdict
import errno
# Version tag stored alongside pickled state; bumped if the pickle layout changes.
PICKLE_TYPE = 1
NO_UPDATEPROBS = False  # Probabilities will not be autoupdated with training
UPDATEPROBS = True      # Probabilities will be autoupdated with training
# Module-wide switch for verbose diagnostic prints in load/store/train paths.
DEBUG = False
class PickledClassifier(classifier.Classifier):
'''Classifier object persisted in a pickle'''
def __init__(self, db_name):
classifier.Classifier.__init__(self)
self.db_name = db_name
self.load()
def load(self):
'''Load this instance from the pickle.'''
# This is a bit strange, because the loading process
# creates a temporary instance of PickledClassifier, from which
# this object's state is copied. This is a nuance of the way
# that pickle does its job
if DEBUG:
print 'Loading state from',self.db_name,'pickle'
tempbayes = None
try:
fp = open(self.db_name, 'rb')
except IOError, e:
if e.errno != errno.ENOENT: raise
else:
tempbayes = pickle.load(fp)
fp.close()
if tempbayes:
self.wordinfo = tempbayes.wordinfo
self.meta.nham = tempbayes.get_nham()
self.meta.nspam = tempbayes.get_nspam()
if DEBUG:
print '%s is an existing pickle, with %d ham and %d spam' \
% (self.db_name, self.nham, self.nspam)
else:
# new pickle
if DEBUG:
print self.db_name,'is a new pickle'
self.wordinfo = {}
self.meta.nham = 0
self.meta.nspam = 0
def store(self):
'''Store self as a pickle'''
if DEBUG:
print 'Persisting',self.db_name,'as a pickle'
fp = open(self.db_name, 'wb')
pickle.dump(self, fp, PICKLE_TYPE)
fp.close()
def __getstate__(self):
return PICKLE_TYPE, self.wordinfo, self.meta
def __setstate__(self, t):
if t[0] != PICKLE_TYPE:
raise ValueError("Can't unpickle -- version %s unknown" % t[0])
self.wordinfo, self.meta = t[1:]
class DBDictClassifier(classifier.Classifier):
'''Classifier object persisted in a WIDict'''
def __init__(self, db_name, mode='c'):
'''Constructor(database name)'''
classifier.Classifier.__init__(self)
self.statekey = "saved state"
self.mode = mode
self.db_name = db_name
self.load()
def load(self):
'''Load state from WIDict'''
if DEBUG:
print 'Loading state from',self.db_name,'WIDict'
self.wordinfo = dbdict.DBDict(self.db_name, self.mode,
classifier.WordInfo,iterskip=[self.statekey])
if self.wordinfo.has_key(self.statekey):
(nham, nspam) = self.wordinfo[self.statekey]
self.set_nham(nham)
self.set_nspam(nspam)
if DEBUG:
print '%s is an existing DBDict, with %d ham and %d spam' \
% (self.db_name, self.nham, self.nspam)
else:
# new dbdict
if DEBUG:
print self.db_name,'is a new DBDict'
self.set_nham(0)
self.set_nspam(0)
def store(self):
'''Place state into persistent store'''
if DEBUG:
print 'Persisting',self.db_name,'state in WIDict'
self.wordinfo[self.statekey] = (self.get_nham(), self.get_nspam())
self.wordinfo.sync()
class Trainer:
'''Associates a Classifier object and one or more Corpora, \
is an observer of the corpora'''
def __init__(self, bayes, is_spam, updateprobs=NO_UPDATEPROBS):
'''Constructor(Classifier, is_spam(True|False), updprobs(True|False)'''
self.bayes = bayes
self.is_spam = is_spam
self.updateprobs = updateprobs
def onAddMessage(self, message):
'''A message is being added to an observed corpus.'''
self.train(message)
def train(self, message):
'''Train the database with the message'''
if DEBUG:
print 'training with',message.key()
self.bayes.learn(message.tokenize(), self.is_spam)
# self.updateprobs)
def onRemoveMessage(self, message):
'''A message is being removed from an observed corpus.'''
self.untrain(message)
def untrain(self, message):
'''Untrain the database with the message'''
if DEBUG:
print 'untraining with',message.key()
self.bayes.unlearn(message.tokenize(), self.is_spam)
# self.updateprobs)
# can raise ValueError if database is fouled. If this is the case,
# then retraining is the only recovery option.
def trainAll(self, corpus):
'''Train all the messages in the corpus'''
for msg in corpus:
self.train(msg)
def untrainAll(self, corpus):
'''Untrain all the messages in the corpus'''
for msg in corpus:
self.untrain(msg)
class SpamTrainer(Trainer):
    '''Convenience Trainer preconfigured to treat messages as spam.'''

    def __init__(self, bayes, updateprobs=NO_UPDATEPROBS):
        '''Constructor(Classifier, updprobs(True|False)).'''
        Trainer.__init__(self, bayes, True, updateprobs)
class HamTrainer(Trainer):
    '''Convenience Trainer preconfigured to treat messages as ham.'''

    def __init__(self, bayes, updateprobs=NO_UPDATEPROBS):
        '''Constructor(Classifier, updprobs(True|False)).'''
        Trainer.__init__(self, bayes, False, updateprobs)
if __name__ == '__main__':
    # Bug fix: sys is used here but was never imported by this module,
    # so running "python storage.py" raised NameError.
    import sys
    print >>sys.stderr, __doc__
Index: FileCorpus.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/FileCorpus.py,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** FileCorpus.py 25 Nov 2002 02:29:44 -0000 1.3
--- FileCorpus.py 25 Nov 2002 06:22:26 -0000 1.4
***************
*** 86,90 ****
import Corpus
! import Persistent
import sys, os, gzip, fnmatch, getopt, errno, time, stat
--- 86,90 ----
import Corpus
! import storage
import sys, os, gzip, fnmatch, getopt, errno, time, stat
***************
*** 344,355 ****
print '\n\nCreating two Classifier databases'
! miscbayes = Persistent.PickledClassifier('fctestmisc.bayes')
! classbayes = Persistent.DBDictClassifier('fctestclass.bayes')
print '\n\nSetting up spam corpus'
spamcorpus = FileCorpus(fmFact, 'fctestspamcorpus')
! spamtrainer = Persistent.SpamTrainer(miscbayes)
spamcorpus.addObserver(spamtrainer)
! anotherspamtrainer = Persistent.SpamTrainer(classbayes, Persistent.UPDATEPROBS)
spamcorpus.addObserver(anotherspamtrainer)
--- 344,355 ----
print '\n\nCreating two Classifier databases'
! miscbayes = storage.PickledClassifier('fctestmisc.bayes')
! classbayes = storage.DBDictClassifier('fctestclass.bayes')
print '\n\nSetting up spam corpus'
spamcorpus = FileCorpus(fmFact, 'fctestspamcorpus')
! spamtrainer = storage.SpamTrainer(miscbayes)
spamcorpus.addObserver(spamtrainer)
! anotherspamtrainer = storage.SpamTrainer(classbayes, storage.UPDATEPROBS)
spamcorpus.addObserver(anotherspamtrainer)
***************
*** 366,370 ****
'fctesthamcorpus', \
'MSG*')
! hamtrainer = Persistent.HamTrainer(miscbayes)
hamcorpus.addObserver(hamtrainer)
hamtrainer.trainAll(hamcorpus)
--- 366,370 ----
'fctesthamcorpus', \
'MSG*')
! hamtrainer = storage.HamTrainer(miscbayes)
hamcorpus.addObserver(hamtrainer)
hamtrainer.trainAll(hamcorpus)
***************
*** 420,424 ****
print '\n\nTrain with an individual message'
! anotherhamtrainer = Persistent.HamTrainer(classbayes)
anotherhamtrainer.train(unsurecorpus['MSG00005'])
--- 420,424 ----
print '\n\nTrain with an individual message'
! anotherhamtrainer = storage.HamTrainer(classbayes)
anotherhamtrainer.train(unsurecorpus['MSG00005'])
***************
*** 723,725 ****
print >>sys.stderr, __doc__
!
\ No newline at end of file
--- 723,725 ----
print >>sys.stderr, __doc__
!
Index: classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/classifier.py,v
retrieving revision 1.55
retrieving revision 1.56
diff -C2 -d -r1.55 -r1.56
*** classifier.py 25 Nov 2002 04:11:29 -0000 1.55
--- classifier.py 25 Nov 2002 06:22:26 -0000 1.56
***************
*** 158,161 ****
--- 158,177 ----
# spamprob, depending on option settings.
+ def classify(self, message):
+ """Return the classification of a message as a string."""
+
+ prob = self.spamprob(message.tokenize())
+
+ message.setSpamprob(prob) # don't like this
+
+ if prob < options.ham_cutoff:
+ type = options.header_ham_string
+ elif prob > options.spam_cutoff:
+ type = options.header_spam_string
+ else:
+ type = options.header_unsure_string
+
+ return type
+
def gary_spamprob(self, wordstream, evidence=False):
"""Return best-guess probability that wordstream is spam.
Index: hammie.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/hammie.py,v
retrieving revision 1.41
retrieving revision 1.42
diff -C2 -d -r1.41 -r1.42
*** hammie.py 25 Nov 2002 02:29:44 -0000 1.41
--- hammie.py 25 Nov 2002 06:22:26 -0000 1.42
***************
*** 4,8 ****
import dbdict
import mboxutils
! import Persistent
from Options import options
from tokenizer import tokenize
--- 4,8 ----
import dbdict
import mboxutils
! import storage
from Options import options
from tokenizer import tokenize
***************
*** 180,186 ****
if usedb:
! b = Persistent.DBDictClassifier(filename, mode)
else:
! b = Persistent.PickledClassifier(filename)
return Hammie(b)
--- 180,186 ----
if usedb:
! b = storage.DBDictClassifier(filename, mode)
else:
! b = storage.PickledClassifier(filename)
return Hammie(b)
Index: hammiebulk.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/hammiebulk.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** hammiebulk.py 25 Nov 2002 02:29:44 -0000 1.2
--- hammiebulk.py 25 Nov 2002 06:22:26 -0000 1.3
***************
*** 52,56 ****
import mboxutils
import classifier
! import Persistent
import hammie
import Corpus
--- 52,56 ----
import mboxutils
import classifier
! import storage
import hammie
import Corpus
***************
*** 104,117 ****
print h.formatclues(clues)
return (spams, hams)
-
- def createbayes(pck=DEFAULTDB, usedb=False, mode='r'):
- """Create a Bayes instance for the given pickle (which
- doesn't have to exist). Create a PersistentBayes if
- usedb is True."""
- if usedb:
- bayes = Persistent.DBDictClassifier(pck, mode)
- else:
- bayes = Persistent.PickledClassifier(pck)
- return bayes
def usage(code, msg=''):
--- 104,107 ----
Index: pop3proxy.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v
retrieving revision 1.19
retrieving revision 1.20
diff -C2 -d -r1.19 -r1.20
*** pop3proxy.py 25 Nov 2002 02:29:44 -0000 1.19
--- pop3proxy.py 25 Nov 2002 06:22:26 -0000 1.20
***************
*** 119,123 ****
import os, sys, re, operator, errno, getopt, string, cStringIO, time, bisect
import socket, asyncore, asynchat, cgi, urlparse, webbrowser
! import Persistent, tokenizer, mboxutils
from FileCorpus import FileCorpus, FileMessageFactory, GzipFileMessageFactory
from email.Iterators import typed_subpart_iterator
--- 119,123 ----
import os, sys, re, operator, errno, getopt, string, cStringIO, time, bisect
import socket, asyncore, asynchat, cgi, urlparse, webbrowser
! import storage, tokenizer, mboxutils
from FileCorpus import FileCorpus, FileMessageFactory, GzipFileMessageFactory
from email.Iterators import typed_subpart_iterator
***************
*** 1203,1209 ****
self.databaseFilename = '_pop3proxy_test.pickle' # Never saved
if self.useDB:
! self.bayes = Persistent.DBDictClassifier(self.databaseFilename)
else:
! self.bayes = Persistent.PickledClassifier(self.databaseFilename)
print "Done."
--- 1203,1209 ----
self.databaseFilename = '_pop3proxy_test.pickle' # Never saved
if self.useDB:
! self.bayes = storage.DBDictClassifier(self.databaseFilename)
else:
! self.bayes = storage.PickledClassifier(self.databaseFilename)
print "Done."
***************
*** 1230,1235 ****
# Create the Trainers.
! self.spamTrainer = Persistent.SpamTrainer(self.bayes)
! self.hamTrainer = Persistent.HamTrainer(self.bayes)
self.spamCorpus.addObserver(self.spamTrainer)
self.hamCorpus.addObserver(self.hamTrainer)
--- 1230,1235 ----
# Create the Trainers.
! self.spamTrainer = storage.SpamTrainer(self.bayes)
! self.hamTrainer = storage.HamTrainer(self.bayes)
self.spamCorpus.addObserver(self.spamTrainer)
self.hamCorpus.addObserver(self.hamTrainer)
--- Persistent.py DELETED ---
More information about the Spambayes-checkins
mailing list