[Spambayes-checkins] spambayes storage.py,NONE,1.1
FileCorpus.py,1.3,1.4
classifier.py,1.55,1.56 hammie.py,1.41,1.42 hammiebulk.py,1.2,1.3
pop3proxy.py,1.19,1.20 Persistent.py,1.2,NONE
Neale Pickett
npickett@users.sourceforge.net
Mon Nov 25 06:22:28 2002
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv1046
Modified Files:
FileCorpus.py classifier.py hammie.py hammiebulk.py
pop3proxy.py
Added Files:
storage.py
Removed Files:
Persistent.py
Log Message:
* renamed Persistent.py to storage.py
* removed PersistentClassifier class, moved classify() method to
classifier.Classifier class.
* This cvs commit has a lot of class ;)
--- NEW FILE: storage.py ---
#! /usr/bin/env python
'''storage.py - Spambayes database management framework.
Classes:
PickledClassifier - Classifier that uses a pickle db
DBDictClassifier - Classifier that uses a DBDict db
Trainer - Classifier training observer
SpamTrainer - Trainer for spam
HamTrainer - Trainer for ham
Abstract:
The *Classifier classes are subclasses of Classifier (classifier.Classifier)
that add automatic state store/restore functionality to the Classifier class.
PickledClassifier is a Classifier class that uses a cPickle
datastore. This database is relatively small, but slower than other
databases.
DBDictClassifier is a Classifier class that uses a DBDict
datastore.
Trainer is a concrete class that observes a Corpus and trains a
Classifier object based upon movement of messages between corpora. When
an add message notification is received, the trainer trains the
database with the message, as spam or ham as appropriate given the
type of trainer (spam or ham). When a remove message notification
is received, the trainer untrains the database as appropriate.
SpamTrainer and HamTrainer are convenience subclasses of Trainer, that
initialize as the appropriate type of Trainer
To Do:
o ZODBClassifier
o Would Trainer.trainall really want to train with the whole corpus,
or just a random subset?
o Suggestions?
'''
# This module is part of the spambayes project, which is Copyright 2002
# The Python Software Foundation and is covered by the Python Software
# Foundation license.
__author__ = "Tim Stone <tim@fourstonesExpressions.com>"
__credits__ = "Richie Hindle, Tim Peters, Neale Pickett, \
all the spambayes contributors."
import classifier
from Options import options
import cPickle as pickle
import dbdict
import errno
# Version tag stored alongside pickled state; bumped if the pickle layout changes.
PICKLE_TYPE = 1
NO_UPDATEPROBS = False  # Probabilities will not be autoupdated with training
UPDATEPROBS = True      # Probabilities will be autoupdated with training
# Module-wide switch for verbose diagnostic prints in load/store/train paths.
DEBUG = False
class PickledClassifier(classifier.Classifier):
'''Classifier object persisted in a pickle'''
def __init__(self, db_name):
classifier.Classifier.__init__(self)
self.db_name = db_name
self.load()
def load(self):
'''Load this instance from the pickle.'''
# This is a bit strange, because the loading process
# creates a temporary instance of PickledClassifier, from which
# this object's state is copied. This is a nuance of the way
# that pickle does its job
if DEBUG:
print 'Loading state from',self.db_name,'pickle'
tempbayes = None
try:
fp = open(self.db_name, 'rb')
except IOError, e:
if e.errno != errno.ENOENT: raise
else:
tempbayes = pickle.load(fp)
fp.close()
if tempbayes:
self.wordinfo = tempbayes.wordinfo
self.meta.nham = tempbayes.get_nham()
self.meta.nspam = tempbayes.get_nspam()
if DEBUG:
print '%s is an existing pickle, with %d ham and %d spam' \
% (self.db_name, self.nham, self.nspam)
else:
# new pickle
if DEBUG:
print self.db_name,'is a new pickle'
self.wordinfo = {}
self.meta.nham = 0
self.meta.nspam = 0
def store(self):
'''Store self as a pickle'''
if DEBUG:
print 'Persisting',self.db_name,'as a pickle'
fp = open(self.db_name, 'wb')
pickle.dump(self, fp, PICKLE_TYPE)
fp.close()
def __getstate__(self):
return PICKLE_TYPE, self.wordinfo, self.meta
def __setstate__(self, t):
if t[0] != PICKLE_TYPE:
raise ValueError("Can't unpickle -- version %s unknown" % t[0])
self.wordinfo, self.meta = t[1:]
class DBDictClassifier(classifier.Classifier):
'''Classifier object persisted in a WIDict'''
def __init__(self, db_name, mode='c'):
'''Constructor(database name)'''
classifier.Classifier.__init__(self)
self.statekey = "saved state"
self.mode = mode
self.db_name = db_name
self.load()
def load(self):
'''Load state from WIDict'''
if DEBUG:
print 'Loading state from',self.db_name,'WIDict'
self.wordinfo = dbdict.DBDict(self.db_name, self.mode,
classifier.WordInfo,iterskip=[self.statekey])
if self.wordinfo.has_key(self.statekey):
(nham, nspam) = self.wordinfo[self.statekey]
self.set_nham(nham)
self.set_nspam(nspam)
if DEBUG:
print '%s is an existing DBDict, with %d ham and %d spam' \
% (self.db_name, self.nham, self.nspam)
else:
# new dbdict
if DEBUG:
print self.db_name,'is a new DBDict'
self.set_nham(0)
self.set_nspam(0)
def store(self):
'''Place state into persistent store'''
if DEBUG:
print 'Persisting',self.db_name,'state in WIDict'
self.wordinfo[self.statekey] = (self.get_nham(), self.get_nspam())
self.wordinfo.sync()
class Trainer:
'''Associates a Classifier object and one or more Corpora, \
is an observer of the corpora'''
def __init__(self, bayes, is_spam, updateprobs=NO_UPDATEPROBS):
'''Constructor(Classifier, is_spam(True|False), updprobs(True|False)'''
self.bayes = bayes
self.is_spam = is_spam
self.updateprobs = updateprobs
def onAddMessage(self, message):
'''A message is being added to an observed corpus.'''
self.train(message)
def train(self, message):
'''Train the database with the message'''
if DEBUG:
print 'training with',message.key()
self.bayes.learn(message.tokenize(), self.is_spam)
# self.updateprobs)
def onRemoveMessage(self, message):
'''A message is being removed from an observed corpus.'''
self.untrain(message)
def untrain(self, message):
'''Untrain the database with the message'''
if DEBUG:
print 'untraining with',message.key()
self.bayes.unlearn(message.tokenize(), self.is_spam)
# self.updateprobs)
# can raise ValueError if database is fouled. If this is the case,
# then retraining is the only recovery option.
def trainAll(self, corpus):
'''Train all the messages in the corpus'''
for msg in corpus:
self.train(msg)
def untrainAll(self, corpus):
'''Untrain all the messages in the corpus'''
for msg in corpus:
self.untrain(msg)
class SpamTrainer(Trainer):
    '''Convenience Trainer preconfigured to treat messages as spam.'''

    def __init__(self, bayes, updateprobs=NO_UPDATEPROBS):
        '''Constructor(Classifier, updprobs(True|False)).'''
        Trainer.__init__(self, bayes, True, updateprobs)
class HamTrainer(Trainer):
    '''Convenience Trainer preconfigured to treat messages as ham.'''

    def __init__(self, bayes, updateprobs=NO_UPDATEPROBS):
        '''Constructor(Classifier, updprobs(True|False)).'''
        Trainer.__init__(self, bayes, False, updateprobs)
if __name__ == '__main__':
    # Bug fix: sys is used here but was never imported by this module,
    # so running "python storage.py" raised NameError.
    import sys
    print >>sys.stderr, __doc__
Index: FileCorpus.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/FileCorpus.py,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** FileCorpus.py 25 Nov 2002 02:29:44 -0000 1.3
--- FileCorpus.py 25 Nov 2002 06:22:26 -0000 1.4
***************
*** 86,90 ****
import Corpus
! import Persistent
import sys, os, gzip, fnmatch, getopt, errno, time, stat
--- 86,90 ----
import Corpus
! import storage
import sys, os, gzip, fnmatch, getopt, errno, time, stat
***************
*** 344,355 ****
print '\n\nCreating two Classifier databases'
! miscbayes = Persistent.PickledClassifier('fctestmisc.bayes')
! classbayes = Persistent.DBDictClassifier('fctestclass.bayes')
print '\n\nSetting up spam corpus'
spamcorpus = FileCorpus(fmFact, 'fctestspamcorpus')
! spamtrainer = Persistent.SpamTrainer(miscbayes)
spamcorpus.addObserver(spamtrainer)
! anotherspamtrainer = Persistent.SpamTrainer(classbayes, Persistent.UPDATEPROBS)
spamcorpus.addObserver(anotherspamtrainer)
--- 344,355 ----
print '\n\nCreating two Classifier databases'
! miscbayes = storage.PickledClassifier('fctestmisc.bayes')
! classbayes = storage.DBDictClassifier('fctestclass.bayes')
print '\n\nSetting up spam corpus'
spamcorpus = FileCorpus(fmFact, 'fctestspamcorpus')
! spamtrainer = storage.SpamTrainer(miscbayes)
spamcorpus.addObserver(spamtrainer)
! anotherspamtrainer = storage.SpamTrainer(classbayes, storage.UPDATEPROBS)
spamcorpus.addObserver(anotherspamtrainer)
***************
*** 366,370 ****
'fctesthamcorpus', \
'MSG*')
! hamtrainer = Persistent.HamTrainer(miscbayes)
hamcorpus.addObserver(hamtrainer)
hamtrainer.trainAll(hamcorpus)
--- 366,370 ----
'fctesthamcorpus', \
'MSG*')
! hamtrainer = storage.HamTrainer(miscbayes)
hamcorpus.addObserver(hamtrainer)
hamtrainer.trainAll(hamcorpus)
***************
*** 420,424 ****
print '\n\nTrain with an individual message'
! anotherhamtrainer = Persistent.HamTrainer(classbayes)
anotherhamtrainer.train(unsurecorpus['MSG00005'])
--- 420,424 ----
print '\n\nTrain with an individual message'
! anotherhamtrainer = storage.HamTrainer(classbayes)
anotherhamtrainer.train(unsurecorpus['MSG00005'])
***************
*** 723,725 ****
print >>sys.stderr, __doc__
!
\ No newline at end of file
--- 723,725 ----
print >>sys.stderr, __doc__
!
Index: classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/classifier.py,v
retrieving revision 1.55
retrieving revision 1.56
diff -C2 -d -r1.55 -r1.56
*** classifier.py 25 Nov 2002 04:11:29 -0000 1.55
--- classifier.py 25 Nov 2002 06:22:26 -0000 1.56
***************
*** 158,161 ****
--- 158,177 ----
# spamprob, depending on option settings.
+ def classify(self, message):
+ """Return the classification of a message as a string."""
+
+ prob = self.spamprob(message.tokenize())
+
+ message.setSpamprob(prob) # don't like this
+
+ if prob < options.ham_cutoff:
+ type = options.header_ham_string
+ elif prob > options.spam_cutoff:
+ type = options.header_spam_string
+ else:
+ type = options.header_unsure_string
+
+ return type
+
def gary_spamprob(self, wordstream, evidence=False):
"""Return best-guess probability that wordstream is spam.
Index: hammie.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/hammie.py,v
retrieving revision 1.41
retrieving revision 1.42
diff -C2 -d -r1.41 -r1.42
*** hammie.py 25 Nov 2002 02:29:44 -0000 1.41
--- hammie.py 25 Nov 2002 06:22:26 -0000 1.42
***************
*** 4,8 ****
import dbdict
import mboxutils
! import Persistent
from Options import options
from tokenizer import tokenize
--- 4,8 ----
import dbdict
import mboxutils
! import storage
from Options import options
from tokenizer import tokenize
***************
*** 180,186 ****
if usedb:
! b = Persistent.DBDictClassifier(filename, mode)
else:
! b = Persistent.PickledClassifier(filename)
return Hammie(b)
--- 180,186 ----
if usedb:
! b = storage.DBDictClassifier(filename, mode)
else:
! b = storage.PickledClassifier(filename)
return Hammie(b)
Index: hammiebulk.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/hammiebulk.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** hammiebulk.py 25 Nov 2002 02:29:44 -0000 1.2
--- hammiebulk.py 25 Nov 2002 06:22:26 -0000 1.3
***************
*** 52,56 ****
import mboxutils
import classifier
! import Persistent
import hammie
import Corpus
--- 52,56 ----
import mboxutils
import classifier
! import storage
import hammie
import Corpus
***************
*** 104,117 ****
print h.formatclues(clues)
return (spams, hams)
-
- def createbayes(pck=DEFAULTDB, usedb=False, mode='r'):
- """Create a Bayes instance for the given pickle (which
- doesn't have to exist). Create a PersistentBayes if
- usedb is True."""
- if usedb:
- bayes = Persistent.DBDictClassifier(pck, mode)
- else:
- bayes = Persistent.PickledClassifier(pck)
- return bayes
def usage(code, msg=''):
--- 104,107 ----
Index: pop3proxy.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v
retrieving revision 1.19
retrieving revision 1.20
diff -C2 -d -r1.19 -r1.20
*** pop3proxy.py 25 Nov 2002 02:29:44 -0000 1.19
--- pop3proxy.py 25 Nov 2002 06:22:26 -0000 1.20
***************
*** 119,123 ****
import os, sys, re, operator, errno, getopt, string, cStringIO, time, bisect
import socket, asyncore, asynchat, cgi, urlparse, webbrowser
! import Persistent, tokenizer, mboxutils
from FileCorpus import FileCorpus, FileMessageFactory, GzipFileMessageFactory
from email.Iterators import typed_subpart_iterator
--- 119,123 ----
import os, sys, re, operator, errno, getopt, string, cStringIO, time, bisect
import socket, asyncore, asynchat, cgi, urlparse, webbrowser
! import storage, tokenizer, mboxutils
from FileCorpus import FileCorpus, FileMessageFactory, GzipFileMessageFactory
from email.Iterators import typed_subpart_iterator
***************
*** 1203,1209 ****
self.databaseFilename = '_pop3proxy_test.pickle' # Never saved
if self.useDB:
! self.bayes = Persistent.DBDictClassifier(self.databaseFilename)
else:
! self.bayes = Persistent.PickledClassifier(self.databaseFilename)
print "Done."
--- 1203,1209 ----
self.databaseFilename = '_pop3proxy_test.pickle' # Never saved
if self.useDB:
! self.bayes = storage.DBDictClassifier(self.databaseFilename)
else:
! self.bayes = storage.PickledClassifier(self.databaseFilename)
print "Done."
***************
*** 1230,1235 ****
# Create the Trainers.
! self.spamTrainer = Persistent.SpamTrainer(self.bayes)
! self.hamTrainer = Persistent.HamTrainer(self.bayes)
self.spamCorpus.addObserver(self.spamTrainer)
self.hamCorpus.addObserver(self.hamTrainer)
--- 1230,1235 ----
# Create the Trainers.
! self.spamTrainer = storage.SpamTrainer(self.bayes)
! self.hamTrainer = storage.HamTrainer(self.bayes)
self.spamCorpus.addObserver(self.spamTrainer)
self.hamCorpus.addObserver(self.hamTrainer)
--- Persistent.py DELETED ---
More information about the Spambayes-checkins
mailing list