[Spambayes-checkins] spambayes classifier.py,1.60,1.61
hammie.py,1.43,1.44 storage.py,1.2,1.3 dbdict.py,1.4,NONE
Neale Pickett
npickett@users.sourceforge.net
Wed Nov 27 22:38:00 2002
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv31393
Modified Files:
classifier.py hammie.py storage.py
Removed Files:
dbdict.py
Log Message:
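* Caching dbdict implementation: dbdict.py is removed, and
DBDictClassifier now keeps words in an in-memory cache that is flushed
to a shelve database on store(). PICKLE_VERSION is bumped from 4 to 5,
so you'll have to retrain your databases again (sorry).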
Index: classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/classifier.py,v
retrieving revision 1.60
retrieving revision 1.61
diff -C2 -d -r1.60 -r1.61
*** classifier.py 26 Nov 2002 20:22:05 -0000 1.60
--- classifier.py 27 Nov 2002 22:37:55 -0000 1.61
***************
*** 47,75 ****
LN2 = math.log(2) # used frequently by chi-combining
! PICKLE_VERSION = 4
!
! class MetaInfo(object):
! """Information about the corpora.
!
! Contains nham and nspam, used for calculating probabilities.
!
! """
! def __init__(self):
! self.__setstate__((PICKLE_VERSION, 0, 0))
!
! def __repr__(self):
! return "MetaInfo%r" % repr((self._nspam,
! self._nham,
! self.revision))
!
! def __getstate__(self):
! return (PICKLE_VERSION, self.nspam, self.nham)
!
! def __setstate__(self, t):
! if t[0] != PICKLE_VERSION:
! raise ValueError("Can't unpickle -- version %s unknown" % t[0])
! self.nspam, self.nham = t[1:]
! self.revision = 0
!
class WordInfo(object):
--- 47,51 ----
LN2 = math.log(2) # used frequently by chi-combining
! PICKLE_VERSION = 5
class WordInfo(object):
***************
*** 109,138 ****
def __init__(self):
self.wordinfo = {}
- self.meta = MetaInfo()
self.probcache = {}
def __getstate__(self):
! return PICKLE_VERSION, self.wordinfo, self.meta
def __setstate__(self, t):
if t[0] != PICKLE_VERSION:
raise ValueError("Can't unpickle -- version %s unknown" % t[0])
! self.wordinfo, self.meta = t[1:]
self.probcache = {}
- # Slacker's way out--pass calls to nham/nspam up to the meta class
-
- def get_nham(self):
- return self.meta.nham
- def set_nham(self, val):
- self.meta.nham = val
- nham = property(get_nham, set_nham)
-
- def get_nspam(self):
- return self.meta.nspam
- def set_nspam(self, val):
- self.meta.nspam = val
- nspam = property(get_nspam, set_nspam)
-
# spamprob() implementations. One of the following is aliased to
# spamprob, depending on option settings.
--- 85,100 ----
def __init__(self):
self.wordinfo = {}
self.probcache = {}
+ self.nspam = self.nham = 0
def __getstate__(self):
! return (PICKLE_VERSION, self.wordinfo, self.nspam, self.nham)
def __setstate__(self, t):
if t[0] != PICKLE_VERSION:
raise ValueError("Can't unpickle -- version %s unknown" % t[0])
! (self.wordinfo, self.nspam, self.nham) = t[1:]
self.probcache = {}
# spamprob() implementations. One of the following is aliased to
# spamprob, depending on option settings.
***************
*** 331,336 ****
pass
! nham = float(self.meta.nham or 1)
! nspam = float(self.meta.nspam or 1)
assert hamcount <= nham
--- 293,298 ----
pass
! nham = float(self.nham or 1)
! nspam = float(self.nspam or 1)
assert hamcount <= nham
***************
*** 420,431 ****
self.probcache = {} # nuke the prob cache
if is_spam:
! self.meta.nspam += 1
else:
! self.meta.nham += 1
- wordinfo = self.wordinfo
- wordinfoget = wordinfo.get
for word in Set(wordstream):
! record = wordinfoget(word)
if record is None:
record = self.WordInfoClass()
--- 382,391 ----
self.probcache = {} # nuke the prob cache
if is_spam:
! self.nspam += 1
else:
! self.nham += 1
for word in Set(wordstream):
! record = self._wordinfoget(word)
if record is None:
record = self.WordInfoClass()
***************
*** 436,441 ****
record.hamcount += 1
! # Needed to tell a persistent DB that the content changed.
! wordinfo[word] = record
--- 396,400 ----
record.hamcount += 1
! self._wordinfoset(word, record)
***************
*** 443,458 ****
self.probcache = {} # nuke the prob cache
if is_spam:
! if self.meta.nspam <= 0:
raise ValueError("spam count would go negative!")
! self.meta.nspam -= 1
else:
! if self.meta.nham <= 0:
raise ValueError("non-spam count would go negative!")
! self.meta.nham -= -1
- wordinfo = self.wordinfo
- wordinfoget = wordinfo.get
for word in Set(wordstream):
! record = wordinfoget(word)
if record is not None:
if is_spam:
--- 402,415 ----
self.probcache = {} # nuke the prob cache
if is_spam:
! if self.nspam <= 0:
raise ValueError("spam count would go negative!")
! self.nspam -= 1
else:
! if self.nham <= 0:
raise ValueError("non-spam count would go negative!")
! self.nham -= -1
for word in Set(wordstream):
! record = self._wordinfoget(word)
if record is not None:
if is_spam:
***************
*** 463,471 ****
record.hamcount -= 1
if record.hamcount == 0 == record.spamcount:
! del wordinfo[word]
else:
! # Needed to tell a persistent DB that the content
! # changed.
! wordinfo[word] = record
def _getclues(self, wordstream):
--- 420,426 ----
record.hamcount -= 1
if record.hamcount == 0 == record.spamcount:
! self._wordinfodel(word)
else:
! self._wordinfoset(word, record)
def _getclues(self, wordstream):
***************
*** 476,482 ****
pushclue = clues.append
- wordinfoget = self.wordinfo.get
for word in Set(wordstream):
! record = wordinfoget(word)
if record is None:
prob = unknown
--- 431,436 ----
pushclue = clues.append
for word in Set(wordstream):
! record = self._wordinfoget(word)
if record is None:
prob = unknown
***************
*** 492,495 ****
--- 446,459 ----
# Return (prob, word, record).
return [t[1:] for t in clues]
+
+ def _wordinfoget(self, word):
+ return self.wordinfo.get(word)
+
+ def _wordinfoset(self, word, record):
+ self.wordinfo[word] = record
+
+ def _wordinfodel(self, word):
+ del self.wordinfo[word]
+
Index: hammie.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/hammie.py,v
retrieving revision 1.43
retrieving revision 1.44
diff -C2 -d -r1.43 -r1.44
*** hammie.py 25 Nov 2002 20:49:17 -0000 1.43
--- hammie.py 27 Nov 2002 22:37:56 -0000 1.44
***************
*** 2,6 ****
- import dbdict
import mboxutils
import storage
--- 2,5 ----
***************
*** 45,49 ****
for word, prob in clues
if (word[0] == '*' or
! prob <= SHOWCLUE or prob >= 1.0 - SHOWCLUE)])
def score(self, msg, evidence=False):
--- 44,49 ----
for word, prob in clues
if (word[0] == '*' or
! prob <= options.clue_mailheader_cutoff or
! prob >= 1.0 - options.clue_mailheader_cutoff)])
def score(self, msg, evidence=False):
Index: storage.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/storage.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** storage.py 26 Nov 2002 00:43:51 -0000 1.2
--- storage.py 27 Nov 2002 22:37:56 -0000 1.3
***************
*** 5,9 ****
Classes:
PickledClassifier - Classifier that uses a pickle db
! DBDictClassifier - Classifier that uses a DBDict db
Trainer - Classifier training observer
SpamTrainer - Trainer for spam
--- 5,9 ----
Classes:
PickledClassifier - Classifier that uses a pickle db
! DBDictClassifier - Classifier that uses a DBM db
Trainer - Classifier training observer
SpamTrainer - Trainer for spam
***************
*** 18,23 ****
databases.
! DBDictClassifier is a Classifier class that uses a DBDict
! datastore.
Trainer is concrete class that observes a Corpus and trains a
--- 18,23 ----
databases.
! DBDictClassifier is a Classifier class that uses a database
! store.
Trainer is concrete class that observes a Corpus and trains a
***************
*** 50,55 ****
from Options import options
import cPickle as pickle
- import dbdict
import errno
PICKLE_TYPE = 1
--- 50,55 ----
from Options import options
import cPickle as pickle
import errno
+ import shelve
PICKLE_TYPE = 1
***************
*** 84,91 ****
fp.close()
if tempbayes:
self.wordinfo = tempbayes.wordinfo
! self.meta.nham = tempbayes.get_nham()
! self.meta.nspam = tempbayes.get_nspam()
if options.verbose:
--- 84,92 ----
fp.close()
+ # XXX: why not self.__setstate__(tempbayes.__getstate__())?
if tempbayes:
self.wordinfo = tempbayes.wordinfo
! self.nham = tempbayes.nham
! self.nspam = tempbayes.nspam
if options.verbose:
***************
*** 97,102 ****
print self.db_name,'is a new pickle'
self.wordinfo = {}
! self.meta.nham = 0
! self.meta.nspam = 0
def store(self):
--- 98,103 ----
print self.db_name,'is a new pickle'
self.wordinfo = {}
! self.nham = 0
! self.nspam = 0
def store(self):
***************
*** 110,124 ****
fp.close()
- def __getstate__(self):
- return PICKLE_TYPE, self.wordinfo, self.meta
-
- def __setstate__(self, t):
- if t[0] != PICKLE_TYPE:
- raise ValueError("Can't unpickle -- version %s unknown" % t[0])
- self.wordinfo, self.meta = t[1:]
-
class DBDictClassifier(classifier.Classifier):
! '''Classifier object persisted in a WIDict'''
def __init__(self, db_name, mode='c'):
--- 111,117 ----
fp.close()
class DBDictClassifier(classifier.Classifier):
! '''Classifier object persisted in a caching database'''
def __init__(self, db_name, mode='c'):
***************
*** 126,129 ****
--- 119,123 ----
classifier.Classifier.__init__(self)
+ self.wordcache = {}
self.statekey = "saved state"
self.mode = mode
***************
*** 132,157 ****
def load(self):
! '''Load state from WIDict'''
if options.verbose:
! print 'Loading state from',self.db_name,'WIDict'
! self.wordinfo = dbdict.DBDict(self.db_name, self.mode,
! classifier.WordInfo,iterskip=[self.statekey])
! if self.wordinfo.has_key(self.statekey):
! (nham, nspam) = self.wordinfo[self.statekey]
! self.set_nham(nham)
! self.set_nspam(nspam)
if options.verbose:
! print '%s is an existing DBDict, with %d ham and %d spam' \
! % (self.db_name, self.nham, self.nspam)
else:
! # new dbdict
if options.verbose:
! print self.db_name,'is a new DBDict'
! self.set_nham(0)
! self.set_nspam(0)
def store(self):
--- 126,152 ----
def load(self):
! '''Load state from database'''
if options.verbose:
! print 'Loading state from',self.db_name,'database'
! self.db = shelve.DbfilenameShelf(self.db_name, self.mode)
! if self.db.has_key(self.statekey):
! t = self.db[self.statekey]
! if t[0] != classifier.PICKLE_VERSION:
! raise ValueError("Can't unpickle -- version %s unknown" % t[0])
! (self.nspam, self.nham) = t[1:]
if options.verbose:
! print '%s is an existing database, with %d spam and %d ham' \
! % (self.db_name, self.nspam, self.nham)
else:
! # new database
if options.verbose:
! print self.db_name,'is a new database'
! self.nspam = 0
! self.nham = 0
! self.wordinfo = {}
def store(self):
***************
*** 159,166 ****
if options.verbose:
! print 'Persisting',self.db_name,'state in WIDict'
! self.wordinfo[self.statekey] = (self.get_nham(), self.get_nspam())
! self.wordinfo.sync()
--- 154,186 ----
if options.verbose:
! print 'Persisting',self.db_name,'state in database'
! for key, val in self.wordinfo.iteritems():
! if val == None:
! del self.wordinfo[key]
! try:
! del self.db[key]
! except KeyError:
! pass
! else:
! self.db[key] = val.__getstate__()
! self.db[self.statekey] = (classifier.PICKLE_VERSION,
! self.nspam, self.nham)
! self.db.sync()
!
! def _wordinfoget(self, word):
! ret = self.wordinfo.get(word)
! if not ret:
! r = self.db.get(word)
! if r:
! ret = self.WordInfoClass()
! ret.__setstate__(r)
! self.wordinfo[word] = ret
! return ret
!
! # _wordinfoset is the same
!
! def _wordinfodel(self, word):
! self.wordinfo[word] = None
--- dbdict.py DELETED ---
More information about the Spambayes-checkins mailing list