[Spambayes-checkins] spambayes Persistent.py,1.1,1.2
hammiebulk.py,1.1,1.2
Corpus.py,1.2,1.3 FileCorpus.py,1.2,1.3 Options.py,1.75,1.76
TestDriver.py,1.30,1.31 Tester.py,1.8,1.9 classifier.py,1.53,1.54
dbdict.py,1.1,1.2 hammie.py,1.40,1.41 hammiefilter.py,1.2,1.3
pop3proxy.py,1.18,1.19 Bayes.py,1.5,NONE
Neale Pickett <npickett@users.sourceforge.net>
Mon Nov 25 02:29:47 2002
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv31682
Modified Files:
Corpus.py FileCorpus.py Options.py TestDriver.py Tester.py
classifier.py dbdict.py hammie.py hammiefilter.py pop3proxy.py
Added Files:
Persistent.py hammiebulk.py
Removed Files:
Bayes.py
Log Message:
* Merge from hammie-playground to HEAD. See spambayes list for more
details.
Index: Corpus.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Corpus.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** Corpus.py 16 Nov 2002 19:03:15 -0000 1.2
--- Corpus.py 25 Nov 2002 02:29:44 -0000 1.3
***************
*** 230,234 ****
return msg
!
class ExpiryCorpus:
--- 230,234 ----
return msg
!
class ExpiryCorpus:
***************
*** 272,276 ****
def __init__(self):
'''Constructor()'''
! pass
def load(self):
--- 272,278 ----
def __init__(self):
'''Constructor()'''
!
! self.payload = None
! self.hdrtxt = None
def load(self):
***************
*** 297,301 ****
'''Instance as a printable string'''
! return self.substance
def name(self):
--- 299,303 ----
'''Instance as a printable string'''
! return self.getSubstance()
def name(self):
***************
*** 311,322 ****
def setSubstance(self, sub):
'''set this message substance'''
!
! self.substance = sub
!
def getSubstance(self):
'''Return this message substance'''
!
! return self.substance
!
def setSpamprob(self, prob):
'''Score of the last spamprob calc, may not be persistent'''
--- 313,328 ----
def setSubstance(self, sub):
'''set this message substance'''
!
! bodyRE = re.compile(r"\r?\n(\r?\n)(.*)", re.DOTALL+re.MULTILINE)
! bmatch = bodyRE.search(sub)
! if bmatch:
! self.payload = bmatch.group(2)
! self.hdrtxt = sub[:bmatch.start(2)]
!
def getSubstance(self):
'''Return this message substance'''
!
! return self.hdrtxt + self.payload
!
def setSpamprob(self, prob):
'''Score of the last spamprob calc, may not be persistent'''
***************
*** 327,331 ****
'''Returns substance as tokens'''
! return tokenizer.tokenize(self.substance)
def createTimeStamp(self):
--- 333,337 ----
'''Returns substance as tokens'''
! return tokenizer.tokenize(self.getSubstance())
def createTimeStamp(self):
***************
*** 335,338 ****
--- 341,398 ----
raise NotImplementedError
+ def getFrom(self):
+ '''Return a message From header content'''
+
+ if self.hdrtxt:
+ match = re.search(r'^From:(.*)$', self.hdrtxt, re.MULTILINE)
+ return match.group(1)
+ else:
+ return None
+
+ def getSubject(self):
+ '''Return a message Subject header contents'''
+
+ if self.hdrtxt:
+ match = re.search(r'^Subject:(.*)$', self.hdrtxt, re.MULTILINE)
+ return match.group(1)
+ else:
+ return None
+
+ def getDate(self):
+ '''Return a message Date header contents'''
+
+ if self.hdrtxt:
+ match = re.search(r'^Date:(.*)$', self.hdrtxt, re.MULTILINE)
+ return match.group(1)
+ else:
+ return None
+
+ def getHeadersList(self):
+ '''Return a list of message header tuples'''
+
+ hdrregex = re.compile(r'^([A-Za-z0-9-_]*): ?(.*)$', re.MULTILINE)
+ data = re.sub(r'\r?\n\r?\s', ' ', self.hdrtxt)
+ match = hdrregex.findall(data)
+
+ return match
+
+ def getHeaders(self):
+ '''Return message headers as text'''
+
+ return self.hdrtxt
+
+ def getPayload(self):
+ '''Return the message body'''
+
+ return self.payload
+
+ def stripSBDHeader(self):
+ '''Removes the X-Spambayes-Disposition: header from the message'''
+
+ # This is useful for training, where a spammer may be spoofing
+ # our header, to make sure that our header doesn't become an
+ # overweight clue to hamminess
+
+ raise NotImplementedError
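For illustration, a minimal sketch of the new header accessors, using
FileCorpus.FileMessage as a convenient concrete subclass (the message
text and file name are hypothetical; nothing is loaded from or stored
to disk here):

    import FileCorpus

    msg = FileCorpus.FileMessage('MSG99999', '/tmp/nocorpus')
    msg.setSubstance('From: neale@woozle.org\n'
                     'Subject: test\n'
                     'Date: Mon, 25 Nov 2002 02:29:47 -0000\n'
                     '\n'
                     'Hello, world\n')
    print msg.getFrom()         # ' neale@woozle.org' (leading space kept)
    print msg.getSubject()      # ' test'
    print msg.getHeadersList()  # [('From', 'neale@woozle.org'), ...]
    print msg.getPayload()      # 'Hello, world\n'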
Index: FileCorpus.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/FileCorpus.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** FileCorpus.py 16 Nov 2002 19:06:27 -0000 1.2
--- FileCorpus.py 25 Nov 2002 02:29:44 -0000 1.3
***************
*** 86,90 ****
import Corpus
! import Bayes
import sys, os, gzip, fnmatch, getopt, errno, time, stat
--- 86,90 ----
import Corpus
! import Persistent
import sys, os, gzip, fnmatch, getopt, errno, time, stat
***************
*** 192,195 ****
--- 192,196 ----
'''Constructor(message file name, corpus directory name)'''
+ Corpus.Message.__init__(self)
self.file_name = file_name
self.directory = directory
***************
*** 214,218 ****
raise
else:
! self.substance = fp.read()
fp.close()
--- 215,219 ----
raise
else:
! self.setSubstance(fp.read())
fp.close()
***************
*** 225,229 ****
pn = self.pathname()
fp = open(pn, 'wb')
! fp.write(self.substance)
fp.close()
--- 226,230 ----
pn = self.pathname()
fp = open(pn, 'wb')
! fp.write(self.getSubstance())
fp.close()
***************
*** 248,260 ****
elip = ''
! sub = self.substance
!
if Corpus.Verbose:
! sub = self.substance
else:
! if len(self.substance) > 20:
! sub = self.substance[:20]
! if len(self.substance) > 40:
! sub += '...' + self.substance[-20:]
pn = os.path.join(self.directory, self.file_name)
--- 249,261 ----
elip = ''
! sub = self.getSubstance()
!
if Corpus.Verbose:
! sub = self.getSubstance()
else:
! if len(sub) > 20:
! sub = sub[:20]
! if len(sub) > 40:
! sub += '...' + sub[-20:]
pn = os.path.join(self.directory, self.file_name)
***************
*** 304,308 ****
raise
else:
! self.substance = fp.read()
fp.close()
--- 305,309 ----
raise
else:
! self.setSubstance(fp.read())
fp.close()
***************
*** 316,320 ****
pn = self.pathname()
gz = gzip.open(pn, 'wb')
! gz.write(self.substance)
gz.flush()
gz.close()
--- 317,321 ----
pn = self.pathname()
gz = gzip.open(pn, 'wb')
! gz.write(self.getSubstance())
gz.flush()
gz.close()
***************
*** 342,354 ****
print 'Executing with uncompressed files'
! print '\n\nCreating two Bayes databases'
! miscbayes = Bayes.PickledBayes('fctestmisc.bayes')
! classbayes = Bayes.DBDictBayes('fctestclass.bayes')
print '\n\nSetting up spam corpus'
spamcorpus = FileCorpus(fmFact, 'fctestspamcorpus')
! spamtrainer = Bayes.SpamTrainer(miscbayes)
spamcorpus.addObserver(spamtrainer)
! anotherspamtrainer = Bayes.SpamTrainer(classbayes, Bayes.UPDATEPROBS)
spamcorpus.addObserver(anotherspamtrainer)
--- 343,355 ----
print 'Executing with uncompressed files'
! print '\n\nCreating two Classifier databases'
! miscbayes = Persistent.PickledClassifier('fctestmisc.bayes')
! classbayes = Persistent.DBDictClassifier('fctestclass.bayes')
print '\n\nSetting up spam corpus'
spamcorpus = FileCorpus(fmFact, 'fctestspamcorpus')
! spamtrainer = Persistent.SpamTrainer(miscbayes)
spamcorpus.addObserver(spamtrainer)
! anotherspamtrainer = Persistent.SpamTrainer(classbayes, Persistent.UPDATEPROBS)
spamcorpus.addObserver(anotherspamtrainer)
***************
*** 365,374 ****
'fctesthamcorpus', \
'MSG*')
! hamtrainer = Bayes.HamTrainer(miscbayes)
hamcorpus.addObserver(hamtrainer)
hamtrainer.trainAll(hamcorpus)
!
! print '\n\nAdd a message to hamcorpus that does not match the filter'
if useGzip:
fmClass = GzipFileMessage
--- 366,374 ----
'fctesthamcorpus', \
'MSG*')
! hamtrainer = Persistent.HamTrainer(miscbayes)
hamcorpus.addObserver(hamtrainer)
hamtrainer.trainAll(hamcorpus)
! print '\n\nA couple of message related tests'
if useGzip:
fmClass = GzipFileMessage
***************
*** 377,380 ****
--- 377,383 ----
m1 = fmClass('XMG00001', 'fctestspamcorpus')
+ m1.setSubstance(testmsg2())
+
+ print '\n\nAdd a message to hamcorpus that does not match the filter'
try:
***************
*** 417,421 ****
print '\n\nTrain with an individual message'
! anotherhamtrainer = Bayes.HamTrainer(classbayes)
anotherhamtrainer.train(unsurecorpus['MSG00005'])
--- 420,424 ----
print '\n\nTrain with an individual message'
! anotherhamtrainer = Persistent.HamTrainer(classbayes)
anotherhamtrainer.train(unsurecorpus['MSG00005'])
***************
*** 428,431 ****
--- 431,443 ----
msg = spamcorpus['MSG00001']
print msg
+ print '\n\nThis is some vital information in the message'
+ print 'Date header is',msg.getDate()
+ print 'Subject header is',msg.getSubject()
+ print 'From header is',msg.getFrom()
+
+ print 'Header text is:',msg.getHeaders()
+ print 'Headers are:',msg.getHeadersList()
+ print 'Body is:',msg.getPayload()
+
***************
*** 526,538 ****
m1 = fmClass('MSG00001', 'fctestspamcorpus')
! m1.substance = tm1
m1.store()
m2 = fmClass('MSG00002', 'fctestspamcorpus')
! m2.substance = tm2
m2.store()
m3 = fmClass('MSG00003', 'fctestunsurecorpus')
! m3.substance = tm1
m3.store()
--- 538,550 ----
m1 = fmClass('MSG00001', 'fctestspamcorpus')
! m1.setSubstance(tm1)
m1.store()
m2 = fmClass('MSG00002', 'fctestspamcorpus')
! m2.setSubstance(tm2)
m2.store()
m3 = fmClass('MSG00003', 'fctestunsurecorpus')
! m3.setSubstance(tm1)
m3.store()
***************
*** 546,558 ****
m4 = fmClass('MSG00004', 'fctestunsurecorpus')
! m4.substance = tm1
m4.store()
m5 = fmClass('MSG00005', 'fctestunsurecorpus')
! m5.substance = tm2
m5.store()
m6 = fmClass('MSG00006', 'fctestunsurecorpus')
! m6.substance = tm2
m6.store()
--- 558,570 ----
m4 = fmClass('MSG00004', 'fctestunsurecorpus')
! m4.setSubstance(tm1)
m4.store()
m5 = fmClass('MSG00005', 'fctestunsurecorpus')
! m5.setSubstance(tm2)
m5.store()
m6 = fmClass('MSG00006', 'fctestunsurecorpus')
! m6.setSubstance(tm2)
m6.store()
***************
*** 583,587 ****
Content-Type:text/plain; charset=us-ascii
Content-Transfer-Encoding:7bit
-
Message-ID:<15814.42238.882013.702030@montanaro.dyndns.org>
Date:Mon, 4 Nov 2002 10:49:02 -0600
--- 595,598 ----
***************
*** 644,648 ****
Content-Type:text/plain; charset=us-ascii
Content-Transfer-Encoding:7bit
-
X-Hammie-Disposition:Unsure
--- 655,658 ----
Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Options.py,v
retrieving revision 1.75
retrieving revision 1.76
diff -C2 -d -r1.75 -r1.76
*** Options.py 20 Nov 2002 22:41:50 -0000 1.75
--- Options.py 25 Nov 2002 02:29:44 -0000 1.76
***************
*** 198,209 ****
show_unsure: False
- # Near the end of Driver.test(), you can get a listing of the best
- # discriminators in the words from the training sets. These are the
- # words whose WordInfo.killcount values are highest, meaning they most
- # often were among the most extreme clues spamprob() found. The number
- # of best discriminators to show is given by show_best_discriminators;
- # set this <= 0 to suppress showing any of the best discriminators.
- show_best_discriminators: 30
-
# The maximum # of characters to display for a msg displayed due to the
# show_xyz options above.
--- 198,201 ----
***************
*** 346,356 ****
clue_mailheader_cutoff: 0.5
! # The default database path used by hammie
! persistent_storage_file: hammie.db
!
! # hammie can use either a database (quick to score one message) or a pickle
! # (quick to train on huge amounts of messages). Set this to True to use a
! # database by default.
! persistent_use_database: False
[pop3proxy]
--- 338,347 ----
clue_mailheader_cutoff: 0.5
! [hammiefilter]
! # hammiefilter can use either a database (quick to score one message) or
! # a pickle (quick to train on huge amounts of messages). Set this to
! # True to use a database by default.
! hammiefilter_persistent_use_database: True
! hammiefilter_persistent_storage_file: ~/.hammiedb
[pop3proxy]
***************
*** 368,371 ****
--- 359,364 ----
pop3proxy_ham_cache: pop3proxy-ham-cache
pop3proxy_unknown_cache: pop3proxy-unknown-cache
+ pop3proxy_persistent_use_database: False
+ pop3proxy_persistent_storage_file: hammie.db
# Deprecated - use pop3proxy_servers and pop3proxy_ports instead.
***************
*** 411,415 ****
'show_histograms': boolean_cracker,
'percentiles': ('get', lambda s: map(float, s.split())),
- 'show_best_discriminators': int_cracker,
'save_trained_pickles': boolean_cracker,
'save_histogram_pickles': boolean_cracker,
--- 404,407 ----
***************
*** 436,440 ****
},
'Hammie': {'hammie_header_name': string_cracker,
- 'persistent_storage_file': string_cracker,
'clue_mailheader_cutoff': float_cracker,
'persistent_use_database': boolean_cracker,
--- 428,431 ----
***************
*** 447,450 ****
--- 438,444 ----
'hammie_debug_header_name': string_cracker,
},
+ 'hammiefilter' : {'hammiefilter_persistent_use_database': boolean_cracker,
+ 'hammiefilter_persistent_storage_file': string_cracker,
+ },
'pop3proxy': {'pop3proxy_servers': string_cracker,
'pop3proxy_ports': string_cracker,
***************
*** 457,460 ****
--- 451,456 ----
'pop3proxy_ham_cache': string_cracker,
'pop3proxy_unknown_cache': string_cracker,
+ 'pop3proxy_persistent_use_database': boolean_cracker,
+ 'pop3proxy_persistent_storage_file': string_cracker,
},
'html_ui': {'html_ui_port': int_cracker,
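The renamed options are read through the usual Options machinery; a
minimal sketch (the values shown are the new defaults from this
checkin):

    from Options import options

    # hammiefilter now has its own section and defaults to a DB in $HOME
    print options.hammiefilter_persistent_use_database   # True
    print options.hammiefilter_persistent_storage_file   # '~/.hammiedb'

    # pop3proxy gets its own, independently configurable, store
    print options.pop3proxy_persistent_use_database      # False
    print options.pop3proxy_persistent_storage_file      # 'hammie.db'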
Index: TestDriver.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/TestDriver.py,v
retrieving revision 1.30
retrieving revision 1.31
diff -C2 -d -r1.30 -r1.31
*** TestDriver.py 19 Nov 2002 17:43:27 -0000 1.30
--- TestDriver.py 25 Nov 2002 02:29:44 -0000 1.31
***************
*** 305,324 ****
printmsg(e, prob, clues)
- if options.show_best_discriminators > 0:
- print
- print " best discriminators:"
- stats = [(-1, None)] * options.show_best_discriminators
- smallest_killcount = -1
- for w, r in c.wordinfo.iteritems():
- if r.killcount > smallest_killcount:
- heapreplace(stats, (r.killcount, w))
- smallest_killcount = stats[0][0]
- stats.sort()
- for count, w in stats:
- if count < 0:
- continue
- r = c.wordinfo[w]
- print " %r %d %g" % (w, r.killcount, r.spamprob)
-
if options.show_histograms:
printhist("this pair:", local_ham_hist, local_spam_hist)
--- 305,308 ----
Index: Tester.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Tester.py,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -d -r1.8 -r1.9
*** Tester.py 7 Nov 2002 22:30:04 -0000 1.8
--- Tester.py 25 Nov 2002 02:29:44 -0000 1.9
***************
*** 60,68 ****
if hamstream is not None:
for example in hamstream:
! learn(example, False, False)
if spamstream is not None:
for example in spamstream:
! learn(example, True, False)
! self.classifier.update_probabilities()
# Untrain the classifier on streams of ham and spam. Updates
--- 60,67 ----
if hamstream is not None:
for example in hamstream:
! learn(example, False)
if spamstream is not None:
for example in spamstream:
! learn(example, True)
# Untrain the classifier on streams of ham and spam. Updates
***************
*** 73,81 ****
if hamstream is not None:
for example in hamstream:
! unlearn(example, False, False)
if spamstream is not None:
for example in spamstream:
! unlearn(example, True, False)
! self.classifier.update_probabilities()
# Run prediction on each sample in stream. You're swearing that stream
--- 72,79 ----
if hamstream is not None:
for example in hamstream:
! unlearn(example, False)
if spamstream is not None:
for example in spamstream:
! unlearn(example, True)
# Run prediction on each sample in stream. You're swearing that stream
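With the third argument to learn()/unlearn() gone, driver code no
longer ends a training pass with update_probabilities(); a minimal
before/after sketch of the calling convention (classifier and
hamstream are hypothetical stand-ins):

    # Before this checkin:
    for example in hamstream:
        classifier.learn(example, False, False)   # defer prob updates
    classifier.update_probabilities()             # one expensive pass

    # After this checkin:
    for example in hamstream:
        classifier.learn(example, False)          # probs computed lazily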
Index: classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/classifier.py,v
retrieving revision 1.53
retrieving revision 1.54
diff -C2 -d -r1.53 -r1.54
*** classifier.py 18 Nov 2002 18:23:09 -0000 1.53
--- classifier.py 25 Nov 2002 02:29:44 -0000 1.54
***************
*** 1,2 ****
--- 1,3 ----
+ #! /usr/bin/env python
# An implementation of a Bayes-like spam classifier.
#
***************
*** 32,36 ****
import math
- import time
from sets import Set
--- 33,36 ----
***************
*** 47,92 ****
LN2 = math.log(2) # used frequently by chi-combining
! PICKLE_VERSION = 1
- class WordInfo(object):
- __slots__ = ('atime', # when this record was last used by scoring(*)
- 'spamcount', # # of spams in which this word appears
- 'hamcount', # # of hams in which this word appears
- 'killcount', # # of times this made it to spamprob()'s nbest
- 'spamprob', # prob(spam | msg contains this word)
- )
# Invariant: For use in a classifier database, at least one of
# spamcount and hamcount must be non-zero.
- #
- # (*)atime is the last access time, a UTC time.time() value. It's the
- # most recent time this word was used by scoring (i.e., by spamprob(),
- # not by training via learn()); or, if the word has never been used by
- # scoring, the time the word record was created (i.e., by learn()).
- # One good criterion for identifying junk (word records that have no
- # value) is to delete words that haven't been used for a long time.
- # Perhaps they were typos, or unique identifiers, or relevant to a
- # once-hot topic or scam that's fallen out of favor. Whatever, if
- # a word is no longer being used, it's just wasting space.
! def __init__(self, atime, spamprob=options.unknown_word_prob):
! self.atime = atime
! self.spamcount = self.hamcount = self.killcount = 0
! self.spamprob = spamprob
def __repr__(self):
! return "WordInfo%r" % repr((self.atime, self.spamcount,
! self.hamcount, self.killcount,
! self.spamprob))
def __getstate__(self):
! return (self.atime, self.spamcount, self.hamcount, self.killcount,
! self.spamprob)
def __setstate__(self, t):
! (self.atime, self.spamcount, self.hamcount, self.killcount,
! self.spamprob) = t
! class Bayes:
# Defining __slots__ here made Jeremy's life needlessly difficult when
# trying to hook this all up to ZODB as a persistent object. There's
--- 47,116 ----
LN2 = math.log(2) # used frequently by chi-combining
! PICKLE_VERSION = 4
!
! class MetaInfo(object):
! """Information about the corpora.
!
! Contains nham and nspam, used for calculating probabilities. Also
! has a revision, incremented every time nham or nspam is adjusted.
! Nothing uses this, currently, but it's there if you want it.
!
! """
! def __init__(self):
! self.__setstate__((PICKLE_VERSION, 0, 0))
!
! def __repr__(self):
! return "MetaInfo%r" % repr((self._nspam,
! self._nham,
! self.revision))
!
! def __getstate__(self):
! return (PICKLE_VERSION, self._nspam, self._nham)
!
! def __setstate__(self, t):
! if t[0] != PICKLE_VERSION:
! raise ValueError("Can't unpickle -- version %s unknown" % t[0])
! (self._nspam, self._nham) = t[1:]
! self.revision = 0
!
! def incr_rev(self):
! self.revision += 1
!
! def get_nham(self):
! return self._nham
! def set_nham(self, val):
! self._nham = val
! self.incr_rev()
! nham = property(get_nham, set_nham)
!
! def set_nspam(self, val):
! self._nspam = val
! self.incr_rev()
! def get_nspam(self):
! return self._nspam
! nspam = property(get_nspam, set_nspam)
!
!
+ class WordInfo(object):
# Invariant: For use in a classifier database, at least one of
# spamcount and hamcount must be non-zero.
! def __init__(self):
! self.__setstate__((0, 0))
def __repr__(self):
! return "WordInfo%r" % repr((self.spamcount,
! self.hamcount))
def __getstate__(self):
! return (self.spamcount,
! self.hamcount)
def __setstate__(self, t):
! (self.spamcount, self.hamcount) = t
!
! class Classifier:
# Defining __slots__ here made Jeremy's life needlessly difficult when
# trying to hook this all up to ZODB as a persistent object. There's
***************
*** 105,117 ****
def __init__(self):
self.wordinfo = {}
! self.nspam = self.nham = 0
def __getstate__(self):
! return PICKLE_VERSION, self.wordinfo, self.nspam, self.nham
def __setstate__(self, t):
if t[0] != PICKLE_VERSION:
raise ValueError("Can't unpickle -- version %s unknown" % t[0])
! self.wordinfo, self.nspam, self.nham = t[1:]
# spamprob() implementations. One of the following is aliased to
--- 129,156 ----
def __init__(self):
self.wordinfo = {}
! self.meta = MetaInfo()
! self.probcache = {}
def __getstate__(self):
! return PICKLE_VERSION, self.wordinfo, self.meta
def __setstate__(self, t):
if t[0] != PICKLE_VERSION:
raise ValueError("Can't unpickle -- version %s unknown" % t[0])
! self.wordinfo, self.meta = t[1:]
!
! # Slacker's way out--pass calls to nham/nspam up to the meta class
!
! def get_nham(self):
! return self.meta.nham
! def set_nham(self, val):
! self.meta.nham = val
! nham = property(get_nham, set_nham)
!
! def get_nspam(self):
! return self.meta.nspam
! def set_nspam(self, val):
! self.meta.nspam = val
! nspam = property(get_nspam, set_nspam)
# spamprob() implementations. One of the following is aliased to
***************
*** 145,150 ****
clues = self._getclues(wordstream)
for prob, word, record in clues:
- if record is not None: # else wordinfo doesn't know about it
- record.killcount += 1
P *= 1.0 - prob
Q *= prob
--- 184,187 ----
***************
*** 234,239 ****
clues = self._getclues(wordstream)
for prob, word, record in clues:
- if record is not None: # else wordinfo doesn't know about it
- record.killcount += 1
S *= 1.0 - prob
H *= prob
--- 271,274 ----
***************
*** 278,282 ****
spamprob = chi2_spamprob
! def learn(self, wordstream, is_spam, update_probabilities=True):
"""Teach the classifier by example.
--- 313,317 ----
spamprob = chi2_spamprob
! def learn(self, wordstream, is_spam):
"""Teach the classifier by example.
***************
*** 285,324 ****
else that it's definitely not spam.
! If optional arg update_probabilities is False (the default is True),
! don't update word probabilities. Updating them is expensive, and if
! you're going to pass many messages to learn(), it's more efficient
! to pass False here and call update_probabilities() once when you're
! done -- or to call learn() with update_probabilities=True when
! passing the last new example. The important thing is that the
! probabilities get updated before calling spamprob() again.
"""
self._add_msg(wordstream, is_spam)
- if update_probabilities:
- self.update_probabilities()
! def unlearn(self, wordstream, is_spam, update_probabilities=True):
"""In case of pilot error, call unlearn ASAP after screwing up.
Pass the same arguments you passed to learn().
"""
-
self._remove_msg(wordstream, is_spam)
- if update_probabilities:
- self.update_probabilities()
! def update_probabilities(self):
! """Update the word probabilities in the spam database.
! This computes a new probability for every word in the database,
! so can be expensive. learn() and unlearn() update the probabilities
! each time by default. They have an optional argument that allows
! you to skip this step when feeding in many messages, and in that case
! you should call update_probabilities() after feeding the last
! message and before calling spamprob().
"""
! nham = float(self.nham or 1)
! nspam = float(self.nspam or 1)
if options.experimental_ham_spam_imbalance_adjustment:
--- 320,371 ----
else that it's definitely not spam.
! Word probabilities are no longer updated eagerly here; they
! are computed lazily (and cached) by probability() the next
! time a word is scored, so there is no longer any need to
! call update_probabilities() between learn() and spamprob().
!
"""
self._add_msg(wordstream, is_spam)
! def unlearn(self, wordstream, is_spam):
"""In case of pilot error, call unlearn ASAP after screwing up.
Pass the same arguments you passed to learn().
"""
self._remove_msg(wordstream, is_spam)
! def probability(self, record):
! """Compute, store, and return prob(msg is spam | msg contains word).
! This is the Graham calculation, but stripped of biases, and
! stripped of clamping into 0.01 thru 0.99. The Bayesian
! adjustment following keeps them in a sane range, and one
! that naturally grows the more evidence there is to back up
! a probability.
"""
! spamcount = record.spamcount
! hamcount = record.hamcount
!
! # Try the cache first
! try:
! return self.probcache[spamcount][hamcount]
! except KeyError:
! pass
!
! nham = float(self.meta.nham or 1)
! nspam = float(self.meta.nspam or 1)
!
! assert hamcount <= nham
! hamratio = hamcount / nham
!
! assert spamcount <= nspam
! spamratio = spamcount / nspam
!
! prob = spamratio / (hamratio + spamratio)
if options.experimental_ham_spam_imbalance_adjustment:
***************
*** 331,405 ****
StimesX = S * options.unknown_word_prob
- for word, record in self.wordinfo.iteritems():
- # Compute p(word) = prob(msg is spam | msg contains word).
- # This is the Graham calculation, but stripped of biases, and
- # stripped of clamping into 0.01 thru 0.99. The Bayesian
- # adjustment following keeps them in a sane range, and one
- # that naturally grows the more evidence there is to back up
- # a probability.
- hamcount = record.hamcount
- assert hamcount <= nham
- hamratio = hamcount / nham
! spamcount = record.spamcount
! assert spamcount <= nspam
! spamratio = spamcount / nspam
!
! prob = spamratio / (hamratio + spamratio)
! # Now do Robinson's Bayesian adjustment.
! #
! # s*x + n*p(w)
! # f(w) = --------------
! # s + n
! #
! # I find this easier to reason about like so (equivalent when
! # s != 0):
! #
! # x - p
! # p + -------
! # 1 + n/s
! #
! # IOW, it moves p a fraction of the distance from p to x, and
! # less so the larger n is, or the smaller s is.
! # Experimental:
! # Picking a good value for n is interesting: how much empirical
! # evidence do we really have? If nham == nspam,
! # hamcount + spamcount makes a lot of sense, and the code here
! # does that by default.
! # But if, e.g., nham is much larger than nspam, p(w) can get a
! # lot closer to 0.0 than it can get to 1.0. That in turn makes
! # strong ham words (high hamcount) much stronger than strong
! # spam words (high spamcount), and that makes the accidental
! # appearance of a strong ham word in spam much more damaging than
! # the accidental appearance of a strong spam word in ham.
! # So we don't give hamcount full credit when nham > nspam (or
! # spamcount when nspam > nham): instead we knock hamcount down
! # to what it would have been had nham been equal to nspam. IOW,
! # we multiply hamcount by nspam/nham when nspam < nham; or, IOOW,
! # we don't "believe" any count to an extent more than
! # min(nspam, nham) justifies.
! n = hamcount * spam2ham + spamcount * ham2spam
! prob = (StimesX + n * prob) / (S + n)
! if record.spamprob != prob:
! record.spamprob = prob
! # The next seemingly pointless line appears to be a hack
! # to allow a persistent db to realize the record has changed.
! self.wordinfo[word] = record
! def clearjunk(self, oldesttime):
! """Forget useless wordinfo records. This can shrink the database size.
! A record for a word will be retained only if the word was accessed
! at or after oldesttime.
"""
! wordinfo = self.wordinfo
! tonuke = [w for w, r in wordinfo.iteritems() if r.atime < oldesttime]
! for w in tonuke:
! del wordinfo[w]
# NOTE: Graham's scheme had a strange asymmetry: when a word appeared
--- 378,440 ----
StimesX = S * options.unknown_word_prob
! # Now do Robinson's Bayesian adjustment.
! #
! # s*x + n*p(w)
! # f(w) = --------------
! # s + n
! #
! # I find this easier to reason about like so (equivalent when
! # s != 0):
! #
! # x - p
! # p + -------
! # 1 + n/s
! #
! # IOW, it moves p a fraction of the distance from p to x, and
! # less so the larger n is, or the smaller s is.
! # Experimental:
! # Picking a good value for n is interesting: how much empirical
! # evidence do we really have? If nham == nspam,
! # hamcount + spamcount makes a lot of sense, and the code here
! # does that by default.
! # But if, e.g., nham is much larger than nspam, p(w) can get a
! # lot closer to 0.0 than it can get to 1.0. That in turn makes
! # strong ham words (high hamcount) much stronger than strong
! # spam words (high spamcount), and that makes the accidental
! # appearance of a strong ham word in spam much more damaging than
! # the accidental appearance of a strong spam word in ham.
! # So we don't give hamcount full credit when nham > nspam (or
! # spamcount when nspam > nham): instead we knock hamcount down
! # to what it would have been had nham been equal to nspam. IOW,
! # we multiply hamcount by nspam/nham when nspam < nham; or, IOOW,
! # we don't "believe" any count to an extent more than
! # min(nspam, nham) justifies.
! n = hamcount * spam2ham + spamcount * ham2spam
! prob = (StimesX + n * prob) / (S + n)
! # Update the cache
! try:
! self.probcache[spamcount][hamcount] = prob
! except KeyError:
! self.probcache[spamcount] = {hamcount: prob}
! return prob
! def update_probabilities(self):
! """Update the word probabilities in the spam database.
! This computes a new probability for every word in the database,
! which can be expensive. learn() and unlearn() clear the
! probability cache each time by default, and that will be rebuilt
! as probabilities are looked up. If for some reason you need to
! update all the probabilities in one step (say, for
! benchmarking), you can call this method.
"""
! for word, record in self.wordinfo.iteritems():
! self.probability(record)
# NOTE: Graham's scheme had a strange asymmetry: when a word appeared
***************
*** 424,439 ****
# to exploit it.
def _add_msg(self, wordstream, is_spam):
if is_spam:
! self.nspam += 1
else:
! self.nham += 1
wordinfo = self.wordinfo
wordinfoget = wordinfo.get
- now = time.time()
for word in Set(wordstream):
record = wordinfoget(word)
if record is None:
! record = self.WordInfoClass(now)
if is_spam:
--- 459,474 ----
# to exploit it.
def _add_msg(self, wordstream, is_spam):
+ self.probcache = {} # nuke the prob cache
if is_spam:
! self.meta.nspam += 1
else:
! self.meta.nham += 1
wordinfo = self.wordinfo
wordinfoget = wordinfo.get
for word in Set(wordstream):
record = wordinfoget(word)
if record is None:
! record = self.WordInfoClass()
if is_spam:
***************
*** 441,456 ****
else:
record.hamcount += 1
# Needed to tell a persistent DB that the content changed.
wordinfo[word] = record
def _remove_msg(self, wordstream, is_spam):
if is_spam:
! if self.nspam <= 0:
raise ValueError("spam count would go negative!")
! self.nspam -= 1
else:
! if self.nham <= 0:
raise ValueError("non-spam count would go negative!")
! self.nham -= 1
wordinfo = self.wordinfo
--- 476,494 ----
else:
record.hamcount += 1
+
# Needed to tell a persistent DB that the content changed.
wordinfo[word] = record
+
def _remove_msg(self, wordstream, is_spam):
+ self.probcache = {} # nuke the prob cache
if is_spam:
! if self.meta.nspam <= 0:
raise ValueError("spam count would go negative!")
! self.meta.nspam -= 1
else:
! if self.meta.nham <= 0:
raise ValueError("non-spam count would go negative!")
! self.meta.nham -= 1
wordinfo = self.wordinfo
***************
*** 468,472 ****
del wordinfo[word]
else:
! # Needed to tell a persistent DB that the content changed.
wordinfo[word] = record
--- 506,511 ----
del wordinfo[word]
else:
! # Needed to tell a persistent DB that the content
! # changed.
wordinfo[word] = record
***************
*** 479,483 ****
wordinfoget = self.wordinfo.get
- now = time.time()
for word in Set(wordstream):
record = wordinfoget(word)
--- 518,521 ----
***************
*** 485,490 ****
prob = unknown
else:
! record.atime = now
! prob = record.spamprob
distance = abs(prob - 0.5)
if distance >= mindist:
--- 523,527 ----
prob = unknown
else:
! prob = self.probability(record)
distance = abs(prob - 0.5)
if distance >= mindist:
***************
*** 496,497 ****
--- 533,537 ----
# Return (prob, word, record).
return [t[1:] for t in clues]
+
+
+ Bayes = Classifier
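Training no longer recomputes every word's probability. probability()
computes and caches prob(spam | word) on demand, keyed by
(spamcount, hamcount), and _add_msg()/_remove_msg() simply empty the
cache. A minimal sketch of the resulting workflow (the token lists are
hypothetical; note that Bayes survives as an alias for old callers):

    import classifier

    c = classifier.Classifier()
    c.learn(['cheap', 'pills', 'now'], True)        # spam; no update step
    c.learn(['lunch', 'meeting', 'agenda'], False)  # ham
    print c.spamprob(['cheap', 'meeting'])          # probs computed and
                                                    # cached at score time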
Index: dbdict.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/dbdict.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** dbdict.py 19 Nov 2002 23:31:44 -0000 1.1
--- dbdict.py 25 Nov 2002 02:29:44 -0000 1.2
***************
*** 1,6 ****
#! /usr/bin/env python
from __future__ import generators
! import dbhash
try:
import cPickle as pickle
--- 1,55 ----
#! /usr/bin/env python
+ """DBDict.py - Dictionary access to dbhash
+
+ Classes:
+ DBDict - wraps a dbhash file
+
+ Abstract:
+ DBDict class wraps a dbhash file with a reasonably complete set
+ of dictionary access methods. DBDicts can be iterated like a dictionary.
+
+ The constructor accepts a class which is used specifically to
+ pickle/unpickle an instance of that class. When an instance of
+ that class is being pickled, the pickler (actually __getstate__) prepends
+ a 'W' to the pickled string, and when the unpickler (really __setstate__)
+ encounters that 'W', it constructs that class (with no constructor
+ arguments) and executes __setstate__ on the constructed instance.
+
+ DBDict accepts an iterskip argument on the constructor. This is a tuple
+ of hash keys that will be skipped (not seen) during iteration. This
+ is for iteration only. Methods such as keys() will return the entire
+ complement of keys in the dbm hash, even if they're in iterskip. An
+ iterkeys() method is provided for iterating with skipped keys, and
+ itervalues() is provided for iterating values with skipped keys.
+
+ >>> d = DBDict('/tmp/goober.db', MODE_CREATE, dict, ('skipme', 'skipmetoo'))
+ >>> d['skipme'] = 'booga'
+ >>> d['countme'] = 'wakka'
+ >>> print d.keys()
+ ['skipme', 'countme']
+ >>> for k in d.iterkeys():
+ ... print k
+ countme
+ >>> for v in d.itervalues():
+ ... print v
+ wakka
+ >>> for k,v in d.iteritems():
+ ... print k,v
+ countme wakka
+
+ To Do:
+ """
+
+ # This module is part of the spambayes project, which is Copyright 2002
+ # The Python Software Foundation and is covered by the Python Software
+ # Foundation license.
+
from __future__ import generators
!
+ __author__ = "Neale Pickett <neale@woozle.org>, \
+ Tim Stone <tim@fourstonesExpressions.com>"
+ __credits__ = "Tim Peters (author of DBDict class), \
+ all the spambayes contributors."
try:
import cPickle as pickle
***************
*** 8,11 ****
--- 57,72 ----
import pickle
+ import dbhash
+ import errno
+ import copy
+ import shutil
+ import os
+
+ MODE_CREATE = 'c' # create file if necessary, open for readwrite
+ MODE_NEW = 'n' # always create new file, open for readwrite
+ MODE_READWRITE = 'w' # open existing file for readwrite
+ MODE_READONLY = 'r' # open existing file for read only
+
+
class DBDict:
"""Database Dictionary.
***************
*** 19,23 ****
like .keys() still list everything. For instance:
! >>> d = DBDict('goober.db', 'c', ('skipme', 'skipmetoo'))
>>> d['skipme'] = 'booga'
>>> d['countme'] = 'wakka'
--- 80,84 ----
like .keys() still list everything. For instance:
! >>> d = DBDict('goober.db', MODE_CREATE, dict, ('skipme', 'skipmetoo'))
>>> d['skipme'] = 'booga'
>>> d['countme'] = 'wakka'
***************
*** 30,36 ****
"""
! def __init__(self, dbname, mode, iterskip=()):
self.hash = dbhash.open(dbname, mode)
! self.iterskip = iterskip
def __getitem__(self, key):
--- 91,121 ----
"""
! def __init__(self, dbname, mode, wclass, iterskip=()):
self.hash = dbhash.open(dbname, mode)
! if iterskip:
! self.iterskip = iterskip
! else:
! self.iterskip = ()
! self.wclass=wclass
!
! def __getitem__(self, key):
! v = self.hash[key]
! if v[0] == 'W':
! val = pickle.loads(v[1:])
! # We could be sneaky, like pickle.Unpickler.load_inst,
! # but I think that's overly confusing.
! obj = self.wclass()
! obj.__setstate__(val)
! return obj
! else:
! return pickle.loads(v)
!
! def __setitem__(self, key, val):
! if isinstance(val, self.wclass):
! val = val.__getstate__()
! v = 'W' + pickle.dumps(val, 1)
! else:
! v = pickle.dumps(val, 1)
! self.hash[key] = v
def __getitem__(self, key):
***************
*** 79,82 ****
--- 164,168 ----
def itervalues(self):
return self.__iter__(lambda k: k[1])
+
open = DBDict
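A minimal sketch of the new wclass handling (the database path is
hypothetical): values that are instances of wclass round-trip through
__getstate__/__setstate__ behind a 'W' tag, everything else is pickled
plain:

    import dbdict
    import classifier

    d = dbdict.DBDict('/tmp/words.db', dbdict.MODE_CREATE,
                      classifier.WordInfo, iterskip=('saved state',))
    wi = classifier.WordInfo()
    wi.spamcount = 3
    d['cheap'] = wi             # stored as 'W' + pickle of (3, 0)
    print d['cheap']            # reconstructed WordInfo via __setstate__
    d['saved state'] = (1, 2)   # plain pickle; hidden from iteration only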
Index: hammie.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/hammie.py,v
retrieving revision 1.40
retrieving revision 1.41
diff -C2 -d -r1.40 -r1.41
*** hammie.py 18 Nov 2002 18:13:54 -0000 1.40
--- hammie.py 25 Nov 2002 02:29:44 -0000 1.41
***************
*** 1,56 ****
#! /usr/bin/env python
- # A driver for the classifier module and Tim's tokenizer that you can
- # call from procmail.
-
- """Usage: %(program)s [options]
-
- Where:
- -h
- show usage and exit
- -g PATH
- mbox or directory of known good messages (non-spam) to train on.
- Can be specified more than once, or use - for stdin.
- -s PATH
- mbox or directory of known spam messages to train on.
- Can be specified more than once, or use - for stdin.
- -u PATH
- mbox of unknown messages. A ham/spam decision is reported for each.
- Can be specified more than once.
- -r
- reverse the meaning of the check (report ham instead of spam).
- Only meaningful with the -u option.
- -p FILE
- use file as the persistent store. loads data from this file if it
- exists, and saves data to this file at the end.
- Default: %(DEFAULTDB)s
- -d
- use the DBM store instead of cPickle. The file is larger and
- creating it is slower, but checking against it is much faster,
- especially for large word databases. Default: %(USEDB)s
- -D
- the reverse of -d: use the cPickle instead of DBM
- -f
- run as a filter: read a single message from stdin, add an
- %(DISPHEADER)s header, and write it to stdout. If you want to
- run from procmail, this is your option.
- """
-
- from __future__ import generators
-
- import sys
- import os
- import types
- import getopt
- import mailbox
- import glob
- import email
- import errno
- import anydbm
- import cPickle as pickle
import mboxutils
! import classifier
from Options import options
try:
--- 1,10 ----
#! /usr/bin/env python
+ import dbdict
import mboxutils
! import Persistent
from Options import options
+ from tokenizer import tokenize
try:
***************
*** 61,224 ****
! program = sys.argv[0] # For usage(); referenced by docstring above
!
! # Name of the header to add in filter mode
! DISPHEADER = options.hammie_header_name
! DEBUGHEADER = options.hammie_debug_header_name
! DODEBUG = options.hammie_debug_header
!
! # Default database name
! DEFAULTDB = options.persistent_storage_file
!
! # Probability at which a message is considered spam
! SPAM_THRESHOLD = options.spam_cutoff
! HAM_THRESHOLD = options.ham_cutoff
!
! # Probability limit for a clue to be added to the DISPHEADER
! SHOWCLUE = options.clue_mailheader_cutoff
!
! # Use a database? If False, use a pickle
! USEDB = options.persistent_use_database
!
! # Tim's tokenizer kicks far more booty than anything I would have
! # written. Score one for analysis ;)
! from tokenizer import tokenize
!
! class DBDict:
!
! """Database Dictionary.
!
! This wraps an anydbm to make it look even more like a dictionary.
!
! Call it with the name of your database file. Optionally, you can
! specify a list of keys to skip when iterating. This only affects
! iterators; things like .keys() still list everything. For instance:
!
! >>> d = DBDict('/tmp/goober.db', ('skipme', 'skipmetoo'))
! >>> d['skipme'] = 'booga'
! >>> d['countme'] = 'wakka'
! >>> print d.keys()
! ['skipme', 'countme']
! >>> for k in d.iterkeys():
! ... print k
! countme
!
! """
!
! def __init__(self, dbname, mode, iterskip=()):
! self.hash = anydbm.open(dbname, mode)
! self.iterskip = iterskip
!
! def __getitem__(self, key):
! v = self.hash[key]
! if v[0] == 'W':
! val = pickle.loads(v[1:])
! # We could be sneaky, like pickle.Unpickler.load_inst,
! # but I think that's overly confusing.
! obj = classifier.WordInfo(0)
! obj.__setstate__(val)
! return obj
! else:
! return pickle.loads(v)
!
! def __setitem__(self, key, val):
! if isinstance(val, classifier.WordInfo):
! val = val.__getstate__()
! v = 'W' + pickle.dumps(val, 1)
! else:
! v = pickle.dumps(val, 1)
! self.hash[key] = v
!
! def __delitem__(self, key, val):
! del(self.hash[key])
!
! def __iter__(self, fn=None):
! k = self.hash.first()
! while k != None:
! key = k[0]
! val = self.__getitem__(key)
! if key not in self.iterskip:
! if fn:
! yield fn((key, val))
! else:
! yield (key, val)
! try:
! k = self.hash.next()
! except KeyError:
! break
!
! def __contains__(self, name):
! return self.has_key(name)
!
! def __getattr__(self, name):
! # Pass the buck
! return getattr(self.hash, name)
!
! def get(self, key, dfl=None):
! if self.has_key(key):
! return self[key]
! else:
! return dfl
!
! def iteritems(self):
! return self.__iter__()
!
! def iterkeys(self):
! return self.__iter__(lambda k: k[0])
!
! def itervalues(self):
! return self.__iter__(lambda k: k[1])
!
!
! class PersistentBayes(classifier.Bayes):
!
! """A persistent Bayes classifier.
!
! This is just like classifier.Bayes, except that the dictionary is a
! database. You take less disk this way and you can pretend it's
! persistent. The tradeoffs vs. a pickle are: 1. it's slower
! training, but faster checking, and 2. it needs less memory to run,
! but takes more space on the hard drive.
! On destruction, an instantiation of this class will write its state
! to a special key. When you instantiate a new one, it will attempt
! to read these values out of that key again, so you can pick up where
! you left off.
"""
- # XXX: Would it be even faster to remember (in a list) which keys
- # had been modified, and only recalculate those keys? No sense in
- # going over the entire word database if only 100 words are
- # affected.
-
- # XXX: Another idea: cache stuff in memory. But by then maybe we
- # should just use ZODB.
-
- def __init__(self, dbname, mode):
- classifier.Bayes.__init__(self)
- self.statekey = "saved state"
- self.wordinfo = DBDict(dbname, mode, (self.statekey,))
- self.dbmode = mode
-
- self.restore_state()
-
- def __del__(self):
- #super.__del__(self)
- self.save_state()
-
- def save_state(self):
- if self.dbmode != 'r':
- self.wordinfo[self.statekey] = (self.nham, self.nspam)
-
- def restore_state(self):
- if self.wordinfo.has_key(self.statekey):
- self.nham, self.nspam = self.wordinfo[self.statekey]
-
-
- class Hammie:
-
- """A spambayes mail filter"""
-
def __init__(self, bayes):
self.bayes = bayes
--- 15,26 ----
! class Hammie:
! """A spambayes mail filter.
! This implements the basic functionality needed to score, filter, or
! train.
"""
def __init__(self, bayes):
self.bayes = bayes
***************
*** 256,269 ****
"""
! try:
! return self._scoremsg(msg, evidence)
! except:
! print msg
! import traceback
! traceback.print_exc()
! def filter(self, msg, header=DISPHEADER, spam_cutoff=SPAM_THRESHOLD,
! ham_cutoff=HAM_THRESHOLD, debugheader=DEBUGHEADER,
! debug=DODEBUG):
"""Score (judge) a message and add a disposition header.
--- 58,66 ----
"""
! return self._scoremsg(msg, evidence)
! def filter(self, msg, header=None, spam_cutoff=None,
! ham_cutoff=None, debugheader=None,
! debug=None):
"""Score (judge) a message and add a disposition header.
***************
*** 283,286 ****
--- 80,94 ----
"""
+ if header == None:
+ header = options.hammie_header_name
+ if spam_cutoff == None:
+ spam_cutoff = options.spam_cutoff
+ if ham_cutoff == None:
+ ham_cutoff = options.ham_cutoff
+ if debugheader == None:
+ debugheader = options.hammie_debug_header_name
+ if debug == None:
+ debug = options.hammie_debug_header
+
msg = mboxutils.get_message(msg)
try:
***************
*** 323,327 ****
"""
! self.bayes.learn(tokenize(msg), is_spam, False)
def train_ham(self, msg):
--- 131,135 ----
"""
! self.bayes.learn(tokenize(msg), is_spam)
def train_ham(self, msg):
***************
*** 349,510 ****
self.train(msg, True)
! def update_probabilities(self):
! """Update probability values.
! You would want to call this after a training session. It's
! pretty slow, so if you have a lot of messages to train, wait
! until you're all done before calling this.
"""
! self.bayes.update_probabilities()
!
!
! def train(hammie, msgs, is_spam):
! """Train bayes with all messages from a mailbox."""
! mbox = mboxutils.getmbox(msgs)
! i = 0
! for msg in mbox:
! i += 1
! # XXX: Is the \r a Unixism? I seem to recall it working in DOS
! # back in the day. Maybe it's a line-printer-ism ;)
! sys.stdout.write("\r%6d" % i)
! sys.stdout.flush()
! hammie.train(msg, is_spam)
! print
!
! def score(hammie, msgs, reverse=0):
! """Score (judge) all messages from a mailbox."""
! # XXX The reporting needs work!
! mbox = mboxutils.getmbox(msgs)
! i = 0
! spams = hams = 0
! for msg in mbox:
! i += 1
! prob, clues = hammie.score(msg, True)
! if hasattr(msg, '_mh_msgno'):
! msgno = msg._mh_msgno
! else:
! msgno = i
! isspam = (prob >= SPAM_THRESHOLD)
! if isspam:
! spams += 1
! if not reverse:
! print "%6s %4.2f %1s" % (msgno, prob, isspam and "S" or "."),
! print hammie.formatclues(clues)
! else:
! hams += 1
! if reverse:
! print "%6s %4.2f %1s" % (msgno, prob, isspam and "S" or "."),
! print hammie.formatclues(clues)
! return (spams, hams)
!
! def createbayes(pck=DEFAULTDB, usedb=False, mode='r'):
! """Create a Bayes instance for the given pickle (which
! doesn't have to exist). Create a PersistentBayes if
! usedb is True."""
! if usedb:
! bayes = PersistentBayes(pck, mode)
! else:
! bayes = None
! try:
! fp = open(pck, 'rb')
! except IOError, e:
! if e.errno <> errno.ENOENT: raise
! else:
! bayes = pickle.load(fp)
! fp.close()
! if bayes is None:
! bayes = classifier.Bayes()
! return bayes
!
! def usage(code, msg=''):
! """Print usage message and sys.exit(code)."""
! if msg:
! print >> sys.stderr, msg
! print >> sys.stderr
! print >> sys.stderr, __doc__ % globals()
! sys.exit(code)
!
! def main():
! """Main program; parse options and go."""
! try:
! opts, args = getopt.getopt(sys.argv[1:], 'hdDfg:s:p:u:r')
! except getopt.error, msg:
! usage(2, msg)
!
! if not opts:
! usage(2, "No options given")
!
! pck = DEFAULTDB
! good = []
! spam = []
! unknown = []
! reverse = 0
! do_filter = False
! usedb = USEDB
! mode = 'r'
! for opt, arg in opts:
! if opt == '-h':
! usage(0)
! elif opt == '-g':
! good.append(arg)
! mode = 'c'
! elif opt == '-s':
! spam.append(arg)
! mode = 'c'
! elif opt == '-p':
! pck = arg
! elif opt == "-d":
! usedb = True
! elif opt == "-D":
! usedb = False
! elif opt == "-f":
! do_filter = True
! elif opt == '-u':
! unknown.append(arg)
! elif opt == '-r':
! reverse = 1
! if args:
! usage(2, "Positional arguments not allowed")
!
! save = False
- bayes = createbayes(pck, usedb, mode)
- h = Hammie(bayes)
! for g in good:
! print "Training ham (%s):" % g
! train(h, g, False)
! save = True
! for s in spam:
! print "Training spam (%s):" % s
! train(h, s, True)
! save = True
! if save:
! h.update_probabilities()
! if not usedb and pck:
! fp = open(pck, 'wb')
! pickle.dump(bayes, fp, 1)
! fp.close()
! if do_filter:
! msg = sys.stdin.read()
! filtered = h.filter(msg)
! sys.stdout.write(filtered)
! if unknown:
! (spams, hams) = (0, 0)
! for u in unknown:
! if len(unknown) > 1:
! print "Scoring", u
! s, g = score(h, u, reverse)
! spams += s
! hams += g
! print "Total %d spam, %d ham" % (spams, hams)
if __name__ == "__main__":
! main()
--- 157,192 ----
self.train(msg, True)
! def store(self):
! """Write out the persistent store.
! This makes sure the persistent store reflects what is currently
! in memory. You would want to do this after a write and before
! exiting.
"""
! self.bayes.store()
! def open(filename, usedb=True, mode='r'):
! """Open a file, returning a Hammie instance.
! If usedb is False, open as a pickle instead of a DBDict. mode is
! used as the flag to open DBDict objects. 'c' for read-write (create
! if needed), 'r' for read-only, 'w' for read-write.
! """
! if usedb:
! b = Persistent.DBDictClassifier(filename, mode)
! else:
! b = Persistent.PickledClassifier(filename)
! return Hammie(b)
if __name__ == "__main__":
! # Everybody's used to running hammie.py. Why mess with success? ;)
! import hammiebulk
!
! hammiebulk.main()
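A minimal sketch of the new module-level open() helper (the database
path and message file are hypothetical):

    import hammie

    h = hammie.open('/tmp/hammie.db', usedb=True, mode='c')
    spam_text = open('spam.msg').read()   # a hypothetical RFC 2822 message
    h.train_spam(spam_text)               # tokenize and learn as spam
    h.store()                             # flush the persistent store
    print h.filter(spam_text)             # message back, with the
                                          # disposition header added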
Index: hammiefilter.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/hammiefilter.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** hammiefilter.py 18 Nov 2002 18:14:04 -0000 1.2
--- hammiefilter.py 25 Nov 2002 02:29:44 -0000 1.3
***************
*** 52,95 ****
sys.exit(code)
! def jar_pickle(h):
! if not options.persistent_use_database:
! import pickle
! fp = open(options.persistent_storage_file, 'wb')
! pickle.dump(h.bayes, fp, 1)
! fp.close()
!
!
! def hammie_open(mode):
! b = hammie.createbayes(options.persistent_storage_file,
! options.persistent_use_database,
! mode)
! return hammie.Hammie(b)
! def newdb():
! h = hammie_open('n')
! jar_pickle(h)
! print "Created new database in", options.persistent_storage_file
! def filter():
! h = hammie_open('r')
! msg = sys.stdin.read()
! print h.filter(msg)
! def train_ham():
! h = hammie_open('w')
! msg = sys.stdin.read()
! h.train_ham(msg)
! h.update_probabilities()
! jar_pickle(h)
! def train_spam():
! h = hammie_open('w')
! msg = sys.stdin.read()
! h.train_spam(msg)
! h.update_probabilities()
! jar_pickle(h)
def main():
! action = filter
opts, args = getopt.getopt(sys.argv[1:], 'hngs')
for opt, arg in opts:
--- 52,91 ----
sys.exit(code)
! class HammieFilter(object):
! def __init__(self):
! options = Options.options
! options.mergefiles(['/etc/hammierc',
! os.path.expanduser('~/.hammierc')])
!
! self.dbname = options.hammiefilter_persistent_storage_file
! self.dbname = os.path.expanduser(self.dbname)
! self.usedb = options.hammiefilter_persistent_use_database
!
! def newdb(self):
! h = hammie.open(self.dbname, self.usedb, 'n')
! h.store()
! print "Created new database in", self.dbname
! def filter(self):
! h = hammie.open(self.dbname, self.usedb, 'r')
! msg = sys.stdin.read()
! print h.filter(msg)
! def train_ham(self):
! h = hammie.open(self.dbname, self.usedb, 'c')
! msg = sys.stdin.read()
! h.train_ham(msg)
! h.store()
! def train_spam(self):
! h = hammie.open(self.dbname, self.usedb, 'c')
! msg = sys.stdin.read()
! h.train_spam(msg)
! h.store()
def main():
! h = HammieFilter()
! action = h.filter
opts, args = getopt.getopt(sys.argv[1:], 'hngs')
for opt, arg in opts:
***************
*** 97,114 ****
usage(0)
elif opt == '-g':
! action = train_ham
elif opt == '-s':
! action = train_spam
elif opt == "-n":
! action = newdb
!
! # hammiefilter overrides
! config_overrides = """[Hammie]
! persistent_storage_file = %s
! persistent_use_database = True
! """ % os.path.expanduser('~/.hammiedb')
! options.mergefilelike(StringIO.StringIO(config_overrides))
! options.mergefiles(['/etc/hammierc',
! os.path.expanduser('~/.hammierc')])
action()
--- 93,101 ----
usage(0)
elif opt == '-g':
! action = h.train_ham
elif opt == '-s':
! action = h.train_spam
elif opt == "-n":
! action = h.newdb
action()
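For completeness, a minimal sketch of driving the new HammieFilter
class from Python rather than via the -n/-g/-s switches (each training
action reads one message from stdin, exactly as the command-line
actions do):

    import hammiefilter

    hf = hammiefilter.HammieFilter()   # merges /etc/hammierc, ~/.hammierc
    hf.newdb()                         # same as 'hammiefilter.py -n'
    hf.train_ham()                     # same as '-g': one msg from stdin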
Index: pop3proxy.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v
retrieving revision 1.18
retrieving revision 1.19
diff -C2 -d -r1.18 -r1.19
*** pop3proxy.py 20 Nov 2002 22:41:50 -0000 1.18
--- pop3proxy.py 25 Nov 2002 02:29:44 -0000 1.19
***************
*** 119,123 ****
import os, sys, re, operator, errno, getopt, string, cStringIO, time, bisect
import socket, asyncore, asynchat, cgi, urlparse, webbrowser
! import Bayes, tokenizer, mboxutils
from FileCorpus import FileCorpus, FileMessageFactory, GzipFileMessageFactory
from email.Iterators import typed_subpart_iterator
--- 119,123 ----
import os, sys, re, operator, errno, getopt, string, cStringIO, time, bisect
import socket, asyncore, asynchat, cgi, urlparse, webbrowser
! import Persistent, tokenizer, mboxutils
from FileCorpus import FileCorpus, FileMessageFactory, GzipFileMessageFactory
from email.Iterators import typed_subpart_iterator
***************
*** 819,822 ****
--- 819,825 ----
stateDict = state.__dict__
stateDict.update(state.bayes.__dict__)
+ # so the property() isn't as cool as we thought. -ntp
+ stateDict['nham'] = state.bayes.nham
+ stateDict['nspam'] = state.bayes.nspam
body = (self.pageSection % ('Status', self.summary % stateDict)+
self.pageSection % ('Train on proxied messages', self.review)+
***************
*** 1119,1123 ****
# This keeps the global state of the module - the command-line options,
# statistics like how many mails have been classified, the handle of the
! # log file, the Bayes and FileCorpus objects, and so on.
class State:
def __init__(self):
--- 1122,1126 ----
# This keeps the global state of the module - the command-line options,
# statistics like how many mails have been classified, the handle of the
! # log file, the Classifier and FileCorpus objects, and so on.
class State:
def __init__(self):
***************
*** 1162,1167 ****
# Load up the other settings from Option.py / bayescustomize.ini
! self.databaseFilename = options.persistent_storage_file
! self.useDB = options.persistent_use_database
self.uiPort = options.html_ui_port
self.launchUI = options.html_ui_launch_browser
--- 1165,1170 ----
# Load up the other settings from Option.py / bayescustomize.ini
! self.databaseFilename = options.pop3proxy_persistent_storage_file
! self.useDB = options.pop3proxy_persistent_use_database
self.uiPort = options.html_ui_port
self.launchUI = options.html_ui_launch_browser
***************
*** 1200,1206 ****
self.databaseFilename = '_pop3proxy_test.pickle' # Never saved
if self.useDB:
! self.bayes = Bayes.DBDictBayes(self.databaseFilename)
else:
! self.bayes = Bayes.PickledBayes(self.databaseFilename)
print "Done."
--- 1203,1209 ----
self.databaseFilename = '_pop3proxy_test.pickle' # Never saved
if self.useDB:
! self.bayes = Persistent.DBDictClassifier(self.databaseFilename)
else:
! self.bayes = Persistent.PickledClassifier(self.databaseFilename)
print "Done."
***************
*** 1227,1232 ****
# Create the Trainers.
! self.spamTrainer = Bayes.SpamTrainer(self.bayes)
! self.hamTrainer = Bayes.HamTrainer(self.bayes)
self.spamCorpus.addObserver(self.spamTrainer)
self.hamCorpus.addObserver(self.hamTrainer)
--- 1230,1235 ----
# Create the Trainers.
! self.spamTrainer = Persistent.SpamTrainer(self.bayes)
! self.hamTrainer = Persistent.HamTrainer(self.bayes)
self.spamCorpus.addObserver(self.spamTrainer)
self.hamCorpus.addObserver(self.hamTrainer)
--- Bayes.py DELETED ---
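The nham/nspam workaround added to pop3proxy above is needed because a
property lives on the class, not on the instance, so it never appears
in an instance's __dict__. A minimal sketch of the gotcha (the class
name is hypothetical):

    class Widget(object):
        def get_n(self):
            return 42
        n = property(get_n)

    w = Widget()
    print w.n          # 42
    print w.__dict__   # {} -- no 'n' here, so __dict__-based string
                       # formatting (as pop3proxy uses) never sees it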