[Spambayes-checkins] spambayes hammiebulk.py,NONE,1.1.2.1
classifier.py,1.53.2.4,1.53.2.5 hammie.py,1.40.2.2,1.40.2.3
Neale Pickett
npickett@users.sourceforge.net
Thu Nov 21 23:00:01 2002
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv1861
Modified Files:
Tag: hammie-playground
classifier.py hammie.py
Added Files:
Tag: hammie-playground
hammiebulk.py
Log Message:
* Bayes.py: removed a debug print
* hammie.py: removed some debug code I put in for hammiesrv
* hammiebulk.py: this does what hammie.py used to do.
--- NEW FILE: hammiebulk.py ---
#! /usr/bin/env python
"""Usage: %(program)s [options]
Where:
-h
show usage and exit
-g PATH
mbox or directory of known good messages (non-spam) to train on.
Can be specified more than once, or use - for stdin.
-s PATH
mbox or directory of known spam messages to train on.
Can be specified more than once, or use - for stdin.
-u PATH
mbox of unknown messages. A ham/spam decision is reported for each.
Can be specified more than once.
-r
reverse the meaning of the check (report ham instead of spam).
Only meaningful with the -u option.
-p FILE
use file as the persistent store. loads data from this file if it
exists, and saves data to this file at the end.
Default: %(DEFAULTDB)s
-d
use the DBM store instead of cPickle. The file is larger and
creating it is slower, but checking against it is much faster,
especially for large word databases. Default: %(USEDB)s
-D
the reverse of -d: use the cPickle instead of DBM
-f
run as a filter: read a single message from stdin, add a new
header, and write it to stdout. If you want to run from
procmail, this is your option.
"""
import sys
import os
import types
import getopt
import mailbox
import glob
import email
import errno
import anydbm
import cPickle as pickle
from Options import options
import mboxutils
import classifier
import hammie
# Script name; substituted for %(program)s when usage() formats the
# module docstring with globals().
program = sys.argv[0]
# Default path of the persistent store (overridden by -p), with any
# leading "~" expanded.  Substituted for %(DEFAULTDB)s in the usage text.
DEFAULTDB = os.path.expanduser(options.hammiefilter_persistent_storage_file)
# Default for the -d / -D switches: use a database if true, a pickle if
# false.  Substituted for %(USEDB)s in the usage text.
USEDB = options.hammiefilter_persistent_use_database
# score() counts a message as spam when its probability is >= this cutoff.
SPAM_THRESHOLD = options.spam_cutoff
# Ham cutoff taken from the options; not referenced by the code in this file.
HAM_THRESHOLD = options.ham_cutoff
def train(h, msgs, is_spam):
    """Feed every message of a mailbox to the trainer.

    h       -- object whose train(msg, is_spam) method does the learning.
    msgs    -- mailbox specification handed to mboxutils.getmbox().
    is_spam -- passed through unchanged to h.train() for each message.

    A running message count is written to stdout while training.
    """
    out = sys.stdout
    count = 0
    for msg in mboxutils.getmbox(msgs):
        count += 1
        # XXX: Is the \r a Unixism?  It is used here to redraw the
        # counter in place on the same output line.
        out.write("\r%6d" % count)
        out.flush()
        h.train(msg, is_spam)
    # Terminate the progress line.
    out.write("\n")
def score(h, msgs, reverse=0):
"""Score (judge) all messages from a mailbox."""
# XXX The reporting needs work!
mbox = mboxutils.getmbox(msgs)
i = 0
spams = hams = 0
for msg in mbox:
i += 1
prob, clues = h.score(msg, True)
if hasattr(msg, '_mh_msgno'):
msgno = msg._mh_msgno
else:
msgno = i
isspam = (prob >= SPAM_THRESHOLD)
if isspam:
spams += 1
if not reverse:
print "%6s %4.2f %1s" % (msgno, prob, isspam and "S" or "."),
print h.formatclues(clues)
else:
hams += 1
if reverse:
print "%6s %4.2f %1s" % (msgno, prob, isspam and "S" or "."),
print h.formatclues(clues)
return (spams, hams)
def createbayes(pck=DEFAULTDB, usedb=False, mode='r'):
"""Create a Bayes instance for the given pickle (which
doesn't have to exist). Create a PersistentBayes if
usedb is True."""
if usedb:
bayes = PersistentBayes(pck, mode)
else:
bayes = None
try:
fp = open(pck, 'rb')
except IOError, e:
if e.errno <> errno.ENOENT: raise
else:
bayes = pickle.load(fp)
fp.close()
if bayes is None:
bayes = classifier.Bayes()
return bayes
def usage(code, msg=''):
    """Write the usage text to stderr and exit with status `code`.

    If msg is non-empty it is written first, followed by a blank line.
    The usage text is the module docstring with its %(name)s fields
    filled in from the module globals.
    """
    err = sys.stderr
    if msg:
        # msg may be an exception object, so format it rather than
        # concatenating.
        err.write("%s\n\n" % (msg,))
    err.write(__doc__ % globals())
    err.write("\n")
    sys.exit(code)
def main():
"""Main program; parse options and go."""
try:
opts, args = getopt.getopt(sys.argv[1:], 'hdDfg:s:p:u:r')
except getopt.error, msg:
usage(2, msg)
if not opts:
usage(2, "No options given")
pck = DEFAULTDB
good = []
spam = []
unknown = []
reverse = 0
do_filter = False
usedb = USEDB
mode = 'r'
for opt, arg in opts:
if opt == '-h':
usage(0)
elif opt == '-g':
good.append(arg)
mode = 'c'
elif opt == '-s':
spam.append(arg)
mode = 'c'
elif opt == '-p':
pck = arg
elif opt == "-d":
usedb = True
elif opt == "-D":
usedb = False
elif opt == "-f":
do_filter = True
elif opt == '-u':
unknown.append(arg)
elif opt == '-r':
reverse = 1
if args:
usage(2, "Positional arguments not allowed")
save = False
h = hammie.open(pck, usedb, mode)
for g in good:
print "Training ham (%s):" % g
train(h, g, False)
save = True
for s in spam:
print "Training spam (%s):" % s
train(h, s, True)
save = True
if save:
h.store()
if do_filter:
msg = sys.stdin.read()
filtered = h.filter(msg)
sys.stdout.write(filtered)
if unknown:
(spams, hams) = (0, 0)
for u in unknown:
if len(unknown) > 1:
print "Scoring", u
s, g = score(h, u, reverse)
spams += s
hams += g
print "Total %d spam, %d ham" % (spams, hams)
# Run main() only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()
Index: classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/classifier.py,v
retrieving revision 1.53.2.4
retrieving revision 1.53.2.5
diff -C2 -d -r1.53.2.4 -r1.53.2.5
*** classifier.py 21 Nov 2002 06:03:24 -0000 1.53.2.4
--- classifier.py 21 Nov 2002 22:59:55 -0000 1.53.2.5
***************
*** 1,2 ****
--- 1,3 ----
+ #! /usr/bin/env python
# An implementation of a Bayes-like spam classifier.
#
***************
*** 72,76 ****
def incr_rev(self):
- print "revision going up...", self.revision
self.revision += 1
--- 73,76 ----
***************
*** 135,139 ****
S = options.unknown_word_strength
StimesX = S * options.unknown_word_prob
!
assert self.hamcount <= nham
hamratio = self.hamcount / nham
--- 135,139 ----
S = options.unknown_word_strength
StimesX = S * options.unknown_word_prob
!
assert self.hamcount <= nham
hamratio = self.hamcount / nham
Index: hammie.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/hammie.py,v
retrieving revision 1.40.2.2
retrieving revision 1.40.2.3
diff -C2 -d -r1.40.2.2 -r1.40.2.3
*** hammie.py 21 Nov 2002 04:27:27 -0000 1.40.2.2
--- hammie.py 21 Nov 2002 22:59:56 -0000 1.40.2.3
***************
*** 58,67 ****
"""
! try:
! return self._scoremsg(msg, evidence)
! except:
! print msg
! import traceback
! traceback.print_exc()
def filter(self, msg, header=None, spam_cutoff=None,
--- 58,62 ----
"""
! return self._scoremsg(msg, evidence)
def filter(self, msg, header=None, spam_cutoff=None,
More information about the Spambayes-checkins
mailing list