[Spambayes-checkins]
spambayes hammie.py,1.22,1.23 hammiesrv.py,1.2,1.3 runtest.sh,1.3,1.4
Neale Pickett
npickett@users.sourceforge.net
Fri, 27 Sep 2002 12:40:27 -0700
- Previous message: [Spambayes-checkins] spambayes hammie.py,1.21,1.22
- Next message: [Spambayes-checkins]
spambayes HistToGNU.py,1.5,1.6 TestDriver.py,1.15,1.16
hammie.py,1.23,1.24 hammiesrv.py,1.3,1.4 setup.py,1.5,1.6
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv7026
Modified Files:
hammie.py hammiesrv.py runtest.sh
Log Message:
* hammie.py now has a Hammie class, which hammiesrv now uses.
hammie.py could still stand some more clean-up. Don't worry, I'm
on it :)
* runtest.sh now has a run1 target to generate the first data set (run1.txt)
Index: hammie.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/hammie.py,v
retrieving revision 1.22
retrieving revision 1.23
diff -C2 -d -r1.22 -r1.23
*** hammie.py 27 Sep 2002 18:48:05 -0000 1.22
--- hammie.py 27 Sep 2002 19:40:21 -0000 1.23
***************
*** 61,65 ****
class DBDict:
! """Database Dictionary
This wraps an anydbm to make it look even more like a dictionary.
--- 61,66 ----
class DBDict:
!
! """Database Dictionary.
This wraps an anydbm to make it look even more like a dictionary.
***************
*** 136,140 ****
class PersistentGrahamBayes(classifier.GrahamBayes):
! """A persistent GrahamBayes classifier
This is just like classifier.GrahamBayes, except that the dictionary
--- 137,142 ----
class PersistentGrahamBayes(classifier.GrahamBayes):
!
! """A persistent GrahamBayes classifier.
This is just like classifier.GrahamBayes, except that the dictionary
***************
*** 177,181 ****
! def train(bayes, msgs, is_spam):
"""Train bayes with all messages from a mailbox."""
mbox = mboxutils.getmbox(msgs)
--- 179,303 ----
! class Hammie:
!
! """A spambayes mail filter"""
!
! def __init__(self, bayes):
! self.bayes = bayes
!
! def _scoremsg(self, msg, evidence=False):
! """Score a Message.
!
! msg can be a string, a file object, or a Message object.
!
! Returns the probability the message is spam. If evidence is
! true, returns a tuple: (probability, clues), where clues is a
! list of the words which contributed to the score.
!
! """
!
! return self.bayes.spamprob(tokenize(msg), evidence)
!
! def formatclues(self, clues, sep="; "):
! """Format the clues into something readable."""
!
! return sep.join(["%r: %.2f" % (word, prob) for word, prob in clues])
!
! def score(self, msg, evidence=False):
! """Score (judge) a message.
!
! msg can be a string, a file object, or a Message object.
!
! Returns the probability the message is spam. If evidence is
! true, returns a tuple: (probability, clues), where clues is a
! list of the words which contributed to the score.
!
! """
!
! try:
! return self._scoremsg(msg, evidence)
! except:
! print msg
! import traceback
! traceback.print_exc()
!
! def filter(self, msg, header=DISPHEADER, cutoff=SPAM_THRESHOLD):
! """Score (judge) a message and add a disposition header.
!
! msg can be a string, a file object, or a Message object.
!
! Optionally, set header to the name of the header to add, and/or
! cutoff to the probability value which must be met or exceeded
! for a message to get a 'Yes' disposition.
!
! Returns the same message with a new disposition header.
!
! """
!
! if hasattr(msg, "readlines"):
! msg = email.message_from_file(msg)
! elif not hasattr(msg, "add_header"):
! msg = email.message_from_string(msg)
! prob, clues = self._scoremsg(msg, True)
! if prob < cutoff:
! disp = "No"
! else:
! disp = "Yes"
! disp += "; %.2f" % prob
! disp += "; " + self.formatclues(clues)
! msg.add_header(header, disp)
! return msg.as_string(unixfrom=(msg.get_unixfrom() is not None))
!
! def train(self, msg, is_spam):
! """Train bayes with a message.
!
! msg can be a string, a file object, or a Message object.
!
! is_spam should be 1 if the message is spam, 0 if not.
!
! Probabilities are not updated after this call is made; to do
! that, call update_probabilities().
!
! """
!
! self.bayes.learn(tokenize(msg), is_spam, False)
!
! def train_ham(self, msg):
! """Train bayes with ham.
!
! msg can be a string, a file object, or a Message object.
!
! Probabilities are not updated after this call is made; to do
! that, call update_probabilities().
!
! """
!
! self.train(msg, False)
!
! def train_spam(self, msg):
! """Train bayes with spam.
!
! msg can be a string, a file object, or a Message object.
!
! Probabilities are not updated after this call is made; to do
! that, call update_probabilities().
!
! """
!
! self.train(msg, True)
!
! def update_probabilities(self):
! """Update probability values.
!
! You would want to call this after a training session. It's
! pretty slow, so if you have a lot of messages to train, wait
! until you're all done before calling this.
!
! """
!
! self.bayes.update_probabilities()
!
!
! def train(hammie, msgs, is_spam):
"""Train bayes with all messages from a mailbox."""
mbox = mboxutils.getmbox(msgs)
***************
*** 187,211 ****
sys.stdout.write("\r%6d" % i)
sys.stdout.flush()
! bayes.learn(tokenize(msg), is_spam, False)
print
! def formatclues(clues, sep="; "):
! """Format the clues into something readable."""
! return sep.join(["%r: %.2f" % (word, prob) for word, prob in clues])
!
! def filter(bayes, input, output):
! """Filter (judge) a message"""
! msg = email.message_from_file(input)
! prob, clues = bayes.spamprob(tokenize(msg), True)
! if prob < SPAM_THRESHOLD:
! disp = "No"
! else:
! disp = "Yes"
! disp += "; %.2f" % prob
! disp += "; " + formatclues(clues)
! msg.add_header(DISPHEADER, disp)
! output.write(msg.as_string(unixfrom=(msg.get_unixfrom() is not None)))
!
! def score(bayes, msgs):
"""Score (judge) all messages from a mailbox."""
# XXX The reporting needs work!
--- 309,316 ----
sys.stdout.write("\r%6d" % i)
sys.stdout.flush()
! hammie.train(msg, is_spam)
print
! def score(hammie, msgs):
"""Score (judge) all messages from a mailbox."""
# XXX The reporting needs work!
***************
*** 215,219 ****
for msg in mbox:
i += 1
! prob, clues = bayes.spamprob(tokenize(msg), True)
isspam = prob >= SPAM_THRESHOLD
if hasattr(msg, '_mh_msgno'):
--- 320,324 ----
for msg in mbox:
i += 1
! prob, clues = hammie.score(msg, True)
isspam = prob >= SPAM_THRESHOLD
if hasattr(msg, '_mh_msgno'):
***************
*** 224,228 ****
spams += 1
print "%6s %4.2f %1s" % (msgno, prob, isspam and "S" or "."),
! print formatclues(clues)
else:
hams += 1
--- 329,333 ----
spams += 1
print "%6s %4.2f %1s" % (msgno, prob, isspam and "S" or "."),
! print hammie.formatclues(clues)
else:
hams += 1
***************
*** 292,309 ****
bayes = createbayes(pck, usedb)
! if good:
! for g in good:
! print "Training ham (%s):" % g
! train(bayes, g, False)
save = True
! if spam:
! for s in spam:
! print "Training spam (%s):" % s
! train(bayes, s, True)
save = True
if save:
! bayes.update_probabilities()
if not usedb and pck:
fp = open(pck, 'wb')
--- 397,414 ----
bayes = createbayes(pck, usedb)
+ h = Hammie(bayes)
! for g in good:
! print "Training ham (%s):" % g
! train(h, g, False)
save = True
!
! for s in spam:
! print "Training spam (%s):" % s
! train(h, s, True)
save = True
if save:
! h.update_probabilities()
if not usedb and pck:
fp = open(pck, 'wb')
***************
*** 312,316 ****
if do_filter:
! filter(bayes, sys.stdin, sys.stdout)
if unknown:
--- 417,423 ----
if do_filter:
! msg = sys.stdin.read()
! filtered = h.filter(msg)
! sys.stdout.write(filtered)
if unknown:
***************
*** 318,322 ****
if len(unknown) > 1:
print "Scoring", u
! score(bayes, u)
if __name__ == "__main__":
--- 425,429 ----
if len(unknown) > 1:
print "Scoring", u
! score(h, u)
if __name__ == "__main__":
Index: hammiesrv.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/hammiesrv.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** hammiesrv.py 23 Sep 2002 21:20:10 -0000 1.2
--- hammiesrv.py 27 Sep 2002 19:40:22 -0000 1.3
***************
*** 3,139 ****
# A server version of hammie.py
- # Server code
! import SimpleXMLRPCServer
! import email
! import hammie
! from tokenizer import tokenize
!
! # Default header to add
! DFL_HEADER = "X-Hammie-Disposition"
!
! # Default spam cutoff
! DFL_CUTOFF = 0.9
!
! class Hammie:
! def __init__(self, bayes):
! self.bayes = bayes
! def _scoremsg(self, msg, evidence=False):
! """Score an email.Message.
! Returns the probability the message is spam. If evidence is
! true, returns a tuple: (probability, clues), where clues is a
! list of the words which contributed to the score.
! """
! return self.bayes.spamprob(tokenize(msg), evidence)
! def score(self, msg, evidence=False):
! """Score (judge) a message.
! Pass in a message as a string.
! Returns the probability the message is spam. If evidence is
! true, returns a tuple: (probability, clues), where clues is a
! list of the words which contributed to the score.
"""
! return self._scoremsg(email.message_from_string(msg), evidence)
!
! def filter(self, msg, header=DFL_HEADER, cutoff=DFL_CUTOFF):
! """Score (judge) a message and add a disposition header.
!
! Pass in a message as a string. Optionally, set header to the
! name of the header to add, and/or cutoff to the probability
! value which must be met or exceeded for a message to get a 'Yes'
! disposition.
!
! Returns the same message with a new disposition header.
!
! """
! msg = email.message_from_string(msg)
! prob, clues = self._scoremsg(msg, True)
! if prob < cutoff:
! disp = "No"
else:
! disp = "Yes"
! disp += "; %.2f" % prob
! disp += "; " + hammie.formatclues(clues)
! msg.add_header(header, disp)
! return msg.as_string(unixfrom=(msg.get_unixfrom() is not None))
!
! def train(self, msg, is_spam):
! """Train bayes with a message.
!
! msg should be the message as a string, and is_spam should be 1
! if the message is spam, 0 if not.
!
! Probabilities are not updated after this call is made; to do
! that, call update_probabilities().
!
! """
!
! self.bayes.learn(tokenize(msg), is_spam, False)
!
! def train_ham(self, msg):
! """Train bayes with ham.
!
! msg should be the message as a string.
!
! Probabilities are not updated after this call is made; to do
! that, call update_probabilities().
!
! """
!
! self.train(msg, False)
!
! def train_spam(self, msg):
! """Train bayes with spam.
!
! msg should be the message as a string.
!
! Probabilities are not updated after this call is made; to do
! that, call update_probabilities().
!
! """
! self.train(msg, True)
! def update_probabilities(self):
! """Update probability values.
- You would want to call this after a training session. It's
- pretty slow, so if you have a lot of messages to train, wait
- until you're all done before calling this.
! """
! self.bayes.update_probabilites()
! def main():
! usedb = True
! pck = "/home/neale/lib/hammie.db"
! if usedb:
! bayes = hammie.PersistentGrahamBayes(pck)
! else:
! bayes = None
! try:
! fp = open(pck, 'rb')
! except IOError, e:
! if e.errno <> errno.ENOENT: raise
! else:
! bayes = pickle.load(fp)
! fp.close()
! if bayes is None:
! import classifier
! bayes = classifier.GrahamBayes()
! server = SimpleXMLRPCServer.SimpleXMLRPCServer(("localhost", 7732))
! server.register_instance(Hammie(bayes))
server.serve_forever()
--- 3,121 ----
# A server version of hammie.py
! """Usage: %(program)s [options] IP:PORT
! Where:
! -h
! show usage and exit
! -p FILE
! use file as the persistent store. loads data from this file if it
! exists, and saves data to this file at the end. Default: %(DEFAULTDB)s
! -d
! use the DBM store instead of cPickle. The file is larger and
! creating it is slower, but checking against it is much faster,
! especially for large word databases.
! IP
! IP address to bind (use 0.0.0.0 to listen on all IPs of this machine)
! PORT
! Port number to listen to.
! """
! import SimpleXMLRPCServer
! import getopt
! import sys
! import traceback
! import xmlrpclib
! import hammie
! program = sys.argv[0] # For usage(); referenced by docstring above
! # Default DB path
! DEFAULTDB = hammie.DEFAULTDB
! class HammieHandler(SimpleXMLRPCServer.SimpleXMLRPCRequestHandler):
! def do_POST(self):
! """Handles the HTTP POST request.
! Attempts to interpret all HTTP POST requests as XML-RPC calls,
! which are forwarded to the _dispatch method for handling.
+ This one also prints out tracebacks, to help me debug :)
"""
! try:
! # get arguments
! data = self.rfile.read(int(self.headers["content-length"]))
! params, method = xmlrpclib.loads(data)
! # generate response
! try:
! response = self._dispatch(method, params)
! # wrap response in a singleton tuple
! response = (response,)
! except:
! # report exception back to server
! response = xmlrpclib.dumps(
! xmlrpclib.Fault(1, "%s:%s" % (sys.exc_type, sys.exc_value))
! )
! else:
! response = xmlrpclib.dumps(response, methodresponse=1)
! except:
! # internal error, report as HTTP server error
! traceback.print_exc()
! print `data`
! self.send_response(500)
! self.end_headers()
else:
! # got a valid XML RPC response
! self.send_response(200)
! self.send_header("Content-type", "text/xml")
! self.send_header("Content-length", str(len(response)))
! self.end_headers()
! self.wfile.write(response)
! # shut down the connection
! self.wfile.flush()
! self.connection.shutdown(1)
!
! def usage(code, msg=''):
! """Print usage message and sys.exit(code)."""
! if msg:
! print >> sys.stderr, msg
! print >> sys.stderr
! print >> sys.stderr, __doc__ % globals()
! sys.exit(code)
! def main():
! """Main program; parse options and go."""
! try:
! opts, args = getopt.getopt(sys.argv[1:], 'hdp:')
! except getopt.error, msg:
! usage(2, msg)
! pck = DEFAULTDB
! usedb = False
! for opt, arg in opts:
! if opt == '-h':
! usage(0)
! elif opt == '-p':
! pck = arg
! elif opt == "-d":
! usedb = True
! if len(args) != 1:
! usage(2, "IP:PORT not specified")
! ip, port = args[0].split(":")
! port = int(port)
!
! bayes = hammie.createbayes(pck, usedb)
! h = hammie.Hammie(bayes)
! server = SimpleXMLRPCServer.SimpleXMLRPCServer((ip, port), HammieHandler)
! server.register_instance(h)
server.serve_forever()
Index: runtest.sh
===================================================================
RCS file: /cvsroot/spambayes/spambayes/runtest.sh,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** runtest.sh 19 Sep 2002 00:17:41 -0000 1.3
--- runtest.sh 27 Sep 2002 19:40:22 -0000 1.4
***************
*** 40,43 ****
--- 40,46 ----
case "$TEST" in
+ run1)
+ python timcv.py -n $SETS > run1.txt
+ ;;
run2|useold)
python timcv.py -n $SETS > run2.txt
- Previous message: [Spambayes-checkins] spambayes hammie.py,1.21,1.22
- Next message: [Spambayes-checkins]
spambayes HistToGNU.py,1.5,1.6 TestDriver.py,1.15,1.16
hammie.py,1.23,1.24 hammiesrv.py,1.3,1.4 setup.py,1.5,1.6
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]