[Spambayes-checkins] spambayes hammie.py,1.22,1.23 hammiesrv.py,1.2,1.3 runtest.sh,1.3,1.4

Fri, 27 Sep 2002 12:40:27 -0700

Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv7026

Modified Files:
	hammie.py hammiesrv.py runtest.sh 
Log Message:
* hammie.py now has a Hammie class, which hammiesrv now uses.
  hammie.py could still stand some more clean-up.  Don't worry, I'm
  on it :)
* runtest.sh now has a run1 target to generate the first data set (run1.txt)

Index: hammie.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/hammie.py,v
retrieving revision 1.22
retrieving revision 1.23
diff -C2 -d -r1.22 -r1.23
*** hammie.py	27 Sep 2002 18:48:05 -0000	1.22
--- hammie.py	27 Sep 2002 19:40:21 -0000	1.23
***************
*** 61,65 ****

  class DBDict:
!     """Database Dictionary

      This wraps an anydbm to make it look even more like a dictionary.
--- 61,66 ----

  class DBDict:
! 
!     """Database Dictionary.

      This wraps an anydbm to make it look even more like a dictionary.
***************
*** 136,140 ****

  class PersistentGrahamBayes(classifier.GrahamBayes):
!     """A persistent GrahamBayes classifier

      This is just like classifier.GrahamBayes, except that the dictionary
--- 137,142 ----

  class PersistentGrahamBayes(classifier.GrahamBayes):
! 
!     """A persistent GrahamBayes classifier.

      This is just like classifier.GrahamBayes, except that the dictionary
***************
*** 177,181 ****

! def train(bayes, msgs, is_spam):
      """Train bayes with all messages from a mailbox."""
      mbox = mboxutils.getmbox(msgs)
--- 179,303 ----

! class Hammie:
! 
!     """A spambayes mail filter"""
!     
!     def __init__(self, bayes):
!         self.bayes = bayes
! 
!     def _scoremsg(self, msg, evidence=False):
!         """Score a Message.
! 
!         msg can be a string, a file object, or a Message object.
! 
!         Returns the probability the message is spam.  If evidence is
!         true, returns a tuple: (probability, clues), where clues is a
!         list of the words which contributed to the score.
! 
!         """
! 
!         return self.bayes.spamprob(tokenize(msg), evidence)
!         
!     def formatclues(self, clues, sep="; "):
!         """Format the clues into something readable."""
! 
!         return sep.join(["%r: %.2f" % (word, prob) for word, prob in clues])
! 
!     def score(self, msg, evidence=False):
!         """Score (judge) a message.
! 
!         msg can be a string, a file object, or a Message object.
! 
!         Returns the probability the message is spam.  If evidence is
!         true, returns a tuple: (probability, clues), where clues is a
!         list of the words which contributed to the score.
! 
!         """
! 
!         try:
!             return self._scoremsg(msg, evidence)
!         except:
!             print msg
!             import traceback
!             traceback.print_exc()
! 
!     def filter(self, msg, header=DISPHEADER, cutoff=SPAM_THRESHOLD):
!         """Score (judge) a message and add a disposition header.
! 
!         msg can be a string, a file object, or a Message object.
! 
!         Optionally, set header to the name of the header to add, and/or
!         cutoff to the probability value which must be met or exceeded
!         for a message to get a 'Yes' disposition.
!         
!         Returns the same message with a new disposition header.
! 
!         """
! 
!         if hasattr(msg, "readlines"):
!             msg = email.message_from_file(msg)
!         elif not hasattr(msg, "add_header"):
!             msg = email.message_from_string(msg)
!         prob, clues = self._scoremsg(msg, True)
!         if prob < cutoff:
!             disp = "No"
!         else:
!             disp = "Yes"
!         disp += "; %.2f" % prob
!         disp += "; " + self.formatclues(clues)
!         msg.add_header(header, disp)
!         return msg.as_string(unixfrom=(msg.get_unixfrom() is not None))
! 
!     def train(self, msg, is_spam):
!         """Train bayes with a message.
! 
!         msg can be a string, a file object, or a Message object.
! 
!         is_spam should be 1 if the message is spam, 0 if not.
! 
!         Probabilities are not updated after this call is made; to do
!         that, call update_probabilities().
!         
!         """
!         
!         self.bayes.learn(tokenize(msg), is_spam, False)
! 
!     def train_ham(self, msg):
!         """Train bayes with ham.
! 
!         msg can be a string, a file object, or a Message object.
! 
!         Probabilities are not updated after this call is made; to do
!         that, call update_probabilities().
! 
!         """
! 
!         self.train(msg, False)
! 
!     def train_spam(self, msg):
!         """Train bayes with spam.
! 
!         msg can be a string, a file object, or a Message object.
! 
!         Probabilities are not updated after this call is made; to do
!         that, call update_probabilities().
! 
!         """
! 
!         self.train(msg, True)
! 
!     def update_probabilities(self):
!         """Update probability values.
! 
!         You would want to call this after a training session.  It's
!         pretty slow, so if you have a lot of messages to train, wait
!         until you're all done before calling this.
! 
!         """
!         
!         self.bayes.update_probabilities()
!     
! 
! def train(hammie, msgs, is_spam):
      """Train bayes with all messages from a mailbox."""
      mbox = mboxutils.getmbox(msgs)
***************
*** 187,211 ****
          sys.stdout.write("\r%6d" % i)
          sys.stdout.flush()
!         bayes.learn(tokenize(msg), is_spam, False)
      print

! def formatclues(clues, sep="; "):
!     """Format the clues into something readable."""
!     return sep.join(["%r: %.2f" % (word, prob) for word, prob in clues])
! 
! def filter(bayes, input, output):
!     """Filter (judge) a message"""
!     msg = email.message_from_file(input)
!     prob, clues = bayes.spamprob(tokenize(msg), True)
!     if prob < SPAM_THRESHOLD:
!         disp = "No"
!     else:
!         disp = "Yes"
!     disp += "; %.2f" % prob
!     disp += "; " + formatclues(clues)
!     msg.add_header(DISPHEADER, disp)
!     output.write(msg.as_string(unixfrom=(msg.get_unixfrom() is not None)))
! 
! def score(bayes, msgs):
      """Score (judge) all messages from a mailbox."""
      # XXX The reporting needs work!
--- 309,316 ----
          sys.stdout.write("\r%6d" % i)
          sys.stdout.flush()
!         hammie.train(msg, is_spam)
      print

! def score(hammie, msgs):
      """Score (judge) all messages from a mailbox."""
      # XXX The reporting needs work!
***************
*** 215,219 ****
      for msg in mbox:
          i += 1
!         prob, clues = bayes.spamprob(tokenize(msg), True)
          isspam = prob >= SPAM_THRESHOLD
          if hasattr(msg, '_mh_msgno'):
--- 320,324 ----
      for msg in mbox:
          i += 1
!         prob, clues = hammie.score(msg, True)
          isspam = prob >= SPAM_THRESHOLD
          if hasattr(msg, '_mh_msgno'):
***************
*** 224,228 ****
              spams += 1
              print "%6s %4.2f %1s" % (msgno, prob, isspam and "S" or "."),
!             print formatclues(clues)
          else:
              hams += 1
--- 329,333 ----
              spams += 1
              print "%6s %4.2f %1s" % (msgno, prob, isspam and "S" or "."),
!             print hammie.formatclues(clues)
          else:
              hams += 1
***************
*** 292,309 ****

      bayes = createbayes(pck, usedb)

!     if good:
!         for g in good:
!             print "Training ham (%s):" % g
!             train(bayes, g, False)
          save = True
!     if spam:
!         for s in spam:
!             print "Training spam (%s):" % s
!             train(bayes, s, True)
          save = True

      if save:
!         bayes.update_probabilities()
          if not usedb and pck:
              fp = open(pck, 'wb')
--- 397,414 ----

      bayes = createbayes(pck, usedb)
+     h = Hammie(bayes)

!     for g in good:
!         print "Training ham (%s):" % g
!         train(h, g, False)
          save = True
! 
!     for s in spam:
!         print "Training spam (%s):" % s
!         train(h, s, True)
          save = True

      if save:
!         h.update_probabilities()
          if not usedb and pck:
              fp = open(pck, 'wb')
***************
*** 312,316 ****

      if do_filter:
!         filter(bayes, sys.stdin, sys.stdout)

      if unknown:
--- 417,423 ----

      if do_filter:
!         msg = sys.stdin.read()
!         filtered = h.filter(msg)
!         sys.stdout.write(filtered)

      if unknown:
***************
*** 318,322 ****
              if len(unknown) > 1:
                  print "Scoring", u
!             score(bayes, u)

  if __name__ == "__main__":
--- 425,429 ----
              if len(unknown) > 1:
                  print "Scoring", u
!             score(h, u)

  if __name__ == "__main__":

Index: hammiesrv.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/hammiesrv.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** hammiesrv.py	23 Sep 2002 21:20:10 -0000	1.2
--- hammiesrv.py	27 Sep 2002 19:40:22 -0000	1.3
***************
*** 3,139 ****
  # A server version of hammie.py

- # Server code

! import SimpleXMLRPCServer
! import email
! import hammie
! from tokenizer import tokenize
! 
! # Default header to add
! DFL_HEADER = "X-Hammie-Disposition"
! 
! # Default spam cutoff
! DFL_CUTOFF = 0.9
! 
! class Hammie:
!     def __init__(self, bayes):
!         self.bayes = bayes

!     def _scoremsg(self, msg, evidence=False):
!         """Score an email.Message.

!         Returns the probability the message is spam.  If evidence is
!         true, returns a tuple: (probability, clues), where clues is a
!         list of the words which contributed to the score.

!         """

!         return self.bayes.spamprob(tokenize(msg), evidence)

!     def score(self, msg, evidence=False):
!         """Score (judge) a message.

!         Pass in a message as a string.

!         Returns the probability the message is spam.  If evidence is
!         true, returns a tuple: (probability, clues), where clues is a
!         list of the words which contributed to the score.

          """

!         return self._scoremsg(email.message_from_string(msg), evidence)
! 
!     def filter(self, msg, header=DFL_HEADER, cutoff=DFL_CUTOFF):
!         """Score (judge) a message and add a disposition header.
! 
!         Pass in a message as a string.  Optionally, set header to the
!         name of the header to add, and/or cutoff to the probability
!         value which must be met or exceeded for a message to get a 'Yes'
!         disposition.
! 
!         Returns the same message with a new disposition header.
! 
!         """

!         msg = email.message_from_string(msg)
!         prob, clues = self._scoremsg(msg, True)
!         if prob < cutoff:
!             disp = "No"
          else:
!             disp = "Yes"
!         disp += "; %.2f" % prob
!         disp += "; " + hammie.formatclues(clues)
!         msg.add_header(header, disp)
!         return msg.as_string(unixfrom=(msg.get_unixfrom() is not None))
! 
!     def train(self, msg, is_spam):
!         """Train bayes with a message.
! 
!         msg should be the message as a string, and is_spam should be 1
!         if the message is spam, 0 if not.
! 
!         Probabilities are not updated after this call is made; to do
!         that, call update_probabilities().
! 
!         """
! 
!         self.bayes.learn(tokenize(msg), is_spam, False)
! 
!     def train_ham(self, msg):
!         """Train bayes with ham.
! 
!         msg should be the message as a string.
! 
!         Probabilities are not updated after this call is made; to do
!         that, call update_probabilities().
! 
!         """
! 
!         self.train(msg, False)
! 
!     def train_spam(self, msg):
!         """Train bayes with spam.
! 
!         msg should be the message as a string.
! 
!         Probabilities are not updated after this call is made; to do
!         that, call update_probabilities().
! 
!         """

!         self.train(msg, True)

!     def update_probabilities(self):
!         """Update probability values.

-         You would want to call this after a training session.  It's
-         pretty slow, so if you have a lot of messages to train, wait
-         until you're all done before calling this.

!         """

!         self.bayes.update_probabilites()

! def main():
!     usedb = True
!     pck = "/home/neale/lib/hammie.db"

!     if usedb:
!         bayes = hammie.PersistentGrahamBayes(pck)
!     else:
!         bayes = None
!         try:
!             fp = open(pck, 'rb')
!         except IOError, e:
!             if e.errno <> errno.ENOENT: raise
!         else:
!             bayes = pickle.load(fp)
!             fp.close()
!         if bayes is None:
!             import classifier
!             bayes = classifier.GrahamBayes()

!     server = SimpleXMLRPCServer.SimpleXMLRPCServer(("localhost", 7732))
!     server.register_instance(Hammie(bayes))
      server.serve_forever()

--- 3,121 ----
  # A server version of hammie.py

! """Usage: %(program)s [options] IP:PORT

! Where:
!     -h
!         show usage and exit
!     -p FILE
!         use file as the persistent store.  loads data from this file if it
!         exists, and saves data to this file at the end.  Default: %(DEFAULTDB)s
!     -d
!         use the DBM store instead of cPickle.  The file is larger and
!         creating it is slower, but checking against it is much faster,
!         especially for large word databases.

!     IP
!         IP address to bind (use 0.0.0.0 to listen on all IPs of this machine)
!     PORT
!         Port number to listen to.
! """

! import SimpleXMLRPCServer
! import getopt
! import sys
! import traceback
! import xmlrpclib
! import hammie

! program = sys.argv[0] # For usage(); referenced by docstring above

! # Default DB path
! DEFAULTDB = hammie.DEFAULTDB

! class HammieHandler(SimpleXMLRPCServer.SimpleXMLRPCRequestHandler):
!     def do_POST(self):
!         """Handles the HTTP POST request.

!         Attempts to interpret all HTTP POST requests as XML-RPC calls,
!         which are forwarded to the _dispatch method for handling.

+         This one also prints out tracebacks, to help me debug :)
          """

!         try:
!             # get arguments
!             data = self.rfile.read(int(self.headers["content-length"]))
!             params, method = xmlrpclib.loads(data)

!             # generate response
!             try:
!                 response = self._dispatch(method, params)
!                 # wrap response in a singleton tuple
!                 response = (response,)
!             except:
!                 # report exception back to server
!                 response = xmlrpclib.dumps(
!                     xmlrpclib.Fault(1, "%s:%s" % (sys.exc_type, sys.exc_value))
!                     )
!             else:
!                 response = xmlrpclib.dumps(response, methodresponse=1)
!         except:
!             # internal error, report as HTTP server error
!             traceback.print_exc()
!             print `data`
!             self.send_response(500)
!             self.end_headers()
          else:
!             # got a valid XML RPC response
!             self.send_response(200)
!             self.send_header("Content-type", "text/xml")
!             self.send_header("Content-length", str(len(response)))
!             self.end_headers()
!             self.wfile.write(response)

!             # shut down the connection
!             self.wfile.flush()
!             self.connection.shutdown(1)
!             

! def usage(code, msg=''):
!     """Print usage message and sys.exit(code)."""
!     if msg:
!         print >> sys.stderr, msg
!         print >> sys.stderr
!     print >> sys.stderr, __doc__ % globals()
!     sys.exit(code)

! def main():
!     """Main program; parse options and go."""
!     try:
!         opts, args = getopt.getopt(sys.argv[1:], 'hdp:')
!     except getopt.error, msg:
!         usage(2, msg)

!     pck = DEFAULTDB
!     usedb = False
!     for opt, arg in opts:
!         if opt == '-h':
!             usage(0)
!         elif opt == '-p':
!             pck = arg
!         elif opt == "-d":
!             usedb = True

!     if len(args) != 1:
!         usage(2, "IP:PORT not specified")

!     ip, port = args[0].split(":")
!     port = int(port)
!     
!     bayes = hammie.createbayes(pck, usedb)
!     h = hammie.Hammie(bayes)

!     server = SimpleXMLRPCServer.SimpleXMLRPCServer((ip, port), HammieHandler)
!     server.register_instance(h)
      server.serve_forever()

Index: runtest.sh
===================================================================
RCS file: /cvsroot/spambayes/spambayes/runtest.sh,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** runtest.sh	19 Sep 2002 00:17:41 -0000	1.3
--- runtest.sh	27 Sep 2002 19:40:22 -0000	1.4
***************
*** 40,43 ****
--- 40,46 ----

  case "$TEST" in
+     run1)
+ 	python timcv.py -n $SETS > run1.txt
+ 	;;
      run2|useold)
  	python timcv.py -n $SETS > run2.txt