[Python-checkins] python/nondist/sandbox/spambayes GBayes.py,1.2,1.3

Mon, 19 Aug 2002 16:32:19 -0700

Update of /cvsroot/python/python/nondist/sandbox/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv2462

Modified Files:
	GBayes.py 
Log Message:
Lots of hacks great and small to the main() program, but I didn't
touch the guts of the algorithm.

Added a module docstring/usage message.

Added a bunch of switches to train the system on an mbox of known good
and known spam messages (using PortableUnixMailbox only for now).
Uses the email package but does no decoding of message bodies.  Also
allows you to specify a file for pickling the training data, and to
set a threshold above which messages get an X-Bayes-Score header.
Also outputs messages (marked and unmarked) to an output file for
retraining.

Print some statistics at the end.

Index: GBayes.py
===================================================================
RCS file: /cvsroot/python/python/nondist/sandbox/spambayes/GBayes.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** GBayes.py	19 Aug 2002 19:43:36 -0000	1.2
--- GBayes.py	19 Aug 2002 23:32:17 -0000	1.3
***************
*** 1,4 ****
--- 1,38 ----
+ """Usage: %(program)s [options]
+ 
+ Where:
+     -h
+         show usage and exit
+     -g mboxfile
+         mbox of known good messages (non-spam)
+     -s mboxfile
+         mbox of known spam messages
+     -u mboxfile
+         mbox of unknown messages
+     -p file
+         use file as the persistent pickle.  loads data from this file if it
+         exists, and saves data to this file at the end.  omit for one shot.
+     -c num
+         train the system with just `num' number of messages from both the
+         known spam and known good files.  the system works best with
+         approximately the same number of messages from both collections.
+     -m threshold
+         mark messages with a threshold above float `threshold' with a header
+         such as "X-Bayes-Score: score".  use the -o option to output the
+         marked folder.
+     -o file
+         with -m, output all messages, with marks, to file
+ """
+ 
+ import sys
+ import getopt
  import time as _time
  from heapq import heapreplace as _heapreplace
+ import cPickle as pickle
+ import mailbox
+ import email
+ import errno
+ 
+ program = sys.argv[0]

  # This is an implementation of the Bayes-like spam classifier sketched at
***************
*** 473,484 ****
  -Barry

! """

! b = GrahamBayes()
! b.learn(tokenize(spam1), True)
! b.learn(tokenize(spam2), True)
! b.learn(tokenize(good1), False)
! b.learn(tokenize(good2), False)

! print "P(spam3 is spam) =", b.spamprob(tokenize(spam3))
! print "P(good3 is spam) =", b.spamprob(tokenize(good3))
--- 507,668 ----
  -Barry

! """ #'

! def main1():
!     b = GrahamBayes()
!     b.learn(tokenize(spam1), True)
!     b.learn(tokenize(spam2), True)
!     b.learn(tokenize(good1), False)
!     b.learn(tokenize(good2), False)

!     print "P(spam3 is spam) =", b.spamprob(tokenize(spam3))
!     print "P(good3 is spam) =", b.spamprob(tokenize(good3))
! 
! 
! def usage(code, msg=''):
!     print >> sys.stderr, __doc__ % globals()
!     if msg:
!         print >> sys.stderr, msg
!     sys.exit(code)
! 
! 
! def main2():
!     try:
!         opts, args = getopt.getopt(sys.argv[1:], 'hg:s:u:p:c:m:o:')
!     except getopt.error, msg:
!         usage(1, msg)
! 
!     threshold = count = good = spam = unknown = pck = mark = output = None
!     for opt, arg in opts:
!         if opt == '-h':
!             usage(0)
!         elif opt == '-g':
!             good = arg
!         elif opt == '-s':
!             spam = arg
!         elif opt == '-u':
!             unknown = arg
!         elif opt == '-p':
!             pck = arg
!         elif opt == '-c':
!             count = int(arg)
!         elif opt == '-m':
!             threshold = float(arg)
!         elif opt == '-o':
!             output = arg
! 
!     if args:
!         usage(1)
! 
!     save = False
!     bayes = None
!     if pck:
!         try:
!             fp = open(pck, 'rb')
!         except IOError, e:
!             if e.errno <> errno.ENOENT: raise
!         else:
!             bayes = pickle.load(fp)
!             fp.close()
!     if bayes is None:
!         bayes = GrahamBayes()
! 
!     def _factory(fp):
!         # Guido sez: IMO, for body encoding, we should do the same level of
!         # decoding that a typical mail client does, so that we "see" the same
!         # thing an end user sees.  This means base64, but not uuencode
!         # (because most mailers don't unpack that automatically).  We may save
!         # time base64-decoding by not bothering with attachments, since those
!         # aren't shown by default.
!         try:
!             return email.message_from_file(fp)
!         except email.Errors.MessageParseError:
!             return ''
! 
!     # Assume Unix mailbox format
!     if good:
!         print 'training with the known good messages'
!         fp = open(good)
!         mbox = mailbox.PortableUnixMailbox(fp, _factory)
!         i = 0
!         for msg in mbox:
!             # For now we'll take an extremely naive view of messages; we won't
!             # decode them at all, just to see what happens.  Later, we might
!             # want to uu- or base64-decode, or do other pre-processing on the
!             # message.
!             bayes.learn(tokenize(str(msg)), False, False)
!             i += 1
!             if count is not None and i > count:
!                 break
!         fp.close()
!         save = True
!         print 'done training', i, 'messages'
! 
!     if spam:
!         print 'training with the known spam messages'
!         fp = open(spam)
!         mbox = mailbox.PortableUnixMailbox(fp, _factory)
!         i = 0
!         for msg in mbox:
!             # For now we'll take an extremely naive view of messages; we won't
!             # decode them at all, just to see what happens.  Later, we might
!             # want to uu- or base64-decode, or do other pre-processing on the
!             # message.
!             bayes.learn(tokenize(str(msg)), True, False)
!             i += 1
!             if count is not None and i > count:
!                 break
!         fp.close()
!         save = True
!         print 'done training', i, 'messages'
! 
!     bayes.update_probabilities()
! 
!     if pck and save:
!         fp = open(pck, 'wb')
!         pickle.dump(bayes, fp, 1)
!         fp.close()
! 
!     if unknown:
!         if output:
!             output = open(output, 'w')
!         print 'classifying the unknown'
!         fp = open(unknown)
!         mbox = mailbox.PortableUnixMailbox(fp, email.message_from_file)
!         pos = 0
!         allcnt = 0
!         spamcnt = goodcnt = 0
!         for msg in mbox:
!             msgid = msg.get('message-id', '<file offset %d>' % pos)
!             pos = fp.tell()
!             # For now we'll take an extremely naive view of messages; we won't
!             # decode them at all, just to see what happens.  Later, we might
!             # want to uu- or base64-decode, or do other pre-processing on the
!             # message.
!             try:
!                 prob = bayes.spamprob(tokenize(str(msg)))
!             except ValueError:
!                 # Sigh, bad Content-Type
!                 continue
!             if threshold is not None and prob > threshold:
!                 msg['X-Bayes-Score'] = str(prob)
!             print 'P(%s) =' % msgid, prob
!             if output:
!                 print >> output, msg
!             # XXX hardcode
!             if prob > 0.90:
!                 spamcnt += 1
!             if prob < 0.09:
!                 goodcnt += 1
!             allcnt += 1
!         if output:
!             output.close()
!         fp.close()
!         print 'Num messages =', allcnt
!         print 'Good count =', goodcnt
!         print 'Spam count =', spamcnt
!         print 'Hard to tell =', allcnt - (goodcnt + spamcnt)
! 
! 
! if __name__ == '__main__':
!     main2()