[Spambayes-checkins] spambayes hammiebulk.py,NONE,1.1.2.1 classifier.py,1.53.2.4,1.53.2.5 hammie.py,1.40.2.2,1.40.2.3

Thu Nov 21 23:00:01 2002

Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv1861

Modified Files:
      Tag: hammie-playground
	classifier.py hammie.py 
Added Files:
      Tag: hammie-playground
	hammiebulk.py 
Log Message:
* classifier.py: removed a debug print
* hammie.py: removed some debug code I put in for hammiesrv
* hammiebulk.py: this does what hammie.py used to do.

--- NEW FILE: hammiebulk.py ---
#! /usr/bin/env python

"""Usage: %(program)s [options]

Where:
    -h
        show usage and exit
    -g PATH
        mbox or directory of known good messages (non-spam) to train on.
        Can be specified more than once, or use - for stdin.
    -s PATH
        mbox or directory of known spam messages to train on.
        Can be specified more than once, or use - for stdin.
    -u PATH
        mbox of unknown messages.  A ham/spam decision is reported for each.
        Can be specified more than once.
    -r
        reverse the meaning of the check (report ham instead of spam).
        Only meaningful with the -u option.
    -p FILE
        use file as the persistent store.  loads data from this file if it
        exists, and saves data to this file at the end.
        Default: %(DEFAULTDB)s
    -d
        use the DBM store instead of cPickle.  The file is larger and
        creating it is slower, but checking against it is much faster,
        especially for large word databases. Default: %(USEDB)s
    -D
        the reverse of -d: use the cPickle instead of DBM
    -f
        run as a filter: read a single message from stdin, add a new
        header, and write it to stdout.  If you want to run from
        procmail, this is your option.
"""

import sys
import os
import types
import getopt
import mailbox
import glob
import email
import errno
import anydbm
import cPickle as pickle

from Options import options
import mboxutils
import classifier
import hammie

# Name this script was invoked as.  usage() formats the module docstring
# with globals(), which fills in %(program)s from this variable.
program = sys.argv[0] # For usage(); referenced by docstring above

# Default path of the persistent store (overridden by -p): the
# hammiefilter_persistent_storage_file option with a leading "~" expanded.
# Also shown in the usage text as %(DEFAULTDB)s.
DEFAULTDB = os.path.expanduser(options.hammiefilter_persistent_storage_file)

# Use a database? If False, use a pickle.  This is only the default; the
# -d and -D command line options override it.  Shown in the usage text as
# %(USEDB)s.
USEDB = options.hammiefilter_persistent_use_database

# Probability at which a message is considered spam: score() counts a
# message as spam when its probability is >= SPAM_THRESHOLD.
SPAM_THRESHOLD = options.spam_cutoff
# NOTE(review): HAM_THRESHOLD is not referenced anywhere else in the
# visible part of this script.
HAM_THRESHOLD = options.ham_cutoff

def train(h, msgs, is_spam):
    """Train h on every message that mboxutils.getmbox(msgs) yields.

    h       -- object whose train(msg, is_spam) method is called per message
    msgs    -- mailbox specification, handed unchanged to mboxutils.getmbox
    is_spam -- passed through to h.train for each message

    A running message count is rewritten in place on stdout while
    training, and a newline is emitted once the mailbox is exhausted.
    """
    count = 0
    for message in mboxutils.getmbox(msgs):
        count = count + 1
        # The leading carriage return moves back to the start of the line
        # so each count overwrites the previous one.
        # XXX: Is the \r a Unixism?  It may well work elsewhere too.
        progress = "\r%6d" % count
        sys.stdout.write(progress)
        sys.stdout.flush()
        h.train(message, is_spam)
    # Finish the progress line.
    print

def score(h, msgs, reverse=0):
    """Score (judge) all messages from a mailbox."""
    # XXX The reporting needs work!
    mbox = mboxutils.getmbox(msgs)
    i = 0
    spams = hams = 0
    for msg in mbox:
        i += 1
        prob, clues = h.score(msg, True)
        if hasattr(msg, '_mh_msgno'):
            msgno = msg._mh_msgno
        else:
            msgno = i
        isspam = (prob >= SPAM_THRESHOLD)
        if isspam:
            spams += 1
            if not reverse:
                print "%6s %4.2f %1s" % (msgno, prob, isspam and "S" or "."),
                print h.formatclues(clues)
        else:
            hams += 1
            if reverse:
                print "%6s %4.2f %1s" % (msgno, prob, isspam and "S" or "."),
                print h.formatclues(clues)
    return (spams, hams)

def createbayes(pck=DEFAULTDB, usedb=False, mode='r'):
    """Create a Bayes instance for the given pickle (which
    doesn't have to exist).  Create a PersistentBayes if
    usedb is True."""
    if usedb:
        bayes = PersistentBayes(pck, mode)
    else:
        bayes = None
        try:
            fp = open(pck, 'rb')
        except IOError, e:
            if e.errno <> errno.ENOENT: raise
        else:
            bayes = pickle.load(fp)
            fp.close()
        if bayes is None:
            bayes = classifier.Bayes()
    return bayes

def usage(code, msg=''):
    """Write the usage text to stderr, then sys.exit(code).

    code -- exit status handed to sys.exit
    msg  -- optional message; when true it is printed first, followed by
            a blank line

    The usage text is the module docstring with its %(name)s placeholders
    filled in from the module globals.
    """
    err = sys.stderr
    if msg:
        print >> err, msg
        print >> err
    help_text = __doc__ % globals()
    print >> err, help_text
    sys.exit(code)

def main():
    """Main program: parse the command line, then train, filter, score.

    The work happens in this order:
      1. train on every -g (ham) and -s (spam) source, then store the
         database if anything was trained on;
      2. with -f, read one message from stdin, filter it and write the
         result to stdout;
      3. score every -u source and print the spam/ham totals.

    Leaves through usage() on -h, on a getopt error, when no options at
    all are given, and when positional arguments are present.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hdDfg:s:p:u:r')
    except getopt.error, msg:
        usage(2, msg)

    if not opts:
        usage(2, "No options given")

    ham_paths = []
    spam_paths = []
    unknown_paths = []
    store_path = DEFAULTDB
    use_database = USEDB
    invert = 0
    filter_stdin = False
    open_mode = 'r'
    for flag, value in opts:
        if flag == '-h':
            usage(0)
        elif flag in ('-g', '-s'):
            if flag == '-g':
                ham_paths.append(value)
            else:
                spam_paths.append(value)
            # Any training request switches the open mode from 'r' to 'c'
            # (presumably so hammie.open may create the store -- confirm).
            open_mode = 'c'
        elif flag == '-u':
            unknown_paths.append(value)
        elif flag == '-p':
            store_path = value
        elif flag in ('-d', '-D'):
            # -d selects the DBM store, -D the pickle.
            use_database = (flag == '-d')
        elif flag == '-f':
            filter_stdin = True
        elif flag == '-r':
            invert = 1
    if args:
        usage(2, "Positional arguments not allowed")

    h = hammie.open(store_path, use_database, open_mode)

    for path in ham_paths:
        print "Training ham (%s):" % path
        train(h, path, False)

    for path in spam_paths:
        print "Training spam (%s):" % path
        train(h, path, True)

    # Only write the store back when at least one source was trained on.
    if ham_paths or spam_paths:
        h.store()

    if filter_stdin:
        sys.stdout.write(h.filter(sys.stdin.read()))

    if unknown_paths:
        total_spam = 0
        total_ham = 0
        # Name each mailbox only when there is more than one to tell apart.
        several = len(unknown_paths) > 1
        for path in unknown_paths:
            if several:
                print "Scoring", path
            found_spam, found_ham = score(h, path, invert)
            total_spam += found_spam
            total_ham += found_ham
        print "Total %d spam, %d ham" % (total_spam, total_ham)

# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

Index: classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/classifier.py,v
retrieving revision 1.53.2.4
retrieving revision 1.53.2.5
diff -C2 -d -r1.53.2.4 -r1.53.2.5
*** classifier.py	21 Nov 2002 06:03:24 -0000	1.53.2.4
--- classifier.py	21 Nov 2002 22:59:55 -0000	1.53.2.5
***************
*** 1,2 ****
--- 1,3 ----
+ #! /usr/bin/env python
  # An implementation of a Bayes-like spam classifier.
  #
***************
*** 72,76 ****

      def incr_rev(self):
-         print "revision going up...", self.revision
          self.revision += 1

--- 73,76 ----
***************
*** 135,139 ****
          S = options.unknown_word_strength
          StimesX = S * options.unknown_word_prob
!                 
          assert self.hamcount <= nham
          hamratio = self.hamcount / nham
--- 135,139 ----
          S = options.unknown_word_strength
          StimesX = S * options.unknown_word_prob
! 
          assert self.hamcount <= nham
          hamratio = self.hamcount / nham

Index: hammie.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/hammie.py,v
retrieving revision 1.40.2.2
retrieving revision 1.40.2.3
diff -C2 -d -r1.40.2.2 -r1.40.2.3
*** hammie.py	21 Nov 2002 04:27:27 -0000	1.40.2.2
--- hammie.py	21 Nov 2002 22:59:56 -0000	1.40.2.3
***************
*** 58,67 ****
          """

!         try:
!             return self._scoremsg(msg, evidence)
!         except:
!             print msg
!             import traceback
!             traceback.print_exc()

      def filter(self, msg, header=None, spam_cutoff=None,
--- 58,62 ----
          """

!         return self._scoremsg(msg, evidence)

      def filter(self, msg, header=None, spam_cutoff=None,