mbox despamming script

Paul Rubin http
Wed Nov 26 20:16:04 EST 2003


I was surprised there was no obvious way with spamassassin (maybe I
should have looked at spambayes) to split an existing mbox file into its
spam and non-spam messages.  So I wrote one.  It's pretty slow, taking
around 1.5 seconds per message on a 2 GHz Athlon, making me wonder how
serious ISPs getting thousands of incoming messages per hour can run
anything like spamassassin on all of them.  But for my purposes it's ok.
Comments and improvements are welcome.

================================================================

#!/usr/bin/python

# Spam filter for mbox files.  Reads mailfile and makes two new
# files, mailfile.spam and mailfile.ham, containing the spam and non-spam
# messages from mailfile as determined by piping through spamc.

# Copyright 2003 Paul Rubin <http://www.paulrubin.com>
# Copying permission: GNU General Public License ver. 2, <http://www.gnu.org>

import mailbox,os,sys
from time import time

def mktemp():
    """Create an empty scratch file in the current directory; return its name.

    The file is created atomically by tempfile.mkstemp(), so the name
    cannot be guessed or claimed by another process between choosing it
    and first using it (the previous pid+time hash was predictable and
    only checked nothing).  The caller is responsible for unlinking it.

    Returns:
        A bare file name of the form "spam<random>.temp", relative to the
        current working directory.
    """
    import os
    import tempfile

    fd, path = tempfile.mkstemp(prefix="spam", suffix=".temp", dir=os.curdir)
    # Only the name is needed; the file is re-opened by name later on.
    os.close(fd)
    # mkstemp may hand back an absolute path.  Return just the base name so
    # the result stays a simple cwd-relative token (no directory part that
    # could contain spaces when it is pasted into a shell command).
    return os.path.basename(path)

# Name of the scratch file that spam_filter() redirects spamc's output
# into; the cleanup at the bottom of the script unlinks it.
tempfilename = mktemp()

def main():
    print sys.argv
    if len(sys.argv) > 1:
        filename = sys.argv[1]
    else:
        print "Usage: spam.py mboxfile"

    print "marking up", filename
    mailfile = open(filename, 'r')
    ham = open(filename + ".ham", 'w')
    spam = open(filename + ".spam", 'w')

    mbox = mailbox.UnixMailbox(mailfile)
    i = 0
    
    while 1:
        i += 1
        m1 = mailfile.tell()
        msg = mbox.next()
        if not msg: break
        body = msg.fp.read()
        envelope = env_header(mailfile, m1)
        print "%5d"%i, m1, mailfile.tell(), msg.startofbody, len(body),
        is_spam, txt = spam_filter (envelope, msg, body)
        print ['HAM','SPAM'][is_spam]
        
        if is_spam:
            spam.write(txt)
        else:
            ham.write(txt)

def spam_filter(envelope, msg, body, threshold=5):
    """Pipe one message through spamc and report whether it is spam.

    Args:
        envelope: the mbox envelope line that precedes the message.
        msg: message object; its `headers` list is joined to rebuild the
            header block.
        body: the message body text.
        threshold: minimum length of the X-Spam-Level header value for the
            message to count as spam (default 5, the previous fixed value).
            NOTE(review): SpamAssassin is understood to write one '*' per
            point of score into that header -- confirm against its docs.

    Returns:
        (is_spam, txt) where txt is the message exactly as spamc wrote it
        to the scratch file named by the module-level `tempfilename`.

    Raises:
        RuntimeError: if no message can be parsed from spamc's output.
    """
    txt = envelope + ''.join(msg.headers) + '\n' + body
    out = os.popen("spamc > %s"% tempfilename, "w")
    out.write(txt)
    out.close()

    # Open the result once and close it deterministically instead of
    # leaving two unclosed handles per message.
    result = open(tempfilename)
    try:
        marked = mailbox.UnixMailbox(result).next()
        if marked is None:
            # Empty or unparsable output, e.g. spamc missing or failed.
            raise RuntimeError("spamc produced no parsable message in %s"
                               % tempfilename)
        spam_level = len(marked.get('X-Spam-Level', ''))
        # Rewind so the whole marked-up message is returned to the caller.
        result.seek(0)
        txt = result.read()
    finally:
        result.close()
    return (spam_level >= threshold, txt)

def env_header(fp, pos):
    """Return the line of `fp` that starts at offset `pos`.

    The file position that `fp` had on entry is restored before returning,
    so the caller's place in the file is unchanged.
    """
    saved_offset = fp.tell()
    fp.seek(pos)
    first_line = fp.readline()
    fp.seek(saved_offset)
    return first_line

try:
    t=time()
    main()
    dt = time()-t
    print "elapsed: %d min %d sec"% divmod(int(dt), 60)
finally:
    os.unlink(tempfilename)




More information about the Python-list mailing list