From npickett@users.sourceforge.net Sun Dec 1 04:08:06 2002 From: npickett@users.sourceforge.net (Neale Pickett) Date: Sat, 30 Nov 2002 20:08:06 -0800 Subject: [Spambayes-checkins] spambayes anydbm.py,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv8361a Added Files: anydbm.py Log Message: * New anydbm module. Works just like the one in the standard distribution, but does a check to see if we're on windows. Currently this check probably doesn't work. Someone with a Windows machine will have to alter it to do a better check :) --- NEW FILE: anydbm.py --- """Generic interface to all dbm clones. This is just like anydbm from the Python distribution, except that this one leaves out the "dbm" type on Windows, since reliable reports have it that this module is antiquated and most dreadful. """ import sys try: class error(Exception): pass except (NameError, TypeError): error = "anydbm.error" # XXX: Some windows dude should fix this test if sys.platform == "windows": # dbm on windows is awful. 
_names = ["dbhash", "gdbm", "dumbdbm"] else: _names = ["dbhash", "gdbm", "dbm", "dumbdbm"] _errors = [error] _defaultmod = None for _name in _names: try: _mod = __import__(_name) except ImportError: continue if not _defaultmod: _defaultmod = _mod _errors.append(_mod.error) if not _defaultmod: raise ImportError, "no dbm clone found; tried %s" % _names error = tuple(_errors) def open(file, flag = 'r', mode = 0666): # guess the type of an existing database from whichdb import whichdb result=whichdb(file) if result is None: # db doesn't exist if 'c' in flag or 'n' in flag: # file doesn't exist and the new # flag was used so use default type mod = _defaultmod else: raise error, "need 'c' or 'n' flag to open new db" elif result == "": # db type cannot be determined raise error, "db type could not be determined" else: mod = __import__(result) return mod.open(file, flag, mode) From timstone4@users.sourceforge.net Sun Dec 1 04:11:41 2002 From: timstone4@users.sourceforge.net (Tim Stone) Date: Sat, 30 Nov 2002 20:11:41 -0800 Subject: [Spambayes-checkins] spambayes OptionConfig.py,NONE,1.1 SmarterHTTPServer.py,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv11369 Added Files: OptionConfig.py SmarterHTTPServer.py Log Message: Option Configurator application --- NEW FILE: OptionConfig.py --- """Options Configurator Classes: OptionsConfigurator - changes select values in Options.py Abstract: This module implements a browser based Spambayes option file configuration utility. Users may use the pages in this application to customize the settings in the bayescustomize.ini file. This does not support the BAYESCUSTOMIZE environment variable. Is this even used anywhere? To execute this module, just invoke OptionConfig.py The port number is the port the http server will listen on, and defaults to 8000. Then point your browser at http://localhost:8000 (or whatever port you chose). To Do: o Suggestions?
""" # This module is part of the spambayes project, which is Copyright 2002 # The Python Software Foundation and is covered by the Python Software # Foundation license. __author__ = "Tim Stone " import SmarterHTTPServer import BaseHTTPServer from Options import options import re from cStringIO import StringIO import os import ConfigParser import copy # This control dictionary maps http request parameters and template fields # to ConfigParser sections and options. The key matches both the input # field that corresponds to a section/option, and also the <> template # variable that is used to display the value of that section/option. parm_ini_map = \ {'hamcutoff': ('TestDriver', 'ham_cutoff'), 'spamcutoff': ('TestDriver', 'spam_cutoff'), 'dbname': ('pop3proxy', 'pop3proxy_persistent_storage_file'), 'headername': ('Hammie', 'hammie_header_name'), 'spamstring': ('Hammie', 'header_spam_string'), 'hamstring': ('Hammie', 'header_ham_string'), 'unsurestring': ('Hammie', 'header_unsure_string'), 'p3servers': ('pop3proxy', 'pop3proxy_servers'), 'p3ports': ('pop3proxy', 'pop3proxy_ports'), 'p3hamdir': ('pop3proxy', 'pop3proxy_ham_cache'), 'p3spamdir': ('pop3proxy', 'pop3proxy_spam_cache'), 'p3unknowndir': ('pop3proxy', 'pop3proxy_unknown_cache') } PIMapSect = 0 PIMapOpt = 1 class OptionsConfigurator(SmarterHTTPServer.SmarterHTTPRequestHandler): def homepage(self, parms): self.send_header("Content-type", 'text/html') # start with the options config file, add bayescustomize.ini to it bcini = ConfigParser.ConfigParser() # this is a pain... 
for sect in options._config.sections(): for opt in options._config.options(sect): try: bcini.set(sect, opt, options._config.get(sect, opt)) except ConfigParser.NoSectionError: bcini.add_section(sect) bcini.set(sect, opt, options._config.get(sect, opt)) bcini.read('bayescustomize.ini') html = templateGet('ocHome.html') for httpparm in parm_ini_map: html = templateSub(html, 'PY-%s' % (httpparm), \ bcini.get(parm_ini_map[httpparm][PIMapSect], \ parm_ini_map[httpparm][PIMapOpt])) html = addSbLookAndFeel(html) html = templateSub(html, 'PY-TITLE', \ 'Spambayes Options Configurator: Home') html = templateSub(html, 'PY-SBLAFNAV', \ 'Spambayes Options Configurator: Home') html = addSbFooter(html) html = templateSub(html, 'PY-FOOTERTITLE', \ 'Spambayes Options Configuration') return html def changeopts(self,parms): self.send_header("Content-type", 'text/html') errmsg = editInput(parms) if errmsg != '': html = templateGet('ocError.html') html = templateSub(html, 'PY-ERROR', errmsg) html = addSbLookAndFeel(html) html = templateSub(html, 'PY-TITLE', \ 'Spambayes Options Configurator: Home > Error') html = templateSub(html, 'PY-SBLAFNAV', \ 'Spambayes Options Configurator: \ Home > Error') html = addSbFooter(html) html = templateSub(html, 'PY-FOOTERTITLE', \ 'Spambayes Options Configuration') return html updateIniFile(parms) html = templateGet('ocChanged.html') html = addSbLookAndFeel(html) html = templateSub(html, 'PY-TITLE', \ 'Spambayes Options Configurator: Home > Options Changed') html = templateSub(html, 'PY-SBLAFNAV', \ 'Spambayes Options Configurator: \ Home > Options Changed') html = addSbFooter(html) html = templateSub(html, 'PY-FOOTERTITLE', \ 'Spambayes Options Configuration') return html def restoredflts(self, parms): restoreIniDefaults() html = templateGet('ocDefault.html') html = addSbLookAndFeel(html) html = templateSub(html, 'PY-TITLE', \ 'Spambayes Options Configurator: Home > Defaults Restored') html = templateSub(html, 'PY-SBLAFNAV', \ 'Spambayes Options 
Configurator: \ Home > Defaults Restored') html = addSbFooter(html) html = templateSub(html, 'PY-FOOTERTITLE', \ 'Spambayes Options Configuration') return html def templateSub(mass, tmplvar, val): regex = '<<%s>>' % (tmplvar) hc = re.compile(regex, re.MULTILINE) return hc.sub('%s' % (val), mass) def addSbLookAndFeel(str): sbstr = templateGet('sblookandfeel.thtml') return templateSub(str, 'PY-SBLOOKANDFEEL', sbstr) def addSbFooter(str): ftstr = templateGet('sbfooter.thtml') return templateSub(str, 'PY-FOOTER', ftstr) def editInput(parms): errmsg = '' # edit numericity of hamcutoff and spamcutoff try: hco = parms['hamcutoff'][0] except KeyError: hco = options.ham_cutoff try: sco = parms['spamcutoff'][0] except KeyError: sco = options.spam_cutoff errmsg = '' try: hco = float(hco) except ValueError: errmsg += '
  • Ham cutoff must be a number, between 0 and 1
  • \n' try: sco = float(sco) except ValueError: errmsg += '
  • Spam cutoff must be a number, \ between 0 and 1
  • \n' # edit 0 <= hamcutoff < spamcutoff <= 1 if hco < 0 or hco > 1: errmsg += '
  • Ham cutoff must be between 0 and 1
  • \n' if sco < 0 or sco > 1: errmsg += '
  • Spam cutoff must be between 0 and 1
  • \n' if not hco < sco: errmsg += '
  • Ham cutoff must be less than Spam cutoff
  • \n' # edit for equal number of pop3servers and ports try: slist = parms['p3servers'][0].split(',') except KeyError: slist = options.pop3proxy_servers.split(',') try: plist = parms['p3ports'][0].split(',') except KeyError: plist = options.pop3proxy_ports.split(',') # edit for duplicate ports if len(slist) != len(plist): errmsg += '
  • The number of ports specified must match the \ number of servers specified
  • \n' plist.sort() for p in range(len(plist)-1): try: if plist[p] == plist[p+1]: errmsg += '
  • All port numbers must be unique
  • ' break except IndexError: pass return errmsg def updateIniFile(parms): # assumes bayescustomize.ini is in this process' working directory inipath = os.path.abspath('bayescustomize.ini') bcini = ConfigParser.ConfigParser() bcini.read(inipath) for httpParm in parm_ini_map: map = parm_ini_map[httpParm] sect = map[PIMapSect] opt = map[PIMapOpt] try: val = parms[httpParm][0] except KeyError: continue try: bcini.add_section(sect) except ConfigParser.DuplicateSectionError: pass bcini.set(sect, opt, val) o = open(inipath, 'wb') bcini.write(o) o.close() def restoreIniDefaults(): # assumes bayescustomize.ini is in this process' working directory inipath = os.path.abspath('bayescustomize.ini') bcini = ConfigParser.ConfigParser() bcini.read(inipath) for sect in bcini.sections(): for opt in bcini.options(sect): bcini.remove_option(sect, opt) o = open(inipath, 'wb') bcini.write(o) o.close() ocHome = """ <>

    This page allows you to change certain customizable options that control the way in which Spambayes processes your email. Hover your mouse pointer over an item name for an explanation of that item

    Statistics Options
    Ham cutoff Current Value: <>
    Spam cutoff Current Value: <>
    Database file name Current Value: <>

    Inserted Header Options
    Header Name Current Value: <>
    Spam Designation Current Value: <>
    Ham Designation Current Value: <>
    Unsure Designation Current Value: <>

    POP3 Options
    Servers Current Value: <>
    Ports Current Value: <>
    Ham Directory Current Value: <>
    Spam Directory Current Value: <>
    Untrained Mail Directory Current Value: <>

    <> """ ocChanged = """ <>
    Options Changed
    The options changes you've made have been recorded. You will need to restart any Spambayes processes you have running, such as the pop3proxy, in order for your changes to take effect. When you return to the Options Configuration homepage, you may need to refresh the page to see the changes you have made.
    <> """ ocDefault = """ <>
    Option Defaults Restored
    All options have been reverted to their default values. You will need to restart any Spambayes processes you have running, such as the pop3proxy, in order for your changes to take effect. When you return to the Options Configuration homepage, you may need to refresh the page to see the changes you have made.
    <> """ ocError = """ <>
    Errors Detected
      <>
    <> """ sbLAF = """ <<PY-TITLE>> """ sbFoot = """
    """ # This control dictionary is used to locate html within this or another # module. It maps a filename to an attribute, which is used to acquire # content when a url references a resource named in the dictionary. # # A filename could be mapped to a variable or a function, either within # this module or in a separate module (which would have to be imported) localFiles = {'ocHome.html':ocHome, \ 'ocChanged.html':ocChanged, \ 'ocDefault.html':ocDefault, \ 'ocError.html':ocError, \ 'sblookandfeel.thtml':sbLAF, \ 'sbfooter.thtml':sbFoot} def templateGet(filename): try: str = localFiles[filename] except KeyError: try: f = open(filename, 'rb') except IOError: str = 'Template file %s Not Found' % (filename) else: str = f.read() f.close() return str def run(HandlerClass = OptionsConfigurator, ServerClass = BaseHTTPServer.HTTPServer): BaseHTTPServer.test(HandlerClass, ServerClass) if __name__ == '__main__': run() --- NEW FILE: SmarterHTTPServer.py --- """Smarter HTTP Server. This module builds on SimpleHTTPServer, adding 'methlet' invokation by handling urls with a file extension of .methlet. In this instance, the so-called filename actually names a method on the handler, which is invoked with a single parameter, a dictionary of the url's parsed query string. This class is intended to be subclassed, with subclasses adding the appropriate methlet methods for the application being served. """ __version__ = "0.6" __all__ = ["SmarterHTTPRequestHandler"] import os import posixpath import BaseHTTPServer import SimpleHTTPServer import urllib import cgi import shutil import mimetypes import re from StringIO import StringIO class SmarterHTTPRequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler): """Smarter HTTP request handler based on SimpleHTTPRequestHandler. Adds GET with parameters, which calls a method. """ server_version = "SmarterHTTP/" + __version__ def send_head(self): """Common code for GET and HEAD commands. This sends the response code and MIME headers. 
Return value is either a file object (which has to be copied to the outputfile by the caller unless the command was HEAD, and must be closed by the caller under all circumstances), or None, in which case the caller has nothing further to do. """ path, parms = self.translate_path(self.path) f = None if os.path.isdir(path): if hasattr(self, 'homepage'): path = 'homepage.methlet' else: for index in "index.html", "index.htm": index = os.path.join(path, index) if os.path.exists(index): path = index break else: return self.list_directory(path) ctype = self.guess_type(path) if ctype != 'application/method': if ctype.startswith('text/'): mode = 'r' else: mode = 'rb' try: f = open(path, mode) except IOError: self.send_error(404, "File not found") return None else: self.send_response(200) self.send_header("Content-type", ctype) self.end_headers() else: head, tail = os.path.split(path) methname = tail.split('.')[0] pdict = {} if parms: pdict = cgi.parse_qs(parms, False) # ctype application/method methlets (invented here) may # send whatever headers they like. However, the server has # already sent the 200 response, so Location: headers are # not meaningful. Also, the server will always send # Content-type: text/html, so the methlets should not send # anything incompatible with text/html type. Methlets should # not invoke end_headers(). if hasattr(self, methname): self.send_response(200) retstr = getattr(self, methname)(pdict) f = StringIO(retstr) self.send_header("Content-type", 'text/html') self.end_headers() else: self.send_error(404, "File not found") return None return f def translate_path(self, url): """Translate a /-separated PATH to the local filename syntax. Components that mean special things to the local file system (e.g. drive or directory names) are ignored. (XXX They should probably be diagnosed.) 
""" parmre = re.compile(r'^(.*)[\?](.*)$') match = parmre.search(url) if match: path = match.group(1) parms = match.group(2) else: path = url parms = None path = posixpath.normpath(urllib.unquote(path)) words = path.split('/') words = filter(None, words) path = os.getcwd() for word in words: drive, word = os.path.splitdrive(word) head, word = os.path.split(word) if word in (os.curdir, os.pardir): continue path = os.path.join(path, word) return (path, parms) def guess_type(self, path): """Guess the type of a file. Argument is a PATH (a filename). Return value is a string of the form type/subtype, usable for a MIME Content-type header. The default implementation looks the file's extension up in the table self.extensions_map, using text/plain as a default; however it would be permissible (if slow) to look inside the data to make a better guess. """ base, ext = posixpath.splitext(path) if self.extensions_map.has_key(ext): return self.extensions_map[ext] ext = ext.lower() if self.extensions_map.has_key(ext): return self.extensions_map[ext] else: return self.extensions_map[''] extensions_map = mimetypes.types_map.copy() extensions_map.update({ '': 'application/octet-stream', # Default '.py': 'text/plain', '.c': 'text/plain', '.h': 'text/plain', '.methlet': 'application/method', }) def test(HandlerClass = SmarterHTTPRequestHandler, ServerClass = BaseHTTPServer.HTTPServer): BaseHTTPServer.test(HandlerClass, ServerClass) if __name__ == '__main__': test() From timstone4@users.sourceforge.net Sun Dec 1 04:12:20 2002 From: timstone4@users.sourceforge.net (Tim Stone) Date: Sat, 30 Nov 2002 20:12:20 -0800 Subject: [Spambayes-checkins] spambayes helmet.gif,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv11881 Added Files: helmet.gif Log Message: Needed for the Option Configurator application and pop3proxy... where should stuff like this go ultimately? 
--- NEW FILE: helmet.gif --- (This appears to be a binary file; contents omitted.) From timstone4@users.sourceforge.net Sun Dec 1 05:04:05 2002 From: timstone4@users.sourceforge.net (Tim Stone) Date: Sat, 30 Nov 2002 21:04:05 -0800 Subject: [Spambayes-checkins] spambayes anydbm.py,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv20718 Modified Files: anydbm.py Log Message: On my win2k, sys.platform == 'win32' Index: anydbm.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/anydbm.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** anydbm.py 1 Dec 2002 04:08:04 -0000 1.1 --- anydbm.py 1 Dec 2002 05:04:03 -0000 1.2 *************** *** 15,20 **** error = "anydbm.error" ! # XXX: Some windows dude should fix this test ! if sys.platform == "windows": # dbm on windows is awful. _names = ["dbhash", "gdbm", "dumbdbm"] --- 15,19 ---- error = "anydbm.error" ! if sys.platform in ["win32"]: # dbm on windows is awful. _names = ["dbhash", "gdbm", "dumbdbm"] From richiehindle@users.sourceforge.net Sun Dec 1 21:43:22 2002 From: richiehindle@users.sourceforge.net (Richie Hindle) Date: Sun, 01 Dec 2002 13:43:22 -0800 Subject: [Spambayes-checkins] spambayes FileCorpus.py,1.8,1.9 Corpus.py,1.5,1.6 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv7752 Modified Files: FileCorpus.py Corpus.py Log Message: Made Corpus.Message load the message substance on demand. Previously, startup of pop3proxy.py could be slow because flushing the trained message cache would load up all the messages from the disk. The loading is now done through Corpus.Message.__getattr__, so the on-demand-ness should come for free for all Corpus-using code. 
Index: FileCorpus.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/FileCorpus.py,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** FileCorpus.py 28 Nov 2002 22:02:46 -0000 1.8 --- FileCorpus.py 1 Dec 2002 21:43:19 -0000 1.9 *************** *** 196,200 **** self.file_name = file_name self.directory = directory ! self.load() def pathname(self): --- 196,202 ---- self.file_name = file_name self.directory = directory ! ! # No calling of self.load() here - that's done on demand by ! # Message.__getattr__. def pathname(self): Index: Corpus.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Corpus.py,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** Corpus.py 28 Nov 2002 22:02:46 -0000 1.5 --- Corpus.py 1 Dec 2002 21:43:19 -0000 1.6 *************** *** 256,261 **** '''Constructor()''' ! self.payload = None ! self.hdrtxt = None def load(self): --- 256,271 ---- '''Constructor()''' ! # The text of the message headers and body are held in attributes ! # called 'hdrtxt' and 'payload', created on demand in __getattr__ ! # by calling load(), which should in turn call setSubstance(). ! # This means you don't need to remember to call load() before ! # using these attributes. ! ! def __getattr__(self, attributeName): ! '''On-demand loading of the message text.''' ! ! if attributeName in ('hdrtxt', 'payload'): ! self.load() ! return getattr(self, attributeName) def load(self): From mhammond@skippinet.com.au Sun Dec 1 22:39:59 2002 From: mhammond@skippinet.com.au (Mark Hammond) Date: Mon, 2 Dec 2002 09:39:59 +1100 Subject: [Spambayes-checkins] spambayes FileCorpus.py,1.8,1.9Corpus.py,1.5,1.6 In-Reply-To: Message-ID: > so the on-demand-ness should come for free for all Corpus-using code. How much Corpus-using code is there? Are there any plans to move any existing code that does not use it towards using it? 
I've raised this with Tim S for Outlook, and it doesn't appear we will - I have no idea about the other apps though. In the back of my mind, I am pondering if we need a better directory structure - maybe with the core engine in a package, and some of these "wrappers" used only by a few application also into their own? Mark. From npickett@users.sourceforge.net Mon Dec 2 04:43:40 2002 From: npickett@users.sourceforge.net (Neale Pickett) Date: Sun, 01 Dec 2002 20:43:40 -0800 Subject: [Spambayes-checkins] spambayes mboxtrain.py,NONE,1.1 README.txt,1.44,1.45 hammie.py,1.44,1.45 hammiebulk.py,1.4,1.5 hammiefilter.py,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv23996 Modified Files: README.txt hammie.py hammiebulk.py hammiefilter.py Added Files: mboxtrain.py Log Message: * New mboxtrain.py program, which will train a mailbox, remembering which messages it trained on. See README.txt for more info. * README.txt modified to talk about mboxtrain.py, and gets silly hammiesrv and hammiecli down below the other, more generally useful drivers. * hammie.py now has untrain methods * hammiebulk.py no longer imports unneeded storage module * hammiefilter.py now supports untraining --- NEW FILE: mboxtrain.py --- #! /usr/bin/env python ### Train spambayes on all previously-untrained messages in a mailbox. ### ### This keeps track of messages it's already trained by adding an ### X-Spambayes-Trained: header to each one. Then, if you move one to ### another folder, it will retrain that message. You would want to run ### this from a cron job on your server. """Usage: %(program)s [OPTIONS] ... Where OPTIONS is one or more of: -h show usage and exit -d DBNAME use the DBM store. A DBM file is larger than the pickle and creating it is slower, but loading it is much faster, especially for large word databases. Recommended for use with hammiefilter or any procmail-based filter. -D DBNAME use the pickle store. 
A pickle is smaller and faster to create, but much slower to load. Recommended for use with pop3proxy and hammiesrv. -g PATH mbox or directory of known good messages (non-spam) to train on. Can be specified more than once. -s PATH mbox or directory of known spam messages to train on. Can be specified more than once. -f force training, ignoring the trained header. Use this if you need to rebuild your database from scratch. -q quiet mode; no output """ import mboxutils import getopt import hammie import sys import os program = sys.argv[0] TRAINED_HDR = "X-Spambayes-Trained" loud = True def msg_train(h, msg, is_spam, force): """Train bayes with a single message.""" # XXX: big hack -- why is email.Message unable to represent # multipart/alternative? try: msg.as_string() except TypeError: # We'll be unable to represent this as text :( return False if is_spam: spamtxt = "spam" else: spamtxt = "ham" oldtxt = msg.get(TRAINED_HDR) if force: # Train no matter what. if oldtxt != None: del msg[TRAINED_HDR] elif oldtxt == spamtxt: # Skip this one, we've already trained with it. return False elif oldtxt != None: # It's been trained, but as something else. Untrain. del msg[TRAINED_HDR] h.untrain(msg, not is_spam) h.train(msg, is_spam) msg.add_header(TRAINED_HDR, spamtxt) return True def maildir_train(h, path, is_spam, force): """Train bayes with all messages from a maildir.""" if loud: print " Reading as Maildir" import time import socket pid = os.getpid() host = socket.gethostname() counter = 0 trained = 0 for fn in os.listdir(os.path.join(path, "cur")): counter += 1 cfn = os.path.join(path, "cur", fn) tfn = os.path.join(path, "tmp", "%d.%d_%d.%s" % (time.time(), pid, counter, host)) if loud: sys.stdout.write(" %s \r" % fn) sys.stdout.flush() f = file(cfn, "rb") msg = mboxutils.get_message(f) f.close() if not msg_train(h, msg, is_spam, force): continue trained += 1 f = file(tfn, "wb") f.write(msg.as_string()) f.close() # XXX: This will raise an exception on Windows. 
Do any Windows # people actually use Maildirs? os.rename(tfn, cfn) if loud: print (" Trained %d out of %d messages " % (trained, counter)) def mbox_train(h, path, is_spam, force): """Train bayes with a Unix mbox""" if loud: print " Reading as Unix mbox" import mailbox import fcntl import tempfile # Open and lock the mailbox. Some systems require it be opened for # writes in order to assert an exclusive lock. f = file(path, "r+b") fcntl.flock(f, fcntl.LOCK_EX) mbox = mailbox.PortableUnixMailbox(f, mboxutils.get_message) outf = os.tmpfile() counter = 0 trained = 0 for msg in mbox: counter += 1 if loud: sys.stdout.write(" %s\r" % counter) sys.stdout.flush() if not msg_train(h, msg, is_spam, force): continue trained += 1 # Write it out with the Unix "From " line outf.write(msg.as_string(True)) outf.seek(0) try: os.ftruncate(f.fileno(), 0) f.seek(0) except: # If anything goes wrong, don't try to write print "Problem truncating mbox--nothing written" raise try: for line in outf.xreadlines(): f.write(line) except: print >> sys.stderr ("Problem writing mbox! Sorry, " "I tried my best, but your mail " "may be corrupted.") raise fcntl.lockf(f, fcntl.LOCK_UN) f.close() if loud: print (" Trained %d out of %d messages " % (trained, counter)) def mhdir_train(h, path, is_spam, force): """Train bayes with an mh directory""" if loud: print " Reading as MH mailbox" import glob counter = 0 trained = 0 for fn in glob.glob(os.path.join(path, "[0-9]*")): counter += 1 cfn = fn tfn = os.path.join(path, "spambayes.tmp") if loud: sys.stdout.write(" %s \r" % fn) sys.stdout.flush() f = file(fn, "rb") msg = mboxutils.get_message(f) f.close() msg_train(h, msg, is_spam, force) trained += 1 f = file(tfn, "wb") f.write(msg.as_string()) f.close() # XXX: This will raise an exception on Windows. Do any Windows # people actually use MH directories? 
os.rename(tfn, cfn) if loud: print (" Trained %d out of %d messages " % (trained, counter)) def train(h, path, is_spam, force): if os.path.isfile(path): mbox_train(h, path, is_spam, force) elif os.path.isdir(os.path.join(path, "cur")): maildir_train(h, path, is_spam, force) elif os.path.isdir(path): mhdir_train(h, path, is_spam, force) else: raise ValueError("Unable to determine mailbox type: " + path) def usage(code, msg=''): """Print usage message and sys.exit(code).""" if msg: print >> sys.stderr, msg print >> sys.stderr print >> sys.stderr, __doc__ % globals() sys.exit(code) def main(): """Main program; parse options and go.""" global loud try: opts, args = getopt.getopt(sys.argv[1:], 'hfqd:D:g:s:') except getopt.error, msg: usage(2, msg) if not opts: usage(2, "No options given") pck = None usedb = None force = False good = [] spam = [] for opt, arg in opts: if opt == '-h': usage(0) elif opt == "-f": force = True elif opt == "-q": loud = False elif opt == '-g': good.append(arg) elif opt == '-s': spam.append(arg) elif opt == "-d": usedb = True pck = arg elif opt == "-D": usedb = False pck = arg if args: usage(2, "Positional arguments not allowed") if usedb == None: usage(2, "Must specify one of -d or -D") h = hammie.open(pck, usedb, "c") for g in good: if loud: print "Training ham (%s):" % g train(h, g, False, force) save = True for s in spam: if loud: print "Training spam (%s):" % s train(h, s, True, force) save = True if save: h.store() if __name__ == "__main__": main() Index: README.txt =================================================================== RCS file: /cvsroot/spambayes/spambayes/README.txt,v retrieving revision 1.44 retrieving revision 1.45 diff -C2 -d -r1.44 -r1.45 *** README.txt 29 Nov 2002 00:57:23 -0000 1.44 --- README.txt 2 Dec 2002 04:43:37 -0000 1.45 *************** *** 64,78 **** A spamassassin-like filter which uses tokenizer and classifier (above). 
- hammiesrv.py - A first stab at making hammie into a client/server model, using - XML-RPC. - - hammiecli.py - A client for hammiesrv. - hammiefilter.py A simpler hammie front-end that doesn't print anything. Useful for procmail filering and scoring from your MUA. pop3proxy.py A spam-classifying POP3 proxy. It adds a spam-judgement header to --- 64,81 ---- A spamassassin-like filter which uses tokenizer and classifier (above). hammiefilter.py A simpler hammie front-end that doesn't print anything. Useful for procmail filering and scoring from your MUA. + mboxtrain.py + Trainer for Maildir, MH, or mbox mailboxes. Remembers which + messages it saw the last time you ran it, and will only train on new + messages or messages which should be retrained. + + The idea is to run this automatically every night on your Inbox and + Spam folders, and then sort misclassified messages by hand. This + will work with any IMAP4 mail client, or any client running on the + server. + pop3proxy.py A spam-classifying POP3 proxy. It adds a spam-judgement header to *************** *** 91,94 **** --- 94,103 ---- classifier score. Note that both Maildirs must be on the same device. + + hammiesrv.py + A stab at making hammie into a client/server model, using XML-RPC. + + hammiecli.py + A client for hammiesrv. Index: hammie.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammie.py,v retrieving revision 1.44 retrieving revision 1.45 diff -C2 -d -r1.44 -r1.45 *** hammie.py 27 Nov 2002 22:37:56 -0000 1.44 --- hammie.py 2 Dec 2002 04:43:37 -0000 1.45 *************** *** 130,133 **** --- 130,144 ---- self.bayes.learn(tokenize(msg), is_spam) + def untrain(self, msg, is_spam): + """Untrain bayes with a message. + + msg can be a string, a file object, or a Message object. + + is_spam should be 1 if the message is spam, 0 if not. + + """ + + self.bayes.unlearn(tokenize(msg), is_spam) + def train_ham(self, msg): """Train bayes with ham. 
*************** *** 147,150 **** --- 158,179 ---- self.train(msg, True) + + def untrain_ham(self, msg): + """Untrain bayes with ham. + + msg can be a string, a file object, or a Message object. + + """ + + self.untrain(msg, False) + + def train_spam(self, msg): + """Untrain bayes with spam. + + msg can be a string, a file object, or a Message object. + + """ + + self.untrain(msg, True) def store(self): Index: hammiebulk.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammiebulk.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** hammiebulk.py 25 Nov 2002 16:24:26 -0000 1.4 --- hammiebulk.py 2 Dec 2002 04:43:37 -0000 1.5 *************** *** 51,55 **** import mboxutils import classifier - import storage import hammie import Corpus --- 51,54 ---- Index: hammiefilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammiefilter.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** hammiefilter.py 25 Nov 2002 02:29:44 -0000 1.3 --- hammiefilter.py 2 Dec 2002 04:43:37 -0000 1.4 *************** *** 15,21 **** ## ! """Usage: %(program)s [option] ! Where [option] is one of: -h show usage and exit --- 15,21 ---- ## ! """Usage: %(program)s [OPTION] ! Where [OPTION] is one of: -h show usage and exit *************** *** 26,29 **** --- 26,35 ---- -s train on stdin as a bad (spam) message + -G + untrain ham on stdin -- only use if you've already trained this + message! + -S + untrain spam on stdin -- only use if you've already trained this + message! 
If neither -g nor -s is given, stdin will be scored: the same message, *************** *** 83,86 **** --- 89,104 ---- msg = sys.stdin.read() h.train_spam(msg) + h.store() + + def untrain_ham(self): + h = hammie.open(self.dbname, self.usedb, 'c') + msg = sys.stdin.read() + h.untrain_ham(msg) + h.store() + + def untrain_spam(self): + h = hammie.open(self.dbname, self.usedb, 'c') + msg = sys.stdin.read() + h.untrain_spam(msg) h.store() From npickett@users.sourceforge.net Mon Dec 2 06:02:05 2002 From: npickett@users.sourceforge.net (Neale Pickett) Date: Sun, 01 Dec 2002 22:02:05 -0800 Subject: [Spambayes-checkins] spambayes storage.py,1.4,1.5 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv15502 Modified Files: storage.py Log Message: * storage.DBDictClassifier.store() now iterates over a list, not a generator. The latter was causing problems when removing keys. Index: storage.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/storage.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** storage.py 27 Nov 2002 23:04:14 -0000 1.4 --- storage.py 2 Dec 2002 06:02:03 -0000 1.5 *************** *** 156,160 **** print 'Persisting',self.db_name,'state in database' ! for key, val in self.wordinfo.iteritems(): if val == None: del self.wordinfo[key] --- 156,162 ---- print 'Persisting',self.db_name,'state in database' ! # Must use .keys() since we modify the dict in the loop ! for key in self.wordinfo.keys(): ! 
val = self.wordinfo[key] if val == None: del self.wordinfo[key] From npickett@users.sourceforge.net Mon Dec 2 06:02:36 2002 From: npickett@users.sourceforge.net (Neale Pickett) Date: Sun, 01 Dec 2002 22:02:36 -0800 Subject: [Spambayes-checkins] spambayes hammiefilter.py,1.4,1.5 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv18116 Modified Files: hammiefilter.py Log Message: * Added -G and -S options to hammiefilter ;) Index: hammiefilter.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/hammiefilter.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** hammiefilter.py 2 Dec 2002 04:43:37 -0000 1.4 --- hammiefilter.py 2 Dec 2002 06:02:34 -0000 1.5 *************** *** 106,110 **** h = HammieFilter() action = h.filter ! opts, args = getopt.getopt(sys.argv[1:], 'hngs') for opt, arg in opts: if opt == '-h': --- 106,110 ---- h = HammieFilter() action = h.filter ! opts, args = getopt.getopt(sys.argv[1:], 'hngsGS') for opt, arg in opts: if opt == '-h': *************** *** 114,117 **** --- 114,121 ---- elif opt == '-s': action = h.train_spam + elif opt == '-G': + action = h.untrain_ham + elif opt == '-S': + action = h.untrain_spam elif opt == "-n": action = h.newdb From npickett at users.sourceforge.net Mon Dec 2 18:59:21 2002 From: npickett at users.sourceforge.net (Neale Pickett) Date: Mon, 02 Dec 2002 10:59:21 -0800 Subject: [Spambayes-checkins] spambayes HAMMIE.txt,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv10402 Added Files: HAMMIE.txt Log Message: * Added 3-minute hammie setup tutorial. There should probably be a doc/ directory for stuff like this. --- NEW FILE: HAMMIE.txt --- 3-Minute Procmail-based Setup ------------------------------------- What you will have after doing all this: * All your existing mail will have a new "X-Spambayes-Trained" header. 
Spambayes uses this to keep track of which messages it's already learned about. * Spambayes will look at all your incoming mail. Messages it thinks are spam will be put in a "spam" mail folder. Everything else will be delivered normally. * Every morning, spambayes will go through your mail folders and train itself on any new messages. It will also pick up mail that's been re-filed: something it thought was ham but was actually spam, and vice-versa. Be sure to keep spam in your spam folder for at least a day or two before deleting it--I suggest keeping it for a full year, just in case you need to re-train spambayes. ----- What you need: * the spambayes package * Python 2.2.2 or newer * a text editor * procmail (most systems have this) * a working crond (most systems have this) * (optional) a mailbox full of spam and a mailbox full of ham ----- Instructions: 1. Download the spambayes package. I'll assume you've put it in $HOME/src/spambayes. 2. Create a new database: $HOME/src/hammiefilter.py -n 3. (optional) Train it on your existing mail: $HOME/src/mboxtrain.py -d $HOME/.hammiedb -g $HOME/Mail/inbox -s $HOME/Mail/spam You can add additional folder names if you like, using -g for "good" mail folders, and -s for "spam" folders. 4. Add the following two recipes to the top of your .procmailrc: :0fw | $HOME/src/hammiefilter.py :0 $HOME/Maildir/.spam/ The above is for the maildir message format. If you need mbox (the default on many systems) or MH, the second recipe should look something like this: :0: $HOME/Mail/spam If you're not sure what format you use, ask your system administrator. If you are the system administrator, check the documentation of your mail program. With the notable exception of Pine, which can only read mbox format unless patched, most modern MUAs can handle both Maildir and mbox formats. 5. 
Add the following cron job ("crontab -e" with vixie cron, the default on most Linux systems): 21 2 * * * $HOME/src/mboxtrain.py -d $HOME/.hammiedb -g $HOME/Mail/inbox -s $HOME/Mail/spam As in step 3, you can add additional folder names here too. It's important to do so if you regularly file mail in different folders, since otherwise spambayes will never learn anything about those messages. ----- That's it! You're all done. If you have questions or comments about these instructions, please mail them to neale-spambayes@woozle.org. Neale Pickett From npickett at users.sourceforge.net Mon Dec 2 19:03:52 2002 From: npickett at users.sourceforge.net (Neale Pickett) Date: Mon, 02 Dec 2002 11:03:52 -0800 Subject: [Spambayes-checkins] spambayes HAMMIE.txt,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv15779 Modified Files: HAMMIE.txt Log Message: * A little touch-up Index: HAMMIE.txt =================================================================== RCS file: /cvsroot/spambayes/spambayes/HAMMIE.txt,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** HAMMIE.txt 2 Dec 2002 18:59:18 -0000 1.1 --- HAMMIE.txt 2 Dec 2002 19:03:46 -0000 1.2 *************** *** 67,72 **** MUAs can handle both Maildir and mbox formats. ! 5. Add the following cron job ("crontab -e" with vixie cron, the ! default on most Linux systems): 21 2 * * * $HOME/src/mboxtrain.py -d $HOME/.hammiedb -g $HOME/Mail/inbox -s $HOME/Mail/spam --- 67,73 ---- MUAs can handle both Maildir and mbox formats. ! 5. Add the following cron job to train on new or refiled messages every ! morning at 2:21am ("crontab -e" with vixie cron, the default on most ! Linux systems): 21 2 * * * $HOME/src/mboxtrain.py -d $HOME/.hammiedb -g $HOME/Mail/inbox -s $HOME/Mail/spam *************** *** 77,80 **** --- 78,86 ---- messages. + 6. Spambayes should now be filtering all your mail and training itself + on your mailboxes. Occasionally a message will be misfiled. 
Just + move that message to the correct folder, and spambayes will learn + from its mistake the next morning. + ----- From npickett at users.sourceforge.net Mon Dec 2 20:23:43 2002 From: npickett at users.sourceforge.net (Neale Pickett) Date: Mon, 02 Dec 2002 12:23:43 -0800 Subject: [Spambayes-checkins] spambayes anydbm.py,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv2558 Modified Files: anydbm.py Log Message: * dbhash on windows is just as broken as dbm, but bsddb3 is apparently okay. So anydbm.py now checks for that instead of dbhash. Index: anydbm.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/anydbm.py,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** anydbm.py 1 Dec 2002 05:04:03 -0000 1.2 --- anydbm.py 2 Dec 2002 20:23:39 -0000 1.3 *************** *** 1,2 **** --- 1,3 ---- + #! /usr/bin/env python """Generic interface to all dbm clones. *************** *** 17,21 **** if sys.platform in ["win32"]: # dbm on windows is awful. ! _names = ["dbhash", "gdbm", "dumbdbm"] else: _names = ["dbhash", "gdbm", "dbm", "dumbdbm"] --- 18,22 ---- if sys.platform in ["win32"]: # dbm on windows is awful. ! _names = ["bsddb3", "gdbm", "dumbdbm"] else: _names = ["dbhash", "gdbm", "dbm", "dumbdbm"] From npickett at users.sourceforge.net Mon Dec 2 22:02:28 2002 From: npickett at users.sourceforge.net (Neale Pickett) Date: Mon, 02 Dec 2002 14:02:28 -0800 Subject: [Spambayes-checkins] spambayes anydbm.py,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv7838 Modified Files: anydbm.py Log Message: * Didn't read Richie's post closely enough. Until we figure out a way to use bsddb3 gracefully, Windows folks don't get dbhash or bsddb3 (which didn't work anyway). 
Index: anydbm.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/anydbm.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** anydbm.py 2 Dec 2002 20:23:39 -0000 1.3 --- anydbm.py 2 Dec 2002 22:02:23 -0000 1.4 *************** *** 18,22 **** if sys.platform in ["win32"]: # dbm on windows is awful. ! _names = ["bsddb3", "gdbm", "dumbdbm"] else: _names = ["dbhash", "gdbm", "dbm", "dumbdbm"] --- 18,22 ---- if sys.platform in ["win32"]: # dbm on windows is awful. ! _names = ["gdbm", "dumbdbm"] else: _names = ["dbhash", "gdbm", "dbm", "dumbdbm"] From npickett at users.sourceforge.net Tue Dec 3 20:11:23 2002 From: npickett at users.sourceforge.net (Neale Pickett) Date: Tue, 03 Dec 2002 12:11:23 -0800 Subject: [Spambayes-checkins] spambayes dbmstorage.py,NONE,1.1 Options.py,1.78,1.79 storage.py,1.5,1.6 anydbm.py,1.4,NONE Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv26221 Modified Files: Options.py storage.py Added Files: dbmstorage.py Removed Files: anydbm.py Log Message: * New option "dbm_type" which can be "best", "db3hash", "dbhash", "gdbm", or "dumbdbm". If it's "best", then the best available dbm implementation will be used. Note that "best" on Windows excludes "dbhash". --- NEW FILE: dbmstorage.py --- """Wrapper to open an appropriate dbm storage type.""" from Options import options import sys class error(Exception): pass def open_db3hash(*args): """Open a bsddb3 hash.""" import bsddb3 return bsddb3.hashopen(*args) def open_dbhash(*args): """Open a bsddb hash. 
Don't use this on Windows.""" import bsddb return bsddb.hashopen(*args) def open_gdbm(*args): """Open a gdbm database.""" import gdbm return gdbm.open(*args) def open_dumbdbm(*args): """Open a dumbdbm database.""" import dumbdbm return dumbdbm.open(*args) def open_best(*args): if sys.platform == "win32": funcs = [open_db3hash, open_gdbm, open_dumbdbm] else: funcs = [open_db3hash, open_dbhash, open_gdbm, open_dumbdbm] for f in funcs: try: return f(*args) except ImportError: pass raise error("No dbm modules available!") open_funcs = { "best": open_best, "db3hash": open_db3hash, "dbhash": open_dbhash, "gdbm": open_gdbm, "dumbdbm": open_dumbdbm, } def open(*args): dbm_type = options.dbm_type.lower() f = open_funcs.get(dbm_type) if not f: raise error("Unknown dbm type in options file") return f(*args) Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Options.py,v retrieving revision 1.78 retrieving revision 1.79 diff -C2 -d -r1.78 -r1.79 *** Options.py 26 Nov 2002 00:43:51 -0000 1.78 --- Options.py 3 Dec 2002 20:11:13 -0000 1.79 *************** *** 373,376 **** --- 373,380 ---- [globals] verbose: False + # What DBM storage type should we use? Must be best, db3hash, dbhash, + # gdbm, dumbdbm. Windows folk should steer clear of dbhash. Default is + # "best", which will pick the best DBM type available on your platform. 
+ dbm_type: best """ *************** *** 461,464 **** --- 465,469 ---- }, 'globals': {'verbose': boolean_cracker, + 'dbm_type': string_cracker, }, } Index: storage.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/storage.py,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** storage.py 2 Dec 2002 06:02:03 -0000 1.5 --- storage.py 3 Dec 2002 20:11:18 -0000 1.6 *************** *** 52,55 **** --- 52,56 ---- import errno import shelve + import dbmstorage PICKLE_TYPE = 1 *************** *** 131,135 **** print 'Loading state from',self.db_name,'database' ! self.db = shelve.DbfilenameShelf(self.db_name, self.mode) if self.db.has_key(self.statekey): --- 132,137 ---- print 'Loading state from',self.db_name,'database' ! self.dbm = dbmstorage.open(self.db_name, self.mode) ! self.db = shelve.Shelf(self.dbm) if self.db.has_key(self.statekey): --- anydbm.py DELETED --- From richiehindle at users.sourceforge.net Tue Dec 3 21:22:27 2002 From: richiehindle at users.sourceforge.net (Richie Hindle) Date: Tue, 03 Dec 2002 13:22:27 -0800 Subject: [Spambayes-checkins] spambayes pop3proxy.py,1.31,1.32 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv22329 Modified Files: pop3proxy.py Log Message: Fixed the web interface's "Word query" form, which had been broken by the Classifier abstraction work (my fault). Index: pop3proxy.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v retrieving revision 1.31 retrieving revision 1.32 diff -C2 -d -r1.31 -r1.32 *** pop3proxy.py 28 Nov 2002 22:02:46 -0000 1.31 --- pop3proxy.py 3 Dec 2002 21:22:22 -0000 1.32 *************** *** 101,104 **** --- 101,105 ---- Debugger into a library. o Cope with the email client timing out and closing the connection. + o Lose the trailing dot from cached messages. 
*************** *** 1195,1200 **** word = params['word'] word = word.lower() ! try: ! wi = state.bayes.wordinfo[word] members = wi.__dict__ members['spamprob'] = state.bayes.probability(wi) --- 1196,1201 ---- word = params['word'] word = word.lower() ! wi = state.bayes._wordinfoget(word) ! if wi: members = wi.__dict__ members['spamprob'] = state.bayes.probability(wi) *************** *** 1203,1207 **** Probability that a message containing this word is spam: %(spamprob)f.
    """ % members ! except KeyError: info = "%r does not appear in the database." % word --- 1204,1208 ---- Probability that a message containing this word is spam: %(spamprob)f.
    """ % members ! else: info = "%r does not appear in the database." % word From mhammond at users.sourceforge.net Fri Dec 6 12:56:54 2002 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Fri, 06 Dec 2002 04:56:54 -0800 Subject: [Spambayes-checkins] spambayes Corpus.py,1.6,1.7 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv14671 Modified Files: Corpus.py Log Message: Fix whitespace. Index: Corpus.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Corpus.py,v retrieving revision 1.6 retrieving revision 1.7 diff -C2 -d -r1.6 -r1.7 *** Corpus.py 1 Dec 2002 21:43:19 -0000 1.6 --- Corpus.py 6 Dec 2002 12:56:51 -0000 1.7 *************** *** 368,372 **** match = hdrregex.findall(data) ! return match def getHeaders(self): --- 368,372 ---- match = hdrregex.findall(data) ! return match def getHeaders(self): From npickett at users.sourceforge.net Fri Dec 6 16:12:50 2002 From: npickett at users.sourceforge.net (Neale Pickett) Date: Fri, 06 Dec 2002 08:12:50 -0800 Subject: [Spambayes-checkins] spambayes HAMMIE.txt,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv28286 Modified Files: HAMMIE.txt Log Message: * Fixed glaring error in procmail recipie (thanks Don Marti) Index: HAMMIE.txt =================================================================== RCS file: /cvsroot/spambayes/spambayes/HAMMIE.txt,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** HAMMIE.txt 2 Dec 2002 19:03:46 -0000 1.2 --- HAMMIE.txt 6 Dec 2002 16:12:39 -0000 1.3 *************** *** 52,55 **** --- 52,56 ---- :0 + * ^X-Spambayes-Classification: spam $HOME/Maildir/.spam/ From npickett at users.sourceforge.net Fri Dec 6 16:14:48 2002 From: npickett at users.sourceforge.net (Neale Pickett) Date: Fri, 06 Dec 2002 08:14:48 -0800 Subject: [Spambayes-checkins] spambayes HAMMIE.txt,1.3,1.4 Message-ID: Update of 
/cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv29084 Modified Files: HAMMIE.txt Log Message: * Guh, fixed same error in a second place. Index: HAMMIE.txt =================================================================== RCS file: /cvsroot/spambayes/spambayes/HAMMIE.txt,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** HAMMIE.txt 6 Dec 2002 16:12:39 -0000 1.3 --- HAMMIE.txt 6 Dec 2002 16:14:28 -0000 1.4 *************** *** 60,63 **** --- 60,64 ---- :0: + * ^X-Spambayes-Classification: spam $HOME/Mail/spam From montanaro at users.sourceforge.net Sat Dec 7 16:58:38 2002 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Sat, 07 Dec 2002 08:58:38 -0800 Subject: [Spambayes-checkins] spambayes rates.py,1.7,1.8 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv22023 Modified Files: rates.py Log Message: guard against short lines Index: rates.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/rates.py,v retrieving revision 1.7 retrieving revision 1.8 diff -C2 -d -r1.7 -r1.8 *** rates.py 25 Sep 2002 02:22:15 -0000 1.7 --- rates.py 7 Dec 2002 16:58:35 -0000 1.8 *************** *** 82,86 **** # 0 1 2 3 4 5 # -> 1 new false positives ! if fields[3] == 'new' and fields[4] == 'false': kind = fields[-1] count = int(fields[2]) --- 82,86 ---- # 0 1 2 3 4 5 # -> 1 new false positives ! 
if len(fields) >= 5 and fields[3] == 'new' and fields[4] == 'false': kind = fields[-1] count = int(fields[2]) From montanaro at users.sourceforge.net Sun Dec 8 17:38:28 2002 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Sun, 08 Dec 2002 09:38:28 -0800 Subject: [Spambayes-checkins] spambayes loosecksum.py,1.3,1.4 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv21036 Modified Files: loosecksum.py Log Message: delete pmguid: tokens which turn up occasionally and kill the loose checksum Index: loosecksum.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/loosecksum.py,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** loosecksum.py 23 Sep 2002 21:20:10 -0000 1.3 --- loosecksum.py 8 Dec 2002 17:38:26 -0000 1.4 *************** *** 66,69 **** --- 66,71 ---- # delete anything that looks like a url (catch bare urls) data = re.sub(r"(?i)(ftp|http|gopher)://[-a-z0-9_/?&%@=+:;#!~|.,$*]+", "", data) + # delete pmguid: stuff (turns up frequently) + data = re.sub(r"pmguid:[^.\s]+(\.[^.\s]+)*", "", data) # throw away everything other than alpha & digits return re.sub(r"[^A-Za-z0-9]+", "", data) From mhammond at users.sourceforge.net Mon Dec 9 00:36:20 2002 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Mon Dec 9 03:36:23 2002 Subject: [Spambayes-checkins] spambayes classifier.py,1.61,1.62 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv22625 Modified Files: classifier.py Log Message: nham should decrement on unlearn (rather than doubly negating which results in increment!) 
At least Outlook now has a test suite that picks this up Index: classifier.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/classifier.py,v retrieving revision 1.61 retrieving revision 1.62 diff -C2 -d -r1.61 -r1.62 *** classifier.py 27 Nov 2002 22:37:55 -0000 1.61 --- classifier.py 9 Dec 2002 08:36:17 -0000 1.62 *************** *** 408,412 **** if self.nham <= 0: raise ValueError("non-spam count would go negative!") ! self.nham -= -1 for word in Set(wordstream): --- 408,412 ---- if self.nham <= 0: raise ValueError("non-spam count would go negative!") ! self.nham -= 1 for word in Set(wordstream): From mhammond at users.sourceforge.net Mon Dec 9 01:18:40 2002 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Mon Dec 9 04:18:46 2002 Subject: [Spambayes-checkins] spambayes/Outlook2000 tester.py,NONE,1.1 addin.py,1.40,1.41 train.py,1.20,1.21 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory sc8-pr-cvs1:/tmp/cvs-serv14252 Modified Files: addin.py train.py Added Files: tester.py Log Message: Add a fairly comprehensive (of Outlook's code) test suite. --- NEW FILE: tester.py --- # unit tester for the Outlook addin. # # Note we are only attempting to test Outlook specific # functionality, such as filters, etc. # # General process is to create test messages known to contain ham/spam # keywords, and tracking their progress through the filters. We also # move this test message back around, and watch the incremental retrain # in action. Also checks that the message correctly remains classified # after a message move. 
from win32com.client import constants from time import sleep HAM="ham" SPAM="spam" UNSURE="unsure" TEST_SUBJECT = "SpamBayes addin auto-generated test message" class TestFailure(Exception): pass def TestFailed(msg): raise TestFailure(msg) def WaitForFilters(): import pythoncom for i in range(100): pythoncom.PumpWaitingMessages() sleep(0.01) # Find the top 'n' words in the Spam database that are clearly # marked as either ham or spam. Simply enumerates the # bayes word list looking for any word with zero count in the # non-requested category. def FindTopWords(bayes, num, get_spam): items = [] for word, info in bayes.wordinfo.items(): if ":" in word: continue if get_spam: if info.hamcount==0: items.append((info.spamcount, word)) else: if info.spamcount==0: items.append((info.hamcount, word)) items.sort() return [item[1] for item in items] # A little driver/manager for our tests class Driver: def __init__(self, mgr): if mgr is None: import manager mgr = manager.GetManager() self.manager = mgr # Remember the "spam" folder. folder = mgr.message_store.GetFolder(mgr.config.filter.spam_folder_id) self.folder_spam = folder.GetOutlookItem() # Remember the "unsure" folder. folder = mgr.message_store.GetFolder(mgr.config.filter.unsure_folder_id) self.folder_unsure = folder.GetOutlookItem() # The "watch" folder is a folder we can stick stuff into to have them # filtered - just use the first one nominated. for folder in mgr.message_store.GetFolderGenerator( mgr.config.filter.watch_folder_ids, mgr.config.filter.watch_include_sub): self.folder_watch = folder.GetOutlookItem() break # And the drafts folder where new messages are created. 
self.folder_drafts = mgr.outlook.Session.GetDefaultFolder(constants.olFolderDrafts) def FindTestMessage(self, folder): subject = TEST_SUBJECT items = folder.Items return items.Find("[Subject] = '%s'" % (subject,)) def _CleanTestMessageFromFolder(self, folder): subject = TEST_SUBJECT num = 0 while True: msg = self.FindTestMessage(folder) if msg is None: break msg.Delete() num += 1 if num: print "Cleaned %d test messages from folder '%s'" % (num, folder.Name) def CleanAllTestMessages(self): subject = TEST_SUBJECT self._CleanTestMessageFromFolder(self.folder_spam) self._CleanTestMessageFromFolder(self.folder_unsure) self._CleanTestMessageFromFolder(self.folder_watch) self._CleanTestMessageFromFolder(self.folder_drafts) def CreateTestMessageInFolder(self, spam_status, folder): msg = self.CreateTestMessage(spam_status) msg.Save() # Put into "Drafts". assert self.FindTestMessage(self.folder_drafts) is not None # Move it to the specified folder msg.Move(folder) # And now find it in the specified folder return self.FindTestMessage(folder) def CreateTestMessage(self, spam_status): words = [] if spam_status != SPAM: words.extend(FindTopWords(self.manager.bayes, 50, False)) if spam_status != HAM: words.extend(FindTopWords(self.manager.bayes, 50, True)) # Create a new blank message with our words msg = self.manager.outlook.CreateItem(0) msg.Body = "\n".join(words) msg.Subject = TEST_SUBJECT return msg # The tests themselves. # The "spam" test is huge - we do standard filter tests, but # also do incremental retrain tests. def TestSpamFilter(driver): nspam = driver.manager.bayes.nspam nham = driver.manager.bayes.nham import copy original_bayes = copy.copy(driver.manager.bayes) # Create a spam message in the Inbox - it should get immediately filtered msg = driver.CreateTestMessageInFolder(SPAM, driver.folder_watch) # sleep to ensure filtering. WaitForFilters() # It should no longer be in the Inbox. 
if driver.FindTestMessage(driver.folder_watch) is not None: TestFailed("The test message appeared to not be filtered") # It should be in the "sure is spam" folder. spam_msg = driver.FindTestMessage(driver.folder_spam) if spam_msg is None: TestFailed("The test message vanished from the Inbox, but didn't appear in Spam") # Check that none of the above caused training. if nspam != driver.manager.bayes.nspam: TestFailed("Something caused a new spam message to appear") if nham != driver.manager.bayes.nham: TestFailed("Something caused a new ham message to appear") # Now move the message back to the inbox - it should get trained. store_msg = driver.manager.message_store.GetMessage(spam_msg) import train if train.been_trained_as_ham(store_msg, driver.manager): TestFailed("This new spam message should not have been trained as ham yet") if train.been_trained_as_spam(store_msg, driver.manager): TestFailed("This new spam message should not have been trained as spam yet") spam_msg.Move(driver.folder_watch) WaitForFilters() spam_msg = driver.FindTestMessage(driver.folder_watch) store_msg = driver.manager.message_store.GetMessage(spam_msg) need_untrain = True try: if nspam != driver.manager.bayes.nspam: TestFailed("There were not the same number of spam messages after a re-train") if nham+1 != driver.manager.bayes.nham: TestFailed("There was not one more ham messages after a re-train") if train.been_trained_as_spam(store_msg, driver.manager): TestFailed("This new spam message should not have been trained as spam yet") if not train.been_trained_as_ham(store_msg, driver.manager): TestFailed("This new spam message should have been trained as ham now") # Now move it back to the Spam folder. 
# This should see the message un-trained as ham, and re-trained as Spam spam_msg.Move(driver.folder_spam) WaitForFilters() spam_msg = driver.FindTestMessage(driver.folder_spam) if spam_msg is None: TestFailed("Could not find the message in the Spam folder") store_msg = driver.manager.message_store.GetMessage(spam_msg) if nspam +1 != driver.manager.bayes.nspam: TestFailed("There should be one more spam now") if nham != driver.manager.bayes.nham: TestFailed("There should be the same number of hams again") if not train.been_trained_as_spam(store_msg, driver.manager): TestFailed("This new spam message should have been trained as spam by now") if train.been_trained_as_ham(store_msg, driver.manager): TestFailed("This new spam message should have been un-trained as ham") # Move the message to another folder, and make sure we still # identify it correctly as having been trained. # Move to the "unsure" folder, just cos we know about it, and # we know that no special watching of this folder exists. spam_msg.Move(driver.folder_unsure) spam_msg = driver.FindTestMessage(driver.folder_unsure) if spam_msg is None: TestFailed("Could not find the message in the Unsure folder") store_msg = driver.manager.message_store.GetMessage(spam_msg) if not train.been_trained_as_spam(store_msg, driver.manager): TestFailed("Message was not identified as Spam after moving") # Now undo the damage we did. was_spam = train.untrain_message(store_msg, driver.manager) if not was_spam: TestFailed("Untraining this message did not indicate it was spam") need_untrain = False finally: if need_untrain: train.untrain_message(store_msg, driver.manager) # Check all the counts are back where we started. 
if nspam != driver.manager.bayes.nspam: TestFailed("Spam count didn't get back to the same") if nham != driver.manager.bayes.nham: TestFailed("Ham count didn't get back to the same") if driver.manager.bayes.wordinfo != original_bayes.wordinfo: TestFailed("The bayes object's 'wordinfo' did not compare the same at the end of all this!") if driver.manager.bayes.probcache != original_bayes.probcache: TestFailed("The bayes object's 'probcache' did not compare the same at the end of all this!") spam_msg.Delete() print "Created a Spam message, and saw it get filtered and trained." def TestHamFilter(driver): # Create a spam message in the Inbox - it should get immediately filtered msg = driver.CreateTestMessageInFolder(HAM, driver.folder_watch) # sleep to ensure filtering. WaitForFilters() # It should still be in the Inbox. if driver.FindTestMessage(driver.folder_watch) is None: TestFailed("The test ham message appeared to have been filtered!") msg.Delete() print "Created a Ham message, and saw it remain in place." def TestUnsureFilter(driver): # Create a spam message in the Inbox - it should get immediately filtered msg = driver.CreateTestMessageInFolder(UNSURE, driver.folder_watch) # sleep to ensure filtering. WaitForFilters() # It should no longer be in the Inbox. if driver.FindTestMessage(driver.folder_watch) is not None: TestFailed("The test unsure message appeared to not be filtered") # It should be in the "unsure" folder. spam_msg = driver.FindTestMessage(driver.folder_unsure) if spam_msg is None: TestFailed("The test message vanished from the Inbox, but didn't appear in Unsure") spam_msg.Delete() print "Created an unsure message, and saw it get filtered" def test(manager = None): # Run the tests - called from our plugin. 
driver = Driver(manager) assert driver.manager.config.filter.enabled, "Filtering must be enabled for these tests" assert driver.manager.config.training.train_recovered_spam and \ driver.manager.config.training.train_manual_spam, "Incremental training must be enabled for these tests" driver.CleanAllTestMessages() TestSpamFilter(driver) TestUnsureFilter(driver) TestHamFilter(driver) driver.CleanAllTestMessages() if __name__=='__main__': print "NOTE: This will NOT work from the command line" print "(it nearly will, and is useful for debugging the tests" print "themselves, so we will run them anyway!)" test() Index: addin.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.40 retrieving revision 1.41 diff -C2 -d -r1.40 -r1.41 *** addin.py 27 Nov 2002 05:49:52 -0000 1.40 --- addin.py 9 Dec 2002 09:18:38 -0000 1.41 *************** *** 264,267 **** --- 264,279 ---- new_msg.Display() + # A hook for whatever tests we have setup + def Tester(manager): + import tester, traceback + try: + print "Executing automated tests..." + tester.test(manager) + print "Tests worked." + except: + traceback.print_exc() + print "Tests FAILED. Sorry about that. If I were you, I would do a full re-train ASAP" + print "Please delete any test messages from your Spam, Unsure or Inbox folders first." + # The "Delete As Spam" and "Recover Spam" button # The event from Outlook's explorer that our folder has changed. 
*************** *** 424,427 **** --- 436,444 ---- Caption="Show spam clues for current message", Enabled=True) + # If we are running from Python sources, enable a few extra items + if not hasattr(sys, "frozen"): + self._AddPopup(popup, Tester, (self.manager,), + Caption="Execute test suite", + Enabled=True) self.have_setup_ui = True Index: train.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/train.py,v retrieving revision 1.20 retrieving revision 1.21 diff -C2 -d -r1.20 -r1.21 *** train.py 25 Nov 2002 20:52:49 -0000 1.20 --- train.py 9 Dec 2002 09:18:38 -0000 1.21 *************** *** 58,61 **** --- 58,78 ---- return True + # Untrain a message. + # Return: None == not previously trained + # True == was_spam + # False == was_ham + def untrain_message(msg, mgr): + from tokenizer import tokenize + stream = msg.GetEmailPackageObject() + if been_trained_as_spam(msg, mgr): + assert not been_trained_as_ham(msg, mgr), "Can't have been both!" + mgr.bayes.unlearn(tokenize(stream), True) + return True + if been_trained_as_ham(msg, mgr): + assert not been_trained_as_spam(msg, mgr), "Can't have been both!" + mgr.bayes.unlearn(tokenize(stream), False) + return False + return None + def train_folder(f, isspam, mgr, progress): num = num_added = 0 From montanaro at users.sourceforge.net Tue Dec 10 20:56:32 2002 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Tue Dec 10 23:56:36 2002 Subject: [Spambayes-checkins] spambayes setup.py,1.9,1.10 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv5450 Modified Files: setup.py Log Message: * neiltrain.py disappeared * add several missing modules (are more missing?) 
Index: setup.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/setup.py,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** setup.py 27 Oct 2002 21:59:55 -0000 1.9 --- setup.py 11 Dec 2002 04:56:30 -0000 1.10 *************** *** 16,20 **** 'mboxcount.py', 'mboxtest.py', - 'neiltrain.py', 'cmp.py', 'table.py', --- 16,19 ---- *************** *** 25,28 **** --- 24,31 ---- 'hammie', 'msgs', + 'storage', + 'dbmstorage', + 'Corpus', + 'hammiebulk', 'chi2', 'Histogram', From montanaro at users.sourceforge.net Tue Dec 10 20:57:26 2002 From: montanaro at users.sourceforge.net (Skip Montanaro) Date: Tue Dec 10 23:57:30 2002 Subject: [Spambayes-checkins] spambayes Options.py,1.79,1.80 tokenizer.py,1.70,1.71 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv5498 Modified Files: Options.py tokenizer.py Log Message: New option summarize_email_prefixes attempts to capitalize on the fact that spam is often sent to multiple similar addresses. Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Options.py,v retrieving revision 1.79 retrieving revision 1.80 diff -C2 -d -r1.79 -r1.80 *** Options.py 3 Dec 2002 20:11:13 -0000 1.79 --- Options.py 11 Dec 2002 04:57:24 -0000 1.80 *************** *** 105,108 **** --- 105,111 ---- generate_long_skips: True + # Try to capitalize on mail sent to multiple similar addresses. + summarize_email_prefixes: False + # # Length of words that triggers 'long skips'. 
Longer than this *************** *** 390,393 **** --- 393,397 ---- 'record_header_absence': boolean_cracker, 'generate_long_skips': boolean_cracker, + 'summarize_email_prefixes': boolean_cracker, 'skip_max_word_size': int_cracker, 'extract_dow': boolean_cracker, Index: tokenizer.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/tokenizer.py,v retrieving revision 1.70 retrieving revision 1.71 diff -C2 -d -r1.70 -r1.71 *** tokenizer.py 24 Nov 2002 07:41:03 -0000 1.70 --- tokenizer.py 11 Dec 2002 04:57:24 -0000 1.71 *************** *** 12,15 **** --- 12,16 ---- import math import time + import os from sets import Set *************** *** 1136,1139 **** --- 1137,1175 ---- yield "%s:no real name:2**%d" % (field, round(log2(noname_count))) + + # Spammers sometimes send out mail alphabetically to fairly large + # numbers of addresses. This results in headers like: + # To: + # Cc: , , + # , , + # + # + # This token attempts to exploit that property. The above would + # give a common prefix of "itinera" for 6 addresses, yielding a + # gross score of 42. We group scores into buckets by dividing by 10 + # to yield a final token value of "pfxlen:04". The length test + # eliminates the bad case where the message was sent to a single + # individual. + if options.summarize_email_prefixes: + all_addrs = [] + addresses = msg.get_all('to', []) + msg.get_all('cc', []) + for name, addr in email.Utils.getaddresses(addresses): + all_addrs.append(addr.lower()) + + if len(all_addrs) > 1: + # don't be fooled by "os.path." - commonprefix + # operates char-by-char! + pfx = os.path.commonprefix(all_addrs) + if pfx: + score = (len(pfx) * len(all_addrs)) // 10 + # After staring at pflen:* values generated from a large + # number of ham & spam I saw that any scores greater + # than 3 were always associated with spam. Collapsing + # all such scores into a single token avoids a bunch of + # hapaxes like "pfxlen:28". 
+ if score > 3: + yield "pfxlen:big" + else: + yield "pfxlen:%d" % score # To: From npickett at users.sourceforge.net Wed Dec 11 08:21:21 2002 From: npickett at users.sourceforge.net (Neale Pickett) Date: Wed Dec 11 11:21:25 2002 Subject: [Spambayes-checkins] spambayes mboxtrain.py,1.1,1.2 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv7361 Modified Files: mboxtrain.py Log Message: * Should fix the deletion of already-trained-mail in an mbox Index: mboxtrain.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/mboxtrain.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** mboxtrain.py 2 Dec 2002 04:43:37 -0000 1.1 --- mboxtrain.py 11 Dec 2002 16:21:15 -0000 1.2 *************** *** 140,146 **** sys.stdout.write(" %s\r" % counter) sys.stdout.flush() ! if not msg_train(h, msg, is_spam, force): ! continue ! trained += 1 # Write it out with the Unix "From " line outf.write(msg.as_string(True)) --- 140,145 ---- sys.stdout.write(" %s\r" % counter) sys.stdout.flush() ! if msg_train(h, msg, is_spam, force): ! trained += 1 # Write it out with the Unix "From " line outf.write(msg.as_string(True)) From mhammond at users.sourceforge.net Fri Dec 13 01:27:14 2002 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Fri Dec 13 04:27:18 2002 Subject: [Spambayes-checkins] spambayes/Outlook2000 config.py,1.4,1.5 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory sc8-pr-cvs1:/tmp/cvs-serv16704 Modified Files: config.py Log Message: Prevent accidental setting of config options due to typos etc. 
Index: config.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/config.py,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** config.py 7 Nov 2002 22:30:09 -0000 1.4 --- config.py 13 Dec 2002 09:27:12 -0000 1.5 *************** *** 14,17 **** --- 14,21 ---- self.__dict__.update(kw) + def __setattr__(self, attr, val): + if not self.__dict__.has_key(attr): + raise AttributeError, attr + self.__dict__[attr] = val # Crap state-loading code so when we load an early version of the pickle # any attributes in the new version are considered defaults. *************** *** 42,47 **** class ConfigurationRoot(_ConfigurationContainer): def __init__(self): ! _ConfigurationContainer.__init__(self) ! self.training = _ConfigurationContainer( ham_folder_ids = [], ham_include_sub = False, --- 46,50 ---- class ConfigurationRoot(_ConfigurationContainer): def __init__(self): ! training = _ConfigurationContainer( ham_folder_ids = [], ham_include_sub = False, *************** *** 53,57 **** train_manual_spam = True, ) ! self.filter = _ConfigurationContainer( watch_folder_ids = [], watch_include_sub = False, --- 56,60 ---- train_manual_spam = True, ) ! filter = _ConfigurationContainer( watch_folder_ids = [], watch_include_sub = False, *************** *** 64,68 **** enabled = False, ) ! self.filter_now = _ConfigurationContainer( folder_ids = [], include_sub = False, --- 67,71 ---- enabled = False, ) ! filter_now = _ConfigurationContainer( folder_ids = [], include_sub = False, *************** *** 71,75 **** action_all = True, ) ! self.field_score_name = "Spam" if __name__=='__main__': --- 74,83 ---- action_all = True, ) ! field_score_name = "Spam" ! _ConfigurationContainer.__init__(self, ! training=training, ! filter=filter, ! filter_now = filter_now, ! 
field_score_name = field_score_name) if __name__=='__main__': From mhammond at users.sourceforge.net Fri Dec 13 01:28:12 2002 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Fri Dec 13 04:28:16 2002 Subject: [Spambayes-checkins] spambayes/Outlook2000 manager.py,1.37,1.38 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory sc8-pr-cvs1:/tmp/cvs-serv16930 Modified Files: manager.py Log Message: Some more debugging code to try and track where the database sizes occasionally get screwed. Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.37 retrieving revision 1.38 diff -C2 -d -r1.37 -r1.38 *** manager.py 27 Nov 2002 05:49:53 -0000 1.37 --- manager.py 13 Dec 2002 09:28:09 -0000 1.38 *************** *** 225,231 **** def SaveBayes(self): bayes = self.bayes if self.verbose: ! print ("Saving bayes database with %d spam and %d good messages" % ! (bayes.nspam, bayes.nham)) print " ->", self.bayes_filename cPickle.dump(bayes, open(self.bayes_filename,"wb"), 1) --- 225,237 ---- def SaveBayes(self): bayes = self.bayes + # Try and work out where this count sometimes goes wrong. + if bayes.nspam + bayes.nham != len(self.message_db): + print "WARNING: Bayes database has %d messages, " \ + "but training database has %d" % \ + (bayes.nspam + bayes.nham, len(self.message_db)) + if self.verbose: ! print "Saving bayes database with %d spam and %d good messages" %\ ! 
(bayes.nspam, bayes.nham) print " ->", self.bayes_filename cPickle.dump(bayes, open(self.bayes_filename,"wb"), 1) From mhammond at users.sourceforge.net Sat Dec 14 06:48:43 2002 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Sat Dec 14 09:48:47 2002 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.41,1.42 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory sc8-pr-cvs1:/tmp/cvs-serv4259 Modified Files: addin.py Log Message: Handle failure locating the previously configured folders when starting up - particularly when, say, the "Store ID" of the main store changes - which it does when you reinstall Outlook (even using the old .pst file) or even re-configuring Outlook in "Corporate" mode seems to do it. We disable filtering and display a message box indicating that we should be reconfigured. Also make some incremental trace messages (and comments) clearer they are doing the right thing! Index: addin.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.41 retrieving revision 1.42 diff -C2 -d -r1.41 -r1.42 *** addin.py 9 Dec 2002 09:18:38 -0000 1.41 --- addin.py 14 Dec 2002 14:48:40 -0000 1.42 *************** *** 311,315 **** for msgstore_message in msgstore_messages: # Must train before moving, else we lose the message! ! print "Training on message - ", if train.train_message(msgstore_message, True, self.manager, rescore = True): print "trained as spam" --- 311,316 ---- for msgstore_message in msgstore_messages: # Must train before moving, else we lose the message! ! subject = msgstore_message.GetSubject() ! print "Deleting and spam training message '%s' - " % (subject,), if train.train_message(msgstore_message, True, self.manager, rescore = True): print "trained as spam" *************** *** 318,321 **** --- 319,324 ---- # Now move it. 
msgstore_message.MoveTo(spam_folder) + # Note the move will possibly also trigger a re-train + # but we are smart enough to know we have already done it. class ButtonRecoverFromSpamEvent(ButtonDeleteAsEventBase): *************** *** 345,349 **** for msgstore_message in msgstore_messages: # Must train before moving, else we lose the message! ! print "Training on message - ", if train.train_message(msgstore_message, False, self.manager, rescore = True): print "trained as ham" --- 348,353 ---- for msgstore_message in msgstore_messages: # Must train before moving, else we lose the message! ! subject = msgstore_message.GetSubject() ! print "Recovering and ham training message '%s' - " % (subject,), if train.train_message(msgstore_message, False, self.manager, rescore = True): print "trained as ham" *************** *** 353,356 **** --- 357,362 ---- # XXX - still don't write the source, so no point looking :( msgstore_message.MoveTo(inbox_folder) + # Note the move will possibly also trigger a re-train + # but we are smart enough to know we have already done it. # Helpers to work with images on buttons/toolbars. *************** *** 572,577 **** self.explorers_events._DoNewExplorer(explorer, True) - self.FiltersChanged() if self.manager.config.filter.enabled: try: self.ProcessMissedMessages() --- 578,583 ---- self.explorers_events._DoNewExplorer(explorer, True) if self.manager.config.filter.enabled: + self.FiltersChanged() try: self.ProcessMissedMessages() *************** *** 600,605 **** def FiltersChanged(self): ! # Create a notification hook for all folders we filter. ! self.UpdateFolderHooks() def UpdateFolderHooks(self): --- 606,623 ---- def FiltersChanged(self): ! try: ! # Create a notification hook for all folders we filter. ! self.UpdateFolderHooks() ! except: ! import traceback ! print "Error installing folder hooks." ! traceback.print_exc() ! self.manager.config.filter.enabled = False ! self.manager.SaveConfig() ! win32ui.MessageBox( ! 
"There was an error initializing the Spam plugin\r\n\r\n" ! "Spam filtering has been disabled. Please re-configure\r\n" ! "and re-enable this plugin", ! "Anti-Spam plugin") def UpdateFolderHooks(self): From mhammond at users.sourceforge.net Sun Dec 15 06:04:01 2002 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Sun Dec 15 09:04:08 2002 Subject: [Spambayes-checkins] spambayes/Outlook2000 manager.py,1.38,1.39 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory sc8-pr-cvs1:/tmp/cvs-serv15725 Modified Files: manager.py Log Message: Whitespace normalization Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.38 retrieving revision 1.39 diff -C2 -d -r1.38 -r1.39 *** manager.py 13 Dec 2002 09:28:09 -0000 1.38 --- manager.py 15 Dec 2002 14:03:58 -0000 1.39 *************** *** 230,234 **** "but training database has %d" % \ (bayes.nspam + bayes.nham, len(self.message_db)) ! if self.verbose: print "Saving bayes database with %d spam and %d good messages" %\ --- 230,234 ---- "but training database has %d" % \ (bayes.nspam + bayes.nham, len(self.message_db)) ! if self.verbose: print "Saving bayes database with %d spam and %d good messages" %\ From mhammond at users.sourceforge.net Sun Dec 15 06:05:38 2002 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Sun Dec 15 09:05:42 2002 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.42,1.43 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory sc8-pr-cvs1:/tmp/cvs-serv15835 Modified Files: addin.py Log Message: Add a "fix" for multiple Spam buttons sometimes appearing on the toolbar. 
See the URL in the comments - the buttons will still double up occasionally, but when Outlook is restarted it will auto-correct (whereas previously we would just create new buttons) Index: addin.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.42 retrieving revision 1.43 diff -C2 -d -r1.42 -r1.43 *** addin.py 14 Dec 2002 14:48:40 -0000 1.42 --- addin.py 15 Dec 2002 14:05:35 -0000 1.43 *************** *** 152,156 **** # Button/Menu and other UI event handler classes class ButtonEvent: ! def Init(self, handler, args = ()): self.handler = handler self.args = args --- 152,156 ---- # Button/Menu and other UI event handler classes class ButtonEvent: ! def Init(self, handler, *args): self.handler = handler self.args = args *************** *** 393,397 **** def SetupUI(self): - application = self.Application manager = self.manager self.buttons = [] --- 393,396 ---- *************** *** 400,431 **** toolbar = bars.Item("Standard") # Add our "Delete as ..." and "Recover as" buttons ! self.but_delete_as = button = toolbar.Controls.Add( ! Type=constants.msoControlButton, ! Temporary=True) ! # Hook events for the item ! button.BeginGroup = True ! button = DispatchWithEvents(button, ButtonDeleteAsSpamEvent) ! button.Init(self.manager, self) ! self.buttons.append(button) # And again for "Recover as" ! self.but_recover_as = button = toolbar.Controls.Add( ! Type=constants.msoControlButton, ! Temporary=True) ! button = DispatchWithEvents(button, ButtonRecoverFromSpamEvent) ! self.buttons.append(button) ! # Hook our explorer events, and pass the buttons. ! button.Init(self.manager, self) ! ! # And prime our event handler. self.OnFolderSwitch() # The main tool-bar dropdown with all our entries. # Add a pop-up menu to the toolbar ! popup = toolbar.Controls.Add( ! Type=constants.msoControlPopup, ! Temporary=True) ! popup.Caption="Anti-Spam" ! 
popup.TooltipText = "Anti-Spam filters and functions" ! popup.Enabled = True # Convert from "CommandBarItem" to derived # "CommandBarPopup" Not sure if we should be able to work --- 399,427 ---- toolbar = bars.Item("Standard") # Add our "Delete as ..." and "Recover as" buttons ! self.but_delete_as = self._AddControl( ! toolbar, ! constants.msoControlButton, ! ButtonDeleteAsSpamEvent, (self.manager, self), ! BeginGroup = True, ! Tag = "SpamBayes.DeleteAsSpam") # And again for "Recover as" ! self.but_recover_as = self._AddControl( ! toolbar, ! constants.msoControlButton, ! ButtonRecoverFromSpamEvent, (self.manager, self), ! Tag = "SpamBayes.RecoverFromSpam") ! # Prime our event handler. self.OnFolderSwitch() # The main tool-bar dropdown with all our entries. # Add a pop-up menu to the toolbar ! popup = self._AddControl( ! toolbar, ! constants.msoControlPopup, ! None, None, ! Caption="Anti-Spam", ! TooltipText = "Anti-Spam filters and functions", ! Enabled = True, ! Tag = "SpamBayes.Popup") # Convert from "CommandBarItem" to derived # "CommandBarPopup" Not sure if we should be able to work *************** *** 435,460 **** popup = CastTo(popup, "CommandBarPopup") # And add our children. ! self._AddPopup(popup, manager.ShowManager, (), Caption="Anti-Spam Manager...", TooltipText = "Show the Anti-Spam manager dialog.", ! Enabled = True) ! self._AddPopup(popup, ShowClues, (self.manager, self), Caption="Show spam clues for current message", ! Enabled=True) # If we are running from Python sources, enable a few extra items if not hasattr(sys, "frozen"): ! self._AddPopup(popup, Tester, (self.manager,), Caption="Execute test suite", ! Enabled=True) self.have_setup_ui = True ! def _AddPopup(self, parent, target, target_args, **item_attrs): ! item = parent.Controls.Add(Type=constants.msoControlButton, Temporary=True) # Hook events for the item ! item = DispatchWithEvents(item, ButtonEvent) ! 
item.Init(target, target_args) for attr, val in item_attrs.items(): setattr(item, attr, val) self.buttons.append(item) def GetSelectedMessages(self, allow_multi = True, explorer = None): --- 431,484 ---- popup = CastTo(popup, "CommandBarPopup") # And add our children. ! self._AddControl(popup, ! constants.msoControlButton, ! ButtonEvent, (manager.ShowManager,), Caption="Anti-Spam Manager...", TooltipText = "Show the Anti-Spam manager dialog.", ! Enabled = True, ! Tag = "SpamBayes.Manager") ! self._AddControl(popup, ! constants.msoControlButton, ! ButtonEvent, (ShowClues, self.manager, self), Caption="Show spam clues for current message", ! Enabled=True, ! Tag = "SpamBayes.Clues") # If we are running from Python sources, enable a few extra items if not hasattr(sys, "frozen"): ! self._AddControl(popup, ! constants.msoControlButton, ! ButtonEvent, (Tester, self.manager), Caption="Execute test suite", ! Enabled=True, ! Tag = "SpamBayes.TestSuite") self.have_setup_ui = True ! def _AddControl(self, ! parent, # who the control is added to ! control_type, # type of control to add. ! events_class, events_init_args, # class/Init() args ! **item_attrs): # extra control attributes. ! # Sigh - sometimes our toolbar etc items will become ! # permanent, even though we make them temporary. ! # I found ! # http://groups.google.com/groups?threadm=eKKmbvQvAHA.1808%40tkmsftngp02 ! # Maybe we should consider making them permanent - this would then ! # allow the user to drag them around the toolbars and have them ! # stick. The downside is that should the user uninstall this addin ! # there is no clean way to remove the buttons. Do we even care? ! assert item_attrs.has_key('Tag'), "Need a 'Tag' attribute!" ! item = self.CommandBars.FindControl( ! Type = control_type, ! Tag = item_attrs['Tag']) ! if item is None: ! item = parent.Controls.Add(Type=control_type, Temporary=True) # Hook events for the item ! if events_class is not None: ! item = DispatchWithEvents(item, events_class) ! 
item.Init(*events_init_args) for attr, val in item_attrs.items(): setattr(item, attr, val) self.buttons.append(item) + return item def GetSelectedMessages(self, allow_multi = True, explorer = None): *************** *** 490,494 **** self.explorer_list = None for button in self.buttons: ! button.Close() self.buttons = [] self.close() # disconnect events. --- 514,520 ---- self.explorer_list = None for button in self.buttons: ! closer = getattr(button, "Close", None) ! if closer is not None: ! closer() self.buttons = [] self.close() # disconnect events. From mhammond at users.sourceforge.net Sun Dec 15 20:12:02 2002 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Sun Dec 15 23:12:05 2002 Subject: [Spambayes-checkins] spambayes/Outlook2000 tester.py,1.1,1.2 train.py,1.21,1.22 Message-ID: Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory sc8-pr-cvs1:/tmp/cvs-serv31561 Modified Files: tester.py train.py Log Message: My test suite needed an "untrain" function - but failed to actually test that it worked . Fixed untrain and the test. 
Index: tester.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/tester.py,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** tester.py 9 Dec 2002 09:18:38 -0000 1.1 --- tester.py 16 Dec 2002 04:11:59 -0000 1.2 *************** *** 195,198 **** --- 195,201 ---- if not was_spam: TestFailed("Untraining this message did not indicate it was spam") + if train.been_trained_as_spam(store_msg, driver.manager) or \ + train.been_trained_as_ham(store_msg, driver.manager): + TestFailed("Untraining this message kept it has ham/spam") need_untrain = False finally: Index: train.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/train.py,v retrieving revision 1.21 retrieving revision 1.22 diff -C2 -d -r1.21 -r1.22 *** train.py 9 Dec 2002 09:18:38 -0000 1.21 --- train.py 16 Dec 2002 04:12:00 -0000 1.22 *************** *** 68,75 **** --- 68,79 ---- assert not been_trained_as_ham(msg, mgr), "Can't have been both!" mgr.bayes.unlearn(tokenize(stream), True) + del mgr.message_db[msg.searchkey] + mgr.bayes_dirty = True return True if been_trained_as_ham(msg, mgr): assert not been_trained_as_spam(msg, mgr), "Can't have been both!" mgr.bayes.unlearn(tokenize(stream), False) + del mgr.message_db[msg.searchkey] + mgr.bayes_dirty = True return False return None From tim_one at users.sourceforge.net Sun Dec 15 21:14:33 2002 From: tim_one at users.sourceforge.net (Tim Peters) Date: Mon Dec 16 00:14:36 2002 Subject: [Spambayes-checkins] spambayes tokenizer.py,1.71,1.72 Message-ID: Update of /cvsroot/spambayes/spambayes In directory sc8-pr-cvs1:/tmp/cvs-serv29458 Modified Files: tokenizer.py Log Message: The Stripper class replaced each stripped-out portion with a single blank. Made this parameterizable (via a class attribute), and changed the default to an empty string. See the comments for why.
Index: tokenizer.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/tokenizer.py,v retrieving revision 1.71 retrieving revision 1.72 diff -C2 -d -r1.71 -r1.72 *** tokenizer.py 11 Dec 2002 04:57:24 -0000 1.71 --- tokenizer.py 16 Dec 2002 05:14:30 -0000 1.72 *************** *** 867,870 **** --- 867,878 ---- class Stripper(object): + + # The retained portions are catenated together with self.separator. + # CAUTION: This used to be blank. But then I noticed spam putting + # HTML comments embedded in words, like + # FREE! + # Breaking this into "FR" and "EE!" wasn't a real help . + separator = '' # a subclass can override if this isn't appropriate + def __init__(self, find_start, find_end): # find_start and find_end have signature *************** *** 903,908 **** break dummy, i = m.span() ! # Replace each skipped portion with a single blank. ! return ' '.join(retained), tokens def tokenize(self, match_object): --- 911,915 ---- break dummy, i = m.span() ! return self.separator.join(retained), tokens def tokenize(self, match_object): *************** *** 914,918 **** # uuencoded stuff. Note that we're not bothering to decode it! Maybe # we should. One of my persistent false negatives is a spam containing ! # nothing but a uuencoded money.txt; OTOH, uuencoded seems to be on # its way out (that's an old spam). --- 921,925 ---- # uuencoded stuff. Note that we're not bothering to decode it! Maybe # we should. One of my persistent false negatives is a spam containing ! # nothing but a uuencoded money.txt; OTOH, uuencode seems to be on # its way out (that's an old spam). From popiel at users.sourceforge.net Wed Dec 18 15:11:42 2002 From: popiel at users.sourceforge.net (T. 
Alexander Popiel) Date: Wed Dec 18 18:11:46 2002 Subject: [Spambayes-checkins] spambayes/hammie - New directory Message-ID: Update of /cvsroot/spambayes/spambayes/hammie In directory sc8-pr-cvs1:/tmp/cvs-serv9769/hammie Log Message: Directory /cvsroot/spambayes/spambayes/hammie added to the repository From popiel at users.sourceforge.net Wed Dec 18 15:14:53 2002 From: popiel at users.sourceforge.net (T. Alexander Popiel) Date: Wed Dec 18 18:14:57 2002 Subject: [Spambayes-checkins] spambayes/hammie BULK.txt,NONE,1.1 bulkgraph.py,NONE,1.1 bulktrain.sh,NONE,1.1 procmailrc,NONE,1.1 Message-ID: Update of /cvsroot/spambayes/spambayes/hammie In directory sc8-pr-cvs1:/tmp/cvs-serv10421/hammie Added Files: BULK.txt bulkgraph.py bulktrain.sh procmailrc Log Message: Added my personal scripts for dealing with retraining on a collection of MH folders, where one folder contains all messages, and other folders contain only spam. --- NEW FILE: BULK.txt --- Alex's spambayes filter scripts ------------------------------- I've finally started using spambayes for my incoming mail filtering. I've got a slightly unusual setup, so I had to write a couple scripts to deal with the nightly retraining... First off, let me describe how I've got things set up. I am an avid (and rather religious) MH user, so my mail folders are of course stored in the MH format (directories full of single-message files, where the filenames are numbers indicating ordering in the folder). I've got four mail folders of interest for this discussion: everything, spam, newspam, and inbox. When mail arrives, it is classified, then immediately copied in the everything folder. If it was classified as spam or ham, it is trained as such, reinforcing the classification. Then, if it was labeled as spam, it goes into the newspam folder; otherwise it goes into my inbox. When I read my mail (from inbox or newspam), I move any confirmed spam into my spam folder; ham may be deleted. 
(Of course, I still have a copy of my ham in the everything folder.) Every night, I run a complete retraining (from cron at 2:10am); it trains on all mail in the everything folder that is less than 4 months old. If a given message has an identical copy in the spam or newspam folder, then it is trained as spam; otherwise it is trained as ham. This does mean that unread unsures will be treated as ham for up to a day; there's few enough of them that I don't care. The four-month age limit will have the effect of expiring old mail out of the training set, which will keep the database size fairly manageable (it's currently just under 10 meg, with 6 days to go until I have 4 months of data). The retraining generates a little report for me each night, showing a graph of my ham and spam levels over time. Here's a sample: | Scanning spamdir (/home/cashew/popiel/Mail/spam): | Scanning spamdir (/home/cashew/popiel/Mail/newspam): | Scanning everything | sshsshsshsshsshsshsshshsshshshshsshshshshshshsshsshshsshssshsshshsshshsshshs | sshshshshsshshsshshshshshssshshshsshsshsshshshshshshsshshhshshsshshshshssshs | sshshsssshs | 154 | 152| | 144| | 136| | 128| h | 120| h s | 112| s ss ss s h s ss | 104| ss ss ss sHs h s ss | 96| s ss s sH s ss sHs h Sss ss | 88| h ss s sss ss sH sss ssssHHhS sSsssss | 80| s sSH ss ssssss sssssH HssssHsHHHSS sSsssss | 72| ssHSH ssssssssssssHHsHSHssHsHsHHHSSssSsssss | 64| s s s s sHsHSHsssssssHsHsssHHsHSHssHsHsHHHSSssSsssss | 56| s sss ss sssssHHHSHsHsssHsHHHHssHHsHSHHsHHHsHHHSSsHSsssss | 48| ssssssssssssssHHHSHHHHssHsHHHHHsHHsHSHHsHHHsHHHSSsHSssHsss | 40| ssssssssssHsHHHHHSHHHHHsHsHHHHHHHHHHSHHsHHHHHHHSSsHSHsHHss | 32| ssHHssHsssHHHHHHHSHHHHHHHsHHHHHHHHHHSHHsHHHHHHHSSHHSHHHHHs | 24| ssHHHHHHHsHHHHHHHSHHHHHHHsHHHHHHHHHHSHHHHHHHHHHSSHHSHHHHHs | 16| HsHHHHHHHHHHHHHHHSHHHHHHHHHHHHHHHHHHSHHHHHHHHHHSSHHSHHHHHs | 8| HHHHHHHHHHHHHHHHHSHHHHHHHHHHHHHHHHHHSHHHHHHHHHHSSHHSHHHHHH | 0|SSSUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU | 
+------------------------------------------------------------ | | Total: 6441 ham, 9987 spam (60.79% spam) | | real 7m45.049s | user 5m38.980s | sys 0m39.170s At the top of the output it mentions what it's scanning, and has a long line of s and h indicating progress (so it doesn't look hung if you run it by hand). Below is a set of overlaid bar graphs; s is for spam, h is for ham, u is unsure. The shorter bars are in front and capitalized. In the example, I have very few days where I have more ham than spam. Finally, there's the amount of time it took to run the retraining. My scripts are: bulkgraph.py read and train on messages, and generate the graph bulktrain.sh wrapper for bulkgraph.py, times the process and moves databases around procmailrc a slightly edited version of my .procmailrc file When I actually use this, I put bulkgraph.py and bulktrain.sh in the root of my spambayes tree. Minor tweaks would probably make this unnecessary, but as a python newbie I don't know what they are off the top of my head, and I can't be bothered to find out. ;-) --- NEW FILE: bulkgraph.py --- #! /usr/bin/env python ### Train spambayes on messages in an MH mailbox, with spam identified ### by identical copies in other designated MH mailboxes. ### ### Run this from a cron job on your server. """Usage: %(program)s [OPTIONS] ... Where OPTIONS is one or more of: -h show usage and exit -d DBNAME use the DBM store. A DBM file is larger than the pickle and creating it is slower, but loading it is much faster, especially for large word databases. Recommended for use with hammiefilter or any procmail-based filter. -D DBNAME use the pickle store. A pickle is smaller and faster to create, but much slower to load. Recommended for use with pop3proxy and hammiesrv. -g PATH mbox or directory of known good messages (non-spam) to train on. Can be specified more than once. -s PATH mbox or directory of known spam messages to train on. Can be specified more than once.
-f force training, ignoring the trained header. Use this if you need to rebuild your database from scratch. -q quiet mode; no output """ import mboxutils import getopt import hammie import sys import os import re import time import filecmp program = sys.argv[0] loud = True day = 24 * 60 * 60 # The following are in days expire = 4 * 30 grouping = 2 def usage(code, msg=''): """Print usage message and sys.exit(code).""" if msg: print >> sys.stderr, msg print >> sys.stderr print >> sys.stderr, __doc__ % globals() sys.exit(code) def row(value, spamday, hamday, unsureday): line = "%5d|" % value for j in range((expire) // grouping, -1, -1): spamv = 0 hamv = 0 unsurev = 0 for k in range(j * grouping, (j + 1) * grouping): try: spamv += spamday[k] hamv += hamday[k] unsurev += unsureday[k] except: pass spamv = spamv // grouping hamv = hamv // grouping unsurev = unsurev // grouping # print "%d: %ds %dh %du" % (j, spamv, hamv, unsurev) count = 0 char = ' ' if spamv >= value: count += 1 char = 's' if hamv >= value: count += 1 if (char == ' ' or hamv < spamv): char = 'h' if unsurev >= value: count += 1 if (char == ' ' or (char == 's' and unsurev < spamv) or (char == 'h' and unsurev < hamv)): char = 'u' if count > 1: char = char.upper() line += char return line def main(): """Main program; parse options and go.""" global loud try: opts, args = getopt.getopt(sys.argv[1:], 'hfqd:D:s:e:') except getopt.error, msg: usage(2, msg) if not opts: usage(2, "No options given") pck = None usedb = None force = False everything = None spam = [] for opt, arg in opts: if opt == '-h': usage(0) elif opt == "-f": force = True elif opt == "-q": loud = False elif opt == '-e': everything = arg elif opt == '-s': spam.append(arg) elif opt == "-d": usedb = True pck = arg elif opt == "-D": usedb = False pck = arg if args: usage(2, "Positional arguments not allowed") if usedb == None: usage(2, "Must specify one of -d or -D") h = hammie.open(pck, usedb, "c") spamsizes = {} for s in spam: if loud: print 
"Scanning spamdir (%s):" % s files = os.listdir(s) for f in files: if f[0] in ('1', '2', '3', '4', '5', '6', '7', '8', '9'): name = os.path.join(s, f) size = os.stat(name).st_size try: spamsizes[size].append(name) except KeyError: spamsizes[size] = [name] skipcount = 0 spamcount = 0 hamcount = 0 spamday = [0] * expire hamday = [0] * expire unsureday = [0] * expire date_re = re.compile( r";.* (\d{1,2} (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{2,4})") now = time.mktime(time.strptime(time.strftime("%d %b %Y"), "%d %b %Y")) if loud: print "Scanning everything" for f in os.listdir(everything): if f[0] in ('1', '2', '3', '4', '5', '6', '7', '8', '9'): name = os.path.join(everything, f) fh = file(name, "rb") msg = mboxutils.get_message(fh) fh.close() # Figure out how old the message is age = 2 * expire try: received = (msg.get_all("Received"))[0] received = date_re.search(received).group(1) # if loud: print " %s" % received date = time.mktime(time.strptime(received, "%d %b %Y")) # if loud: print " %d" % date age = (now - date) // day # Can't just continue here... 
we're in a try if age < 0: age = 2 * expire except: pass # Skip anything that has no date or is too old or from the future # if loud: print "%s: %d" % (name, age) if age >= expire: skipcount += 1 if loud and not (skipcount % 100): sys.stdout.write("-") sys.stdout.flush() continue age = int(age) try: if msg.get("X-Spambayes-Classification").find("unsure") >= 0: unsureday[age] += 1 except: pass size = os.stat(name).st_size isspam = False try: for s in spamsizes[size]: if filecmp.cmp(name, s): isspam = True except KeyError: pass if isspam: spamcount += 1 spamday[age] += 1 if loud and not (spamcount % 100): sys.stdout.write("s") sys.stdout.flush() else: hamcount += 1 hamday[age] += 1 if loud and not (hamcount % 100): sys.stdout.write("h") sys.stdout.flush() h.train(msg, isspam) if loud: print mval = max(max(spamday), max(hamday), max(unsureday)) scale = (mval + 19) // 20 print "%5d" % mval for j in range(19, -1, -1): print row(scale * j, spamday, hamday, unsureday) print " +" + ('-' * 60) print print "Total: %d ham, %d spam (%.2f%% spam)" % ( hamcount, spamcount, spamcount * 100.0 / (hamcount + spamcount)) h.store() if __name__ == "__main__": main() --- NEW FILE: bulktrain.sh --- #!/bin/bash cd $HOME/spambayes/active/spambayes rm -f tmpdb 2>/dev/null time /usr/bin/python2.2 bulkgraph.py \ -d tmpdb \ -e $HOME/Mail/everything/ \ -s $HOME/Mail/spam \ -s $HOME/Mail/newspam \ && mv -f tmpdb hammiedb ls -l hammiedb --- NEW FILE: procmailrc --- MAILDIR=/home/cashew/popiel/Mail HOME=/home/cashew/popiel # Classify message (up here so all copies have the classification) :0fw: | /usr/bin/python2.2 $HOME/spambayes/active/spambayes/hammiefilter.py # And trust the classification :0Hc: * ^X-Spambayes-Classification: ham | /usr/bin/python2.2 $HOME/spambayes/active/spambayes/hammiefilter.py -g :0Hc: * ^X-Spambayes-Classification: spam | /usr/bin/python2.2 $HOME/spambayes/active/spambayes/hammiefilter.py -s # Save all mail for analysis :0c: everything/. 
# Block spam :0H: * ^Content-Type:.*text/html newspam/. :0H: * ^X-Spambayes-Classification: spam newspam/. # Put mail from myself in outbox :0H: * ^From:.*popiel\@wolfskeep outbox/. # Everything else is presumably good :0: inbox/. From anthonybaxter at users.sourceforge.net Sun Dec 29 23:37:17 2002 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Mon Dec 30 02:37:21 2002 Subject: [Spambayes-checkins] website docs.ht,1.4,1.5 Message-ID: Update of /cvsroot/spambayes/website In directory sc8-pr-cvs1:/tmp/cvs-serv23271 Modified Files: docs.ht Log Message: quick'n'dirty search interface for mailing lists. Index: docs.ht =================================================================== RCS file: /cvsroot/spambayes/website/docs.ht,v retrieving revision 1.4 retrieving revision 1.5 diff -C2 -d -r1.4 -r1.5 *** docs.ht 12 Nov 2002 00:37:19 -0000 1.4 --- docs.ht 30 Dec 2002 07:37:15 -0000 1.5 *************** *** 11,14 **** --- 11,23 ---- +

    Search the mailing lists

    +

    A quick-n-dirty google search interface for the mailing list archives - put your search terms in the box with the existing ones: +

    +
    +

    Enter search terms: + + +

    +

    Glossary

    A useful(?) glossary of terminology

    From anthonybaxter at users.sourceforge.net Sun Dec 29 23:38:55 2002 From: anthonybaxter at users.sourceforge.net (Anthony Baxter) Date: Mon Dec 30 02:38:58 2002 Subject: [Spambayes-checkins] website index.ht,1.2,1.3 Message-ID: Update of /cvsroot/spambayes/website In directory sc8-pr-cvs1:/tmp/cvs-serv23514 Modified Files: index.ht Log Message: note about the mailing list search. Index: index.ht =================================================================== RCS file: /cvsroot/spambayes/website/index.ht,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** index.ht 1 Nov 2002 04:50:19 -0000 1.2 --- index.ht 30 Dec 2002 07:38:53 -0000 1.3 *************** *** 25,29 ****